b8d73c350d82ff9671161004e7e3897e4696ba3f
1 require 'html/tokenizer'
3 require 'html/selector'
4 require 'html/sanitizer'
7 # A top-level HTML document. You give it a body of text, and it will parse that
8 # text into a tree of nodes.
9 class Document
#:nodoc:
11 # The root of the parsed document.
14 # Create a new Document from the given text.
15 def initialize(text
, strict
=false, xml
=false)
16 tokenizer
= Tokenizer
.new(text
)
18 node_stack
= [ @root ]
19 while token
= tokenizer
.next
20 node
= Node
.parse(node_stack
.last
, tokenizer
.line
, tokenizer
.position
, token
, strict
)
22 node_stack
.last
.children
<< node
unless node
.tag
? && node
.closing
== :close
24 if node_stack
.length
> 1 && node
.closing
== :close
25 if node_stack
.last
.name
== node
.name
26 if node_stack
.last
.children
.empty
?
27 node_stack
.last
.children
<< Text
.new(node_stack
.last
, node
.line
, node
.position
, "")
31 open_start
= node_stack
.last
.position
- 20
32 open_start
= 0 if open_start
< 0
33 close_start
= node
.position
- 20
34 close_start
= 0 if close_start
< 0
36 ignoring attempt to close #{node_stack.last.name} with #{node.name}
37 opened at byte #{node_stack.last.position}, line #{node_stack.last.line}
38 closed at byte #{node.position}, line #{node.line}
39 attributes at open: #{node_stack.last.attributes.inspect}
40 text around open: #{text[open_start,40].inspect}
41 text around close: #{text[close_start,40].inspect}
43 strict
? raise(msg
) : warn(msg
)
45 elsif !node
.childless
?(xml
) && node
.closing
!= :close
52 # Search the tree for (and return) the first node that matches the given
53 # conditions. The conditions are interpreted differently for different node
54 # types, see HTML::Text#find and HTML::Tag#find.
56 @root.find(conditions
)
59 # Search the tree for (and return) all nodes that match the given
60 # conditions. The conditions are interpreted differently for different node
61 # types, see HTML::Text#find and HTML::Tag#find.
62 def find_all(conditions
)
63 @root.find_all(conditions
)