5 class Conditions
< Hash
#:nodoc:
8 hash
= { :content => hash
} unless Hash
=== hash
9 hash
= keys_to_symbols(hash
)
12 when :tag, :content then
13 # keys are valid, and require no further processing
15 hash
[k
] = keys_to_strings(v
)
16 when :parent, :child, :ancestor, :descendant, :sibling, :before,
18 hash
[k
] = Conditions
.new(v
)
20 hash
[k
] = v
= keys_to_symbols(v
)
23 when :count, :greater_than, :less_than
24 # keys are valid, and require no further processing
26 v
[k
] = Conditions
.new(v2
)
28 raise "illegal key #{k.inspect} => #{v2.inspect}"
32 raise "illegal key #{k.inspect} => #{v.inspect}"
40 def keys_to_strings(hash
)
41 hash
.keys
.inject({}) do |h
,k
|
47 def keys_to_symbols(hash
)
48 hash
.keys
.inject({}) do |h
,k
|
49 raise "illegal key #{k.inspect}" unless k
.respond_to
?(:to_sym)
56 # The base class of all nodes, textual and otherwise, in an HTML document.
58 # The array of children of this node. Not all nodes have children.
61 # The parent node of this node. All nodes have a parent, except for the
65 # The line number of the input where this node was begun
68 # The byte position in the input where this node was begun
71 # Create a new node as a child of the given parent.
72 def initialize(parent
, line
=0, pos
=0)
75 @line, @position = line
, pos
78 # Return a textual representation of the node.
81 @children.each
{ |child
| s
<< child
.to_s
}
85 # Return false (subclasses must override this to provide specific matching
86 # behavior.) +conditions+ may be of any type.
91 # Search the children of this node for the first node for which #find
92 # returns non +nil+. Returns the result of the #find call that succeeded.
94 conditions
= validate_conditions(conditions
)
95 @children.each
do |child
|
96 node
= child
.find(conditions
)
102 # Search for all nodes that match the given conditions, and return them
104 def find_all(conditions
)
105 conditions
= validate_conditions(conditions
)
108 matches
<< self if match(conditions
)
109 @children.each
do |child
|
110 matches
.concat child
.find_all(conditions
)
115 # Returns +false+. Subclasses may override this if they define a kind of
121 def validate_conditions(conditions
)
122 Conditions
=== conditions
? conditions
: Conditions
.new(conditions
)
126 return false unless self.class == node
.class && children
.size
== node
.children
.size
130 children
.size
.times
do |i
|
131 equivalent
&&= children
[i
] == node
.children
[i
]
138 def parse(parent, line, pos, content, strict=true)
140 Text.new(parent, line, pos, content)
142 scanner = StringScanner.new(content)
144 unless scanner.skip(/</)
148 return Text.new(parent, line, pos, content)
152 if scanner.skip(/!\[CDATA\[/)
153 unless scanner.skip_until(/\]\]>/)
155 raise "expected ]]> (got #{scanner.rest.inspect} for #{content})"
157 scanner.skip_until(/\Z/)
161 return CDATA.new(parent, line, pos, scanner.pre_match.gsub(/<!\[CDATA\[/, ''))
164 closing = ( scanner.scan(/\//) ? :close : nil )
165 return Text.new(parent, line, pos, content) unless name = scanner.scan(/[\w:-]+/)
171 while attr = scanner.scan(/[-\w:]+/)
173 if scanner.scan(/\s*=\s*/)
174 if delim = scanner.scan(/['"]/)
176 while text = scanner.scan(/[^#{delim}\\]+|./)
180 value << scanner.getch
187 value = scanner.scan(/[^\s>\/]+/)
190 attributes[attr.downcase] = value
194 closing = ( scanner.scan(/\//) ? :self : nil )
197 unless scanner.scan(/\s*>/)
199 raise "expected > (got #{scanner.rest.inspect} for #{content}, #{attributes.inspect})"
201 # throw away all text until we find what we're looking for
202 scanner.skip_until(/>/) or scanner.terminate
206 Tag.new(parent, line, pos, name, attributes, closing)
212 # A node that represents text, rather than markup.
213 class Text < Node #:nodoc:
217 # Creates a new text node as a child of the given parent, with the given
219 def initialize(parent, line, pos, content)
220 super(parent, line, pos)
224 # Returns the content of this node.
229 # Returns +self+ if this node meets the given conditions. Text nodes support
230 # conditions of the following kinds:
232 # * if +conditions+ is a string, it must be a substring of the node's
234 # * if +conditions+ is a regular expression, it must match the node's
236 # * if +conditions+ is a hash, it must contain a <tt>:content</tt> key that
237 # is either a string or a regexp, and which is interpreted as described
240 match(conditions
) && self
243 # Returns non-+nil+ if this node meets the given conditions, or +nil+
244 # otherwise. See the discussion of #find for the valid conditions.
245 def match(conditions
)
248 @content == conditions
250 @content =~ conditions
252 conditions
= validate_conditions(conditions
)
254 # Text nodes only have :content, :parent, :ancestor
255 unless (conditions
.keys
- [:content, :parent, :ancestor]).empty
?
259 match(conditions
[:content])
266 return false unless super
267 content
== node
.content
271 # A CDATA node is simply a text node with a specialized way of displaying
273 class CDATA
< Text
#:nodoc:
275 "<![CDATA[#{super}]]>"
279 # A Tag is any node that represents markup. It may be an opening tag, a
280 # closing tag, or a self-closing tag. It has a name, and may have a hash of
282 class Tag
< Node
#:nodoc:
284 # Either +nil+, <tt>:close</tt>, or <tt>:self</tt>
287 # Either +nil+, or a hash of attributes for this node.
288 attr_reader
:attributes
290 # The name of this tag.
293 # Create a new tag node as a child of the given parent, with the given
294 # name, attributes and closing status. These values are supplied by the
295 # caller already extracted (e.g. by +parse+); nothing is parsed here.
296 def initialize(parent
, line
, pos
, name
, attributes
, closing
)
297 super(parent
, line
, pos
)
299 @attributes = attributes
303 # A convenience for obtaining an attribute of the node. Returns +nil+ if
304 # the node has no attributes.
306 @attributes ? @attributes[attr
] : nil
309 # Returns non-+nil+ if this tag is childless, i.e. cannot contain child nodes.
310 def childless
?(xml
= false)
311 return false if xml
&& @closing.nil?
313 @name =~
/^
(img
|br
|hr
|link
|meta
|area
|base
|basefont
|
314 col
|frame
|input
|isindex
|param
)$/ox
317 # Returns a textual representation of the node
319 if @closing == :close
323 @attributes.each
do |k
,v
|
325 s
<< "=\"#{v}\"" if String
=== v
327 s
<< " /" if @closing == :self
329 @children.each
{ |child
| s
<< child
.to_s
}
330 s
<< "</#{@name}>" if @closing != :self && !@children.empty
?
335 # If either the node or any of its children meet the given conditions, the
336 # matching node is returned. Otherwise, +nil+ is returned. (See the
337 # description of the valid conditions in the +match+ method.)
339 match(conditions
) && self || super
342 # Returns +true+, indicating that this node represents an HTML tag.
347 # Returns +true+ if the node meets all of the given conditions. The
348 # +conditions+ parameter must be a hash of any of the following keys
349 # (all are optional):
351 # * <tt>:tag</tt>: the node name must match the corresponding value
352 # * <tt>:attributes</tt>: a hash. The node's values must match the
353 # corresponding values in the hash.
354 # * <tt>:parent</tt>: a hash. The node's parent must match the
355 # corresponding hash.
356 # * <tt>:child</tt>: a hash. At least one of the node's immediate children
357 # must meet the criteria described by the hash.
358 # * <tt>:ancestor</tt>: a hash. At least one of the node's ancestors must
359 # meet the criteria described by the hash.
360 # * <tt>:descendant</tt>: a hash. At least one of the node's descendants
361 # must meet the criteria described by the hash.
362 # * <tt>:sibling</tt>: a hash. At least one of the node's siblings must
363 # meet the criteria described by the hash.
364 # * <tt>:after</tt>: a hash. The node must be after any sibling meeting
365 # the criteria described by the hash, and at least one sibling must match.
366 # * <tt>:before</tt>: a hash. The node must be before any sibling meeting
367 # the criteria described by the hash, and at least one sibling must match.
368 # * <tt>:children</tt>: a hash, for counting children of a node. Accepts the
370 # ** <tt>:count</tt>: either a number or a range which must equal (or
371 # include) the number of children that match.
372 # ** <tt>:less_than</tt>: the number of matching children must be less than
374 # ** <tt>:greater_than</tt>: the number of matching children must be
375 # greater than this number.
376 # ** <tt>:only</tt>: another hash consisting of the keys to use
377 # to match on the children, and only matching children will be
380 # Conditions are matched using the following algorithm:
382 # * if the condition is a string, it must be a substring of the value.
383 # * if the condition is a regexp, it must match the value.
384 # * if the condition is a number, the value must match number.to_s.
385 # * if the condition is +true+, the value must not be +nil+.
386 # * if the condition is +false+ or +nil+, the value must be +nil+.
390 # # test if the node is a "span" tag
391 # node.match :tag => "span"
393 # # test if the node's parent is a "div"
394 # node.match :parent => { :tag => "div" }
396 # # test if any of the node's ancestors are "table" tags
397 # node.match :ancestor => { :tag => "table" }
399 # # test if any of the node's immediate children are "em" tags
400 # node.match :child => { :tag => "em" }
402 # # test if any of the node's descendants are "strong" tags
403 # node.match :descendant => { :tag => "strong" }
405 # # test if the node has between 2 and 4 span tags as immediate children
406 # node.match :children => { :count => 2..4, :only => { :tag => "span" } }
408 # # get funky: test to see if the node is a "div", has a "ul" ancestor
409 # # and an "li" parent (with "class" = "enum"), and whether or not it has
410 # # a "span" descendant that contains text matching /hello world/:
411 # node.match :tag => "div",
412 # :ancestor => { :tag => "ul" },
413 # :parent => { :tag => "li",
414 # :attributes => { :class => "enum" } },
415 # :descendant => { :tag => "span",
416 # :child => /hello world/ }
417 def match(conditions
)
418 conditions
= validate_conditions(conditions
)
419 # check content of child nodes
420 if conditions
[:content]
422 return false unless match_condition("", conditions
[:content])
424 return false unless children
.find
{ |child
| child
.match(conditions
[:content]) }
429 return false unless match_condition(@name, conditions
[:tag]) if conditions
[:tag]
432 (conditions
[:attributes] || {}).each
do |key
, value
|
433 return false unless match_condition(self[key
], value
)
437 return false unless parent
.match(conditions
[:parent]) if conditions
[:parent]
440 return false unless children
.find
{ |child
| child
.match(conditions
[:child]) } if conditions
[:child]
443 if conditions
[:ancestor]
444 return false unless catch
:found do
446 throw :found, true if p
.match(conditions
[:ancestor]) while p
= p
.parent
451 if conditions
[:descendant]
452 return false unless children
.find
do |child
|
454 child
.match(conditions
[:descendant]) ||
455 # test the child's descendants
456 child
.match(:descendant => conditions
[:descendant])
461 if opts
= conditions
[:children]
462 matches
= children
.select
do |c
|
463 (c
.kind_of
?(HTML
::Tag) and (c
.closing
== :self or ! c
.childless
?))
466 matches
= matches
.select
{ |c
| c
.match(opts
[:only]) } if opts
[:only]
467 opts
.each
do |key
, value
|
472 return false if matches
.length
!= value
474 return false unless value
.include?(matches
.length
)
477 return false unless matches
.length
< value
479 return false unless matches
.length
> value
480 else raise "unknown count condition #{key}"
486 if conditions
[:sibling] || conditions
[:before] || conditions
[:after]
487 siblings
= parent
? parent
.children
: []
488 self_index
= siblings
.index(self)
490 if conditions
[:sibling]
491 return false unless siblings
.detect
do |s
|
492 s
!= self && s
.match(conditions
[:sibling])
496 if conditions
[:before]
497 return false unless siblings
[self_index
+1..-1].detect
do |s
|
498 s
!= self && s
.match(conditions
[:before])
502 if conditions
[:after]
503 return false unless siblings
[0,self_index
].detect
do |s
|
504 s
!= self && s
.match(conditions
[:after])
513 return false unless super
514 return false unless closing
== node
.closing
&& self.name
== node
.name
515 attributes
== node
.attributes
519 # Match the given value to the given condition.
520 def match_condition(value
, condition
)
523 value
&& value
== condition
525 value
&& value
.match(condition
)
527 value
== condition
.to_s