# [feedcatcher.git] / html-scanner / html / node.rb
1 require 'strscan'
3 module HTML #:nodoc:
# A Hash of node-matching conditions, normalized on construction:
#
# * a non-hash argument is wrapped as <tt>{ :content => argument }</tt>
# * top-level keys are converted to symbols
# * the keys of the <tt>:attributes</tt> hash are converted to strings
# * the values of <tt>:parent</tt>, <tt>:child</tt>, <tt>:ancestor</tt>,
#   <tt>:descendant</tt>, <tt>:sibling</tt>, <tt>:before</tt> and
#   <tt>:after</tt> are themselves wrapped in Conditions
# * the <tt>:children</tt> hash has its keys converted to symbols and its
#   <tt>:only</tt> value wrapped in Conditions
#
# Any other key raises a RuntimeError ("illegal key ...").
class Conditions < Hash #:nodoc:
  # Builds the normalized conditions from +hash+ (a Hash, or any other value
  # to be used as the <tt>:content</tt> condition).
  def initialize(hash)
    super()
    hash = { :content => hash } unless Hash === hash
    normalized = keys_to_symbols(hash)

    # Only values of existing keys are replaced below, which is safe to do
    # while iterating.
    normalized.each do |key, value|
      case key
      when :tag, :content
        # keys are valid, and require no further processing
      when :attributes
        normalized[key] = keys_to_strings(value)
      when :parent, :child, :ancestor, :descendant, :sibling, :before, :after
        normalized[key] = Conditions.new(value)
      when :children
        normalized[key] = normalize_children(value)
      else
        raise "illegal key #{key.inspect} => #{value.inspect}"
      end
    end

    update normalized
  end

  private

  # Returns a copy of the <tt>:children</tt> options with symbol keys and
  # with the <tt>:only</tt> value wrapped in Conditions. Raises on any key
  # other than :count, :greater_than, :less_than or :only.
  def normalize_children(options)
    counts = keys_to_symbols(options)
    counts.each do |key, value|
      case key
      when :count, :greater_than, :less_than
        # keys are valid, and require no further processing
      when :only
        counts[key] = Conditions.new(value)
      else
        raise "illegal key #{key.inspect} => #{value.inspect}"
      end
    end
    counts
  end

  # Returns a new plain Hash with every key of +hash+ converted via +to_s+.
  def keys_to_strings(hash)
    hash.keys.each_with_object({}) do |key, result|
      result[key.to_s] = hash[key]
    end
  end

  # Returns a new plain Hash with every key of +hash+ converted via +to_sym+.
  # Raises if a key does not respond to +to_sym+.
  def keys_to_symbols(hash)
    hash.keys.each_with_object({}) do |key, result|
      raise "illegal key #{key.inspect}" unless key.respond_to?(:to_sym)
      result[key.to_sym] = hash[key]
    end
  end
end
# The base class of all nodes, textual and otherwise, in an HTML document.
class Node #:nodoc:
  # The array of children of this node. Not all nodes have children.
  attr_reader :children

  # The parent node of this node. All nodes have a parent, except for the
  # root node.
  attr_reader :parent

  # The line number of the input where this node was begun
  attr_reader :line

  # The byte position in the input where this node was begun
  attr_reader :position

  # Create a new node as a child of the given parent. The node is not added
  # to the parent's children here.
  def initialize(parent, line=0, pos=0)
    @parent = parent
    @children = []
    @line, @position = line, pos
  end

  # Return a textual representation of the node: the concatenation of the
  # textual representations of its children.
  def to_s
    s = ""
    @children.each { |child| s << child.to_s }
    s
  end

  # Return false (subclasses must override this to provide specific matching
  # behavior.) +conditions+ may be of any type.
  def match(conditions)
    false
  end

  # Search the children of this node for the first node for which #find
  # returns non +nil+. Returns the result of the #find call that succeeded,
  # or +nil+ if none did.
  def find(conditions)
    conditions = validate_conditions(conditions)
    @children.each do |child|
      node = child.find(conditions)
      return node if node
    end
    nil
  end

  # Search for all nodes (this node and, recursively, its children) that
  # match the given conditions, and return them as an array.
  def find_all(conditions)
    conditions = validate_conditions(conditions)

    matches = match(conditions) ? [self] : []
    @children.each { |child| matches.concat(child.find_all(conditions)) }
    matches
  end

  # Returns +false+. Subclasses may override this if they define a kind of
  # tag.
  def tag?
    false
  end

  # Returns +conditions+ unchanged if it already is a Conditions instance,
  # otherwise wraps it in a new Conditions.
  def validate_conditions(conditions)
    Conditions === conditions ? conditions : Conditions.new(conditions)
  end

  # Two nodes are equal when they are of the same class, have the same
  # number of children, and their children are pairwise equal.
  def ==(node)
    return false unless self.class == node.class && children.size == node.children.size

    children.zip(node.children).all? { |mine, theirs| mine == theirs }
  end

  class << self
    # Parses +content+ (a single token of the input) into a node that is a
    # child of +parent+:
    #
    # * content with no line starting with "<" followed by a non-space
    #   character becomes a Text node
    # * <tt><![CDATA[...]]></tt> becomes a CDATA node
    # * anything else is parsed as a Tag (name and attribute names are
    #   downcased)
    #
    # When +strict+ is true, malformed markup raises a RuntimeError. When it
    # is false, the parser falls back to a Text node or skips ahead to the
    # closing ">" as best it can.
    def parse(parent, line, pos, content, strict=true)
      return Text.new(parent, line, pos, content) if content !~ /^<\S/

      scanner = StringScanner.new(content)

      unless scanner.skip(/</)
        raise "expected <" if strict
        return Text.new(parent, line, pos, content)
      end

      if scanner.skip(/!\[CDATA\[/)
        unless scanner.skip_until(/\]\]>/)
          raise "expected ]]> (got #{scanner.rest.inspect} for #{content})" if strict
          scanner.skip_until(/\Z/)
        end

        return CDATA.new(parent, line, pos, scanner.pre_match.gsub(/<!\[CDATA\[/, ''))
      end

      closing = scanner.scan(/\//) ? :close : nil
      name = scanner.scan(/[\w:-]+/)
      return Text.new(parent, line, pos, content) unless name
      name.downcase!

      # Closing tags carry no attributes, so +attributes+ stays nil for them.
      attributes = nil
      unless closing
        scanner.skip(/\s*/)
        attributes = scan_attributes(scanner)
        closing = scanner.scan(/\//) ? :self : nil
      end

      unless scanner.scan(/\s*>/)
        if strict
          raise "expected > (got #{scanner.rest.inspect} for #{content}, #{attributes.inspect})"
        else
          # throw away all text until we find what we're looking for
          scanner.skip_until(/>/) or scanner.terminate
        end
      end

      Tag.new(parent, line, pos, name, attributes, closing)
    end

    private

    # Scans consecutive attributes at the scanner's position and returns
    # them as a hash keyed by downcased attribute name. An attribute without
    # "=" has the value +true+; an unquoted value that cannot be scanned
    # yields +nil+.
    def scan_attributes(scanner)
      attributes = {}
      while attr = scanner.scan(/[-\w:]+/)
        value = true
        if scanner.scan(/\s*=\s*/)
          if delim = scanner.scan(/['"]/)
            value = scan_quoted_value(scanner, delim)
          else
            value = scanner.scan(/[^\s>\/]+/)
          end
        end
        attributes[attr.downcase] = value
        scanner.skip(/\s*/)
      end
      attributes
    end

    # Scans a quoted attribute value up to (and consuming) the closing
    # +delim+. Backslash escapes are kept verbatim (backslash plus the
    # following character).
    def scan_quoted_value(scanner, delim)
      value = ""
      while text = scanner.scan(/[^#{delim}\\]+|./)
        case text
        when "\\"
          value << text
          # A trailing backslash has nothing after it; getch would return
          # nil and appending nil raises TypeError.
          break if scanner.eos?
          value << scanner.getch
        when delim
          break
        else
          value << text
        end
      end
      value
    end
  end
end
# A node that holds literal text rather than markup.
class Text < Node #:nodoc:
  # The text held by this node.
  attr_reader :content

  # Builds a text node under +parent+, begun at the given +line+ and +pos+
  # of the input, holding +content+.
  def initialize(parent, line, pos, content)
    super(parent, line, pos)
    @content = content
  end

  # The node's content, as is.
  def to_s
    @content
  end

  # Returns +self+ when this node meets +conditions+, and a false value
  # otherwise. See #match for the accepted kinds of condition.
  def find(conditions)
    match(conditions) && self
  end

  # Tests this node against +conditions+ and returns a true value on
  # success, +false+ or +nil+ otherwise:
  #
  # * a String must be equal to the node's content
  # * a Regexp must match the node's content
  # * a Hash is normalized through +validate_conditions+; it may only hold
  #   the keys <tt>:content</tt>, <tt>:parent</tt> and <tt>:ancestor</tt>
  #   (any other key makes the match fail), and its <tt>:content</tt> value
  #   is then tested as described above. The <tt>:parent</tt> and
  #   <tt>:ancestor</tt> values are tolerated but not evaluated here.
  # * anything else never matches
  def match(conditions)
    case conditions
    when String then @content == conditions
    when Regexp then @content =~ conditions
    when Hash
      checked = validate_conditions(conditions)
      unsupported = checked.keys - [:content, :parent, :ancestor]
      return false unless unsupported.empty?

      match(checked[:content])
    end
  end

  # Text nodes are equal when they are equal as nodes and hold equal
  # content.
  def ==(node)
    super ? content == node.content : false
  end
end
# A CDATA node is a text node that renders itself wrapped in a
# <tt><![CDATA[ ... ]]></tt> section.
class CDATA < Text #:nodoc:
  # The text node's representation, enclosed in CDATA markers.
  def to_s
    inner = super
    "<![CDATA[#{inner}]]>"
  end
end
# A Tag is any node that represents markup. It may be an opening tag, a
# closing tag, or a self-closing tag. It has a name, and may have a hash of
# attributes.
class Tag < Node #:nodoc:
  # Either +nil+, <tt>:close</tt>, or <tt>:self</tt>
  attr_reader :closing

  # Either +nil+, or a hash of attributes for this node.
  attr_reader :attributes

  # The name of this tag.
  attr_reader :name

  # Create a new tag node as a child of the given parent, with the given
  # (already parsed) name, attributes and closing status.
  def initialize(parent, line, pos, name, attributes, closing)
    super(parent, line, pos)
    @name = name
    @attributes = attributes
    @closing = closing
  end

  # A convenience for obtaining an attribute of the node. Returns +nil+ if
  # the node has no attributes.
  def [](attr)
    @attributes ? @attributes[attr] : nil
  end

  # Returns a true value if this tag can NOT contain child nodes: it is a
  # closing or self-closing tag, or its name is one of the listed HTML
  # elements that never have content. When +xml+ is true, a tag without a
  # closing marker is never considered childless.
  def childless?(xml = false)
    return false if xml && @closing.nil?
    !@closing.nil? ||
      @name =~ /^(img|br|hr|link|meta|area|base|basefont|
                  col|frame|input|isindex|param)$/ox
  end

  # Returns a textual representation of the node
  def to_s
    return "</#{@name}>" if @closing == :close

    s = "<#{@name}"
    # attributes may legitimately be nil; treat that as "no attributes"
    (@attributes || {}).each do |k, v|
      s << " #{k}"
      # non-string values (e.g. +true+ for a bare attribute) print name only
      s << "=\"#{v}\"" if String === v
    end
    s << " /" if @closing == :self
    s << ">"
    @children.each { |child| s << child.to_s }
    s << "</#{@name}>" if @closing != :self && !@children.empty?
    s
  end

  # If either the node or any of its children meet the given conditions, the
  # matching node is returned. Otherwise, +nil+ is returned. (See the
  # description of the valid conditions in the +match+ method.)
  def find(conditions)
    match(conditions) && self || super
  end

  # Returns +true+, indicating that this node represents an HTML tag.
  def tag?
    true
  end

  # Returns +true+ if the node meets all of the given conditions, +false+
  # otherwise. The +conditions+ parameter must be a hash of any of the
  # following keys (all are optional):
  #
  # * <tt>:tag</tt>: the node name must match the corresponding value
  # * <tt>:attributes</tt>: a hash. The node's values must match the
  #   corresponding values in the hash.
  # * <tt>:parent</tt>: a hash. The node's parent must match the
  #   corresponding hash.
  # * <tt>:child</tt>: a hash. At least one of the node's immediate children
  #   must meet the criteria described by the hash.
  # * <tt>:ancestor</tt>: a hash. At least one of the node's ancestors must
  #   meet the criteria described by the hash.
  # * <tt>:descendant</tt>: a hash. At least one of the node's descendants
  #   must meet the criteria described by the hash.
  # * <tt>:sibling</tt>: a hash. At least one of the node's siblings must
  #   meet the criteria described by the hash.
  # * <tt>:after</tt>: a hash. The node must be after any sibling meeting
  #   the criteria described by the hash, and at least one sibling must match.
  # * <tt>:before</tt>: a hash. The node must be before any sibling meeting
  #   the criteria described by the hash, and at least one sibling must match.
  # * <tt>:children</tt>: a hash, for counting children of a node. Only tag
  #   children that are self-closing or able to have children are counted.
  #   Accepts the keys:
  #   ** <tt>:count</tt>: either a number or a range which must equal (or
  #      include) the number of children that match.
  #   ** <tt>:less_than</tt>: the number of matching children must be less than
  #      this number.
  #   ** <tt>:greater_than</tt>: the number of matching children must be
  #      greater than this number.
  #   ** <tt>:only</tt>: another hash consisting of the keys to use
  #      to match on the children, and only matching children will be
  #      counted.
  #
  # Conditions are matched using the following algorithm:
  #
  # * if the condition is a string, it must be equal to the value.
  # * if the condition is a regexp, it must match the value.
  # * if the condition is a number, the value must match number.to_s.
  # * if the condition is +true+, the value must not be +nil+.
  # * if the condition is +false+ or +nil+, the value must be +nil+.
  #
  # Usage:
  #
  #   # test if the node is a "span" tag
  #   node.match :tag => "span"
  #
  #   # test if the node's parent is a "div"
  #   node.match :parent => { :tag => "div" }
  #
  #   # test if any of the node's ancestors are "table" tags
  #   node.match :ancestor => { :tag => "table" }
  #
  #   # test if any of the node's immediate children are "em" tags
  #   node.match :child => { :tag => "em" }
  #
  #   # test if any of the node's descendants are "strong" tags
  #   node.match :descendant => { :tag => "strong" }
  #
  #   # test if the node has between 2 and 4 span tags as immediate children
  #   node.match :children => { :count => 2..4, :only => { :tag => "span" } }
  #
  #   # get funky: test to see if the node is a "div", has a "ul" ancestor
  #   # and an "li" parent (with "class" = "enum"), and whether or not it has
  #   # a "span" descendant that contains text matching /hello world/:
  #   node.match :tag => "div",
  #              :ancestor => { :tag => "ul" },
  #              :parent => { :tag => "li",
  #                           :attributes => { :class => "enum" } },
  #              :descendant => { :tag => "span",
  #                               :child => /hello world/ }
  def match(conditions)
    conditions = validate_conditions(conditions)

    # check content of child nodes
    if conditions[:content]
      if children.empty?
        return false unless match_condition("", conditions[:content])
      else
        return false unless children.find { |child| child.match(conditions[:content]) }
      end
    end

    # test the name
    if conditions[:tag]
      return false unless match_condition(@name, conditions[:tag])
    end

    # test attributes
    (conditions[:attributes] || {}).each do |key, value|
      return false unless match_condition(self[key], value)
    end

    # test parent (a node without a parent cannot satisfy a parent condition)
    if conditions[:parent]
      return false unless parent && parent.match(conditions[:parent])
    end

    # test children
    if conditions[:child]
      return false unless children.find { |child| child.match(conditions[:child]) }
    end

    # test ancestors: walk up from the parent until one matches or we run out
    if conditions[:ancestor]
      ancestor = parent
      ancestor = ancestor.parent until ancestor.nil? || ancestor.match(conditions[:ancestor])
      return false unless ancestor
    end

    # test descendants
    if conditions[:descendant]
      return false unless children.find do |child|
        # test the child
        child.match(conditions[:descendant]) ||
        # test the child's descendants
        child.match(:descendant => conditions[:descendant])
      end
    end

    # count children
    if opts = conditions[:children]
      matches = children.select do |c|
        c.kind_of?(Tag) && (c.closing == :self || !c.childless?)
      end

      matches = matches.select { |c| c.match(opts[:only]) } if opts[:only]
      opts.each do |key, value|
        next if key == :only
        case key
        when :count
          if Integer === value
            return false if matches.length != value
          else
            return false unless value.include?(matches.length)
          end
        when :less_than
          return false unless matches.length < value
        when :greater_than
          return false unless matches.length > value
        else raise "unknown count condition #{key}"
        end
      end
    end

    # test siblings
    if conditions[:sibling] || conditions[:before] || conditions[:after]
      siblings = parent ? parent.children : []
      # Locate this very node by identity: == is structural, so it would
      # find the first sibling that merely looks the same.
      self_index = siblings.index { |s| s.equal?(self) }

      if conditions[:sibling]
        return false unless siblings.detect do |s|
          !s.equal?(self) && s.match(conditions[:sibling])
        end
      end

      if conditions[:before]
        # without a position among siblings nothing can come after this node
        return false unless self_index
        return false unless siblings[self_index+1..-1].detect do |s|
          s.match(conditions[:before])
        end
      end

      if conditions[:after]
        # without a position among siblings nothing can come before this node
        return false unless self_index
        return false unless siblings[0, self_index].detect do |s|
          s.match(conditions[:after])
        end
      end
    end

    true
  end

  # Tags are equal when they are equal as nodes and have the same closing
  # status, name and attributes.
  def ==(node)
    return false unless super
    return false unless closing == node.closing && name == node.name
    attributes == node.attributes
  end

  private

  # Match the given value to the given condition: a String must equal the
  # value, a Regexp must match it, a Numeric must equal it once converted
  # with +to_s+, +true+ requires a non-nil value, +false+/+nil+ require a
  # nil value. Any other kind of condition never matches.
  def match_condition(value, condition)
    case condition
    when String
      value && value == condition
    when Regexp
      value && value.match(condition)
    when Numeric
      value == condition.to_s
    when true
      !value.nil?
    when false, nil
      value.nil?
    else
      false
    end
  end
end
537 end