e2c49c284fc1de65feafaab46526b71f0eff44aa
2 # Copyright (c) 2006 Assaf Arkin (http://labnotes.org)
3 # Under MIT and/or CC By license.
8 # Selects HTML elements using CSS 2 selectors.
10 # The +Selector+ class uses CSS selector expressions to match and select
14 # selector = HTML::Selector.new "form.login[action=/login]"
15 # creates a new selector that matches any +form+ element with the class
16 # +login+ and an attribute +action+ with the value <tt>/login</tt>.
18 # === Matching Elements
20 # Use the #match method to determine if an element matches the selector.
22 # For simple selectors, the method returns an array with that element,
23 # or +nil+ if the element does not match. For complex selectors (see below)
24 # the method returns an array with all matched elements, or +nil+ if no
28 # if selector.match(element)
29 # puts "Element is a login form"
32 # === Selecting Elements
34 # Use the #select method to select all matching elements starting with
35 # one element and going through all children in depth-first order.
37 # This method returns an array of all matching elements, an empty array
38 # if no match is found
41 # selector = HTML::Selector.new "input[type=text]"
42 # matches = selector.select(element)
43 # matches.each do |match|
44 # puts "Found text field with name #{match.attributes['name']}"
49 # Selectors can match elements using any of the following criteria:
50 # * <tt>name</tt> -- Match an element based on its name (tag name).
51 # For example, <tt>p</tt> to match a paragraph. You can use <tt>*</tt>
52 # to match any element.
53 # * <tt>#</tt><tt>id</tt> -- Match an element based on its identifier (the
54 # <tt>id</tt> attribute). For example, <tt>#</tt><tt>page</tt>.
55 # * <tt>.class</tt> -- Match an element based on its class name, all
56 # class names if more than one specified.
57 # * <tt>[attr]</tt> -- Match an element that has the specified attribute.
58 # * <tt>[attr=value]</tt> -- Match an element that has the specified
59 # attribute and value. (More operators are supported; see below.)
60 # * <tt>:pseudo-class</tt> -- Match an element based on a pseudo class,
61 # such as <tt>:nth-child</tt> and <tt>:empty</tt>.
62 # * <tt>:not(expr)</tt> -- Match an element that does not match the
63 # negation expression.
65 # When using a combination of the above, the element name comes first
66 # followed by identifier, class names, attributes, pseudo classes and
67 # negation in any order. Do not separate these parts with spaces!
68 # Space separation is used for descendant selectors.
71 # selector = HTML::Selector.new "form.login[action=/login]"
72 # The matched element must be of type +form+ and have the class +login+.
73 # It may have other classes, but the class +login+ is required to match.
74 # It must also have an attribute called +action+ with the value
77 # This selector will match the following element:
78 # <form class="login form" method="post" action="/login">
79 # but will not match the element:
80 # <form method="post" action="/logout">
82 # === Attribute Values
84 # Several operators are supported for matching attributes:
85 # * <tt>name</tt> -- The element must have an attribute with that name.
86 # * <tt>name=value</tt> -- The element must have an attribute with that
88 # * <tt>name^=value</tt> -- The attribute value must start with the
90 # * <tt>name$=value</tt> -- The attribute value must end with the
92 # * <tt>name*=value</tt> -- The attribute value must contain the
94 # * <tt>name~=word</tt> -- The attribute value must contain the specified
95 # word (space separated).
96 # * <tt>name|=word</tt> -- The attribute value must start with specified
99 # For example, the following two selectors match the same element:
102 # and so do the following two selectors:
106 # === Alternatives, siblings, children
108 # Complex selectors use a combination of expressions to match elements:
109 # * <tt>expr1 expr2</tt> -- Match any element against the second expression
110 # if it has some parent element that matches the first expression.
111 # * <tt>expr1 > expr2</tt> -- Match any element against the second expression
112 # if it is the child of an element that matches the first expression.
113 # * <tt>expr1 + expr2</tt> -- Match any element against the second expression
114 # if it immediately follows an element that matches the first expression.
115 # * <tt>expr1 ~ expr2</tt> -- Match any element against the second expression
116 # that comes after an element that matches the first expression.
117 # * <tt>expr1, expr2</tt> -- Match any element against the first expression,
118 # or against the second expression.
120 # Since children and sibling selectors may match more than one element given
121 # the first element, the #match method may return more than one match.
125 # Pseudo classes were introduced in CSS 3. They are most often used to select
126 # elements in a given position:
127 # * <tt>:root</tt> -- Match the element only if it is the root element
128 # (no parent element).
129 # * <tt>:empty</tt> -- Match the element only if it has no child elements,
130 # and no text content.
131 # * <tt>:only-child</tt> -- Match the element if it is the only child (element)
132 # of its parent element.
133 # * <tt>:only-of-type</tt> -- Match the element if it is the only child (element)
134 # of its parent element and its type.
135 # * <tt>:first-child</tt> -- Match the element if it is the first child (element)
136 # of its parent element.
137 # * <tt>:first-of-type</tt> -- Match the element if it is the first child (element)
138 # of its parent element of its type.
139 # * <tt>:last-child</tt> -- Match the element if it is the last child (element)
140 # of its parent element.
141 # * <tt>:last-of-type</tt> -- Match the element if it is the last child (element)
142 # of its parent element of its type.
143 # * <tt>:nth-child(b)</tt> -- Match the element if it is the b-th child (element)
144 # of its parent element. The value <tt>b</tt> specifies its index, starting with 1.
145 # * <tt>:nth-child(an+b)</tt> -- Match the element if it is the b-th child (element)
146 # in each group of <tt>a</tt> child elements of its parent element.
147 # * <tt>:nth-child(-an+b)</tt> -- Match the element if it is the first child (element)
148 # in each group of <tt>a</tt> child elements, up to the first <tt>b</tt> child
149 # elements of its parent element.
150 # * <tt>:nth-child(odd)</tt> -- Match element in the odd position (i.e. first, third).
151 # Same as <tt>:nth-child(2n+1)</tt>.
152 # * <tt>:nth-child(even)</tt> -- Match element in the even position (i.e. second,
153 # fourth). Same as <tt>:nth-child(2n+2)</tt>.
154 # * <tt>:nth-of-type(..)</tt> -- As above, but only counts elements of its type.
155 # * <tt>:nth-last-child(..)</tt> -- As above, but counts from the last child.
156 # * <tt>:nth-last-of-type(..)</tt> -- As above, but counts from the last child and
157 # only elements of its type.
158 # * <tt>:not(selector)</tt> -- Match the element only if the element does not
159 # match the simple selector.
161 # As you can see, the <tt>:nth-child</tt> pseudo class and its variants can get quite
162 # tricky and the CSS specification doesn't do a much better job explaining it.
163 # But after reading the examples and trying a few combinations, it's easy to
167 # table tr:nth-child(odd)
168 # Selects every second row in the table starting with the first one.
171 # Selects the fourth paragraph in the +div+, but not if the +div+ contains
172 # other elements, since those are also counted.
174 # div p:nth-of-type(4)
175 # Selects the fourth paragraph in the +div+, counting only paragraphs, and
176 # ignoring all other elements.
178 # div p:nth-of-type(-n+4)
179 # Selects the first four paragraphs, ignoring all others.
181 # And you can always select an element that matches one set of rules but
182 # not another using <tt>:not</tt>. For example:
184 # Matches all paragraphs that do not have the class <tt>.post</tt>.
186 # === Substitution Values
188 # You can use substitution with identifiers, class names and element values.
189 # A substitution takes the form of a question mark (<tt>?</tt>) and uses the
190 # next value in the argument list following the CSS expression.
192 # The substitution value may be a string or a regular expression. All other
193 # values are converted to strings.
196 # selector = HTML::Selector.new "#?", /^\d+$/
197 # matches any element whose identifier consists of one or more digits.
199 # See http://www.w3.org/TR/css3-selectors/
# Error type for an invalid selector, e.g. an attribute expression with an
# unsupported operation/value.
class InvalidSelectorError < StandardError #:nodoc:
end
211 # Selector.for_class(cls) => selector
213 # Creates a new selector for the given class name.
215 self.new([".?", cls
])
220 # Selector.for_id(id) => selector
222 # Creates a new selector for the given id.
231 # Selector.new(string, [values ...]) => selector
233 # Creates a new selector from a CSS 2 selector expression.
235 # The first argument is the selector expression. All other arguments
236 # are used for value substitution.
238 # Raises ArgumentError if the selector expression is empty or cannot be parsed.
# An unsupported attribute operation/value surfaces as InvalidSelectorError,
# raised by #attribute_match while the simple selector is being built.
#
# Parsing works by stripping recognized parts off the front of a copy of
# the expression; whatever follows the first simple selector (",", "+",
# "~", ">" or whitespace) decides how the remainder is attached.
239 def initialize(selector
, *values
)
240 raise ArgumentError
, "CSS expression cannot be empty" if selector
.empty
?
# The substitution values may also be passed as one array argument.
242 values
= values
[0] if values
.size
== 1 && values
[0].is_a
?(Array
)
244 # We need a copy to determine if we failed to parse, and also
245 # preserve the original pass by-ref statement.
246 statement
= selector
.strip
.dup
248 # Create a simple selector, along with negation.
# Each key of the returned hash (tag_name, attributes, pseudo, negation)
# becomes an instance variable of the same name.
249 simple_selector(statement
, values
).each
{ |name
, value
| instance_variable_set("@#{name}", value
) }
254 # Alternative selector.
255 if statement
.sub
!(/^\s*,\s*/, "")
256 second
= Selector
.new(statement
, values
)
257 @alternates << second
258 # If there are alternate selectors, we group them in the top selector.
259 if alternates
= second
.instance_variable_get(:@alternates)
260 second
.instance_variable_set(:@alternates, [])
261 @alternates.concat alternates
263 @source << " , " << second
.to_s
264 # Adjacent sibling selector (+): create a dependency into second selector
265 # that will match the element immediately following this one.
266 elsif statement
.sub
!(/^\s*\+\s*/, "")
267 second
= next_selector(statement
, values
)
268 @depends = lambda
do |element
, first
|
269 if element
= next_element(element
)
270 second
.match(element
, first
)
273 @source << " + " << second
.to_s
274 # General sibling selector (~): create a dependency into second selector
275 # that will match all elements following this one.
276 elsif statement
.sub
!(/^\s*~\s*/, "")
277 second
= next_selector(statement
, values
)
278 @depends = lambda
do |element
, first
|
280 while element
= next_element(element
)
281 if subset
= second
.match(element
, first
)
282 if first
&& !subset
.empty
?
283 matches
<< subset
.first
286 matches
.concat subset
290 matches
.empty
? ? nil : matches
292 @source << " ~ " << second
.to_s
293 # Child selector: create a dependency into second selector that will
294 # match a child element of this one.
295 elsif statement
.sub
!(/^\s*>\s*/, "")
296 second
= next_selector(statement
, values
)
297 @depends = lambda
do |element
, first
|
299 element
.children
.each
do |child
|
300 if child
.tag
? && subset
= second
.match(child
, first
)
301 if first
&& !subset
.empty
?
302 matches
<< subset
.first
305 matches
.concat subset
309 matches
.empty
? ? nil : matches
311 @source << " > " << second
.to_s
312 # Descendant selector: create a dependency into second selector that
313 # will match all descendant elements of this one. Note,
# The branch is only taken when something was already consumed from the
# statement (statement != selector) and whitespace precedes the next part.
314 elsif statement
=~
/^\s+\S+/ && statement
!= selector
315 second
= next_selector(statement
, values
)
316 @depends = lambda
do |element
, first
|
# Depth-first walk using an explicit stack; children are pushed in
# reverse so that pop visits them in document order.
318 stack
= element
.children
.reverse
319 while node
= stack
.pop
320 next unless node
.tag
?
321 if subset
= second
.match(node
, first
)
322 if first
&& !subset
.empty
?
323 matches
<< subset
.first
326 matches
.concat subset
328 elsif children
= node
.children
329 stack
.concat children
.reverse
332 matches
.empty
? ? nil : matches
334 @source << " " << second
.to_s
336 # The last selector is where we check that we parsed
# Anything other than whitespace left in the statement at this point was
# not recognized by any of the rules above.
338 unless statement
.empty
? || statement
.strip
.empty
?
339 raise ArgumentError
, "Invalid selector: #{statement}"
346 # match(element, first?) => array or nil
348 # Matches an element against the selector.
350 # For a simple selector this method returns an array with the
351 # element if the element matches, nil otherwise.
353 # For a complex selector (sibling and descendant) this method
354 # returns an array with all matching elements, nil if no match is
357 # Use +first_only=true+ if you are only interested in the first element.
360 # if selector.match(element)
361 # puts "Element is a login form"
#
# Parameters:
# * +element+ -- The element to test. Its +name+ and +attributes+ are read
#   here and the element itself is handed to every pseudo-class lambda.
# * +first_only+ -- Defaults to false. Forwarded to the dependent selector
#   (child, sibling, descendant) and to the alternate selectors.
363 def match(element
, first_only
= false)
364 # Match element if no element name or element name same as element name
365 if matched
= (!@tag_name || @tag_name == element
.name
)
366 # No match if one of the attribute matches failed
# Each entry of @attributes is a [name, regexp] pair.
367 for attr
in @attributes
368 if element
.attributes
[attr
[0]] !~ attr
[1]
375 # Pseudo class matches (nth-child, empty, etc).
# Each pseudo class is a lambda called with the element.
377 for pseudo
in @pseudo
378 unless pseudo
.call(element
)
385 # Negation. Same rules as above, but we fail if a match is made.
# Each negation is a hash with the keys :tag_name, :attributes and :pseudo.
386 if matched
&& @negation
387 for negation
in @negation
388 if negation
[:tag_name] == element
.name
391 for attr
in negation
[:attributes]
392 if element
.attributes
[attr
[0]] =~ attr
[1]
399 for pseudo
in negation
[:pseudo]
400 if pseudo
.call(element
)
410 # If element matched but depends on another element (child,
411 # sibling, etc), apply the dependent matches instead.
412 if matched
&& @depends
413 matches
= @depends.call(element
, first_only
)
415 matches
= matched
? [element
] : nil
418 # If this selector is part of the group, try all the alternative
419 # selectors (unless first_only).
# With first_only set, alternates are consulted only while there is no
# match yet.
420 if !first_only
|| !matches
421 @alternates.each
do |alternate
|
422 break if matches
&& first_only
423 if subset
= alternate
.match(element
, first_only
)
425 matches
.concat subset
438 # select(root) => array
440 # Selects and returns an array with all matching elements, beginning
441 # with one node and traversing through all children depth-first.
442 # Returns an empty array if no match is found.
444 # The root node may be any element in the document, or the document
448 # selector = HTML::Selector.new "input[type=text]"
449 # matches = selector.select(element)
450 # matches.each do |match|
451 # puts "Found text field with name #{match.attributes['name']}"
456 while node
= stack
.pop
457 if node
.tag
? && subset
= match(node
, false)
458 subset
.each
do |match
|
459 matches
<< match
unless matches
.any
? { |item
| item
.equal
?(match
) }
461 elsif children
= node
.children
462 stack
.concat children
.reverse
# Like #select, but stops at the first hit: walks the tree below +root+
# depth-first and returns the first element produced by #match, or +nil+
# when nothing matches.
def select_first(root)
  pending = [root]
  while (node = pending.pop)
    found = node.tag? && match(node, true)
    if found
      return found.first unless found.empty?
    elsif (kids = node.children)
      # Push in reverse so that pop yields the children in document order.
      pending.concat(kids.reverse)
    end
  end
  nil
end
# Returns the next sibling element after +element+, skipping any sibling
# nodes that are not tags (text and the like).
#
# * +element+ -- The element to start from.
# * +name+ -- Optional tag name; when given, siblings with a different
#   name are skipped as well.
#
# Returns the sibling element, or +nil+ if there is none. An element
# without a parent (or whose parent has no children list) has no siblings,
# so +nil+ is returned for it as well.
def next_element(element, name = nil)
  parent = element.parent
  siblings = parent && parent.children
  return nil unless siblings

  seen = false
  siblings.each do |node|
    if node.equal?(element)
      # Everything after this point is a following sibling.
      seen = true
    elsif seen && node.tag? && (name.nil? || node.name == name)
      return node
    end
  end
  nil
end
511 # Creates a simple selector given the statement and array of
512 # substitution values.
514 # Returns a hash with the values +tag_name+, +attributes+,
515 # +pseudo+ (classes) and +negation+.
517 # Called the first time with +can_negate+ true to allow
518 # negation. Called a second time with false since negation
#
# Parameters:
# * +statement+ -- The selector text. Every recognized part is removed
#   from the front of this string in place (sub!), so the caller sees
#   what is left over.
# * +values+ -- Substitution values; a <tt>?</tt> placeholder takes the
#   next one with shift.
# * +can_negate+ -- When false, a <tt>:not(</tt> raises ArgumentError.
#
# As a side effect the recognized parts are appended to @source.
520 def simple_selector(statement
, values
, can_negate
= true)
526 # Element name. (Note that in negation, this can come at
527 # any order, but for simplicity we allow if only first).
528 statement
.sub
!(/^(\*|[[:alpha:]][\w\-]*)/) do |match
|
530 tag_name
= match
.downcase
unless match
== "*"
535 # Get identifier, class, attribute name, pseudo or negation.
537 # Element identifier.
538 next if statement
.sub
!(/^#(\?|[\w\-]+)/) do |match
|
# A non-Regexp identifier is turned into an exact-match regexp.
544 id
= Regexp
.new("^#{Regexp.escape(id.to_s)}$") unless id
.is_a
?(Regexp
)
545 attributes
<< ["id", id
]
550 next if statement
.sub
!(/^\.([\w\-]+)/) do |match
|
552 @source << ".#{class_name}"
# NOTE(review): inside a double-quoted Ruby string "\s" is a literal
# space, not the regexp whitespace class, so class names separated by a
# tab or newline would not match -- confirm whether that is intended.
553 class_name
= Regexp
.new("(^|\s)#{Regexp.escape(class_name)}($|\s)") unless class_name
.is_a
?(Regexp
)
554 attributes
<< ["class", class_name
]
# NOTE(review): the double-quoted value alternative below is "[^*]" (one
# character that is not an asterisk); it looks like a typo for "[^"]*".
# Quoted values still parse through the last alternative, except when they
# contain a "]" -- confirm before changing.
559 next if statement
.sub
!(/^\[\s*([[:alpha:]][\w\-:]*)\s*((?:[~|^$*])?=)?\s*('[^']*'|"[^*]"|[^\]]*)\s*\]/) do |match
|
560 name
, equality
, value
= $1, $2, $3
564 # Handle single and double quotes.
566 if (value
[0] == ?" || value[0] == ?') && value[0] == value[-1]
570 @source << "[#{name}#{equality}'#{value}']"
571 attributes
<< [name
.downcase
.strip
, attribute_match(equality
, value
)]
576 next if statement
.sub
!(/^:root/) do |match
|
577 pseudo
<< lambda
do |element
|
578 element
.parent
.nil? || !element
.parent
.tag
?
584 # Nth-child including last and of-type.
585 next if statement
.sub
!(/^:nth-(last-)?(child|of-type)\((odd|even|(\d+|\?)|(-?\d*|\?)?n([+\-]\d+|\?)?)\)/) do |match
|
586 reverse
= $1 == "last-"
587 of_type
= $2 == "of-type"
588 @source << ":nth-#{$1}#{$2}("
591 pseudo
<< nth_child(2, 1, of_type
, reverse
)
594 pseudo
<< nth_child(2, 2, of_type
, reverse
)
596 when /^(\d+|\?)$/ # b only
597 b
= ($1 == "?" ? values
.shift
: $1).to_i
598 pseudo
<< nth_child(0, b
, of_type
, reverse
)
600 when /^(-?\d*|\?)?n([+\-]\d+|\?)?$/
# A missing coefficient means 1, a bare "-" means -1.
601 a
= ($1 == "?" ? values
.shift
:
602 $1 == "" ? 1 : $1 == "-" ? -1 : $1).to_i
603 b
= ($2 == "?" ? values
.shift
: $2).to_i
604 pseudo
<< nth_child(a
, b
, of_type
, reverse
)
605 @source << (b
>= 0 ? "#{a}n+#{b})" : "#{a}n#{b})")
607 raise ArgumentError
, "Invalid nth-child #{match}"
611 # First/last child (of type).
612 next if statement
.sub
!(/^:(first|last)-(child|of-type)/) do |match
|
613 reverse
= $1 == "last"
614 of_type
= $2 == "of-type"
615 pseudo
<< nth_child(0, 1, of_type
, reverse
)
616 @source << ":#{$1}-#{$2}"
619 # Only child (of type).
620 next if statement
.sub
!(/^:only-(child|of-type)/) do |match
|
621 of_type
= $1 == "of-type"
622 pseudo
<< only_child(of_type
)
623 @source << ":only-#{$1}"
627 # Empty: no child elements or meaningful content (whitespaces
629 next if statement
.sub
!(/^:empty/) do |match
|
630 pseudo
<< lambda
do |element
|
632 for child
in element
.children
633 if child
.tag
? || !child
.content
.strip
.empty
?
643 # Content: match the text content of the element, stripping
644 # leading and trailing spaces.
645 next if statement
.sub
!(/^:content\(\s*(\?|'[^']*'|"[^"]*"|[^)]*)\s*\)/) do |match
|
648 content
= values
.shift
649 elsif (content
[0] == ?" || content[0] == ?') && content[0] == content[-1]
650 content = content[1..-2]
652 @source << ":content('#{content}')"
653 content = Regexp.new("^
#{Regexp.escape(content.to_s)}$") unless content.is_a?(Regexp)
654 pseudo
<< lambda
do |element
|
656 for child
in element
.children
658 text
<< child
.content
661 text
.strip
=~ content
666 # Negation. Create another simple selector to handle it.
667 if statement
.sub
!(/^:not\(\s*/, "")
# NOTE(review): the wording of the message below reads oddly ("are not
# missing feature"); it is runtime text, so it is left untouched here.
668 raise ArgumentError
, "Double negatives are not missing feature" unless can_negate
670 negation
<< simple_selector(statement
, values
, false)
671 raise ArgumentError
, "Negation not closed" unless statement
.sub
!(/^\s*\)/, "")
676 # No match: moving on.
680 # Return hash. The keys are mapped to instance variables.
681 {:tag_name=>tag_name
, :attributes=>attributes
, :pseudo=>pseudo
, :negation=>negation
}
# Create a regular expression to match an attribute value based
# on the equality operator (=, ^=, |=, etc).
#
# * +equality+ -- The operator: <tt>=</tt>, <tt>~=</tt>, <tt>^=</tt>,
#   <tt>$=</tt>, <tt>*=</tt> or <tt>|=</tt>. Anything else (typically
#   +nil+) means a plain existence check.
# * +value+ -- The value to look for. A Regexp is embedded as is; any
#   other object is converted to a string and escaped.
#
# Returns a Regexp. Raises InvalidSelectorError when the operator is not
# one of the above but a non-empty value was supplied.
def attribute_match(equality, value)
  regexp = value.is_a?(Regexp) ? value : Regexp.escape(value.to_s)
  case equality
  when "="
    # Match the attribute value in full
    Regexp.new("^#{regexp}$")
  when "~="
    # Match a whitespace-separated word within the attribute value.
    # The backslash is doubled on purpose: a bare \s inside a
    # double-quoted string is a literal space, which would miss words
    # separated by tabs or newlines.
    Regexp.new("(^|\\s)#{regexp}($|\\s)")
  when "^="
    # Match the beginning of the attribute value
    Regexp.new("^#{regexp}")
  when "$="
    # Match the end of the attribute value
    Regexp.new("#{regexp}$")
  when "*="
    # Match substring of the attribute value
    regexp.is_a?(Regexp) ? regexp : Regexp.new(regexp)
  when "|="
    # Match the first whitespace-separated item of the attribute value
    Regexp.new("^#{regexp}($|\\s)")
  else
    # to_s keeps this check from failing on values without #empty?
    # (a Regexp, for instance).
    raise InvalidSelectorError, "Invalid operation/value" unless value.to_s.empty?
    # Match all attributes values (existence check)
    //
  end
end
716 # Returns a lambda that can match an element against the nth-child
717 # pseudo class, given the following arguments:
718 # * +a+ -- Value of a part.
719 # * +b+ -- Value of b part.
720 # * +of_type+ -- True to test only elements of this type (of-type).
721 # * +reverse+ -- True to count in reverse order (last-).
#
# The two early returns below hand back a lambda that rejects every
# element, for combinations of +a+ and +b+ that can never select anything.
# NOTE(review): the later checks refer to +element+, so they presumably
# run inside the returned lambda -- confirm against the complete file.
722 def nth_child(a
, b
, of_type
, reverse
)
723 # a = 0 means select at index b, if b = 0 nothing selected
724 return lambda
{ |element
| false } if a
== 0 && b
== 0
725 # a < 0 and b < 0 will never match against an index
726 return lambda
{ |element
| false } if a
< 0 && b
< 0
727 b
= a
+ b
+ 1 if b
< 0 # b < 0 just picks last element from each group
728 b
-= 1 unless b
== 0 # b == 0 is same as b == 1, otherwise zero based
730 # Element must be inside parent element.
731 return false unless element
.parent
&& element
.parent
.tag
?
733 # Get siblings, reverse if counting from last.
734 siblings
= element
.parent
.children
735 siblings
= siblings
.reverse
if reverse
736 # Match element name if of-type, otherwise ignore name.
737 name
= of_type
? element
.name
: nil
739 for child
in siblings
740 # Skip text nodes/comments.
741 if child
.tag
? && (name
== nil || child
.name
== name
)
743 # Shortcut when a == 0 no need to go past count
745 found
= child
.equal
?(element
)
749 # Only look for first b elements
751 if child
.equal
?(element
)
752 found
= (index
% a
) == 0
756 # Otherwise, break if child found and count == an+b
757 if child
.equal
?(element
)
758 found
= (index
% a
) == b
# Creates an only-child lambda. Pass +of_type+ as true to only look at
# sibling elements with the same name as the element being tested.
#
# The returned lambda takes an element and returns true when its parent
# holds no other element (of that name, when +of_type+ is set). It returns
# false for an element that has no parent or whose parent is not a tag.
def only_child(of_type)
  lambda do |element|
    parent = element.parent
    # Element must be inside parent element.
    return false unless parent && parent.tag?

    name = of_type ? element.name : nil
    # Text nodes/comments are skipped; any other qualifying element that
    # is not the element itself means it is not an only child.
    parent.children.none? do |child|
      child.tag? && (name.nil? || child.name == name) && !child.equal?(element)
    end
  end
end
# Builds the dependent selector (sibling, child, descendant) out of what
# is left of +statement+, handing over the substitution +values+.
#
# Shared by the four combinator branches. Its one piece of logic concerns
# comma-separated alternates: any the new selector collected are moved up
# into this selector's own group, so they all live on the top selector.
#
# Returns the new selector.
def next_selector(statement, values)
  dependent = Selector.new(statement, values)
  hoisted = dependent.instance_variable_get(:@alternates)
  if hoisted
    dependent.instance_variable_set(:@alternates, [])
    @alternates.concat(hoisted)
  end
  dependent
end
# Shorthand for Selector.new: takes the selector expression followed by
# any substitution values and returns the new selector.
# See HTML::Selector.new
def self.selector(statement, *values)
  Selector.new(statement, *values)
end
# Builds an HTML::Selector from the expression and substitution values,
# then returns what that selector selects starting from this node.
def select(selector, *values)
  HTML::Selector.new(selector, values).select(self)
end