5 # A simple HTML tokenizer. It breaks a stream of text into tokens, where each
6 # token is a string. Each string represents either "text" or an HTML element.
8 # This currently assumes valid XHTML, which means no free < or > characters.
12 # tokenizer = HTML::Tokenizer.new(text)
13 # while token = tokenizer.next
16 class Tokenizer
#:nodoc:
18 # The current (byte) position in the text
21 # The current line number
24 # Create a new Tokenizer for the given text.
26 @scanner = StringScanner
.new(text
)
32 # Return the next token in the sequence, or +nil+ if there are no more tokens in the stream.
35 return nil if @scanner.eos
?
36 @position = @scanner.pos
38 if @scanner.check(/<\S/)
39 update_current_line(scan_tag
)
41 update_current_line(scan_text
)
47 # Treat the text at the current position as a tag, and scan it. Supports
48 # comments, doctype tags, and regular tags, and ignores less-than and
49 # greater-than characters within quoted strings.
52 if @scanner.scan(/!--/) # comment
53 tag
<< @scanner.matched
54 tag
<< (@scanner.scan_until(/--\s*>/) || @scanner.scan_until(/\Z/))
55 elsif @scanner.scan(/!\[CDATA\[/)
56 tag
<< @scanner.matched
57 tag
<< (@scanner.scan_until(/\]\]>/) || @scanner.scan_until(/\Z/))
58 elsif @scanner.scan(/!/) # doctype
59 tag
<< @scanner.matched
60 tag
<< consume_quoted_regions
62 tag
<< consume_quoted_regions
67 # Scan all text up to the next < character and return it.
69 "#{@scanner.getch}#{@scanner.scan(/[^<]*/)}"
72 # Counts the number of newlines in the text and updates the current line number accordingly.
74 def update_current_line(text
)
75 text
.scan(/\r?\n/) { @current_line += 1 }
78 # Skips over quoted strings, so that less-than and greater-than characters
79 # within the strings are ignored.
80 def consume_quoted_regions
83 match
= @scanner.scan_until(/['"<>]/) or break
85 delim
= @scanner.matched
92 break if delim
== "<" || delim
== ">"
94 # consume the quoted region
95 while match
= @scanner.scan_until(/[\\#{delim}]/)
97 break if @scanner.matched
== delim
98 text
<< @scanner.getch
# skip the escaped character