Froze rails gems
[depot.git] / vendor / rails / actionpack / lib / action_controller / vendor / html-scanner / html / tokenizer.rb
1 require 'strscan'
2
3 module HTML #:nodoc:
4
5 # A simple HTML tokenizer. It simply breaks a stream of text into tokens, where each
6 # token is a string. Each string represents either "text", or an HTML element.
7 #
8 # This currently assumes valid XHTML, which means no free < or > characters.
9 #
10 # Usage:
11 #
12 # tokenizer = HTML::Tokenizer.new(text)
13 # while token = tokenizer.next
14 # p token
15 # end
class Tokenizer #:nodoc:

  # The (byte) offset in the text at which the most recently returned token
  # starts.
  attr_reader :position

  # The line number on which the most recently returned token starts
  # (1-based; stays 0 until #next has returned a token).
  attr_reader :line

  # Create a new Tokenizer for the given text.
  def initialize(text)
    @scanner = StringScanner.new(text)
    @position = 0
    @line = 0
    @current_line = 1
  end

  # Return the next token in the sequence, or +nil+ if there are no more
  # tokens in the stream. As a side effect, +position+ and +line+ are updated
  # to describe where the returned token begins.
  def next
    return nil if @scanner.eos?
    @position = @scanner.pos
    @line = @current_line
    # Only a "<" immediately followed by a non-space character opens a tag;
    # anything else (including a lone "<") is treated as text.
    token = @scanner.check(/<\S/) ? scan_tag : scan_text
    update_current_line(token)
  end

  private

  # Treat the text at the current position as a tag, and scan it. Supports
  # comments, CDATA sections, doctype tags, and regular tags, and ignores
  # less-than and greater-than characters within quoted strings.
  #
  # Returns the tag text, starting with the leading "<".
  def scan_tag
    tag = @scanner.getch
    if @scanner.scan(/!--/) # comment
      tag << @scanner.matched
      # An unterminated comment runs to the end of the input. Note that \Z
      # stops before a final trailing newline, which is then returned by a
      # later call as its own text token.
      tag << (@scanner.scan_until(/--\s*>/) || @scanner.scan_until(/\Z/))
    elsif @scanner.scan(/!\[CDATA\[/)
      tag << @scanner.matched
      # Same end-of-input fallback as for comments.
      tag << (@scanner.scan_until(/\]\]>/) || @scanner.scan_until(/\Z/))
    elsif @scanner.scan(/!/) # doctype
      tag << @scanner.matched
      tag << consume_quoted_regions
    else
      tag << consume_quoted_regions
    end
    tag
  end

  # Scan all text up to the next < character and return it. The first
  # character is always consumed, even if it is itself a "<" (a "<" that did
  # not qualify as the start of a tag).
  def scan_text
    "#{@scanner.getch}#{@scanner.scan(/[^<]*/)}"
  end

  # Counts the number of newlines in the text and advances the current line
  # accordingly. Returns +text+ so that #next can hand it straight back to
  # the caller.
  def update_current_line(text)
    # Every \r?\n match contains exactly one "\n", so counting "\n" is enough.
    @current_line += text.count("\n")
    text
  end

  # Scans up to and including the ">" that closes the current tag, skipping
  # over quoted strings so that less-than and greater-than characters within
  # them are ignored. Scanning also stops (without consuming it) at an
  # unquoted "<", and gives up at the point where no further quote or angle
  # bracket can be found.
  #
  # Returns the text that was consumed.
  def consume_quoted_regions
    text = ""
    loop do
      match = @scanner.scan_until(/['"<>]/)
      break unless match

      delim = @scanner.matched
      if delim == "<"
        # An unquoted "<" starts the next token: give it back to the scanner.
        match = match.chop
        @scanner.pos -= 1
      end

      text << match
      break if delim == "<" || delim == ">"

      # consume the quoted region, up to the matching unescaped quote
      closer = /[\\#{delim}]/
      while match = @scanner.scan_until(closer)
        text << match
        break if @scanner.matched == delim
        # A backslash at the very end of the input has nothing left to
        # escape; getch would return nil here and text << nil would raise.
        break if @scanner.eos?
        text << @scanner.getch # skip the escaped character
      end
    end
    text
  end
end
104
105 end