Froze rails gems
[depot.git] / vendor / rails / actionpack / lib / action_controller / vendor / html-scanner / html / tokenizer.rb
diff --git a/vendor/rails/actionpack/lib/action_controller/vendor/html-scanner/html/tokenizer.rb b/vendor/rails/actionpack/lib/action_controller/vendor/html-scanner/html/tokenizer.rb
new file mode 100644 (file)
index 0000000..602411e
--- /dev/null
@@ -0,0 +1,105 @@
+require 'strscan'
+
+module HTML #:nodoc:
+  
+  # A simple HTML tokenizer. It simply breaks a stream of text into tokens, where each
+  # token is a string. Each string represents either "text", or an HTML element.
+  #
+  # This currently assumes valid XHTML, which means no free < or > characters.
+  #
+  # Usage:
+  #
+  #   tokenizer = HTML::Tokenizer.new(text)
+  #   while token = tokenizer.next
+  #     p token
+  #   end
+  class Tokenizer #:nodoc:
+    
+    # The current (byte) position in the text
+    attr_reader :position
+    
+    # The current line number
+    attr_reader :line
+    
+    # Create a new Tokenizer for the given text.
+    def initialize(text)
+      @scanner = StringScanner.new(text)
+      @position = 0
+      @line = 0
+      @current_line = 1
+    end
+
+    # Return the next token in the sequence, or +nil+ if there are no more tokens in
+    # the stream.
+    def next
+      return nil if @scanner.eos?
+      @position = @scanner.pos
+      @line = @current_line
+      if @scanner.check(/<\S/)
+        update_current_line(scan_tag)
+      else
+        update_current_line(scan_text)
+      end
+    end
+  
+    private
+
+      # Treat the text at the current position as a tag, and scan it. Supports
+      # comments, doctype tags, and regular tags, and ignores less-than and
+      # greater-than characters within quoted strings.
+      def scan_tag
+        tag = @scanner.getch
+        if @scanner.scan(/!--/) # comment
+          tag << @scanner.matched
+          tag << (@scanner.scan_until(/--\s*>/) || @scanner.scan_until(/\Z/))
+        elsif @scanner.scan(/!\[CDATA\[/)
+          tag << @scanner.matched
+          tag << (@scanner.scan_until(/\]\]>/) || @scanner.scan_until(/\Z/))
+        elsif @scanner.scan(/!/) # doctype
+          tag << @scanner.matched
+          tag << consume_quoted_regions
+        else
+          tag << consume_quoted_regions
+        end
+        tag
+      end
+
+      # Scan all text up to the next < character and return it.
+      def scan_text
+        "#{@scanner.getch}#{@scanner.scan(/[^<]*/)}"
+      end
+      
+      # Counts the number of newlines in the text and updates the current line
+      # accordingly.
+      def update_current_line(text)
+        text.scan(/\r?\n/) { @current_line += 1 }
+      end
+      
+      # Skips over quoted strings, so that less-than and greater-than characters
+      # within the strings are ignored.
+      def consume_quoted_regions
+        text = ""
+        loop do
+          match = @scanner.scan_until(/['"<>]/) or break
+
+          delim = @scanner.matched
+          if delim == "<"
+            match = match.chop
+            @scanner.pos -= 1
+          end
+
+          text << match
+          break if delim == "<" || delim == ">"
+
+          # consume the quoted region
+          while match = @scanner.scan_until(/[\\#{delim}]/)
+            text << match
+            break if @scanner.matched == delim
+            text << @scanner.getch # skip the escaped character
+          end
+        end
+        text
+      end
+  end
+  
+end