vendor/rails/actionpack/lib/action_controller/vendor/html-scanner/html/tokenizer.rb

   1 require 'strscan'
   2
   3 module HTML #:nodoc:
   4
   5   # A simple HTML tokenizer. It simply breaks a stream of text into tokens, where each
   6   # token is a string. Each string represents either "text", or an HTML element.
   7   #
   8   # This currently assumes valid XHTML, which means no free < or > characters.
   9   #
  10   # Usage:
  11   #
  12   #   tokenizer = HTML::Tokenizer.new(text)
  13   #   while token = tokenizer.next
  14   #     p token
  15   #   end
  16   class Tokenizer #:nodoc:
  17
  18     # The current (byte) position in the text
  19     attr_reader :position
  20
  21     # The current line number
  22     attr_reader :line
  23
  24     # Create a new Tokenizer for the given text.
  25     def initialize(text)
  26       @scanner = StringScanner.new(text)
  27       @position = 0
  28       @line = 0
  29       @current_line = 1
  30     end
  31
  32     # Return the next token in the sequence, or +nil+ if there are no more tokens in
  33     # the stream.
  34     def next
  35       return nil if @scanner.eos?
  36       @position = @scanner.pos
  37       @line = @current_line
  38       if @scanner.check(/<\S/)
  39         update_current_line(scan_tag)
  40       else
  41         update_current_line(scan_text)
  42       end
  43     end
  44
  45     private
  46
  47       # Treat the text at the current position as a tag, and scan it. Supports
  48       # comments, doctype tags, and regular tags, and ignores less-than and
  49       # greater-than characters within quoted strings.
  50       def scan_tag
  51         tag = @scanner.getch
  52         if @scanner.scan(/!--/) # comment
  53           tag << @scanner.matched
  54           tag << (@scanner.scan_until(/--\s*>/) || @scanner.scan_until(/\Z/))
  55         elsif @scanner.scan(/!\[CDATA\[/)
  56           tag << @scanner.matched
  57           tag << (@scanner.scan_until(/\]\]>/) || @scanner.scan_until(/\Z/))
  58         elsif @scanner.scan(/!/) # doctype
  59           tag << @scanner.matched
  60           tag << consume_quoted_regions
  61         else
  62           tag << consume_quoted_regions
  63         end
  64         tag
  65       end
  66
  67       # Scan all text up to the next < character and return it.
  68       def scan_text
  69         "#{@scanner.getch}#{@scanner.scan(/[^<]*/)}"
  70       end
  71
  72       # Counts the number of newlines in the text and updates the current line
  73       # accordingly.
  74       def update_current_line(text)
  75         text.scan(/\r?\n/) { @current_line += 1 }
  76       end
  77
  78       # Skips over quoted strings, so that less-than and greater-than characters
  79       # within the strings are ignored.
  80       def consume_quoted_regions
  81         text = ""
  82         loop do
  83           match = @scanner.scan_until(/['"<>]/) or break
  84
  85           delim = @scanner.matched
  86           if delim == "<"
  87             match = match.chop
  88             @scanner.pos -= 1
  89           end
  90
  91           text << match
  92           break if delim == "<" || delim == ">"
  93
  94           # consume the quoted region
  95           while match = @scanner.scan_until(/[\\#{delim}]/)
  96             text << match
  97             break if @scanner.matched == delim
  98             text << @scanner.getch # skip the escaped character
  99           end
 100         end
 101         text
 102       end
 103   end
 104
 105 end