Froze rails gems
[depot.git] / vendor / rails / actionpack / lib / action_controller / vendor / html-scanner / html / sanitizer.rb
1 module HTML
class Sanitizer
  # Entry point. Text that cannot contain markup (see #sanitizeable?) is handed
  # back as-is; anything else is tokenized, each token is offered to
  # #process_node, and the pieces that were kept are joined into one String.
  def sanitize(text, options = {})
    if sanitizeable?(text)
      tokenize(text, options).join
    else
      text
    end
  end

  # Returns true only when +text+ is non-nil, non-empty and contains a "<".
  def sanitizeable?(text)
    return false if text.nil? || text.empty?
    text.index("<") ? true : false
  end

  protected
    # Walks every token produced by HTML::Tokenizer, parses it into a node and
    # lets #process_node decide what (if anything) to append to the output array.
    # Returns that array.
    def tokenize(text, options)
      scanner = HTML::Tokenizer.new(text)
      kept = []
      while (token = scanner.next)
        # NOTE(review): the nil/0/0/false arguments are presumably parent, line,
        # position and non-strict parsing - confirm against Node.parse.
        parsed = Node.parse(nil, 0, 0, token, false)
        process_node(parsed, kept, options)
      end
      kept
    end

    # Default policy: keep every node verbatim. Subclasses override this hook.
    def process_node(node, result, options)
      result << node.to_s
    end
end
27
class FullSanitizer < Sanitizer
  # Removes all markup from +text+, keeping only the text nodes, then strips
  # HTML comments and repeats until the output stops changing.
  #
  # text    - the String to clean (nil and "" are returned unchanged).
  # options - passed through to the tokenizer / #process_node.
  #
  # Returns the cleaned String. The caller's +text+ is never modified.
  def sanitize(text, options = {})
    result = super
    # strip any comments, and if they have a newline at the end (ie. line with
    # only a comment) strip that too.
    #
    # When +text+ is not sanitizeable, super returns the caller's own String,
    # so a destructive gsub! here would edit the argument in place (or raise on
    # a frozen String). Use the non-destructive form, and only when a comment
    # opener is actually present.
    if result && result.index("<!--")
      result = result.gsub(/<!--(.*?)-->[\n]?/m, "")
    end
    # Recurse - handle all dirty nested tags
    result == text ? result : sanitize(result, options)
  end

  # Keeps a node only when its class is exactly HTML::Text; every other node
  # (including subclasses of HTML::Text) is dropped.
  def process_node(node, result, options)
    result << node.to_s if node.class == HTML::Text
  end
end
42
class LinkSanitizer < FullSanitizer
  # Names of the tags this sanitizer removes.
  cattr_accessor :included_tags, :instance_writer => false
  self.included_tags = Set.new(%w(a href))

  # True only when +text+ is non-nil, non-empty, contains "<a" or "<href",
  # and also contains a ">".
  def sanitizeable?(text)
    return false if text.nil? || text.empty?
    opener = text.index("<a") || text.index("<href")
    (opener && text.index(">")) ? true : false
  end

  protected
    # Drops HTML::Tag nodes whose name is listed in included_tags and keeps
    # everything else verbatim.
    def process_node(node, result, options)
      return if node.is_a?(HTML::Tag) && included_tags.include?(node.name)
      result << node.to_s
    end
end
56
class WhiteListSanitizer < Sanitizer
  [:protocol_separator, :uri_attributes, :allowed_attributes, :allowed_tags, :allowed_protocols, :bad_tags,
   :allowed_css_properties, :allowed_css_keywords, :shorthand_css_properties].each do |attr|
    class_inheritable_accessor attr, :instance_writer => false
  end

  # A regular expression of the valid characters used to separate protocols like
  # the ':' in 'http://foo.com'. Besides the literal colon it recognises the
  # decimal entity (&#58), the hex entity (&#x3a) and the URL-encoded form (%3A),
  # all case-insensitively. The (&#x70) alternative is retained only for
  # backward compatibility.
  self.protocol_separator = /:|(&#0*58)|(&#x70)|(&#x0*3a)|(%|&#37;)3A/i

  # Specifies a Set of HTML attributes that can have URIs.
  self.uri_attributes = Set.new(%w(href src cite action longdesc xlink:href lowsrc))

  # Specifies a Set of 'bad' tags that the #sanitize helper will remove completely, as opposed
  # to just escaping harmless tags like &lt;font&gt;
  self.bad_tags = Set.new(%w(script))

  # Specifies the default Set of tags that the #sanitize helper will allow unscathed.
  self.allowed_tags = Set.new(%w(strong em b i p code pre tt samp kbd var sub
    sup dfn cite big small address hr br div span h1 h2 h3 h4 h5 h6 ul ol li dt dd abbr
    acronym a img blockquote del ins))

  # Specifies the default Set of html attributes that the #sanitize helper will leave
  # in the allowed tag.
  self.allowed_attributes = Set.new(%w(href src width height alt cite datetime title class name xml:lang abbr))

  # Specifies the default Set of protocols that are accepted in the value of a URI attribute.
  self.allowed_protocols = Set.new(%w(ed2k ftp http https irc mailto news gopher nntp telnet webcal xmpp callto
    feed svn urn aim rsync tag ssh sftp rtsp afs))

  # Specifies the default Set of acceptable css properties that #sanitize and #sanitize_css will accept.
  self.allowed_css_properties = Set.new(%w(azimuth background-color border-bottom-color border-collapse
    border-color border-left-color border-right-color border-top-color clear color cursor direction display
    elevation float font font-family font-size font-style font-variant font-weight height letter-spacing line-height
    overflow pause pause-after pause-before pitch pitch-range richness speak speak-header speak-numeral speak-punctuation
    speech-rate stress text-align text-decoration text-indent unicode-bidi vertical-align voice-family volume white-space
    width))

  # Specifies the default Set of acceptable css keywords that #sanitize and #sanitize_css will accept.
  self.allowed_css_keywords = Set.new(%w(auto aqua black block blue bold both bottom brown center
    collapse dashed dotted fuchsia gray green !important italic left lime maroon medium none navy normal
    nowrap olive pointer purple red right solid silver teal top transparent underline white yellow))

  # Specifies the default Set of allowed shorthand css properties for the #sanitize and #sanitize_css helpers.
  self.shorthand_css_properties = Set.new(%w(background border margin padding))

  # Sanitizes a block of css code. Used by #sanitize when it comes across a style attribute.
  #
  # style - the raw value of a style attribute (anything responding to to_s).
  #
  # Returns a String holding only the declarations that passed the whitelist,
  # or '' when the style as a whole fails the syntax gauntlet.
  def sanitize_css(style)
    # disallow urls
    style = style.to_s.gsub(/url\s*\(\s*[^\s)]+?\s*\)\s*/, ' ')

    # gauntlet - anchored with \A and \z so the WHOLE value has to match.
    # ^ and $ match at every line break, which would let a multi-line style
    # through as soon as any single line looked harmless.
    if style !~ /\A([:,;#%.\sa-zA-Z0-9!]|\w-\w|\'[\s\w]+\'|\"[\s\w]+\"|\([\d,\s]+\))*\z/ ||
        style !~ /\A(\s*[-\w]+\s*:\s*[^:;]*(;|$)\s*)*\z/
      return ''
    end

    clean = []
    style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop,val|
      if allowed_css_properties.include?(prop.downcase)
        clean << prop + ': ' + val + ';'
      elsif shorthand_css_properties.include?(prop.split('-')[0].downcase)
        # Shorthand properties are kept only if every whitespace-separated
        # keyword is whitelisted or looks like a colour / length value.
        unless val.split().any? do |keyword|
          !allowed_css_keywords.include?(keyword) &&
            keyword !~ /^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$/
        end
          clean << prop + ': ' + val + ';'
        end
      end
    end
    clean.join(' ')
  end

  protected
    # Seeds the options with the open-tag stack and the default whitelists,
    # then delegates to Sanitizer#tokenize.
    # Note: this writes :parent, :attributes and :tags into the caller's hash.
    def tokenize(text, options)
      options[:parent] = []
      options[:attributes] ||= allowed_attributes
      options[:tags]       ||= allowed_tags
      super
    end

    # Appends to +result+ the node itself (allowed tag, after its attributes
    # were cleaned), nil (disallowed tag, or text directly inside a bad tag),
    # or the node's text with every "<" escaped.
    def process_node(node, result, options)
      result << case node
        when HTML::Tag
          # Maintain the stack of open tag names; the innermost is first.
          if node.closing == :close
            options[:parent].shift
          else
            options[:parent].unshift node.name
          end

          process_attributes_for node, options

          options[:tags].include?(node.name) ? node : nil
        else
          bad_tags.include?(options[:parent].first) ? nil : node.to_s.gsub(/</, "&lt;")
      end
    end

    # Deletes attributes that are not whitelisted or that carry a disallowed
    # protocol; the remaining values are run through #sanitize_css (for style)
    # or HTML-escaped after being unescaped once.
    def process_attributes_for(node, options)
      return unless node.attributes
      node.attributes.keys.each do |attr_name|
        value = node.attributes[attr_name].to_s

        if !options[:attributes].include?(attr_name) || contains_bad_protocols?(attr_name, value)
          node.attributes.delete(attr_name)
        else
          node.attributes[attr_name] = attr_name == 'style' ? sanitize_css(value) : CGI::escapeHTML(CGI::unescapeHTML(value))
        end
      end
    end

    # Returns a truthy value when +attr_name+ is a URI attribute and +value+
    # appears to carry a protocol that is not in allowed_protocols.
    #
    # The detection pattern covers the literal colon plus its decimal, hex and
    # URL-encoded spellings in any letter case. The extracted protocol is
    # downcased and stripped before the lookup so that "HTTP" or " http" are
    # judged like "http"; to_s guards against a value that splits to nothing.
    def contains_bad_protocols?(attr_name, value)
      uri_attributes.include?(attr_name) &&
        (value =~ /(^[^\/:]*):|(&#0*58)|(&#x70)|(&#x0*3a)|(%|&#37;)3A/i &&
          !allowed_protocols.include?(value.split(protocol_separator).first.to_s.downcase.strip))
    end
end
173 end