/
scrub.rb
106 lines (91 loc) · 4.07 KB
/
scrub.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#encoding: US-ASCII
require 'cgi'
require 'crass'
module Loofah
module HTML5 # :nodoc:
module Scrub
# Characters deleted from an unescaped URI attribute value before its
# protocol is inspected (see scrub_attributes): the backtick plus the code
# point ranges U+0000-U+0020, U+007F and U+0080-U+0101.
CONTROL_CHARACTERS = /[`\u0000-\u0020\u007f\u0080-\u0101]/
# Whole-string pattern for a single whitespace-separated token of a shorthand
# CSS property value (see scrub_css): a "#" followed by lowercase hex digits,
# an rgb(...) fragment, or a number of at most two digits on either side of
# the decimal point with an optional unit, "%", "," or ")" suffix.
CSS_KEYWORDISH = /\A(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|-?\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)\z/
# Semicolon node that scrub_css appends after every property it keeps.
# The same object is inserted into every sanitized tree, so it is frozen to
# guarantee that no tree consumer can alter it for subsequent calls.
CRASS_SEMICOLON = {:node => :semicolon, :raw => ";".freeze}.freeze
class << self
# Reports whether +element_name+ is a member of
# WhiteList::ALLOWED_ELEMENTS_WITH_LIBXML2.
def allowed_element?(element_name)
  permitted = ::Loofah::HTML5::WhiteList::ALLOWED_ELEMENTS_WITH_LIBXML2
  permitted.include?(element_name)
end
# Alternative implementation of the html5lib attribute scrubbing algorithm.
#
# First pass, per attribute of +node+ (namespaced attributes are looked up as
# "prefix:name"):
# * data-* names are skipped without further checks;
# * names missing from WhiteList::ALLOWED_ATTRIBUTES are removed;
# * URI-valued attributes are removed when their protocol is not in
#   WhiteList::ALLOWED_PROTOCOLS, or when they are data: URIs whose media
#   type is not in WhiteList::ALLOWED_URI_DATA_MEDIATYPES;
# * attributes listed in WhiteList::SVG_ATTR_VAL_ALLOWS_REF have every
#   url(...) reference that does not start with "#" replaced by a space;
# * xlink:href is removed from WhiteList::SVG_ALLOW_LOCAL_HREF elements when
#   its value does not start with "#".
#
# Afterwards the style attribute is passed through scrub_css_attribute, and
# every attribute whose value holds nothing but whitespace is removed.
def scrub_attributes(node)
  node.attribute_nodes.each do |attr_node|
    namespace = attr_node.namespace
    attr_name = namespace ? "#{namespace.prefix}:#{attr_node.node_name}" : attr_node.node_name

    next if attr_name =~ /\Adata-[\w-]+\z/

    unless WhiteList::ALLOWED_ATTRIBUTES.include?(attr_name)
      attr_node.remove
      next
    end

    if WhiteList::ATTR_VAL_IS_URI.include?(attr_name)
      # this section lifted nearly verbatim from HTML5 sanitization
      val_unescaped = CGI.unescapeHTML(attr_node.value).gsub(CONTROL_CHARACTERS, '').downcase
      uri_parts = val_unescaped.split(WhiteList::PROTOCOL_SEPARATOR)
      scheme = uri_parts[0]
      has_scheme = val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/

      if has_scheme && !WhiteList::ALLOWED_PROTOCOLS.include?(scheme)
        attr_node.remove
        next
      end

      if scheme == 'data'
        # permit only allowed data mediatypes
        mediatype = uri_parts[1]
        mediatype &&= mediatype.split(';').first
        if mediatype && !WhiteList::ALLOWED_URI_DATA_MEDIATYPES.include?(mediatype)
          attr_node.remove
          next
        end
      end
    end

    if WhiteList::SVG_ATTR_VAL_ALLOWS_REF.include?(attr_name) && attr_node.value
      attr_node.value = attr_node.value.gsub(/url\s*\(\s*[^#\s][^)]+?\)/m, ' ')
    end

    non_local_href = WhiteList::SVG_ALLOW_LOCAL_HREF.include?(node.name) &&
                     attr_name == 'xlink:href' &&
                     attr_node.value =~ /^\s*[^#\s].*/m
    attr_node.remove if non_local_href
  end

  scrub_css_attribute(node)

  # Drop whatever is left with a blank value.
  node.attribute_nodes.each do |attr_node|
    next if attr_node.value =~ /[^[:space:]]/
    node.remove_attribute(attr_node.name)
  end
end
# Replaces the value of +node+'s style attribute with the result of running
# it through scrub_css. Does nothing (and returns nil) when +node+ has no
# style attribute.
def scrub_css_attribute(node)
  style_attr = node.attributes['style']
  return unless style_attr

  style_attr.value = scrub_css(style_attr.value)
end
# Sanitizes the CSS declaration string +style+ and returns the cleaned text.
#
# +style+ is parsed with Crass.parse_properties; only :property nodes are
# considered. A property is discarded when any of its children is a :url or
# :bad_url node, or a :function node whose downcased name is not in
# WhiteList::ALLOWED_CSS_FUNCTIONS. Of the rest:
# * properties named in WhiteList::ALLOWED_CSS_PROPERTIES or
#   WhiteList::ALLOWED_SVG_PROPERTIES are kept unchanged;
# * properties whose name up to the first "-" is in
#   WhiteList::SHORTHAND_CSS_PROPERTIES are rebuilt from just those
#   whitespace-separated value tokens that are in
#   WhiteList::ALLOWED_CSS_KEYWORDS or match CSS_KEYWORDISH, and dropped when
#   no token survives;
# * everything else is dropped.
# Each kept property is followed by CRASS_SEMICOLON, and the resulting node
# list is serialized with Crass::Parser.stringify.
def scrub_css(style)
  sanitized_tree = []

  Crass.parse_properties(style).each do |property|
    next unless property[:node] == :property

    rejected = property[:children].any? do |child|
      case child[:node]
      when :url, :bad_url
        true
      when :function
        !WhiteList::ALLOWED_CSS_FUNCTIONS.include?(child[:name].downcase)
      else
        false
      end
    end
    next if rejected

    name = property[:name].downcase
    if WhiteList::ALLOWED_CSS_PROPERTIES.include?(name) || WhiteList::ALLOWED_SVG_PROPERTIES.include?(name)
      sanitized_tree.push(property, CRASS_SEMICOLON)
    elsif WhiteList::SHORTHAND_CSS_PROPERTIES.include?(name.split('-').first)
      kept_tokens = property[:value].split.select do |keyword|
        WhiteList::ALLOWED_CSS_KEYWORDS.include?(keyword) || keyword =~ CSS_KEYWORDISH
      end
      next if kept_tokens.empty?

      # Re-parse the filtered declaration so the tree holds a proper node.
      propstring = format("%s:%s", name, kept_tokens.join(" "))
      sanitized_tree.push(Crass.parse_properties(propstring).first, CRASS_SEMICOLON)
    end
  end

  Crass::Parser.stringify(sanitized_tree)
end
end
end
end
end