/
scrubber.rb
134 lines (121 loc) · 4.2 KB
/
scrubber.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
# frozen_string_literal: true
module Loofah
#
# Error raised when Loofah could not find an appropriate scrubber.
# It is a RuntimeError, so a plain +rescue+ will catch it.
#
class ScrubberNotFound < RuntimeError
end
#
# A Scrubber wraps up a block (or method) that is run on an HTML node (element):
#
#   # change all <span> tags to <div> tags
#   span2div = Loofah::Scrubber.new do |node|
#     node.name = "div" if node.name == "span"
#   end
#
# Alternatively, this scrubber could have been implemented as:
#
#   class Span2Div < Loofah::Scrubber
#     def scrub(node)
#       node.name = "div" if node.name == "span"
#     end
#   end
#   span2div = Span2Div.new
#
# This can then be run on a document:
#
#   Loofah.fragment("<span>foo</span><p>bar</p>").scrub!(span2div).to_s
#   # => "<div>foo</div><p>bar</p>"
#
# Scrubbers can be run on a document in either a top-down traversal (the
# default) or bottom-up. Top-down scrubbers can optionally return
# Scrubber::STOP to terminate the traversal of a subtree.
#
class Scrubber
  # Top-down Scrubbers may return CONTINUE to indicate that the subtree should be traversed.
  CONTINUE = Object.new.freeze

  # Top-down Scrubbers may return STOP to indicate that the subtree should not be traversed.
  STOP = Object.new.freeze

  # The traversal direction chosen when the scrubber was initialized:
  # :top_down (the default) or :bottom_up.
  attr_reader :direction

  # The block passed to +new+, or nil when none was given. When no
  # block is passed, the +scrub+ method is assumed to have been
  # implemented by a subclass.
  attr_reader :block

  #
  #  Options may include
  #    :direction => :top_down (the default)
  #  or
  #    :direction => :bottom_up
  #
  #  Any other :direction value raises ArgumentError.
  #
  #  For top_down traversals, if the block returns
  #  Loofah::Scrubber::STOP, then the traversal will be terminated
  #  for the current node's subtree.
  #
  #  Alternatively, a Scrubber may inherit from Loofah::Scrubber,
  #  and implement +scrub+, which is slightly faster than using a
  #  block.
  #
  def initialize(options = {}, &block)
    direction = options[:direction] || :top_down
    unless %i[top_down bottom_up].include?(direction)
      raise ArgumentError, "direction #{direction} must be one of :top_down or :bottom_up"
    end

    @direction = direction
    @block = block
  end

  #
  #  Calling +traverse+ will cause the document to be traversed by
  #  either the block passed to the initializer or the +scrub+
  #  method, in the direction specified at +new+ time.
  #
  def traverse(node)
    if direction == :bottom_up
      traverse_conditionally_bottom_up(node)
    else
      traverse_conditionally_top_down(node)
    end
  end

  #
  #  When +new+ is not passed a block, the class may implement
  #  +scrub+, which will be called for each document node.
  #
  #  This default implementation always raises ScrubberNotFound.
  #
  def scrub(node)
    raise ScrubberNotFound, "No scrub method has been defined on #{self.class}"
  end

  #
  #  Treats +attribute+ on +node+ as a whitespace-separated list of
  #  tokens and makes sure +value+ is one of them.
  #
  #  If the attribute is not set, it is set to +value+.
  #  If the attribute is set, its existing tokens are kept and
  #  +value+ is appended only when it is not already present.
  #
  def append_attribute(node, attribute, value)
    current_value = node.get_attribute(attribute) || ""
    # Splitting on a single space uses awk-style semantics: leading and
    # trailing whitespace is ignored, so a value such as " nofollow" does
    # not produce an empty token (and a stray leading space on rejoin).
    current_values = current_value.split(" ")
    # Array union keeps the existing order and drops duplicate tokens.
    updated_value = current_values | [value]
    node.set_attribute(attribute, updated_value.join(" "))
  end

  private

  # Sanitizes a single node and reports how a top-down traversal should
  # proceed. Returns CONTINUE for an element whose name is allowed (after
  # scrubbing its attributes) and for text and CDATA nodes; returns STOP
  # for everything else, including elements that are not allowed.
  def html5lib_sanitize(node)
    case node.type
    when Nokogiri::XML::Node::ELEMENT_NODE
      if HTML5::Scrub.allowed_element?(node.name)
        HTML5::Scrub.scrub_attributes(node)
        return Scrubber::CONTINUE
      end
    when Nokogiri::XML::Node::TEXT_NODE, Nokogiri::XML::Node::CDATA_SECTION_NODE
      return Scrubber::CONTINUE
    end
    Scrubber::STOP
  end

  # Visits +node+ first (via the block if one was given, otherwise via
  # +scrub+) and then recurses into its children, unless the visit
  # returned STOP, in which case the subtree is skipped.
  def traverse_conditionally_top_down(node)
    result = block ? block.call(node) : scrub(node)
    return if result == STOP

    node.children.each { |child| traverse_conditionally_top_down(child) }
  end

  # Recurses into the children of +node+ first and then visits +node+
  # itself (via the block if one was given, otherwise via +scrub+).
  # The visit's return value does not affect the traversal.
  def traverse_conditionally_bottom_up(node)
    node.children.each { |child| traverse_conditionally_bottom_up(child) }
    block ? block.call(node) : scrub(node)
  end
end
end