/
sanitize.py
169 lines (147 loc) · 4.99 KB
/
sanitize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
"""
NBConvert Preprocessor for sanitizing HTML rendering of notebooks.
"""
import warnings
from bleach import ALLOWED_ATTRIBUTES, ALLOWED_TAGS, clean
from traitlets import Any, Bool, List, Set, Unicode
# Feature flags recording which bleach CSS-sanitization API could be imported.
# At most one of them ends up True; both stay False when neither API exists.
_USE_BLEACH_CSS_SANITIZER = False
_USE_BLEACH_STYLES = False
try:
    # bleach[css] >=5.0
    from bleach.css_sanitizer import ALLOWED_CSS_PROPERTIES as ALLOWED_STYLES
    from bleach.css_sanitizer import CSSSanitizer

    _USE_BLEACH_CSS_SANITIZER = True
    _USE_BLEACH_STYLES = False
except ImportError:
    try:
        # bleach <5
        from bleach import ALLOWED_STYLES

        _USE_BLEACH_CSS_SANITIZER = False
        _USE_BLEACH_STYLES = True
        warnings.warn(
            "Support for bleach <5 will be removed in a future version of nbconvert",
            DeprecationWarning,
            stacklevel=2,
        )
    except ImportError:
        # Neither CSS API is importable. ALLOWED_STYLES must still be bound,
        # because it is read at import time as the default of a class-level
        # trait; without this fallback the module would fail with NameError
        # instead of degrading gracefully. Both flags remain False, so the
        # empty list is never handed to bleach.
        ALLOWED_STYLES = []
        warnings.warn(
            "The installed bleach/tinycss2 do not provide CSS sanitization, "
            "please upgrade to bleach >=5",
            UserWarning,
            stacklevel=2,
        )
from .base import Preprocessor
__all__ = ["SanitizeHTML"]
class SanitizeHTML(Preprocessor):
    """
    Preprocessor that sanitizes notebook content ahead of HTML rendering.

    Raw and markdown cell sources, and the sanitizable outputs of code cells,
    are passed through ``bleach.clean`` using the configurable allow-lists
    below. Code-cell output entries whose key is neither in
    ``safe_output_keys`` nor in ``sanitized_output_types`` are removed.
    """

    # Bleach config.
    attributes = Any(
        config=True,
        default_value=ALLOWED_ATTRIBUTES,
        help="Allowed HTML tag attributes",
    )
    tags = List(
        Unicode(),
        config=True,
        default_value=ALLOWED_TAGS,
        help="List of HTML tags to allow",
    )
    styles = List(
        Unicode(),
        config=True,
        default_value=ALLOWED_STYLES,
        help="Allowed CSS styles if <style> tag is allowed",
    )
    strip = Bool(
        config=True,
        default_value=False,
        help="If True, remove unsafe markup entirely instead of escaping",
    )
    strip_comments = Bool(
        config=True,
        default_value=True,
        help="If True, strip comments from escaped HTML",
    )

    # Display data config.
    safe_output_keys = Set(
        config=True,
        default_value={
            "metadata",  # Not a mimetype per-se, but expected and safe.
            "text/plain",
            "text/latex",
            "application/json",
            "image/png",
            "image/jpeg",
        },
        help="Cell output mimetypes to render without modification",
    )
    sanitized_output_types = Set(
        config=True,
        default_value={
            "text/html",
            "text/markdown",
        },
        help="Cell output types to display after escaping with Bleach.",
    )

    def preprocess_cell(self, cell, resources, cell_index):
        """
        Sanitize potentially-dangerous contents of the cell.

        Cell Types:
          raw:
            Sanitize literal HTML
          markdown:
            Sanitize literal HTML
          code:
            Sanitize outputs that could result in code execution

        The cell is modified in place. ``cell_index`` is accepted as part of
        the method signature but is not used here.

        Returns the ``(cell, resources)`` pair for every cell type, including
        types not listed above, which are passed through unmodified.
        """
        if cell.cell_type in ("raw", "markdown"):
            # Sanitize all raw cells anyway.
            # Only ones with the text/html mimetype should be emitted
            # but erring on the side of safety maybe.
            cell.source = self.sanitize_html_tags(cell.source)
        elif cell.cell_type == "code":
            cell.outputs = self.sanitize_code_outputs(cell.outputs)
        # Always hand back the pair: falling off the end would return None
        # and break any caller that unpacks the result into two values.
        return cell, resources

    def sanitize_code_outputs(self, outputs):
        """
        Sanitize code cell outputs.

        For every output that is not a stream or an error, each key of the
        output's ``data`` mapping is handled as follows:

        * keys in ``safe_output_keys`` are left untouched;
        * keys in ``sanitized_output_types`` have their value replaced by
          the result of `sanitize_html_tags`;
        * every other key (for example 'text/javascript') is removed.

        The outputs are modified in place and the same object is returned.
        """
        for output in outputs:
            # Stream and error outputs are skipped: they are treated as
            # plain text with nothing to escape.
            if output["output_type"] in ("stream", "error"):
                continue
            data = output.data
            to_remove = []
            for key in data:
                if key in self.safe_output_keys:
                    continue
                if key in self.sanitized_output_types:
                    self.log.info("Sanitizing %s", key)
                    data[key] = self.sanitize_html_tags(data[key])
                else:
                    # Mark key for removal. (Python doesn't allow deletion of
                    # keys from a dict during iteration)
                    to_remove.append(key)
            for key in to_remove:
                self.log.info("Removing %s", key)
                del data[key]
        return outputs

    def sanitize_html_tags(self, html_str):
        """
        Sanitize a string containing raw HTML tags.

        Calls ``bleach.clean`` with this instance's ``tags``, ``attributes``,
        ``strip`` and ``strip_comments`` settings and returns its result.
        ``styles`` is forwarded through whichever CSS API was detected at
        import time, or not at all when neither is available.
        """
        kwargs = {
            "tags": self.tags,
            "attributes": self.attributes,
            "strip": self.strip,
            "strip_comments": self.strip_comments,
        }
        if _USE_BLEACH_CSS_SANITIZER:
            # bleach >=5: CSS rules are supplied through a CSSSanitizer object.
            kwargs["css_sanitizer"] = CSSSanitizer(allowed_css_properties=self.styles)
        elif _USE_BLEACH_STYLES:
            # bleach <5: the allowed properties are passed directly.
            kwargs["styles"] = self.styles
        return clean(html_str, **kwargs)