/
sanitize.py
128 lines (117 loc) · 3.93 KB
/
sanitize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
"""
NBConvert Preprocessor for sanitizing HTML rendering of notebooks.
"""
from bleach import ALLOWED_ATTRIBUTES, ALLOWED_STYLES, ALLOWED_TAGS, clean
from traitlets import Any, Bool, List, Set, Unicode
from .base import Preprocessor
class SanitizeHTML(Preprocessor):
# Bleach config.
attributes = Any(
config=True,
default_value=ALLOWED_ATTRIBUTES,
help="Allowed HTML tag attributes",
)
tags = List(
Unicode(),
config=True,
default_value=ALLOWED_TAGS,
help="List of HTML tags to allow",
)
styles = List(
Unicode(),
config=True,
default_value=ALLOWED_STYLES,
help="Allowed CSS styles if <style> tag is allowed",
)
strip = Bool(
config=True,
default_value=False,
help="If True, remove unsafe markup entirely instead of escaping",
)
strip_comments = Bool(
config=True,
default_value=True,
help="If True, strip comments from escaped HTML",
)
# Display data config.
safe_output_keys = Set(
config=True,
default_value={
"metadata", # Not a mimetype per-se, but expected and safe.
"text/plain",
"text/latex",
"application/json",
"image/png",
"image/jpeg",
},
help="Cell output mimetypes to render without modification",
)
sanitized_output_types = Set(
config=True,
default_value={
"text/html",
"text/markdown",
},
help="Cell output types to display after escaping with Bleach.",
)
def preprocess_cell(self, cell, resources, cell_index):
"""
Sanitize potentially-dangerous contents of the cell.
Cell Types:
raw:
Sanitize literal HTML
markdown:
Sanitize literal HTML
code:
Sanitize outputs that could result in code execution
"""
if cell.cell_type == "raw":
# Sanitize all raw cells anyway.
# Only ones with the text/html mimetype should be emitted
# but erring on the side of safety maybe.
cell.source = self.sanitize_html_tags(cell.source)
return cell, resources
elif cell.cell_type == "markdown":
cell.source = self.sanitize_html_tags(cell.source)
return cell, resources
elif cell.cell_type == "code":
cell.outputs = self.sanitize_code_outputs(cell.outputs)
return cell, resources
def sanitize_code_outputs(self, outputs):
"""
Sanitize code cell outputs.
Removes 'text/javascript' fields from display_data outputs, and
runs `sanitize_html_tags` over 'text/html'.
"""
for output in outputs:
# These are always ascii, so nothing to escape.
if output["output_type"] in ("stream", "error"):
continue
data = output.data
to_remove = []
for key in data:
if key in self.safe_output_keys:
continue
elif key in self.sanitized_output_types:
self.log.info("Sanitizing %s" % key)
data[key] = self.sanitize_html_tags(data[key])
else:
# Mark key for removal. (Python doesn't allow deletion of
# keys from a dict during iteration)
to_remove.append(key)
for key in to_remove:
self.log.info("Removing %s" % key)
del data[key]
return outputs
def sanitize_html_tags(self, html_str):
"""
Sanitize a string containing raw HTML tags.
"""
return clean(
html_str,
tags=self.tags,
attributes=self.attributes,
styles=self.styles,
strip=self.strip,
strip_comments=self.strip_comments,
)