# flake8: noqa
"""
Shim module between Bleach and html5lib. This makes it easier to upgrade the
html5lib library without having to change a lot of code.
"""

from __future__ import unicode_literals

import re
import string
import warnings

import six

# ignore html5lib deprecation warnings to use bleach; we are bleach
# apply before we import submodules that import html5lib
warnings.filterwarnings(
    "ignore",
    message="html5lib's sanitizer is deprecated",
    category=DeprecationWarning,
    module="bleach._vendor.html5lib",
)

from bleach._vendor.html5lib import (  # noqa: E402 module level import not at top of file
    HTMLParser,
    getTreeWalker,
)
from bleach._vendor.html5lib import (
    constants,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib.constants import (  # noqa: E402 module level import not at top of file
    namespaces,
    prefixes,
)
from bleach._vendor.html5lib.constants import (
    _ReparseException as ReparseException,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib.filters.base import (
    Filter,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib.filters.sanitizer import (
    allowed_protocols,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib.filters.sanitizer import (
    Filter as SanitizerFilter,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib._inputstream import (
    HTMLInputStream,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib.serializer import (
    HTMLSerializer,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib._tokenizer import (
    attributeMap,
    HTMLTokenizer,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib._trie import (
    Trie,
)  # noqa: E402 module level import not at top of file


#: Map of entity name to expanded entity
ENTITIES = constants.entities

#: Trie of html entity string -> character representation
ENTITIES_TRIE = Trie(ENTITIES)

#: Token type constants--these never change
TAG_TOKEN_TYPES = {
    constants.tokenTypes["StartTag"],
    constants.tokenTypes["EndTag"],
    constants.tokenTypes["EmptyTag"],
}
CHARACTERS_TYPE = constants.tokenTypes["Characters"]
PARSEERROR_TYPE = constants.tokenTypes["ParseError"]
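

# Illustrative examples (assuming the vendored html5lib matches upstream
# html5lib's entity table): ENTITIES maps entity names, with and without the
# trailing ";", to their expansions, and ENTITIES_TRIE supports the prefix
# lookups that match_entity() below relies on.
#
#   >>> ENTITIES["amp"]
#   '&'
#   >>> ENTITIES_TRIE.has_keys_with_prefix("am")
#   True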


#: List of valid HTML tags, from WHATWG HTML Living Standard as of 2018-10-17
#: https://html.spec.whatwg.org/multipage/indices.html#elements-3
HTML_TAGS = [
    "a",
    "abbr",
    "address",
    "area",
    "article",
    "aside",
    "audio",
    "b",
    "base",
    "bdi",
    "bdo",
    "blockquote",
    "body",
    "br",
    "button",
    "canvas",
    "caption",
    "cite",
    "code",
    "col",
    "colgroup",
    "data",
    "datalist",
    "dd",
    "del",
    "details",
    "dfn",
    "dialog",
    "div",
    "dl",
    "dt",
    "em",
    "embed",
    "fieldset",
    "figcaption",
    "figure",
    "footer",
    "form",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "head",
    "header",
    "hgroup",
    "hr",
    "html",
    "i",
    "iframe",
    "img",
    "input",
    "ins",
    "kbd",
    "keygen",
    "label",
    "legend",
    "li",
    "link",
    "map",
    "mark",
    "menu",
    "meta",
    "meter",
    "nav",
    "noscript",
    "object",
    "ol",
    "optgroup",
    "option",
    "output",
    "p",
    "param",
    "picture",
    "pre",
    "progress",
    "q",
    "rp",
    "rt",
    "ruby",
    "s",
    "samp",
    "script",
    "section",
    "select",
    "slot",
    "small",
    "source",
    "span",
    "strong",
    "style",
    "sub",
    "summary",
    "sup",
    "table",
    "tbody",
    "td",
    "template",
    "textarea",
    "tfoot",
    "th",
    "thead",
    "time",
    "title",
    "tr",
    "track",
    "u",
    "ul",
    "var",
    "video",
    "wbr",
]


class InputStreamWithMemory(object):
    """Wraps an HTMLInputStream to remember characters since last <

    This wraps existing HTMLInputStream classes to keep track of the stream
    since the last < which marked an open tag state.

    """

    def __init__(self, inner_stream):
        self._inner_stream = inner_stream
        self.reset = self._inner_stream.reset
        self.position = self._inner_stream.position
        self._buffer = []

    @property
    def errors(self):
        return self._inner_stream.errors

    @property
    def charEncoding(self):
        return self._inner_stream.charEncoding

    @property
    def changeEncoding(self):
        return self._inner_stream.changeEncoding

    def char(self):
        c = self._inner_stream.char()
        # char() can return None if EOF, so ignore that
        if c:
            self._buffer.append(c)
        return c

    def charsUntil(self, characters, opposite=False):
        chars = self._inner_stream.charsUntil(characters, opposite=opposite)
        self._buffer.extend(list(chars))
        return chars

    def unget(self, char):
        if self._buffer:
            self._buffer.pop(-1)
        return self._inner_stream.unget(char)

    def get_tag(self):
        """Returns the stream history since last '<'

        Since the buffer starts at the last '<' as seen by tagOpenState(),
        we know that everything from that point to when this method is called
        is the "tag" that is being tokenized.

        """
        return six.text_type("").join(self._buffer)

    def start_tag(self):
        """Resets stream history to just '<'

        This gets called by tagOpenState() which marks a '<' that denotes an
        open tag. Any time we see that, we reset the buffer.

        """
        self._buffer = ["<"]
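

# Illustrative sketch of the wrapper in isolation (the tokenizer normally
# drives these calls; HTMLInputStream usage here mirrors vendored html5lib):
#
#   >>> stream = InputStreamWithMemory(HTMLInputStream("<x@y>"))
#   >>> stream.char()       # the tokenizer consumes the "<"
#   '<'
#   >>> stream.start_tag()  # tagOpenState() resets the buffer to just "<"
#   >>> stream.char()
#   'x'
#   >>> stream.get_tag()    # verbatim source since the last "<"
#   '<x'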


class BleachHTMLTokenizer(HTMLTokenizer):
    """Tokenizer that doesn't consume character entities"""

    def __init__(self, consume_entities=False, **kwargs):
        super(BleachHTMLTokenizer, self).__init__(**kwargs)

        self.consume_entities = consume_entities

        # Wrap the stream with one that remembers the history
        self.stream = InputStreamWithMemory(self.stream)

    def __iter__(self):
        last_error_token = None

        for token in super(BleachHTMLTokenizer, self).__iter__():
            if last_error_token is not None:
                if (
                    last_error_token["data"] == "invalid-character-in-attribute-name"
                    and token["type"] in TAG_TOKEN_TYPES
                    and token.get("data")
                ):
                    # token["data"] is an html5lib attributeMap
                    # (a plain dict on Python 3.7+, an OrderedDict otherwise)
                    # of attr name to attr value
                    #
                    # Remove attribute names that have ', " or < in them
                    # because those characters are invalid for attribute names.
                    token["data"] = attributeMap(
                        (attr_name, attr_value)
                        for attr_name, attr_value in token["data"].items()
                        if (
                            '"' not in attr_name
                            and "'" not in attr_name
                            and "<" not in attr_name
                        )
                    )
                    last_error_token = None
                    yield token

                elif (
                    last_error_token["data"] == "expected-closing-tag-but-got-char"
                    and self.parser.tags is not None
                    and token["data"].lower().strip() not in self.parser.tags
                ):
                    # We've got either a malformed tag or a pseudo-tag or
                    # something that html5lib wants to turn into a malformed
                    # comment which Bleach clean() will drop so we interfere
                    # with the token stream to handle it more correctly.
                    #
                    # If this is an allowed tag, it's malformed and we just let
                    # the html5lib parser deal with it--we don't enter into this
                    # block.
                    #
                    # If this is not an allowed tag, then we convert it to
                    # characters and it'll get escaped in the sanitizer.
                    token["data"] = self.stream.get_tag()
                    token["type"] = CHARACTERS_TYPE

                    last_error_token = None
                    yield token

                elif token["type"] == PARSEERROR_TYPE:
                    # If the token is a parse error, then let the last_error_token
                    # go, and make token the new last_error_token
                    yield last_error_token
                    last_error_token = token

                else:
                    yield last_error_token
                    yield token
                    last_error_token = None

                continue

            # If the token is a ParseError, we hold on to it so we can get the
            # next token and potentially fix it.
            if token["type"] == PARSEERROR_TYPE:
                last_error_token = token
                continue

            yield token

        if last_error_token:
            yield last_error_token

    def consumeEntity(self, allowedChar=None, fromAttribute=False):
        # If this tokenizer is set to consume entities, then we can let the
        # superclass do its thing.
        if self.consume_entities:
            return super(BleachHTMLTokenizer, self).consumeEntity(
                allowedChar, fromAttribute
            )

        # If this tokenizer is set to not consume entities, then we don't want
        # to consume and convert them, so this overrides the html5lib tokenizer's
        # consumeEntity so that it's now a no-op.
        #
        # However, when that gets called, it's consumed an &, so we put that back in
        # the stream.
        if fromAttribute:
            self.currentToken["data"][-1][1] += "&"
        else:
            self.tokenQueue.append({"type": CHARACTERS_TYPE, "data": "&"})

    def tagOpenState(self):
        # This state marks a < that is either a StartTag, EndTag, EmptyTag,
        # or ParseError. In all cases, we want to drop any stream history
        # we've collected so far and we do that by calling start_tag() on
        # the input stream wrapper.
        self.stream.start_tag()
        return super(BleachHTMLTokenizer, self).tagOpenState()

    def emitCurrentToken(self):
        token = self.currentToken

        if (
            self.parser.tags is not None
            and token["type"] in TAG_TOKEN_TYPES
            and token["name"].lower() not in self.parser.tags
        ):
            # If this is a start/end/empty tag for a tag that's not in our
            # allowed list, then it gets stripped or escaped. In both of these
            # cases it gets converted to a Characters token.
            if self.parser.strip:
                # If we're stripping the token, we just throw in an empty
                # string token.
                new_data = ""
            else:
                # If we're escaping the token, we want to escape the exact
                # original string. Since tokenizing also normalizes data
                # and this is a tag-like thing, we've lost some information.
                # So we go back through the stream to get the original
                # string and use that.
                new_data = self.stream.get_tag()

            new_token = {"type": CHARACTERS_TYPE, "data": new_data}

            self.currentToken = new_token
            self.tokenQueue.append(new_token)
            self.state = self.dataState
            return

        super(BleachHTMLTokenizer, self).emitCurrentToken()


class BleachHTMLParser(HTMLParser):
    """Parser that uses BleachHTMLTokenizer"""

    def __init__(self, tags, strip, consume_entities, **kwargs):
        """
        :arg tags: list of allowed tags--everything else is either stripped or
            escaped; if None, then this doesn't look at tags at all
        :arg strip: whether to strip disallowed tags (True) or escape them (False);
            if tags=None, then this doesn't have any effect
        :arg consume_entities: whether to consume entities (default behavior) or
            leave them as is when tokenizing (BleachHTMLTokenizer-added behavior)

        """
        self.tags = [tag.lower() for tag in tags] if tags is not None else None
        self.strip = strip
        self.consume_entities = consume_entities
        super(BleachHTMLParser, self).__init__(**kwargs)

    def _parse(
        self, stream, innerHTML=False, container="div", scripting=True, **kwargs
    ):
        # set scripting=True to parse <noscript> as though JS is enabled to
        # match the expected context in browsers
        #
        # https://html.spec.whatwg.org/multipage/scripting.html#the-noscript-element
        #
        # Override HTMLParser so we can swap out the tokenizer for our own.
        self.innerHTMLMode = innerHTML
        self.container = container
        self.scripting = scripting
        self.tokenizer = BleachHTMLTokenizer(
            stream=stream, consume_entities=self.consume_entities, parser=self, **kwargs
        )
        self.reset()

        try:
            self.mainLoop()
        except ReparseException:
            self.reset()
            self.mainLoop()
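

# Rough usage sketch: this is approximately how bleach.sanitizer.Cleaner
# wires up the parser; the exact keyword arguments there may differ. The
# namespaceHTMLElements argument comes from html5lib's HTMLParser.
#
#   >>> parser = BleachHTMLParser(
#   ...     tags=["b", "i"],          # allowed tags; everything else escaped
#   ...     strip=False,              # escape disallowed tags rather than strip
#   ...     consume_entities=False,   # leave "&amp;" and friends as-is
#   ...     namespaceHTMLElements=False,
#   ... )
#   >>> dom = parser.parseFragment("<b>hi</b> <blink>no</blink>")
#
# Since "blink" is not in tags, emitCurrentToken() above turns it into a
# Characters token holding the verbatim "<blink>" text for later escaping.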


def convert_entity(value):
    """Convert an entity (minus the & and ; part) into what it represents

    This handles numeric, hex, and text entities.

    :arg value: the string (minus the ``&`` and ``;`` part) to convert

    :returns: unicode character or None if it's an ambiguous ampersand that
        doesn't match a character entity

    """
    if value[0] == "#":
        if len(value) < 2:
            return None

        if value[1] in ("x", "X"):
            # hex-encoded code point
            int_as_string, base = value[2:], 16
        else:
            # decimal code point
            int_as_string, base = value[1:], 10

        if int_as_string == "":
            return None

        code_point = int(int_as_string, base)
        if 0 < code_point < 0x110000:
            return six.unichr(code_point)
        else:
            return None

    return ENTITIES.get(value, None)
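

# Examples: named, decimal, and hex references all resolve the same way, and
# anything unknown or out of Unicode range comes back as None.
#
#   >>> convert_entity("amp"), convert_entity("#38"), convert_entity("#x26")
#   ('&', '&', '&')
#   >>> convert_entity("#x110000") is None
#   True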


def convert_entities(text):
    """Converts all found entities in the text

    :arg text: the text to convert entities in

    :returns: unicode text with converted entities

    """
    if "&" not in text:
        return text

    new_text = []
    for part in next_possible_entity(text):
        if not part:
            continue

        if part.startswith("&"):
            entity = match_entity(part)
            if entity is not None:
                converted = convert_entity(entity)

                # If it's not an ambiguous ampersand, then replace with the
                # unicode character. Otherwise, we leave the entity in.
                if converted is not None:
                    new_text.append(converted)
                    remainder = part[len(entity) + 2 :]
                    if remainder:
                        new_text.append(remainder)
                    continue

        new_text.append(part)

    return "".join(new_text)
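

# Examples: well-formed entities are decoded; ambiguous ampersands and
# unknown entities pass through untouched.
#
#   >>> convert_entities("tag &amp; release")
#   'tag & release'
#   >>> convert_entities("AT&T &bogus; &#62;")
#   'AT&T &bogus; >'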


def match_entity(stream):
    """Returns first entity in stream or None if no entity exists

    Note: For Bleach purposes, entities must start with a "&" and end with
    a ";". This ignores ambiguous character entities that have no ";" at the
    end.

    :arg stream: the character stream

    :returns: ``None`` or the entity string without "&" or ";"

    """
    # Nix the & at the beginning
    if stream[0] != "&":
        raise ValueError('Stream should begin with "&"')

    stream = stream[1:]

    stream = list(stream)
    possible_entity = ""
    end_characters = "<&=;" + string.whitespace

    # Handle number entities
    if stream and stream[0] == "#":
        possible_entity = "#"
        stream.pop(0)

        if stream and stream[0] in ("x", "X"):
            allowed = "0123456789abcdefABCDEF"
            possible_entity += stream.pop(0)
        else:
            allowed = "0123456789"

        # FIXME(willkg): Do we want to make sure these are valid number
        # entities? This doesn't do that currently.
        while stream and stream[0] not in end_characters:
            c = stream.pop(0)
            if c not in allowed:
                break
            possible_entity += c

        if possible_entity and stream and stream[0] == ";":
            return possible_entity
        return None

    # Handle character entities
    while stream and stream[0] not in end_characters:
        c = stream.pop(0)
        if not ENTITIES_TRIE.has_keys_with_prefix(possible_entity):
            break
        possible_entity += c

    if possible_entity and stream and stream[0] == ";":
        return possible_entity

    return None
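

# Examples: the entity comes back without its "&" and ";" delimiters, and
# anything missing the trailing ";" is rejected as ambiguous.
#
#   >>> match_entity("&amp; more")
#   'amp'
#   >>> match_entity("&#x27;")
#   '#x27'
#   >>> match_entity("&amp more") is None
#   True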


AMP_SPLIT_RE = re.compile("(&)")


def next_possible_entity(text):
    """Takes a text and generates a list of possible entities

    :arg text: the text to look at

    :returns: generator where each part (except the first) starts with an
        "&"

    """
    for i, part in enumerate(AMP_SPLIT_RE.split(text)):
        if i == 0:
            yield part
        elif i % 2 == 0:
            yield "&" + part
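

# Example: the text is split so that every chunk after the first begins at
# an "&", which is the only place an entity could start.
#
#   >>> list(next_possible_entity("a &amp; b & c"))
#   ['a ', '&amp; b ', '& c']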


class BleachHTMLSerializer(HTMLSerializer):
    """HTMLSerializer that undoes & -> &amp; in attributes and sets
    escape_rcdata to True
    """

    # per the HTMLSerializer.__init__ docstring:
    #
    # Whether to escape characters that need to be
    # escaped within normal elements within rcdata elements such as
    # style.
    #
    escape_rcdata = True

    def escape_base_amp(self, stoken):
        """Escapes just bare & in HTML attribute values"""
        # First, undo escaping of &. We need to do this because html5lib's
        # HTMLSerializer expected the tokenizer to consume all the character
        # entities and convert them to their respective characters, but the
        # BleachHTMLTokenizer doesn't do that. For example, this fixes
        # &amp;entity; back to &entity; .
        stoken = stoken.replace("&amp;", "&")

        # However, we do want all bare & that are not marking character
        # entities to be changed to &amp;, so let's do that carefully here.
        for part in next_possible_entity(stoken):
            if not part:
                continue

            if part.startswith("&"):
                entity = match_entity(part)
                # Only leave entities in that are not ambiguous. If they're
                # ambiguous, then we escape the ampersand.
                if entity is not None and convert_entity(entity) is not None:
                    yield "&" + entity + ";"

                    # Length of the entity plus 2--one for & at the beginning
                    # and one for ; at the end
                    part = part[len(entity) + 2 :]
                    if part:
                        yield part
                    continue

            yield part.replace("&", "&amp;")

    def serialize(self, treewalker, encoding=None):
        """Wrap HTMLSerializer.serialize and convert & to &amp; in attribute values

        Note that this converts & to &amp; in attribute values where the & isn't
        already part of an unambiguous character entity.

        """
        in_tag = False
        after_equals = False

        for stoken in super(BleachHTMLSerializer, self).serialize(treewalker, encoding):
            if in_tag:
                if stoken == ">":
                    in_tag = False

                elif after_equals:
                    if stoken != '"':
                        for part in self.escape_base_amp(stoken):
                            yield part

                        after_equals = False
                        continue

                elif stoken == "=":
                    after_equals = True

                yield stoken

            else:
                if stoken.startswith("<"):
                    in_tag = True

                yield stoken
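

# Rough round-trip sketch: approximately how bleach renders cleaned
# fragments; the serializer options shown are illustrative, not the exact
# set bleach uses. getTreeWalker and the serializer API come from html5lib.
#
#   >>> parser = BleachHTMLParser(
#   ...     tags=["a"], strip=False, consume_entities=False,
#   ...     namespaceHTMLElements=False,
#   ... )
#   >>> dom = parser.parseFragment('<a href="?a=1&b=2">x</a>')
#   >>> walker = getTreeWalker("etree")
#   >>> serializer = BleachHTMLSerializer(quote_attr_values="always")
#   >>> serializer.render(walker(dom))
#   '<a href="?a=1&amp;b=2">x</a>'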