# test_html5lib_shim.py
#
# Tests for bleach.html5lib_shim: character entity conversion, the Bleach
# HTML serializer, and the Bleach HTML parser.
from __future__ import unicode_literals
import pytest
from bleach import html5lib_shim
@pytest.mark.parametrize(
    "data, expected",
    [
        # NOTE: inputs are spelled as literal character-reference text (for
        # example "&nbsp;"); if they are replaced by the already-decoded
        # characters these cases stop exercising entity conversion at all.
        #
        # Strings without character entities pass through as is
        ("", ""),
        ("abc", "abc"),
        # Handles character entities--both named and numeric
        ("&nbsp;", "\xa0"),
        ("&#32;", " "),
        ("&#x20;", " "),
        # Handles ambiguous ampersand
        ("&xx;", "&xx;"),
        # Handles multiple entities in the same string
        ("this &amp; that &amp; that", "this & that & that"),
        # Handles empty decimal and hex encoded code points
        ("&#x;", "&#x;"),
        ("&#;", "&#;"),
        # Handles too high unicode points (anything above U+10FFFF)
        ("&#x110000;", "&#x110000;"),
        ("&#x110111;", "&#x110111;"),
        ("&#9277809;", "&#9277809;"),
        # Handles negative unicode points
        ("&#-1;", "&#-1;"),
        ("&#x-1;", "&#x-1;"),
    ],
)
def test_convert_entities(data, expected):
    """Check that ``convert_entities`` maps ``data`` to ``expected``.

    Each case pairs an input string with the exact string that
    ``html5lib_shim.convert_entities`` must return for it.
    """
    assert html5lib_shim.convert_entities(data) == expected
@pytest.mark.parametrize(
    "data, expected",
    [
        ("", ""),
        ("text", "text"),
        # & in Characters is escaped
        ("&", "&amp;"),
        # FIXME(willkg): This happens because the BleachHTMLTokenizer is ignoring
        # character entities. What it should be doing is creating Entity tokens
        # for character entities.
        #
        # That was too hard at the time I was fixing it, so I fixed it in
        # BleachSanitizerFilter. When that gets fixed correctly in the tokenizer,
        # then this test case will get fixed.
        ("a &amp; b", "a &amp;amp; b"),  # should be 'a &amp; b'
        # & in HTML attribute values are escaped
        (
            '<a href="http://example.com?key=value&key2=value">tag</a>',
            '<a href="http://example.com?key=value&amp;key2=value">tag</a>',
        ),
        # & marking character entities in HTML attribute values aren't escaped
        (
            '<a href="http://example.com?key=value&amp;key2=value">tag</a>',
            '<a href="http://example.com?key=value&amp;key2=value">tag</a>',
        ),
        # & marking ambiguous character entities in attribute values are escaped
        # (&curren; is a character entity)
        (
            '<a href="http://example.com?key=value&current=value">tag</a>',
            '<a href="http://example.com?key=value&amp;current=value">tag</a>',
        ),
    ],
)
def test_serializer(data, expected):
    """Parse ``data`` as a fragment and check the serialized output.

    The fragment is parsed with ``BleachHTMLParser`` (entities not consumed),
    walked with the "etree" tree walker, and rendered with
    ``BleachHTMLSerializer``; the rendered string must equal ``expected``.
    """
    # Build a parser, walker, and serializer just like we do in clean()
    parser = html5lib_shim.BleachHTMLParser(
        tags=None, strip=True, consume_entities=False, namespaceHTMLElements=False
    )
    walker = html5lib_shim.getTreeWalker("etree")
    serializer = html5lib_shim.BleachHTMLSerializer(
        quote_attr_values="always",
        omit_optional_tags=False,
        escape_lt_in_attrs=True,
        resolve_entities=False,
        sanitize=False,
        alphabetical_attributes=False,
    )

    # Parse, walk, and then serialize the output
    dom = parser.parseFragment(data)
    serialized = serializer.render(walker(dom))

    assert serialized == expected
@pytest.mark.parametrize(
    "parser_args, data, expected",
    [
        # Make sure InputStreamWithMemory has charEncoding and changeEncoding
        ({}, '<meta charset="utf-8">', '<meta charset="utf-8">'),
        # Handle consume entities False--all entities are passed along and then
        # escaped when serialized
        (
            {"consume_entities": False},
            "text &amp;&gt;&quot;",
            "text &amp;amp;&amp;gt;&amp;quot;",
        ),
        # Handle consume entities True--all entities are consumed and converted
        # to their character equivalents and then &, <, and > are escaped when
        # serialized
        ({"consume_entities": True}, "text &amp;&gt;&quot;", 'text &amp;&gt;"'),
        # Test that "invalid-character-in-attribute-name" errors in tokenizing
        # result in attributes with invalid names getting dropped
        ({}, '<a href="http://example.com"">', '<a href="http://example.com"></a>'),
        ({}, "<a href='http://example.com''>", '<a href="http://example.com"></a>'),
        # Test that "expected-closing-tag-but-got-char" works when tags is None
        (
            {},
            "</ chars",
            "<!-- chars-->",
        ),
    ],
)
def test_bleach_html_parser(parser_args, data, expected):
    """Parse ``data`` with the given parser options and check the output.

    ``parser_args`` overrides the default ``BleachHTMLParser`` keyword
    arguments used by this test (``tags=None``, ``strip=True``,
    ``consume_entities=True``). The parsed fragment is walked with the
    "etree" tree walker and rendered with ``BleachHTMLSerializer``; the
    rendered string must equal ``expected``.
    """
    # Start from the defaults, then apply the per-case overrides
    args = {"tags": None, "strip": True, "consume_entities": True}
    args.update(parser_args)

    # Build a parser, walker, and serializer just like we do in clean()
    parser = html5lib_shim.BleachHTMLParser(**args)
    walker = html5lib_shim.getTreeWalker("etree")
    serializer = html5lib_shim.BleachHTMLSerializer(
        quote_attr_values="always",
        omit_optional_tags=False,
        escape_lt_in_attrs=True,
        resolve_entities=False,
        sanitize=False,
        alphabetical_attributes=False,
    )

    # Parse, walk, and then serialize the output
    dom = parser.parseFragment(data)
    serialized = serializer.render(walker(dom))

    assert serialized == expected