Skip to content

Commit

Permalink
Handle escaping < in edge cases where it doesn't start a tag (#544) (#…
Browse files Browse the repository at this point in the history
…667)

The html5lib tokenizer emits a parse error token when it encounters a <
that isn't the start of a tag. This adds handling for that case and
treats the < plus whatever follows it as character data.
  • Loading branch information
willkg committed Jun 2, 2022
1 parent c5c3f50 commit ed06d4e
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 1 deletion.
12 changes: 11 additions & 1 deletion bleach/html5lib_shim.py
Expand Up @@ -385,7 +385,17 @@ def __iter__(self):
yield token

if last_error_token:
yield last_error_token
if last_error_token["data"] == "eof-in-tag-name":
# Handle the case where the text being parsed ends with <
# followed by a series of characters. It's treated as a tag
# name that abruptly ends, but we should treat that like
# character data
yield {
"type": TAG_TOKEN_TYPE_CHARACTERS,
"data": "<" + self.currentToken["name"],
}
else:
yield last_error_token

def consumeEntity(self, allowedChar=None, fromAttribute=False):
# If this tokenizer is set to consume entities, then we can let the
Expand Down
16 changes: 16 additions & 0 deletions tests/test_clean.py
Expand Up @@ -156,6 +156,22 @@ def test_bare_entities_get_escaped_correctly(text, expected):
assert clean(text) == expected


@pytest.mark.parametrize(
    "text, expected",
    [
        ("x<y", "x&lt;y"),
        ("<y", "&lt;y"),
        ("x < y", "x &lt; y"),
        ("<y>", "&lt;y&gt;"),
    ],
)
def test_lessthan_escaping(text, expected):
    """Verify that ``clean`` escapes ``<`` in tokenizer edge cases.

    Each parametrized case feeds ``clean`` an input containing a ``<`` in a
    position where the html5lib tokenizer reports an error because the
    character is not the beginning of a tag, and checks the cleaned output
    against the expected escaped string.
    """
    cleaned = clean(text)
    assert cleaned == expected


@pytest.mark.parametrize(
"text, expected",
[
Expand Down

0 comments on commit ed06d4e

Please sign in to comment.