Handle escaping < in edge cases where it doesn't start a tag (#544) (#667)

The html5lib tokenizer emits a parse error token when there's a < that isn't the start of a tag. This adds handling for that case and treats the < plus whatever follows it as character data.
mozilla · Jun 2, 2022 · ed06d4e · ed06d4e
1 parent c5c3f50
commit ed06d4e
Show file tree

Hide file tree

Showing 2 changed files with 27 additions and 1 deletion.
diff --git a/bleach/html5lib_shim.py b/bleach/html5lib_shim.py
@@ -385,7 +385,17 @@ def __iter__(self):
             yield token
 
         if last_error_token:
-            yield last_error_token
+            if last_error_token["data"] == "eof-in-tag-name":
+                # Handle the case where the text being parsed ends with <
+                # followed by a series of characters. It's treated as a tag
+                # name that abruptly ends, but we should treat that like
+                # character data
+                yield {
+                    "type": TAG_TOKEN_TYPE_CHARACTERS,
+                    "data": "<" + self.currentToken["name"],
+                }
+            else:
+                yield last_error_token
 
     def consumeEntity(self, allowedChar=None, fromAttribute=False):
         # If this tokenizer is set to consume entities, then we can let the

diff --git a/tests/test_clean.py b/tests/test_clean.py
@@ -156,6 +156,22 @@ def test_bare_entities_get_escaped_correctly(text, expected):
     assert clean(text) == expected
 
 
+@pytest.mark.parametrize(
+    "text, expected",
+    [
+        ("x<y", "x&lt;y"),
+        ("<y", "&lt;y"),
+        ("x < y", "x &lt; y"),
+        ("<y>", "&lt;y&gt;"),
+    ],
+)
+def test_lessthan_escaping(text, expected):
+    # Tests whether < gets escaped correctly in a series of edge cases where
+    # the html5lib tokenizer hits an error because it's not the beginning of a
+    # tag.
+    assert clean(text) == expected
+
+
 @pytest.mark.parametrize(
     "text, expected",
     [