diff --git a/bleach/html5lib_shim.py b/bleach/html5lib_shim.py index aa5189b1..ca1cc8c8 100644 --- a/bleach/html5lib_shim.py +++ b/bleach/html5lib_shim.py @@ -395,10 +395,17 @@ def __iter__(self): # followed by a series of characters. It's treated as a tag # name that abruptly ends, but we should treat that like # character data - yield { - "type": TAG_TOKEN_TYPE_CHARACTERS, - "data": "<" + self.currentToken["name"], - } + yield {"type": TAG_TOKEN_TYPE_CHARACTERS, "data": self.stream.get_tag()} + elif last_error_token["data"] in ( + "eof-in-attribute-name", + "eof-in-attribute-value-no-quotes", + ): + # Handle the case where the text being parsed ends with < + # followed by a series of characters and then space and then + # more characters. It's treated as a tag name followed by an + # attribute that abruptly ends, but we should treat that like + # character data. + yield {"type": TAG_TOKEN_TYPE_CHARACTERS, "data": self.stream.get_tag()} else: yield last_error_token diff --git a/tests/test_clean.py b/tests/test_clean.py index 73946a1f..dc129d0e 100644 --- a/tests/test_clean.py +++ b/tests/test_clean.py @@ -163,6 +163,10 @@ def test_bare_entities_get_escaped_correctly(text, expected): ("", "<y>"), + # this is an eof-in-attribute-name parser error + ("