def convert_entity(value):
    """Convert the body of a character reference into its character.

    ``value`` is the reference text without the leading ``&`` (the numeric
    branches also expect no trailing ``;``), for example ``"#32"``,
    ``"#x20"`` or a key of ``ENTITIES``.

    :arg value: the reference body to convert

    :returns: the referenced character, or ``None`` when ``value`` is empty,
        is a numeric reference without valid digits, names a code point
        outside ``1..0x10FFFF``, or is not a key of ``ENTITIES``

    """
    if not value:
        return None

    if value[0] != "#":
        return ENTITIES.get(value, None)

    if value[1:2] in ("x", "X"):
        # hex-encoded code point
        digits, base, allowed = value[2:], 16, "0123456789abcdefABCDEF"
    else:
        # decimal code point
        digits, base, allowed = value[1:], 10, "0123456789"

    # Accept plain ASCII digits only: int() would also take signs,
    # whitespace and underscores, and raises ValueError on anything else.
    if not digits or any(char not in allowed for char in digits):
        return None

    # Leading zeros carry no value.  More than 7 significant digits is above
    # 0x10FFFF in either base, so reject before int() has to parse an
    # arbitrarily long string (newer Pythons raise ValueError on very long
    # decimal strings).
    significant = digits.lstrip("0")
    if not significant or len(significant) > 7:
        return None

    code_point = int(significant, base)
    if code_point >= 0x110000:
        return None

    # NOTE(review): surrogates (0xD800-0xDFFF) pass the range check, exactly
    # as they did before this change -- confirm that is intended.
    return six.unichr(code_point)