Skip to content

Commit

Permalink
html5lib_shim: validate unicode points for convert_entity
Browse files Browse the repository at this point in the history
  • Loading branch information
Greg Guthe committed Jan 25, 2021
1 parent 90cb80b commit e0319c0
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 2 deletions.
17 changes: 15 additions & 2 deletions bleach/html5lib_shim.py
Expand Up @@ -459,9 +459,22 @@ def convert_entity(value):
if value[0] == "#":
if len(value) < 2:
return None

if value[1] in ("x", "X"):
return six.unichr(int(value[2:], 16))
return six.unichr(int(value[1:], 10))
# hex-encoded code point
prefix, base = value[2:], 16
else:
# decimal code point
prefix, base = value[1:], 10

if prefix == "":
return None

code_point = int(prefix, base)
if 0 < code_point < 0x110000:
return six.unichr(code_point)
else:
return None

return ENTITIES.get(value, None)

Expand Down
10 changes: 10 additions & 0 deletions tests/test_html5lib_shim.py
Expand Up @@ -19,6 +19,16 @@
("&xx;", "&xx;"),
# Handles multiple entities in the same string
("this &amp; that &amp; that", "this & that & that"),
# Handles empty decimal and hex encoded code points
("&#x;", "&#x;"),
("&#;", "&#;"),
# Handles too high unicode points
("&#x110000;", "&#x110000;"),
("&#x110111;", "&#x110111;"),
("&#9277809;", "&#9277809;"),
# Handles negative unicode points
("&#-1;", "&#-1;"),
("&#x-1;", "&#x-1;"),
],
)
def test_convert_entities(data, expected):
Expand Down

0 comments on commit e0319c0

Please sign in to comment.