Skip to content

Commit

Permalink
Merge pull request #651 from willkg/369-whitespace
Browse files Browse the repository at this point in the history
fix stripping block-level tags (#369)
  • Loading branch information
willkg committed Apr 7, 2022
2 parents a1139a5 + 8a6f2bb commit a14d412
Show file tree
Hide file tree
Showing 3 changed files with 107 additions and 11 deletions.
2 changes: 2 additions & 0 deletions CHANGES
Expand Up @@ -28,6 +28,8 @@ Version 5.0.0 (In development)
See `development docs <https://bleach.readthedocs.io/en/latest/dev.html>`_
for more details. (#620)

* Add newline when dropping block-level tags. Thank you, @jvanasco! (#369)

Version 4.1.0 (August 25th, 2021)
---------------------------------

Expand Down
80 changes: 69 additions & 11 deletions bleach/html5lib_shim.py
Expand Up @@ -70,8 +70,10 @@
constants.tokenTypes["EndTag"],
constants.tokenTypes["EmptyTag"],
}
CHARACTERS_TYPE = constants.tokenTypes["Characters"]
PARSEERROR_TYPE = constants.tokenTypes["ParseError"]
TAG_TOKEN_TYPE_START = constants.tokenTypes["StartTag"]
TAG_TOKEN_TYPE_END = constants.tokenTypes["EndTag"]
TAG_TOKEN_TYPE_CHARACTERS = constants.tokenTypes["Characters"]
TAG_TOKEN_TYPE_PARSEERROR = constants.tokenTypes["ParseError"]


#: List of valid HTML tags, from WHATWG HTML Living Standard as of 2018-10-17
Expand Down Expand Up @@ -192,6 +194,48 @@
]


#: Names of block-level HTML elements, taken from MDN's "Block-level
#: elements" reference as it stood on 2019-07-11:
#: https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements#Elements
#: Background: https://github.com/mozilla/bleach/issues/369
HTML_TAGS_BLOCK_LEVEL = frozenset(
    (
        # sectioning and page structure
        "address", "article", "aside", "footer", "header", "hgroup",
        "main", "nav", "section",
        # headings
        "h1", "h2", "h3", "h4", "h5", "h6",
        # text-level grouping
        "blockquote", "div", "figcaption", "figure", "hr", "p", "pre",
        # lists
        "dd", "dl", "dt", "li", "ol", "ul",
        # interactive elements, forms and tables
        "details", "dialog", "fieldset", "form", "table",
    )
)


class InputStreamWithMemory:
"""Wraps an HTMLInputStream to remember characters since last <
Expand Down Expand Up @@ -266,6 +310,9 @@ def __init__(self, consume_entities=False, **kwargs):
# Wrap the stream with one that remembers the history
self.stream = InputStreamWithMemory(self.stream)

# Remember the last token emitted; needed for block element spacing
self.emitted_last_token = None

def __iter__(self):
last_error_token = None

Expand Down Expand Up @@ -311,12 +358,12 @@ def __iter__(self):
# If this is not an allowed tag, then we convert it to
# characters and it'll get escaped in the sanitizer.
token["data"] = self.stream.get_tag()
token["type"] = CHARACTERS_TYPE
token["type"] = TAG_TOKEN_TYPE_CHARACTERS

last_error_token = None
yield token

elif token["type"] == PARSEERROR_TYPE:
elif token["type"] == TAG_TOKEN_TYPE_PARSEERROR:
# If the token is a parse error, then let the last_error_token
# go, and make token the new last_error_token
yield last_error_token
Expand All @@ -331,7 +378,7 @@ def __iter__(self):

# If the token is a ParseError, we hold on to it so we can get the
# next token and potentially fix it.
if token["type"] == PARSEERROR_TYPE:
if token["type"] == TAG_TOKEN_TYPE_PARSEERROR:
last_error_token = token
continue

Expand All @@ -356,7 +403,7 @@ def consumeEntity(self, allowedChar=None, fromAttribute=False):
self.currentToken["data"][-1][1] += "&"

else:
self.tokenQueue.append({"type": CHARACTERS_TYPE, "data": "&"})
self.tokenQueue.append({"type": TAG_TOKEN_TYPE_CHARACTERS, "data": "&"})

def tagOpenState(self):
# This state marks a < that is either a StartTag, EndTag, EmptyTag,
Expand All @@ -378,9 +425,19 @@ def emitCurrentToken(self):
# allowed list, then it gets stripped or escaped. In both of these
# cases it gets converted to a Characters token.
if self.parser.strip:
# If we're stripping the token, we just throw in an empty
# string token.
new_data = ""
if (
self.emitted_last_token
and token["type"] == TAG_TOKEN_TYPE_START
and token["name"].lower() in HTML_TAGS_BLOCK_LEVEL
):
# If we're stripping a block-level start tag, emit a newline in
# its place, because a browser would render a line break
# there
new_data = "\n"
else:
# For all other things being stripped, we throw in an empty
# string token
new_data = ""

else:
# If we're escaping the token, we want to escape the exact
Expand All @@ -390,13 +447,14 @@ def emitCurrentToken(self):
# string and use that.
new_data = self.stream.get_tag()

new_token = {"type": CHARACTERS_TYPE, "data": new_data}
new_token = {"type": TAG_TOKEN_TYPE_CHARACTERS, "data": new_data}

self.currentToken = new_token
self.currentToken = self.emitted_last_token = new_token
self.tokenQueue.append(new_token)
self.state = self.dataState
return

self.emitted_last_token = self.currentToken
super().emitCurrentToken()


Expand Down
36 changes: 36 additions & 0 deletions tests/test_clean.py
Expand Up @@ -1070,6 +1070,42 @@ def test_html_comments_escaped(namespace_tag, end_tag, eject_tag, data, expected
)


@pytest.mark.parametrize(
    "text, expected",
    [
        # two sibling paragraphs: one newline separates them
        ("<p>Te<b>st</b>!</p><p>Hello</p>", "Test!\nHello"),
        # inline whitespace and an escaped entity are left untouched
        (
            "<p>This is our <b>description!</b> &amp;</p><p>nice!</p>",
            "This is our description! &amp;\nnice!",
        ),
        # a block nested directly inside another block produces a leading
        # newline; the current design has no way to avoid that
        (
            "<div><p>This is our <b>description!</b> &amp;</p></div><p>nice!</p>",
            "\nThis is our description! &amp;\nnice!",
        ),
        # lists and their items each get a newline so the text stays readable
        (
            (
                "<div><p>This is our <b>description!</b> &amp;</p><p>1</p>"
                "<ul><li>a</li><li>b</li><li>c</li></ul></div><p>nice!</p>"
            ),
            "\nThis is our description! &amp;\n1\n\na\nb\nc\nnice!",
        ),
    ],
)
def test_strip_respects_block_level_elements(text, expected):
    """Stripping block-level tags leaves a newline where each one started.

    See https://github.com/mozilla/bleach/issues/369
    """
    cleaned = clean(text, tags=[], strip=True)
    assert cleaned == expected


def get_ids_and_tests():
"""Retrieves regression tests from data/ directory
Expand Down

0 comments on commit a14d412

Please sign in to comment.