Skip to content

Commit

Permalink
fix for mozilla#369; simplified redo of mozilla#461 against main
Browse files Browse the repository at this point in the history
block elements are tracked and a newline is inserted when they are stripped.

new tests are included.
  • Loading branch information
jvanasco committed Jul 23, 2021
1 parent ea1849a commit 8c39b9e
Show file tree
Hide file tree
Showing 2 changed files with 73 additions and 0 deletions.
51 changes: 51 additions & 0 deletions bleach/html5lib_shim.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,8 @@
constants.tokenTypes["EndTag"],
constants.tokenTypes["EmptyTag"],
}
# Token-type codes for the individual tag kinds, read from constants.tokenTypes.
TAG_TOKEN_TYPE_START = constants.tokenTypes["StartTag"]
TAG_TOKEN_TYPE_END = constants.tokenTypes["EndTag"]
# Token-type codes for character data and parse errors.
CHARACTERS_TYPE = constants.tokenTypes["Characters"]
PARSEERROR_TYPE = constants.tokenTypes["ParseError"]

Expand Down Expand Up @@ -194,6 +196,45 @@
]


#: Names of block-level HTML elements, copied from MDN's block-level
#: elements reference as it stood on 2019.07.11:
#: https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements#Elements
#: Background: https://github.com/mozilla/bleach/issues/369
#: Consulted by the tokenizer to decide whether a stripped start tag is
#: replaced with a newline rather than an empty string.
HTML_TAGS__BLOCK_LEVEL = [
    "address", "article", "aside",
    "blockquote",
    "details", "dialog", "dd", "div", "dl", "dt",
    "fieldset", "figcaption", "figure", "footer", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "header", "hgroup", "hr",
    "li",
    "main",
    "nav",
    "ol",
    "p", "pre",
    "section",
    "table",
    "ul",
]

class InputStreamWithMemory(object):
"""Wraps an HTMLInputStream to remember characters since last <
Expand Down Expand Up @@ -260,6 +301,9 @@ def start_tag(self):
class BleachHTMLTokenizer(HTMLTokenizer):
"""Tokenizer that doesn't consume character entities"""

# remember the last token emitted, needed for block element spacing
_emittedLastToken = None

def __init__(self, consume_entities=False, **kwargs):
super(BleachHTMLTokenizer, self).__init__(**kwargs)

Expand Down Expand Up @@ -385,6 +429,11 @@ def emitCurrentToken(self):
# If we're stripping the token, we just throw in an empty
# string token.
new_data = ""
if ((self._emittedLastToken and
token['type'] == TAG_TOKEN_TYPE_START and
token['name'].lower() in HTML_TAGS__BLOCK_LEVEL
)):
new_data = '\n'

else:
# If we're escaping the token, we want to escape the exact
Expand All @@ -397,10 +446,12 @@ def emitCurrentToken(self):
new_token = {"type": CHARACTERS_TYPE, "data": new_data}

self.currentToken = new_token
self._emittedLastToken = new_token
self.tokenQueue.append(new_token)
self.state = self.dataState
return

self._emittedLastToken = self.currentToken
super(BleachHTMLTokenizer, self).emitCurrentToken()


Expand Down
22 changes: 22 additions & 0 deletions tests/test_clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -1024,6 +1024,28 @@ def test_html_comments_escaped(namespace_tag, end_tag, eject_tag, data, expected
)


def test_strip_respects_block_level_elements():
    """Stripping block-level tags leaves a newline between their contents.

    https://github.com/mozilla/bleach/issues/369
    """
    cases = [
        # simple example
        (
            "<p>Te<b>st</b>!</p><p>Hello</p>",
            "Test!\nHello",
        ),
        # with an internal space and escaped character, just to be sure
        (
            "<p>This is our <b>description!</b> &amp;</p><p>nice!</p>",
            "This is our description! &amp;\nnice!",
        ),
        # a double-wrap causes an initial newline; this can't really be
        # handled under the current design
        (
            "<div><p>This is our <b>description!</b> &amp;</p></div><p>nice!</p>",
            "\nThis is our description! &amp;\nnice!",
        ),
        # newlines are used to keep lists and other elements readable
        (
            "<div><p>This is our <b>description!</b> &amp;</p><p>1</p>"
            "<ul><li>a</li><li>b</li><li>c</li></ul></div><p>nice!</p>",
            "\nThis is our description! &amp;\n1\n\na\nb\nc\nnice!",
        ),
    ]

    for markup, expected in cases:
        assert clean(markup, tags=[], strip=True) == expected


def get_ids_and_tests():
"""Retrieves regression tests from data/ directory
Expand Down

0 comments on commit 8c39b9e

Please sign in to comment.