fix for mozilla#369; simplified redo of mozilla#461 against main

Block-level elements are tracked, and a newline is inserted in place of a stripped block-level start tag. New tests are included.
jvanasco · Jul 23, 2021 · 8c39b9e · 8c39b9e
1 parent ea1849a
commit 8c39b9e
Show file tree

Hide file tree

Showing 2 changed files with 73 additions and 0 deletions.
diff --git a/bleach/html5lib_shim.py b/bleach/html5lib_shim.py
@@ -72,6 +72,8 @@
     constants.tokenTypes["EndTag"],
     constants.tokenTypes["EmptyTag"],
 }
+TAG_TOKEN_TYPE_START = constants.tokenTypes['StartTag']
+TAG_TOKEN_TYPE_END = constants.tokenTypes['EndTag']
 CHARACTERS_TYPE = constants.tokenTypes["Characters"]
 PARSEERROR_TYPE = constants.tokenTypes["ParseError"]
 
@@ -194,6 +196,45 @@
 ]
 
 
+#: List of block level HTML tags, as per https://github.com/mozilla/bleach/issues/369
+#: from mozilla on 2019.07.11
+#: https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements#Elements
+HTML_TAGS__BLOCK_LEVEL = [
+    'address',
+    'article',
+    'aside',
+    'blockquote',
+    'details',
+    'dialog',
+    'dd',
+    'div',
+    'dl',
+    'dt',
+    'fieldset',
+    'figcaption',
+    'figure',
+    'footer',
+    'form',
+    'h1',
+    'h2',
+    'h3',
+    'h4',
+    'h5',
+    'h6',
+    'header',
+    'hgroup',
+    'hr',
+    'li',
+    'main',
+    'nav',
+    'ol',
+    'p',
+    'pre',
+    'section',
+    'table',
+    'ul',
+]
+
 class InputStreamWithMemory(object):
     """Wraps an HTMLInputStream to remember characters since last <
 
@@ -260,6 +301,9 @@ def start_tag(self):
 class BleachHTMLTokenizer(HTMLTokenizer):
     """Tokenizer that doesn't consume character entities"""
 
+    # remember the last token emitted, needed for block element spacing
+    _emittedLastToken = None
+
     def __init__(self, consume_entities=False, **kwargs):
         super(BleachHTMLTokenizer, self).__init__(**kwargs)
 
@@ -385,6 +429,11 @@ def emitCurrentToken(self):
                 # If we're stripping the token, we just throw in an empty
                 # string token.
                 new_data = ""
+                if ((self._emittedLastToken and
+                     token['type'] == TAG_TOKEN_TYPE_START and
+                     token['name'].lower() in HTML_TAGS__BLOCK_LEVEL
+                     )):
+                    new_data = '\n'
 
             else:
                 # If we're escaping the token, we want to escape the exact
@@ -397,10 +446,12 @@ def emitCurrentToken(self):
             new_token = {"type": CHARACTERS_TYPE, "data": new_data}
 
             self.currentToken = new_token
+            self._emittedLastToken = new_token
             self.tokenQueue.append(new_token)
             self.state = self.dataState
             return
 
+        self._emittedLastToken = self.currentToken
         super(BleachHTMLTokenizer, self).emitCurrentToken()
 
 

diff --git a/tests/test_clean.py b/tests/test_clean.py
@@ -1024,6 +1024,28 @@ def test_html_comments_escaped(namespace_tag, end_tag, eject_tag, data, expected
     )
 
 
+def test_strip_respects_block_level_elements():
+    """
+    Insert a newline between block level elements
+    https://github.com/mozilla/bleach/issues/369
+    """
+    # simple example
+    text = '<p>Te<b>st</b>!</p><p>Hello</p>'
+    assert clean(text, tags=[], strip=True) == 'Test!\nHello'
+
+    # with an internal space and escaped character, just to be sure
+    text = '<p>This is our <b>description!</b> &amp;</p><p>nice!</p>'
+    assert clean(text, tags=[], strip=True) == 'This is our description! &amp;\nnice!'
+
+    # a double-wrap causes an initial newline. this can't really be handled under the current design
+    text = '<div><p>This is our <b>description!</b> &amp;</p></div><p>nice!</p>'
+    assert clean(text, tags=[], strip=True) == '\nThis is our description! &amp;\nnice!'
+
+    # newlines are used to keep lists and other elements readable
+    text = '<div><p>This is our <b>description!</b> &amp;</p><p>1</p><ul><li>a</li><li>b</li><li>c</li></ul></div><p>nice!</p>'
+    assert clean(text, tags=[], strip=True) == '\nThis is our description! &amp;\n1\n\na\nb\nc\nnice!'
+
+
 def get_ids_and_tests():
     """Retrieves regression tests from data/ directory