Merge pull request #651 from willkg/369-whitespace

fix stripping block-level tags (#369)
mozilla · Apr 7, 2022 · a14d412 · a14d412
2 parents a1139a5 + 8a6f2bb
commit a14d412
Show file tree

Hide file tree

Showing 3 changed files with 107 additions and 11 deletions.
diff --git a/CHANGES b/CHANGES
@@ -28,6 +28,8 @@ Version 5.0.0 (In development)
   See `development docs <https://bleach.readthedocs.io/en/latest/dev.html>`_
   for more details. (#620)
 
+* Add newline when dropping block-level tags. Thank you, @jvanasco! (#369)
+
 Version 4.1.0 (August 25th, 2021)
 ---------------------------------
 

diff --git a/bleach/html5lib_shim.py b/bleach/html5lib_shim.py
@@ -70,8 +70,10 @@
     constants.tokenTypes["EndTag"],
     constants.tokenTypes["EmptyTag"],
 }
-CHARACTERS_TYPE = constants.tokenTypes["Characters"]
-PARSEERROR_TYPE = constants.tokenTypes["ParseError"]
+TAG_TOKEN_TYPE_START = constants.tokenTypes["StartTag"]
+TAG_TOKEN_TYPE_END = constants.tokenTypes["EndTag"]
+TAG_TOKEN_TYPE_CHARACTERS = constants.tokenTypes["Characters"]
+TAG_TOKEN_TYPE_PARSEERROR = constants.tokenTypes["ParseError"]
 
 
 #: List of valid HTML tags, from WHATWG HTML Living Standard as of 2018-10-17
@@ -192,6 +194,48 @@
 ]
 
 
+#: Set of block-level HTML tags (see https://github.com/mozilla/bleach/issues/369),
+#: taken from the MDN list of block-level elements on 2019-07-11:
+#: https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements#Elements
+HTML_TAGS_BLOCK_LEVEL = frozenset(
+    [
+        "address",
+        "article",
+        "aside",
+        "blockquote",
+        "details",
+        "dialog",
+        "dd",
+        "div",
+        "dl",
+        "dt",
+        "fieldset",
+        "figcaption",
+        "figure",
+        "footer",
+        "form",
+        "h1",
+        "h2",
+        "h3",
+        "h4",
+        "h5",
+        "h6",
+        "header",
+        "hgroup",
+        "hr",
+        "li",
+        "main",
+        "nav",
+        "ol",
+        "p",
+        "pre",
+        "section",
+        "table",
+        "ul",
+    ]
+)
+
+
 class InputStreamWithMemory:
     """Wraps an HTMLInputStream to remember characters since last <
 
@@ -266,6 +310,9 @@ def __init__(self, consume_entities=False, **kwargs):
         # Wrap the stream with one that remembers the history
         self.stream = InputStreamWithMemory(self.stream)
 
+        # Remember the last token emitted; needed for block element spacing
+        self.emitted_last_token = None
+
     def __iter__(self):
         last_error_token = None
 
@@ -311,12 +358,12 @@ def __iter__(self):
                     # If this is not an allowed tag, then we convert it to
                     # characters and it'll get escaped in the sanitizer.
                     token["data"] = self.stream.get_tag()
-                    token["type"] = CHARACTERS_TYPE
+                    token["type"] = TAG_TOKEN_TYPE_CHARACTERS
 
                     last_error_token = None
                     yield token
 
-                elif token["type"] == PARSEERROR_TYPE:
+                elif token["type"] == TAG_TOKEN_TYPE_PARSEERROR:
                     # If the token is a parse error, then let the last_error_token
                     # go, and make token the new last_error_token
                     yield last_error_token
@@ -331,7 +378,7 @@ def __iter__(self):
 
             # If the token is a ParseError, we hold on to it so we can get the
             # next token and potentially fix it.
-            if token["type"] == PARSEERROR_TYPE:
+            if token["type"] == TAG_TOKEN_TYPE_PARSEERROR:
                 last_error_token = token
                 continue
 
@@ -356,7 +403,7 @@ def consumeEntity(self, allowedChar=None, fromAttribute=False):
             self.currentToken["data"][-1][1] += "&"
 
         else:
-            self.tokenQueue.append({"type": CHARACTERS_TYPE, "data": "&"})
+            self.tokenQueue.append({"type": TAG_TOKEN_TYPE_CHARACTERS, "data": "&"})
 
     def tagOpenState(self):
         # This state marks a < that is either a StartTag, EndTag, EmptyTag,
@@ -378,9 +425,19 @@ def emitCurrentToken(self):
             # allowed list, then it gets stripped or escaped. In both of these
             # cases it gets converted to a Characters token.
             if self.parser.strip:
-                # If we're stripping the token, we just throw in an empty
-                # string token.
-                new_data = ""
+                if (
+                    self.emitted_last_token
+                    and token["type"] == TAG_TOKEN_TYPE_START
+                    and token["name"].lower() in HTML_TAGS_BLOCK_LEVEL
+                ):
+                    # If we're stripping a block-level start tag, replace it
+                    # with a newline, since a browser would render the block
+                    # on its own line
+                    new_data = "\n"
+                else:
+                    # For all other things being stripped, we throw in an empty
+                    # string token
+                    new_data = ""
 
             else:
                 # If we're escaping the token, we want to escape the exact
@@ -390,13 +447,14 @@ def emitCurrentToken(self):
                 # string and use that.
                 new_data = self.stream.get_tag()
 
-            new_token = {"type": CHARACTERS_TYPE, "data": new_data}
+            new_token = {"type": TAG_TOKEN_TYPE_CHARACTERS, "data": new_data}
 
-            self.currentToken = new_token
+            self.currentToken = self.emitted_last_token = new_token
             self.tokenQueue.append(new_token)
             self.state = self.dataState
             return
 
+        self.emitted_last_token = self.currentToken
         super().emitCurrentToken()
 
 

diff --git a/tests/test_clean.py b/tests/test_clean.py
@@ -1070,6 +1070,42 @@ def test_html_comments_escaped(namespace_tag, end_tag, eject_tag, data, expected
     )
 
 
+@pytest.mark.parametrize(
+    "text, expected",
+    [
+        (
+            "<p>Te<b>st</b>!</p><p>Hello</p>",
+            "Test!\nHello",
+        ),
+        (
+            # with an internal space and escaped character
+            "<p>This is our <b>description!</b> &amp;</p><p>nice!</p>",
+            "This is our description! &amp;\nnice!",
+        ),
+        (
+            # note: double-wrap causes an initial newline--this can't really be
+            # handled under the current design
+            "<div><p>This is our <b>description!</b> &amp;</p></div><p>nice!</p>",
+            "\nThis is our description! &amp;\nnice!",
+        ),
+        (
+            # newlines are used to keep lists and other elements readable
+            (
+                "<div><p>This is our <b>description!</b> &amp;</p><p>1</p>"
+                "<ul><li>a</li><li>b</li><li>c</li></ul></div><p>nice!</p>"
+            ),
+            "\nThis is our description! &amp;\n1\n\na\nb\nc\nnice!",
+        ),
+    ],
+)
+def test_strip_respects_block_level_elements(text, expected):
+    """
+    Insert a newline between block level elements
+    https://github.com/mozilla/bleach/issues/369
+    """
+    assert clean(text, tags=[], strip=True) == expected
+
+
 def get_ids_and_tests():
     """Retrieves regression tests from data/ directory