fix for issue mozilla#369

jvanasco · Jul 11, 2019 · 46c7334 · 46c7334
1 parent 47fe22e
commit 46c7334
Show file tree

Hide file tree

Showing 2 changed files with 62 additions and 3 deletions.
diff --git a/bleach/html5lib_shim.py b/bleach/html5lib_shim.py
@@ -42,6 +42,7 @@
     constants.tokenTypes['EndTag'],
     constants.tokenTypes['EmptyTag']
 }
+TAG_TOKEN_TYPE_START = constants.tokenTypes['StartTag']
 CHARACTERS_TYPE = constants.tokenTypes['Characters']
 PARSEERROR_TYPE = constants.tokenTypes['ParseError']
 
@@ -164,6 +165,45 @@
 ]
 
 
+#: List of block-level HTML tags, taken from Mozilla's docs on 2019-07-11, as per https://github.com/mozilla/bleach/issues/369
+#: https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements#Elements
+HTML_TAGS__BLOCK_LEVEL = [
+    'address',
+    'article',
+    'aside',
+    'blockquote',
+    'details',
+    'dialog',
+    'dd',
+    'div',
+    'dl',
+    'dt',
+    'fieldset',
+    'figcaption',
+    'figure',
+    'footer',
+    'form',
+    'h1',
+    'h2',
+    'h3',
+    'h4',
+    'h5',
+    'h6',
+    'header',
+    'hgroup',
+    'hr',
+    'li',
+    'main',
+    'nav',
+    'ol',
+    'p',
+    'pre',
+    'section',
+    'table',
+    'ul',
+]
+
+
 class InputStreamWithMemory(object):
     """Wraps an HTMLInputStream to remember characters since last <
 
@@ -236,6 +276,9 @@ def __init__(self, consume_entities=False, **kwargs):
         # Wrap the stream with one that remembers the history
         self.stream = InputStreamWithMemory(self.stream)
 
+    # Remember the last token emitted so that we don't insert redundant spaces.
+    _emittedLastToken = None
+
     def __iter__(self):
         last_error_token = None
 
@@ -335,9 +378,15 @@ def emitCurrentToken(self):
             # cases it gets converted to a Characters token.
             if self.parser.strip:
                 # If we're stripping the token, we just throw in an empty
-                # string token.
+                # string token
                 new_data = ''
-
+                if ((self._emittedLastToken and
+                     token['type'] == TAG_TOKEN_TYPE_START and
+                     token['name'].lower() in HTML_TAGS__BLOCK_LEVEL and
+                     not self._emittedLastToken.get('data', '').endswith(' '))):
+                    # However, if this is the start tag of a block-level element,
+                    # insert a single space instead, for accessibility.
+                    new_data = ' '
             else:
                 # If we're escaping the token, we want to escape the exact
                 # original string. Since tokenizing also normalizes data
@@ -351,11 +400,12 @@ def emitCurrentToken(self):
                 'data': new_data
             }
 
-            self.currentToken = new_token
+            self.currentToken = self._emittedLastToken = new_token
             self.tokenQueue.append(new_token)
             self.state = self.dataState
             return
 
+        self._emittedLastToken = self.currentToken
         super(BleachHTMLTokenizer, self).emitCurrentToken()
 
 

diff --git a/tests/test_clean.py b/tests/test_clean.py
@@ -848,3 +848,12 @@ def __iter__(self):
             cleaner.clean(dirty) ==
             'this is cute! <img rel="moo" src="moo">'
         )
+
+
+def test_strip_respects_block_level_elements():
+    """
+    Stripped block-level elements should be separated by at least one space.
+    https://github.com/mozilla/bleach/issues/369
+    """
+    text = '<p>Te<b>st</b>!</p><p>Hello</p>'
+    assert clean(text, tags=[], strip=True) == 'Test! Hello'