Skip to content

Commit

Permalink
fix for issue mozilla#369
Browse files Browse the repository at this point in the history
  • Loading branch information
jvanasco committed Jul 11, 2019
1 parent 47fe22e commit 46c7334
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 3 deletions.
56 changes: 53 additions & 3 deletions bleach/html5lib_shim.py
Expand Up @@ -42,6 +42,7 @@
constants.tokenTypes['EndTag'],
constants.tokenTypes['EmptyTag']
}
TAG_TOKEN_TYPE_START = constants.tokenTypes['StartTag']
CHARACTERS_TYPE = constants.tokenTypes['Characters']
PARSEERROR_TYPE = constants.tokenTypes['ParseError']

Expand Down Expand Up @@ -164,6 +165,45 @@
]


#: Block-level HTML tag names, copied from MDN on 2019.07.11 for
#: https://github.com/mozilla/bleach/issues/369
#: Source: https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements#Elements
#: The tokenizer checks stripped start tags against this list to decide
#: whether a separating space should be emitted in their place.
HTML_TAGS__BLOCK_LEVEL = [
    'address', 'article', 'aside', 'blockquote',
    'details', 'dialog', 'dd', 'div', 'dl', 'dt',
    'fieldset', 'figcaption', 'figure', 'footer', 'form',
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
    'header', 'hgroup', 'hr',
    'li', 'main', 'nav', 'ol',
    'p', 'pre', 'section', 'table', 'ul',
]


class InputStreamWithMemory(object):
"""Wraps an HTMLInputStream to remember characters since last <
Expand Down Expand Up @@ -236,6 +276,9 @@ def __init__(self, consume_entities=False, **kwargs):
# Wrap the stream with one that remembers the history
self.stream = InputStreamWithMemory(self.stream)

# Remember the last token emitted so that we don't insert redundant spaces.
_emittedLastToken = None

def __iter__(self):
last_error_token = None

Expand Down Expand Up @@ -335,9 +378,15 @@ def emitCurrentToken(self):
# cases it gets converted to a Characters token.
if self.parser.strip:
# If we're stripping the token, we just throw in an empty
# string token.
# string token
new_data = ''

if ((self._emittedLastToken and
token['type'] == TAG_TOKEN_TYPE_START and
token['name'].lower() in HTML_TAGS__BLOCK_LEVEL and
not self._emittedLastToken.get('data', '').endswith(' '))):
# BUT, if this is the START of a block level tag, then we
# want to insert a space for accessibility.
new_data = ' '
else:
# If we're escaping the token, we want to escape the exact
# original string. Since tokenizing also normalizes data
Expand All @@ -351,11 +400,12 @@ def emitCurrentToken(self):
'data': new_data
}

self.currentToken = new_token
self.currentToken = self._emittedLastToken = new_token
self.tokenQueue.append(new_token)
self.state = self.dataState
return

self._emittedLastToken = self.currentToken
super(BleachHTMLTokenizer, self).emitCurrentToken()


Expand Down
9 changes: 9 additions & 0 deletions tests/test_clean.py
Expand Up @@ -848,3 +848,12 @@ def __iter__(self):
cleaner.clean(dirty) ==
'this is cute! <img rel="moo" src="moo">'
)


def test_strip_respects_block_level_elements():
    """Stripping every tag must still leave a space between block-level elements.

    Regression test for https://github.com/mozilla/bleach/issues/369
    """
    dirty = '<p>Te<b>st</b>!</p><p>Hello</p>'
    cleaned = clean(dirty, tags=[], strip=True)
    # The inline <b> is removed without a gap; the second <p> yields one space.
    assert cleaned == 'Test! Hello'

0 comments on commit 46c7334

Please sign in to comment.