Skip to content

Commit

Permalink
Merge pull request #648 from willkg/633-css
Browse files Browse the repository at this point in the history
switch to tinycss2 (#633)
  • Loading branch information
willkg committed Apr 4, 2022
2 parents d30669b + 0782fd9 commit a1139a5
Show file tree
Hide file tree
Showing 13 changed files with 304 additions and 201 deletions.
8 changes: 8 additions & 0 deletions CHANGES
Expand Up @@ -11,6 +11,14 @@ Version 5.0.0 (In development)

* Drop support for Python 3.6. Thank you, @hugovk! (#629)

* CSS sanitization in style tags is completely different now. If you're using
Bleach ``clean`` to sanitize CSS in style tags, you'll need to update your
code and install the ``css`` extras::

pip install 'bleach[css]'

See the documentation on sanitizing CSS for how to do it. (#633)

**Bug fixes**

* Rework dev dependencies. We no longer have
Expand Down
11 changes: 5 additions & 6 deletions bleach/__init__.py
Expand Up @@ -5,7 +5,6 @@
from bleach.sanitizer import (
ALLOWED_ATTRIBUTES,
ALLOWED_PROTOCOLS,
ALLOWED_STYLES,
ALLOWED_TAGS,
Cleaner,
)
Expand All @@ -24,10 +23,10 @@ def clean(
text,
tags=ALLOWED_TAGS,
attributes=ALLOWED_ATTRIBUTES,
styles=ALLOWED_STYLES,
protocols=ALLOWED_PROTOCOLS,
strip=False,
strip_comments=True,
css_sanitizer=None,
):
"""Clean an HTML fragment of malicious content and return it
Expand Down Expand Up @@ -59,26 +58,26 @@ def clean(
:arg dict attributes: allowed attributes; can be a callable, list or dict;
defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``
:arg list styles: allowed list of css styles; defaults to
``bleach.sanitizer.ALLOWED_STYLES``
:arg list protocols: allowed list of protocols for links; defaults
to ``bleach.sanitizer.ALLOWED_PROTOCOLS``
:arg bool strip: whether or not to strip disallowed elements
:arg bool strip_comments: whether or not to strip HTML comments
:arg CSSSanitizer css_sanitizer: instance with a "sanitize_css" method for
sanitizing style attribute values and style text; defaults to None
:returns: cleaned text as unicode
"""
cleaner = Cleaner(
tags=tags,
attributes=attributes,
styles=styles,
protocols=protocols,
strip=strip,
strip_comments=strip_comments,
css_sanitizer=css_sanitizer,
)
return cleaner.clean(text)

Expand Down
104 changes: 104 additions & 0 deletions bleach/css_sanitizer.py
@@ -0,0 +1,104 @@
import tinycss2


#: CSS property names that ``CSSSanitizer`` allows by default
ALLOWED_CSS_PROPERTIES = frozenset(
    {
        "azimuth",
        "background-color",
        "border-bottom-color",
        "border-collapse",
        "border-color",
        "border-left-color",
        "border-right-color",
        "border-top-color",
        "clear",
        "color",
        "cursor",
        "direction",
        "display",
        "elevation",
        "float",
        "font",
        "font-family",
        "font-size",
        "font-style",
        "font-variant",
        "font-weight",
        "height",
        "letter-spacing",
        "line-height",
        "overflow",
        "pause",
        "pause-after",
        "pause-before",
        "pitch",
        "pitch-range",
        "richness",
        "speak",
        "speak-header",
        "speak-numeral",
        "speak-punctuation",
        "speech-rate",
        "stress",
        "text-align",
        "text-decoration",
        "text-indent",
        "unicode-bidi",
        "vertical-align",
        "voice-family",
        "volume",
        "white-space",
        "width",
    }
)


#: SVG property names that ``CSSSanitizer`` allows by default
ALLOWED_SVG_PROPERTIES = frozenset(
    {
        "fill",
        "fill-opacity",
        "fill-rule",
        "stroke",
        "stroke-width",
        "stroke-linecap",
        "stroke-linejoin",
        "stroke-opacity",
    }
)


class CSSSanitizer:
    """Sanitizes CSS declaration lists against allowlists of property names"""

    def __init__(
        self,
        allowed_css_properties=ALLOWED_CSS_PROPERTIES,
        allowed_svg_properties=ALLOWED_SVG_PROPERTIES,
    ):
        """Creates a CSSSanitizer instance

        :arg allowed_css_properties: collection of allowed css property
            names; defaults to ``ALLOWED_CSS_PROPERTIES``

        :arg allowed_svg_properties: collection of allowed svg property
            names; defaults to ``ALLOWED_SVG_PROPERTIES``

        """
        self.allowed_svg_properties = allowed_svg_properties
        self.allowed_css_properties = allowed_css_properties

    def sanitize_css(self, style):
        """Sanitizes css in style tags

        :arg style: css declaration list text to sanitize

        :returns: the serialized, stripped text of the declarations whose
            lowercased property name is allowed; the empty string when
            nothing is kept

        """
        nodes = tinycss2.parse_declaration_list(style)

        kept = []
        for node in nodes:
            kind = node.type

            if kind == "declaration":
                # Only declarations for allowed properties survive
                prop = node.lower_name
                if (
                    prop in self.allowed_css_properties
                    or prop in self.allowed_svg_properties
                ):
                    kept.append(node)
                continue

            # NOTE: anything that is not a declaration, comment or whitespace
            # (e.g. AtRule or ParseError) is not handled and is silently
            # thrown out
            if kind not in ("comment", "whitespace"):
                continue

            # Comments and whitespace are kept only after something else has
            # been kept and only if the previously kept node is of a
            # different type
            if kept and kept[-1].type != kind:
                kept.append(node)

        return tinycss2.serialize(kept).strip() if kept else ""
2 changes: 2 additions & 0 deletions bleach/html5lib_shim.py
Expand Up @@ -36,6 +36,8 @@
) # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib.filters.sanitizer import (
allowed_protocols,
allowed_css_properties,
allowed_svg_properties,
) # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib.filters.sanitizer import (
Filter as SanitizerFilter,
Expand Down
92 changes: 31 additions & 61 deletions bleach/sanitizer.py
Expand Up @@ -32,9 +32,6 @@
"acronym": ["title"],
}

#: List of allowed styles
ALLOWED_STYLES = []

#: List of allowed protocols
ALLOWED_PROTOCOLS = ["http", "https", "mailto"]

Expand Down Expand Up @@ -84,11 +81,11 @@ def __init__(
self,
tags=ALLOWED_TAGS,
attributes=ALLOWED_ATTRIBUTES,
styles=ALLOWED_STYLES,
protocols=ALLOWED_PROTOCOLS,
strip=False,
strip_comments=True,
filters=None,
css_sanitizer=None,
):
"""Initializes a Cleaner
Expand All @@ -98,9 +95,6 @@ def __init__(
:arg dict attributes: allowed attributes; can be a callable, list or dict;
defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``
:arg list styles: allowed list of css styles; defaults to
``bleach.sanitizer.ALLOWED_STYLES``
:arg list protocols: allowed list of protocols for links; defaults
to ``bleach.sanitizer.ALLOWED_PROTOCOLS``
Expand All @@ -117,14 +111,17 @@ def __init__(
Using filters changes the output of ``bleach.Cleaner.clean``.
Make sure the way the filters change the output are secure.
:arg CSSSanitizer css_sanitizer: instance with a "sanitize_css" method for
sanitizing style attribute values and style text; defaults to None
"""
self.tags = tags
self.attributes = attributes
self.styles = styles
self.protocols = protocols
self.strip = strip
self.strip_comments = strip_comments
self.filters = filters or []
self.css_sanitizer = css_sanitizer

self.parser = html5lib_shim.BleachHTMLParser(
tags=self.tags,
Expand Down Expand Up @@ -174,11 +171,10 @@ def clean(self, text):
attributes=self.attributes,
strip_disallowed_elements=self.strip,
strip_html_comments=self.strip_comments,
css_sanitizer=self.css_sanitizer,
# html5lib-sanitizer things
allowed_elements=self.tags,
allowed_css_properties=self.styles,
allowed_protocols=self.protocols,
allowed_svg_properties=[],
)

# Apply any filters after the BleachSanitizerFilter
Expand Down Expand Up @@ -241,36 +237,40 @@ class BleachSanitizerFilter(html5lib_shim.SanitizerFilter):
def __init__(
self,
source,
allowed_elements=ALLOWED_TAGS,
attributes=ALLOWED_ATTRIBUTES,
allowed_protocols=ALLOWED_PROTOCOLS,
strip_disallowed_elements=False,
strip_html_comments=True,
css_sanitizer=None,
**kwargs,
):
"""Creates a BleachSanitizerFilter instance
:arg Treewalker source: stream
:arg list tags: allowed list of tags; defaults to
:arg list allowed_elements: allowed list of tags; defaults to
``bleach.sanitizer.ALLOWED_TAGS``
:arg dict attributes: allowed attributes; can be a callable, list or dict;
defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``
:arg list styles: allowed list of css styles; defaults to
``bleach.sanitizer.ALLOWED_STYLES``
:arg list protocols: allowed list of protocols for links; defaults
:arg list allowed_protocols: allowed list of protocols for links; defaults
to ``bleach.sanitizer.ALLOWED_PROTOCOLS``
:arg bool strip_disallowed_elements: whether or not to strip disallowed
elements
:arg bool strip_html_comments: whether or not to strip HTML comments
:arg CSSSanitizer css_sanitizer: instance with a "sanitize_css" method for
sanitizing style attribute values and style text; defaults to None
"""
self.attr_filter = attribute_filter_factory(attributes)
self.strip_disallowed_elements = strip_disallowed_elements
self.strip_html_comments = strip_html_comments
self.css_sanitizer = css_sanitizer

# filter out html5lib deprecation warnings to use bleach from BleachSanitizerFilter init
warnings.filterwarnings(
Expand All @@ -279,7 +279,12 @@ def __init__(
category=DeprecationWarning,
module="bleach._vendor.html5lib",
)
return super().__init__(source, **kwargs)
return super().__init__(
source,
allowed_elements=allowed_elements,
allowed_protocols=allowed_protocols,
**kwargs,
)

def sanitize_stream(self, token_iterator):
for token in token_iterator:
Expand Down Expand Up @@ -541,7 +546,16 @@ def allow_token(self, token):

# If it's a style attribute, sanitize it
if namespaced_name == (None, "style"):
val = self.sanitize_css(val)
if self.css_sanitizer:
val = self.css_sanitizer.sanitize_css(val)
else:
# FIXME(willkg): if style is allowed, but no
# css_sanitizer was set up, then this is probably a
# mistake and we should raise an error here
#
# For now, we're going to set the value to "" because
# there was no sanitizer set
val = ""

# At this point, we want to keep the attribute, so add it in
attrs[namespaced_name] = val
Expand Down Expand Up @@ -593,47 +607,3 @@ def disallowed_token(self, token):

del token["name"]
return token

def sanitize_css(self, style):
    """Sanitizes css in style tags

    Converts entities in the text, removes ``url(...)`` values and then runs
    the result through a series of regular expression checks. If any check
    fails, the whole value is dropped.

    :arg style: the css text to sanitize

    :returns: ``prop: value;`` items joined with single spaces for every
        property whose lowercased name is in ``self.allowed_css_properties``
        or ``self.allowed_svg_properties``; the empty string if the text
        fails validation or nothing is kept

    """
    # Convert entities in the style so that it can be parsed as CSS
    style = html5lib_shim.convert_entities(style)

    # Drop any url values before we do anything else
    style = re.compile(r"url\s*\(\s*[^\s)]+?\s*\)\s*").sub(" ", style)

    # The gauntlet of sanitization

    # Validate the css in the style tag and if it's not valid, then drop
    # the whole thing.
    parts = style.split(";")
    gauntlet = re.compile(
        r"""^( # consider a style attribute value as composed of:
[/:,#%!.\s\w] # a non-newline character
|\w-\w # 3 characters in the form \w-\w
|'[\s\w]+'\s* # a single quoted string of [\s\w]+ with trailing space
|"[\s\w]+" # a double quoted string of [\s\w]+
|\([\d,%\.\s]+\) # a parenthesized string of one or more digits, commas, periods, ...
)*$""", # ... percent signs, or whitespace e.g. from 'color: hsl(30,100%,50%)'
        flags=re.U | re.VERBOSE,
    )

    # Every ";"-separated part must consist solely of the pieces the
    # gauntlet allows; a single failing part rejects the entire value
    for part in parts:
        if not gauntlet.match(part):
            return ""

    # The text as a whole must be a sequence of "name: value" items, each
    # terminated by ";" or the end of the text
    if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
        return ""

    clean = []
    for prop, value in re.findall(r"([-\w]+)\s*:\s*([^:;]*)", style):
        # Skip properties that have no value
        if not value:
            continue

        # The lookup is on the lowercased name, but the property is emitted
        # as it was written
        if prop.lower() in self.allowed_css_properties:
            clean.append(prop + ": " + value + ";")

        elif prop.lower() in self.allowed_svg_properties:
            clean.append(prop + ": " + value + ";")

    return " ".join(clean)

0 comments on commit a1139a5

Please sign in to comment.