Skip to content

Commit

Permalink
Merge pull request #671 from willkg/668-url-tests
Browse files Browse the repository at this point in the history
add urlparse tests based on wpt url tests (#668)
  • Loading branch information
willkg committed Jun 27, 2022
2 parents 5d4725c + 07613ff commit 43e20d2
Show file tree
Hide file tree
Showing 3 changed files with 126 additions and 11 deletions.
1 change: 1 addition & 0 deletions bleach/parse_shim.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from bleach._vendor.parse import urlparse # noqa
25 changes: 14 additions & 11 deletions bleach/sanitizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@
import re
import warnings

from bleach._vendor.parse import urlparse
from xml.sax.saxutils import unescape

from bleach import html5lib_shim
from bleach import parse_shim


#: List of allowed tags
Expand Down Expand Up @@ -449,27 +449,27 @@ def sanitize_uri_value(self, value, allowed_protocols):
:returns: allowed value or None
"""
# NOTE(willkg): This transforms the value into one that's easier to
# match and verify, but shouldn't get returned since it's vastly
# different than the original value.
# NOTE(willkg): This transforms the value into a normalized one that's
# easier to match and verify, but shouldn't get returned since it's
# vastly different than the original value.

# Convert all character entities in the value
new_value = html5lib_shim.convert_entities(value)
normalized_uri = html5lib_shim.convert_entities(value)

# Nix backtick, space characters, and control characters
new_value = re.sub(r"[`\000-\040\177-\240\s]+", "", new_value)
normalized_uri = re.sub(r"[`\000-\040\177-\240\s]+", "", normalized_uri)

# Remove REPLACEMENT characters
new_value = new_value.replace("\ufffd", "")
normalized_uri = normalized_uri.replace("\ufffd", "")

# Lowercase it--this breaks the value, but makes it easier to match
# against
new_value = new_value.lower()
normalized_uri = normalized_uri.lower()

try:
# Drop attributes with uri values that have protocols that aren't
# allowed
parsed = urlparse(new_value)
parsed = parse_shim.urlparse(normalized_uri)
except ValueError:
# URI is impossible to parse, therefore it's not allowed
return None
Expand All @@ -481,11 +481,14 @@ def sanitize_uri_value(self, value, allowed_protocols):

else:
# Allow uris that are just an anchor
if new_value.startswith("#"):
if normalized_uri.startswith("#"):
return value

# Handle protocols that urlparse doesn't recognize like "myprotocol"
if ":" in new_value and new_value.split(":")[0] in allowed_protocols:
if (
":" in normalized_uri
and normalized_uri.split(":")[0] in allowed_protocols
):
return value

# If there's no protocol/scheme specified, then assume it's "http" or
Expand Down
111 changes: 111 additions & 0 deletions tests/test_parse_shim.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
from dataclasses import dataclass
import inspect

import pytest

from bleach.parse_shim import urlparse


@dataclass(frozen=True)
class ParseResult:
    """Expected components of a parsed url, used as test fixture data.

    Field names mirror the attributes the test reads off the object returned
    by ``urlparse``. Every field defaults to the empty string so a test case
    only has to spell out the components it expects to be non-empty.

    Instances are frozen because they are created once at import time inside
    the parametrize list and must not be mutated by a test.
    """

    # Field order is part of the interface: instances may be built positionally.
    scheme: str = ""
    netloc: str = ""
    path: str = ""
    params: str = ""
    query: str = ""
    fragment: str = ""


# Tests from
# https://github.com/web-platform-tests/wpt/blob/master/url/resources/urltestdata.json
# commit ee566de4c5c65d7e8af8b2500f9b85a646ffeaa5


@pytest.mark.parametrize(
    "uri, expected",
    [
        ("", ParseResult()),
        ("http://example\t.\norg", ParseResult(scheme="http", netloc="example.org")),
        (
            "http://user:pass@foo:21/bar;par?b#c",
            ParseResult(
                scheme="http",
                netloc="user:pass@foo:21",
                path="/bar",
                params="par",
                query="b",
                fragment="c",
            ),
        ),
        ("https://test:@test", ParseResult(scheme="https", netloc="test:@test")),
        ("https://:@test", ParseResult(scheme="https", netloc=":@test")),
        (
            "non-special://test:@test/x",
            ParseResult(scheme="non-special", netloc="test:@test", path="/x"),
        ),
        (
            "non-special://:@test/x",
            ParseResult(scheme="non-special", netloc=":@test", path="/x"),
        ),
        ("http:foo.com", ParseResult(scheme="http", path="foo.com")),
        # NOTE(willkg): The wpt tests set the scheme to http because that's what
        # the base url is. Since our parser is not using a base url, it sets the
        # scheme to "". Further, our parser includes spaces at the beginning,
        # but I don't see that as being problematic.
        ("\t :foo.com \n", ParseResult(path=" :foo.com ")),
        # NOTE(willkg): The wpt tests set the path to "/foo/foo.com" because
        # the base url is at "/foo"
        (" foo.com ", ParseResult(path=" foo.com ")),
        ("a:\t foo.com", ParseResult(scheme="a", path=" foo.com")),
        (
            "http://f:21/ b ? d # e ",
            ParseResult(
                scheme="http", netloc="f:21", path="/ b ", query=" d ", fragment=" e "
            ),
        ),
        (
            "lolscheme:x x#x x",
            ParseResult(scheme="lolscheme", path="x x", fragment="x x"),
        ),
        ("http://f:/c", ParseResult(scheme="http", netloc="f:", path="/c")),
        ("http://f:0/c", ParseResult(scheme="http", netloc="f:0", path="/c")),
        # NOTE(willkg): The wpt tests normalize the 0000000000000 to 0 so the
        # netloc should be "f:0".
        (
            "http://f:00000000000000/c",
            ParseResult(scheme="http", netloc="f:00000000000000", path="/c"),
        ),
        # NOTE(willkg): The wpt tests drop the 0000000000000000000 altogether
        # so the netloc should be "f".
        (
            "http://f:00000000000000000000080/c",
            ParseResult(scheme="http", netloc="f:00000000000000000000080", path="/c"),
        ),
        # This is an invalid ipv6 url
        ("http://2001::1]", ValueError),
        # NOTE(willkg): The wpt tests show this as a parse error, but our
        # parser "parses" it.
        ("http://f:b/c", ParseResult(scheme="http", netloc="f:b", path="/c")),
        # NOTE(willkg): The wpt tests show this as a parse error, but our
        # parser "parses" it.
        ("http://f: /c", ParseResult(scheme="http", netloc="f: ", path="/c")),
        # NOTE(willkg): The wpt tests show this as a parse error, but our
        # parser "parses" it.
        ("http://f:999999/c", ParseResult(scheme="http", netloc="f:999999", path="/c")),
    ],
)
def test_urlparse(uri, expected):
    """Check that ``urlparse(uri)`` yields the expected url components.

    :arg uri: the uri string handed to ``urlparse``
    :arg expected: either a ``ParseResult`` holding the expected components,
        or an exception class that ``urlparse`` must raise for this uri
    """
    # Error cases are expressed as an exception class rather than a result.
    if inspect.isclass(expected) and issubclass(expected, BaseException):
        with pytest.raises(expected):
            urlparse(uri)
        return

    parsed = urlparse(uri)

    # Copy the components into a ParseResult so that a single comparison
    # covers all six of them; on a mismatch the assertion reports every
    # component at once instead of stopping at the first differing field.
    actual = ParseResult(
        scheme=parsed.scheme,
        netloc=parsed.netloc,
        path=parsed.path,
        params=parsed.params,
        query=parsed.query,
        fragment=parsed.fragment,
    )
    assert actual == expected

0 comments on commit 43e20d2

Please sign in to comment.