Merge pull request #671 from willkg/668-url-tests

add urlparse tests based on wpt url tests (#668)
mozilla · Jun 27, 2022 · 43e20d2 · 43e20d2
2 parents 5d4725c + 07613ff
commit 43e20d2
Show file tree

Hide file tree

Showing 3 changed files with 126 additions and 11 deletions.
diff --git a/bleach/parse_shim.py b/bleach/parse_shim.py
@@ -0,0 +1 @@
+from bleach._vendor.parse import urlparse  # noqa
diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py
@@ -2,10 +2,10 @@
 import re
 import warnings
 
-from bleach._vendor.parse import urlparse
 from xml.sax.saxutils import unescape
 
 from bleach import html5lib_shim
+from bleach import parse_shim
 
 
 #: List of allowed tags
@@ -449,27 +449,27 @@ def sanitize_uri_value(self, value, allowed_protocols):
         :returns: allowed value or None
 
         """
-        # NOTE(willkg): This transforms the value into one that's easier to
-        # match and verify, but shouldn't get returned since it's vastly
-        # different than the original value.
+        # NOTE(willkg): This transforms the value into a normalized one that's
+        # easier to match and verify, but shouldn't get returned since it's
+        # vastly different than the original value.
 
         # Convert all character entities in the value
-        new_value = html5lib_shim.convert_entities(value)
+        normalized_uri = html5lib_shim.convert_entities(value)
 
         # Nix backtick, space characters, and control characters
-        new_value = re.sub(r"[`\000-\040\177-\240\s]+", "", new_value)
+        normalized_uri = re.sub(r"[`\000-\040\177-\240\s]+", "", normalized_uri)
 
         # Remove REPLACEMENT characters
-        new_value = new_value.replace("\ufffd", "")
+        normalized_uri = normalized_uri.replace("\ufffd", "")
 
         # Lowercase it--this breaks the value, but makes it easier to match
         # against
-        new_value = new_value.lower()
+        normalized_uri = normalized_uri.lower()
 
         try:
             # Drop attributes with uri values that have protocols that aren't
             # allowed
-            parsed = urlparse(new_value)
+            parsed = parse_shim.urlparse(normalized_uri)
         except ValueError:
             # URI is impossible to parse, therefore it's not allowed
             return None
@@ -481,11 +481,14 @@ def sanitize_uri_value(self, value, allowed_protocols):
 
         else:
             # Allow uris that are just an anchor
-            if new_value.startswith("#"):
+            if normalized_uri.startswith("#"):
                 return value
 
             # Handle protocols that urlparse doesn't recognize like "myprotocol"
-            if ":" in new_value and new_value.split(":")[0] in allowed_protocols:
+            if (
+                ":" in normalized_uri
+                and normalized_uri.split(":")[0] in allowed_protocols
+            ):
                 return value
 
             # If there's no protocol/scheme specified, then assume it's "http" or

diff --git a/tests/test_parse_shim.py b/tests/test_parse_shim.py
@@ -0,0 +1,111 @@
+from dataclasses import dataclass
+import inspect
+
+import pytest
+
+from bleach.parse_shim import urlparse
+
+
+@dataclass
+class ParseResult:
+    scheme: str = ""
+    netloc: str = ""
+    path: str = ""
+    params: str = ""
+    query: str = ""
+    fragment: str = ""
+
+
+# Tests from
+# https://github.com/web-platform-tests/wpt/blob/master/url/resources/urltestdata.json
+# commit ee566de4c5c65d7e8af8b2500f9b85a646ffeaa5
+
+
+@pytest.mark.parametrize(
+    "uri, expected",
+    [
+        ("", ParseResult()),
+        ("http://example\t.\norg", ParseResult(scheme="http", netloc="example.org")),
+        (
+            "http://user:pass@foo:21/bar;par?b#c",
+            ParseResult(
+                scheme="http",
+                netloc="user:pass@foo:21",
+                path="/bar",
+                params="par",
+                query="b",
+                fragment="c",
+            ),
+        ),
+        ("https://test:@test", ParseResult(scheme="https", netloc="test:@test")),
+        ("https://:@test", ParseResult(scheme="https", netloc=":@test")),
+        (
+            "non-special://test:@test/x",
+            ParseResult(scheme="non-special", netloc="test:@test", path="/x"),
+        ),
+        (
+            "non-special://:@test/x",
+            ParseResult(scheme="non-special", netloc=":@test", path="/x"),
+        ),
+        ("http:foo.com", ParseResult(scheme="http", path="foo.com")),
+        # NOTE(willkg): The wpt tests set the scheme to http because that's what
+        # the base url is. Since our parser is not using a baseurl, it sets the
+        # scheme to "". Further, our parser includes spaces at the beginning,
+        # but I don't see that as being problematic.
+        ("\t   :foo.com   \n", ParseResult(path="   :foo.com   ")),
+        # NOTE(willkg): The wpt tests set the path to "/foo/foo.com" because
+        # the base url is at "/foo"
+        (" foo.com  ", ParseResult(path=" foo.com  ")),
+        ("a:\t foo.com", ParseResult(scheme="a", path=" foo.com")),
+        (
+            "http://f:21/ b ? d # e ",
+            ParseResult(
+                scheme="http", netloc="f:21", path="/ b ", query=" d ", fragment=" e "
+            ),
+        ),
+        (
+            "lolscheme:x x#x x",
+            ParseResult(scheme="lolscheme", path="x x", fragment="x x"),
+        ),
+        ("http://f:/c", ParseResult(scheme="http", netloc="f:", path="/c")),
+        ("http://f:0/c", ParseResult(scheme="http", netloc="f:0", path="/c")),
+        # NOTE(willkg): The wpt tests normalize the 0000000000000 to 0 so the
+        # netloc should be "f:0".
+        (
+            "http://f:00000000000000/c",
+            ParseResult(scheme="http", netloc="f:00000000000000", path="/c"),
+        ),
+        # NOTE(willkg): The wpt tests normalize the port to 80, which is the
+        # default port for http and is therefore dropped altogether, so the
+        # netloc should be "f".
+        (
+            "http://f:00000000000000000000080/c",
+            ParseResult(scheme="http", netloc="f:00000000000000000000080", path="/c"),
+        ),
+        # This is an invalid ipv6 url
+        ("http://2001::1]", ValueError),
+        # NOTE(willkg): The wpt tests show this as a parse error, but our
+        # parser "parses" it.
+        ("http://f:b/c", ParseResult(scheme="http", netloc="f:b", path="/c")),
+        # NOTE(willkg): The wpt tests show this as a parse error, but our
+        # parser "parses" it.
+        ("http://f: /c", ParseResult(scheme="http", netloc="f: ", path="/c")),
+        # NOTE(willkg): The wpt tests show this as a parse error, but our
+        # parser "parses" it.
+        ("http://f:999999/c", ParseResult(scheme="http", netloc="f:999999", path="/c")),
+    ],
+)
+def test_urlparse(uri, expected):
+
+    if inspect.isclass(expected) and issubclass(expected, BaseException):
+        with pytest.raises(expected):
+            urlparse(uri)
+
+    else:
+        parsed = urlparse(uri)
+        print(parsed)
+        assert parsed.scheme == expected.scheme
+        assert parsed.netloc == expected.netloc
+        assert parsed.path == expected.path
+        assert parsed.params == expected.params
+        assert parsed.query == expected.query
+        assert parsed.fragment == expected.fragment