Implement RFC 3986 URL parsing (urllib3#1487)

speedplane · Dec 7, 2018 · 0aa3e24 · 0aa3e24
1 parent 5163354
commit 0aa3e24
Show file tree

Hide file tree

Showing 16 changed files with 2,462 additions and 109 deletions.
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -4,7 +4,9 @@ Changes
 dev (master)
 ------------
 
-* Implemented a more efficient ``HTTPResponse.__iter__()`` method (Issue #1483)
+* Implemented a more efficient ``HTTPResponse.__iter__()`` method. (Issue #1483)
+
+* Upgraded ``urllib3.util.parse_url()`` to be RFC 3986 compliant. (Pull #1487)
 
 * ... [Short description of non-trivial change.] (Issue #)
 

diff --git a/setup.py b/setup.py
@@ -49,9 +49,9 @@
       license='MIT',
       packages=['urllib3',
                 'urllib3.packages', 'urllib3.packages.ssl_match_hostname',
-                'urllib3.packages.backports', 'urllib3.contrib',
-                'urllib3.contrib._securetransport', 'urllib3.util',
-                ],
+                'urllib3.packages.backports', 'urllib3.packages.rfc3986',
+                'urllib3.contrib', 'urllib3.contrib._securetransport',
+                'urllib3.util'],
       package_dir={'': 'src'},
       requires=[],
       python_requires=">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, <4",

diff --git a/src/urllib3/packages/rfc3986/__init__.py b/src/urllib3/packages/rfc3986/__init__.py
@@ -0,0 +1,52 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) 2014 Rackspace
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+An implementation of semantics and validations described in RFC 3986.
+
+See http://rfc3986.readthedocs.io/ for detailed documentation.
+
+:copyright: (c) 2014 Rackspace
+:license: Apache v2.0, see LICENSE for details
+"""
+
+from .api import is_valid_uri
+from .api import normalize_uri
+from .api import uri_reference
+from .api import URIReference
+from .api import urlparse
+from .parseresult import ParseResult
+
+__title__ = 'rfc3986'
+__author__ = 'Ian Stapleton Cordasco'
+__author_email__ = 'graffatcolmingov@gmail.com'
+__license__ = 'Apache v2.0'
+__copyright__ = 'Copyright 2014 Rackspace'
+__version__ = '1.2.0'  # version string carried by this bundled copy
+
+__all__ = (  # names exported by a star-import of this package
+    'ParseResult',
+    'URIReference',
+    'is_valid_uri',
+    'normalize_uri',
+    'uri_reference',
+    'urlparse',
+    '__title__',  # the metadata dunders above are exported as well
+    '__author__',
+    '__author_email__',
+    '__license__',
+    '__copyright__',
+    '__version__',
+)
diff --git a/src/urllib3/packages/rfc3986/abnf_regexp.py b/src/urllib3/packages/rfc3986/abnf_regexp.py
@@ -0,0 +1,188 @@
+# -*- coding: utf-8 -*-
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Module for the regular expressions crafted from ABNF."""
+
+# gen-delims, see https://tools.ietf.org/html/rfc3986#page-13
+GEN_DELIMS = GENERIC_DELIMITERS = ":/?#[]@"
+GENERIC_DELIMITERS_SET = set(GENERIC_DELIMITERS)
+# sub-delims, see https://tools.ietf.org/html/rfc3986#page-13
+SUB_DELIMS = SUB_DELIMITERS = "!$&'()*+,;="
+SUB_DELIMITERS_SET = set(SUB_DELIMITERS)
+# Character-class fragment for sub-delims; the '*' is escaped for regex use
+SUB_DELIMITERS_RE = r"!$&'()\*+,;="
+RESERVED_CHARS_SET = GENERIC_DELIMITERS_SET.union(SUB_DELIMITERS_SET)
+ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
+DIGIT = '0123456789'
+# unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" (RFC 3986 section 2.3)
+UNRESERVED = UNRESERVED_CHARS = ALPHA + DIGIT + '._~-'
+UNRESERVED_CHARS_SET = set(UNRESERVED_CHARS)
+NON_PCT_ENCODED_SET = RESERVED_CHARS_SET.union(UNRESERVED_CHARS_SET)
+# Same set as a character-class fragment; '-' is escaped to stay literal
+UNRESERVED_RE = r'A-Za-z0-9._~\-'
+
+# Percent-encoded octet ('%' plus two hex digits) and the pchar alternation
+PERCENT_ENCODED = PCT_ENCODED = '%[A-Fa-f0-9]{2}'
+PCHAR = '([' + UNRESERVED_RE + SUB_DELIMITERS_RE + ':@]|%s)' % PCT_ENCODED
+
+# NOTE(sigmavirus24): We're going to use more strict regular expressions
+# than appear in Appendix B for scheme. This will prevent over-eager
+# consuming of items that aren't schemes.
+SCHEME_RE = '[a-zA-Z][a-zA-Z0-9+.-]*'
+_AUTHORITY_RE = '[^/?#]*'  # any run of characters other than '/', '?', '#'
+_PATH_RE = '[^?#]*'  # any run of characters other than '?' and '#'
+_QUERY_RE = '[^#]*'  # any run of characters other than '#'
+_FRAGMENT_RE = '.*'  # any remaining characters that '.' matches
+
+# Extracted from http://tools.ietf.org/html/rfc3986#appendix-B
+COMPONENT_PATTERN_DICT = {
+    'scheme': SCHEME_RE,
+    'authority': _AUTHORITY_RE,
+    'path': _PATH_RE,
+    'query': _QUERY_RE,
+    'fragment': _FRAGMENT_RE,
+}
+
+# See http://tools.ietf.org/html/rfc3986#appendix-B
+# Every component is a named group, so the parts can be read back with
+# SRE_Match#groupdict. The groups that only carry delimiters (":", "//",
+# "?", "#") are non-capturing, so SRE_Match#groups yields just the five
+# components, in order, as well.
+URL_PARSING_RE = (
+    r'(?:(?P<scheme>{scheme}):)?(?://(?P<authority>{authority}))?'
+    r'(?P<path>{path})(?:\?(?P<query>{query}))?'
+    r'(?:#(?P<fragment>{fragment}))?'
+).format(**COMPONENT_PATTERN_DICT)
+
+
+# #########################
+# Authority Matcher Section
+# #########################
+
+# Host patterns, see: http://tools.ietf.org/html/rfc3986#section-3.2.2
+# Regular name (www.google.com, api.github.com); can match the empty string
+REGULAR_NAME_RE = REG_NAME = '((?:{0}|[{1}])*)'.format(
+    '%[0-9A-Fa-f]{2}', SUB_DELIMITERS_RE + UNRESERVED_RE
+)
+# Dotted-quad IPv4 address, e.g. 127.0.0.1; the '.' separators are literal
+IPv4_RE = r'([0-9]{1,3}\.){3}[0-9]{1,3}'
+# One 16-bit piece (h16) of an IPv6 address: one to four hex digits
+HEXDIG_RE = '[0-9A-Fa-f]{1,4}'
+# Least-significant 32 bits of an IPv6 address: two h16 pieces or an IPv4
+LS32_RE = '({hex}:{hex}|{ipv4})'.format(hex=HEXDIG_RE, ipv4=IPv4_RE)
+# Substitutions for the %(hex)s / %(ls32)s placeholders of the IPv6
+# patterns, see http://tools.ietf.org/html/rfc3986#page-20
+_subs = {'hex': HEXDIG_RE, 'ls32': LS32_RE}
+
+# Each pattern is preceded by the ABNF alternative it encodes (h16 = hexdig).
+# ABNF (Augmented Backus-Naur Form): https://tools.ietf.org/html/rfc5234
+variations = [
+    #                            6( h16 ":" ) ls32
+    '(%(hex)s:){6}%(ls32)s' % _subs,
+    #                       "::" 5( h16 ":" ) ls32
+    '::(%(hex)s:){5}%(ls32)s' % _subs,
+    # [               h16 ] "::" 4( h16 ":" ) ls32
+    '(%(hex)s)?::(%(hex)s:){4}%(ls32)s' % _subs,
+    # [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
+    '((%(hex)s:)?%(hex)s)?::(%(hex)s:){3}%(ls32)s' % _subs,
+    # [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
+    '((%(hex)s:){0,2}%(hex)s)?::(%(hex)s:){2}%(ls32)s' % _subs,
+    # [ *3( h16 ":" ) h16 ] "::"    h16 ":"   ls32
+    '((%(hex)s:){0,3}%(hex)s)?::%(hex)s:%(ls32)s' % _subs,
+    # [ *4( h16 ":" ) h16 ] "::"              ls32
+    '((%(hex)s:){0,4}%(hex)s)?::%(ls32)s' % _subs,
+    # [ *5( h16 ":" ) h16 ] "::"              h16
+    '((%(hex)s:){0,5}%(hex)s)?::%(hex)s' % _subs,
+    # [ *6( h16 ":" ) h16 ] "::"
+    '((%(hex)s:){0,6}%(hex)s)?::' % _subs,
+]
+
+IPv6_RE = '(({0})|({1})|({2})|({3})|({4})|({5})|({6})|({7})|({8}))'.format(
+    *variations
+)  # alternation of all nine variations, each wrapped in its own group
+
+IPv_FUTURE_RE = r'v[0-9A-Fa-f]+\.[%s]+' % (  # "v" 1*HEXDIG, literal "."
+    UNRESERVED_RE + SUB_DELIMITERS_RE + ':'  # unreserved / sub-delims / ":"
+)
+
+
+# RFC 6874 zone ID: one or more unreserved or percent-encoded characters
+ZONE_ID = '(?:[' + UNRESERVED_RE + ']|' + PCT_ENCODED + ')+'
+IPv6_ADDRZ_RE = IPv6_RE + '%25' + ZONE_ID  # address, literal "%25", zone ID
+
+IP_LITERAL_RE = r'\[({0}|(?:{1})|{2})\]'.format(  # "[" ... "]" wrapper
+    IPv6_RE,
+    IPv6_ADDRZ_RE,
+    IPv_FUTURE_RE,
+)
+
+# Pattern for matching the host piece of the authority
+HOST_RE = HOST_PATTERN = '({0}|{1}|{2})'.format(
+    REG_NAME,
+    IPv4_RE,
+    IP_LITERAL_RE,
+)
+USERINFO_RE = '^([' + UNRESERVED_RE + SUB_DELIMITERS_RE + ':]|%s)+' % (
+    PCT_ENCODED
+)  # anchored at the start only; requires at least one character
+PORT_RE = '[0-9]{1,5}'  # one to five decimal digits
+
+# ####################
+# Path Matcher Section
+# ####################
+
+# See http://tools.ietf.org/html/rfc3986#section-3.3 for more information
+# about the path patterns defined below.
+segments = {
+    'segment': PCHAR + '*',  # zero or more pchar
+    # Non-zero length segment
+    'segment-nz': PCHAR + '+',
+    # Non-zero length segment without ":" (every ':' is removed from PCHAR)
+    'segment-nz-nc': PCHAR.replace(':', '') + '+'
+}
+
+# Path types taken from Section 3.3 (linked above)
+PATH_EMPTY = '^$'  # matches only the empty string
+PATH_ROOTLESS = '%(segment-nz)s(/%(segment)s)*' % segments  # no leading '/'
+PATH_NOSCHEME = '%(segment-nz-nc)s(/%(segment)s)*' % segments
+PATH_ABSOLUTE = '/(%s)?' % PATH_ROOTLESS  # '/' plus optional rootless path
+PATH_ABEMPTY = '(/%(segment)s)*' % segments  # empty or starts with '/'
+PATH_RE = '^(%s|%s|%s|%s|%s)$' % (
+    PATH_ABEMPTY, PATH_ABSOLUTE, PATH_NOSCHEME, PATH_ROOTLESS, PATH_EMPTY
+)  # fully anchored alternation of the five path types
+
+FRAGMENT_RE = QUERY_RE = (  # query and fragment share a single pattern
+    '^([/?:@' + UNRESERVED_RE + SUB_DELIMITERS_RE + ']|%s)*$' % PCT_ENCODED
+)
+
+# ##########################
+# Relative reference matcher
+# ##########################
+
+# See http://tools.ietf.org/html/rfc3986#section-4.2 for details
+RELATIVE_PART_RE = '(//%s%s|%s|%s|%s)' % (
+    COMPONENT_PATTERN_DICT['authority'],  # follows the literal "//"
+    PATH_ABEMPTY,  # path that directly follows the authority
+    PATH_ABSOLUTE,
+    PATH_NOSCHEME,  # HIER_PART_RE uses PATH_ROOTLESS in this slot
+    PATH_EMPTY,
+)
+
+# See http://tools.ietf.org/html/rfc3986#section-3 for definition
+HIER_PART_RE = '(//%s%s|%s|%s|%s)' % (
+    COMPONENT_PATTERN_DICT['authority'],  # follows the literal "//"
+    PATH_ABEMPTY,  # path that directly follows the authority
+    PATH_ABSOLUTE,
+    PATH_ROOTLESS,  # RELATIVE_PART_RE uses PATH_NOSCHEME in this slot
+    PATH_EMPTY,
+)
diff --git a/src/urllib3/packages/rfc3986/api.py b/src/urllib3/packages/rfc3986/api.py
@@ -0,0 +1,91 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) 2014 Rackspace
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Module containing the simple and functional API for rfc3986.
+
+This module defines functions and provides access to the public attributes
+and classes of rfc3986.
+"""
+
+from .parseresult import ParseResult
+from .uri import URIReference
+
+
+def uri_reference(uri, encoding='utf-8'):
+    """Build a :class:`URIReference` from the given URI string.
+
+    Shorthand for ``URIReference.from_string(uri, encoding)``.
+
+    :param str uri: The URI to parse.
+    :param str encoding: The encoding of the string provided.
+    :returns: The reference produced by ``URIReference.from_string``.
+    :rtype: :class:`URIReference`
+    """
+    reference = URIReference.from_string(uri, encoding)
+    return reference
+
+
+def is_valid_uri(uri, encoding='utf-8', **kwargs):
+    """Check whether the given URI is valid.
+
+    Shorthand for ``URIReference.from_string(uri, encoding).is_valid()``;
+    any extra keyword arguments are forwarded to ``is_valid`` unchanged.
+
+    :param str uri: The URI to validate.
+    :param str encoding: The encoding of the string provided.
+    :param bool require_scheme: ``True`` to insist that the scheme
+        component is present.
+    :param bool require_authority: ``True`` to insist that the authority
+        component is present.
+    :param bool require_path: ``True`` to insist that the path
+        component is present.
+    :param bool require_query: ``True`` to insist that the query
+        component is present.
+    :param bool require_fragment: ``True`` to insist that the fragment
+        component is present.
+    :returns: ``True`` if the URI is valid, ``False`` otherwise.
+    :rtype: bool
+    """
+    reference = URIReference.from_string(uri, encoding)
+    return reference.is_valid(**kwargs)
+
+
+def normalize_uri(uri, encoding='utf-8'):
+    """Return the normalized form of the given URI.
+
+    Shorthand for parsing ``uri`` with ``URIReference.from_string``, calling
+    ``normalize()`` on the result and joining it back with ``unsplit()``.
+
+    :param str uri: The URI to normalize.
+    :param str encoding: The encoding of the string provided.
+    :returns: The normalized URI.
+    :rtype: str
+    """
+    reference = URIReference.from_string(uri, encoding)
+    normalized = reference.normalize()
+    return normalized.unsplit()
+
+
+def urlparse(uri, encoding='utf-8'):
+    """Parse the given URI into a ParseResult (``strict=False``).
+
+    Partially replaces the standard library's urlparse function.
+
+    :param str uri: The URI to parse.
+    :param str encoding: The encoding of the string provided.
+    :returns: :class:`~rfc3986.parseresult.ParseResult` for ``uri``.
+    """
+    parsed = ParseResult.from_string(uri, encoding, strict=False)
+    return parsed