forked from urllib3/urllib3
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Implement RFC 3986 URL parsing (urllib3#1487)
- Loading branch information
1 parent
5163354
commit 0aa3e24
Showing
16 changed files
with
2,462 additions
and
109 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
# -*- coding: utf-8 -*- | ||
# Copyright (c) 2014 Rackspace | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | ||
# implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
""" | ||
An implementation of semantics and validations described in RFC 3986. | ||
See http://rfc3986.readthedocs.io/ for detailed documentation. | ||
:copyright: (c) 2014 Rackspace | ||
:license: Apache v2.0, see LICENSE for details | ||
""" | ||
|
||
from .api import is_valid_uri | ||
from .api import normalize_uri | ||
from .api import uri_reference | ||
from .api import URIReference | ||
from .api import urlparse | ||
from .parseresult import ParseResult | ||
|
||
# Package metadata, exposed as dunder attributes of the package.
__title__ = "rfc3986"
__author__ = "Ian Stapleton Cordasco"
__author_email__ = "graffatcolmingov@gmail.com"
__license__ = "Apache v2.0"
__copyright__ = "Copyright 2014 Rackspace"
__version__ = "1.2.0"

# Public names of the package: the classes and helper functions imported
# above, followed by the metadata attributes.
__all__ = (
    "ParseResult",
    "URIReference",
    "is_valid_uri",
    "normalize_uri",
    "uri_reference",
    "urlparse",
    "__title__",
    "__author__",
    "__author_email__",
    "__license__",
    "__copyright__",
    "__version__",
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,188 @@ | ||
# -*- coding: utf-8 -*- | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | ||
# implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
"""Module for the regular expressions crafted from ABNF.""" | ||
|
||
# Generic delimiters ("gen-delims"), see:
# https://tools.ietf.org/html/rfc3986#page-13
GEN_DELIMS = GENERIC_DELIMITERS = ":/?#[]@"
GENERIC_DELIMITERS_SET = set(GENERIC_DELIMITERS)
# Sub-delimiters ("sub-delims"), see:
# https://tools.ietf.org/html/rfc3986#page-13
SUB_DELIMS = SUB_DELIMITERS = "!$&'()*+,;="
SUB_DELIMITERS_SET = set(SUB_DELIMITERS)
# Escape the '*' for use in regular expressions
SUB_DELIMITERS_RE = r"!$&'()\*+,;="
# Reserved characters are the union of the generic and sub-delimiters.
RESERVED_CHARS_SET = GENERIC_DELIMITERS_SET.union(SUB_DELIMITERS_SET)
ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
DIGIT = '0123456789'
# https://tools.ietf.org/html/rfc3986#section-2.3
# unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
# NOTE: '~' (not '!') is the fourth punctuation character; '!' is a
# sub-delimiter and is already covered by SUB_DELIMITERS. This keeps the
# literal character set in agreement with UNRESERVED_RE below.
UNRESERVED = UNRESERVED_CHARS = ALPHA + DIGIT + '._~-'
UNRESERVED_CHARS_SET = set(UNRESERVED_CHARS)
# Every character that may appear in a URI without being percent-encoded.
NON_PCT_ENCODED_SET = RESERVED_CHARS_SET.union(UNRESERVED_CHARS_SET)
# We need to escape the '-' in this case:
UNRESERVED_RE = r'A-Za-z0-9._~\-'

# Percent encoded character values
PERCENT_ENCODED = PCT_ENCODED = '%[A-Fa-f0-9]{2}'
# pchar = unreserved / pct-encoded / sub-delims / ":" / "@"
# ('%' binds tighter than '+', so only the last literal is interpolated.)
PCHAR = '([' + UNRESERVED_RE + SUB_DELIMITERS_RE + ':@]|%s)' % PCT_ENCODED
|
||
# NOTE(sigmavirus24): The scheme expression is deliberately stricter than
# the one given in Appendix B, so that text which is not a scheme is not
# consumed as one.
SCHEME_RE = '[a-zA-Z][a-zA-Z0-9+.-]*'
_AUTHORITY_RE = '[^/?#]*'
_PATH_RE = '[^?#]*'
_QUERY_RE = '[^#]*'
_FRAGMENT_RE = '.*'

# Component name -> pattern, extracted from
# http://tools.ietf.org/html/rfc3986#appendix-B
COMPONENT_PATTERN_DICT = dict(
    scheme=SCHEME_RE,
    authority=_AUTHORITY_RE,
    path=_PATH_RE,
    query=_QUERY_RE,
    fragment=_FRAGMENT_RE,
)

# See http://tools.ietf.org/html/rfc3986#appendix-B
# Each component of interest is a named group, so both
# SRE_Match#groupdict and SRE_Match#groups can be used on a match; the
# delimiters themselves sit in non-capturing groups.
URL_PARSING_RE = ''.join([
    r'(?:(?P<scheme>{scheme}):)?',
    r'(?://(?P<authority>{authority}))?',
    r'(?P<path>{path})',
    r'(?:\?(?P<query>{query}))?',
    r'(?:#(?P<fragment>{fragment}))?',
]).format(**COMPONENT_PATTERN_DICT)
|
||
|
||
# ######################### | ||
# Authority Matcher Section | ||
# ######################### | ||
|
||
# Host patterns, see: http://tools.ietf.org/html/rfc3986#section-3.2.2
# A regular name (e.g., www.google.com, api.github.com) is any run of
# percent-encoded octets, sub-delimiters and unreserved characters.
REGULAR_NAME_RE = REG_NAME = (
    '((?:' + '%[0-9A-Fa-f]{2}' + '|[' +
    SUB_DELIMITERS_RE + UNRESERVED_RE +
    '])*)'
)
# The pattern for an IPv4 address, e.g., 192.168.255.255, 127.0.0.1.
# The dot between octets is escaped: an unescaped '.' would match any
# character and accept strings such as '127a0b0c1'.
IPv4_RE = r'([0-9]{1,3}\.){3}[0-9]{1,3}'
# Hexadecimal characters used in each piece of an IPv6 address
HEXDIG_RE = '[0-9A-Fa-f]{1,4}'
# Least-significant 32 bits of an IPv6 address: either two hex pieces or
# an embedded IPv4 address.
LS32_RE = '({hex}:{hex}|{ipv4})'.format(hex=HEXDIG_RE, ipv4=IPv4_RE)
# Substitutions into the following patterns for IPv6 patterns defined
# http://tools.ietf.org/html/rfc3986#page-20
_subs = {'hex': HEXDIG_RE, 'ls32': LS32_RE}

# Below: h16 = hexdig, see: https://tools.ietf.org/html/rfc5234 for details
# about ABNF (Augmented Backus-Naur Form) use in the comments.
# Each entry is one alternative of the IPv6address rule, in RFC order.
variations = [
    #                            6( h16 ":" ) ls32
    '(%(hex)s:){6}%(ls32)s' % _subs,
    #                       "::" 5( h16 ":" ) ls32
    '::(%(hex)s:){5}%(ls32)s' % _subs,
    # [               h16 ] "::" 4( h16 ":" ) ls32
    '(%(hex)s)?::(%(hex)s:){4}%(ls32)s' % _subs,
    # [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
    '((%(hex)s:)?%(hex)s)?::(%(hex)s:){3}%(ls32)s' % _subs,
    # [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
    '((%(hex)s:){0,2}%(hex)s)?::(%(hex)s:){2}%(ls32)s' % _subs,
    # [ *3( h16 ":" ) h16 ] "::"    h16 ":"   ls32
    '((%(hex)s:){0,3}%(hex)s)?::%(hex)s:%(ls32)s' % _subs,
    # [ *4( h16 ":" ) h16 ] "::"              ls32
    '((%(hex)s:){0,4}%(hex)s)?::%(ls32)s' % _subs,
    # [ *5( h16 ":" ) h16 ] "::"              h16
    '((%(hex)s:){0,5}%(hex)s)?::%(hex)s' % _subs,
    # [ *6( h16 ":" ) h16 ] "::"
    '((%(hex)s:){0,6}%(hex)s)?::' % _subs,
]

# Alternation of all nine IPv6 address forms listed above.
IPv6_RE = '(({0})|({1})|({2})|({3})|({4})|({5})|({6})|({7})|({8}))'.format(
    *variations
)
|
||
# IPvFuture = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" )
# The separator after the version is a literal dot, so it must be escaped;
# an unescaped '.' would accept any character in that position.
IPv_FUTURE_RE = r'v[0-9A-Fa-f]+\.[%s]+' % (
    UNRESERVED_RE + SUB_DELIMITERS_RE + ':'
)
|
||
|
||
# RFC 6874 Zone ID ABNF: one or more unreserved or percent-encoded
# characters, attached to an IPv6 address with a percent-encoded '%'.
ZONE_ID = '(?:[{0}]|{1})+'.format(UNRESERVED_RE, PCT_ENCODED)
IPv6_ADDRZ_RE = ''.join((IPv6_RE, '%25', ZONE_ID))

# A bracketed literal: IPv6 address, IPv6 address with zone ID, or
# IPvFuture.
IP_LITERAL_RE = (
    r'\[(' + IPv6_RE +
    '|(?:' + IPv6_ADDRZ_RE + ')|' +
    IPv_FUTURE_RE + r')\]'
)

# Pattern for matching the host piece of the authority
HOST_RE = HOST_PATTERN = '(' + '|'.join([
    REG_NAME,
    IPv4_RE,
    IP_LITERAL_RE,
]) + ')'
# Userinfo: unreserved, sub-delimiters, ':' or percent-encoded characters,
# anchored at the start of the string.
USERINFO_RE = '^([{0}{1}:]|{2})+'.format(
    UNRESERVED_RE, SUB_DELIMITERS_RE, PCT_ENCODED
)
# Between one and five decimal digits.
PORT_RE = '[0-9]{1,5}'
|
||
# #################### | ||
# Path Matcher Section | ||
# #################### | ||
|
||
# The path patterns below follow
# http://tools.ietf.org/html/rfc3986#section-3.3
segments = dict([
    # Possibly empty segment
    ('segment', '%s*' % PCHAR),
    # Non-zero length segment
    ('segment-nz', '%s+' % PCHAR),
    # Non-zero length segment without ":" (the colon is dropped from the
    # PCHAR character class)
    ('segment-nz-nc', '%s+' % PCHAR.replace(':', '')),
])

# Path types taken from Section 3.3 (linked above)
PATH_EMPTY = '^$'
PATH_ROOTLESS = '{0}(/{1})*'.format(
    segments['segment-nz'], segments['segment']
)
PATH_NOSCHEME = '{0}(/{1})*'.format(
    segments['segment-nz-nc'], segments['segment']
)
PATH_ABSOLUTE = '/(' + PATH_ROOTLESS + ')?'
PATH_ABEMPTY = '(/' + segments['segment'] + ')*'
# Any of the five path forms, anchored at both ends.
PATH_RE = '^(' + '|'.join((
    PATH_ABEMPTY, PATH_ABSOLUTE, PATH_NOSCHEME, PATH_ROOTLESS, PATH_EMPTY,
)) + ')$'
|
||
# Query and fragment share one pattern: any run of '/', '?', ':', '@',
# unreserved, sub-delimiter or percent-encoded characters.
FRAGMENT_RE = QUERY_RE = '^([/?:@{0}{1}]|{2})*$'.format(
    UNRESERVED_RE, SUB_DELIMITERS_RE, PCT_ENCODED
)
|
||
# ########################## | ||
# Relative reference matcher | ||
# ########################## | ||
|
||
# See http://tools.ietf.org/html/rfc3986#section-4.2 for details
# Either "//" authority path-abempty, or one of the absolute, noscheme or
# empty path forms.
RELATIVE_PART_RE = (
    '(//' + COMPONENT_PATTERN_DICT['authority'] + PATH_ABEMPTY + '|' +
    '|'.join((PATH_ABSOLUTE, PATH_NOSCHEME, PATH_EMPTY)) +
    ')'
)
|
||
# See http://tools.ietf.org/html/rfc3986#section-3 for definition
# Either "//" authority path-abempty, or one of the absolute, rootless or
# empty path forms.
HIER_PART_RE = (
    '(//' + COMPONENT_PATTERN_DICT['authority'] + PATH_ABEMPTY + '|' +
    '|'.join((PATH_ABSOLUTE, PATH_ROOTLESS, PATH_EMPTY)) +
    ')'
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,91 @@ | ||
# -*- coding: utf-8 -*- | ||
# Copyright (c) 2014 Rackspace | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | ||
# implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
""" | ||
Module containing the simple and functional API for rfc3986. | ||
This module defines functions and provides access to the public attributes | ||
and classes of rfc3986. | ||
""" | ||
|
||
from .parseresult import ParseResult | ||
from .uri import URIReference | ||
|
||
|
||
def uri_reference(uri, encoding='utf-8'):
    """Parse a URI string into a URIReference.

    Convenience wrapper; it is equivalent to calling
    ``URIReference.from_string(uri)`` directly.

    :param str uri: The URI which needs to be parsed into a reference.
    :param str encoding: The encoding of the string provided
    :returns: A parsed URI
    :rtype: :class:`URIReference`
    """
    reference = URIReference.from_string(uri, encoding)
    return reference
|
||
|
||
def is_valid_uri(uri, encoding='utf-8', **kwargs):
    """Determine if the URI given is valid.

    Convenience wrapper; ``uri_reference(uri).is_valid()`` and
    ``URIReference.from_string(uri).is_valid()`` achieve the same result.
    All extra keyword arguments are forwarded to ``is_valid``.

    :param str uri: The URI to be validated.
    :param str encoding: The encoding of the string provided
    :param bool require_scheme: Set to ``True`` if you wish to require the
        presence of the scheme component.
    :param bool require_authority: Set to ``True`` if you wish to require
        the presence of the authority component.
    :param bool require_path: Set to ``True`` if you wish to require the
        presence of the path component.
    :param bool require_query: Set to ``True`` if you wish to require the
        presence of the query component.
    :param bool require_fragment: Set to ``True`` if you wish to require
        the presence of the fragment component.
    :returns: ``True`` if the URI is valid, ``False`` otherwise.
    :rtype: bool
    """
    reference = URIReference.from_string(uri, encoding)
    return reference.is_valid(**kwargs)
|
||
|
||
def normalize_uri(uri, encoding='utf-8'):
    """Normalize the given URI.

    Convenience wrapper; ``uri_reference(uri).normalize().unsplit()`` and
    ``URIReference.from_string(uri).normalize().unsplit()`` do the same.

    :param str uri: The URI to be normalized.
    :param str encoding: The encoding of the string provided
    :returns: The normalized URI.
    :rtype: str
    """
    # Parse, normalize, then reassemble into a single string.
    return URIReference.from_string(uri, encoding).normalize().unsplit()
|
||
|
||
def urlparse(uri, encoding='utf-8'):
    """Parse a given URI and return a ParseResult.

    This is a partial replacement of the standard library's urlparse
    function.

    :param str uri: The URI to be parsed.
    :param str encoding: The encoding of the string provided.
    :returns: A parsed URI
    :rtype: :class:`~rfc3986.parseresult.ParseResult`
    """
    # Parsing is always requested with ``strict=False``.
    parsed = ParseResult.from_string(uri, encoding, strict=False)
    return parsed
Oops, something went wrong.