diff --git a/.gitignore b/.gitignore index dd9e006f35..de61154e3e 100644 --- a/.gitignore +++ b/.gitignore @@ -23,6 +23,12 @@ env/ .workon +# in case you work with IntelliJ/PyCharm +.idea +*.iml +.python-version + + t.py t2.py diff --git a/HISTORY.md b/HISTORY.md index 0331d187f7..9b08a7f2d6 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -6,6 +6,22 @@ dev - \[Short description of non-trivial change.\] +**Dependencies** + +- Instead of `chardet`, use the MIT-licensed `charset_normalizer` for Python3 + to remove license ambiguity for projects bundling requests. If `chardet` + is already installed on your machine it will be used instead of `charset_normalizer` + to keep backwards compatibility. + + You can also install `chardet` while installing requests by + specifying `[use_chardet_on_py3]` extra as follows: + + ```shell + pip install "requests[use_chardet_on_py3]" + ``` + + Python2 still depends upon the `chardet` module. + 2.25.1 (2020-12-16) ------------------- @@ -1707,4 +1723,3 @@ This is not a backwards compatible change. - Frustration - Conception - diff --git a/docs/user/advanced.rst b/docs/user/advanced.rst index aa4b1ddb6a..34d400d513 100644 --- a/docs/user/advanced.rst +++ b/docs/user/advanced.rst @@ -697,10 +697,22 @@ Encodings When you receive a response, Requests makes a guess at the encoding to use for decoding the response when you access the :attr:`Response.text ` attribute. Requests will first check for an -encoding in the HTTP header, and if none is present, will use `chardet -`_ to attempt to guess the encoding. - -The only time Requests will not do this is if no explicit charset +encoding in the HTTP header, and if none is present, will use +`charset_normalizer `_ +or `chardet `_ to attempt to +guess the encoding. + +If ``chardet`` is installed, ``requests`` uses it, however for python3 +``chardet`` is no longer a mandatory dependency. 
The ``chardet`` +library is an LGPL-licensed dependency and some users of requests +cannot depend on mandatory LGPL-licensed dependencies. + +When you install ``requests`` without specifying the ``[use_chardet_on_py3]`` extra, +and ``chardet`` is not already installed, ``requests`` uses ``charset-normalizer`` +(MIT-licensed) to guess the encoding. For Python 2, ``requests`` uses only +``chardet``, which is a mandatory dependency there. + +The only time Requests will not guess the encoding is if no explicit charset is present in the HTTP headers **and** the ``Content-Type`` header contains ``text``. In this situation, `RFC 2616 `_ specifies diff --git a/requests/__init__.py b/requests/__init__.py index f8f94295f9..0ac7713b81 100644 --- a/requests/__init__.py +++ b/requests/__init__.py @@ -41,12 +41,20 @@ """ import urllib3 -import chardet import warnings from .exceptions import RequestsDependencyWarning +try: + from charset_normalizer import __version__ as charset_normalizer_version +except ImportError: + charset_normalizer_version = None -def check_compatibility(urllib3_version, chardet_version): +try: + from chardet import __version__ as chardet_version +except ImportError: + chardet_version = None + +def check_compatibility(urllib3_version, chardet_version, charset_normalizer_version): urllib3_version = urllib3_version.split('.') assert urllib3_version != ['dev'] # Verify urllib3 isn't installed from git. @@ -62,12 +70,19 @@ def check_compatibility(urllib3_version, chardet_version): assert minor >= 21 assert minor <= 26 - # Check chardet for compatibility. - major, minor, patch = chardet_version.split('.')[:3] - major, minor, patch = int(major), int(minor), int(patch) - # chardet >= 3.0.2, < 5.0.0 - assert (3, 0, 2) <= (major, minor, patch) < (5, 0, 0) - + # Check chardet or charset_normalizer for compatibility.
+ if chardet_version: + major, minor, patch = chardet_version.split('.')[:3] + major, minor, patch = int(major), int(minor), int(patch) + # chardet_version >= 3.0.2, < 5.0.0 + assert (3, 0, 2) <= (major, minor, patch) < (5, 0, 0) + elif charset_normalizer_version: + major, minor, patch = charset_normalizer_version.split('.')[:3] + major, minor, patch = int(major), int(minor), int(patch) + # charset_normalizer >= 2.0.0 < 3.0.0 + assert (2, 0, 0) <= (major, minor, patch) < (3, 0, 0) + else: + raise Exception("You need either charset_normalizer or chardet installed") def _check_cryptography(cryptography_version): # cryptography < 1.3.4 @@ -82,10 +97,10 @@ def _check_cryptography(cryptography_version): # Check imported dependencies for compatibility. try: - check_compatibility(urllib3.__version__, chardet.__version__) + check_compatibility(urllib3.__version__, chardet_version, charset_normalizer_version) except (AssertionError, ValueError): - warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported " - "version!".format(urllib3.__version__, chardet.__version__), + warnings.warn("urllib3 ({}) or chardet ({})/charset_normalizer ({}) doesn't match a supported " + "version!".format(urllib3.__version__, chardet_version, charset_normalizer_version), RequestsDependencyWarning) # Attempt to enable urllib3's fallback for SNI support diff --git a/requests/compat.py b/requests/compat.py index 5de0769f50..0b14f5015c 100644 --- a/requests/compat.py +++ b/requests/compat.py @@ -8,7 +8,10 @@ Python 3. """ -import chardet +try: + import chardet +except ImportError: + import charset_normalizer as chardet import sys diff --git a/requests/help.py b/requests/help.py index e53d35ef6d..4cd6389f55 100644 --- a/requests/help.py +++ b/requests/help.py @@ -8,10 +8,19 @@ import idna import urllib3 -import chardet from . 
import __version__ as requests_version +try: + import charset_normalizer +except ImportError: + charset_normalizer = None + +try: + import chardet +except ImportError: + chardet = None + try: from urllib3.contrib import pyopenssl except ImportError: @@ -71,7 +80,12 @@ def info(): implementation_info = _implementation() urllib3_info = {'version': urllib3.__version__} - chardet_info = {'version': chardet.__version__} + charset_normalizer_info = {'version': None} + chardet_info = {'version': None} + if charset_normalizer: + charset_normalizer_info = {'version': charset_normalizer.__version__} + if chardet: + chardet_info = {'version': chardet.__version__} pyopenssl_info = { 'version': None, @@ -99,9 +113,11 @@ def info(): 'implementation': implementation_info, 'system_ssl': system_ssl_info, 'using_pyopenssl': pyopenssl is not None, + 'using_charset_normalizer': chardet is None, 'pyOpenSSL': pyopenssl_info, 'urllib3': urllib3_info, 'chardet': chardet_info, + 'charset_normalizer': charset_normalizer_info, 'cryptography': cryptography_info, 'idna': idna_info, 'requests': { diff --git a/requests/models.py b/requests/models.py index 93b901b4e3..aa6fb86e4e 100644 --- a/requests/models.py +++ b/requests/models.py @@ -731,7 +731,7 @@ def next(self): @property def apparent_encoding(self): - """The apparent encoding, provided by the chardet library.""" + """The apparent encoding, provided by the charset_normalizer or chardet libraries.""" return chardet.detect(self.content)['encoding'] def iter_content(self, chunk_size=1, decode_unicode=False): @@ -845,7 +845,7 @@ def text(self): """Content of the response, in unicode. If Response.encoding is None, encoding will be guessed using - ``chardet``. + ``charset_normalizer`` or ``chardet``. The encoding of the response content is determined based solely on HTTP headers, following RFC 2616 to the letter. 
If you can take advantage of @@ -893,7 +893,7 @@ def json(self, **kwargs): if not self.encoding and self.content and len(self.content) > 3: # No encoding set. JSON RFC 4627 section 3 states we should expect # UTF-8, -16 or -32. Detect which one to use; If the detection or - # decoding fails, fall back to `self.text` (using chardet to make + # decoding fails, fall back to `self.text` (using charset_normalizer to make # a best guess). encoding = guess_json_utf(self.content) if encoding is not None: diff --git a/requests/packages.py b/requests/packages.py index 7232fe0ff7..00196bff25 100644 --- a/requests/packages.py +++ b/requests/packages.py @@ -1,9 +1,17 @@ import sys +try: + import chardet +except ImportError: + import charset_normalizer as chardet + import warnings + + warnings.filterwarnings('ignore', 'Trying to detect', module='charset_normalizer') + # This code exists for backwards compatibility reasons. # I don't like it either. Just look the other way. :) -for package in ('urllib3', 'idna', 'chardet'): +for package in ('urllib3', 'idna'): locals()[package] = __import__(package) # This traversal is apparently necessary such that the identities are # preserved (requests.packages.urllib3.* is urllib3.*) @@ -11,4 +19,8 @@ if mod == package or mod.startswith(package + '.'): sys.modules['requests.packages.' + mod] = sys.modules[mod] +target = chardet.__name__ +for mod in list(sys.modules): + if mod == target or mod.startswith(target + '.'): + sys.modules['requests.packages.' + mod.replace(target, 'chardet', 1)] = sys.modules[mod] # key is derived from mod so each submodule gets its own requests.packages.chardet.* alias # Kinda cool, though, right?
diff --git a/setup.py b/setup.py index 552c66de69..27f3948c66 100755 --- a/setup.py +++ b/setup.py @@ -41,7 +41,8 @@ def run_tests(self): packages = ['requests'] requires = [ - 'chardet>=3.0.2,<5', + 'charset_normalizer~=2.0.0; python_version >= "3"', + 'chardet>=3.0.2,<5; python_version < "3"', 'idna>=2.5,<3', 'urllib3>=1.21.1,<1.27', 'certifi>=2017.4.17' @@ -103,6 +104,7 @@ def run_tests(self): 'security': ['pyOpenSSL >= 0.14', 'cryptography>=1.3.4'], 'socks': ['PySocks>=1.5.6, !=1.5.7'], 'socks:sys_platform == "win32" and python_version == "2.7"': ['win_inet_pton'], + 'use_chardet_on_py3': ['chardet>=3.0.2,<5'] }, project_urls={ 'Documentation': 'https://requests.readthedocs.io', diff --git a/tox.ini b/tox.ini index 80454e0117..c8a63ee476 100644 --- a/tox.ini +++ b/tox.ini @@ -1,7 +1,18 @@ [tox] -envlist = py27,py35,py36,py37,py38 +envlist = py{27,35,36,37,38}-{default,use_chardet_on_py3} [testenv] - +deps = -rrequirements-dev.txt +extras = + security + socks commands = - python setup.py test + pytest tests + +[testenv:default] + +[testenv:use_chardet_on_py3] +extras = + security + socks + use_chardet_on_py3