From 8de984f7696caac0fb7fcf80e58e305e017d1833 Mon Sep 17 00:00:00 2001 From: Tom Christie Date: Tue, 29 Mar 2022 11:11:39 +0100 Subject: [PATCH 01/12] Drop .apparent_encoding, in favour of .default_encoding --- httpx/_client.py | 4 ++++ httpx/_models.py | 31 +++++++++-------------------- tests/client/test_client.py | 2 +- tests/models/test_responses.py | 36 ++++++++++++++++++++-------------- 4 files changed, 35 insertions(+), 38 deletions(-) diff --git a/httpx/_client.py b/httpx/_client.py index c57cfb6ea9..91f80608dc 100644 --- a/httpx/_client.py +++ b/httpx/_client.py @@ -167,6 +167,7 @@ def __init__( event_hooks: typing.Mapping[str, typing.List[typing.Callable]] = None, base_url: URLTypes = "", trust_env: bool = True, + default_encoding: str = "utf-8", ): event_hooks = {} if event_hooks is None else event_hooks @@ -184,6 +185,7 @@ def __init__( "response": list(event_hooks.get("response", [])), } self._trust_env = trust_env + self._default_encoding = default_encoding self._netrc = NetRCInfo() self._state = ClientState.UNOPENED @@ -998,6 +1000,7 @@ def _send_single_request(self, request: Request) -> Response: response.stream = BoundSyncStream( response.stream, response=response, timer=timer ) + response.default_encoding = self._default_encoding self.cookies.extract_cookies(response) status = f"{response.status_code} {response.reason_phrase}" @@ -1702,6 +1705,7 @@ async def _send_single_request(self, request: Request) -> Response: response.stream = BoundAsyncStream( response.stream, response=response, timer=timer ) + response.default_encoding = self._default_encoding self.cookies.extract_cookies(response) status = f"{response.status_code} {response.reason_phrase}" diff --git a/httpx/_models.py b/httpx/_models.py index 5a213c3564..9fa37a94ad 100644 --- a/httpx/_models.py +++ b/httpx/_models.py @@ -7,8 +7,6 @@ from collections.abc import MutableMapping from http.cookiejar import Cookie, CookieJar -import charset_normalizer - from ._content import ByteStream, UnattachedStream, encode_request, encode_response from ._decoders import ( SUPPORTED_DECODERS, @@ -441,6 +439,7 @@ def __init__( request: Request = None, extensions: dict = None, history: typing.List["Response"] = None, + default_encoding: str = "utf-8", ): self.status_code = status_code self.headers = Headers(headers) @@ -454,6 +453,8 @@ def __init__( self.extensions = {} if extensions is None else extensions self.history = [] if history is None else list(history) + self.default_encoding = default_encoding + self.is_closed = False self.is_stream_consumed = False @@ -553,25 +554,24 @@ def text(self) -> str: if not content: self._text = "" else: - decoder = TextDecoder(encoding=self.encoding or "utf-8") + decoder = TextDecoder(encoding=self.encoding) self._text = "".join([decoder.decode(self.content), decoder.flush()]) return self._text @property - def encoding(self) -> typing.Optional[str]: + def encoding(self) -> str: """ Return an encoding to use for decoding the byte content into text. The priority for determining this is given by... * `.encoding = <>` has been set explicitly. * The encoding as specified by the charset parameter in the Content-Type header. - * The encoding as determined by `charset_normalizer`. - * UTF-8. + * The encoding as determined by `default_encoding`. """ if not hasattr(self, "_encoding"): encoding = self.charset_encoding if encoding is None or not is_known_encoding(encoding): - encoding = self.apparent_encoding + encoding = self.default_encoding self._encoding = encoding return self._encoding @@ -594,19 +594,6 @@ def charset_encoding(self) -> typing.Optional[str]: return params["charset"].strip("'\"") - @property - def apparent_encoding(self) -> typing.Optional[str]: - """ - Return the encoding, as determined by `charset_normalizer`. - """ - content = getattr(self, "_content", b"") - if len(content) < 32: - # charset_normalizer will issue warnings if we run it with - # fewer bytes than this cutoff. - return None - match = charset_normalizer.from_bytes(self.content).best() - return None if match is None else match.encoding - def _get_content_decoder(self) -> ContentDecoder: """ Returns a decoder instance which can be used to decode the raw byte @@ -825,7 +812,7 @@ def iter_text(self, chunk_size: int = None) -> typing.Iterator[str]: that handles both gzip, deflate, etc but also detects the content's string encoding. """ - decoder = TextDecoder(encoding=self.encoding or "utf-8") + decoder = TextDecoder(encoding=self.encoding) chunker = TextChunker(chunk_size=chunk_size) with request_context(request=self._request): for byte_content in self.iter_bytes(): @@ -923,7 +910,7 @@ async def aiter_text(self, chunk_size: int = None) -> typing.AsyncIterator[str]: that handles both gzip, deflate, etc but also detects the content's string encoding. """ - decoder = TextDecoder(encoding=self.encoding or "utf-8") + decoder = TextDecoder(encoding=self.encoding) chunker = TextChunker(chunk_size=chunk_size) with request_context(request=self._request): async for byte_content in self.aiter_bytes(): diff --git a/tests/client/test_client.py b/tests/client/test_client.py index 783d6d41f0..5283ee466b 100644 --- a/tests/client/test_client.py +++ b/tests/client/test_client.py @@ -15,7 +15,7 @@ def test_get(server): assert response.content == b"Hello, world!" assert response.text == "Hello, world!" assert response.http_version == "HTTP/1.1" - assert response.encoding is None + assert response.encoding == "utf-8" assert response.request.url == url assert response.headers assert response.is_redirect is False diff --git a/tests/models/test_responses.py b/tests/models/test_responses.py index 8a7e7e1aa8..6682817936 100644 --- a/tests/models/test_responses.py +++ b/tests/models/test_responses.py @@ -164,9 +164,9 @@ def test_response_content_type_encoding(): assert response.encoding == "latin-1" -def test_response_autodetect_encoding(): +def test_response_default_encoding(): """ - Autodetect encoding if there is no Content-Type header. + Use default encoding if there is no Content-Type header. """ content = "おはようございます。".encode("utf-8") response = httpx.Response( @@ -174,12 +174,12 @@ def test_response_autodetect_encoding(): content=content, ) assert response.text == "おはようございます。" - assert response.encoding is None + assert response.encoding == "utf-8" def test_response_fallback_to_autodetect(): """ - Fallback to autodetection if we get an invalid charset in the Content-Type header. + Fallback to default encoding if we get an invalid charset in the Content-Type header. """ headers = {"Content-Type": "text-plain; charset=invalid-codec-name"} content = "おはようございます。".encode("utf-8") @@ -189,7 +189,7 @@ def test_response_fallback_to_autodetect(): headers=headers, ) assert response.text == "おはようございます。" - assert response.encoding is None + assert response.encoding == "utf-8" def test_response_no_charset_with_ascii_content(): @@ -205,7 +205,7 @@ def test_response_no_charset_with_ascii_content(): headers=headers, ) assert response.status_code == 200 - assert response.encoding is None + assert response.encoding == "utf-8" assert response.text == "Hello, world!" @@ -222,13 +222,15 @@ def test_response_no_charset_with_utf8_content(): headers=headers, ) assert response.text == "Unicode Snowman: ☃" - assert response.encoding is None + assert response.encoding == "utf-8" def test_response_no_charset_with_iso_8859_1_content(): """ A response with ISO 8859-1 encoded content should decode correctly, even with no charset specified. + + TODO: nope """ content = "Accented: Österreich abcdefghijklmnopqrstuzwxyz".encode("iso-8859-1") headers = {"Content-Type": "text/plain"} @@ -239,13 +241,16 @@ def test_response_no_charset_with_iso_8859_1_content(): ) assert response.text == "Accented: Österreich abcdefghijklmnopqrstuzwxyz" assert response.charset_encoding is None - assert response.apparent_encoding is not None + assert response.default_encoding == "utf-8" + assert response.encoding == "utf-8" def test_response_no_charset_with_cp_1252_content(): """ A response with Windows 1252 encoded content should decode correctly, even with no charset specified. + + TODO: nope """ content = "Euro Currency: € abcdefghijklmnopqrstuzwxyz".encode("cp1252") headers = {"Content-Type": "text/plain"} @@ -256,12 +261,13 @@ def test_response_no_charset_with_cp_1252_content(): ) assert response.text == "Euro Currency: € abcdefghijklmnopqrstuzwxyz" assert response.charset_encoding is None - assert response.apparent_encoding is not None + assert response.default_encoding == "utf-8" + assert response.encoding == "utf-8" def test_response_non_text_encoding(): """ - Default to apparent encoding for non-text content-type headers. + Apply default encoding for non-text content-type headers. """ headers = {"Content-Type": "image/png"} response = httpx.Response( @@ -270,7 +276,7 @@ def test_response_non_text_encoding(): headers=headers, ) assert response.text == "xyz" - assert response.encoding is None + assert response.encoding == "utf-8" def test_response_set_explicit_encoding(): @@ -307,7 +313,7 @@ def test_read(): assert response.status_code == 200 assert response.text == "Hello, world!" - assert response.encoding is None + assert response.encoding == "utf-8" assert response.is_closed content = response.read() @@ -322,7 +328,7 @@ def test_empty_read(): assert response.status_code == 200 assert response.text == "" - assert response.encoding is None + assert response.encoding == "utf-8" assert response.is_closed content = response.read() @@ -341,7 +347,7 @@ async def test_aread(): assert response.status_code == 200 assert response.text == "Hello, world!" - assert response.encoding is None + assert response.encoding == "utf-8" assert response.is_closed content = await response.aread() @@ -357,7 +363,7 @@ async def test_empty_aread(): assert response.status_code == 200 assert response.text == "" - assert response.encoding is None + assert response.encoding == "utf-8" assert response.is_closed content = await response.aread() From 72f6b0a8328e013fde7ad1f66704cc6022ea312b Mon Sep 17 00:00:00 2001 From: Tom Christie Date: Wed, 30 Mar 2022 12:44:25 +0100 Subject: [PATCH 02/12] Add support for httpx.Client(default_encoding='chardet') and httpx.Client(default_encoding='charset_normalizer') --- httpx/__init__.py | 2 + httpx/_codecs.py | 149 ++++++++++++++++++++++++++++++++++++++++++++++ httpx/_models.py | 5 +- 3 files changed, 154 insertions(+), 2 deletions(-) create mode 100644 httpx/_codecs.py diff --git a/httpx/__init__.py b/httpx/__init__.py index b93ca92229..8475c81f90 100644 --- a/httpx/__init__.py +++ b/httpx/__init__.py @@ -2,6 +2,7 @@ from ._api import delete, get, head, options, patch, post, put, request, stream from ._auth import Auth, BasicAuth, DigestAuth from ._client import USE_CLIENT_DEFAULT, AsyncClient, Client +from ._codecs import charset_autodetect from ._config import Limits, Proxy, Timeout, create_ssl_context from ._content import ByteStream from ._exceptions import ( @@ -72,6 +73,7 @@ def main() -> None: # type: ignore "BaseTransport", "BasicAuth", "ByteStream", + "charset_autodetect", "Client", "CloseError", "codes", diff --git a/httpx/_codecs.py b/httpx/_codecs.py new file mode 100644 index 0000000000..7ab1ecd515 --- /dev/null +++ b/httpx/_codecs.py @@ -0,0 +1,149 @@ +""" +The `httpx` package includes two optionally installable codecs, +which provide support for character-set autodetection. + +This can be useful for cases where you need the textual content of responses, +rather than the raw bytewise content, if the Content-Type does not include +a `charset` value, and the character set of the responses is unknown. + +There are two commonly used packages for this in the Python ecosystem. + +* chardet: https://chardet.readthedocs.io/ +* charset_normalizer: https://charset-normalizer.readthedocs.io/ + +--- + +## Using the default encoding. + +To understand this better let's start by looking at the default behaviour +without character-set auto-detection... + +```python +import httpx + +# Instantiate a client with the default configuration. +client = httpx.Client() + +# Using the client... +response = client.get(...) +print(response.encoding) # This will either print the charset given in + # the Content-Type charset, or else "utf-8". +print(response.text) # The text will either be decoded with the Content-Type + # charset, or using "utf-8". +``` + +This is normally absolutely fine. Most servers will respond with a properly +formatted Content-Type header, including a charset encoding. And in most cases +where no charset encoding is included, UTF-8 is very likely to be used, +since it is now so widely adopted. + +## Using an explict encoding. + +In some cases we might be making requests to a site, where no character +set information is being set explicitly by the server, but we know what +the encoding is. In this case it's best to set the default encoding +explicitly on the client. + +```python +import httpx + +# Instantiate a client with a Japanese character set as the default encoding. +client = httpx.Client(default_encoding="shift-jis") + +# Using the client... +response = client.get(...) +print(response.encoding) # This will either print the charset given in + # the Content-Type charset, or else "shift-jis". +print(response.text) # The text will either be decoded with the Content-Type + # charset, or using "shift-jis". +``` + +## Using character set auto-detection. + +In cases where the server is not reliably including character set information, +and where we don't know what encoding is being used, we can enable auto-detection +to make a best-guess attempt when decoding from bytes to text. + +```python +import codecs +import httpx + + +# Register the custom charset autodetect codecs. +# These codecs are then available as "chardet" and "charset_normalizer". +codecs.register(httpx.charset_autodetect) + +# Instantiate a client using "chardet" character set autodetection. +# When no explicit charset information is present on the response, +# the chardet package will be used to make a best-guess attempt. +client = httpx.Client(default_encoding="chardet") + +# Using the client with character-set autodetection enabled. +response = client.get(...) +print(response.encoding) # This will either print the charset given in + # the Content-Type charset, or else "chardet". +print(response.text) # The text will either be decoded with the Content-Type + # charset, or using "chardet" autodetection. +``` +""" +import codecs +import typing + + +class ChardetCodec(codecs.Codec): + def encode(input, errors="strict"): # type: ignore + raise RuntimeError("The 'chardet' codec does not support encoding.") + + def decode(input, errors="strict"): # type: ignore + import chardet + + content: bytes = bytes(input) + info: dict = chardet.detect(content) + encoding: str = info.get("encoding") or "utf-8" + return content.decode(encoding, errors=errors), len(content) + + +class CharsetNormalizerCodec(codecs.Codec): + def encode(input, errors="strict"): # type: ignore + raise RuntimeError("The 'charset_normalizer' codec does not support encoding.") + + def decode(input, errors="strict"): # type: ignore + import charset_normalizer + + content: bytes = bytes(input) + info: dict = charset_normalizer.detect(content) + encoding: str = info.get("encoding") or "utf-8" + return content.decode(encoding, errors=errors), len(content) + + +class NullIncrementalEncoder(codecs.IncrementalEncoder): + def encode(input, final=False): # type: ignore + raise RuntimeError("This codec does not support encoding.") + + +def charset_autodetect(encoding_name: str) -> typing.Optional[codecs.CodecInfo]: + if encoding_name == "chardet": + return codecs.CodecInfo( + name="chardet", + encode=ChardetCodec().encode, # type: ignore + decode=ChardetCodec().decode, # type: ignore + incrementalencoder=NullIncrementalEncoder, + # Note that for iter_text/aiter_text we *always* just fallback + # to using utf-8. Attempting character set autodetection in the + # incremental case can cause large amounts of buffering. + incrementaldecoder=codecs.getincrementaldecoder("utf-8"), + ) + + elif encoding_name == "charset_normalizer": + return codecs.CodecInfo( + name="charset_normalizer", + encode=CharsetNormalizerCodec().encode, # type: ignore + decode=CharsetNormalizerCodec().decode, # type: ignore + incrementalencoder=NullIncrementalEncoder, + # Note that for iter_text/aiter_text we *always* just fallback + # to using utf-8. Attempting character set autodetection in the + # incremental case can cause large amounts of buffering. + incrementaldecoder=codecs.getincrementaldecoder("utf-8"), + ) + + return None diff --git a/httpx/_models.py b/httpx/_models.py index 9fa37a94ad..722282e00b 100644 --- a/httpx/_models.py +++ b/httpx/_models.py @@ -1,4 +1,5 @@ import cgi +import codecs import datetime import email.message import json as jsonlib @@ -554,8 +555,8 @@ def text(self) -> str: if not content: self._text = "" else: - decoder = TextDecoder(encoding=self.encoding) - self._text = "".join([decoder.decode(self.content), decoder.flush()]) + codec = codecs.lookup(self.encoding) + self._text = codec.decode(self.content, errors="replace") return self._text @property From 0beeced2aa1683a7c5ec29359e171801c7423ddc Mon Sep 17 00:00:00 2001 From: Tom Christie Date: Wed, 30 Mar 2022 13:00:02 +0100 Subject: [PATCH 03/12] Docs for characterset autodetection --- README.md | 2 +- docs/advanced.md | 79 ++++++++++++++++++++++++++++++++++++++++++++++ docs/index.md | 2 +- docs/quickstart.md | 2 +- httpx/_codecs.py | 2 +- requirements.txt | 2 ++ setup.py | 1 - 7 files changed, 85 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 23866adbbd..649bd4ec0b 100644 --- a/README.md +++ b/README.md @@ -128,7 +128,6 @@ The HTTPX project relies on these excellent libraries: * `httpcore` - The underlying transport implementation for `httpx`. * `h11` - HTTP/1.1 support. * `certifi` - SSL certificates. -* `charset_normalizer` - Charset auto-detection. * `rfc3986` - URL parsing & normalization. * `idna` - Internationalized domain name support. * `sniffio` - Async library autodetection. @@ -140,6 +139,7 @@ As well as these optional installs: * `rich` - Rich terminal support. *(Optional, with `httpx[cli]`)* * `click` - Command line client support. *(Optional, with `httpx[cli]`)* * `brotli` or `brotlicffi` - Decoding for "brotli" compressed responses. *(Optional, with `httpx[brotli]`)* +* `chardet` or `charset_normalizer` - Optional charset auto-detection. * `async_generator` - Backport support for `contextlib.asynccontextmanager`. *(Only required for Python 3.6)* A huge amount of credit is due to `requests` for the API layout that diff --git a/docs/advanced.md b/docs/advanced.md index 9047814857..54c9cf180b 100644 --- a/docs/advanced.md +++ b/docs/advanced.md @@ -145,6 +145,85 @@ URL('http://httpbin.org/headers') For a list of all available client parameters, see the [`Client`](api.md#client) API reference. +--- + +## Character set encodings and auto-detection + +The `httpx` package includes two optionally installable codecs, which provide support for character-set autodetection. + +This can be useful for cases where you need the textual content of responses, rather than the raw bytewise content, if the Content-Type does not include a `charset` value, and the character set of the responses is unknown. + +There are two commonly used packages for this in the Python ecosystem. + +* [chardet](https://chardet.readthedocs.io/) +* [charset_normalizer](https://charset-normalizer.readthedocs.io/) + +### Using the default encoding + +To understand this better let's start by looking at the default behaviour without character-set auto-detection... + +```python +import httpx + +# Instantiate a client with the default configuration. +client = httpx.Client() + +# Using the client... +response = client.get(...) +print(response.encoding) # This will either print the charset given in + # the Content-Type charset, or else "utf-8". +print(response.text) # The text will either be decoded with the Content-Type + # charset, or using "utf-8". +``` + +This is normally absolutely fine. Most servers will respond with a properly formatted Content-Type header, including a charset encoding. And in most cases where no charset encoding is included, UTF-8 is very likely to be used, since it is now so widely adopted. + +### Using an explicit encoding. + +In some cases we might be making requests to a site, where no character set information is being set explicitly by the server, but we know what the encoding is. In this case it's best to set the default encoding explicitly on the client. + +```python +import httpx + +# Instantiate a client with a Japanese character set as the default encoding. +client = httpx.Client(default_encoding="shift-jis") + +# Using the client... +response = client.get(...) +print(response.encoding) # This will either print the charset given in + # the Content-Type charset, or else "shift-jis". +print(response.text) # The text will either be decoded with the Content-Type + # charset, or using "shift-jis". +``` + +### Using character set auto-detection + +In cases where the server is not reliably including character set information, and where we don't know what encoding is being used, we can enable auto-detection to make a best-guess attempt when decoding from bytes to text. + +```python +import codecs +import httpx + + +# Register the custom charset autodetect codecs. +# These codecs are then available as "chardet" and "charset_normalizer". +codecs.register(httpx.charset_autodetect) + +# Instantiate a client using "chardet" character set autodetection. +# When no explicit charset information is present on the response, +# the chardet package will be used to make a best-guess attempt. +client = httpx.Client(default_encoding="chardet") + +# Using the client with character-set autodetection enabled. +response = client.get(...) +print(response.encoding) # This will either print the charset given in + # the Content-Type charset, or else "chardet". +print(response.text) # The text will either be decoded with the Content-Type + # charset, or using "chardet" autodetection. +``` + +--- + ## Calling into Python Web Apps You can configure an `httpx` client to call directly into a Python web application using the WSGI protocol. diff --git a/docs/index.md b/docs/index.md index 2b3865bb8e..1dee47d6a6 100644 --- a/docs/index.md +++ b/docs/index.md @@ -109,7 +109,6 @@ The HTTPX project relies on these excellent libraries: * `httpcore` - The underlying transport implementation for `httpx`. * `h11` - HTTP/1.1 support. * `certifi` - SSL certificates. -* `charset_normalizer` - Charset auto-detection. * `rfc3986` - URL parsing & normalization. * `idna` - Internationalized domain name support. * `sniffio` - Async library autodetection. @@ -121,6 +120,7 @@ As well as these optional installs: * `rich` - Rich terminal support. *(Optional, with `httpx[cli]`)* * `click` - Command line client support. *(Optional, with `httpx[cli]`)* * `brotli` or `brotlicffi` - Decoding for "brotli" compressed responses. *(Optional, with `httpx[brotli]`)* +* `chardet` or `charset_normalizer` - Optional charset auto-detection. * `async_generator` - Backport support for `contextlib.asynccontextmanager`. *(Only required for Python 3.6)* A huge amount of credit is due to `requests` for the API layout that diff --git a/docs/quickstart.md b/docs/quickstart.md index e8923f02d7..1aa24fe46e 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -73,7 +73,7 @@ You can inspect what encoding will be used to decode the response. ``` In some cases the response may not contain an explicit encoding, in which case HTTPX -will attempt to automatically determine an encoding to use. +will default to using "utf-8". ```pycon >>> r.encoding diff --git a/httpx/_codecs.py b/httpx/_codecs.py index 7ab1ecd515..ddc554a550 100644 --- a/httpx/_codecs.py +++ b/httpx/_codecs.py @@ -37,7 +37,7 @@ where no charset encoding is included, UTF-8 is very likely to be used, since it is now so widely adopted. -## Using an explict encoding. +## Using an explicit encoding. In some cases we might be making requests to a site, where no character set information is being set explicitly by the server, but we know what diff --git a/requirements.txt b/requirements.txt index b135cf5335..f94121338e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,6 +5,8 @@ -e .[brotli,cli,http2,socks] charset-normalizer==2.0.6 +chardet==4.0.0 +types-chardet-4.0.3 # Documentation mkdocs==1.3.0 diff --git a/setup.py b/setup.py index 50aa35960a..63d92fbd48 100644 --- a/setup.py +++ b/setup.py @@ -57,7 +57,6 @@ def get_packages(package): zip_safe=False, install_requires=[ "certifi", - "charset_normalizer", "sniffio", "rfc3986[idna2008]>=1.3,<2", "httpcore>=0.14.5,<0.15.0", From 5f80256607b5fdb10d27b0035d3b8968041b3919 Mon Sep 17 00:00:00 2001 From: Tom Christie Date: Wed, 30 Mar 2022 13:04:30 +0100 Subject: [PATCH 04/12] Fix requirements --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index f94121338e..9142af6dfb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,7 +6,7 @@ charset-normalizer==2.0.6 chardet==4.0.0 -types-chardet-4.0.3 +types-chardet==4.0.3 # Documentation mkdocs==1.3.0 From 5189473fd305f892309350a7b466cbc3ce285cad Mon Sep 17 00:00:00 2001 From: Tom Christie Date: Wed, 30 Mar 2022 13:08:47 +0100 Subject: [PATCH 05/12] Fix text decoding --- httpx/_models.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/httpx/_models.py b/httpx/_models.py index 722282e00b..97a7ba9008 100644 --- a/httpx/_models.py +++ b/httpx/_models.py @@ -555,8 +555,7 @@ def text(self) -> str: if not content: self._text = "" else: - codec = codecs.lookup(self.encoding) - self._text = codec.decode(self.content, errors="replace") + self._text = self.content.decode(self.encoding, errors="replace") return self._text @property From fb39159670d7f9587c185cf5467ba6db2d94e78e Mon Sep 17 00:00:00 2001 From: Tom Christie Date: Wed, 30 Mar 2022 14:05:17 +0100 Subject: [PATCH 06/12] Drop unused import --- httpx/_models.py | 1 - 1 file changed, 1 deletion(-) diff --git a/httpx/_models.py b/httpx/_models.py index 97a7ba9008..02f4095849 100644 --- a/httpx/_models.py +++ b/httpx/_models.py @@ -1,5 +1,4 @@ import cgi -import codecs import datetime import email.message import json as jsonlib From 22dddd2c5d494bb2e2f0c0ad92442631ef1adf88 Mon Sep 17 00:00:00 2001 From: Tom Christie Date: Wed, 30 Mar 2022 14:13:13 +0100 Subject: [PATCH 07/12] Fix-up charset autodetection tests --- tests/models/test_responses.py | 16 +++++++++------- tests/test_decoders.py | 6 +++++- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/tests/models/test_responses.py b/tests/models/test_responses.py index 6682817936..cfc398bf34 100644 --- a/tests/models/test_responses.py +++ b/tests/models/test_responses.py @@ -177,7 +177,7 @@ def test_response_default_encoding(): assert response.encoding == "utf-8" -def test_response_fallback_to_autodetect(): +def test_response_fallback_to_utf8(): """ Fallback to default encoding if we get an invalid charset in the Content-Type header. """ @@ -228,16 +228,17 @@ def test_response_no_charset_with_utf8_content(): def test_response_no_charset_with_iso_8859_1_content(): """ A response with ISO 8859-1 encoded content should decode correctly, - even with no charset specified. - - TODO: nope + even with no charset specified, if charset autodetection is enabled. """ + codecs.register(httpx.charset_autodetect) + content = "Accented: Österreich abcdefghijklmnopqrstuzwxyz".encode("iso-8859-1") headers = {"Content-Type": "text/plain"} response = httpx.Response( 200, content=content, headers=headers, + default_encoding="charset_normalizer" ) assert response.text == "Accented: Österreich abcdefghijklmnopqrstuzwxyz" assert response.charset_encoding is None @@ -248,16 +249,17 @@ def test_response_no_charset_with_iso_8859_1_content(): def test_response_no_charset_with_cp_1252_content(): """ A response with Windows 1252 encoded content should decode correctly, - even with no charset specified. - - TODO: nope + even with no charset specified, if charset autodetection is enabled. """ + codecs.register(httpx.charset_autodetect) + content = "Euro Currency: € abcdefghijklmnopqrstuzwxyz".encode("cp1252") headers = {"Content-Type": "text/plain"} response = httpx.Response( 200, content=content, headers=headers, + default_encoding="charset_normalizer" ) assert response.text == "Euro Currency: € abcdefghijklmnopqrstuzwxyz" assert response.charset_encoding is None diff --git a/tests/test_decoders.py b/tests/test_decoders.py index f31abf098b..2124369a44 100644 --- a/tests/test_decoders.py +++ b/tests/test_decoders.py @@ -1,3 +1,4 @@ +import codecs import zlib import pytest @@ -184,16 +185,19 @@ def test_decoding_errors(header_value): ], ) @pytest.mark.asyncio -async def test_text_decoder(data, encoding): +async def test_charset_autodetection(data, encoding): async def iterator(): nonlocal data for chunk in data: yield chunk + codecs.register(httpx.charset_autodetect) + # Accessing `.text` on a read response. response = httpx.Response( 200, content=iterator(), + default_encoding="charset_normalizer" ) await response.aread() assert response.text == (b"".join(data)).decode(encoding) From cc120fca45e207d6dd7b13bdc0731539841a572c Mon Sep 17 00:00:00 2001 From: Tom Christie Date: Wed, 30 Mar 2022 14:16:37 +0100 Subject: [PATCH 08/12] Linting --- tests/models/test_responses.py | 10 ++-------- tests/test_decoders.py | 4 +--- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/tests/models/test_responses.py b/tests/models/test_responses.py index cfc398bf34..0eca15df8c 100644 --- a/tests/models/test_responses.py +++ b/tests/models/test_responses.py @@ -235,10 +235,7 @@ def test_response_no_charset_with_iso_8859_1_content(): content = "Accented: Österreich abcdefghijklmnopqrstuzwxyz".encode("iso-8859-1") headers = {"Content-Type": "text/plain"} response = httpx.Response( - 200, - content=content, - headers=headers, - default_encoding="charset_normalizer" + 200, content=content, headers=headers, default_encoding="charset_normalizer" ) assert response.text == "Accented: Österreich abcdefghijklmnopqrstuzwxyz" assert response.charset_encoding is None @@ -256,10 +253,7 @@ def test_response_no_charset_with_cp_1252_content(): content = "Euro Currency: € abcdefghijklmnopqrstuzwxyz".encode("cp1252") headers = {"Content-Type": "text/plain"} response = httpx.Response( - 200, - content=content, - headers=headers, - default_encoding="charset_normalizer" + 200, content=content, headers=headers, default_encoding="charset_normalizer" ) assert response.text == "Euro Currency: € abcdefghijklmnopqrstuzwxyz" assert response.charset_encoding is None diff --git a/tests/test_decoders.py b/tests/test_decoders.py index 2124369a44..d143f14ae5 100644 --- a/tests/test_decoders.py +++ b/tests/test_decoders.py @@ -195,9 +195,7 @@ async def iterator(): # Accessing `.text` on a read response. response = httpx.Response( - 200, - content=iterator(), - default_encoding="charset_normalizer" + 200, content=iterator(), default_encoding="charset_normalizer" ) await response.aread() assert response.text == (b"".join(data)).decode(encoding) From 05435f0f0841ed3d87883f56084fa783961e5196 Mon Sep 17 00:00:00 2001 From: Tom Christie Date: Wed, 30 Mar 2022 14:18:27 +0100 Subject: [PATCH 09/12] Add missing import to tests --- tests/models/test_responses.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/models/test_responses.py b/tests/models/test_responses.py index 0eca15df8c..31ec81a079 100644 --- a/tests/models/test_responses.py +++ b/tests/models/test_responses.py @@ -1,3 +1,4 @@ +import codecs import json import pickle From f5495791d06012035af72f67ed2948e710ab72fb Mon Sep 17 00:00:00 2001 From: Tom Christie Date: Wed, 30 Mar 2022 14:22:03 +0100 Subject: [PATCH 10/12] Fix up test cases --- httpx/_codecs.py | 10 +++++----- tests/models/test_responses.py | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/httpx/_codecs.py b/httpx/_codecs.py index ddc554a550..6e3fb2c2a2 100644 --- a/httpx/_codecs.py +++ b/httpx/_codecs.py @@ -91,10 +91,10 @@ class ChardetCodec(codecs.Codec): - def encode(input, errors="strict"): # type: ignore + def encode(self, input, errors="strict"): # type: ignore raise RuntimeError("The 'chardet' codec does not support encoding.") - def decode(input, errors="strict"): # type: ignore + def decode(self, input, errors="strict"): # type: ignore import chardet content: bytes = bytes(input) @@ -104,10 +104,10 @@ def decode(input, errors="strict"): # type: ignore class CharsetNormalizerCodec(codecs.Codec): - def encode(input, errors="strict"): # type: ignore + def encode(self, input, errors="strict"): # type: ignore raise RuntimeError("The 'charset_normalizer' codec does not support encoding.") - def decode(input, errors="strict"): # type: ignore + def decode(self, input, errors="strict"): # type: ignore import charset_normalizer content: bytes = bytes(input) @@ -117,7 +117,7 @@ def decode(input, errors="strict"): # type: ignore class NullIncrementalEncoder(codecs.IncrementalEncoder): - def encode(input, final=False): # type: ignore + def encode(self, input, final=False): # type: ignore raise RuntimeError("This codec does not support encoding.") diff --git a/tests/models/test_responses.py b/tests/models/test_responses.py index 31ec81a079..1e5d584fc2 100644 --- a/tests/models/test_responses.py +++ b/tests/models/test_responses.py @@ -240,8 +240,8 @@ def test_response_no_charset_with_iso_8859_1_content(): ) assert response.text == "Accented: Österreich abcdefghijklmnopqrstuzwxyz" assert response.charset_encoding is None - assert response.default_encoding == "utf-8" - assert response.encoding == "utf-8" + assert response.default_encoding == "charset_normalizer" + assert response.encoding == "charset_normalizer" def test_response_no_charset_with_cp_1252_content(): @@ -258,8 +258,8 @@ def test_response_no_charset_with_cp_1252_content(): ) assert response.text == "Euro Currency: € abcdefghijklmnopqrstuzwxyz" assert response.charset_encoding is None - assert response.default_encoding == "utf-8" - assert response.encoding == "utf-8" + assert response.default_encoding == "charset_normalizer" + assert response.encoding == "charset_normalizer" def test_response_non_text_encoding(): From 3b83054f92281481ca6bf77b394321cad8935540 Mon Sep 17 00:00:00 2001 From: Tom Christie Date: Wed, 30 Mar 2022 14:27:45 +0100 Subject: [PATCH 11/12] Drop now-incorrect portion of test case --- tests/test_decoders.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/tests/test_decoders.py b/tests/test_decoders.py index d143f14ae5..0b8f3cf244 100644 --- a/tests/test_decoders.py +++ b/tests/test_decoders.py @@ -200,13 +200,6 @@ async def iterator(): await response.aread() assert response.text == (b"".join(data)).decode(encoding) - # Streaming `.aiter_text` iteratively. - # Note that if we streamed the text *without* having read it first, then - # we won't get a `charset_normalizer` guess, and will instead always rely - # on utf-8 if no charset is specified. - text = "".join([part async for part in response.aiter_text()]) - assert text == (b"".join(data)).decode(encoding) - @pytest.mark.asyncio async def test_text_decoder_known_encoding(): From 6136c0f7701c0277a2920926508d8bf91f4a266a Mon Sep 17 00:00:00 2001 From: Tom Christie Date: Wed, 30 Mar 2022 14:34:12 +0100 Subject: [PATCH 12/12] Add chardet test case, and add 'nocover' lines --- httpx/_codecs.py | 10 +++++++--- tests/models/test_responses.py | 24 ++++++++++++++++++++++-- 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/httpx/_codecs.py b/httpx/_codecs.py index 6e3fb2c2a2..eee1a82446 100644 --- a/httpx/_codecs.py +++ b/httpx/_codecs.py @@ -92,7 +92,9 @@ class ChardetCodec(codecs.Codec): def encode(self, input, errors="strict"): # type: ignore - raise RuntimeError("The 'chardet' codec does not support encoding.") + raise RuntimeError( + "The 'chardet' codec does not support encoding." + ) # pragma: nocover def decode(self, input, errors="strict"): # type: ignore import chardet @@ -105,7 +107,9 @@ def decode(self, input, errors="strict"): # type: ignore class CharsetNormalizerCodec(codecs.Codec): def encode(self, input, errors="strict"): # type: ignore - raise RuntimeError("The 'charset_normalizer' codec does not support encoding.") + raise RuntimeError( + "The 'charset_normalizer' codec does not support encoding." + ) # pragma: nocover def decode(self, input, errors="strict"): # type: ignore import charset_normalizer @@ -118,7 +122,7 @@ def decode(self, input, errors="strict"): # type: ignore class NullIncrementalEncoder(codecs.IncrementalEncoder): def encode(self, input, final=False): # type: ignore - raise RuntimeError("This codec does not support encoding.") + raise RuntimeError("This codec does not support encoding.") # pragma: nocover def charset_autodetect(encoding_name: str) -> typing.Optional[codecs.CodecInfo]: diff --git a/tests/models/test_responses.py b/tests/models/test_responses.py index 1e5d584fc2..b39ca4513c 100644 --- a/tests/models/test_responses.py +++ b/tests/models/test_responses.py @@ -226,10 +226,11 @@ def test_response_no_charset_with_utf8_content(): assert response.encoding == "utf-8" -def test_response_no_charset_with_iso_8859_1_content(): +def test_response_no_charset_with_iso_8859_1_content_normalizer(): """ A response with ISO 8859-1 encoded content should decode correctly, - even with no charset specified, if charset autodetection is enabled. + even with no charset specified, if charset autodetection is enabled, + using charset_normalizer. """ codecs.register(httpx.charset_autodetect) @@ -244,6 +245,25 @@ def test_response_no_charset_with_iso_8859_1_content(): assert response.encoding == "charset_normalizer" +def test_response_no_charset_with_iso_8859_1_content_chardet(): + """ + A response with ISO 8859-1 encoded content should decode correctly, + even with no charset specified, if charset autodetection is enabled, + using chardet. + """ + codecs.register(httpx.charset_autodetect) + + content = "Accented: Österreich abcdefghijklmnopqrstuzwxyz".encode("iso-8859-1") + headers = {"Content-Type": "text/plain"} + response = httpx.Response( + 200, content=content, headers=headers, default_encoding="chardet" + ) + assert response.text == "Accented: Österreich abcdefghijklmnopqrstuzwxyz" + assert response.charset_encoding is None + assert response.default_encoding == "chardet" + assert response.encoding == "chardet" + + def test_response_no_charset_with_cp_1252_content(): """ A response with Windows 1252 encoded content should decode correctly,