Skip to content

Commit

Permalink
Add a default_encoding parameter to [set|autodetect] the encoding i…
Browse files Browse the repository at this point in the history
…f no charset is found in the headers
  • Loading branch information
deedy5 committed Apr 7, 2024
1 parent 418e452 commit 5491d5e
Show file tree
Hide file tree
Showing 6 changed files with 119 additions and 28 deletions.
4 changes: 4 additions & 0 deletions curl_cffi/requests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ def request(
impersonate: Optional[Union[str, BrowserType]] = None,
thread: Optional[ThreadType] = None,
default_headers: Optional[bool] = None,
default_encoding: Union[str, Callable[[bytes], str]] = "utf-8",
curl_options: Optional[dict] = None,
http_version: Optional[CurlHttpVersion] = None,
debug: bool = False,
Expand Down Expand Up @@ -90,6 +91,8 @@ def request(
impersonate: which browser version to impersonate.
thread: work with other thread implementations. choices: eventlet, gevent.
default_headers: whether to set default browser headers.
default_encoding: encoding for decoding response content if charset is not found in headers.
Defaults to "utf-8". Can be set to a callable for automatic detection.
curl_options: extra curl options to use.
http_version: limiting http version, http2 will be tried by default.
debug: print extra curl debug info.
Expand Down Expand Up @@ -122,6 +125,7 @@ def request(
content_callback=content_callback,
impersonate=impersonate,
default_headers=default_headers,
default_encoding=default_encoding,
http_version=http_version,
interface=interface,
multipart=multipart,
Expand Down
67 changes: 59 additions & 8 deletions curl_cffi/requests/models.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,18 @@
import queue
import re
import warnings
from concurrent.futures import Future
from functools import cached_property
from json import loads
from typing import Any, Awaitable, Dict, List, Optional
from typing import Any, Awaitable, Callable, Dict, List, Optional, Union

from .. import Curl
from .cookies import Cookies
from .errors import RequestsError
from .headers import Headers

# Extracts the charset parameter from a Content-Type header value.
# Parameter names are case-insensitive and the value may be written as a
# quoted string (e.g. `charset="utf-8"`), so both forms are accepted.
# group(1) is always the bare charset name, without quotes.
CHARSET_RE = re.compile(r"charset=\"?([\w-]+)", re.IGNORECASE)


def clear_queue(q: queue.Queue):
with q.mutex:
Expand Down Expand Up @@ -41,6 +45,8 @@ class Response:
elapsed: how many seconds the request cost.
encoding: http body encoding.
charset: alias for encoding.
charset_encoding: encoding specified by the Content-Type header.
default_encoding: user-defined encoding used for decoding content if charset is not found in headers.
redirect_count: how many redirects happened.
redirect_url: the final redirected url.
http_version: http version used.
Expand All @@ -58,8 +64,7 @@ def __init__(self, curl: Optional[Curl] = None, request: Optional[Request] = Non
self.headers = Headers()
self.cookies = Cookies()
self.elapsed = 0.0
self.encoding = "utf-8"
self.charset = self.encoding
self.default_encoding: Union[str, Callable[[bytes], str]] = "utf-8"
self.redirect_count = 0
self.redirect_url = ""
self.http_version = 0
Expand All @@ -70,16 +75,62 @@ def __init__(self, curl: Optional[Curl] = None, request: Optional[Request] = Non
self.astream_task: Optional[Awaitable] = None
self.quit_now = None

@property
def charset(self) -> str:
    """Return the value of `encoding`; `charset` is kept as an alias for it."""
    aliased = self.encoding
    return aliased

@property
def encoding(self) -> str:
    """
    Return the codec name used to decode the response bytes into text.

    Resolution order:
    1. A value assigned to `.encoding`, or one cached by an earlier lookup.
    2. The `charset` parameter of the `Content-Type` header.
    3. `default_encoding`: either a codec name (e.g., "utf-8") or a callable
       that receives the body bytes and returns a codec name.
    If none of these yields a value, "utf-8" is used.
    """
    if hasattr(self, "_encoding"):
        return self._encoding

    resolved = self.charset_encoding
    if resolved is None:
        fallback = self.default_encoding
        if isinstance(fallback, str):
            resolved = fallback
        elif callable(fallback):
            resolved = fallback(self.content)
    # Cache the result so a detection callable runs at most once.
    self._encoding = resolved if resolved else "utf-8"
    return self._encoding

@encoding.setter
def encoding(self, value: str) -> None:
    """Set the encoding explicitly; refused once the decoded text has been cached."""
    text_already_decoded = hasattr(self, "_text")
    if text_already_decoded:
        raise ValueError("Cannot set encoding after text has been accessed")
    self._encoding = value

@property
def charset_encoding(self) -> Optional[str]:
    """Return the charset named in the Content-Type header, or None if there is none."""
    header_value = self.headers.get("Content-Type")
    if not header_value:
        return None
    found = CHARSET_RE.search(header_value)
    if found is None:
        return None
    return found.group(1)

@property
def text(self) -> str:
    """Return the body decoded to str; the result is computed once and then cached."""
    if hasattr(self, "_text"):
        return self._text
    # An empty body short-circuits to "" without going through the decoder.
    self._text = self._decode(self.content) if self.content else ""
    return self._text

def _decode(self, content: bytes) -> str:
try:
return content.decode(self.charset, errors="replace")
return content.decode(self.encoding, errors="replace")
except (UnicodeDecodeError, LookupError):
return content.decode("utf-8-sig")

@property
def text(self) -> str:
return self._decode(self.content)

def raise_for_status(self):
"""Raise an error if status code is not in [200, 400)"""
if not self.ok:
Expand Down
36 changes: 18 additions & 18 deletions curl_cffi/requests/session.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import asyncio
import math
import queue
import re
import threading
import warnings
from concurrent.futures import ThreadPoolExecutor
Expand Down Expand Up @@ -55,7 +54,6 @@ class ProxySpec(TypedDict, total=False):
else:
ProxySpec = Dict[str, str]

CHARSET_RE = re.compile(r"charset=([\w-]+)")
ThreadType = Literal["eventlet", "gevent"]


Expand Down Expand Up @@ -205,6 +203,7 @@ def __init__(
max_redirects: int = -1,
impersonate: Optional[Union[str, BrowserType]] = None,
default_headers: bool = True,
default_encoding: Union[str, Callable[[bytes], str]] = "utf-8",
curl_options: Optional[dict] = None,
curl_infos: Optional[list] = None,
http_version: Optional[CurlHttpVersion] = None,
Expand All @@ -224,6 +223,7 @@ def __init__(
self.max_redirects = max_redirects
self.impersonate = impersonate
self.default_headers = default_headers
self.default_encoding = default_encoding
self.curl_options = curl_options or {}
self.curl_infos = curl_infos or []
self.http_version = http_version
Expand Down Expand Up @@ -547,7 +547,7 @@ def qput(chunk):

return req, buffer, header_buffer, q, header_recved, quit_now

def _parse_response(self, curl, buffer, header_buffer):
def _parse_response(self, curl, buffer, header_buffer, default_encoding):
c = curl
rsp = Response(c)
rsp.url = cast(bytes, c.getinfo(CurlInfo.EFFECTIVE_URL)).decode()
Expand Down Expand Up @@ -583,13 +583,7 @@ def _parse_response(self, curl, buffer, header_buffer):
rsp.cookies = self.cookies
# print("Cookies after extraction", self.cookies)

content_type = rsp.headers.get("Content-Type", default="")
charset_match = CHARSET_RE.search(content_type)
charset = charset_match.group(1) if charset_match else "utf-8"

rsp.charset = charset
rsp.encoding = charset # TODO use chardet

rsp.default_encoding = default_encoding
rsp.elapsed = cast(float, c.getinfo(CurlInfo.TOTAL_TIME))
rsp.redirect_count = cast(int, c.getinfo(CurlInfo.REDIRECT_COUNT))
rsp.redirect_url = cast(bytes, c.getinfo(CurlInfo.REDIRECT_URL)).decode()
Expand Down Expand Up @@ -639,6 +633,8 @@ def __init__(
max_redirects: max redirect counts, default unlimited(-1).
impersonate: which browser version to impersonate in the session.
interface: which interface to use for requests to the server.
default_encoding: encoding for decoding response content if charset is not found in headers.
Defaults to "utf-8". Can be set to a callable for automatic detection.
Notes:
This class can be used as a context manager.
Expand Down Expand Up @@ -767,6 +763,7 @@ def request(
content_callback: Optional[Callable] = None,
impersonate: Optional[Union[str, BrowserType]] = None,
default_headers: Optional[bool] = None,
default_encoding: Union[str, Callable[[bytes], str]] = "utf-8",
http_version: Optional[CurlHttpVersion] = None,
interface: Optional[str] = None,
cert: Optional[Union[str, Tuple[str, str]]] = None,
Expand Down Expand Up @@ -825,7 +822,7 @@ def perform():
try:
c.perform()
except CurlError as e:
rsp = self._parse_response(c, buffer, header_buffer)
rsp = self._parse_response(c, buffer, header_buffer, default_encoding)
rsp.request = req
cast(queue.Queue, q).put_nowait(RequestsError(str(e), e.code, rsp))
finally:
Expand All @@ -843,7 +840,7 @@ def cleanup(fut):

# Wait for the first chunk
cast(threading.Event, header_recved).wait()
rsp = self._parse_response(c, buffer, header_buffer)
rsp = self._parse_response(c, buffer, header_buffer, default_encoding)
header_parsed.set()

# Raise the exception if something wrong happens when receiving the header.
Expand All @@ -868,11 +865,11 @@ def cleanup(fut):
else:
c.perform()
except CurlError as e:
rsp = self._parse_response(c, buffer, header_buffer)
rsp = self._parse_response(c, buffer, header_buffer, default_encoding)
rsp.request = req
raise RequestsError(str(e), e.code, rsp) from e
else:
rsp = self._parse_response(c, buffer, header_buffer)
rsp = self._parse_response(c, buffer, header_buffer, default_encoding)
rsp.request = req
return rsp
finally:
Expand Down Expand Up @@ -919,6 +916,8 @@ def __init__(
allow_redirects: whether to allow redirection.
max_redirects: max redirect counts, default unlimited(-1).
impersonate: which browser version to impersonate in the session.
default_encoding: encoding for decoding response content if charset is not found in headers.
Defaults to "utf-8". Can be set to a callable for automatic detection.
Notes:
This class can be used as a context manager, and it's recommended to use via
Expand Down Expand Up @@ -1043,6 +1042,7 @@ async def request(
content_callback: Optional[Callable] = None,
impersonate: Optional[Union[str, BrowserType]] = None,
default_headers: Optional[bool] = None,
default_encoding: Union[str, Callable[[bytes], str]] = "utf-8",
http_version: Optional[CurlHttpVersion] = None,
interface: Optional[str] = None,
cert: Optional[Union[str, Tuple[str, str]]] = None,
Expand Down Expand Up @@ -1093,7 +1093,7 @@ async def perform():
try:
await task
except CurlError as e:
rsp = self._parse_response(curl, buffer, header_buffer)
rsp = self._parse_response(curl, buffer, header_buffer, default_encoding)
rsp.request = req
cast(asyncio.Queue, q).put_nowait(RequestsError(str(e), e.code, rsp))
finally:
Expand All @@ -1113,7 +1113,7 @@ def cleanup(fut):
# Unlike threads, coroutines do not use preemptive scheduling.
# For asyncio, there is no need for a header_parsed event, the
# _parse_response will execute in the foreground, no background tasks running.
rsp = self._parse_response(curl, buffer, header_buffer)
rsp = self._parse_response(curl, buffer, header_buffer, default_encoding)

first_element = _peek_aio_queue(cast(asyncio.Queue, q))
if isinstance(first_element, RequestsError):
Expand All @@ -1132,11 +1132,11 @@ def cleanup(fut):
await task
# print(curl.getinfo(CurlInfo.CAINFO))
except CurlError as e:
rsp = self._parse_response(curl, buffer, header_buffer)
rsp = self._parse_response(curl, buffer, header_buffer, default_encoding)
rsp.request = req
raise RequestsError(str(e), e.code, rsp) from e
else:
rsp = self._parse_response(curl, buffer, header_buffer)
rsp = self._parse_response(curl, buffer, header_buffer, default_encoding)
rsp.request = req
return rsp
finally:
Expand Down
4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ description = "libcurl ffi bindings for Python, with impersonation support."
license = { file = "LICENSE" }
dependencies = [
"cffi>=1.12.0",
"certifi",
"certifi>=2024.2.2",
]
readme = "README.md"
requires-python = ">=3.8"
Expand All @@ -27,6 +27,7 @@ classifiers = [
[project.optional-dependencies]
dev = [
"autoflake==1.4",
"charset_normalizer>=3.3.2,<4",
"coverage==6.4.1",
"cryptography==38.0.3",
"flake8==6.0.0",
Expand All @@ -50,6 +51,7 @@ build = [
"wheel",
]
test = [
"charset_normalizer>=3.3.2,<4",
"cryptography==38.0.3",
"httpx==0.23.1",
"types-certifi==2021.10.8.2",
Expand Down
18 changes: 18 additions & 0 deletions tests/unittest/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,8 @@ async def app(scope, receive, send):
await incomplete_read(scope, receive, send)
elif scope["path"].startswith("/gbk"):
await hello_world_gbk(scope, receive, send)
elif scope["path"].startswith("/windows1251"):
await hello_world_windows1251(scope, receive, send)
elif scope["path"].startswith("http://"):
await http_proxy(scope, receive, send)
elif scope["method"] == "CONNECT":
Expand Down Expand Up @@ -165,6 +167,22 @@ async def hello_world_gbk(scope, receive, send):
await send({"type": "http.response.body", "body": b"Hello, world!"})


async def hello_world_windows1251(scope, receive, send):
    """Send a 200 text/plain response (no charset parameter) with a cp1251-encoded body."""
    start_message = {
        "type": "http.response.start",
        "status": 200,
        "headers": [[b"content-type", b"text/plain"]],
    }
    await send(start_message)

    payload = "Bсеки човек има право на образование.".encode("cp1251")
    body_message = {"type": "http.response.body", "body": payload}
    await send(body_message)
)


async def http_proxy(scope, receive, send):
await send(
{
Expand Down
18 changes: 17 additions & 1 deletion tests/unittest/test_requests.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from io import BytesIO

import pytest
from charset_normalizer import detect

from curl_cffi import CurlOpt, requests
from curl_cffi.const import CurlECode, CurlInfo
Expand Down Expand Up @@ -109,7 +110,22 @@ def test_headers(server):

def test_charset_parse(server):
    """Expect both `charset` and `encoding` to report gbk for the /gbk endpoint."""
    gbk_url = str(server.url.copy_with(path="/gbk"))
    response = requests.get(gbk_url)
    assert response.charset == "gbk"
    assert response.encoding == "gbk"


def test_charset_default_encoding(server):
    """Expect a string `default_encoding` to be reported as the response encoding."""
    target_url = str(server.url.copy_with(path="/windows1251"))
    response = requests.get(target_url, default_encoding="windows-1251")
    assert response.encoding == "windows-1251"


def test_charset_default_encoding_autodetect(server):
    """Expect a callable `default_encoding` to supply the response encoding."""

    def guess_encoding(raw):
        detection = detect(raw)
        return detection.get("encoding")

    target_url = str(server.url.copy_with(path="/windows1251"))
    response = requests.get(target_url, default_encoding=guess_encoding)
    assert response.encoding == "windows-1251"


def test_content_type_header_with_json(server):
Expand Down

0 comments on commit 5491d5e

Please sign in to comment.