Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ignore charset_normalizer related warning #1242

Merged

Conversation

graingert
Copy link
Member

No description provided.

@graingert graingert requested a review from a team July 15, 2021 18:39
@Kludex
Copy link
Sponsor Member

Kludex commented Jul 16, 2021

I'll add more context for future reference.

This PR silences a UserWarning that has been raised by the requests package since its latest release (2.26.0, 2021-07-13).

You can find the logs below:

Show test logs
__________________________________________________________________________________________ test_router[asyncio] ___________________________________________________________________________________________

client = <starlette.testclient.TestClient object at 0x7f9fd8775220>

def test_router(client):
    response = client.get("/")
    assert response.status_code == 200
    assert response.text == "Hello, world"

    response = client.post("/")
    assert response.status_code == 405
    assert response.text == "Method Not Allowed"

    response = client.get("/foo")
    assert response.status_code == 404
    assert response.text == "Not Found"

    response = client.get("/users")
    assert response.status_code == 200
    assert response.text == "All users"

    response = client.get("/users/tomchristie")
    assert response.status_code == 200
    assert response.text == "User tomchristie"

    response = client.get("/users/me")
    assert response.status_code == 200
    assert response.text == "User fixed me"

    response = client.get("/users/tomchristie/")
    assert response.status_code == 200
    assert response.url == "http://testserver/users/tomchristie"
    assert response.text == "User tomchristie"

    response = client.get("/users/nomatch")
    assert response.status_code == 200
    assert response.text == "User nomatch"

    response = client.get("/static/123")
    assert response.status_code == 200
  assert response.text == "xxxxx"

tests/test_routing.py:149:


../../../anaconda3/envs/starlette/lib/python3.8/site-packages/requests/models.py:865: in text
encoding = self.apparent_encoding
../../../anaconda3/envs/starlette/lib/python3.8/site-packages/requests/models.py:735: in apparent_encoding
return chardet.detect(self.content)['encoding']
../../../anaconda3/envs/starlette/lib/python3.8/site-packages/charset_normalizer/legacy.py:23: in detect
r = from_bytes(byte_str).best()


sequences = b'xxxxx', steps = 1, chunk_size = 5, threshold = 0.2, cp_isolation = [], cp_exclusion = [], preemptive_behaviour = True, explain = False

def from_bytes(
        sequences: bytes,
        steps: int = 5,
        chunk_size: int = 512,
        threshold: float = 0.2,
        cp_isolation: List[str] = None,
        cp_exclusion: List[str] = None,
        preemptive_behaviour: bool = True,
        explain: bool = False
) -> CharsetMatches:
    """
    Given a raw bytes sequence, return the best possibles charset usable to render str objects.
    If there is no results, it is a strong indicator that the source is binary/not text.
    By default, the process will extract 5 blocs of 512o each to assess the mess and coherence of a given sequence.
    And will give up a particular code page after 20% of measured mess. Those criteria are customizable at will.

    The preemptive behavior DOES NOT replace the traditional detection workflow, it prioritize a particular code page
    but never take it for granted. Can improve the performance.

    You may want to focus your attention to some code page or/and not others, use cp_isolation and cp_exclusion for that
    purpose.

    This function will strip the SIG in the payload/sequence every time except on UTF-16, UTF-32.
    """

    if not explain:
        logger.setLevel(logging.CRITICAL)
    else:
        logger.setLevel(logging.INFO)

    length = len(sequences)  # type: int

    if length == 0:
        logger.warning("Given content is empty, stopping the process very early, returning empty utf_8 str match")
        return CharsetMatches(
            [
                CharsetMatch(
                    sequences,
                    "utf_8",
                    0.,
                    False,
                    [],
                    ""
                )
            ]
        )

    if cp_isolation is not None:
        logger.warning('cp_isolation is set. use this flag for debugging purpose. '
                       'limited list of encoding allowed : %s.',
                       ', '.join(cp_isolation))
        cp_isolation = [iana_name(cp, False) for cp in cp_isolation]
    else:
        cp_isolation = []

    if cp_exclusion is not None:
        logger.warning(
            'cp_exclusion is set. use this flag for debugging purpose. '
            'limited list of encoding excluded : %s.',
            ', '.join(cp_exclusion))
        cp_exclusion = [iana_name(cp, False) for cp in cp_exclusion]
    else:
        cp_exclusion = []

    if length <= (chunk_size * steps):
        logger.warning(
            'override steps (%i) and chunk_size (%i) as content does not fit (%i byte(s) given) parameters.',
            steps, chunk_size, length)
        steps = 1
        chunk_size = length

    if steps > 1 and length / steps < chunk_size:
        chunk_size = int(length / steps)

    is_too_small_sequence = len(sequences) < TOO_SMALL_SEQUENCE  # type: bool
    is_too_large_sequence = len(sequences) >= TOO_BIG_SEQUENCE  # type: bool

    if is_too_small_sequence:
      warn('Trying to detect encoding from a tiny portion of ({}) byte(s).'.format(length))

E UserWarning: Trying to detect encoding from a tiny portion of (5) byte(s).

../../../anaconda3/envs/starlette/lib/python3.8/site-packages/charset_normalizer/api.py:105: UserWarning
____________________________________________________________________________________________ test_router[trio] ____________________________________________________________________________________________

client = <starlette.testclient.TestClient object at 0x7f9fd84e2340>

def test_router(client):
    response = client.get("/")
    assert response.status_code == 200
    assert response.text == "Hello, world"

    response = client.post("/")
    assert response.status_code == 405
    assert response.text == "Method Not Allowed"

    response = client.get("/foo")
    assert response.status_code == 404
    assert response.text == "Not Found"

    response = client.get("/users")
    assert response.status_code == 200
    assert response.text == "All users"

    response = client.get("/users/tomchristie")
    assert response.status_code == 200
    assert response.text == "User tomchristie"

    response = client.get("/users/me")
    assert response.status_code == 200
    assert response.text == "User fixed me"

    response = client.get("/users/tomchristie/")
    assert response.status_code == 200
    assert response.url == "http://testserver/users/tomchristie"
    assert response.text == "User tomchristie"

    response = client.get("/users/nomatch")
    assert response.status_code == 200
    assert response.text == "User nomatch"

    response = client.get("/static/123")
    assert response.status_code == 200
  assert response.text == "xxxxx"

tests/test_routing.py:149:


../../../anaconda3/envs/starlette/lib/python3.8/site-packages/requests/models.py:865: in text
encoding = self.apparent_encoding
../../../anaconda3/envs/starlette/lib/python3.8/site-packages/requests/models.py:735: in apparent_encoding
return chardet.detect(self.content)['encoding']
../../../anaconda3/envs/starlette/lib/python3.8/site-packages/charset_normalizer/legacy.py:23: in detect
r = from_bytes(byte_str).best()


sequences = b'xxxxx', steps = 1, chunk_size = 5, threshold = 0.2, cp_isolation = [], cp_exclusion = [], preemptive_behaviour = True, explain = False

def from_bytes(
        sequences: bytes,
        steps: int = 5,
        chunk_size: int = 512,
        threshold: float = 0.2,
        cp_isolation: List[str] = None,
        cp_exclusion: List[str] = None,
        preemptive_behaviour: bool = True,
        explain: bool = False
) -> CharsetMatches:
    """
    Given a raw bytes sequence, return the best possibles charset usable to render str objects.
    If there is no results, it is a strong indicator that the source is binary/not text.
    By default, the process will extract 5 blocs of 512o each to assess the mess and coherence of a given sequence.
    And will give up a particular code page after 20% of measured mess. Those criteria are customizable at will.

    The preemptive behavior DOES NOT replace the traditional detection workflow, it prioritize a particular code page
    but never take it for granted. Can improve the performance.

    You may want to focus your attention to some code page or/and not others, use cp_isolation and cp_exclusion for that
    purpose.

    This function will strip the SIG in the payload/sequence every time except on UTF-16, UTF-32.
    """

    if not explain:
        logger.setLevel(logging.CRITICAL)
    else:
        logger.setLevel(logging.INFO)

    length = len(sequences)  # type: int

    if length == 0:
        logger.warning("Given content is empty, stopping the process very early, returning empty utf_8 str match")
        return CharsetMatches(
            [
                CharsetMatch(
                    sequences,
                    "utf_8",
                    0.,
                    False,
                    [],
                    ""
                )
            ]
        )

    if cp_isolation is not None:
        logger.warning('cp_isolation is set. use this flag for debugging purpose. '
                       'limited list of encoding allowed : %s.',
                       ', '.join(cp_isolation))
        cp_isolation = [iana_name(cp, False) for cp in cp_isolation]
    else:
        cp_isolation = []

    if cp_exclusion is not None:
        logger.warning(
            'cp_exclusion is set. use this flag for debugging purpose. '
            'limited list of encoding excluded : %s.',
            ', '.join(cp_exclusion))
        cp_exclusion = [iana_name(cp, False) for cp in cp_exclusion]
    else:
        cp_exclusion = []

    if length <= (chunk_size * steps):
        logger.warning(
            'override steps (%i) and chunk_size (%i) as content does not fit (%i byte(s) given) parameters.',
            steps, chunk_size, length)
        steps = 1
        chunk_size = length

    if steps > 1 and length / steps < chunk_size:
        chunk_size = int(length / steps)

    is_too_small_sequence = len(sequences) < TOO_SMALL_SEQUENCE  # type: bool
    is_too_large_sequence = len(sequences) >= TOO_BIG_SEQUENCE  # type: bool

    if is_too_small_sequence:
      warn('Trying to detect encoding from a tiny portion of ({}) byte(s).'.format(length))

E UserWarning: Trying to detect encoding from a tiny portion of (5) byte(s).

EDIT: More specifically, this warning was introduced by charset_normalizer (a requests dependency) in its 2.0 release.

@graingert graingert merged commit b0a6d6f into encode:master Jul 16, 2021
@graingert graingert deleted the filterwarnings-charset-normalizer branch July 16, 2021 10:07
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

Successfully merging this pull request may close these issues.

None yet

2 participants