✔️ Add some additional test case + ✏️ Complete docs #51

Merged
merged 5 commits on Jul 14, 2021
12 changes: 11 additions & 1 deletion charset_normalizer/api.py
@@ -37,6 +37,16 @@ def from_bytes(
"""
Given a raw bytes sequence, return the best possible charsets usable to render str objects.
If there are no results, it is a strong indicator that the source is binary/not text.
By default, the process will extract 5 blocks of 512 bytes each to assess the mess and coherence of a given sequence,
and will give up on a particular code page once the measured mess exceeds 20%. Those criteria are customizable at will.

The preemptive behavior DOES NOT replace the traditional detection workflow; it prioritizes a particular code page
but never takes it for granted. It can improve performance.

Should you want to focus on some code pages and/or exclude others, use cp_isolation and cp_exclusion for that
purpose.

This function will strip the SIG (signature/BOM) from the payload/sequence every time, except for UTF-16 and UTF-32.
"""

if not explain:
@@ -152,7 +162,7 @@ def from_bytes(
try:
    if is_too_large_sequence and is_multi_byte_decoder is False:
        str(
-           sequences[:int(50e4)],
+           sequences[:int(50e4)] if strip_sig_or_bom is False else sequences[len(sig_payload):int(50e4)],
            encoding=encoding_iana
        )
    else:
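For context, a minimal usage sketch of the tunables the docstring above describes. The parameter names (steps, chunk_size, threshold, cp_isolation) follow the from_bytes signature this PR touches; the sample payload is made up for illustration:

    from charset_normalizer.api import from_bytes

    payload = "Bonjour, ceci est un essai.".encode("utf_8")

    # Defaults: 5 chunks of 512 bytes each; give up past 20% measured mess.
    matches = from_bytes(payload)

    # The same call with the criteria spelled out, restricted to two code pages.
    matches = from_bytes(
        payload,
        steps=5,                            # number of chunks to extract
        chunk_size=512,                     # size of each chunk, in bytes
        threshold=0.20,                     # maximum tolerated mess ratio
        cp_isolation=["utf_8", "latin_1"],  # only consider these code pages
    )

    best_match = matches.best()
    if best_match is not None:              # best() yields None when nothing fits
        print(best_match.encoding)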
2 changes: 1 addition & 1 deletion docs/handling_result.rst
@@ -15,7 +15,7 @@ When initiating search upon a buffer, bytes or file you can assign the return va

print(result.encoding) # gb18030

-Using CharsetNormalizerMatch
+Using CharsetMatch
----------------------------

Here, ``result`` is a ``CharsetMatch`` object or ``None``.
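Tying the corrected heading to the surrounding text: ``result`` is a ``CharsetMatch``, and its attributes can be read directly. A short sketch, reusing only what appears in this PR's docs and tests (encoding, raw, str()); the byte payload is illustrative:

    from charset_normalizer.api import from_bytes

    result = from_bytes("我没有埋怨，磋砣的只是一些时间。".encode("gb18030")).best()

    if result is not None:        # best() returns None when no match was found
        print(result.encoding)    # gb18030, as in the docs example above
        print(str(result))        # the decoded unicode form of the payload
        print(len(result.raw))    # the original byte sequence, untouched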
52 changes: 52 additions & 0 deletions tests/test_on_byte.py
@@ -1,6 +1,7 @@
import unittest

from charset_normalizer.api import from_bytes
from charset_normalizer.constant import TOO_BIG_SEQUENCE


class TestBytes(unittest.TestCase):
@@ -27,6 +28,57 @@ def test_empty_bytes(self):
            len(r.alphabets)
        )

    def test_empty_str_with_sig_gb18030(self):
        r = from_bytes('\uFEFF'.encode('gb18030')).best()

        self.assertIsNotNone(r)
        self.assertEqual(
            "",
            str(r)
        )
        self.assertEqual(
            "gb18030",
            r.encoding
        )
        self.assertEqual(
            4,
            len(r.raw)
        )

    def test_empty_str_with_sig_utf8(self):
        r = from_bytes(b'\xef\xbb\xbf').best()

        self.assertIsNotNone(r)
        self.assertEqual(
            "",
            str(r)
        )
        self.assertEqual(
            "utf_8",
            r.encoding
        )
        self.assertEqual(
            3,
            len(r.raw)
        )

    def test_empty_str_with_large_sig_utf8(self):
        r = from_bytes(b'\xef\xbb\xbf' + (b'0' * TOO_BIG_SEQUENCE)).best()

        self.assertIsNotNone(r)
        self.assertEqual(
            '0' * TOO_BIG_SEQUENCE,
            str(r)
        )
        self.assertEqual(
            "utf_8",
            r.encoding
        )
        self.assertEqual(
            TOO_BIG_SEQUENCE + 3,
            len(r.raw)
        )

    def test_on_empty_json(self):

        with self.subTest("Detecting empty JSON as ASCII"):
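The raw-length assertions in the new tests (4, 3, and TOO_BIG_SEQUENCE + 3) follow from the size of the encoded signature character itself; a quick standalone check in plain Python:

    # U+FEFF, the signature/BOM character, encodes to different byte counts:
    assert '\ufeff'.encode('gb18030') == b'\x84\x31\x95\x33'  # 4 bytes -> len(r.raw) == 4
    assert '\ufeff'.encode('utf_8') == b'\xef\xbb\xbf'        # 3 bytes -> len(r.raw) == 3

    # In test_empty_str_with_large_sig_utf8 the BOM precedes the payload,
    # hence the expected raw length of TOO_BIG_SEQUENCE + 3.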