diff --git a/charset_normalizer/api.py b/charset_normalizer/api.py index 3cd69315..bf2dcf07 100644 --- a/charset_normalizer/api.py +++ b/charset_normalizer/api.py @@ -37,6 +37,16 @@ def from_bytes( """ Given a raw bytes sequence, return the best possibles charset usable to render str objects. If there is no results, it is a strong indicator that the source is binary/not text. + By default, the process will extract 5 blocks of 512 bytes each to assess the mess and coherence of a given sequence. + And will give up a particular code page after 20% of measured mess. Those criteria are customizable at will. + + The preemptive behavior DOES NOT replace the traditional detection workflow, it prioritizes a particular code page + but never takes it for granted. This can improve performance. + + You may want to focus your attention on some code pages and/or exclude others, use cp_isolation and cp_exclusion for that + purpose. + + This function will strip the SIG in the payload/sequence every time except on UTF-16, UTF-32. """ if not explain: @@ -152,7 +162,7 @@ def from_bytes( try: if is_too_large_sequence and is_multi_byte_decoder is False: str( - sequences[:int(50e4)], + sequences[:int(50e4)] if strip_sig_or_bom is False else sequences[len(sig_payload):int(50e4)], encoding=encoding_iana ) else: diff --git a/docs/handling_result.rst b/docs/handling_result.rst index f161ffb8..067771ee 100755 --- a/docs/handling_result.rst +++ b/docs/handling_result.rst @@ -15,7 +15,7 @@ When initiating search upon a buffer, bytes or file you can assign the return va print(result.encoding) # gb18030 -Using CharsetNormalizerMatch +Using CharsetMatch ---------------------------- Here, ``result`` is a ``CharsetMatch`` object or ``None``. 
diff --git a/tests/test_on_byte.py b/tests/test_on_byte.py index ed8a9173..99518b7a 100644 --- a/tests/test_on_byte.py +++ b/tests/test_on_byte.py @@ -1,6 +1,7 @@ import unittest from charset_normalizer.api import from_bytes +from charset_normalizer.constant import TOO_BIG_SEQUENCE class TestBytes(unittest.TestCase): @@ -27,6 +28,57 @@ def test_empty_bytes(self): len(r.alphabets) ) + def test_empty_str_with_sig_gb18030(self): + r = from_bytes('\uFEFF'.encode('gb18030')).best() + + self.assertIsNotNone(r) + self.assertEqual( + "", + str(r) + ) + self.assertEqual( + "gb18030", + r.encoding + ) + self.assertEqual( + 4, + len(r.raw) + ) + + def test_empty_str_with_sig_utf8(self): + r = from_bytes(b'\xef\xbb\xbf').best() + + self.assertIsNotNone(r) + self.assertEqual( + "", + str(r) + ) + self.assertEqual( + "utf_8", + r.encoding + ) + self.assertEqual( + 3, + len(r.raw) + ) + + def test_empty_str_with_large_sig_utf8(self): + r = from_bytes(b'\xef\xbb\xbf' + (b'0' * TOO_BIG_SEQUENCE)).best() + + self.assertIsNotNone(r) + self.assertEqual( + '0' * TOO_BIG_SEQUENCE, + str(r) + ) + self.assertEqual( + "utf_8", + r.encoding + ) + self.assertEqual( + TOO_BIG_SEQUENCE + 3, + len(r.raw) + ) + def test_on_empty_json(self): with self.subTest("Detecting empty JSON as ASCII"):