From 85d0e8b42db22ad77c4e6d3f968dd6b5cc2177ab Mon Sep 17 00:00:00 2001 From: Ahmed TAHRI Date: Mon, 12 Jul 2021 21:33:45 +0200 Subject: [PATCH 1/4] Always strip the SIG in raw bytes except on u16/u32 --- charset_normalizer/api.py | 14 ++++++++++++-- tests/test_on_byte.py | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 2 deletions(-) diff --git a/charset_normalizer/api.py b/charset_normalizer/api.py index 0161b8ee..6a246a57 100644 --- a/charset_normalizer/api.py +++ b/charset_normalizer/api.py @@ -37,6 +37,16 @@ def from_bytes( """ Given a raw bytes sequence, return the best possibles charset usable to render str objects. If there is no results, it is a strong indicator that the source is binary/not text. + By default, the process will extract 5 blocks of 512 bytes each to assess the mess and coherence of a given sequence. + And will give up a particular code page after 20% of measured mess. Those criteria are customizable at will. + + The preemptive behavior DOES NOT replace the traditional detection workflow, it prioritizes a particular code page + but never takes it for granted. Can improve the performance. + + You may want to focus your attention to some code page and/or not others, use cp_isolation and cp_exclusion for that + purpose. + + This function will strip the SIG in the payload/sequence every time except on UTF-16, UTF-32. 
""" if not explain: @@ -152,7 +162,7 @@ def from_bytes( try: if is_too_large_sequence and is_multi_byte_decoder is False: str( - sequences[:int(50e4)], + sequences[:int(50e4)] if strip_sig_or_bom is False else sequences[len(sig_payload):int(50e4)], encoding=encoding_iana ) else: @@ -273,7 +283,7 @@ def from_bytes( results.append( CharsetMatch( - sequences, + sequences if strip_sig_or_bom is False else sequences[len(sig_payload):], encoding_iana, mean_mess_ratio, bom_or_sig_available, diff --git a/tests/test_on_byte.py b/tests/test_on_byte.py index 945fff90..9444f6b7 100644 --- a/tests/test_on_byte.py +++ b/tests/test_on_byte.py @@ -27,6 +27,40 @@ def test_empty_bytes(self): len(r.alphabets) ) + def test_empty_str_with_sig_gb18030(self): + r = from_bytes('\uFEFF'.encode('gb18030')).best() + + self.assertIsNotNone(r) + self.assertEqual( + "", + str(r) + ) + self.assertEqual( + "gb18030", + r.encoding + ) + self.assertEqual( + 0, + len(r.raw) + ) + + def test_empty_str_with_sig_utf8(self): + r = from_bytes(b'\xef\xbb\xbf').best() + + self.assertIsNotNone(r) + self.assertEqual( + "", + str(r) + ) + self.assertEqual( + "utf_8", + r.encoding + ) + self.assertEqual( + 0, + len(r.raw) + ) + def test_bom_detection(self): with self.subTest('GB18030 UNAVAILABLE SIG'): self.assertFalse( From 84c2bb9cb5e1ca238662f340141d27cf471c6bdf Mon Sep 17 00:00:00 2001 From: Ahmed TAHRI Date: Wed, 14 Jul 2021 22:03:28 +0200 Subject: [PATCH 2/4] =?UTF-8?q?=E2=8F=AA=EF=B8=8F=20Revert=20backward=20in?= =?UTF-8?q?compatible=20change=20in=20resulting=20CharsetMatch?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit the 'raw' property should remain untouched no matter what. 
--- charset_normalizer/api.py | 2 +- tests/test_on_byte.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/charset_normalizer/api.py b/charset_normalizer/api.py index 4bae8163..bf2dcf07 100644 --- a/charset_normalizer/api.py +++ b/charset_normalizer/api.py @@ -283,7 +283,7 @@ def from_bytes( results.append( CharsetMatch( - sequences if strip_sig_or_bom is False else sequences[len(sig_payload):], + sequences, encoding_iana, mean_mess_ratio, bom_or_sig_available, diff --git a/tests/test_on_byte.py b/tests/test_on_byte.py index 9a0845ae..096f50bb 100644 --- a/tests/test_on_byte.py +++ b/tests/test_on_byte.py @@ -40,7 +40,7 @@ def test_empty_str_with_sig_gb18030(self): r.encoding ) self.assertEqual( - 0, + 4, len(r.raw) ) @@ -57,7 +57,7 @@ def test_empty_str_with_sig_utf8(self): r.encoding ) self.assertEqual( - 0, + 3, len(r.raw) ) From 103b2c64ad45ada0ce6cab3c7cbfec9ced44be2b Mon Sep 17 00:00:00 2001 From: Ahmed TAHRI Date: Wed, 14 Jul 2021 22:17:19 +0200 Subject: [PATCH 3/4] :heavy_check_mark: Add additional test case scenario --- tests/test_on_byte.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tests/test_on_byte.py b/tests/test_on_byte.py index 096f50bb..99518b7a 100644 --- a/tests/test_on_byte.py +++ b/tests/test_on_byte.py @@ -1,6 +1,7 @@ import unittest from charset_normalizer.api import from_bytes +from charset_normalizer.constant import TOO_BIG_SEQUENCE class TestBytes(unittest.TestCase): @@ -61,6 +62,23 @@ def test_empty_str_with_sig_utf8(self): len(r.raw) ) + def test_empty_str_with_large_sig_utf8(self): + r = from_bytes(b'\xef\xbb\xbf' + (b'0' * TOO_BIG_SEQUENCE)).best() + + self.assertIsNotNone(r) + self.assertEqual( + '0' * TOO_BIG_SEQUENCE, + str(r) + ) + self.assertEqual( + "utf_8", + r.encoding + ) + self.assertEqual( + TOO_BIG_SEQUENCE + 3, + len(r.raw) + ) + def test_on_empty_json(self): with self.subTest("Detecting empty JSON as ASCII"): From 044eed309f6fba3afef7a9e3910bee51b5a5b2d4 Mon Sep 17 
00:00:00 2001 From: Ahmed TAHRI Date: Wed, 14 Jul 2021 22:20:34 +0200 Subject: [PATCH 4/4] :pencil: Fix typo in docs CharsetMatch instead of CharsetNormalizerMatch --- docs/handling_result.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/handling_result.rst b/docs/handling_result.rst index f161ffb8..067771ee 100755 --- a/docs/handling_result.rst +++ b/docs/handling_result.rst @@ -15,7 +15,7 @@ When initiating search upon a buffer, bytes or file you can assign the return va print(result.encoding) # gb18030 -Using CharsetNormalizerMatch +Using CharsetMatch ---------------------------- Here, ``result`` is a ``CharsetMatch`` object or ``None``.