From 3b148f31ae93c629cdcecc9ec51522557192bca1 Mon Sep 17 00:00:00 2001 From: Ahmed TAHRI Date: Tue, 9 Nov 2021 20:57:26 +0100 Subject: [PATCH 1/8] :heavy_check_mark: Add simple test case that shows the problem (Issue #136) --- tests/test_large_payload.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tests/test_large_payload.py b/tests/test_large_payload.py index c66cd3c0..d75c47af 100644 --- a/tests/test_large_payload.py +++ b/tests/test_large_payload.py @@ -22,3 +22,17 @@ def test_large_payload_ascii_basic_entry(): assert best_guess.encoding == "ascii", "Large ASCII payload case detection wrongly detected!" assert best_guess.bom is False, "SIG/BOM property should be False" assert len(best_guess.raw) == len(payload), "Large payload should remain untouched when accessed through .raw" + + +def test_misleading_large_sequence(): + content = (b"hello simple ascii " * TOO_BIG_SEQUENCE) + ('我没有埋怨,磋砣的只是一些时间。'.encode('utf_8')) + + guesses = from_bytes(content) + + assert len(guesses) > 0 + + match = guesses.best() + + assert match is not None + + assert str(match) is not None From d0ad20b9d291376453f8efe08f9cda903123fc10 Mon Sep 17 00:00:00 2001 From: Ahmed TAHRI Date: Tue, 9 Nov 2021 20:58:23 +0100 Subject: [PATCH 2/8] :bug: Fix getting misled by large sequence (lazy str loading) Fix issue #136 --- charset_normalizer/api.py | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/charset_normalizer/api.py b/charset_normalizer/api.py index a42ec693..2087b8bf 100644 --- a/charset_normalizer/api.py +++ b/charset_normalizer/api.py @@ -273,7 +273,19 @@ def from_bytes( if bom_or_sig_available and strip_sig_or_bom is False: cut_sequence = sig_payload + cut_sequence - chunk = cut_sequence.decode(encoding_iana, errors="ignore") # type: str + try: + chunk = cut_sequence.decode( + encoding_iana, + errors="ignore" if is_multi_byte_decoder else "strict" + ) # type: str + except UnicodeDecodeError as e: # Lazy str loading may 
have missed something there + logger.warning( + "LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s", + encoding_iana, + str(e), + ) + early_stop_count = max_chunk_gave_up + break # multi-byte bad cutting detector and adjustment # not the cleanest way to perform that fix but clever enough for now. @@ -369,6 +381,20 @@ def from_bytes( ) ) + # We might want to check the sequence again with the whole content + # Only if initial MD/CD tests passes + if is_too_large_sequence and not is_multi_byte_decoder: + try: + sequences[int(50e3):].decode(encoding_iana, errors="strict") + except UnicodeDecodeError as e: + logger.warning( + "LazyStr Loading: After final lookup, code page %s does not fit given bytes sequence at ALL. %s", + encoding_iana, + str(e), + ) + tested_but_hard_failure.append(encoding_iana) + continue + results.append( CharsetMatch( sequences, From 85bd55bc8417bfd6e107f780c4f4c158433e281c Mon Sep 17 00:00:00 2001 From: Ahmed TAHRI Date: Tue, 9 Nov 2021 21:05:03 +0100 Subject: [PATCH 3/8] :art: reformat file --- charset_normalizer/api.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/charset_normalizer/api.py b/charset_normalizer/api.py index 2087b8bf..bcb48de6 100644 --- a/charset_normalizer/api.py +++ b/charset_normalizer/api.py @@ -276,7 +276,7 @@ def from_bytes( try: chunk = cut_sequence.decode( encoding_iana, - errors="ignore" if is_multi_byte_decoder else "strict" + errors="ignore" if is_multi_byte_decoder else "strict", ) # type: str except UnicodeDecodeError as e: # Lazy str loading may have missed something there logger.warning( @@ -385,7 +385,7 @@ def from_bytes( # Only if initial MD/CD tests passes if is_too_large_sequence and not is_multi_byte_decoder: try: - sequences[int(50e3):].decode(encoding_iana, errors="strict") + sequences[int(50e3) :].decode(encoding_iana, errors="strict") except UnicodeDecodeError as e: logger.warning( "LazyStr Loading: After final lookup, code page %s 
does not fit given bytes sequence at ALL. %s", From b9c094a4839d07341a59861a1be43267530c8a69 Mon Sep 17 00:00:00 2001 From: Ahmed TAHRI Date: Tue, 9 Nov 2021 21:18:01 +0100 Subject: [PATCH 4/8] :bookmark: Bump 2.0.8.dev2 --- charset_normalizer/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charset_normalizer/version.py b/charset_normalizer/version.py index b06aeb39..a8395307 100644 --- a/charset_normalizer/version.py +++ b/charset_normalizer/version.py @@ -2,5 +2,5 @@ Expose version """ -__version__ = "2.0.8.dev1" +__version__ = "2.0.8.dev2" VERSION = __version__.split(".") From 10304132f3346a57b22fc2697ab61b8f4bb25b60 Mon Sep 17 00:00:00 2001 From: Ahmed TAHRI Date: Wed, 17 Nov 2021 22:38:55 +0100 Subject: [PATCH 5/8] :heavy_check_mark: Simplify/Improve the added test --- tests/test_large_payload.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/test_large_payload.py b/tests/test_large_payload.py index d75c47af..b3893e3a 100644 --- a/tests/test_large_payload.py +++ b/tests/test_large_payload.py @@ -25,14 +25,12 @@ def test_large_payload_ascii_basic_entry(): def test_misleading_large_sequence(): - content = (b"hello simple ascii " * TOO_BIG_SEQUENCE) + ('我没有埋怨,磋砣的只是一些时间。'.encode('utf_8')) + content = (("hello simple ascii " * TOO_BIG_SEQUENCE) + ('我没有埋怨,磋砣的只是一些时间。 磋砣的只是一些时间。')) .encode('utf_8') guesses = from_bytes(content) assert len(guesses) > 0 - match = guesses.best() - assert match is not None - + assert match.encoding == 'utf_8' assert str(match) is not None From b035085988db2abda9e8e75cf12fa82fe2f7e024 Mon Sep 17 00:00:00 2001 From: Ahmed TAHRI Date: Wed, 17 Nov 2021 22:40:39 +0100 Subject: [PATCH 6/8] :bug: Ignore too insignificant extracted chunk Found this edge case while doing extended tests around this PR --- charset_normalizer/api.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/charset_normalizer/api.py b/charset_normalizer/api.py index bcb48de6..9a942bad 100644 --- 
a/charset_normalizer/api.py +++ b/charset_normalizer/api.py @@ -268,6 +268,9 @@ def from_bytes( md_ratios = [] for i in r_: + if i + chunk_size > length+8: + continue + cut_sequence = sequences[i : i + chunk_size] if bom_or_sig_available and strip_sig_or_bom is False: From b29b477ca4a6c2966b6c74a177b7ea7ccb0afac6 Mon Sep 17 00:00:00 2001 From: Ahmed TAHRI Date: Wed, 17 Nov 2021 22:40:57 +0100 Subject: [PATCH 7/8] :bookmark: Bump to 2.0.8.dev3 --- charset_normalizer/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charset_normalizer/version.py b/charset_normalizer/version.py index a8395307..068af607 100644 --- a/charset_normalizer/version.py +++ b/charset_normalizer/version.py @@ -2,5 +2,5 @@ Expose version """ -__version__ = "2.0.8.dev2" +__version__ = "2.0.8.dev3" VERSION = __version__.split(".") From cc56e3ee913b1de84d36e0ab789ea45536f1120b Mon Sep 17 00:00:00 2001 From: Ahmed TAHRI Date: Wed, 17 Nov 2021 22:43:08 +0100 Subject: [PATCH 8/8] :art: reformat api.py --- charset_normalizer/api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charset_normalizer/api.py b/charset_normalizer/api.py index 9a942bad..16b25dbb 100644 --- a/charset_normalizer/api.py +++ b/charset_normalizer/api.py @@ -268,7 +268,7 @@ def from_bytes( md_ratios = [] for i in r_: - if i + chunk_size > length+8: + if i + chunk_size > length + 8: continue cut_sequence = sequences[i : i + chunk_size]