From b7fca11ae569496baf3a9c3fcd3af20d22e9edc0 Mon Sep 17 00:00:00 2001
From: TAHRI Ahmed R
Date: Sat, 20 Nov 2021 23:30:41 +0100
Subject: [PATCH] 🐛 Fix large (misleading) sequence giving UnicodeDecodeError
 (#137)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* :heavy_check_mark: Add a simple test case that shows the problem (Issue #136)
* :bug: Fix getting misled by a large sequence (lazy str loading)
* :bug: Ignore extracted chunks that are too insignificant
* :bookmark: Bump to 2.0.8.dev3
---
 charset_normalizer/api.py     | 31 ++++++++++++++++++++++++++++++-
 charset_normalizer/version.py |  2 +-
 tests/test_large_payload.py   | 12 ++++++++++++
 3 files changed, 43 insertions(+), 2 deletions(-)

diff --git a/charset_normalizer/api.py b/charset_normalizer/api.py
index a42ec693..16b25dbb 100644
--- a/charset_normalizer/api.py
+++ b/charset_normalizer/api.py
@@ -268,12 +268,27 @@ def from_bytes(
         md_ratios = []
 
         for i in r_:
+            if i + chunk_size > length + 8:
+                continue
+
             cut_sequence = sequences[i : i + chunk_size]
 
             if bom_or_sig_available and strip_sig_or_bom is False:
                 cut_sequence = sig_payload + cut_sequence
 
-            chunk = cut_sequence.decode(encoding_iana, errors="ignore")  # type: str
+            try:
+                chunk = cut_sequence.decode(
+                    encoding_iana,
+                    errors="ignore" if is_multi_byte_decoder else "strict",
+                )  # type: str
+            except UnicodeDecodeError as e:  # Lazy str loading may have missed something there
+                logger.warning(
+                    "LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
+                    encoding_iana,
+                    str(e),
+                )
+                early_stop_count = max_chunk_gave_up
+                break
 
             # multi-byte bad cutting detector and adjustment
             # not the cleanest way to perform that fix but clever enough for now.
@@ -369,6 +384,20 @@ def from_bytes(
                 )
             )
 
+        # We might want to check the sequence again with the whole content
+        # Only if the initial MD/CD tests pass
+        if is_too_large_sequence and not is_multi_byte_decoder:
+            try:
+                sequences[int(50e3) :].decode(encoding_iana, errors="strict")
+            except UnicodeDecodeError as e:
+                logger.warning(
+                    "LazyStr Loading: After final lookup, code page %s does not fit given bytes sequence at ALL. %s",
+                    encoding_iana,
+                    str(e),
+                )
+                tested_but_hard_failure.append(encoding_iana)
+                continue
+
         results.append(
             CharsetMatch(
                 sequences,
diff --git a/charset_normalizer/version.py b/charset_normalizer/version.py
index b06aeb39..068af607 100644
--- a/charset_normalizer/version.py
+++ b/charset_normalizer/version.py
@@ -2,5 +2,5 @@
 Expose version
 """
 
-__version__ = "2.0.8.dev1"
+__version__ = "2.0.8.dev3"
 VERSION = __version__.split(".")
diff --git a/tests/test_large_payload.py b/tests/test_large_payload.py
index c66cd3c0..b3893e3a 100644
--- a/tests/test_large_payload.py
+++ b/tests/test_large_payload.py
@@ -22,3 +22,15 @@ def test_large_payload_ascii_basic_entry():
     assert best_guess.encoding == "ascii", "Large ASCII payload case detection wrongly detected!"
     assert best_guess.bom is False, "SIG/BOM property should be False"
     assert len(best_guess.raw) == len(payload), "Large payload should remain untouched when accessed through .raw"
+
+
+def test_misleading_large_sequence():
+    content = (("hello simple ascii " * TOO_BIG_SEQUENCE) + ('我没有埋怨,磋砣的只是一些时间。 磋砣的只是一些时间。')).encode('utf_8')
+
+    guesses = from_bytes(content)
+
+    assert len(guesses) > 0
+    match = guesses.best()
+    assert match is not None
+    assert match.encoding == 'utf_8'
+    assert str(match) is not None
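
For reference, the failure mode this patch addresses can be exercised with the
short sketch below. This is a minimal sketch rather than part of the patch: it
assumes charset-normalizer is installed and that TOO_BIG_SEQUENCE (the constant
the new test relies on) is importable from charset_normalizer.constant. Before
this fix, the from_bytes() call below could surface an unhandled
UnicodeDecodeError, because lazy str loading only validated the head of an
over-large payload while chunk decoding silently ignored errors for single-byte
code pages.

    # Reproduction sketch for issue #136 (a standalone driver script, not
    # shipped with the library). Note: with TOO_BIG_SEQUENCE == int(10e6),
    # the library default, the payload is roughly 190 MB in memory.
    from charset_normalizer import from_bytes
    from charset_normalizer.constant import TOO_BIG_SEQUENCE

    # A large ASCII body that crosses the lazy-loading threshold, followed
    # by a short UTF-8 (Chinese) tail that the lazily loaded head misses.
    payload = (
        ("hello simple ascii " * TOO_BIG_SEQUENCE) + "我没有埋怨,磋砣的只是一些时间。"
    ).encode("utf_8")

    matches = from_bytes(payload)  # with this patch: returns normally
    best = matches.best()
    assert best is not None and best.encoding == "utf_8"

The fix works in three layers: the mess-detection loop skips chunks that would
read past the end of the payload, chunk decoding now uses errors="strict" for
single-byte decoders so an unfit code page fails fast, and for too-large
sequences a final strict decode of sequences[int(50e3):] rejects any code page
that only fit the lazily loaded head.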