From b7fca11ae569496baf3a9c3fcd3af20d22e9edc0 Mon Sep 17 00:00:00 2001
From: TAHRI Ahmed R
Date: Sat, 20 Nov 2021 23:30:41 +0100
Subject: [PATCH] 🐛 Fix large (misleading) sequence giving UnicodeDecodeError
 (#137)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* :heavy_check_mark: Add a simple test case that shows the problem (Issue #136)
* :bug: Fix getting misled by a large sequence (lazy str loading)
* :bug: Ignore extracted chunks that are too insignificant
* :bookmark: Bump to 2.0.8.dev3
---
 charset_normalizer/api.py     | 31 ++++++++++++++++++++++++++++++-
 charset_normalizer/version.py |  2 +-
 tests/test_large_payload.py   | 12 ++++++++++++
 3 files changed, 43 insertions(+), 2 deletions(-)

diff --git a/charset_normalizer/api.py b/charset_normalizer/api.py
index a42ec693..16b25dbb 100644
--- a/charset_normalizer/api.py
+++ b/charset_normalizer/api.py
@@ -268,12 +268,27 @@ def from_bytes(
         md_ratios = []
 
         for i in r_:
+            if i + chunk_size > length + 8:
+                continue
+
             cut_sequence = sequences[i : i + chunk_size]
 
             if bom_or_sig_available and strip_sig_or_bom is False:
                 cut_sequence = sig_payload + cut_sequence
 
-            chunk = cut_sequence.decode(encoding_iana, errors="ignore")  # type: str
+            try:
+                chunk = cut_sequence.decode(
+                    encoding_iana,
+                    errors="ignore" if is_multi_byte_decoder else "strict",
+                )  # type: str
+            except UnicodeDecodeError as e:  # Lazy str loading may have missed something there
+                logger.warning(
+                    "LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
+                    encoding_iana,
+                    str(e),
+                )
+                early_stop_count = max_chunk_gave_up
+                break
 
             # multi-byte bad cutting detector and adjustment
             # not the cleanest way to perform that fix but clever enough for now.
@@ -369,6 +384,20 @@ def from_bytes(
                 )
             )
 
+        # We might want to check the sequence again with the whole content
+        # Only if the initial MD/CD tests pass
+        if is_too_large_sequence and not is_multi_byte_decoder:
+            try:
+                sequences[int(50e3) :].decode(encoding_iana, errors="strict")
+            except UnicodeDecodeError as e:
+                logger.warning(
+                    "LazyStr Loading: After final lookup, code page %s does not fit given bytes sequence at ALL. %s",
+                    encoding_iana,
+                    str(e),
+                )
+                tested_but_hard_failure.append(encoding_iana)
+                continue
+
         results.append(
             CharsetMatch(
                 sequences,
diff --git a/charset_normalizer/version.py b/charset_normalizer/version.py
index b06aeb39..068af607 100644
--- a/charset_normalizer/version.py
+++ b/charset_normalizer/version.py
@@ -2,5 +2,5 @@
 Expose version
 """
 
-__version__ = "2.0.8.dev1"
+__version__ = "2.0.8.dev3"
 VERSION = __version__.split(".")
diff --git a/tests/test_large_payload.py b/tests/test_large_payload.py
index c66cd3c0..b3893e3a 100644
--- a/tests/test_large_payload.py
+++ b/tests/test_large_payload.py
@@ -22,3 +22,15 @@ def test_large_payload_ascii_basic_entry():
     assert best_guess.encoding == "ascii", "Large ASCII payload case detection wrongly detected!"
     assert best_guess.bom is False, "SIG/BOM property should be False"
     assert len(best_guess.raw) == len(payload), "Large payload should remain untouched when accessed through .raw"
+
+
+def test_misleading_large_sequence():
+    content = (("hello simple ascii " * TOO_BIG_SEQUENCE) + ('我没有埋怨,磋砣的只是一些时间。 磋砣的只是一些时间。')).encode('utf_8')
+
+    guesses = from_bytes(content)
+
+    assert len(guesses) > 0
+    match = guesses.best()
+    assert match is not None
+    assert match.encoding == 'utf_8'
+    assert str(match) is not None
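
For reference, the failure mode this patch addresses can be exercised with the
short sketch below. This is a minimal sketch rather than part of the patch: it
assumes charset-normalizer is installed and that TOO_BIG_SEQUENCE (the constant
the new test relies on) is importable from charset_normalizer.constant. Before
this fix, the from_bytes() call below could surface an unhandled
UnicodeDecodeError, because lazy str loading only validated the head of an
over-large payload while chunk decoding silently ignored errors for single-byte
code pages.

    # Reproduction sketch for issue #136 (a standalone driver script, not
    # shipped with the library). Note: with TOO_BIG_SEQUENCE == int(10e6),
    # the library default, the payload is roughly 190 MB in memory.
    from charset_normalizer import from_bytes
    from charset_normalizer.constant import TOO_BIG_SEQUENCE

    # A large ASCII body that crosses the lazy-loading threshold, followed
    # by a short UTF-8 (Chinese) tail that the lazily loaded head misses.
    payload = (
        ("hello simple ascii " * TOO_BIG_SEQUENCE) + "我没有埋怨,磋砣的只是一些时间。"
    ).encode("utf_8")

    matches = from_bytes(payload)  # with this patch: returns normally
    best = matches.best()
    assert best is not None and best.encoding == "utf_8"

The fix works in three layers: the mess-detection loop skips chunks that would
read past the end of the payload, chunk decoding now uses errors="strict" for
single-byte decoders so an unfit code page fails fast, and for too-large
sequences a final strict decode of sequences[int(50e3):] rejects any code page
that only fit the lazily loaded head.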