🐛 Fix large (misleading) sequence giving UnicodeDecodeError (#137)
* ✔️ Add a simple test case that shows the problem (Issue #136)

* 🐛 Fix being misled by a large sequence (lazy str loading)

* 🐛 Ignore extracted chunks that are too insignificant

* 🔖 Bump to 2.0.8.dev3
Ousret committed Nov 20, 2021
1 parent 00ffea0 commit b7fca11
Showing 3 changed files with 43 additions and 2 deletions.
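
Background for the diff below: for payloads larger than TOO_BIG_SEQUENCE, from_bytes() samples chunks of the buffer instead of decoding it whole ("lazy str loading"). A long, clean ASCII prefix could therefore hide an incompatible tail, and the mismatch only surfaced later as a UnicodeDecodeError (Issue #136). The reproduction sketch below is inferred from the commit message and the new test, not taken from the commit itself; the import location of TOO_BIG_SEQUENCE is an assumption.

    from charset_normalizer import from_bytes
    from charset_normalizer.constant import TOO_BIG_SEQUENCE  # assumed location of the threshold

    # Huge ASCII prefix (crosses the large-sequence threshold) plus a short UTF-8 tail.
    content = (("hello simple ascii " * TOO_BIG_SEQUENCE) + "我没有埋怨").encode("utf_8")

    # Before this fix the analysis could settle on a code page that only fits the
    # prefix, and materializing the match raised UnicodeDecodeError; now the
    # misleading code pages are rejected during analysis.
    match = from_bytes(content).best()
    print(match.encoding if match is not None else "no match")
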
31 changes: 30 additions & 1 deletion charset_normalizer/api.py
@@ -268,12 +268,27 @@ def from_bytes(
             md_ratios = []
 
             for i in r_:
+                if i + chunk_size > length + 8:
+                    continue
+
                 cut_sequence = sequences[i : i + chunk_size]
 
                 if bom_or_sig_available and strip_sig_or_bom is False:
                     cut_sequence = sig_payload + cut_sequence
 
-                chunk = cut_sequence.decode(encoding_iana, errors="ignore")  # type: str
+                try:
+                    chunk = cut_sequence.decode(
+                        encoding_iana,
+                        errors="ignore" if is_multi_byte_decoder else "strict",
+                    )  # type: str
+                except UnicodeDecodeError as e:  # Lazy str loading may have missed something there
+                    logger.warning(
+                        "LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
+                        encoding_iana,
+                        str(e),
+                    )
+                    early_stop_count = max_chunk_gave_up
+                    break
 
             # multi-byte bad cutting detector and adjustment
             # not the cleanest way to perform that fix but clever enough for now.
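
Two things happen in this hunk. The new guard at the top of the loop is the "too insignificant chunk" fix: a window that would run past the end of the buffer (beyond an 8-byte tolerance) is skipped instead of scored. The decode switch is the core fix: with errors="ignore", a single-byte decoder silently drops unmapped bytes, so a wrong code page can pass the chunk check unnoticed; errors="strict" surfaces the mismatch immediately. Multi-byte decoders keep errors="ignore" because a chunk boundary may legitimately split a multi-byte sequence (handled by the bad-cutting detector just below). A minimal illustration, independent of the library, using cp1252's unmapped byte 0x9d:

    raw = b"hello \x9d world"
    raw.decode("cp1252", errors="ignore")  # 'hello  world': the bad byte vanishes
    raw.decode("cp1252", errors="strict")  # raises UnicodeDecodeError at position 6
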
@@ -369,6 +384,20 @@ def from_bytes(
                 )
             )
 
+        # We might want to check the sequence again with the whole content
+        # Only if initial MD/CD tests passes
+        if is_too_large_sequence and not is_multi_byte_decoder:
+            try:
+                sequences[int(50e3) :].decode(encoding_iana, errors="strict")
+            except UnicodeDecodeError as e:
+                logger.warning(
+                    "LazyStr Loading: After final lookup, code page %s does not fit given bytes sequence at ALL. %s",
+                    encoding_iana,
+                    str(e),
+                )
+                tested_but_hard_failure.append(encoding_iana)
+                continue
+
         results.append(
             CharsetMatch(
                 sequences,
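
The second hunk adds a final safety net for oversized payloads: once the chunk-level checks pass, everything beyond the first 50 kB (bytes the lazy path may never have inspected) is decoded strictly, and a failure hard-rejects the code page instead of emitting a match whose str() would blow up later. A standalone sketch of that check (the function name is illustrative, not from the library):

    def tail_fits(sequences: bytes, encoding_iana: str) -> bool:
        # Strictly decode everything past the first 50 kB; the earlier
        # chunk sampling may never have touched these bytes.
        try:
            sequences[int(50e3):].decode(encoding_iana, errors="strict")
        except UnicodeDecodeError:
            return False
        return True
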
2 changes: 1 addition & 1 deletion charset_normalizer/version.py
@@ -2,5 +2,5 @@
 Expose version
 """
 
-__version__ = "2.0.8.dev1"
+__version__ = "2.0.8.dev3"
 VERSION = __version__.split(".")
12 changes: 12 additions & 0 deletions tests/test_large_payload.py
@@ -22,3 +22,15 @@ def test_large_payload_ascii_basic_entry():
     assert best_guess.encoding == "ascii", "Large ASCII payload case detection wrongly detected!"
     assert best_guess.bom is False, "SIG/BOM property should be False"
     assert len(best_guess.raw) == len(payload), "Large payload should remain untouched when accessed through .raw"
+
+
+def test_misleading_large_sequence():
+    content = (('hello simple ascii ' * TOO_BIG_SEQUENCE) + ('我没有埋怨,磋砣的只是一些时间。 磋砣的只是一些时间。')).encode('utf_8')
+
+    guesses = from_bytes(content)
+
+    assert len(guesses) > 0
+    match = guesses.best()
+    assert match is not None
+    assert match.encoding == 'utf_8'
+    assert str(match) is not None
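The regression test exercises the misleading-sequence scenario end to end: repeating an ASCII string TOO_BIG_SEQUENCE times forces the lazy-loading path, and the short UTF-8 (Chinese) tail is exactly what the old chunk sampling could miss. The final assertion is the important one: str(match) must materialize the full decoded payload without raising, which is the UnicodeDecodeError path reported in Issue #136.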