🐛 Fix fallback match being created when lazystr failed decoding (#154)

jawah · Dec 23, 2021 · 0fe3d54 · 0fe3d54
1 parent 12a10a6
commit 0fe3d54
Show file tree

Hide file tree

Showing 2 changed files with 27 additions and 15 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,6 +6,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
 - [Short description of non-trivial change.]
 
+### Fixed
+- Fallback match entries might lead to UnicodeDecodeError for large bytes sequence (PR #154)
+
 ## [2.0.9](https://github.com/Ousret/charset_normalizer/compare/2.0.8...2.0.9) (2021-12-03)
 
 ### Changed

diff --git a/charset_normalizer/api.py b/charset_normalizer/api.py
@@ -265,6 +265,7 @@ def from_bytes(
 
         max_chunk_gave_up = max(max_chunk_gave_up, 2)
         early_stop_count = 0  # type: int
+        lazy_str_hard_failure = False
 
         md_chunks = []  # type: List[str]
         md_ratios = []
@@ -290,6 +291,7 @@ def from_bytes(
                     str(e),
                 )
                 early_stop_count = max_chunk_gave_up
+                lazy_str_hard_failure = True
                 break
 
             # multi-byte bad cutting detector and adjustment
@@ -325,6 +327,24 @@ def from_bytes(
             ):
                 break
 
+        # We might want to check the sequence again with the whole content
+        # Only if initial MD tests passes
+        if (
+            not lazy_str_hard_failure
+            and is_too_large_sequence
+            and not is_multi_byte_decoder
+        ):
+            try:
+                sequences[int(50e3) :].decode(encoding_iana, errors="strict")
+            except UnicodeDecodeError as e:
+                logger.warning(
+                    "LazyStr Loading: After final lookup, code page %s does not fit given bytes sequence at ALL. %s",
+                    encoding_iana,
+                    str(e),
+                )
+                tested_but_hard_failure.append(encoding_iana)
+                continue
+
         mean_mess_ratio = (
             sum(md_ratios) / len(md_ratios) if md_ratios else 0.0
         )  # type: float
@@ -338,7 +358,10 @@ def from_bytes(
                 round(mean_mess_ratio * 100, ndigits=3),
             )
             # Preparing those fallbacks in case we got nothing.
-            if encoding_iana in ["ascii", "utf_8", specified_encoding]:
+            if (
+                encoding_iana in ["ascii", "utf_8", specified_encoding]
+                and not lazy_str_hard_failure
+            ):
                 fallback_entry = CharsetMatch(
                     sequences, encoding_iana, threshold, False, [], decoded_payload
                 )
@@ -386,20 +409,6 @@ def from_bytes(
                 )
             )
 
-        # We might want to check the sequence again with the whole content
-        # Only if initial MD/CD tests passes
-        if is_too_large_sequence and not is_multi_byte_decoder:
-            try:
-                sequences[int(50e3) :].decode(encoding_iana, errors="strict")
-            except UnicodeDecodeError as e:
-                logger.warning(
-                    "LazyStr Loading: After final lookup, code page %s does not fit given bytes sequence at ALL. %s",
-                    encoding_iana,
-                    str(e),
-                )
-                tested_but_hard_failure.append(encoding_iana)
-                continue
-
         results.append(
             CharsetMatch(
                 sequences,