diff --git a/CHANGELOG.md b/CHANGELOG.md index ee7608be..4623ed76 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). - [Short description of non-trivial change.] +### Fixed +- Fallback match entries might lead to UnicodeDecodeError for large bytes sequence (PR #154) + ## [2.0.9](https://github.com/Ousret/charset_normalizer/compare/2.0.8...2.0.9) (2021-12-03) ### Changed diff --git a/charset_normalizer/api.py b/charset_normalizer/api.py index d9e5866a..2e258f26 100644 --- a/charset_normalizer/api.py +++ b/charset_normalizer/api.py @@ -265,6 +265,7 @@ def from_bytes( max_chunk_gave_up = max(max_chunk_gave_up, 2) early_stop_count = 0 # type: int + lazy_str_hard_failure = False md_chunks = [] # type: List[str] md_ratios = [] @@ -290,6 +291,7 @@ def from_bytes( str(e), ) early_stop_count = max_chunk_gave_up + lazy_str_hard_failure = True break # multi-byte bad cutting detector and adjustment @@ -325,6 +327,24 @@ def from_bytes( ): break + # We might want to check the sequence again with the whole content + # Only if initial MD tests passes + if ( + not lazy_str_hard_failure + and is_too_large_sequence + and not is_multi_byte_decoder + ): + try: + sequences[int(50e3) :].decode(encoding_iana, errors="strict") + except UnicodeDecodeError as e: + logger.warning( + "LazyStr Loading: After final lookup, code page %s does not fit given bytes sequence at ALL. %s", + encoding_iana, + str(e), + ) + tested_but_hard_failure.append(encoding_iana) + continue + mean_mess_ratio = ( sum(md_ratios) / len(md_ratios) if md_ratios else 0.0 ) # type: float @@ -338,7 +358,10 @@ def from_bytes( round(mean_mess_ratio * 100, ndigits=3), ) # Preparing those fallbacks in case we got nothing. - if encoding_iana in ["ascii", "utf_8", specified_encoding]: + if ( + encoding_iana in ["ascii", "utf_8", specified_encoding] + and not lazy_str_hard_failure + ): fallback_entry = CharsetMatch( sequences, encoding_iana, threshold, False, [], decoded_payload ) @@ -386,20 +409,6 @@ def from_bytes( ) ) - # We might want to check the sequence again with the whole content - # Only if initial MD/CD tests passes - if is_too_large_sequence and not is_multi_byte_decoder: - try: - sequences[int(50e3) :].decode(encoding_iana, errors="strict") - except UnicodeDecodeError as e: - logger.warning( - "LazyStr Loading: After final lookup, code page %s does not fit given bytes sequence at ALL. %s", - encoding_iana, - str(e), - ) - tested_but_hard_failure.append(encoding_iana) - continue - results.append( CharsetMatch( sequences,