Skip to content

Commit

Permalink
🐛 Fix fallback match being created when lazystr failed decoding (#154)
Browse files Browse the repository at this point in the history
  • Loading branch information
Ousret committed Dec 23, 2021
1 parent 12a10a6 commit 0fe3d54
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 15 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Expand Up @@ -6,6 +6,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

- [Short description of non-trivial change.]

### Fixed
- Fallback match entries might lead to a UnicodeDecodeError for large byte sequences (PR #154)

## [2.0.9](https://github.com/Ousret/charset_normalizer/compare/2.0.8...2.0.9) (2021-12-03)

### Changed
Expand Down
39 changes: 24 additions & 15 deletions charset_normalizer/api.py
Expand Up @@ -265,6 +265,7 @@ def from_bytes(

max_chunk_gave_up = max(max_chunk_gave_up, 2)
early_stop_count = 0 # type: int
lazy_str_hard_failure = False

md_chunks = [] # type: List[str]
md_ratios = []
Expand All @@ -290,6 +291,7 @@ def from_bytes(
str(e),
)
early_stop_count = max_chunk_gave_up
lazy_str_hard_failure = True
break

# multi-byte bad cutting detector and adjustment
Expand Down Expand Up @@ -325,6 +327,24 @@ def from_bytes(
):
break

# We might want to check the sequence again with the whole content
# Only if the initial MD tests pass
if (
not lazy_str_hard_failure
and is_too_large_sequence
and not is_multi_byte_decoder
):
try:
sequences[int(50e3) :].decode(encoding_iana, errors="strict")
except UnicodeDecodeError as e:
logger.warning(
"LazyStr Loading: After final lookup, code page %s does not fit given bytes sequence at ALL. %s",
encoding_iana,
str(e),
)
tested_but_hard_failure.append(encoding_iana)
continue

mean_mess_ratio = (
sum(md_ratios) / len(md_ratios) if md_ratios else 0.0
) # type: float
Expand All @@ -338,7 +358,10 @@ def from_bytes(
round(mean_mess_ratio * 100, ndigits=3),
)
# Preparing those fallbacks in case we got nothing.
if encoding_iana in ["ascii", "utf_8", specified_encoding]:
if (
encoding_iana in ["ascii", "utf_8", specified_encoding]
and not lazy_str_hard_failure
):
fallback_entry = CharsetMatch(
sequences, encoding_iana, threshold, False, [], decoded_payload
)
Expand Down Expand Up @@ -386,20 +409,6 @@ def from_bytes(
)
)

# We might want to check the sequence again with the whole content
# Only if initial MD/CD tests passes
if is_too_large_sequence and not is_multi_byte_decoder:
try:
sequences[int(50e3) :].decode(encoding_iana, errors="strict")
except UnicodeDecodeError as e:
logger.warning(
"LazyStr Loading: After final lookup, code page %s does not fit given bytes sequence at ALL. %s",
encoding_iana,
str(e),
)
tested_but_hard_failure.append(encoding_iana)
continue

results.append(
CharsetMatch(
sequences,
Expand Down

0 comments on commit 0fe3d54

Please sign in to comment.