Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🐛 Fix large (misleading) sequence giving UnicodeDecodeError #137

Merged
merged 8 commits into from Nov 20, 2021
31 changes: 30 additions & 1 deletion charset_normalizer/api.py
Expand Up @@ -268,12 +268,27 @@ def from_bytes(
md_ratios = []

for i in r_:
if i + chunk_size > length + 8:
continue

cut_sequence = sequences[i : i + chunk_size]

if bom_or_sig_available and strip_sig_or_bom is False:
cut_sequence = sig_payload + cut_sequence

chunk = cut_sequence.decode(encoding_iana, errors="ignore") # type: str
try:
chunk = cut_sequence.decode(
encoding_iana,
errors="ignore" if is_multi_byte_decoder else "strict",
) # type: str
except UnicodeDecodeError as e: # Lazy str loading may have missed something there
logger.warning(
"LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
encoding_iana,
str(e),
)
early_stop_count = max_chunk_gave_up
break

# multi-byte bad cutting detector and adjustment
# not the cleanest way to perform that fix but clever enough for now.
Expand Down Expand Up @@ -369,6 +384,20 @@ def from_bytes(
)
)

# We might want to check the sequence again with the whole content
# Only if the initial MD/CD tests pass
if is_too_large_sequence and not is_multi_byte_decoder:
try:
sequences[int(50e3) :].decode(encoding_iana, errors="strict")
except UnicodeDecodeError as e:
logger.warning(
"LazyStr Loading: After final lookup, code page %s does not fit given bytes sequence at ALL. %s",
encoding_iana,
str(e),
)
tested_but_hard_failure.append(encoding_iana)
continue

results.append(
CharsetMatch(
sequences,
Expand Down
2 changes: 1 addition & 1 deletion charset_normalizer/version.py
Expand Up @@ -2,5 +2,5 @@
Expose version
"""

__version__ = "2.0.8.dev1"
__version__ = "2.0.8.dev3"
VERSION = __version__.split(".")
12 changes: 12 additions & 0 deletions tests/test_large_payload.py
Expand Up @@ -22,3 +22,15 @@ def test_large_payload_ascii_basic_entry():
assert best_guess.encoding == "ascii", "Large ASCII payload case detection wrongly detected!"
assert best_guess.bom is False, "SIG/BOM property should be False"
assert len(best_guess.raw) == len(payload), "Large payload should remain untouched when accessed through .raw"


def test_misleading_large_sequence():
    """Check detection on a long ASCII payload that ends with non-ASCII text.

    The payload is the ASCII phrase repeated ``TOO_BIG_SEQUENCE`` times,
    followed by a short Chinese sentence, all encoded as UTF-8. The best
    guess returned by ``from_bytes`` is expected to be ``utf_8``.
    """
    ascii_prefix = "hello simple ascii " * TOO_BIG_SEQUENCE
    non_ascii_suffix = "我没有埋怨,磋砣的只是一些时间。 磋砣的只是一些时间。"
    payload = (ascii_prefix + non_ascii_suffix).encode("utf_8")

    results = from_bytes(payload)

    # At least one candidate encoding must come back.
    assert len(results) > 0

    best_guess = results.best()
    assert best_guess is not None
    assert best_guess.encoding == "utf_8"
    # Evaluating str() on the match exercises its string conversion; the
    # assertion itself only requires a non-None result.
    assert str(best_guess) is not None