🐛 Fix large (misleading) sequence giving UnicodeDecodeError (#137)
* ✔️ Add a simple test case that shows the problem (Issue #136)

* 🐛 Fix being misled by a large sequence (lazy str loading)

* 🐛 Ignore extracted chunks that are too insignificant

* 🔖 Bump to 2.0.8.dev3
Ousret committed Nov 20, 2021
1 parent 00ffea0 commit b7fca11
Showing 3 changed files with 43 additions and 2 deletions.
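
Background for the diff below: for payloads larger than TOO_BIG_SEQUENCE, from_bytes() samples chunks of the buffer instead of decoding it whole ("lazy str loading"). A long, clean ASCII prefix could therefore hide an incompatible tail, and the mismatch only surfaced later as a UnicodeDecodeError (Issue #136). The reproduction sketch below is inferred from the commit message and the new test, not taken from the commit itself; the import location of TOO_BIG_SEQUENCE is an assumption.

    from charset_normalizer import from_bytes
    from charset_normalizer.constant import TOO_BIG_SEQUENCE  # assumed location of the threshold

    # Huge ASCII prefix (crosses the large-sequence threshold) plus a short UTF-8 tail.
    content = (("hello simple ascii " * TOO_BIG_SEQUENCE) + "我没有埋怨").encode("utf_8")

    # Before this fix the analysis could settle on a code page that only fits the
    # prefix, and materializing the match raised UnicodeDecodeError; now the
    # misleading code pages are rejected during analysis.
    match = from_bytes(content).best()
    print(match.encoding if match is not None else "no match")
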
31 changes: 30 additions & 1 deletion charset_normalizer/api.py
@@ -268,12 +268,27 @@ def from_bytes(
             md_ratios = []
 
             for i in r_:
+                if i + chunk_size > length + 8:
+                    continue
+
                 cut_sequence = sequences[i : i + chunk_size]
 
                 if bom_or_sig_available and strip_sig_or_bom is False:
                     cut_sequence = sig_payload + cut_sequence
 
-                chunk = cut_sequence.decode(encoding_iana, errors="ignore")  # type: str
+                try:
+                    chunk = cut_sequence.decode(
+                        encoding_iana,
+                        errors="ignore" if is_multi_byte_decoder else "strict",
+                    )  # type: str
+                except UnicodeDecodeError as e:  # Lazy str loading may have missed something there
+                    logger.warning(
+                        "LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
+                        encoding_iana,
+                        str(e),
+                    )
+                    early_stop_count = max_chunk_gave_up
+                    break
 
             # multi-byte bad cutting detector and adjustment
             # not the cleanest way to perform that fix but clever enough for now.
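
Two things happen in this hunk. The new guard at the top of the loop is the "too insignificant chunk" fix: a window that would run past the end of the buffer (beyond an 8-byte tolerance) is skipped instead of scored. The decode switch is the core fix: with errors="ignore", a single-byte decoder silently drops unmapped bytes, so a wrong code page can pass the chunk check unnoticed; errors="strict" surfaces the mismatch immediately. Multi-byte decoders keep errors="ignore" because a chunk boundary may legitimately split a multi-byte sequence (handled by the bad-cutting detector just below). A minimal illustration, independent of the library, using cp1252's unmapped byte 0x9d:

    raw = b"hello \x9d world"
    raw.decode("cp1252", errors="ignore")  # 'hello  world': the bad byte vanishes
    raw.decode("cp1252", errors="strict")  # raises UnicodeDecodeError at position 6
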
@@ -369,6 +384,20 @@ def from_bytes(
                 )
             )
 
+        # We might want to check the sequence again with the whole content
+        # Only if initial MD/CD tests passes
+        if is_too_large_sequence and not is_multi_byte_decoder:
+            try:
+                sequences[int(50e3) :].decode(encoding_iana, errors="strict")
+            except UnicodeDecodeError as e:
+                logger.warning(
+                    "LazyStr Loading: After final lookup, code page %s does not fit given bytes sequence at ALL. %s",
+                    encoding_iana,
+                    str(e),
+                )
+                tested_but_hard_failure.append(encoding_iana)
+                continue
+
         results.append(
             CharsetMatch(
                 sequences,
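
The second hunk adds a final safety net for oversized payloads: once the chunk-level checks pass, everything beyond the first 50 kB (bytes the lazy path may never have inspected) is decoded strictly, and a failure hard-rejects the code page instead of emitting a match whose str() would blow up later. A standalone sketch of that check (the function name is illustrative, not from the library):

    def tail_fits(sequences: bytes, encoding_iana: str) -> bool:
        # Strictly decode everything past the first 50 kB; the earlier
        # chunk sampling may never have touched these bytes.
        try:
            sequences[int(50e3):].decode(encoding_iana, errors="strict")
        except UnicodeDecodeError:
            return False
        return True
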
2 changes: 1 addition & 1 deletion charset_normalizer/version.py
@@ -2,5 +2,5 @@
 Expose version
 """
 
-__version__ = "2.0.8.dev1"
+__version__ = "2.0.8.dev3"
 VERSION = __version__.split(".")
12 changes: 12 additions & 0 deletions tests/test_large_payload.py
@@ -22,3 +22,15 @@ def test_large_payload_ascii_basic_entry():
     assert best_guess.encoding == "ascii", "Large ASCII payload case detection wrongly detected!"
     assert best_guess.bom is False, "SIG/BOM property should be False"
     assert len(best_guess.raw) == len(payload), "Large payload should remain untouched when accessed through .raw"
+
+
+def test_misleading_large_sequence():
+    content = (('hello simple ascii ' * TOO_BIG_SEQUENCE) + ('我没有埋怨,磋砣的只是一些时间。 磋砣的只是一些时间。')).encode('utf_8')
+
+    guesses = from_bytes(content)
+
+    assert len(guesses) > 0
+    match = guesses.best()
+    assert match is not None
+    assert match.encoding == 'utf_8'
+    assert str(match) is not None
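The regression test exercises the misleading-sequence scenario end to end: repeating an ASCII string TOO_BIG_SEQUENCE times forces the lazy-loading path, and the short UTF-8 (Chinese) tail is exactly what the old chunk sampling could miss. The final assertion is the important one: str(match) must materialize the full decoded payload without raising, which is the UnicodeDecodeError path reported in Issue #136.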