Skip to content

Commit

Permalink
❇️ Allow fallback on specified encoding if any (#71)
Browse files Browse the repository at this point in the history
* ❇️ Allow fallback on specified encoding if any

Charset-Normalizer comes with a preemptive encoding detector, but until now its result was not trusted at all whenever the MD (mess detector) computed too high a ratio when decoding the content with the specified encoding.

Now, with this PR, if the detection process does not find any suitable match, the specified encoding will be used as a fallback, provided it suits the content.
  • Loading branch information
Ousret committed Jul 30, 2021
1 parent d283ef5 commit 8247f3a
Showing 1 changed file with 11 additions and 5 deletions.
16 changes: 11 additions & 5 deletions charset_normalizer/api.py
Expand Up @@ -118,6 +118,7 @@ def from_bytes(

fallback_ascii = None # type: Optional[CharsetMatch]
fallback_u8 = None # type: Optional[CharsetMatch]
fallback_specified = None # type: Optional[CharsetMatch]

single_byte_hard_failure_count = 0 # type: int
single_byte_soft_failure_count = 0 # type: int
Expand Down Expand Up @@ -255,7 +256,7 @@ def from_bytes(
early_stop_count,
round(mean_mess_ratio * 100, ndigits=3))
# Preparing those fallbacks in case we got nothing.
if encoding_iana in ["ascii", "utf_8"]:
if encoding_iana in ["ascii", "utf_8", specified_encoding]:
fallback_entry = CharsetMatch(
sequences,
encoding_iana,
Expand All @@ -264,7 +265,9 @@ def from_bytes(
[],
decoded_payload
)
if encoding_iana == "ascii":
if encoding_iana == specified_encoding:
fallback_specified = fallback_entry
elif encoding_iana == "ascii":
fallback_ascii = fallback_entry
else:
fallback_u8 = fallback_entry
Expand Down Expand Up @@ -332,10 +335,13 @@ def from_bytes(
)

if len(results) == 0:
if fallback_u8 or fallback_ascii:
logger.warning("Nothing got out of the detection process. Using ASCII/UTF-8 fallback.")
if fallback_u8 or fallback_ascii or fallback_specified:
logger.warning("Nothing got out of the detection process. Using ASCII/UTF-8/Specified fallback.")

if (fallback_u8 and fallback_ascii is None) or (fallback_u8 and fallback_u8.fingerprint != fallback_ascii.fingerprint):
if fallback_specified:
logger.warning("%s will be used as a fallback match", fallback_specified.encoding)
results.append(fallback_specified)
elif (fallback_u8 and fallback_ascii is None) or (fallback_u8 and fallback_u8.fingerprint != fallback_ascii.fingerprint):
logger.warning("utf_8 will be used as a fallback match")
results.append(fallback_u8)
elif fallback_ascii:
Expand Down

0 comments on commit 8247f3a

Please sign in to comment.