From 8247f3a8e52ecb0af15a359c6c85d7b69f4ae677 Mon Sep 17 00:00:00 2001 From: TAHRI Ahmed R Date: Fri, 30 Jul 2021 13:50:36 -0700 Subject: [PATCH] =?UTF-8?q?=E2=9D=87=EF=B8=8F=20Allow=20fallback=20on=20sp?= =?UTF-8?q?ecified=20encoding=20if=20any=20(#71)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * :sparkle: Allow fallback on specified encoding if any Charset-Normalizer comes with a preemptive encoding detector, but until now its result was not trusted at all when the mess detector (MD) computed too high a ratio while decoding with the specified encoding. With this PR, if the detection does not find any suitable match, the specified encoding is used as a fallback, provided it suits the content. --- charset_normalizer/api.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/charset_normalizer/api.py b/charset_normalizer/api.py index 7ad912cb..388e841b 100644 --- a/charset_normalizer/api.py +++ b/charset_normalizer/api.py @@ -118,6 +118,7 @@ def from_bytes( fallback_ascii = None # type: Optional[CharsetMatch] fallback_u8 = None # type: Optional[CharsetMatch] + fallback_specified = None # type: Optional[CharsetMatch] single_byte_hard_failure_count = 0 # type: int single_byte_soft_failure_count = 0 # type: int @@ -255,7 +256,7 @@ def from_bytes( early_stop_count, round(mean_mess_ratio * 100, ndigits=3)) # Preparing those fallbacks in case we got nothing. 
- if encoding_iana in ["ascii", "utf_8"]: + if encoding_iana in ["ascii", "utf_8", specified_encoding]: fallback_entry = CharsetMatch( sequences, encoding_iana, @@ -264,7 +265,9 @@ def from_bytes( [], decoded_payload ) - if encoding_iana == "ascii": + if encoding_iana == specified_encoding: + fallback_specified = fallback_entry + elif encoding_iana == "ascii": fallback_ascii = fallback_entry else: fallback_u8 = fallback_entry @@ -332,10 +335,13 @@ def from_bytes( ) if len(results) == 0: - if fallback_u8 or fallback_ascii: - logger.warning("Nothing got out of the detection process. Using ASCII/UTF-8 fallback.") + if fallback_u8 or fallback_ascii or fallback_specified: + logger.warning("Nothing got out of the detection process. Using ASCII/UTF-8/Specified fallback.") - if (fallback_u8 and fallback_ascii is None) or (fallback_u8 and fallback_u8.fingerprint != fallback_ascii.fingerprint): + if fallback_specified: + logger.warning("%s will be used as a fallback match", fallback_specified.encoding) + results.append(fallback_specified) + elif (fallback_u8 and fallback_ascii is None) or (fallback_u8 and fallback_u8.fingerprint != fallback_ascii.fingerprint): logger.warning("utf_8 will be used as a fallback match") results.append(fallback_u8) elif fallback_ascii: