🔧 Make logging less verbose under INFO and DEBUG (#163)
* 🔧 The logging behavior has been completely reworked; it now uses only the TRACE and DEBUG levels

Follows #162, #145, and #134.
Ousret committed Jan 29, 2022
1 parent ce38494 commit 0cc1528
Showing 5 changed files with 84 additions and 41 deletions.
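
For context, this change keeps step-by-step detection details on a custom TRACE level (5, i.e. below DEBUG) and reserves DEBUG for conclusions. A minimal sketch of how a consumer could surface the TRACE output after this change (the sample bytes are purely illustrative):

    import logging
    from charset_normalizer import from_bytes
    from charset_normalizer.constant import TRACE  # TRACE == 5, below logging.DEBUG (10)

    # Option 1: explain=True temporarily attaches a StreamHandler and lowers
    # the "charset_normalizer" logger to TRACE for the duration of the call.
    results = from_bytes(b"Bonjour tout le monde !", explain=True)

    # Option 2: opt in manually and keep your own handler configuration.
    logging.basicConfig(level=TRACE, format="%(levelname)s %(message)s")
    logging.getLogger("charset_normalizer").setLevel(TRACE)
    print(from_bytes(b"Bonjour tout le monde !").best())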
7 changes: 7 additions & 0 deletions CHANGELOG.md
@@ -2,6 +2,13 @@
All notable changes to charset-normalizer will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

+## [2.0.11.dev0](https://github.com/Ousret/charset_normalizer/compare/2.0.10...master) (unreleased)
+
+- [Short description of non-trivial change.]
+
+### Changed
+- The logging behavior has been completely reworked, now using only TRACE and DEBUG levels (PR #163)
+
## [2.0.10](https://github.com/Ousret/charset_normalizer/compare/2.0.9...2.0.10) (2022-01-04)

### Fixed
102 changes: 67 additions & 35 deletions charset_normalizer/api.py
@@ -13,7 +13,7 @@
mb_encoding_languages,
merge_coherence_ratios,
)
-from .constant import IANA_SUPPORTED, TOO_BIG_SEQUENCE, TOO_SMALL_SEQUENCE
+from .constant import IANA_SUPPORTED, TOO_BIG_SEQUENCE, TOO_SMALL_SEQUENCE, TRACE
from .md import mess_ratio
from .models import CharsetMatch, CharsetMatches
from .utils import (
@@ -25,6 +25,7 @@
should_strip_sig_or_bom,
)

+logging.addLevelName(TRACE, "TRACE")
logger = logging.getLogger("charset_normalizer")
explain_handler = logging.StreamHandler()
explain_handler.setFormatter(
@@ -70,19 +71,20 @@ def from_bytes(
if explain:
previous_logger_level = logger.level # type: int
logger.addHandler(explain_handler)
-logger.setLevel(logging.DEBUG)
+logger.setLevel(TRACE)

length = len(sequences) # type: int

if length == 0:
logger.warning("Encoding detection on empty bytes, assuming utf_8 intention.")
logger.debug("Encoding detection on empty bytes, assuming utf_8 intention.")
if explain:
logger.removeHandler(explain_handler)
logger.setLevel(previous_logger_level or logging.WARNING)
return CharsetMatches([CharsetMatch(sequences, "utf_8", 0.0, False, [], "")])

if cp_isolation is not None:
-logger.debug(
+logger.log(
+TRACE,
"cp_isolation is set. use this flag for debugging purpose. "
"limited list of encoding allowed : %s.",
", ".join(cp_isolation),
@@ -92,7 +94,8 @@ def from_bytes(
cp_isolation = []

if cp_exclusion is not None:
-logger.debug(
+logger.log(
+TRACE,
"cp_exclusion is set. use this flag for debugging purpose. "
"limited list of encoding excluded : %s.",
", ".join(cp_exclusion),
@@ -102,7 +105,8 @@ def from_bytes(
cp_exclusion = []

if length <= (chunk_size * steps):
-logger.debug(
+logger.log(
+TRACE,
"override steps (%i) and chunk_size (%i) as content does not fit (%i byte(s) given) parameters.",
steps,
chunk_size,
@@ -118,16 +122,18 @@ def from_bytes(
is_too_large_sequence = len(sequences) >= TOO_BIG_SEQUENCE # type: bool

if is_too_small_sequence:
-logger.warning(
+logger.log(
+TRACE,
"Trying to detect encoding from a tiny portion of ({}) byte(s).".format(
length
-)
+),
)
elif is_too_large_sequence:
-logger.info(
+logger.log(
+TRACE,
"Using lazy str decoding because the payload is quite large, ({}) byte(s).".format(
length
-)
+),
)

prioritized_encodings = [] # type: List[str]
@@ -138,7 +144,8 @@ def from_bytes(

if specified_encoding is not None:
prioritized_encodings.append(specified_encoding)
-logger.info(
+logger.log(
+TRACE,
"Detected declarative mark in sequence. Priority +1 given for %s.",
specified_encoding,
)
@@ -157,7 +164,8 @@ def from_bytes(

if sig_encoding is not None:
prioritized_encodings.append(sig_encoding)
-logger.info(
+logger.log(
+TRACE,
"Detected a SIG or BOM mark on first %i byte(s). Priority +1 given for %s.",
len(sig_payload),
sig_encoding,
@@ -188,7 +196,8 @@ def from_bytes(
) # type: bool

if encoding_iana in {"utf_16", "utf_32"} and not bom_or_sig_available:
-logger.debug(
+logger.log(
+TRACE,
"Encoding %s wont be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.",
encoding_iana,
)
@@ -197,8 +206,10 @@ def from_bytes(
try:
is_multi_byte_decoder = is_multi_byte_encoding(encoding_iana) # type: bool
except (ModuleNotFoundError, ImportError):
-logger.debug(
-"Encoding %s does not provide an IncrementalDecoder", encoding_iana
+logger.log(
+TRACE,
+"Encoding %s does not provide an IncrementalDecoder",
+encoding_iana,
)
continue

@@ -219,7 +230,8 @@ def from_bytes(
)
except (UnicodeDecodeError, LookupError) as e:
if not isinstance(e, LookupError):
-logger.debug(
+logger.log(
+TRACE,
"Code page %s does not fit given bytes sequence at ALL. %s",
encoding_iana,
str(e),
@@ -235,7 +247,8 @@ def from_bytes(
break

if similar_soft_failure_test:
-logger.debug(
+logger.log(
+TRACE,
"%s is deemed too similar to code page %s and was consider unsuited already. Continuing!",
encoding_iana,
encoding_soft_failed,
@@ -255,7 +268,8 @@ def from_bytes(
) # type: bool

if multi_byte_bonus:
-logger.debug(
+logger.log(
+TRACE,
"Code page %s is a multi byte encoding table and it appear that at least one character "
"was encoded using n-bytes.",
encoding_iana,
@@ -285,7 +299,8 @@ def from_bytes(
errors="ignore" if is_multi_byte_decoder else "strict",
) # type: str
except UnicodeDecodeError as e: # Lazy str loading may have missed something there
-logger.debug(
+logger.log(
+TRACE,
"LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
encoding_iana,
str(e),
@@ -337,7 +352,8 @@ def from_bytes(
try:
sequences[int(50e3) :].decode(encoding_iana, errors="strict")
except UnicodeDecodeError as e:
-logger.debug(
+logger.log(
+TRACE,
"LazyStr Loading: After final lookup, code page %s does not fit given bytes sequence at ALL. %s",
encoding_iana,
str(e),
@@ -350,7 +366,8 @@ def from_bytes(
) # type: float
if mean_mess_ratio >= threshold or early_stop_count >= max_chunk_gave_up:
tested_but_soft_failure.append(encoding_iana)
-logger.info(
+logger.log(
+TRACE,
"%s was excluded because of initial chaos probing. Gave up %i time(s). "
"Computed mean chaos is %f %%.",
encoding_iana,
@@ -373,7 +390,8 @@ def from_bytes(
fallback_u8 = fallback_entry
continue

-logger.info(
+logger.log(
+TRACE,
"%s passed initial chaos probing. Mean measured chaos is %f %%",
encoding_iana,
round(mean_mess_ratio * 100, ndigits=3),
@@ -385,10 +403,11 @@ def from_bytes(
target_languages = mb_encoding_languages(encoding_iana)

if target_languages:
-logger.debug(
+logger.log(
+TRACE,
"{} should target any language(s) of {}".format(
encoding_iana, str(target_languages)
-)
+),
)

cd_ratios = []
@@ -406,10 +425,11 @@ def from_bytes(
cd_ratios_merged = merge_coherence_ratios(cd_ratios)

if cd_ratios_merged:
-logger.info(
+logger.log(
+TRACE,
"We detected language {} using {}".format(
cd_ratios_merged, encoding_iana
-)
+),
)

results.append(
@@ -427,17 +447,18 @@ def from_bytes(
encoding_iana in [specified_encoding, "ascii", "utf_8"]
and mean_mess_ratio < 0.1
):
-logger.info(
-"%s is most likely the one. Stopping the process.", encoding_iana
+logger.debug(
+"Encoding detection: %s is most likely the one.", encoding_iana
)
if explain:
logger.removeHandler(explain_handler)
logger.setLevel(previous_logger_level)
return CharsetMatches([results[encoding_iana]])

if encoding_iana == sig_encoding:
-logger.info(
-"%s is most likely the one as we detected a BOM or SIG within the beginning of the sequence.",
+logger.debug(
+"Encoding detection: %s is most likely the one as we detected a BOM or SIG within "
+"the beginning of the sequence.",
encoding_iana,
)
if explain:
@@ -447,13 +468,15 @@ def from_bytes(

if len(results) == 0:
if fallback_u8 or fallback_ascii or fallback_specified:
-logger.debug(
-"Nothing got out of the detection process. Using ASCII/UTF-8/Specified fallback."
+logger.log(
+TRACE,
+"Nothing got out of the detection process. Using ASCII/UTF-8/Specified fallback.",
)

if fallback_specified:
logger.debug(
"%s will be used as a fallback match", fallback_specified.encoding
"Encoding detection: %s will be used as a fallback match",
fallback_specified.encoding,
)
results.append(fallback_specified)
elif (
Expand All @@ -465,12 +488,21 @@ def from_bytes(
)
or (fallback_u8 is not None)
):
logger.warning("utf_8 will be used as a fallback match")
logger.debug("Encoding detection: utf_8 will be used as a fallback match")
results.append(fallback_u8)
elif fallback_ascii:
logger.warning("ascii will be used as a fallback match")
logger.debug("Encoding detection: ascii will be used as a fallback match")
results.append(fallback_ascii)

+if results:
+logger.debug(
+"Encoding detection: Found %s as plausible (best-candidate) for content. With %i alternatives.",
+results.best().encoding,  # type: ignore
+len(results) - 1,
+)
+else:
+logger.debug("Encoding detection: Unable to determine any suitable charset.")

if explain:
logger.removeHandler(explain_handler)
logger.setLevel(previous_logger_level)
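The recurring edit above replaces level-bound helpers (`logger.debug`, `logger.info`, `logger.warning`) with `logger.log(TRACE, ...)`, while `from_bytes` saves and restores the logger state around `explain=True`. Below is a condensed sketch of that save/restore pattern, with a hypothetical `detect` standing in for the real `from_bytes`:

    import logging

    TRACE = 5  # mirrors charset_normalizer.constant.TRACE
    logging.addLevelName(TRACE, "TRACE")
    logger = logging.getLogger("charset_normalizer")
    explain_handler = logging.StreamHandler()

    def detect(sequences: bytes, explain: bool = False) -> None:
        # Hypothetical, condensed version of the pattern used in from_bytes.
        if explain:
            previous_logger_level = logger.level  # save the caller's configuration
            logger.addHandler(explain_handler)
            logger.setLevel(TRACE)

        # Step-by-step internals go to TRACE ...
        logger.log(TRACE, "probing %i byte(s)", len(sequences))
        # ... while conclusions stay on DEBUG.
        logger.debug("Encoding detection: utf_8 is most likely the one.")

        if explain:
            logger.removeHandler(explain_handler)  # restore the caller's configuration
            logger.setLevel(previous_logger_level or logging.WARNING)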
3 changes: 3 additions & 0 deletions charset_normalizer/constant.py
@@ -498,3 +498,6 @@
NOT_PRINTABLE_PATTERN = re_compile(r"[0-9\W\n\r\t]+")

LANGUAGE_SUPPORTED_COUNT = len(FREQUENCIES) # type: int

+# Logging LEVEL below DEBUG
+TRACE = 5  # type: int
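
Because 5 sits below `logging.DEBUG` (10), every standard configuration filters TRACE records out by default; they only appear once a logger is explicitly lowered. A quick illustration of that filtering, independent of charset_normalizer:

    import logging

    TRACE = 5  # same value as the constant above; logging.DEBUG is 10
    logging.addLevelName(TRACE, "TRACE")

    logging.basicConfig(level=logging.DEBUG, format="%(levelname)s %(message)s")
    logger = logging.getLogger("demo")

    logger.log(TRACE, "dropped: the effective level DEBUG (10) filters out level 5")
    logger.setLevel(TRACE)
    logger.log(TRACE, "emitted: visible once the logger level is lowered to TRACE")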
2 changes: 1 addition & 1 deletion charset_normalizer/version.py
@@ -2,5 +2,5 @@
Expose version
"""

__version__ = "2.0.10"
__version__ = "2.0.11.dev0"
VERSION = __version__.split(".")
11 changes: 6 additions & 5 deletions tests/test_logging.py
@@ -3,6 +3,7 @@

from charset_normalizer.utils import set_logging_handler
from charset_normalizer.api import from_bytes, explain_handler
+from charset_normalizer.constant import TRACE


class TestLogBehaviorClass:
@@ -17,24 +18,24 @@ def test_explain_true_behavior(self, caplog):
from_bytes(test_sequence, steps=1, chunk_size=50, explain=True)
assert explain_handler not in self.logger.handlers
for record in caplog.records:
-assert record.levelname in ["INFO", "DEBUG"]
+assert record.levelname in ["TRACE", "DEBUG"]

def test_explain_false_handler_set_behavior(self, caplog):
test_sequence = b'This is a test sequence of bytes that should be sufficient'
-set_logging_handler(level=logging.INFO, format_string="%(message)s")
+set_logging_handler(level=TRACE, format_string="%(message)s")
from_bytes(test_sequence, steps=1, chunk_size=50, explain=False)
assert any(isinstance(hdl, logging.StreamHandler) for hdl in self.logger.handlers)
for record in caplog.records:
-assert record.levelname in ["INFO", "DEBUG"]
-assert "ascii is most likely the one. Stopping the process." in caplog.text
+assert record.levelname in ["TRACE", "DEBUG"]
+assert "Encoding detection: ascii is most likely the one." in caplog.text

def test_set_stream_handler(self, caplog):
set_logging_handler(
"charset_normalizer", level=logging.DEBUG
)
self.logger.debug("log content should log with default format")
for record in caplog.records:
-assert record.levelname in ["INFO", "DEBUG"]
+assert record.levelname in ["TRACE", "DEBUG"]
assert "log content should log with default format" in caplog.text

def test_set_stream_handler_format(self, caplog):
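For completeness, here is a sketch of how an assertion on TRACE records could be written with pytest's caplog fixture (hypothetical test, assuming charset-normalizer at this revision is installed; caplog must be lowered below DEBUG or TRACE records never reach it):

    import logging

    from charset_normalizer.api import from_bytes
    from charset_normalizer.constant import TRACE

    def test_trace_records_only(caplog):
        with caplog.at_level(TRACE, logger="charset_normalizer"):
            from_bytes(b"This is a test sequence of bytes that should be sufficient")
        assert caplog.records  # something was logged during detection
        assert all(r.levelno in (TRACE, logging.DEBUG) for r in caplog.records)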
