Skip to content

Commit

Permalink
Revise the logger instantiation/initial handlers (#135)
Browse files Browse the repository at this point in the history
* Update logger to use NullHandler by default

* Add test class to cover Logging behavior

* ❇️ Having a dedicated final handler attached to the explain toggle

* 🔧 Having the logger initialized in top-level init

* ✔️ Add tests / enforce hdl checks

Co-authored-by: Ahmed TAHRI <ahmed.tahri@cloudnursery.dev>
  • Loading branch information
nmaynes and Ousret committed Nov 24, 2021
1 parent 6480728 commit b5e48c4
Show file tree
Hide file tree
Showing 6 changed files with 119 additions and 30 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Expand Up @@ -11,6 +11,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
- Code style as refactored by Sourcery-AI (PR #131)
- Minor adjustment on the MD around european words (PR #133)
- Remove and replace SRTs from assets / tests (PR #139)
- Initialize the library logger with a `NullHandler` by default from [@nmaynes](https://github.com/nmaynes) (PR #135)
- Setting the kwarg `explain` to True will provisionally add (bound to the function's lifespan) a specific stream handler (PR #135)

### Fixed
- Fix large (misleading) sequence giving UnicodeDecodeError (PR #137)
Expand All @@ -19,6 +21,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
## [2.0.7](https://github.com/Ousret/charset_normalizer/compare/2.0.6...2.0.7) (2021-10-11)
### Added
- Add support for Kazakh (Cyrillic) language detection (PR #109)
- Add and expose function `set_logging_handler` to configure a specific StreamHandler from [@nmaynes](https://github.com/nmaynes) (PR #135)

### Changed
- Further improve inferring the language from a given single-byte code page (PR #112)
Expand Down
9 changes: 9 additions & 0 deletions charset_normalizer/__init__.py
Expand Up @@ -19,6 +19,8 @@
:copyright: (c) 2021 by Ahmed TAHRI
:license: MIT, see LICENSE for more details.
"""
import logging

from .api import from_bytes, from_fp, from_path, normalize
from .legacy import (
CharsetDetector,
Expand All @@ -28,6 +30,7 @@
detect,
)
from .models import CharsetMatch, CharsetMatches
from .utils import set_logging_handler
from .version import VERSION, __version__

__all__ = (
Expand All @@ -44,4 +47,10 @@
"CharsetDoctor",
"__version__",
"VERSION",
"set_logging_handler",
)

# Attach a NullHandler to the top level logger by default
# https://docs.python.org/3.3/howto/logging.html#configuring-logging-for-a-library

logging.getLogger("charset_normalizer").addHandler(logging.NullHandler())
30 changes: 19 additions & 11 deletions charset_normalizer/api.py
@@ -1,3 +1,4 @@
import logging
from os.path import basename, splitext
from typing import BinaryIO, List, Optional, Set

Expand All @@ -6,8 +7,6 @@
except ImportError: # pragma: no cover
PathLike = str # type: ignore

import logging

from .cd import (
coherence_ratio,
encoding_languages,
Expand All @@ -27,11 +26,10 @@
)

logger = logging.getLogger("charset_normalizer")
logger.setLevel(logging.DEBUG)

handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter("%(asctime)s | %(levelname)s | %(message)s"))
logger.addHandler(handler)
explain_handler = logging.StreamHandler()
explain_handler.setFormatter(
logging.Formatter("%(asctime)s | %(levelname)s | %(message)s")
)


def from_bytes(
Expand All @@ -57,6 +55,9 @@ def from_bytes(
purpose.
This function will strip the SIG in the payload/sequence every time except on UTF-16, UTF-32.
By default the library does not setup any handler other than the NullHandler, if you choose to set the 'explain'
toggle to True it will alter the logger configuration to add a StreamHandler that is suitable for debugging.
Custom logging format and handler can be set manually.
"""

if not isinstance(sequences, (bytearray, bytes)):
Expand All @@ -66,17 +67,17 @@ def from_bytes(
)
)

if not explain:
logger.setLevel(logging.CRITICAL)
else:
logger.setLevel(logging.INFO)
if explain:
logger.addHandler(explain_handler)

length = len(sequences) # type: int

if length == 0:
logger.warning(
"Given content is empty, stopping the process very early, returning empty utf_8 str match"
)
if explain:
logger.removeHandler(explain_handler)
return CharsetMatches([CharsetMatch(sequences, "utf_8", 0.0, False, [], "")])

if cp_isolation is not None:
Expand Down Expand Up @@ -416,13 +417,17 @@ def from_bytes(
logger.info(
"%s is most likely the one. Stopping the process.", encoding_iana
)
if explain:
logger.removeHandler(explain_handler)
return CharsetMatches([results[encoding_iana]])

if encoding_iana == sig_encoding:
logger.info(
"%s is most likely the one as we detected a BOM or SIG within the beginning of the sequence.",
encoding_iana,
)
if explain:
logger.removeHandler(explain_handler)
return CharsetMatches([results[encoding_iana]])

if len(results) == 0:
Expand Down Expand Up @@ -451,6 +456,9 @@ def from_bytes(
logger.warning("ascii will be used as a fallback match")
results.append(fallback_ascii)

if explain:
logger.removeHandler(explain_handler)

return results


Expand Down
15 changes: 15 additions & 0 deletions charset_normalizer/utils.py
Expand Up @@ -4,6 +4,7 @@
import unicodedata # type: ignore[no-redef]

import importlib
import logging
from codecs import IncrementalDecoder
from encodings.aliases import aliases
from functools import lru_cache
Expand Down Expand Up @@ -325,3 +326,17 @@ def is_cp_similar(iana_name_a: str, iana_name_b: str) -> bool:
iana_name_a in IANA_SUPPORTED_SIMILAR
and iana_name_b in IANA_SUPPORTED_SIMILAR[iana_name_a]
)


def set_logging_handler(
    name: str = "charset_normalizer",
    level: int = logging.INFO,
    format_string: str = "%(asctime)s | %(levelname)s | %(message)s",
) -> None:
    """Attach a ``StreamHandler`` with the given format to the named logger.

    Sets the logger's level to *level* and appends a freshly built
    ``logging.StreamHandler`` using *format_string*. Note that each call
    appends one more handler; it does not remove previously added ones.
    """
    target_logger = logging.getLogger(name)
    target_logger.setLevel(level)

    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(logging.Formatter(format_string))
    target_logger.addHandler(stream_handler)
51 changes: 51 additions & 0 deletions tests/test_logging.py
@@ -0,0 +1,51 @@
import pytest
import logging

from charset_normalizer.utils import set_logging_handler
from charset_normalizer.api import from_bytes, explain_handler


class TestLogBehaviorClass:
    """Exercise the library logger wiring: the NullHandler default, the
    temporary ``explain`` handler, and ``set_logging_handler``."""

    def setup_method(self, method):
        # Reset the "charset_normalizer" logger to a known state before
        # each test: exactly one NullHandler attached and an unset level.
        # NOTE: was `setup` (nose-style, no longer auto-run by modern
        # pytest) and `self.logger.level = None` (a non-int in `level`);
        # `logging.NOTSET` (0) is the documented "no explicit level"
        # value and is equally falsy for effective-level resolution.
        self.logger = logging.getLogger("charset_normalizer")
        self.logger.handlers.clear()
        self.logger.addHandler(logging.NullHandler())
        self.logger.level = logging.NOTSET

    def test_explain_true_behavior(self, caplog):
        test_sequence = b'This is a test sequence of bytes that should be sufficient'
        from_bytes(test_sequence, steps=1, chunk_size=50, explain=True)
        # The explain handler must be detached again once from_bytes returns.
        assert explain_handler not in self.logger.handlers
        for record in caplog.records:
            assert record.levelname == "INFO"

    def test_explain_false_handler_set_behavior(self, caplog):
        test_sequence = b'This is a test sequence of bytes that should be sufficient'
        set_logging_handler(level=logging.INFO, format_string="%(message)s")
        from_bytes(test_sequence, steps=1, chunk_size=50, explain=False)
        # A user-configured StreamHandler must survive an explain=False call.
        assert any(isinstance(hdl, logging.StreamHandler) for hdl in self.logger.handlers)
        for record in caplog.records:
            assert record.levelname == "INFO"
        assert "ascii is most likely the one. Stopping the process." in caplog.text

    def test_set_stream_handler(self, caplog):
        set_logging_handler(
            "charset_normalizer", level=logging.DEBUG
        )
        self.logger.debug("log content should log with default format")
        for record in caplog.records:
            assert record.levelname == "DEBUG"
        assert "log content should log with default format" in caplog.text

    def test_set_stream_handler_format(self, caplog):
        set_logging_handler(
            "charset_normalizer", format_string="%(message)s"
        )
        self.logger.info("log content should only be this message")
        assert caplog.record_tuples == [
            (
                "charset_normalizer",
                logging.INFO,
                "log content should only be this message",
            )
        ]
41 changes: 22 additions & 19 deletions tests/test_utils.py
@@ -1,31 +1,34 @@
import logging
import pytest
from charset_normalizer.utils import is_accentuated, cp_similarity
from charset_normalizer.utils import is_accentuated, cp_similarity, set_logging_handler


@pytest.mark.parametrize(
"character, expected_is_accentuated",
[
('é', True),
('è', True),
('à', True),
('À', True),
('Ù', True),
('ç', True),
('a', False),
('€', False),
('&', False),
('Ö', True),
("é", True),
("è", True),
("à", True),
("À", True),
("Ù", True),
("ç", True),
("a", False),
("€", False),
("&", False),
("Ö", True),
("ü", True),
("ê", True),
('Ñ', True),
('Ý', True),
('Ω', False),
('ø', False),
('Ё', False),
]
("Ñ", True),
("Ý", True),
("Ω", False),
("ø", False),
("Ё", False),
],
)
def test_is_accentuated(character, expected_is_accentuated):
assert is_accentuated(character) is expected_is_accentuated, "is_accentuated behavior incomplete"
assert (
is_accentuated(character) is expected_is_accentuated
), "is_accentuated behavior incomplete"


@pytest.mark.parametrize(
Expand All @@ -37,7 +40,7 @@ def test_is_accentuated(character, expected_is_accentuated):
("latin_1", "iso8859_4", True),
("latin_1", "cp1251", False),
("cp1251", "mac_turkish", False),
]
],
)
def test_cp_similarity(cp_name_a, cp_name_b, expected_is_similar):
is_similar = cp_similarity(cp_name_a, cp_name_b) >= 0.8
Expand Down

0 comments on commit b5e48c4

Please sign in to comment.