Skip to content

Commit

Permalink
Revise the logger instantiation/initial handlers (#135)
Browse files Browse the repository at this point in the history
* Update logger to use NullHandler by default

* Add test class to cover Logging behavior

* ❇️ Having a dedicated final handler attached to the explain toggle

* 🔧 Having the logger initialized in top-level init

* ✔️ Add tests / enforce hdl checks

Co-authored-by: Ahmed TAHRI <ahmed.tahri@cloudnursery.dev>
  • Loading branch information
nmaynes and Ousret committed Nov 24, 2021
1 parent 6480728 commit b5e48c4
Show file tree
Hide file tree
Showing 6 changed files with 119 additions and 30 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Expand Up @@ -11,6 +11,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
- Code style as refactored by Sourcery-AI (PR #131)
- Minor adjustment on the MD around european words (PR #133)
- Remove and replace SRTs from assets / tests (PR #139)
- Initialize the library logger with a `NullHandler` by default from [@nmaynes](https://github.com/nmaynes) (PR #135)
- Setting the kwarg `explain` to True will provisionally add (bound to the function's lifespan) a specific stream handler (PR #135)

### Fixed
- Fix large (misleading) sequence giving UnicodeDecodeError (PR #137)
Expand All @@ -19,6 +21,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
## [2.0.7](https://github.com/Ousret/charset_normalizer/compare/2.0.6...2.0.7) (2021-10-11)
### Added
- Add support for Kazakh (Cyrillic) language detection (PR #109)
- Add and expose function `set_logging_handler` to configure a specific StreamHandler from [@nmaynes](https://github.com/nmaynes) (PR #135)

### Changed
- Further improve inferring the language from a given single-byte code page (PR #112)
Expand Down
9 changes: 9 additions & 0 deletions charset_normalizer/__init__.py
Expand Up @@ -19,6 +19,8 @@
:copyright: (c) 2021 by Ahmed TAHRI
:license: MIT, see LICENSE for more details.
"""
import logging

from .api import from_bytes, from_fp, from_path, normalize
from .legacy import (
CharsetDetector,
Expand All @@ -28,6 +30,7 @@
detect,
)
from .models import CharsetMatch, CharsetMatches
from .utils import set_logging_handler
from .version import VERSION, __version__

__all__ = (
Expand All @@ -44,4 +47,10 @@
"CharsetDoctor",
"__version__",
"VERSION",
"set_logging_handler",
)

# Attach a NullHandler to the top level logger by default
# https://docs.python.org/3.3/howto/logging.html#configuring-logging-for-a-library

logging.getLogger("charset_normalizer").addHandler(logging.NullHandler())
30 changes: 19 additions & 11 deletions charset_normalizer/api.py
@@ -1,3 +1,4 @@
import logging
from os.path import basename, splitext
from typing import BinaryIO, List, Optional, Set

Expand All @@ -6,8 +7,6 @@
except ImportError: # pragma: no cover
PathLike = str # type: ignore

import logging

from .cd import (
coherence_ratio,
encoding_languages,
Expand All @@ -27,11 +26,10 @@
)

logger = logging.getLogger("charset_normalizer")
logger.setLevel(logging.DEBUG)

handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter("%(asctime)s | %(levelname)s | %(message)s"))
logger.addHandler(handler)
explain_handler = logging.StreamHandler()
explain_handler.setFormatter(
logging.Formatter("%(asctime)s | %(levelname)s | %(message)s")
)


def from_bytes(
Expand All @@ -57,6 +55,9 @@ def from_bytes(
purpose.
This function will strip the SIG in the payload/sequence every time except on UTF-16, UTF-32.
By default the library does not setup any handler other than the NullHandler, if you choose to set the 'explain'
toggle to True it will alter the logger configuration to add a StreamHandler that is suitable for debugging.
Custom logging format and handler can be set manually.
"""

if not isinstance(sequences, (bytearray, bytes)):
Expand All @@ -66,17 +67,17 @@ def from_bytes(
)
)

if not explain:
logger.setLevel(logging.CRITICAL)
else:
logger.setLevel(logging.INFO)
if explain:
logger.addHandler(explain_handler)

length = len(sequences) # type: int

if length == 0:
logger.warning(
"Given content is empty, stopping the process very early, returning empty utf_8 str match"
)
if explain:
logger.removeHandler(explain_handler)
return CharsetMatches([CharsetMatch(sequences, "utf_8", 0.0, False, [], "")])

if cp_isolation is not None:
Expand Down Expand Up @@ -416,13 +417,17 @@ def from_bytes(
logger.info(
"%s is most likely the one. Stopping the process.", encoding_iana
)
if explain:
logger.removeHandler(explain_handler)
return CharsetMatches([results[encoding_iana]])

if encoding_iana == sig_encoding:
logger.info(
"%s is most likely the one as we detected a BOM or SIG within the beginning of the sequence.",
encoding_iana,
)
if explain:
logger.removeHandler(explain_handler)
return CharsetMatches([results[encoding_iana]])

if len(results) == 0:
Expand Down Expand Up @@ -451,6 +456,9 @@ def from_bytes(
logger.warning("ascii will be used as a fallback match")
results.append(fallback_ascii)

if explain:
logger.removeHandler(explain_handler)

return results


Expand Down
15 changes: 15 additions & 0 deletions charset_normalizer/utils.py
Expand Up @@ -4,6 +4,7 @@
import unicodedata # type: ignore[no-redef]

import importlib
import logging
from codecs import IncrementalDecoder
from encodings.aliases import aliases
from functools import lru_cache
Expand Down Expand Up @@ -325,3 +326,17 @@ def is_cp_similar(iana_name_a: str, iana_name_b: str) -> bool:
iana_name_a in IANA_SUPPORTED_SIMILAR
and iana_name_b in IANA_SUPPORTED_SIMILAR[iana_name_a]
)


def set_logging_handler(
    name: str = "charset_normalizer",
    level: int = logging.INFO,
    format_string: str = "%(asctime)s | %(levelname)s | %(message)s",
) -> None:
    """Attach a ``StreamHandler`` with the given format to the named logger.

    Sets the logger's level to *level* and appends a freshly built
    ``logging.StreamHandler`` using *format_string*. Note that each call
    appends one more handler; it does not remove previously added ones.
    """
    target_logger = logging.getLogger(name)
    target_logger.setLevel(level)

    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(logging.Formatter(format_string))
    target_logger.addHandler(stream_handler)
51 changes: 51 additions & 0 deletions tests/test_logging.py
@@ -0,0 +1,51 @@
import pytest
import logging

from charset_normalizer.utils import set_logging_handler
from charset_normalizer.api import from_bytes, explain_handler


class TestLogBehaviorClass:
    """Exercise the library logger wiring: the NullHandler default, the
    temporary ``explain`` handler, and ``set_logging_handler``."""

    def setup_method(self, method):
        # Reset the "charset_normalizer" logger to a known state before
        # each test: exactly one NullHandler attached and an unset level.
        # NOTE: was `setup` (nose-style, no longer auto-run by modern
        # pytest) and `self.logger.level = None` (a non-int in `level`);
        # `logging.NOTSET` (0) is the documented "no explicit level"
        # value and is equally falsy for effective-level resolution.
        self.logger = logging.getLogger("charset_normalizer")
        self.logger.handlers.clear()
        self.logger.addHandler(logging.NullHandler())
        self.logger.level = logging.NOTSET

    def test_explain_true_behavior(self, caplog):
        test_sequence = b'This is a test sequence of bytes that should be sufficient'
        from_bytes(test_sequence, steps=1, chunk_size=50, explain=True)
        # The explain handler must be detached again once from_bytes returns.
        assert explain_handler not in self.logger.handlers
        for record in caplog.records:
            assert record.levelname == "INFO"

    def test_explain_false_handler_set_behavior(self, caplog):
        test_sequence = b'This is a test sequence of bytes that should be sufficient'
        set_logging_handler(level=logging.INFO, format_string="%(message)s")
        from_bytes(test_sequence, steps=1, chunk_size=50, explain=False)
        # A user-configured StreamHandler must survive an explain=False call.
        assert any(isinstance(hdl, logging.StreamHandler) for hdl in self.logger.handlers)
        for record in caplog.records:
            assert record.levelname == "INFO"
        assert "ascii is most likely the one. Stopping the process." in caplog.text

    def test_set_stream_handler(self, caplog):
        set_logging_handler(
            "charset_normalizer", level=logging.DEBUG
        )
        self.logger.debug("log content should log with default format")
        for record in caplog.records:
            assert record.levelname == "DEBUG"
        assert "log content should log with default format" in caplog.text

    def test_set_stream_handler_format(self, caplog):
        set_logging_handler(
            "charset_normalizer", format_string="%(message)s"
        )
        self.logger.info("log content should only be this message")
        assert caplog.record_tuples == [
            (
                "charset_normalizer",
                logging.INFO,
                "log content should only be this message",
            )
        ]
41 changes: 22 additions & 19 deletions tests/test_utils.py
@@ -1,31 +1,34 @@
import logging
import pytest
from charset_normalizer.utils import is_accentuated, cp_similarity
from charset_normalizer.utils import is_accentuated, cp_similarity, set_logging_handler


@pytest.mark.parametrize(
"character, expected_is_accentuated",
[
('é', True),
('è', True),
('à', True),
('À', True),
('Ù', True),
('ç', True),
('a', False),
('€', False),
('&', False),
('Ö', True),
("é", True),
("è", True),
("à", True),
("À", True),
("Ù", True),
("ç", True),
("a", False),
("€", False),
("&", False),
("Ö", True),
("ü", True),
("ê", True),
('Ñ', True),
('Ý', True),
('Ω', False),
('ø', False),
('Ё', False),
]
("Ñ", True),
("Ý", True),
("Ω", False),
("ø", False),
("Ё", False),
],
)
def test_is_accentuated(character, expected_is_accentuated):
assert is_accentuated(character) is expected_is_accentuated, "is_accentuated behavior incomplete"
assert (
is_accentuated(character) is expected_is_accentuated
), "is_accentuated behavior incomplete"


@pytest.mark.parametrize(
Expand All @@ -37,7 +40,7 @@ def test_is_accentuated(character, expected_is_accentuated):
("latin_1", "iso8859_4", True),
("latin_1", "cp1251", False),
("cp1251", "mac_turkish", False),
]
],
)
def test_cp_similarity(cp_name_a, cp_name_b, expected_is_similar):
is_similar = cp_similarity(cp_name_a, cp_name_b) >= 0.8
Expand Down

0 comments on commit b5e48c4

Please sign in to comment.