Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Revise the logger instantiation/initial handlers #135

Merged
merged 17 commits into from Nov 24, 2021
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Expand Up @@ -11,6 +11,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
- Code style as refactored by Sourcery-AI (PR #131)
- Minor adjustment on the MD around european words (PR #133)
- Remove and replace SRTs from assets / tests (PR #139)
- Initialize the library logger with a `NullHandler` by default from [@nmaynes](https://github.com/nmaynes) (PR #135)
- Setting kwarg `explain` to True will temporarily add a specific stream handler, scoped to the lifetime of the function call (PR #135)

### Fixed
- Fix large (misleading) sequence giving UnicodeDecodeError (PR #137)
Expand All @@ -19,6 +21,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
## [2.0.7](https://github.com/Ousret/charset_normalizer/compare/2.0.6...2.0.7) (2021-10-11)
### Added
- Add support for Kazakh (Cyrillic) language detection (PR #109)
- Add and expose function `set_logging_handler` to configure a specific StreamHandler from [@nmaynes](https://github.com/nmaynes) (PR #135)

### Changed
- Further, improve inferring the language from a given single-byte code page (PR #112)
Expand Down
9 changes: 9 additions & 0 deletions charset_normalizer/__init__.py
Expand Up @@ -19,6 +19,8 @@
:copyright: (c) 2021 by Ahmed TAHRI
:license: MIT, see LICENSE for more details.
"""
import logging

from .api import from_bytes, from_fp, from_path, normalize
from .legacy import (
CharsetDetector,
Expand All @@ -28,6 +30,7 @@
detect,
)
from .models import CharsetMatch, CharsetMatches
from .utils import set_logging_handler
from .version import VERSION, __version__

__all__ = (
Expand All @@ -44,4 +47,10 @@
"CharsetDoctor",
"__version__",
"VERSION",
"set_logging_handler",
)

# Attach a NullHandler to the top level logger by default
# https://docs.python.org/3.3/howto/logging.html#configuring-logging-for-a-library

logging.getLogger("charset_normalizer").addHandler(logging.NullHandler())
32 changes: 20 additions & 12 deletions charset_normalizer/api.py
@@ -1,3 +1,4 @@
import logging
from os.path import basename, splitext
from typing import BinaryIO, List, Optional, Set

Expand All @@ -6,8 +7,6 @@
except ImportError: # pragma: no cover
PathLike = str # type: ignore

import logging

from .cd import (
coherence_ratio,
encoding_languages,
Expand All @@ -26,12 +25,11 @@
should_strip_sig_or_bom,
)

logger = logging.getLogger("charset_normalizer")
logger.setLevel(logging.DEBUG)

handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter("%(asctime)s | %(levelname)s | %(message)s"))
logger.addHandler(handler)
logger = logging.getLogger(__name__)
explain_handler = logging.StreamHandler()
explain_handler.setFormatter(
logging.Formatter("%(asctime)s | %(levelname)s | %(message)s")
)


def from_bytes(
Expand All @@ -57,6 +55,9 @@ def from_bytes(
purpose.

This function will strip the SIG in the payload/sequence every time except on UTF-16, UTF-32.
By default the library does not setup any handler other than the NullHandler, if you choose to set the 'explain'
toggle to True it will alter the logger configuration to add a StreamHandler that is suitable for debugging.
Custom logging format and handler can be set manually.
"""

if not isinstance(sequences, (bytearray, bytes)):
Expand All @@ -66,17 +67,17 @@ def from_bytes(
)
)

if not explain:
logger.setLevel(logging.CRITICAL)
else:
logger.setLevel(logging.INFO)
if explain:
logger.addHandler(explain_handler)

length = len(sequences) # type: int

if length == 0:
logger.warning(
"Given content is empty, stopping the process very early, returning empty utf_8 str match"
)
if explain:
logger.removeHandler(explain_handler)
return CharsetMatches([CharsetMatch(sequences, "utf_8", 0.0, False, [], "")])

if cp_isolation is not None:
Expand Down Expand Up @@ -416,13 +417,17 @@ def from_bytes(
logger.info(
"%s is most likely the one. Stopping the process.", encoding_iana
)
if explain:
logger.removeHandler(explain_handler)
return CharsetMatches([results[encoding_iana]])

if encoding_iana == sig_encoding:
logger.info(
"%s is most likely the one as we detected a BOM or SIG within the beginning of the sequence.",
encoding_iana,
)
if explain:
logger.removeHandler(explain_handler)
return CharsetMatches([results[encoding_iana]])

if len(results) == 0:
Expand Down Expand Up @@ -451,6 +456,9 @@ def from_bytes(
logger.warning("ascii will be used as a fallback match")
results.append(fallback_ascii)

if explain:
logger.removeHandler(explain_handler)

return results


Expand Down
15 changes: 15 additions & 0 deletions charset_normalizer/utils.py
Expand Up @@ -4,6 +4,7 @@
import unicodedata # type: ignore[no-redef]

import importlib
import logging
from codecs import IncrementalDecoder
from encodings.aliases import aliases
from functools import lru_cache
Expand Down Expand Up @@ -325,3 +326,17 @@ def is_cp_similar(iana_name_a: str, iana_name_b: str) -> bool:
iana_name_a in IANA_SUPPORTED_SIMILAR
and iana_name_b in IANA_SUPPORTED_SIMILAR[iana_name_a]
)


def set_logging_handler(
    name: str = "charset_normalizer",
    level: int = logging.INFO,
    format_string: str = "%(asctime)s | %(levelname)s | %(message)s",
) -> None:
    """
    Attach a StreamHandler with the given level and format to the named logger.

    :param name: Name of the logger to configure (defaults to the library logger).
    :param level: Level applied to the logger (e.g. ``logging.INFO``).
    :param format_string: Format string handed to ``logging.Formatter``.

    NOTE: every call appends a new handler to the logger; calling this
    repeatedly will produce duplicated log records.
    """
    target_logger = logging.getLogger(name)
    target_logger.setLevel(level)

    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(logging.Formatter(format_string))
    target_logger.addHandler(stream_handler)
52 changes: 52 additions & 0 deletions tests/test_logging.py
@@ -0,0 +1,52 @@
import pytest
import logging

from charset_normalizer.utils import set_logging_handler
from charset_normalizer.api import from_bytes


logger = logging.getLogger("charset_normalizer")


class TestLogBehaviorClass:
    """Verify the library's logger wiring: the default NullHandler, the
    temporary 'explain' StreamHandler added by from_bytes, and the
    set_logging_handler() helper."""

    def setup(self):
        # Reset the library logger before each test so handlers added by a
        # previous test (explain handler, set_logging_handler) do not leak.
        self.logger = logging.getLogger("charset_normalizer")
        self.logger.handlers.clear()
        self.logger.addHandler(logging.NullHandler())
        # NOTE(review): direct assignment bypasses setLevel() validation;
        # None is falsy, so getEffectiveLevel() treats it like NOTSET —
        # confirm this is intentional rather than self.logger.setLevel(...).
        self.logger.level = None

    def test_explain_true_behavior(self, caplog):
        # explain=True should temporarily attach a handler and emit INFO records.
        test_sequence = b'This is a test sequence of bytes that should be sufficient'
        from_bytes(test_sequence, steps=1, chunk_size=50, explain=True)
        for record in caplog.records:
            assert record.levelname == "INFO"

    def test_explain_false_handler_set_behavior(self, caplog):
        # With explain=False, records are still produced when a handler has
        # been installed explicitly via set_logging_handler().
        test_sequence = b'This is a test sequence of bytes that should be sufficient'
        set_logging_handler(level=logging.INFO, format_string="%(message)s")
        from_bytes(test_sequence, steps=1, chunk_size=50, explain=False)
        for record in caplog.records:
            assert record.levelname == "INFO"
        assert "ascii is most likely the one. Stopping the process." in caplog.text

    def test_set_stream_handler(self, caplog):
        # set_logging_handler() with level=DEBUG must let DEBUG records through.
        set_logging_handler(
            "charset_normalizer", level=logging.DEBUG
        )
        logger.debug("log content should log with default format")
        for record in caplog.records:
            assert record.levelname == "DEBUG"
        assert "log content should log with default format" in caplog.text

    def test_set_stream_handler_format(self, caplog):
        # A custom format_string applies to the handler; caplog records the
        # raw message regardless of handler formatting.
        set_logging_handler(
            "charset_normalizer", format_string="%(message)s"
        )
        logger.info("log content should only be this message")
        assert caplog.record_tuples == [
            (
                "charset_normalizer",
                logging.INFO,
                "log content should only be this message",
            )
        ]
nmaynes marked this conversation as resolved.
Show resolved Hide resolved
41 changes: 22 additions & 19 deletions tests/test_utils.py
@@ -1,31 +1,34 @@
import logging
import pytest
from charset_normalizer.utils import is_accentuated, cp_similarity
from charset_normalizer.utils import is_accentuated, cp_similarity, set_logging_handler


@pytest.mark.parametrize(
"character, expected_is_accentuated",
[
('é', True),
('è', True),
('à', True),
('À', True),
('Ù', True),
('ç', True),
('a', False),
('€', False),
('&', False),
('Ö', True),
("é", True),
("è", True),
("à", True),
("À", True),
("Ù", True),
("ç", True),
("a", False),
("€", False),
("&", False),
("Ö", True),
("ü", True),
("ê", True),
('Ñ', True),
('Ý', True),
('Ω', False),
('ø', False),
('Ё', False),
]
("Ñ", True),
("Ý", True),
("Ω", False),
("ø", False),
("Ё", False),
],
)
def test_is_accentuated(character, expected_is_accentuated):
assert is_accentuated(character) is expected_is_accentuated, "is_accentuated behavior incomplete"
assert (
is_accentuated(character) is expected_is_accentuated
), "is_accentuated behavior incomplete"


@pytest.mark.parametrize(
Expand All @@ -37,7 +40,7 @@ def test_is_accentuated(character, expected_is_accentuated):
("latin_1", "iso8859_4", True),
("latin_1", "cp1251", False),
("cp1251", "mac_turkish", False),
]
],
)
def test_cp_similarity(cp_name_a, cp_name_b, expected_is_similar):
is_similar = cp_similarity(cp_name_a, cp_name_b) >= 0.8
Expand Down