From 370d9ee14ed95931e26c50d5606682580a703457 Mon Sep 17 00:00:00 2001
From: Ahmed TAHRI
Date: Sun, 14 Aug 2022 19:20:59 +0200
Subject: [PATCH] :art: Enable strict type check and improve the project
 typing

Following https://github.com/Ousret/charset_normalizer/issues/182
---
 .github/workflows/lint.yml           |  2 +-
 charset_normalizer/api.py            | 22 +++++++++++-----------
 charset_normalizer/cd.py             |  4 ++--
 charset_normalizer/cli/normalizer.py |  4 ++--
 charset_normalizer/models.py         |  4 ++--
 charset_normalizer/utils.py          | 14 ++++++++++----
 6 files changed, 28 insertions(+), 22 deletions(-)

diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index 02e770ca..877b890e 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -28,7 +28,7 @@ jobs:
         python setup.py install
     - name: Type checking (Mypy)
       run: |
-        mypy charset_normalizer
+        mypy --strict charset_normalizer
     - name: Import sorting check (isort)
       run: |
         isort --check charset_normalizer
diff --git a/charset_normalizer/api.py b/charset_normalizer/api.py
index ae08361b..621d1fce 100644
--- a/charset_normalizer/api.py
+++ b/charset_normalizer/api.py
@@ -1,7 +1,7 @@
 import logging
 from os import PathLike
 from os.path import basename, splitext
-from typing import BinaryIO, List, Optional, Set
+from typing import Any, BinaryIO, List, Optional, Set
 
 from .cd import (
     coherence_ratio,
@@ -36,8 +36,8 @@ def from_bytes(
     steps: int = 5,
     chunk_size: int = 512,
     threshold: float = 0.2,
-    cp_isolation: List[str] = None,
-    cp_exclusion: List[str] = None,
+    cp_isolation: Optional[List[str]] = None,
+    cp_exclusion: Optional[List[str]] = None,
     preemptive_behaviour: bool = True,
     explain: bool = False,
 ) -> CharsetMatches:
@@ -486,8 +486,8 @@ def from_fp(
     steps: int = 5,
     chunk_size: int = 512,
     threshold: float = 0.20,
-    cp_isolation: List[str] = None,
-    cp_exclusion: List[str] = None,
+    cp_isolation: Optional[List[str]] = None,
+    cp_exclusion: Optional[List[str]] = None,
     preemptive_behaviour: bool = True,
     explain: bool = False,
 ) -> CharsetMatches:
@@ -508,12 +508,12 @@
 
 
 def from_path(
-    path: PathLike,
+    path: "PathLike[Any]",
     steps: int = 5,
     chunk_size: int = 512,
     threshold: float = 0.20,
-    cp_isolation: List[str] = None,
-    cp_exclusion: List[str] = None,
+    cp_isolation: Optional[List[str]] = None,
+    cp_exclusion: Optional[List[str]] = None,
     preemptive_behaviour: bool = True,
     explain: bool = False,
 ) -> CharsetMatches:
@@ -535,12 +535,12 @@
 
 
 def normalize(
-    path: PathLike,
+    path: "PathLike[Any]",
     steps: int = 5,
     chunk_size: int = 512,
     threshold: float = 0.20,
-    cp_isolation: List[str] = None,
-    cp_exclusion: List[str] = None,
+    cp_isolation: Optional[List[str]] = None,
+    cp_exclusion: Optional[List[str]] = None,
     preemptive_behaviour: bool = True,
 ) -> CharsetMatch:
     """
diff --git a/charset_normalizer/cd.py b/charset_normalizer/cd.py
index 8998bb54..7d119f6e 100644
--- a/charset_normalizer/cd.py
+++ b/charset_normalizer/cd.py
@@ -24,7 +24,7 @@ def encoding_unicode_range(iana_name: str) -> List[str]:
     if is_multi_byte_encoding(iana_name):
         raise IOError("Function not supported on multi-byte code page")
 
-    decoder = importlib.import_module("encodings.{}".format(iana_name)).IncrementalDecoder  # type: ignore
+    decoder = importlib.import_module("encodings.{}".format(iana_name)).IncrementalDecoder
 
     p: IncrementalDecoder = decoder(errors="ignore")
     seen_ranges: Dict[str, int] = {}
@@ -307,7 +307,7 @@ def coherence_ratio(
         lg_inclusion_list.remove("Latin Based")
 
     for layer in alpha_unicode_split(decoded_sequence):
-        sequence_frequencies: Counter = Counter(layer)
+        sequence_frequencies: Counter[str] = Counter(layer)
         most_common = sequence_frequencies.most_common()
 
         character_count: int = sum(o for c, o in most_common)
diff --git a/charset_normalizer/cli/normalizer.py b/charset_normalizer/cli/normalizer.py
index 540e5e2a..b8b652a5 100644
--- a/charset_normalizer/cli/normalizer.py
+++ b/charset_normalizer/cli/normalizer.py
@@ -3,7 +3,7 @@
 from json import dumps
 from os.path import abspath
 from platform import python_version
-from typing import List
+from typing import List, Optional
 
 try:
     from unicodedata2 import unidata_version
@@ -48,7 +48,7 @@ def query_yes_no(question: str, default: str = "yes") -> bool:
         sys.stdout.write("Please respond with 'yes' or 'no' " "(or 'y' or 'n').\n")
 
 
-def cli_detect(argv: List[str] = None) -> int:
+def cli_detect(argv: Optional[List[str]] = None) -> int:
     """
     CLI assistant using ARGV and ArgumentParser
     :param argv:
diff --git a/charset_normalizer/models.py b/charset_normalizer/models.py
index b9d71eb4..ddf633d3 100644
--- a/charset_normalizer/models.py
+++ b/charset_normalizer/models.py
@@ -95,7 +95,7 @@ def coherence_non_latin(self) -> float:
         return 0.0
 
     @property
-    def w_counter(self) -> Counter:
+    def w_counter(self) -> Counter[str]:
         """
         Word counter instance on decoded text.
         Notice: Will be removed in 3.0
@@ -280,7 +280,7 @@ class CharsetMatches:
     Act like a list(iterable) but does not implements all related methods.
     """
 
-    def __init__(self, results: List[CharsetMatch] = None):
+    def __init__(self, results: Optional[List[CharsetMatch]] = None):
         self._results: List[CharsetMatch] = sorted(results) if results else []
 
     def __iter__(self) -> Iterator[CharsetMatch]:
diff --git a/charset_normalizer/utils.py b/charset_normalizer/utils.py
index 0640deb0..f12edf4d 100644
--- a/charset_normalizer/utils.py
+++ b/charset_normalizer/utils.py
@@ -13,7 +13,7 @@
 from re import findall
 from typing import Generator, List, Optional, Set, Tuple, Union
 
-from _multibytecodec import MultibyteIncrementalDecoder  # type: ignore
+from _multibytecodec import MultibyteIncrementalDecoder
 
 from .constant import (
     ENCODING_MARKS,
@@ -231,6 +231,9 @@ def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional
     for specified_encoding in results:
         specified_encoding = specified_encoding.lower().replace("-", "_")
 
+        encoding_alias: str
+        encoding_iana: str
+
         for encoding_alias, encoding_iana in aliases.items():
             if encoding_alias == specified_encoding:
                 return encoding_iana
@@ -256,7 +259,7 @@ def is_multi_byte_encoding(name: str) -> bool:
         "utf_32_be",
         "utf_7",
     } or issubclass(
-        importlib.import_module("encodings.{}".format(name)).IncrementalDecoder,  # type: ignore
+        importlib.import_module("encodings.{}".format(name)).IncrementalDecoder,
         MultibyteIncrementalDecoder,
     )
 
@@ -286,6 +289,9 @@ def should_strip_sig_or_bom(iana_encoding: str) -> bool:
 def iana_name(cp_name: str, strict: bool = True) -> str:
     cp_name = cp_name.lower().replace("-", "_")
 
+    encoding_alias: str
+    encoding_iana: str
+
     for encoding_alias, encoding_iana in aliases.items():
         if cp_name in [encoding_alias, encoding_iana]:
             return encoding_iana
@@ -315,8 +321,8 @@ def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
     if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b):
         return 0.0
 
-    decoder_a = importlib.import_module("encodings.{}".format(iana_name_a)).IncrementalDecoder  # type: ignore
-    decoder_b = importlib.import_module("encodings.{}".format(iana_name_b)).IncrementalDecoder  # type: ignore
+    decoder_a = importlib.import_module("encodings.{}".format(iana_name_a)).IncrementalDecoder
+    decoder_b = importlib.import_module("encodings.{}".format(iana_name_b)).IncrementalDecoder
 
     id_a: IncrementalDecoder = decoder_a(errors="ignore")
     id_b: IncrementalDecoder = decoder_b(errors="ignore")
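
Note (illustration only, not part of the patch): the change repeated across
this diff replaces implicit-Optional defaults such as
`cp_isolation: List[str] = None` with explicit `Optional[List[str]]`, and
gives bare generics such as `Counter` a type parameter. Below is a minimal,
self-contained sketch of what `mypy --strict` rejects and accepts; the
function names `before` and `after` are hypothetical:

    from collections import Counter
    from typing import List, Optional


    def before(cp_isolation: List[str] = None) -> None:
        # mypy --strict: error: Incompatible default for argument
        # "cp_isolation" (default has type "None", argument has type
        # "List[str]")  [assignment]
        ...


    def after(cp_isolation: Optional[List[str]] = None) -> None:
        # Accepted: None is now part of the declared parameter type.
        ...


    # A bare `Counter` annotation fails --disallow-any-generics (enabled
    # by --strict); the parametrized form states the key type explicitly.
    # The annotation is quoted so it also runs on Pythons where
    # collections.Counter is not subscriptable at runtime (< 3.9).
    sequence_frequencies: "Counter[str]" = Counter("abcabc")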