diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index 02e770ca..877b890e 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -28,7 +28,7 @@ jobs:
         python setup.py install
     - name: Type checking (Mypy)
       run: |
-        mypy charset_normalizer
+        mypy --strict charset_normalizer
     - name: Import sorting check (isort)
       run: |
         isort --check charset_normalizer
diff --git a/charset_normalizer/api.py b/charset_normalizer/api.py
index ae08361b..3697291c 100644
--- a/charset_normalizer/api.py
+++ b/charset_normalizer/api.py
@@ -1,7 +1,7 @@
 import logging
 from os import PathLike
 from os.path import basename, splitext
-from typing import BinaryIO, List, Optional, Set
+from typing import Any, BinaryIO, List, Optional, Set
 
 from .cd import (
     coherence_ratio,
@@ -36,8 +36,8 @@ def from_bytes(
     steps: int = 5,
     chunk_size: int = 512,
     threshold: float = 0.2,
-    cp_isolation: List[str] = None,
-    cp_exclusion: List[str] = None,
+    cp_isolation: Optional[List[str]] = None,
+    cp_exclusion: Optional[List[str]] = None,
     preemptive_behaviour: bool = True,
     explain: bool = False,
 ) -> CharsetMatches:
@@ -486,8 +486,8 @@ def from_fp(
     steps: int = 5,
     chunk_size: int = 512,
     threshold: float = 0.20,
-    cp_isolation: List[str] = None,
-    cp_exclusion: List[str] = None,
+    cp_isolation: Optional[List[str]] = None,
+    cp_exclusion: Optional[List[str]] = None,
     preemptive_behaviour: bool = True,
     explain: bool = False,
 ) -> CharsetMatches:
@@ -508,12 +508,12 @@
 
 
 def from_path(
-    path: PathLike,
+    path: "PathLike[Any]",
     steps: int = 5,
     chunk_size: int = 512,
     threshold: float = 0.20,
-    cp_isolation: List[str] = None,
-    cp_exclusion: List[str] = None,
+    cp_isolation: Optional[List[str]] = None,
+    cp_exclusion: Optional[List[str]] = None,
     preemptive_behaviour: bool = True,
     explain: bool = False,
 ) -> CharsetMatches:
@@ -535,12 +535,12 @@
 
 
 def normalize(
-    path: PathLike,
+    path: "PathLike[Any]",
     steps: int = 5,
     chunk_size: int = 512,
     threshold: float = 0.20,
-    cp_isolation: List[str] = None,
-    cp_exclusion: List[str] = None,
+    cp_isolation: Optional[List[str]] = None,
+    cp_exclusion: Optional[List[str]] = None,
     preemptive_behaviour: bool = True,
 ) -> CharsetMatch:
     """
diff --git a/charset_normalizer/cd.py b/charset_normalizer/cd.py
index 8998bb54..ee4b7424 100644
--- a/charset_normalizer/cd.py
+++ b/charset_normalizer/cd.py
@@ -2,7 +2,7 @@
 from codecs import IncrementalDecoder
 from collections import Counter
 from functools import lru_cache
-from typing import Dict, List, Optional, Tuple
+from typing import Counter as TypeCounter, Dict, List, Optional, Tuple
 
 from .assets import FREQUENCIES
 from .constant import KO_NAMES, LANGUAGE_SUPPORTED_COUNT, TOO_SMALL_SEQUENCE, ZH_NAMES
@@ -24,7 +24,9 @@ def encoding_unicode_range(iana_name: str) -> List[str]:
     if is_multi_byte_encoding(iana_name):
         raise IOError("Function not supported on multi-byte code page")
 
-    decoder = importlib.import_module("encodings.{}".format(iana_name)).IncrementalDecoder  # type: ignore
+    decoder = importlib.import_module(
+        "encodings.{}".format(iana_name)
+    ).IncrementalDecoder
 
     p: IncrementalDecoder = decoder(errors="ignore")
     seen_ranges: Dict[str, int] = {}
@@ -307,7 +309,7 @@ def coherence_ratio(
         lg_inclusion_list.remove("Latin Based")
 
     for layer in alpha_unicode_split(decoded_sequence):
-        sequence_frequencies: Counter = Counter(layer)
+        sequence_frequencies: TypeCounter[str] = Counter(layer)
         most_common = sequence_frequencies.most_common()
 
         character_count: int = sum(o for c, o in most_common)
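Note on the typing changes above: `mypy --strict` enables `--no-implicit-optional`, so a parameter annotated `List[str]` may no longer silently default to `None`; the intent has to be spelled `Optional[List[str]] = None`. It also enables `--disallow-any-generics`, which rejects a bare `Counter` annotation; importing `typing.Counter as TypeCounter` gives a parameterizable alias without shadowing the runtime `collections.Counter`. A minimal standalone sketch of both patterns (`coherence_probe` is a made-up function, not part of this diff):

```python
from collections import Counter
from typing import Counter as TypeCounter, List, Optional


def coherence_probe(layer: str, cp_isolation: Optional[List[str]] = None) -> int:
    # Optional[...] spells out the None default that --no-implicit-optional
    # would reject if the annotation were left as "List[str] = None".
    if cp_isolation is None:
        cp_isolation = []

    # typing.Counter[str] parameterizes the generic, satisfying
    # --disallow-any-generics; collections.Counter still does the work.
    frequencies: TypeCounter[str] = Counter(layer)
    return len(frequencies) + len(cp_isolation)


print(coherence_probe("abracadabra"))  # 5 distinct characters
```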
diff --git a/charset_normalizer/cli/normalizer.py b/charset_normalizer/cli/normalizer.py
index 540e5e2a..b8b652a5 100644
--- a/charset_normalizer/cli/normalizer.py
+++ b/charset_normalizer/cli/normalizer.py
@@ -3,7 +3,7 @@
 from json import dumps
 from os.path import abspath
 from platform import python_version
-from typing import List
+from typing import List, Optional
 
 try:
     from unicodedata2 import unidata_version
@@ -48,7 +48,7 @@ def query_yes_no(question: str, default: str = "yes") -> bool:
             sys.stdout.write("Please respond with 'yes' or 'no' " "(or 'y' or 'n').\n")
 
 
-def cli_detect(argv: List[str] = None) -> int:
+def cli_detect(argv: Optional[List[str]] = None) -> int:
     """
     CLI assistant using ARGV and ArgumentParser
     :param argv:
diff --git a/charset_normalizer/models.py b/charset_normalizer/models.py
index b9d71eb4..ccb0d475 100644
--- a/charset_normalizer/models.py
+++ b/charset_normalizer/models.py
@@ -4,7 +4,16 @@
 from hashlib import sha256
 from json import dumps
 from re import sub
-from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
+from typing import (
+    Any,
+    Counter as TypeCounter,
+    Dict,
+    Iterator,
+    List,
+    Optional,
+    Tuple,
+    Union,
+)
 
 from .constant import NOT_PRINTABLE_PATTERN, TOO_BIG_SEQUENCE
 from .md import mess_ratio
@@ -95,7 +104,7 @@ def coherence_non_latin(self) -> float:
         return 0.0
 
     @property
-    def w_counter(self) -> Counter:
+    def w_counter(self) -> TypeCounter[str]:
         """
         Word counter instance on decoded text.
         Notice: Will be removed in 3.0
@@ -280,7 +289,7 @@ class CharsetMatches:
     Act like a list(iterable) but does not implements all related methods.
     """
 
-    def __init__(self, results: List[CharsetMatch] = None):
+    def __init__(self, results: Optional[List[CharsetMatch]] = None):
         self._results: List[CharsetMatch] = sorted(results) if results else []
 
     def __iter__(self) -> Iterator[CharsetMatch]:
diff --git a/charset_normalizer/utils.py b/charset_normalizer/utils.py
index 0640deb0..859f212b 100644
--- a/charset_normalizer/utils.py
+++ b/charset_normalizer/utils.py
@@ -13,7 +13,7 @@
 from re import findall
 from typing import Generator, List, Optional, Set, Tuple, Union
 
-from _multibytecodec import MultibyteIncrementalDecoder  # type: ignore
+from _multibytecodec import MultibyteIncrementalDecoder
 
 from .constant import (
     ENCODING_MARKS,
@@ -231,6 +231,9 @@ def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional
     for specified_encoding in results:
         specified_encoding = specified_encoding.lower().replace("-", "_")
 
+        encoding_alias: str
+        encoding_iana: str
+
         for encoding_alias, encoding_iana in aliases.items():
             if encoding_alias == specified_encoding:
                 return encoding_iana
@@ -256,7 +259,7 @@ def is_multi_byte_encoding(name: str) -> bool:
         "utf_32_be",
         "utf_7",
     } or issubclass(
-        importlib.import_module("encodings.{}".format(name)).IncrementalDecoder,  # type: ignore
+        importlib.import_module("encodings.{}".format(name)).IncrementalDecoder,
         MultibyteIncrementalDecoder,
     )
 
@@ -286,6 +289,9 @@ def should_strip_sig_or_bom(iana_encoding: str) -> bool:
 def iana_name(cp_name: str, strict: bool = True) -> str:
     cp_name = cp_name.lower().replace("-", "_")
 
+    encoding_alias: str
+    encoding_iana: str
+
     for encoding_alias, encoding_iana in aliases.items():
         if cp_name in [encoding_alias, encoding_iana]:
             return encoding_iana
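The bare `encoding_alias: str` / `encoding_iana: str` lines added above are PEP 526 annotations without assignment: they declare types for the `for` loop's targets so the strict checker has something explicit to verify, and they bind no value at runtime. A self-contained sketch (the alias table here is invented for illustration and merely stands in for `encodings.aliases.aliases`):

```python
from typing import Dict

# Hypothetical alias table standing in for encodings.aliases.aliases.
aliases: Dict[str, str] = {"u8": "utf_8", "iso8859_1": "latin_1"}

# PEP 526: annotate the loop targets ahead of the for statement;
# nothing is assigned until the loop actually iterates.
encoding_alias: str
encoding_iana: str

for encoding_alias, encoding_iana in aliases.items():
    print("{} -> {}".format(encoding_alias, encoding_iana))
```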
@@ -315,8 +321,12 @@ def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
     if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b):
         return 0.0
 
-    decoder_a = importlib.import_module("encodings.{}".format(iana_name_a)).IncrementalDecoder  # type: ignore
-    decoder_b = importlib.import_module("encodings.{}".format(iana_name_b)).IncrementalDecoder  # type: ignore
+    decoder_a = importlib.import_module(
+        "encodings.{}".format(iana_name_a)
+    ).IncrementalDecoder
+    decoder_b = importlib.import_module(
+        "encodings.{}".format(iana_name_b)
+    ).IncrementalDecoder
 
     id_a: IncrementalDecoder = decoder_a(errors="ignore")
     id_b: IncrementalDecoder = decoder_b(errors="ignore")
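On the `# type: ignore` removals in this file and in cd.py: typeshed declares `importlib.import_module` as returning `ModuleType`, and attribute access on a module object is typed `Any`, so reading `.IncrementalDecoder` off the result already type-checks without suppression. Since `--strict` also turns on `--warn-unused-ignores`, leaving the stale ignores in place would itself become an error. A runnable sketch of the same dynamic-decoder pattern (`ascii` is just an arbitrary example codec):

```python
import importlib
from codecs import IncrementalDecoder

# Load a codec module dynamically, as cd.py and utils.py do; the
# module attribute access returns Any, so no "# type: ignore" is needed.
decoder = importlib.import_module(
    "encodings.{}".format("ascii")
).IncrementalDecoder

ascii_decoder: IncrementalDecoder = decoder(errors="ignore")
print(ascii_decoder.decode("hé".encode("utf_8")))  # undecodable bytes dropped -> "h"
```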