diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 24bd8fa9..d0740caa 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -16,6 +16,12 @@ repos:
     hooks:
       - id: black
         args: ["--target-version", "py36"]
+  - repo: https://github.com/pre-commit/mirrors-mypy
+    rev: v0.961
+    hooks:
+      - id: mypy
+        files: ^chardet/
+        args: ["--strict", "--pretty", "--show-error-codes"]
   - repo: https://github.com/PyCQA/prospector
     rev: 1.7.7  # The version of Prospector to use, if not 'master' for latest
     hooks:
diff --git a/MANIFEST.in b/MANIFEST.in
index be5768dd..2f471334 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,5 +1,6 @@
 include LICENSE
 include *.rst
+include chardet/py.typed
 include requirements.txt
 include test.py
 recursive-include docs *
diff --git a/chardet/__init__.py b/chardet/__init__.py
index e91ad618..2112ee14 100644
--- a/chardet/__init__.py
+++ b/chardet/__init__.py
@@ -15,14 +15,19 @@
 # 02110-1301  USA
 ######################### END LICENSE BLOCK #########################
 
+from typing import List, Union
+
+from .charsetgroupprober import CharSetGroupProber
+from .charsetprober import CharSetProber
 from .enums import InputState
+from .resultdict import ResultDict
 from .universaldetector import UniversalDetector
 from .version import VERSION, __version__
 
 __all__ = ["UniversalDetector", "detect", "detect_all", "__version__", "VERSION"]
 
 
-def detect(byte_str):
+def detect(byte_str: Union[bytes, bytearray]) -> ResultDict:
     """
     Detect the encoding of the given byte string.
 
@@ -40,7 +45,9 @@ def detect(byte_str):
     return detector.close()
 
 
-def detect_all(byte_str, ignore_threshold=False):
+def detect_all(
+    byte_str: Union[bytes, bytearray], ignore_threshold: bool = False
+) -> List[ResultDict]:
     """
     Detect all the possible encodings of the given byte string.
 
@@ -63,10 +70,10 @@ def detect_all(byte_str, ignore_threshold=False):
     detector.close()
 
     if detector.input_state == InputState.HIGH_BYTE:
-        results = []
-        probers = []
+        results: List[ResultDict] = []
+        probers: List[CharSetProber] = []
         for prober in detector.charset_probers:
-            if hasattr(prober, "probers"):
+            if isinstance(prober, CharSetGroupProber):
                 probers.extend(p for p in prober.probers)
             else:
                 probers.append(prober)
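Note (illustrative, not part of the patch): with `detect` and `detect_all` annotated as above and the `py.typed` marker shipped, downstream code type-checks against `ResultDict` (declared in chardet/resultdict.py later in this diff). A minimal usage sketch:

    import chardet

    result = chardet.detect(b"\xc3\xa9t\xc3\xa9")
    # mypy now knows the keys and value types of the returned ResultDict
    print(result["encoding"], result["confidence"], result["language"])

    for guess in chardet.detect_all(b"\xc3\xa9t\xc3\xa9", ignore_threshold=True):
        print(guess["encoding"])
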
diff --git a/chardet/big5prober.py b/chardet/big5prober.py
index e4dfa7aa..ef09c60e 100644
--- a/chardet/big5prober.py
+++ b/chardet/big5prober.py
@@ -32,16 +32,16 @@ class Big5Prober(MultiByteCharSetProber):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.coding_sm = CodingStateMachine(BIG5_SM_MODEL)
         self.distribution_analyzer = Big5DistributionAnalysis()
         self.reset()
 
     @property
-    def charset_name(self):
+    def charset_name(self) -> str:
         return "Big5"
 
     @property
-    def language(self):
+    def language(self) -> str:
         return "Chinese"
diff --git a/chardet/chardistribution.py b/chardet/chardistribution.py
index 27b4a293..176cb996 100644
--- a/chardet/chardistribution.py
+++ b/chardet/chardistribution.py
@@ -25,6 +25,8 @@
 # 02110-1301  USA
 ######################### END LICENSE BLOCK #########################
 
+from typing import Tuple, Union
+
 from .big5freq import (
     BIG5_CHAR_TO_FREQ_ORDER,
     BIG5_TABLE_SIZE,
@@ -59,22 +61,22 @@ class CharDistributionAnalysis:
     SURE_NO = 0.01
     MINIMUM_DATA_THRESHOLD = 3
 
-    def __init__(self):
+    def __init__(self) -> None:
         # Mapping table to get frequency order from char order (get from
         # GetOrder())
-        self._char_to_freq_order = tuple()
-        self._table_size = None  # Size of above table
+        self._char_to_freq_order: Tuple[int, ...] = tuple()
+        self._table_size = 0  # Size of above table
         # This is a constant value which varies from language to language,
         # used in calculating confidence.  See
         # http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html
         # for further detail.
-        self.typical_distribution_ratio = None
-        self._done = None
-        self._total_chars = None
-        self._freq_chars = None
+        self.typical_distribution_ratio = 0.0
+        self._done = False
+        self._total_chars = 0
+        self._freq_chars = 0
         self.reset()
 
-    def reset(self):
+    def reset(self) -> None:
         """reset analyser, clear any state"""
         # If this flag is set to True, detection is done and conclusion has
         # been made
@@ -83,7 +85,7 @@ def reset(self):
         # The number of characters whose frequency order is less than 512
         self._freq_chars = 0
 
-    def feed(self, char, char_len):
+    def feed(self, char: Union[bytes, bytearray], char_len: int) -> None:
         """feed a character with known length"""
         if char_len == 2:
             # we only care about 2-bytes character in our distribution analysis
@@ -97,7 +99,7 @@ def feed(self, char, char_len):
             if 512 > self._char_to_freq_order[order]:
                 self._freq_chars += 1
 
-    def get_confidence(self):
+    def get_confidence(self) -> float:
         """return confidence based on existing data"""
         # if we didn't receive any character in our consideration range,
         # return negative answer
@@ -114,12 +116,12 @@ def get_confidence(self):
         # normalize confidence (we don't want to be 100% sure)
         return self.SURE_YES
 
-    def got_enough_data(self):
+    def got_enough_data(self) -> bool:
         # It is not necessary to receive all data to draw conclusion.
         # For charset detection, certain amount of data is enough
         return self._total_chars > self.ENOUGH_DATA_THRESHOLD
 
-    def get_order(self, _):
+    def get_order(self, _: Union[bytes, bytearray]) -> int:
         # We do not handle characters based on the original encoding string,
         # but convert this encoding string to a number, here called order.
         # This allows multiple encodings of a language to share one frequency
@@ -128,13 +130,13 @@ class EUCTWDistributionAnalysis(CharDistributionAnalysis):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self._char_to_freq_order = EUCTW_CHAR_TO_FREQ_ORDER
         self._table_size = EUCTW_TABLE_SIZE
         self.typical_distribution_ratio = EUCTW_TYPICAL_DISTRIBUTION_RATIO
 
-    def get_order(self, byte_str):
+    def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
         # for euc-TW encoding, we are interested
         #   first  byte range: 0xc4 -- 0xfe
         #   second byte range: 0xa1 -- 0xfe
@@ -146,13 +148,13 @@ class EUCKRDistributionAnalysis(CharDistributionAnalysis):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self._char_to_freq_order = EUCKR_CHAR_TO_FREQ_ORDER
         self._table_size = EUCKR_TABLE_SIZE
         self.typical_distribution_ratio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
 
-    def get_order(self, byte_str):
+    def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
         # for euc-KR encoding, we are interested
         #   first  byte range: 0xb0 -- 0xfe
         #   second byte range: 0xa1 -- 0xfe
@@ -164,13 +166,13 @@ class JOHABDistributionAnalysis(CharDistributionAnalysis):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self._char_to_freq_order = EUCKR_CHAR_TO_FREQ_ORDER
         self._table_size = EUCKR_TABLE_SIZE
         self.typical_distribution_ratio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
 
-    def get_order(self, byte_str):
+    def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
         first_char = byte_str[0]
         if 0x88 <= first_char < 0xD4:
             code = first_char * 256 + byte_str[1]
@@ -179,13 +181,13 @@ class GB2312DistributionAnalysis(CharDistributionAnalysis):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self._char_to_freq_order = GB2312_CHAR_TO_FREQ_ORDER
         self._table_size = GB2312_TABLE_SIZE
         self.typical_distribution_ratio = GB2312_TYPICAL_DISTRIBUTION_RATIO
 
-    def get_order(self, byte_str):
+    def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
         # for GB2312 encoding, we are interested
         #   first  byte range: 0xb0 -- 0xfe
         #   second byte range: 0xa1 -- 0xfe
@@ -197,13 +199,13 @@ class Big5DistributionAnalysis(CharDistributionAnalysis):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self._char_to_freq_order = BIG5_CHAR_TO_FREQ_ORDER
         self._table_size = BIG5_TABLE_SIZE
         self.typical_distribution_ratio = BIG5_TYPICAL_DISTRIBUTION_RATIO
 
-    def get_order(self, byte_str):
+    def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
         # for big5 encoding, we are interested
         #   first  byte range: 0xa4 -- 0xfe
         #   second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
@@ -217,13 +219,13 @@ class SJISDistributionAnalysis(CharDistributionAnalysis):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self._char_to_freq_order = JIS_CHAR_TO_FREQ_ORDER
         self._table_size = JIS_TABLE_SIZE
         self.typical_distribution_ratio = JIS_TYPICAL_DISTRIBUTION_RATIO
 
-    def get_order(self, byte_str):
+    def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
         # for sjis encoding, we are interested
         #   first  byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
         #   second byte range: 0x40 -- 0x7e, 0x81 -- 0xfe
@@ -242,13 +244,13 @@ class EUCJPDistributionAnalysis(CharDistributionAnalysis):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self._char_to_freq_order = JIS_CHAR_TO_FREQ_ORDER
         self._table_size = JIS_TABLE_SIZE
         self.typical_distribution_ratio = JIS_TYPICAL_DISTRIBUTION_RATIO
 
-    def get_order(self, byte_str):
+    def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
         # for euc-JP encoding, we are interested
         #   first  byte range: 0xa0 -- 0xfe
         #   second byte range: 0xa1 -- 0xfe
diff --git a/chardet/charsetgroupprober.py b/chardet/charsetgroupprober.py
index 4c41ed2a..6def56b4 100644
--- a/chardet/charsetgroupprober.py
+++ b/chardet/charsetgroupprober.py
@@ -25,18 +25,20 @@
 # 02110-1301  USA
 ######################### END LICENSE BLOCK #########################
 
+from typing import List, Optional, Union
+
 from .charsetprober import CharSetProber
-from .enums import ProbingState
+from .enums import LanguageFilter, ProbingState
 
 
 class CharSetGroupProber(CharSetProber):
-    def __init__(self, lang_filter=None):
+    def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None:
         super().__init__(lang_filter=lang_filter)
         self._active_num = 0
-        self.probers = []
-        self._best_guess_prober = None
+        self.probers: List[CharSetProber] = []
+        self._best_guess_prober: Optional[CharSetProber] = None
 
-    def reset(self):
+    def reset(self) -> None:
         super().reset()
         self._active_num = 0
         for prober in self.probers:
@@ -46,7 +48,7 @@ def reset(self):
         self._best_guess_prober = None
 
     @property
-    def charset_name(self):
+    def charset_name(self) -> Optional[str]:
         if not self._best_guess_prober:
             self.get_confidence()
             if not self._best_guess_prober:
@@ -54,14 +56,14 @@ def charset_name(self):
         return self._best_guess_prober.charset_name
 
     @property
-    def language(self):
+    def language(self) -> Optional[str]:
         if not self._best_guess_prober:
             self.get_confidence()
             if not self._best_guess_prober:
                 return None
         return self._best_guess_prober.language
 
-    def feed(self, byte_str):
+    def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
         for prober in self.probers:
             if not prober.active:
                 continue
@@ -80,7 +82,7 @@ def feed(self, byte_str):
             return self.state
         return self.state
 
-    def get_confidence(self):
+    def get_confidence(self) -> float:
         state = self.state
         if state == ProbingState.FOUND_IT:
             return 0.99
diff --git a/chardet/charsetprober.py b/chardet/charsetprober.py
index 9f1afd99..a103ca11 100644
--- a/chardet/charsetprober.py
+++ b/chardet/charsetprober.py
@@ -28,8 +28,9 @@
 
 import logging
 import re
+from typing import Optional, Union
 
-from .enums import ProbingState
+from .enums import LanguageFilter, ProbingState
 
 INTERNATIONAL_WORDS_PATTERN = re.compile(
     b"[a-zA-Z]*[\x80-\xFF]+[a-zA-Z]*[^a-zA-Z\x80-\xFF]?"
@@ -40,35 +41,40 @@ class CharSetProber:
 
     SHORTCUT_THRESHOLD = 0.95
 
-    def __init__(self, lang_filter=None):
-        self._state = None
+    def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None:
+        self._state = ProbingState.DETECTING
+        self.active = True
         self.lang_filter = lang_filter
         self.logger = logging.getLogger(__name__)
 
-    def reset(self):
+    def reset(self) -> None:
         self._state = ProbingState.DETECTING
 
     @property
-    def charset_name(self):
+    def charset_name(self) -> Optional[str]:
         return None
 
-    def feed(self, byte_str):
+    @property
+    def language(self) -> Optional[str]:
+        raise NotImplementedError
+
+    def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
         raise NotImplementedError
 
     @property
-    def state(self):
+    def state(self) -> ProbingState:
         return self._state
 
-    def get_confidence(self):
+    def get_confidence(self) -> float:
         return 0.0
 
     @staticmethod
-    def filter_high_byte_only(buf):
+    def filter_high_byte_only(buf: Union[bytes, bytearray]) -> bytes:
         buf = re.sub(b"([\x00-\x7F])+", b" ", buf)
         return buf
 
     @staticmethod
-    def filter_international_words(buf):
+    def filter_international_words(buf: Union[bytes, bytearray]) -> bytearray:
         """
         We define three types of bytes:
         alphabet: english alphabets [a-zA-Z]
@@ -102,7 +108,7 @@ def filter_international_words(buf):
         return filtered
 
     @staticmethod
-    def remove_xml_tags(buf):
+    def remove_xml_tags(buf: Union[bytes, bytearray]) -> bytes:
         """
         Returns a copy of ``buf`` that retains only the sequences of English
         alphabet and high byte characters that are not between <> characters.
@@ -117,10 +123,13 @@ def remove_xml_tags(buf):
 
         for curr, buf_char in enumerate(buf):
             # Check if we're coming out of or entering an XML tag
-            if buf_char == b">":
+
+            # https://github.com/python/typeshed/issues/8182
+            if buf_char == b">":  # type: ignore[comparison-overlap]
                 prev = curr + 1
                 in_tag = False
-            elif buf_char == b"<":
+            # https://github.com/python/typeshed/issues/8182
+            elif buf_char == b"<":  # type: ignore[comparison-overlap]
                 if curr > prev and not in_tag:
                     # Keep everything after last non-extended-ASCII,
                     # non-alphabetic character
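Note (illustrative, not part of the patch): the two static filters annotated above are pure functions over bytes, so their declared types are easy to spot-check; for example, `filter_high_byte_only` collapses every ASCII run to a single space and keeps high bytes:

    from chardet.charsetprober import CharSetProber

    # every run of bytes in 0x00-0x7F becomes b" "; bytes >= 0x80 are kept
    print(CharSetProber.filter_high_byte_only(b"abc\xe9def"))  # b' \xe9 '
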
""" - def __init__(self, sm): + def __init__(self, sm: CodingStateMachineDict) -> None: self._model = sm self._curr_byte_pos = 0 self._curr_char_len = 0 - self._curr_state = None + self._curr_state = MachineState.START + self.active = True self.logger = logging.getLogger(__name__) self.reset() - def reset(self): + def reset(self) -> None: self._curr_state = MachineState.START - def next_state(self, c): + def next_state(self, c: int) -> int: # for each byte we get its class # if it is first byte, we also get byte length byte_class = self._model["class_table"][c] @@ -77,12 +79,12 @@ def next_state(self, c): self._curr_byte_pos += 1 return self._curr_state - def get_current_charlen(self): + def get_current_charlen(self) -> int: return self._curr_char_len - def get_coding_state_machine(self): + def get_coding_state_machine(self) -> str: return self._model["name"] @property - def language(self): + def language(self) -> str: return self._model["language"] diff --git a/chardet/codingstatemachinedict.py b/chardet/codingstatemachinedict.py new file mode 100644 index 00000000..7a3c4c7e --- /dev/null +++ b/chardet/codingstatemachinedict.py @@ -0,0 +1,19 @@ +from typing import TYPE_CHECKING, Tuple + +if TYPE_CHECKING: + # TypedDict was introduced in Python 3.8. + # + # TODO: Remove the else block and TYPE_CHECKING check when dropping support + # for Python 3.7. + from typing import TypedDict + + class CodingStateMachineDict(TypedDict, total=False): + class_table: Tuple[int, ...] + class_factor: int + state_table: Tuple[int, ...] + char_len_table: Tuple[int, ...] + name: str + language: str # Optional key + +else: + CodingStateMachineDict = dict diff --git a/chardet/cp949prober.py b/chardet/cp949prober.py index 28a1f3db..fa7307ed 100644 --- a/chardet/cp949prober.py +++ b/chardet/cp949prober.py @@ -32,7 +32,7 @@ class CP949Prober(MultiByteCharSetProber): - def __init__(self): + def __init__(self) -> None: super().__init__() self.coding_sm = CodingStateMachine(CP949_SM_MODEL) # NOTE: CP949 is a superset of EUC-KR, so the distribution should be @@ -41,9 +41,9 @@ def __init__(self): self.reset() @property - def charset_name(self): + def charset_name(self) -> str: return "CP949" @property - def language(self): + def language(self) -> str: return "Korean" diff --git a/chardet/enums.py b/chardet/enums.py index 32a77e76..5e3e1982 100644 --- a/chardet/enums.py +++ b/chardet/enums.py @@ -4,6 +4,8 @@ :author: Dan Blanchard (dan.blanchard@gmail.com) """ +from enum import Enum, Flag + class InputState: """ @@ -15,12 +17,13 @@ class InputState: HIGH_BYTE = 2 -class LanguageFilter: +class LanguageFilter(Flag): """ This enum represents the different language filters we can apply to a ``UniversalDetector``. """ + NONE = 0x00 CHINESE_SIMPLIFIED = 0x01 CHINESE_TRADITIONAL = 0x02 JAPANESE = 0x04 @@ -31,7 +34,7 @@ class LanguageFilter: CJK = CHINESE | JAPANESE | KOREAN -class ProbingState: +class ProbingState(Enum): """ This enum represents the different states a prober can be in. 
""" @@ -62,7 +65,7 @@ class SequenceLikelihood: POSITIVE = 3 @classmethod - def get_num_categories(cls): + def get_num_categories(cls) -> int: """:returns: The number of likelihood categories in the enum.""" return 4 diff --git a/chardet/escprober.py b/chardet/escprober.py index c91c0781..fd713830 100644 --- a/chardet/escprober.py +++ b/chardet/escprober.py @@ -25,6 +25,8 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### +from typing import Optional, Union + from .charsetprober import CharSetProber from .codingstatemachine import CodingStateMachine from .enums import LanguageFilter, MachineState, ProbingState @@ -43,7 +45,7 @@ class EscCharSetProber(CharSetProber): identify these encodings. """ - def __init__(self, lang_filter=None): + def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None: super().__init__(lang_filter=lang_filter) self.coding_sm = [] if self.lang_filter & LanguageFilter.CHINESE_SIMPLIFIED: @@ -53,13 +55,13 @@ def __init__(self, lang_filter=None): self.coding_sm.append(CodingStateMachine(ISO2022JP_SM_MODEL)) if self.lang_filter & LanguageFilter.KOREAN: self.coding_sm.append(CodingStateMachine(ISO2022KR_SM_MODEL)) - self.active_sm_count = None - self._detected_charset = None - self._detected_language = None - self._state = None + self.active_sm_count = 0 + self._detected_charset: Optional[str] = None + self._detected_language: Optional[str] = None + self._state = ProbingState.DETECTING self.reset() - def reset(self): + def reset(self) -> None: super().reset() for coding_sm in self.coding_sm: coding_sm.active = True @@ -69,17 +71,17 @@ def reset(self): self._detected_language = None @property - def charset_name(self): + def charset_name(self) -> Optional[str]: return self._detected_charset @property - def language(self): + def language(self) -> Optional[str]: return self._detected_language - def get_confidence(self): + def get_confidence(self) -> float: return 0.99 if self._detected_charset else 0.00 - def feed(self, byte_str): + def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState: for c in byte_str: for coding_sm in self.coding_sm: if not coding_sm.active: diff --git a/chardet/escsm.py b/chardet/escsm.py index 3aa0f4d9..11d4adf7 100644 --- a/chardet/escsm.py +++ b/chardet/escsm.py @@ -25,6 +25,7 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### +from .codingstatemachinedict import CodingStateMachineDict from .enums import MachineState # fmt: off @@ -75,7 +76,7 @@ HZ_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0) -HZ_SM_MODEL = { +HZ_SM_MODEL: CodingStateMachineDict = { "class_table": HZ_CLS, "class_factor": 6, "state_table": HZ_ST, @@ -134,7 +135,7 @@ ISO2022CN_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0, 0, 0, 0) -ISO2022CN_SM_MODEL = { +ISO2022CN_SM_MODEL: CodingStateMachineDict = { "class_table": ISO2022CN_CLS, "class_factor": 9, "state_table": ISO2022CN_ST, @@ -194,7 +195,7 @@ ISO2022JP_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0) -ISO2022JP_SM_MODEL = { +ISO2022JP_SM_MODEL: CodingStateMachineDict = { "class_table": ISO2022JP_CLS, "class_factor": 10, "state_table": ISO2022JP_ST, @@ -250,7 +251,7 @@ ISO2022KR_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0) -ISO2022KR_SM_MODEL = { +ISO2022KR_SM_MODEL: CodingStateMachineDict = { "class_table": ISO2022KR_CLS, "class_factor": 6, "state_table": ISO2022KR_ST, diff --git a/chardet/eucjpprober.py b/chardet/eucjpprober.py index abf2e66e..39487f40 100644 --- a/chardet/eucjpprober.py +++ b/chardet/eucjpprober.py @@ -25,6 +25,8 @@ # 02110-1301 
diff --git a/chardet/escprober.py b/chardet/escprober.py
index c91c0781..fd713830 100644
--- a/chardet/escprober.py
+++ b/chardet/escprober.py
@@ -25,6 +25,8 @@
 # 02110-1301  USA
 ######################### END LICENSE BLOCK #########################
 
+from typing import Optional, Union
+
 from .charsetprober import CharSetProber
 from .codingstatemachine import CodingStateMachine
 from .enums import LanguageFilter, MachineState, ProbingState
@@ -43,7 +45,7 @@ class EscCharSetProber(CharSetProber):
     identify these encodings.
     """
 
-    def __init__(self, lang_filter=None):
+    def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None:
         super().__init__(lang_filter=lang_filter)
         self.coding_sm = []
         if self.lang_filter & LanguageFilter.CHINESE_SIMPLIFIED:
@@ -53,13 +55,13 @@ def __init__(self, lang_filter=None):
             self.coding_sm.append(CodingStateMachine(ISO2022JP_SM_MODEL))
         if self.lang_filter & LanguageFilter.KOREAN:
             self.coding_sm.append(CodingStateMachine(ISO2022KR_SM_MODEL))
-        self.active_sm_count = None
-        self._detected_charset = None
-        self._detected_language = None
-        self._state = None
+        self.active_sm_count = 0
+        self._detected_charset: Optional[str] = None
+        self._detected_language: Optional[str] = None
+        self._state = ProbingState.DETECTING
         self.reset()
 
-    def reset(self):
+    def reset(self) -> None:
         super().reset()
         for coding_sm in self.coding_sm:
             coding_sm.active = True
@@ -69,17 +71,17 @@ def reset(self):
         self._detected_language = None
 
     @property
-    def charset_name(self):
+    def charset_name(self) -> Optional[str]:
         return self._detected_charset
 
     @property
-    def language(self):
+    def language(self) -> Optional[str]:
         return self._detected_language
 
-    def get_confidence(self):
+    def get_confidence(self) -> float:
         return 0.99 if self._detected_charset else 0.00
 
-    def feed(self, byte_str):
+    def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
         for c in byte_str:
             for coding_sm in self.coding_sm:
                 if not coding_sm.active:
diff --git a/chardet/escsm.py b/chardet/escsm.py
index 3aa0f4d9..11d4adf7 100644
--- a/chardet/escsm.py
+++ b/chardet/escsm.py
@@ -25,6 +25,7 @@
 # 02110-1301  USA
 ######################### END LICENSE BLOCK #########################
 
+from .codingstatemachinedict import CodingStateMachineDict
 from .enums import MachineState
 
 # fmt: off
@@ -75,7 +76,7 @@
 
 HZ_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0)
 
-HZ_SM_MODEL = {
+HZ_SM_MODEL: CodingStateMachineDict = {
     "class_table": HZ_CLS,
     "class_factor": 6,
     "state_table": HZ_ST,
@@ -134,7 +135,7 @@
 
 ISO2022CN_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0, 0, 0, 0)
 
-ISO2022CN_SM_MODEL = {
+ISO2022CN_SM_MODEL: CodingStateMachineDict = {
     "class_table": ISO2022CN_CLS,
     "class_factor": 9,
     "state_table": ISO2022CN_ST,
@@ -194,7 +195,7 @@
 
 ISO2022JP_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
 
-ISO2022JP_SM_MODEL = {
+ISO2022JP_SM_MODEL: CodingStateMachineDict = {
     "class_table": ISO2022JP_CLS,
     "class_factor": 10,
     "state_table": ISO2022JP_ST,
@@ -250,7 +251,7 @@
 
 ISO2022KR_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0)
 
-ISO2022KR_SM_MODEL = {
+ISO2022KR_SM_MODEL: CodingStateMachineDict = {
     "class_table": ISO2022KR_CLS,
     "class_factor": 6,
     "state_table": ISO2022KR_ST,
diff --git a/chardet/eucjpprober.py b/chardet/eucjpprober.py
index abf2e66e..39487f40 100644
--- a/chardet/eucjpprober.py
+++ b/chardet/eucjpprober.py
@@ -25,6 +25,8 @@
 # 02110-1301  USA
 ######################### END LICENSE BLOCK #########################
 
+from typing import Union
+
 from .chardistribution import EUCJPDistributionAnalysis
 from .codingstatemachine import CodingStateMachine
 from .enums import MachineState, ProbingState
@@ -34,26 +36,29 @@ class EUCJPProber(MultiByteCharSetProber):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.coding_sm = CodingStateMachine(EUCJP_SM_MODEL)
         self.distribution_analyzer = EUCJPDistributionAnalysis()
         self.context_analyzer = EUCJPContextAnalysis()
         self.reset()
 
-    def reset(self):
+    def reset(self) -> None:
         super().reset()
         self.context_analyzer.reset()
 
     @property
-    def charset_name(self):
+    def charset_name(self) -> str:
         return "EUC-JP"
 
     @property
-    def language(self):
+    def language(self) -> str:
         return "Japanese"
 
-    def feed(self, byte_str):
+    def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
+        assert self.coding_sm is not None
+        assert self.distribution_analyzer is not None
+
         for i, byte in enumerate(byte_str):
             # PY3K: byte_str is a byte array, so byte is an int, not a byte
             coding_state = self.coding_sm.next_state(byte)
@@ -89,7 +94,9 @@ def feed(self, byte_str):
 
         return self.state
 
-    def get_confidence(self):
+    def get_confidence(self) -> float:
+        assert self.distribution_analyzer is not None
+
         context_conf = self.context_analyzer.get_confidence()
         distrib_conf = self.distribution_analyzer.get_confidence()
         return max(context_conf, distrib_conf)
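Note (illustrative, hypothetical names): the `assert ... is not None` lines added to feed() and get_confidence() are the standard mypy narrowing idiom. The base class declares these attributes Optional (subclasses assign them), and after the assert mypy treats them as non-Optional. In isolation:

    from typing import Optional

    class Analyzer:
        def get_confidence(self) -> float:
            return 0.5

    class Prober:
        def __init__(self) -> None:
            # concrete subclasses are expected to assign a real Analyzer
            self.analyzer: Optional[Analyzer] = None

        def get_confidence(self) -> float:
            assert self.analyzer is not None  # narrows Optional[Analyzer] to Analyzer
            return self.analyzer.get_confidence()
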
diff --git a/chardet/euckrprober.py b/chardet/euckrprober.py
index 154a6d21..1fc5de04 100644
--- a/chardet/euckrprober.py
+++ b/chardet/euckrprober.py
@@ -32,16 +32,16 @@ class EUCKRProber(MultiByteCharSetProber):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.coding_sm = CodingStateMachine(EUCKR_SM_MODEL)
         self.distribution_analyzer = EUCKRDistributionAnalysis()
         self.reset()
 
     @property
-    def charset_name(self):
+    def charset_name(self) -> str:
         return "EUC-KR"
 
     @property
-    def language(self):
+    def language(self) -> str:
         return "Korean"
diff --git a/chardet/euctwprober.py b/chardet/euctwprober.py
index ca10a23c..a37ab189 100644
--- a/chardet/euctwprober.py
+++ b/chardet/euctwprober.py
@@ -32,16 +32,16 @@ class EUCTWProber(MultiByteCharSetProber):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.coding_sm = CodingStateMachine(EUCTW_SM_MODEL)
         self.distribution_analyzer = EUCTWDistributionAnalysis()
         self.reset()
 
     @property
-    def charset_name(self):
+    def charset_name(self) -> str:
         return "EUC-TW"
 
     @property
-    def language(self):
+    def language(self) -> str:
         return "Taiwan"
diff --git a/chardet/gb2312prober.py b/chardet/gb2312prober.py
index 251c0429..d423e731 100644
--- a/chardet/gb2312prober.py
+++ b/chardet/gb2312prober.py
@@ -32,16 +32,16 @@ class GB2312Prober(MultiByteCharSetProber):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.coding_sm = CodingStateMachine(GB2312_SM_MODEL)
         self.distribution_analyzer = GB2312DistributionAnalysis()
         self.reset()
 
     @property
-    def charset_name(self):
+    def charset_name(self) -> str:
         return "GB2312"
 
     @property
-    def language(self):
+    def language(self) -> str:
         return "Chinese"
diff --git a/chardet/hebrewprober.py b/chardet/hebrewprober.py
index 3ca634bf..785d0057 100644
--- a/chardet/hebrewprober.py
+++ b/chardet/hebrewprober.py
@@ -25,8 +25,11 @@
 # 02110-1301  USA
 ######################### END LICENSE BLOCK #########################
 
+from typing import Optional, Union
+
 from .charsetprober import CharSetProber
 from .enums import ProbingState
+from .sbcharsetprober import SingleByteCharSetProber
 
 # This prober doesn't actually recognize a language or a charset.
 # It is a helper prober for the use of the Hebrew model probers
@@ -127,6 +130,7 @@ class HebrewProber(CharSetProber):
 
+    SPACE = 0x20
     # windows-1255 / ISO-8859-8 code points of interest
     FINAL_KAF = 0xEA
     NORMAL_KAF = 0xEB
@@ -152,31 +156,35 @@ class HebrewProber(CharSetProber):
     VISUAL_HEBREW_NAME = "ISO-8859-8"
     LOGICAL_HEBREW_NAME = "windows-1255"
 
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
-        self._final_char_logical_score = None
-        self._final_char_visual_score = None
-        self._prev = None
-        self._before_prev = None
-        self._logical_prober = None
-        self._visual_prober = None
+        self._final_char_logical_score = 0
+        self._final_char_visual_score = 0
+        self._prev = self.SPACE
+        self._before_prev = self.SPACE
+        self._logical_prober: Optional[SingleByteCharSetProber] = None
+        self._visual_prober: Optional[SingleByteCharSetProber] = None
         self.reset()
 
-    def reset(self):
+    def reset(self) -> None:
         self._final_char_logical_score = 0
         self._final_char_visual_score = 0
         # The two last characters seen in the previous buffer,
         # mPrev and mBeforePrev are initialized to space in order to simulate
         # a word delimiter at the beginning of the data
-        self._prev = " "
-        self._before_prev = " "
+        self._prev = self.SPACE
+        self._before_prev = self.SPACE
         # These probers are owned by the group prober.
 
-    def set_model_probers(self, logical_prober, visual_prober):
+    def set_model_probers(
+        self,
+        logical_prober: SingleByteCharSetProber,
+        visual_prober: SingleByteCharSetProber,
+    ) -> None:
         self._logical_prober = logical_prober
         self._visual_prober = visual_prober
 
-    def is_final(self, c):
+    def is_final(self, c: int) -> bool:
         return c in [
             self.FINAL_KAF,
             self.FINAL_MEM,
@@ -185,7 +193,7 @@ def is_final(self, c):
             self.FINAL_TSADI,
         ]
 
-    def is_non_final(self, c):
+    def is_non_final(self, c: int) -> bool:
         # The normal Tsadi is not a good Non-Final letter due to words like
         # 'lechotet' (to chat) containing an apostrophe after the tsadi. This
         # apostrophe is converted to a space in FilterWithoutEnglishLetters
@@ -198,7 +206,7 @@ def is_non_final(self, c):
         # since these words are quite rare.
         return c in [self.NORMAL_KAF, self.NORMAL_MEM, self.NORMAL_NUN, self.NORMAL_PE]
 
-    def feed(self, byte_str):
+    def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
         # Final letter analysis for logical-visual decision.
         # Look for evidence that the received buffer is either logical Hebrew
         # or visual Hebrew.
@@ -232,9 +240,9 @@ def feed(self, byte_str):
         byte_str = self.filter_high_byte_only(byte_str)
 
         for cur in byte_str:
-            if cur == " ":
+            if cur == self.SPACE:
                 # We stand on a space - a word just ended
-                if self._before_prev != " ":
+                if self._before_prev != self.SPACE:
                     # next-to-last char was not a space so self._prev is not a
                     # 1 letter word
                     if self.is_final(self._prev):
@@ -247,9 +255,9 @@ def feed(self, byte_str):
             else:
                 # Not standing on a space
                 if (
-                    (self._before_prev == " ")
+                    (self._before_prev == self.SPACE)
                     and (self.is_final(self._prev))
-                    and (cur != " ")
+                    and (cur != self.SPACE)
                 ):
                     # case (3) [-2:space][-1:final letter][cur:not space]
                     self._final_char_visual_score += 1
@@ -261,7 +269,10 @@ def feed(self, byte_str):
         return ProbingState.DETECTING
 
     @property
-    def charset_name(self):
+    def charset_name(self) -> str:
+        assert self._logical_prober is not None
+        assert self._visual_prober is not None
+
         # Make the decision: is it Logical or Visual?
         # If the final letter score distance is dominant enough, rely on it.
         finalsub = self._final_char_logical_score - self._final_char_visual_score
@@ -289,11 +300,14 @@ def charset_name(self):
         return self.LOGICAL_HEBREW_NAME
 
     @property
-    def language(self):
+    def language(self) -> str:
         return "Hebrew"
 
     @property
-    def state(self):
+    def state(self) -> ProbingState:
+        assert self._logical_prober is not None
+        assert self._visual_prober is not None
+
         # Remain active as long as any of the model probers are active.
         if (self._logical_prober.state == ProbingState.NOT_ME) and (
             self._visual_prober.state == ProbingState.NOT_ME
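Note (illustrative, not part of the patch): the switch from the str literal " " to the new SPACE = 0x20 constant matches Python 3 bytes semantics, which this annotation work makes explicit; iterating a bytes or bytearray yields ints, so comparisons must be against integer code points:

    buf = bytearray(b"a b")
    for cur in buf:
        print(cur == " ", cur == 0x20)
    # prints: False False / False True / False False
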
diff --git a/chardet/johabprober.py b/chardet/johabprober.py
index 6f359d19..d7364ba6 100644
--- a/chardet/johabprober.py
+++ b/chardet/johabprober.py
@@ -32,16 +32,16 @@ class JOHABProber(MultiByteCharSetProber):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.coding_sm = CodingStateMachine(JOHAB_SM_MODEL)
         self.distribution_analyzer = JOHABDistributionAnalysis()
         self.reset()
 
     @property
-    def charset_name(self):
+    def charset_name(self) -> str:
         return "Johab"
 
     @property
-    def language(self):
+    def language(self) -> str:
         return "Korean"
diff --git a/chardet/jpcntx.py b/chardet/jpcntx.py
index 7a8e5be0..2f53bdda 100644
--- a/chardet/jpcntx.py
+++ b/chardet/jpcntx.py
@@ -25,6 +25,7 @@
 # 02110-1301  USA
 ######################### END LICENSE BLOCK #########################
 
+from typing import List, Tuple, Union
 
 # This is hiragana 2-char sequence table, the number in each cell represents its frequency category
 # fmt: off
@@ -123,15 +124,15 @@ class JapaneseContextAnalysis:
     MAX_REL_THRESHOLD = 1000
     MINIMUM_DATA_THRESHOLD = 4
 
-    def __init__(self):
-        self._total_rel = None
-        self._rel_sample = None
-        self._need_to_skip_char_num = None
-        self._last_char_order = None
-        self._done = None
+    def __init__(self) -> None:
+        self._total_rel = 0
+        self._rel_sample: List[int] = []
+        self._need_to_skip_char_num = 0
+        self._last_char_order = -1
+        self._done = False
         self.reset()
 
-    def reset(self):
+    def reset(self) -> None:
         self._total_rel = 0  # total sequence received
         # category counters, each integer counts sequence in its category
         self._rel_sample = [0] * self.NUM_OF_CATEGORY
@@ -143,7 +144,7 @@ def reset(self):
         # been made
         self._done = False
 
-    def feed(self, byte_str, num_bytes):
+    def feed(self, byte_str: Union[bytes, bytearray], num_bytes: int) -> None:
         if self._done:
             return
 
@@ -172,29 +173,29 @@ def feed(self, byte_str, num_bytes):
                     ] += 1
         self._last_char_order = order
 
-    def got_enough_data(self):
+    def got_enough_data(self) -> bool:
         return self._total_rel > self.ENOUGH_REL_THRESHOLD
 
-    def get_confidence(self):
+    def get_confidence(self) -> float:
         # This is just one way to calculate confidence. It works well for me.
         if self._total_rel > self.MINIMUM_DATA_THRESHOLD:
             return (self._total_rel - self._rel_sample[0]) / self._total_rel
         return self.DONT_KNOW
 
-    def get_order(self, _):
+    def get_order(self, _: Union[bytes, bytearray]) -> Tuple[int, int]:
         return -1, 1
 
 
 class SJISContextAnalysis(JapaneseContextAnalysis):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self._charset_name = "SHIFT_JIS"
 
     @property
-    def charset_name(self):
+    def charset_name(self) -> str:
         return self._charset_name
 
-    def get_order(self, byte_str):
+    def get_order(self, byte_str: Union[bytes, bytearray]) -> Tuple[int, int]:
         if not byte_str:
             return -1, 1
         # find out current char's byte length
@@ -216,7 +217,7 @@ def get_order(self, byte_str):
 
 
 class EUCJPContextAnalysis(JapaneseContextAnalysis):
-    def get_order(self, byte_str):
+    def get_order(self, byte_str: Union[bytes, bytearray]) -> Tuple[int, int]:
         if not byte_str:
             return -1, 1
         # find out current char's byte length
diff --git a/chardet/latin1prober.py b/chardet/latin1prober.py
index 241f14ab..59a01d91 100644
--- a/chardet/latin1prober.py
+++ b/chardet/latin1prober.py
@@ -26,6 +26,8 @@
 # 02110-1301  USA
 ######################### END LICENSE BLOCK #########################
 
+from typing import List, Union
+
 from .charsetprober import CharSetProber
 from .enums import ProbingState
 
@@ -96,26 +98,26 @@ class Latin1Prober(CharSetProber):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
-        self._last_char_class = None
-        self._freq_counter = None
+        self._last_char_class = OTH
+        self._freq_counter: List[int] = []
         self.reset()
 
-    def reset(self):
+    def reset(self) -> None:
         self._last_char_class = OTH
         self._freq_counter = [0] * FREQ_CAT_NUM
         super().reset()
 
     @property
-    def charset_name(self):
+    def charset_name(self) -> str:
         return "ISO-8859-1"
 
     @property
-    def language(self):
+    def language(self) -> str:
         return ""
 
-    def feed(self, byte_str):
+    def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
         byte_str = self.remove_xml_tags(byte_str)
         for c in byte_str:
             char_class = Latin1_CharToClass[c]
@@ -128,7 +130,7 @@ def feed(self, byte_str):
 
         return self.state
 
-    def get_confidence(self):
+    def get_confidence(self) -> float:
         if self.state == ProbingState.NOT_ME:
             return 0.01
diff --git a/chardet/macromanprober.py b/chardet/macromanprober.py
index a7bf5b41..1425d10e 100644
--- a/chardet/macromanprober.py
+++ b/chardet/macromanprober.py
@@ -28,6 +28,8 @@
 # 02110-1301  USA
 ######################### END LICENSE BLOCK #########################
 
+from typing import List, Union
+
 from .charsetprober import CharSetProber
 from .enums import ProbingState
 
@@ -105,13 +107,13 @@ class MacRomanProber(CharSetProber):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
-        self._last_char_class = None
-        self._freq_counter = None
+        self._last_char_class = OTH
+        self._freq_counter: List[int] = []
         self.reset()
 
-    def reset(self):
+    def reset(self) -> None:
         self._last_char_class = OTH
         self._freq_counter = [0] * FREQ_CAT_NUM
 
@@ -123,14 +125,14 @@ def reset(self):
         super().reset()
 
     @property
-    def charset_name(self):
+    def charset_name(self) -> str:
         return "MacRoman"
 
     @property
-    def language(self):
+    def language(self) -> str:
         return ""
 
-    def feed(self, byte_str):
+    def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
         byte_str = self.remove_xml_tags(byte_str)
         for c in byte_str:
             char_class = MacRoman_CharToClass[c]
@@ -143,7 +145,7 @@ def feed(self, byte_str):
 
         return self.state
 
-    def get_confidence(self):
+    def get_confidence(self) -> float:
         if self.state == ProbingState.NOT_ME:
             return 0.01
diff --git a/chardet/mbcharsetprober.py b/chardet/mbcharsetprober.py
index bf96ad5d..666307e8 100644
--- a/chardet/mbcharsetprober.py
+++ b/chardet/mbcharsetprober.py
@@ -27,8 +27,12 @@
 # 02110-1301  USA
 ######################### END LICENSE BLOCK #########################
 
+from typing import Optional, Union
+
+from .chardistribution import CharDistributionAnalysis
 from .charsetprober import CharSetProber
-from .enums import MachineState, ProbingState
+from .codingstatemachine import CodingStateMachine
+from .enums import LanguageFilter, MachineState, ProbingState
 
 
 class MultiByteCharSetProber(CharSetProber):
@@ -36,29 +40,24 @@ class MultiByteCharSetProber(CharSetProber):
     MultiByteCharSetProber
     """
 
-    def __init__(self, lang_filter=None):
+    def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None:
         super().__init__(lang_filter=lang_filter)
-        self.distribution_analyzer = None
-        self.coding_sm = None
-        self._last_char = [0, 0]
+        self.distribution_analyzer: Optional[CharDistributionAnalysis] = None
+        self.coding_sm: Optional[CodingStateMachine] = None
+        self._last_char = bytearray(b"\0\0")
 
-    def reset(self):
+    def reset(self) -> None:
         super().reset()
         if self.coding_sm:
             self.coding_sm.reset()
         if self.distribution_analyzer:
             self.distribution_analyzer.reset()
-        self._last_char = [0, 0]
-
-    @property
-    def charset_name(self):
-        raise NotImplementedError
+        self._last_char = bytearray(b"\0\0")
 
-    @property
-    def language(self):
-        raise NotImplementedError
+    def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
+        assert self.coding_sm is not None
+        assert self.distribution_analyzer is not None
 
-    def feed(self, byte_str):
         for i, byte in enumerate(byte_str):
             coding_state = self.coding_sm.next_state(byte)
             if coding_state == MachineState.ERROR:
@@ -91,5 +90,6 @@ def feed(self, byte_str):
 
         return self.state
 
-    def get_confidence(self):
+    def get_confidence(self) -> float:
+        assert self.distribution_analyzer is not None
         return self.distribution_analyzer.get_confidence()
diff --git a/chardet/mbcsgroupprober.py b/chardet/mbcsgroupprober.py
index 94488360..6cb9cc7b 100644
--- a/chardet/mbcsgroupprober.py
+++ b/chardet/mbcsgroupprober.py
@@ -30,6 +30,7 @@
 from .big5prober import Big5Prober
 from .charsetgroupprober import CharSetGroupProber
 from .cp949prober import CP949Prober
+from .enums import LanguageFilter
 from .eucjpprober import EUCJPProber
 from .euckrprober import EUCKRProber
 from .euctwprober import EUCTWProber
@@ -40,7 +41,7 @@ class MBCSGroupProber(CharSetGroupProber):
-    def __init__(self, lang_filter=None):
+    def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None:
         super().__init__(lang_filter=lang_filter)
         self.probers = [
             UTF8Prober(),
diff --git a/chardet/mbcssm.py b/chardet/mbcssm.py
index d3b9c4b7..7bbe97e6 100644
--- a/chardet/mbcssm.py
+++ b/chardet/mbcssm.py
@@ -25,6 +25,7 @@
 # 02110-1301  USA
 ######################### END LICENSE BLOCK #########################
 
+from .codingstatemachinedict import CodingStateMachineDict
 from .enums import MachineState
 
 # BIG5
@@ -74,7 +75,7 @@
 
 BIG5_CHAR_LEN_TABLE = (0, 1, 1, 2, 0)
 
-BIG5_SM_MODEL = {
+BIG5_SM_MODEL: CodingStateMachineDict = {
     "class_table": BIG5_CLS,
     "class_factor": 5,
     "state_table": BIG5_ST,
@@ -117,7 +118,7 @@
 
 CP949_CHAR_LEN_TABLE = (0, 1, 2, 0, 1, 1, 2, 2, 0, 2)
 
-CP949_SM_MODEL = {
+CP949_SM_MODEL: CodingStateMachineDict = {
     "class_table": CP949_CLS,
     "class_factor": 10,
     "state_table": CP949_ST,
@@ -173,7 +174,7 @@
 
 EUCJP_CHAR_LEN_TABLE = (2, 2, 2, 3, 1, 0)
 
-EUCJP_SM_MODEL = {
+EUCJP_SM_MODEL: CodingStateMachineDict = {
     "class_table": EUCJP_CLS,
     "class_factor": 6,
     "state_table": EUCJP_ST,
@@ -226,7 +227,7 @@
 
 EUCKR_CHAR_LEN_TABLE = (0, 1, 2, 0)
 
-EUCKR_SM_MODEL = {
+EUCKR_SM_MODEL: CodingStateMachineDict = {
     "class_table": EUCKR_CLS,
     "class_factor": 4,
     "state_table": EUCKR_ST,
@@ -283,7 +284,7 @@
 
 JOHAB_CHAR_LEN_TABLE = (0, 1, 1, 1, 1, 0, 0, 2, 2, 2)
 
-JOHAB_SM_MODEL = {
+JOHAB_SM_MODEL: CodingStateMachineDict = {
     "class_table": JOHAB_CLS,
     "class_factor": 10,
     "state_table": JOHAB_ST,
@@ -340,7 +341,7 @@
 
 EUCTW_CHAR_LEN_TABLE = (0, 0, 1, 2, 2, 2, 3)
 
-EUCTW_SM_MODEL = {
+EUCTW_SM_MODEL: CodingStateMachineDict = {
     "class_table": EUCTW_CLS,
     "class_factor": 7,
     "state_table": EUCTW_ST,
@@ -402,7 +403,7 @@
 # 2 here.
 GB2312_CHAR_LEN_TABLE = (0, 1, 1, 1, 1, 1, 2)
 
-GB2312_SM_MODEL = {
+GB2312_SM_MODEL: CodingStateMachineDict = {
     "class_table": GB2312_CLS,
     "class_factor": 7,
     "state_table": GB2312_ST,
@@ -458,7 +459,7 @@
 
 SJIS_CHAR_LEN_TABLE = (0, 1, 1, 2, 0, 0)
 
-SJIS_SM_MODEL = {
+SJIS_SM_MODEL: CodingStateMachineDict = {
     "class_table": SJIS_CLS,
     "class_factor": 6,
     "state_table": SJIS_ST,
@@ -516,7 +517,7 @@
 
 UCS2BE_CHAR_LEN_TABLE = (2, 2, 2, 0, 2, 2)
 
-UCS2BE_SM_MODEL = {
+UCS2BE_SM_MODEL: CodingStateMachineDict = {
     "class_table": UCS2BE_CLS,
     "class_factor": 6,
     "state_table": UCS2BE_ST,
@@ -574,7 +575,7 @@
 
 UCS2LE_CHAR_LEN_TABLE = (2, 2, 2, 2, 2, 2)
 
-UCS2LE_SM_MODEL = {
+UCS2LE_SM_MODEL: CodingStateMachineDict = {
     "class_table": UCS2LE_CLS,
     "class_factor": 6,
     "state_table": UCS2LE_ST,
@@ -651,7 +652,7 @@
 
 UTF8_CHAR_LEN_TABLE = (0, 1, 0, 0, 0, 0, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6)
 
-UTF8_SM_MODEL = {
+UTF8_SM_MODEL: CodingStateMachineDict = {
     "class_table": UTF8_CLS,
     "class_factor": 16,
     "state_table": UTF8_ST,
diff --git a/chardet/metadata/languages.py b/chardet/metadata/languages.py
index e06ec9f5..558a8981 100644
--- a/chardet/metadata/languages.py
+++ b/chardet/metadata/languages.py
@@ -6,6 +6,7 @@
 """
 
 from string import ascii_letters
+from typing import List, Optional
 
 # TODO: Add Ukrainian (KOI8-U)
 
@@ -33,13 +34,13 @@ class Language:
 
     def __init__(
         self,
-        name=None,
-        iso_code=None,
-        use_ascii=True,
-        charsets=None,
-        alphabet=None,
-        wiki_start_pages=None,
-    ):
+        name: Optional[str] = None,
+        iso_code: Optional[str] = None,
+        use_ascii: bool = True,
+        charsets: Optional[List[str]] = None,
+        alphabet: Optional[str] = None,
+        wiki_start_pages: Optional[List[str]] = None,
+    ) -> None:
         super().__init__()
         self.name = name
         self.iso_code = iso_code
@@ -55,7 +56,7 @@ def __init__(
         self.alphabet = "".join(sorted(set(alphabet))) if alphabet else None
         self.wiki_start_pages = wiki_start_pages
 
-    def __repr__(self):
+    def __repr__(self) -> str:
         param_str = ", ".join(
             f"{k}={v!r}" for k, v in self.__dict__.items() if not k.startswith("_")
         )
diff --git a/chardet/py.typed b/chardet/py.typed
new file mode 100644
index 00000000..e69de29b
diff --git a/chardet/resultdict.py b/chardet/resultdict.py
new file mode 100644
index 00000000..7d36e64c
--- /dev/null
+++ b/chardet/resultdict.py
@@ -0,0 +1,16 @@
+from typing import TYPE_CHECKING, Optional
+
+if TYPE_CHECKING:
+    # TypedDict was introduced in Python 3.8.
+    #
+    # TODO: Remove the else block and TYPE_CHECKING check when dropping support
+    # for Python 3.7.
+    from typing import TypedDict
+
+    class ResultDict(TypedDict):
+        encoding: Optional[str]
+        confidence: float
+        language: Optional[str]
+
+else:
+    ResultDict = dict
diff --git a/chardet/sbcharsetprober.py b/chardet/sbcharsetprober.py
index 31d70e15..0ffbcdd2 100644
--- a/chardet/sbcharsetprober.py
+++ b/chardet/sbcharsetprober.py
@@ -26,23 +26,20 @@
 # 02110-1301  USA
 ######################### END LICENSE BLOCK #########################
 
-from collections import namedtuple
+from typing import Dict, List, NamedTuple, Optional, Union
 
 from .charsetprober import CharSetProber
 from .enums import CharacterCategory, ProbingState, SequenceLikelihood
 
-SingleByteCharSetModel = namedtuple(
-    "SingleByteCharSetModel",
-    [
-        "charset_name",
-        "language",
-        "char_to_order_map",
-        "language_model",
-        "typical_positive_ratio",
-        "keep_ascii_letters",
-        "alphabet",
-    ],
-)
+
+class SingleByteCharSetModel(NamedTuple):
+    charset_name: str
+    language: str
+    char_to_order_map: Dict[int, int]
+    language_model: Dict[int, Dict[int, int]]
+    typical_positive_ratio: float
+    keep_ascii_letters: bool
+    alphabet: str
 
 
 class SingleByteCharSetProber(CharSetProber):
@@ -51,22 +48,27 @@ class SingleByteCharSetProber(CharSetProber):
     POSITIVE_SHORTCUT_THRESHOLD = 0.95
     NEGATIVE_SHORTCUT_THRESHOLD = 0.05
 
-    def __init__(self, model, is_reversed=False, name_prober=None):
+    def __init__(
+        self,
+        model: SingleByteCharSetModel,
+        is_reversed: bool = False,
+        name_prober: Optional[CharSetProber] = None,
+    ) -> None:
         super().__init__()
         self._model = model
         # TRUE if we need to reverse every pair in the model lookup
         self._reversed = is_reversed
         # Optional auxiliary prober for name decision
         self._name_prober = name_prober
-        self._last_order = None
-        self._seq_counters = None
-        self._total_seqs = None
-        self._total_char = None
-        self._control_char = None
-        self._freq_char = None
+        self._last_order = 255
+        self._seq_counters: List[int] = []
+        self._total_seqs = 0
+        self._total_char = 0
+        self._control_char = 0
+        self._freq_char = 0
         self.reset()
 
-    def reset(self):
+    def reset(self) -> None:
         super().reset()
         # char order of last character
         self._last_order = 255
@@ -78,18 +80,18 @@ def reset(self):
         self._freq_char = 0
 
     @property
-    def charset_name(self):
+    def charset_name(self) -> Optional[str]:
         if self._name_prober:
             return self._name_prober.charset_name
         return self._model.charset_name
 
     @property
-    def language(self):
+    def language(self) -> Optional[str]:
         if self._name_prober:
             return self._name_prober.language
         return self._model.language
 
-    def feed(self, byte_str):
+    def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
         # TODO: Make filter_international_words keep things in self.alphabet
         if not self._model.keep_ascii_letters:
             byte_str = self.filter_international_words(byte_str)
@@ -139,7 +141,7 @@ def feed(self, byte_str):
 
         return self.state
 
-    def get_confidence(self):
+    def get_confidence(self) -> float:
         r = 0.01
         if self._total_seqs > 0:
             r = (
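Note (illustrative, hypothetical model): the namedtuple-to-typing.NamedTuple conversion above preserves the runtime tuple behavior while declaring a type per field, so mypy can check construction and attribute access. The same conversion in miniature:

    from typing import Dict, NamedTuple

    class TinyCharSetModel(NamedTuple):
        charset_name: str
        char_to_order_map: Dict[int, int]
        typical_positive_ratio: float

    model = TinyCharSetModel("ascii", {0x41: 1}, 0.95)
    print(model.charset_name, model.typical_positive_ratio)
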
diff --git a/chardet/sbcsgroupprober.py b/chardet/sbcsgroupprober.py
index cad001cb..890ae846 100644
--- a/chardet/sbcsgroupprober.py
+++ b/chardet/sbcsgroupprober.py
@@ -48,7 +48,7 @@ class SBCSGroupProber(CharSetGroupProber):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         hebrew_prober = HebrewProber()
         logical_hebrew_prober = SingleByteCharSetProber(
diff --git a/chardet/sjisprober.py b/chardet/sjisprober.py
index 3bcbdb71..91df0779 100644
--- a/chardet/sjisprober.py
+++ b/chardet/sjisprober.py
@@ -25,6 +25,8 @@
 # 02110-1301  USA
 ######################### END LICENSE BLOCK #########################
 
+from typing import Union
+
 from .chardistribution import SJISDistributionAnalysis
 from .codingstatemachine import CodingStateMachine
 from .enums import MachineState, ProbingState
@@ -34,26 +36,29 @@ class SJISProber(MultiByteCharSetProber):
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.coding_sm = CodingStateMachine(SJIS_SM_MODEL)
         self.distribution_analyzer = SJISDistributionAnalysis()
         self.context_analyzer = SJISContextAnalysis()
         self.reset()
 
-    def reset(self):
+    def reset(self) -> None:
         super().reset()
         self.context_analyzer.reset()
 
     @property
-    def charset_name(self):
+    def charset_name(self) -> str:
         return self.context_analyzer.charset_name
 
     @property
-    def language(self):
+    def language(self) -> str:
         return "Japanese"
 
-    def feed(self, byte_str):
+    def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
+        assert self.coding_sm is not None
+        assert self.distribution_analyzer is not None
+
         for i, byte in enumerate(byte_str):
             coding_state = self.coding_sm.next_state(byte)
             if coding_state == MachineState.ERROR:
@@ -92,7 +97,9 @@ def feed(self, byte_str):
 
         return self.state
 
-    def get_confidence(self):
+    def get_confidence(self) -> float:
+        assert self.distribution_analyzer is not None
+
         context_conf = self.context_analyzer.get_confidence()
         distrib_conf = self.distribution_analyzer.get_confidence()
         return max(context_conf, distrib_conf)
diff --git a/chardet/universaldetector.py b/chardet/universaldetector.py
index de2d4c0f..f94961e6 100644
--- a/chardet/universaldetector.py
+++ b/chardet/universaldetector.py
@@ -39,13 +39,16 @@ class a user of ``chardet`` should use.
 
 import codecs
 import logging
 import re
+from typing import List, Optional, Union
 
 from .charsetgroupprober import CharSetGroupProber
+from .charsetprober import CharSetProber
 from .enums import InputState, LanguageFilter, ProbingState
 from .escprober import EscCharSetProber
 from .latin1prober import Latin1Prober
 from .macromanprober import MacRomanProber
 from .mbcsgroupprober import MBCSGroupProber
+from .resultdict import ResultDict
 from .sbcsgroupprober import SBCSGroupProber
 from .utf1632prober import UTF1632Prober
@@ -82,33 +85,37 @@ class UniversalDetector:
         "iso-8859-13": "Windows-1257",
     }
 
-    def __init__(self, lang_filter=LanguageFilter.ALL):
-        self._esc_charset_prober = None
-        self._utf1632_prober = None
-        self._charset_probers = []
-        self.result = None
-        self.done = None
-        self._got_data = None
-        self._input_state = None
-        self._last_char = None
+    def __init__(self, lang_filter: LanguageFilter = LanguageFilter.ALL) -> None:
+        self._esc_charset_prober: Optional[EscCharSetProber] = None
+        self._utf1632_prober: Optional[UTF1632Prober] = None
+        self._charset_probers: List[CharSetProber] = []
+        self.result: ResultDict = {
+            "encoding": None,
+            "confidence": 0.0,
+            "language": None,
+        }
+        self.done = False
+        self._got_data = False
+        self._input_state = InputState.PURE_ASCII
+        self._last_char = b""
         self.lang_filter = lang_filter
         self.logger = logging.getLogger(__name__)
-        self._has_win_bytes = None
+        self._has_win_bytes = False
         self.reset()
 
     @property
-    def input_state(self):
+    def input_state(self) -> int:
         return self._input_state
 
    @property
-    def has_win_bytes(self):
+    def has_win_bytes(self) -> bool:
         return self._has_win_bytes
 
     @property
-    def charset_probers(self):
+    def charset_probers(self) -> List[CharSetProber]:
         return self._charset_probers
 
-    def reset(self):
+    def reset(self) -> None:
         """
         Reset the UniversalDetector and all of its probers back to their
         initial states.  This is called by ``__init__``, so you only need to
@@ -127,7 +134,7 @@ def reset(self):
         for prober in self._charset_probers:
             prober.reset()
 
-    def feed(self, byte_str):
+    def feed(self, byte_str: Union[bytes, bytearray]) -> None:
         """
         Takes a chunk of a document and feeds it through all of the relevant
         charset probers.
@@ -256,7 +263,7 @@ def feed(self, byte_str):
             if self.WIN_BYTE_DETECTOR.search(byte_str):
                 self._has_win_bytes = True
 
-    def close(self):
+    def close(self) -> ResultDict:
         """
         Stop analyzing the current document and come up with a final
         prediction.
@@ -290,7 +297,8 @@ def close(self):
                     max_prober = prober
             if max_prober and (max_prober_confidence > self.MINIMUM_THRESHOLD):
                 charset_name = max_prober.charset_name
-                lower_charset_name = max_prober.charset_name.lower()
+                assert charset_name is not None
+                lower_charset_name = charset_name.lower()
                 confidence = max_prober.get_confidence()
                 # Use Windows encoding name instead of ISO-8859 if we saw any
                 # extra Windows-specific bytes
diff --git a/chardet/utf1632prober.py b/chardet/utf1632prober.py
index 9fd1580b..6bdec63d 100644
--- a/chardet/utf1632prober.py
+++ b/chardet/utf1632prober.py
@@ -18,6 +18,8 @@
 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
 # 02110-1301  USA
 ######################### END LICENSE BLOCK #########################
 
+from typing import List, Union
+
 from .charsetprober import CharSetProber
 from .enums import ProbingState
 
@@ -36,7 +38,7 @@ class UTF1632Prober(CharSetProber):
     # a fixed constant ratio of expected zeros or non-zeros in modulo-position.
     EXPECTED_RATIO = 0.94
 
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.position = 0
         self.zeros_at_mod = [0] * 4
@@ -51,7 +53,7 @@ def __init__(self):
         self.first_half_surrogate_pair_detected_16le = False
         self.reset()
 
-    def reset(self):
+    def reset(self) -> None:
         super().reset()
         self.position = 0
         self.zeros_at_mod = [0] * 4
@@ -66,7 +68,7 @@ def reset(self):
         self.quad = [0, 0, 0, 0]
 
     @property
-    def charset_name(self):
+    def charset_name(self) -> str:
         if self.is_likely_utf32be():
             return "utf-32be"
         if self.is_likely_utf32le():
@@ -79,16 +81,16 @@ def charset_name(self):
         return "utf-16"
 
     @property
-    def language(self):
+    def language(self) -> str:
         return ""
 
-    def approx_32bit_chars(self):
+    def approx_32bit_chars(self) -> float:
         return max(1.0, self.position / 4.0)
 
-    def approx_16bit_chars(self):
+    def approx_16bit_chars(self) -> float:
         return max(1.0, self.position / 2.0)
 
-    def is_likely_utf32be(self):
+    def is_likely_utf32be(self) -> bool:
         approx_chars = self.approx_32bit_chars()
         return approx_chars >= self.MIN_CHARS_FOR_DETECTION and (
             self.zeros_at_mod[0] / approx_chars > self.EXPECTED_RATIO
@@ -98,7 +100,7 @@ def is_likely_utf32be(self):
             and not self.invalid_utf32be
         )
 
-    def is_likely_utf32le(self):
+    def is_likely_utf32le(self) -> bool:
         approx_chars = self.approx_32bit_chars()
         return approx_chars >= self.MIN_CHARS_FOR_DETECTION and (
             self.nonzeros_at_mod[0] / approx_chars > self.EXPECTED_RATIO
@@ -108,7 +110,7 @@ def is_likely_utf32le(self):
             and not self.invalid_utf32le
         )
 
-    def is_likely_utf16be(self):
+    def is_likely_utf16be(self) -> bool:
         approx_chars = self.approx_16bit_chars()
         return approx_chars >= self.MIN_CHARS_FOR_DETECTION and (
             (self.nonzeros_at_mod[1] + self.nonzeros_at_mod[3]) / approx_chars
@@ -118,7 +120,7 @@ def is_likely_utf16be(self):
             and not self.invalid_utf16be
         )
 
-    def is_likely_utf16le(self):
+    def is_likely_utf16le(self) -> bool:
         approx_chars = self.approx_16bit_chars()
         return approx_chars >= self.MIN_CHARS_FOR_DETECTION and (
             (self.nonzeros_at_mod[0] + self.nonzeros_at_mod[2]) / approx_chars
@@ -128,7 +130,7 @@ def is_likely_utf16le(self):
             and not self.invalid_utf16le
         )
 
-    def validate_utf32_characters(self, quad):
+    def validate_utf32_characters(self, quad: List[int]) -> None:
         """
         Validate if the quad of bytes is valid UTF-32.
 
@@ -150,7 +152,7 @@ def validate_utf32_characters(self, quad):
         ):
             self.invalid_utf32le = True
 
-    def validate_utf16_characters(self, pair):
+    def validate_utf16_characters(self, pair: List[int]) -> None:
         """
         Validate if the pair of bytes is valid UTF-16.
 
@@ -182,7 +184,7 @@ def validate_utf16_characters(self, pair):
             else:
                 self.invalid_utf16le = True
 
-    def feed(self, byte_str):
+    def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
         for c in byte_str:
             mod4 = self.position % 4
             self.quad[mod4] = c
@@ -198,7 +200,7 @@ def feed(self, byte_str):
         return self.state
 
     @property
-    def state(self):
+    def state(self) -> ProbingState:
        if self._state in {ProbingState.NOT_ME, ProbingState.FOUND_IT}:
             # terminal, decided states
             return self._state
@@ -210,7 +212,7 @@ def state(self):
             self._state = ProbingState.NOT_ME
         return self._state
 
-    def get_confidence(self):
+    def get_confidence(self) -> float:
         return (
             0.85
             if (
diff --git a/chardet/utf8prober.py b/chardet/utf8prober.py
index 3aae09e8..d96354d9 100644
--- a/chardet/utf8prober.py
+++ b/chardet/utf8prober.py
@@ -25,6 +25,8 @@
 # 02110-1301  USA
 ######################### END LICENSE BLOCK #########################
 
+from typing import Union
+
 from .charsetprober import CharSetProber
 from .codingstatemachine import CodingStateMachine
 from .enums import MachineState, ProbingState
@@ -34,26 +36,26 @@ class UTF8Prober(CharSetProber):
     ONE_CHAR_PROB = 0.5
 
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.coding_sm = CodingStateMachine(UTF8_SM_MODEL)
-        self._num_mb_chars = None
+        self._num_mb_chars = 0
         self.reset()
 
-    def reset(self):
+    def reset(self) -> None:
         super().reset()
         self.coding_sm.reset()
         self._num_mb_chars = 0
 
     @property
-    def charset_name(self):
+    def charset_name(self) -> str:
         return "utf-8"
 
     @property
-    def language(self):
+    def language(self) -> str:
         return ""
 
-    def feed(self, byte_str):
+    def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
         for c in byte_str:
             coding_state = self.coding_sm.next_state(c)
             if coding_state == MachineState.ERROR:
@@ -72,7 +74,7 @@ def feed(self, byte_str):
 
         return self.state
 
-    def get_confidence(self):
+    def get_confidence(self) -> float:
         unlike = 0.99
         if self._num_mb_chars < 6:
             unlike *= self.ONE_CHAR_PROB**self._num_mb_chars
diff --git a/setup.cfg b/setup.cfg
index f13efeeb..1a4a9c65 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -31,9 +31,13 @@ classifiers =
     Topic :: Text Processing :: Linguistic
 
 [options]
+include_package_data = true
 python_requires = >=3.7
 packages = find:
 
+[options.package_data]
+chardet = py.typed
+
 [options.entry_points]
 console_scripts =
     chardetect = chardet.cli.chardetect:main
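Note (illustrative, not part of the patch): `include_package_data`, the `[options.package_data]` entry, and the MANIFEST.in line together ship the empty chardet/py.typed marker (PEP 561), which is what lets type checkers resolve chardet's inline annotations from an installed package. A quick check, assuming Python 3.9+ with chardet installed:

    from importlib import resources

    # the PEP 561 marker is distributed alongside the package
    print(resources.files("chardet").joinpath("py.typed").is_file())  # True
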