From 4c69febebffa226d4cacfec403420b6c49f25c3c Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Thu, 28 Oct 2021 14:38:36 +0200 Subject: [PATCH] Code style as refactored by Sourcery (#131) Co-authored-by: Sourcery AI <> --- bin/bc.py | 7 ++--- bin/coverage.py | 7 ++--- charset_normalizer/api.py | 22 ++++++--------- charset_normalizer/cd.py | 37 +++++++++---------------- charset_normalizer/cli/normalizer.py | 29 ++++++++++---------- charset_normalizer/constant.py | 3 +- charset_normalizer/md.py | 41 +++++++++++++++------------- charset_normalizer/models.py | 3 +- charset_normalizer/utils.py | 18 ++++-------- 9 files changed, 72 insertions(+), 95 deletions(-) diff --git a/bin/bc.py b/bin/bc.py index 3049ceeb..273fcb57 100644 --- a/bin/bc.py +++ b/bin/bc.py @@ -18,11 +18,10 @@ def calc_equivalence(content: bytes, cp_a: str, cp_b: str): return 0. character_count = len(str_a) - diff_character_count = 0 + diff_character_count = sum( + chr_a != chr_b for chr_a, chr_b in zip(str_a, str_b) + ) - for chr_a, chr_b in zip(str_a, str_b): - if chr_a != chr_b: - diff_character_count += 1 return 1. - (diff_character_count / character_count) diff --git a/bin/coverage.py b/bin/coverage.py index 4003bd3c..141f9d13 100644 --- a/bin/coverage.py +++ b/bin/coverage.py @@ -15,11 +15,10 @@ def calc_equivalence(content: bytes, cp_a: str, cp_b: str): str_b = content.decode(cp_b) character_count = len(str_a) - diff_character_count = 0 + diff_character_count = sum( + chr_a != chr_b for chr_a, chr_b in zip(str_a, str_b) + ) - for chr_a, chr_b in zip(str_a, str_b): - if chr_a != chr_b: - diff_character_count += 1 return 1. - (diff_character_count / character_count) diff --git a/charset_normalizer/api.py b/charset_normalizer/api.py index dce7cf30..a42ec693 100644 --- a/charset_normalizer/api.py +++ b/charset_normalizer/api.py @@ -131,7 +131,7 @@ def from_bytes( prioritized_encodings = [] # type: List[str] specified_encoding = ( - any_specified_encoding(sequences) if preemptive_behaviour is True else None + any_specified_encoding(sequences) if preemptive_behaviour else None ) # type: Optional[str] if specified_encoding is not None: @@ -185,7 +185,7 @@ def from_bytes( encoding_iana ) # type: bool - if encoding_iana in {"utf_16", "utf_32"} and bom_or_sig_available is False: + if encoding_iana in {"utf_16", "utf_32"} and not bom_or_sig_available: logger.info( "Encoding %s wont be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.", encoding_iana, @@ -241,7 +241,7 @@ def from_bytes( continue r_ = range( - 0 if bom_or_sig_available is False else len(sig_payload), + 0 if not bom_or_sig_available else len(sig_payload), length, int(length / steps), ) @@ -261,9 +261,7 @@ def from_bytes( max_chunk_gave_up = int(len(r_) / 4) # type: int - if max_chunk_gave_up < 2: - max_chunk_gave_up = 2 - + max_chunk_gave_up = max(max_chunk_gave_up, 2) early_stop_count = 0 # type: int md_chunks = [] # type: List[str] @@ -281,9 +279,7 @@ def from_bytes( # not the cleanest way to perform that fix but clever enough for now. if is_multi_byte_decoder and i > 0 and sequences[i] >= 0x80: - chunk_partial_size_chk = ( - 16 if chunk_size > 16 else chunk_size - ) # type: int + chunk_partial_size_chk = min(chunk_size, 16) # type: int if ( decoded_payload @@ -312,11 +308,9 @@ def from_bytes( ): break - if md_ratios: - mean_mess_ratio = sum(md_ratios) / len(md_ratios) # type: float - else: - mean_mess_ratio = 0.0 - + mean_mess_ratio = ( + sum(md_ratios) / len(md_ratios) if md_ratios else 0.0 + ) # type: float if mean_mess_ratio >= threshold or early_stop_count >= max_chunk_gave_up: tested_but_soft_failure.append(encoding_iana) logger.warning( diff --git a/charset_normalizer/cd.py b/charset_normalizer/cd.py index 9053d601..8429a0eb 100644 --- a/charset_normalizer/cd.py +++ b/charset_normalizer/cd.py @@ -119,9 +119,9 @@ def get_target_features(language: str) -> Tuple[bool, bool]: target_pure_latin = True # type: bool for character in FREQUENCIES[language]: - if target_have_accents is False and is_accentuated(character): + if not target_have_accents and is_accentuated(character): target_have_accents = True - if target_pure_latin is True and is_latin(character) is False: + if target_pure_latin and is_latin(character) is False: target_pure_latin = False return target_have_accents, target_pure_latin @@ -135,12 +135,7 @@ def alphabet_languages( """ languages = [] # type: List[Tuple[str, float]] - source_have_accents = False # type: bool - - for character in characters: - if is_accentuated(character): - source_have_accents = True - break + source_have_accents = any(is_accentuated(character) for character in characters) for language, language_characters in FREQUENCIES.items(): @@ -273,8 +268,6 @@ def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches: The return type is the same as coherence_ratio. """ per_language_ratios = OrderedDict() # type: Dict[str, List[float]] - merge = [] # type: CoherenceMatches - for result in results: for sub_result in result: language, ratio = sub_result @@ -283,17 +276,16 @@ def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches: continue per_language_ratios[language].append(ratio) - for language in per_language_ratios: - merge.append( - ( - language, - round( - sum(per_language_ratios[language]) - / len(per_language_ratios[language]), - 4, - ), - ) + merge = [ + ( + language, + round( + sum(per_language_ratios[language]) / len(per_language_ratios[language]), + 4, + ), ) + for language in per_language_ratios + ] return sorted(merge, key=lambda x: x[1], reverse=True) @@ -308,14 +300,11 @@ def coherence_ratio( """ results = [] # type: List[Tuple[str, float]] - lg_inclusion_list = [] # type: List[str] ignore_non_latin = False # type: bool sufficient_match_count = 0 # type: int - if lg_inclusion is not None: - lg_inclusion_list = lg_inclusion.split(",") - + lg_inclusion_list = lg_inclusion.split(",") if lg_inclusion is not None else [] if "Latin Based" in lg_inclusion_list: ignore_non_latin = True lg_inclusion_list.remove("Latin Based") diff --git a/charset_normalizer/cli/normalizer.py b/charset_normalizer/cli/normalizer.py index f1911259..5f912c92 100644 --- a/charset_normalizer/cli/normalizer.py +++ b/charset_normalizer/cli/normalizer.py @@ -235,20 +235,19 @@ def cli_detect(argv: List[str] = None) -> int: o_.insert(-1, best_guess.encoding) if my_file.closed is False: my_file.close() - else: - if ( - args.force is False - and query_yes_no( - 'Are you sure to normalize "{}" by replacing it ?'.format( - my_file.name - ), - "no", - ) - is False - ): - if my_file.closed is False: - my_file.close() - continue + elif ( + args.force is False + and query_yes_no( + 'Are you sure to normalize "{}" by replacing it ?'.format( + my_file.name + ), + "no", + ) + is False + ): + if my_file.closed is False: + my_file.close() + continue try: x_[0].unicode_path = abspath("./{}".format(".".join(o_))) @@ -277,7 +276,7 @@ def cli_detect(argv: List[str] = None) -> int: print( ", ".join( [ - el.encoding if el.encoding else "undefined" + el.encoding or "undefined" for el in x_ if el.path == abspath(my_file.name) ] diff --git a/charset_normalizer/constant.py b/charset_normalizer/constant.py index 5f94108c..3d5d6457 100644 --- a/charset_normalizer/constant.py +++ b/charset_normalizer/constant.py @@ -32,7 +32,7 @@ UTF8_MAXIMAL_ALLOCATION = 1112064 # type: int UNICODE_RANGES_COMBINED = { - "Control character": range(0, 31 + 1), + "Control character": range(31 + 1), "Basic Latin": range(32, 127 + 1), "Latin-1 Supplement": range(128, 255 + 1), "Latin Extended-A": range(256, 383 + 1), @@ -313,6 +313,7 @@ "Variation Selectors Supplement": range(917760, 917999 + 1), } # type: Dict[str, range] + UNICODE_SECONDARY_RANGE_KEYWORD = [ "Supplement", "Extended", diff --git a/charset_normalizer/md.py b/charset_normalizer/md.py index 622d48c2..cc83f14f 100644 --- a/charset_normalizer/md.py +++ b/charset_normalizer/md.py @@ -170,15 +170,16 @@ def eligible(self, character: str) -> bool: def feed(self, character: str) -> None: self._character_count += 1 - if self._last_latin_character is not None: - if is_accentuated(character) and is_accentuated(self._last_latin_character): - if character.isupper() and self._last_latin_character.isupper(): - self._successive_count += 1 - # Worse if its the same char duplicated with different accent. - if remove_accent(character) == remove_accent( - self._last_latin_character - ): - self._successive_count += 1 + if ( + self._last_latin_character is not None + and is_accentuated(character) + and is_accentuated(self._last_latin_character) + ): + if character.isupper() and self._last_latin_character.isupper(): + self._successive_count += 1 + # Worse if its the same char duplicated with different accent. + if remove_accent(character) == remove_accent(self._last_latin_character): + self._successive_count += 1 self._last_latin_character = character def reset(self) -> None: # pragma: no cover @@ -346,7 +347,7 @@ def eligible(self, character: str) -> bool: return True def feed(self, character: str) -> None: - if character in ["丅", "丄"]: + if character in {"丅", "丄"}: self._wrong_stop_count += 1 return if is_cjk(character): @@ -459,9 +460,10 @@ def is_suspiciously_successive_range( # Latin characters can be accompanied with a combining diacritical mark # eg. Vietnamese. - if "Latin" in unicode_range_a or "Latin" in unicode_range_b: - if "Combining" in unicode_range_a or "Combining" in unicode_range_b: - return False + if ("Latin" in unicode_range_a or "Latin" in unicode_range_b) and ( + "Combining" in unicode_range_a or "Combining" in unicode_range_b + ): + return False keywords_range_a, keywords_range_b = unicode_range_a.split( " " @@ -482,11 +484,12 @@ def is_suspiciously_successive_range( ), unicode_range_b in ("Hiragana", "Katakana"), ) - if range_a_jp_chars or range_b_jp_chars: - if "CJK" in unicode_range_a or "CJK" in unicode_range_b: - return False - if range_a_jp_chars and range_b_jp_chars: - return False + if (range_a_jp_chars or range_b_jp_chars) and ( + "CJK" in unicode_range_a or "CJK" in unicode_range_b + ): + return False + if range_a_jp_chars and range_b_jp_chars: + return False if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b: if "CJK" in unicode_range_a or "CJK" in unicode_range_b: @@ -530,7 +533,7 @@ def mess_ratio( else: intermediary_mean_mess_ratio_calc = 128 - for character, index in zip(decoded_sequence + "\n", range(0, length)): + for character, index in zip(decoded_sequence + "\n", range(length)): for detector in detectors: if detector.eligible(character): detector.feed(character) diff --git a/charset_normalizer/models.py b/charset_normalizer/models.py index 68c27b89..c38da31f 100644 --- a/charset_normalizer/models.py +++ b/charset_normalizer/models.py @@ -284,8 +284,7 @@ def __init__(self, results: List[CharsetMatch] = None): self._results = sorted(results) if results else [] # type: List[CharsetMatch] def __iter__(self) -> Iterator[CharsetMatch]: - for result in self._results: - yield result + yield from self._results def __getitem__(self, item: Union[int, str]) -> CharsetMatch: """ diff --git a/charset_normalizer/utils.py b/charset_normalizer/utils.py index b9d12784..9a3d94a4 100644 --- a/charset_normalizer/utils.py +++ b/charset_normalizer/utils.py @@ -122,7 +122,7 @@ def is_emoticon(character: str) -> bool: @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION) def is_separator(character: str) -> bool: - if character.isspace() or character in ["|", "+", ",", ";", "<", ">"]: + if character.isspace() or character in {"|", "+", ",", ";", "<", ">"}: return True character_category = unicodedata.category(character) # type: str @@ -138,7 +138,7 @@ def is_case_variable(character: str) -> bool: def is_private_use_only(character: str) -> bool: character_category = unicodedata.category(character) # type: str - return "Co" == character_category + return character_category == "Co" @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION) @@ -193,11 +193,7 @@ def is_thai(character: str) -> bool: @lru_cache(maxsize=len(UNICODE_RANGES_COMBINED)) def is_unicode_range_secondary(range_name: str) -> bool: - for keyword in UNICODE_SECONDARY_RANGE_KEYWORD: - if keyword in range_name: - return True - - return False + return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD) def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional[str]: @@ -211,9 +207,7 @@ def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional results = findall( RE_POSSIBLE_ENCODING_INDICATION, - sequence[: seq_len if seq_len <= search_zone else search_zone].decode( - "ascii", errors="ignore" - ), + sequence[: min(seq_len, search_zone)].decode("ascii", errors="ignore"), ) # type: List[str] if len(results) == 0: @@ -278,7 +272,7 @@ def iana_name(cp_name: str, strict: bool = True) -> str: cp_name = cp_name.lower().replace("-", "_") for encoding_alias, encoding_iana in aliases.items(): - if cp_name == encoding_alias or cp_name == encoding_iana: + if cp_name in [encoding_alias, encoding_iana]: return encoding_iana if strict: @@ -314,7 +308,7 @@ def cp_similarity(iana_name_a: str, iana_name_b: str) -> float: character_match_count = 0 # type: int - for i in range(0, 255): + for i in range(255): to_be_decoded = bytes([i]) # type: bytes if id_a.decode(to_be_decoded) == id_b.decode(to_be_decoded): character_match_count += 1