Code style as refactored by Sourcery #131

Merged
7 commits merged on Oct 28, 2021
7 changes: 3 additions & 4 deletions bin/bc.py
@@ -18,11 +18,10 @@ def calc_equivalence(content: bytes, cp_a: str, cp_b: str):
         return 0.

     character_count = len(str_a)
-    diff_character_count = 0
+    diff_character_count = sum(
+        chr_a != chr_b for chr_a, chr_b in zip(str_a, str_b)
+    )

-    for chr_a, chr_b in zip(str_a, str_b):
-        if chr_a != chr_b:
-            diff_character_count += 1

     return 1. - (diff_character_count / character_count)

7 changes: 3 additions & 4 deletions bin/coverage.py
@@ -15,11 +15,10 @@ def calc_equivalence(content: bytes, cp_a: str, cp_b: str):
     str_b = content.decode(cp_b)

     character_count = len(str_a)
-    diff_character_count = 0
+    diff_character_count = sum(
+        chr_a != chr_b for chr_a, chr_b in zip(str_a, str_b)
+    )

-    for chr_a, chr_b in zip(str_a, str_b):
-        if chr_a != chr_b:
-            diff_character_count += 1

     return 1. - (diff_character_count / character_count)

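Both bin/bc.py and bin/coverage.py get the same Sourcery rewrite: the hand-rolled mismatch counter becomes a sum() over a generator of booleans, where True counts as 1. A quick standalone check of the equivalence — the function names and sample strings below are illustrative, not part of the PR:

```python
def count_mismatches_loop(str_a: str, str_b: str) -> int:
    # Original style: explicit counter incremented inside a loop.
    count = 0
    for chr_a, chr_b in zip(str_a, str_b):
        if chr_a != chr_b:
            count += 1
    return count


def count_mismatches_sum(str_a: str, str_b: str) -> int:
    # Refactored style: booleans are summed directly (True == 1, False == 0).
    return sum(chr_a != chr_b for chr_a, chr_b in zip(str_a, str_b))


assert count_mismatches_loop("kitten", "sitten") == count_mismatches_sum("kitten", "sitten") == 1
```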
22 changes: 8 additions & 14 deletions charset_normalizer/api.py
@@ -131,7 +131,7 @@ def from_bytes(
     prioritized_encodings = []  # type: List[str]

     specified_encoding = (
-        any_specified_encoding(sequences) if preemptive_behaviour is True else None
+        any_specified_encoding(sequences) if preemptive_behaviour else None
     )  # type: Optional[str]

     if specified_encoding is not None:
@@ -185,7 +185,7 @@ def from_bytes(
             encoding_iana
         )  # type: bool

-        if encoding_iana in {"utf_16", "utf_32"} and bom_or_sig_available is False:
+        if encoding_iana in {"utf_16", "utf_32"} and not bom_or_sig_available:
             logger.info(
                 "Encoding %s wont be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.",
                 encoding_iana,
@@ -241,7 +241,7 @@ def from_bytes(
                 continue

         r_ = range(
-            0 if bom_or_sig_available is False else len(sig_payload),
+            0 if not bom_or_sig_available else len(sig_payload),
             length,
             int(length / steps),
         )
@@ -261,9 +261,7 @@ def from_bytes(

         max_chunk_gave_up = int(len(r_) / 4)  # type: int

-        if max_chunk_gave_up < 2:
-            max_chunk_gave_up = 2
-
+        max_chunk_gave_up = max(max_chunk_gave_up, 2)
         early_stop_count = 0  # type: int

         md_chunks = []  # type: List[str]
@@ -281,9 +279,7 @@ def from_bytes(
             # not the cleanest way to perform that fix but clever enough for now.
             if is_multi_byte_decoder and i > 0 and sequences[i] >= 0x80:

-                chunk_partial_size_chk = (
-                    16 if chunk_size > 16 else chunk_size
-                )  # type: int
+                chunk_partial_size_chk = min(chunk_size, 16)  # type: int

                 if (
                     decoded_payload
@@ -312,11 +308,9 @@ def from_bytes(
                 ):
                     break

-        if md_ratios:
-            mean_mess_ratio = sum(md_ratios) / len(md_ratios)  # type: float
-        else:
-            mean_mess_ratio = 0.0
-
+        mean_mess_ratio = (
+            sum(md_ratios) / len(md_ratios) if md_ratios else 0.0
+        )  # type: float
         if mean_mess_ratio >= threshold or early_stop_count >= max_chunk_gave_up:
             tested_but_soft_failure.append(encoding_iana)
             logger.warning(
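The api.py edits are all one family of simplification: redundant `is True` / `is False` comparisons are dropped, and small if/else blocks are folded into max(), min(), or a conditional expression. A minimal sketch of the clamping idiom — the helper names and values are illustrative, not the PR's:

```python
def clamp_chunk_budget(chunk_count: int) -> int:
    # Before: if budget < 2: budget = 2 -- after: max() states the floor directly.
    return max(int(chunk_count / 4), 2)


def partial_chunk_size(chunk_size: int) -> int:
    # min() caps a value the same way max() floors one.
    return min(chunk_size, 16)


assert clamp_chunk_budget(4) == 2
assert clamp_chunk_budget(40) == 10
assert partial_chunk_size(64) == 16
assert partial_chunk_size(8) == 8
```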
37 changes: 13 additions & 24 deletions charset_normalizer/cd.py
@@ -119,9 +119,9 @@ def get_target_features(language: str) -> Tuple[bool, bool]:
     target_pure_latin = True  # type: bool

     for character in FREQUENCIES[language]:
-        if target_have_accents is False and is_accentuated(character):
+        if not target_have_accents and is_accentuated(character):
             target_have_accents = True
-        if target_pure_latin is True and is_latin(character) is False:
+        if target_pure_latin and is_latin(character) is False:
             target_pure_latin = False

     return target_have_accents, target_pure_latin
@@ -135,12 +135,7 @@ def alphabet_languages(
     """
     languages = []  # type: List[Tuple[str, float]]

-    source_have_accents = False  # type: bool
-
-    for character in characters:
-        if is_accentuated(character):
-            source_have_accents = True
-            break
+    source_have_accents = any(is_accentuated(character) for character in characters)

     for language, language_characters in FREQUENCIES.items():

@@ -273,8 +268,6 @@ def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches:
     The return type is the same as coherence_ratio.
     """
     per_language_ratios = OrderedDict()  # type: Dict[str, List[float]]
-    merge = []  # type: CoherenceMatches
-
     for result in results:
         for sub_result in result:
             language, ratio = sub_result
@@ -283,17 +276,16 @@ def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches:
                 continue
             per_language_ratios[language].append(ratio)

-    for language in per_language_ratios:
-        merge.append(
-            (
-                language,
-                round(
-                    sum(per_language_ratios[language])
-                    / len(per_language_ratios[language]),
-                    4,
-                ),
-            )
-        )
+    merge = [
+        (
+            language,
+            round(
+                sum(per_language_ratios[language]) / len(per_language_ratios[language]),
+                4,
+            ),
+        )
+        for language in per_language_ratios
+    ]

     return sorted(merge, key=lambda x: x[1], reverse=True)

@@ -308,14 +300,11 @@ def coherence_ratio(
     """

     results = []  # type: List[Tuple[str, float]]
-    lg_inclusion_list = []  # type: List[str]
     ignore_non_latin = False  # type: bool

     sufficient_match_count = 0  # type: int

-    if lg_inclusion is not None:
-        lg_inclusion_list = lg_inclusion.split(",")
-
+    lg_inclusion_list = lg_inclusion.split(",") if lg_inclusion is not None else []
     if "Latin Based" in lg_inclusion_list:
         ignore_non_latin = True
         lg_inclusion_list.remove("Latin Based")
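Two rewrites recur in cd.py: a flag-and-break loop becomes a short-circuiting any(), and a build-by-append loop becomes a list comprehension. A standalone sketch under made-up names and data — the accent check is a toy stand-in for is_accentuated, not the module's real logic:

```python
from collections import OrderedDict
from typing import Dict, List, Tuple


def has_accented_character(characters: str) -> bool:
    # any() short-circuits on the first hit, like the old flag-and-break loop.
    return any(character in "éàçñü" for character in characters)


def average_ratios(per_language: Dict[str, List[float]]) -> List[Tuple[str, float]]:
    # Build-by-append loop folded into a comprehension, then sorted by ratio,
    # mirroring the merge_coherence_ratios rewrite.
    merged = [
        (language, round(sum(ratios) / len(ratios), 4))
        for language, ratios in per_language.items()
    ]
    return sorted(merged, key=lambda item: item[1], reverse=True)


assert has_accented_character("café") and not has_accented_character("cafe")
ratios = OrderedDict([("English", [0.9, 0.7]), ("French", [0.95])])
assert average_ratios(ratios) == [("French", 0.95), ("English", 0.8)]
```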
29 changes: 14 additions & 15 deletions charset_normalizer/cli/normalizer.py
@@ -235,20 +235,19 @@ def cli_detect(argv: List[str] = None) -> int:
                 o_.insert(-1, best_guess.encoding)
                 if my_file.closed is False:
                     my_file.close()
-            else:
-                if (
-                    args.force is False
-                    and query_yes_no(
-                        'Are you sure to normalize "{}" by replacing it ?'.format(
-                            my_file.name
-                        ),
-                        "no",
-                    )
-                    is False
-                ):
-                    if my_file.closed is False:
-                        my_file.close()
-                    continue
+            elif (
+                args.force is False
+                and query_yes_no(
+                    'Are you sure to normalize "{}" by replacing it ?'.format(
+                        my_file.name
+                    ),
+                    "no",
+                )
+                is False
+            ):
+                if my_file.closed is False:
+                    my_file.close()
+                continue

             try:
                 x_[0].unicode_path = abspath("./{}".format(".".join(o_)))
@@ -277,7 +276,7 @@ def cli_detect(argv: List[str] = None) -> int:
             print(
                 ", ".join(
                     [
-                        el.encoding if el.encoding else "undefined"
+                        el.encoding or "undefined"
                         for el in x_
                         if el.path == abspath(my_file.name)
                     ]
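The `el.encoding if el.encoding else "undefined"` to `el.encoding or "undefined"` change relies on `or` returning its right operand for any falsy left operand; the two forms agree for every input, and here the attribute is expected to be a non-empty encoding name or None. A tiny illustrative check — the helper name is made up:

```python
def encoding_label(encoding):
    # "x if x else y" and "x or y" pick the same value for any falsy x,
    # including None and "".
    return encoding or "undefined"


assert encoding_label("utf_8") == "utf_8"
assert encoding_label(None) == "undefined"
assert encoding_label("") == "undefined"
```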
3 changes: 2 additions & 1 deletion charset_normalizer/constant.py
@@ -32,7 +32,7 @@
 UTF8_MAXIMAL_ALLOCATION = 1112064  # type: int

 UNICODE_RANGES_COMBINED = {
-    "Control character": range(0, 31 + 1),
+    "Control character": range(31 + 1),
     "Basic Latin": range(32, 127 + 1),
     "Latin-1 Supplement": range(128, 255 + 1),
     "Latin Extended-A": range(256, 383 + 1),
@@ -313,6 +313,7 @@
     "Variation Selectors Supplement": range(917760, 917999 + 1),
 }  # type: Dict[str, range]

+
 UNICODE_SECONDARY_RANGE_KEYWORD = [
     "Supplement",
     "Extended",
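`range(31 + 1)` describes the same sequence as `range(0, 31 + 1)`: a start of 0 is the default, and range objects compare equal when they cover the same values. A quick check with illustrative numbers:

```python
assert range(31 + 1) == range(0, 31 + 1)
assert list(range(3)) == [0, 1, 2]
```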
41 changes: 22 additions & 19 deletions charset_normalizer/md.py
@@ -170,15 +170,16 @@ def eligible(self, character: str) -> bool:

     def feed(self, character: str) -> None:
         self._character_count += 1
-        if self._last_latin_character is not None:
-            if is_accentuated(character) and is_accentuated(self._last_latin_character):
-                if character.isupper() and self._last_latin_character.isupper():
-                    self._successive_count += 1
-                # Worse if its the same char duplicated with different accent.
-                if remove_accent(character) == remove_accent(
-                    self._last_latin_character
-                ):
-                    self._successive_count += 1
+        if (
+            self._last_latin_character is not None
+            and is_accentuated(character)
+            and is_accentuated(self._last_latin_character)
+        ):
+            if character.isupper() and self._last_latin_character.isupper():
+                self._successive_count += 1
+            # Worse if its the same char duplicated with different accent.
+            if remove_accent(character) == remove_accent(self._last_latin_character):
+                self._successive_count += 1
         self._last_latin_character = character

     def reset(self) -> None:  # pragma: no cover
@@ -346,7 +347,7 @@ def eligible(self, character: str) -> bool:
         return True

     def feed(self, character: str) -> None:
-        if character in ["丅", "丄"]:
+        if character in {"丅", "丄"}:
             self._wrong_stop_count += 1
             return
         if is_cjk(character):
@@ -459,9 +460,10 @@ def is_suspiciously_successive_range(

     # Latin characters can be accompanied with a combining diacritical mark
     # eg. Vietnamese.
-    if "Latin" in unicode_range_a or "Latin" in unicode_range_b:
-        if "Combining" in unicode_range_a or "Combining" in unicode_range_b:
-            return False
+    if ("Latin" in unicode_range_a or "Latin" in unicode_range_b) and (
+        "Combining" in unicode_range_a or "Combining" in unicode_range_b
+    ):
+        return False

     keywords_range_a, keywords_range_b = unicode_range_a.split(
         " "
@@ -482,11 +484,12 @@
         ),
         unicode_range_b in ("Hiragana", "Katakana"),
     )
-    if range_a_jp_chars or range_b_jp_chars:
-        if "CJK" in unicode_range_a or "CJK" in unicode_range_b:
-            return False
-    if range_a_jp_chars and range_b_jp_chars:
-        return False
+    if (range_a_jp_chars or range_b_jp_chars) and (
+        "CJK" in unicode_range_a or "CJK" in unicode_range_b
+    ):
+        return False
+    if range_a_jp_chars and range_b_jp_chars:
+        return False

     if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b:
         if "CJK" in unicode_range_a or "CJK" in unicode_range_b:
@@ -530,7 +533,7 @@ def mess_ratio(
     else:
         intermediary_mean_mess_ratio_calc = 128

-    for character, index in zip(decoded_sequence + "\n", range(0, length)):
+    for character, index in zip(decoded_sequence + "\n", range(length)):
         for detector in detectors:
             if detector.eligible(character):
                 detector.feed(character)
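The md.py changes mostly flatten nested guards into one condition joined with `and`, which short-circuits exactly as the nesting did, and swap list literals for set literals in membership tests. A small equivalence sketch with hypothetical names, not the detector classes themselves:

```python
from itertools import product


def flag_nested(a: bool, b: bool, c: bool) -> bool:
    # Nested guards, mirroring the original feed() structure.
    if a:
        if b and c:
            return True
    return False


def flag_flat(a: bool, b: bool, c: bool) -> bool:
    # Flattened guard: `and` short-circuits, so b and c are only
    # evaluated when a is truthy, exactly like the nested version.
    return bool(a and b and c)


assert all(
    flag_nested(*combo) == flag_flat(*combo)
    for combo in product([True, False], repeat=3)
)
```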
3 changes: 1 addition & 2 deletions charset_normalizer/models.py
@@ -284,8 +284,7 @@ def __init__(self, results: List[CharsetMatch] = None):
         self._results = sorted(results) if results else []  # type: List[CharsetMatch]

     def __iter__(self) -> Iterator[CharsetMatch]:
-        for result in self._results:
-            yield result
+        yield from self._results

     def __getitem__(self, item: Union[int, str]) -> CharsetMatch:
         """
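`yield from` delegates iteration to the wrapped list, replacing the explicit for/yield loop with identical behaviour. A self-contained sketch with a hypothetical container, not the real CharsetMatches class:

```python
from typing import Iterator, List


class Matches:
    def __init__(self, results: List[str]) -> None:
        self._results = results

    def __iter__(self) -> Iterator[str]:
        # Equivalent to: for result in self._results: yield result
        yield from self._results


assert list(Matches(["utf_8", "latin_1"])) == ["utf_8", "latin_1"]
```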
18 changes: 6 additions & 12 deletions charset_normalizer/utils.py
@@ -122,7 +122,7 @@ def is_emoticon(character: str) -> bool:

 @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
 def is_separator(character: str) -> bool:
-    if character.isspace() or character in ["|", "+", ",", ";", "<", ">"]:
+    if character.isspace() or character in {"|", "+", ",", ";", "<", ">"}:
         return True

     character_category = unicodedata.category(character)  # type: str
@@ -138,7 +138,7 @@ def is_case_variable(character: str) -> bool:
 def is_private_use_only(character: str) -> bool:
     character_category = unicodedata.category(character)  # type: str

-    return "Co" == character_category
+    return character_category == "Co"


 @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
@@ -193,11 +193,7 @@ def is_thai(character: str) -> bool:

 @lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
 def is_unicode_range_secondary(range_name: str) -> bool:
-    for keyword in UNICODE_SECONDARY_RANGE_KEYWORD:
-        if keyword in range_name:
-            return True
-
-    return False
+    return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD)


 def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional[str]:
@@ -211,9 +207,7 @@ def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional

     results = findall(
         RE_POSSIBLE_ENCODING_INDICATION,
-        sequence[: seq_len if seq_len <= search_zone else search_zone].decode(
-            "ascii", errors="ignore"
-        ),
+        sequence[: min(seq_len, search_zone)].decode("ascii", errors="ignore"),
     )  # type: List[str]

     if len(results) == 0:
@@ -278,7 +272,7 @@ def iana_name(cp_name: str, strict: bool = True) -> str:
     cp_name = cp_name.lower().replace("-", "_")

     for encoding_alias, encoding_iana in aliases.items():
-        if cp_name == encoding_alias or cp_name == encoding_iana:
+        if cp_name in [encoding_alias, encoding_iana]:
             return encoding_iana

     if strict:
@@ -314,7 +308,7 @@ def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:

     character_match_count = 0  # type: int

-    for i in range(0, 255):
+    for i in range(255):
         to_be_decoded = bytes([i])  # type: bytes
         if id_a.decode(to_be_decoded) == id_b.decode(to_be_decoded):
             character_match_count += 1
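The utils.py edits swap list literals for set literals in membership tests (hash lookup instead of a linear scan) and let min() pick the slice bound instead of a conditional expression. A standalone sketch; the helper names and sample bytes are illustrative, not the library's API:

```python
SEPARATORS = {"|", "+", ",", ";", "<", ">"}  # set literal: O(1) membership test


def looks_like_separator(character: str) -> bool:
    # Same result as the old list-based check, but hashing instead of scanning.
    return character.isspace() or character in SEPARATORS


def clipped_prefix(payload: bytes, search_zone: int = 4096) -> str:
    # min() replaces the "x if x <= limit else limit" slice bound.
    return payload[: min(len(payload), search_zone)].decode("ascii", errors="ignore")


assert looks_like_separator(";") and looks_like_separator(" ")
assert clipped_prefix(b"charset=utf-8") == "charset=utf-8"
```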