Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactoring for potential performance improvements in loops #113

Merged
merged 19 commits on Sep 24, 2021
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
17 changes: 10 additions & 7 deletions charset_normalizer/cd.py
Expand Up @@ -62,7 +62,7 @@ def unicode_range_languages(primary_range: str) -> List[str]:
return languages


@lru_cache()
@lru_cache(maxsize=4096) # up to 2999 IANA char sets
adbar marked this conversation as resolved.
Show resolved Hide resolved
def encoding_languages(iana_name: str) -> List[str]:
"""
Single-byte encoding language association. Some code page are heavily linked to particular language(s).
Expand All @@ -82,21 +82,26 @@ def encoding_languages(iana_name: str) -> List[str]:
return unicode_range_languages(primary_range)


# Code pages strongly associated with Chinese and Korean, respectively.
# Hoisted to module level so the sets are built once, not per call.
ZH_NAMES = {"big5", "cp950", "big5hkscs", "hz"}
KO_NAMES = {"johab", "cp949", "euc_kr"}


@lru_cache(maxsize=4096)  # up to 2999 IANA char sets
def mb_encoding_languages(iana_name: str) -> List[str]:
    """
    Multi-byte encoding language association. Some code pages are heavily
    linked to particular language(s). This function does the correspondence.

    :param iana_name: normalized IANA encoding name (e.g. "cp932", "euc_kr")
    :return: list of associated language names; empty list when no known
             association exists for this encoding.
    """
    # str.startswith accepts a tuple of prefixes: one call instead of a chain
    # of Python-level `or` tests.
    # todo: order conditions by general language/encoding frequency?
    if iana_name.startswith(("shift_", "iso2022_jp", "euc_j")) or iana_name == "cp932":
        return ["Japanese"]
    if iana_name.startswith("gb") or iana_name in ZH_NAMES:
        return ["Chinese", "Classical Chinese"]
    if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
        return ["Korean"]

    return []
Expand All @@ -109,12 +114,10 @@ def alphabet_languages(characters: List[str]) -> List[str]:
languages = [] # type: List[str]

for language, language_characters in FREQUENCIES.items():
character_match_count = 0 # type: int
character_count = len(language_characters) # type: int

for character in language_characters:
if character in characters:
character_match_count += 1
# type: int
adbar marked this conversation as resolved.
Show resolved Hide resolved
character_match_count = len([c for c in language_characters if c in characters])

if character_match_count / character_count >= 0.2:
languages.append(language)
Expand Down
76 changes: 29 additions & 47 deletions charset_normalizer/md.py
Expand Up @@ -21,6 +21,25 @@
)


# Characters frequently seen in markup/structured text rather than natural
# prose; shared by several mess-detection plugins. Built from a single string
# literal — same 15-element set as spelling each character out.
SUSPICIOUS = set('<>=:/&;{}[],|"-')


class MessDetectorPlugin:
"""
Base abstract class used for mess detection plugins.
Expand Down Expand Up @@ -70,23 +89,7 @@ def eligible(self, character: str) -> bool:
def feed(self, character: str) -> None:
self._character_count += 1

if character != self._last_printable_char and character not in [
"<",
">",
"=",
":",
"/",
"&",
";",
"{",
"}",
"[",
"]",
",",
"|",
'"',
"-",
]:
if character != self._last_printable_char and character not in SUSPICIOUS:
if is_punctuation(character):
self._punctuation_count += 1
elif (
Expand Down Expand Up @@ -153,9 +156,8 @@ def eligible(self, character: str) -> bool:

def feed(self, character: str) -> None:
if (
character not in {"\n", "\t", "\r", "\v"}
character.isspace() is False # includes \n \t \r \v
adbar marked this conversation as resolved.
Show resolved Hide resolved
Ousret marked this conversation as resolved.
Show resolved Hide resolved
and character.isprintable() is False
and character.isspace() is False
and ord(character) != 0x1A # Why? Its the ASCII substitute character.
adbar marked this conversation as resolved.
Show resolved Hide resolved
):
self._unprintable_count += 1
Expand Down Expand Up @@ -223,24 +225,7 @@ def feed(self, character: str) -> None:
if (
character.isspace()
or is_punctuation(character)
or character
in [
"<",
">",
"=",
":",
"/",
"&",
";",
"{",
"}",
"[",
"]",
",",
"|",
'"',
"-",
]
or character in SUSPICIOUS
):
self._last_printable_seen = None
return
Expand Down Expand Up @@ -495,18 +480,17 @@ def is_suspiciously_successive_range(
return False

# Japanese Exception
if unicode_range_a in ["Katakana", "Hiragana"] and unicode_range_b in [
"Katakana",
"Hiragana",
]:
return False

if unicode_range_a in ["Katakana", "Hiragana"] or unicode_range_b in [
"Katakana",
"Hiragana",
]:
if "CJK" in unicode_range_a or "CJK" in unicode_range_b:
return False
if unicode_range_a in ["Katakana", "Hiragana"] and unicode_range_b in [
"Katakana",
"Hiragana",
]:
adbar marked this conversation as resolved.
Show resolved Hide resolved
return False

if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b:
if "CJK" in unicode_range_a or "CJK" in unicode_range_b:
Expand All @@ -533,11 +517,9 @@ def mess_ratio(
) -> float:
"""
Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
"""
detectors = [] # type: List[MessDetectorPlugin]
"""

for md_class in MessDetectorPlugin.__subclasses__():
detectors.append(md_class())
detectors = [md_class() for md_class in MessDetectorPlugin.__subclasses__()] # type: List[MessDetectorPlugin]

length = len(decoded_sequence) # type: int

Expand Down
10 changes: 4 additions & 6 deletions charset_normalizer/models.py
Expand Up @@ -225,12 +225,10 @@ def has_submatch(self) -> bool:
def alphabets(self) -> List[str]:
    """
    Sorted list of the distinct Unicode range names detected in the decoded
    payload. Computed on first access, then cached on `self._unicode_ranges`.
    """
    if self._unicode_ranges is not None:
        return self._unicode_ranges
    # Deduplicate with a set: unicode_range() is called once per character,
    # so a plain list would hold one entry per character, not per range.
    # `r is not None` drops characters outside any known range — clearer and
    # safer than filter(None.__ne__, ...), which only works because the
    # NotImplemented it returns for non-None values happens to be truthy.
    detected_ranges = {
        r for r in (unicode_range(character) for character in str(self)) if r is not None
    }
    self._unicode_ranges = sorted(detected_ranges)
    return self._unicode_ranges

@property
Expand Down