Refactoring for potential performance improvements in loops (#113)
* reviewed encoding language associations: caches and sets defined

* use list comprehension for language association (#111)

* use list comprehension and filter in char analysis (#111)

* refactored variable inits in md.py

* models: move regex compilation to constants

* detection of Japanese characters: simplify syntax

* amend detected_ranges semantics

Co-authored-by: Aarni Koskela <akx@iki.fi>
Co-authored-by: Ahmed TAHRI <ahmed.tahri@cloudnursery.dev>
3 people committed Sep 24, 2021
1 parent 59e48eb commit 42a7d3d
Showing 4 changed files with 66 additions and 70 deletions.
13 changes: 7 additions & 6 deletions charset_normalizer/cd.py
@@ -5,6 +5,7 @@
from typing import Dict, List, Optional, Tuple

from .assets import FREQUENCIES
from .constant import KO_NAMES, ZH_NAMES
from .md import is_suspiciously_successive_range
from .models import CoherenceMatches
from .utils import is_multi_byte_encoding, is_unicode_range_secondary, unicode_range
@@ -82,6 +83,7 @@ def encoding_languages(iana_name: str) -> List[str]:
return unicode_range_languages(primary_range)


@lru_cache()
def mb_encoding_languages(iana_name: str) -> List[str]:
"""
Multi-byte encoding language association. Some code page are heavily linked to particular language(s).
@@ -94,9 +96,9 @@ def mb_encoding_languages(iana_name: str) -> List[str]:
or iana_name == "cp932"
):
return ["Japanese"]
if iana_name.startswith("gb") or iana_name in {"big5", "cp950", "big5hkscs", "hz"}:
if iana_name.startswith("gb") or iana_name in ZH_NAMES:
return ["Chinese", "Classical Chinese"]
if iana_name.startswith("iso2022_kr") or iana_name in {"johab", "cp949", "euc_kr"}:
if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
return ["Korean"]

return []
@@ -109,12 +111,11 @@ def alphabet_languages(characters: List[str]) -> List[str]:
languages = [] # type: List[str]

for language, language_characters in FREQUENCIES.items():
character_match_count = 0 # type: int
character_count = len(language_characters) # type: int

for character in language_characters:
if character in characters:
character_match_count += 1
character_match_count = len(
[c for c in language_characters if c in characters]
) # type: int

if character_match_count / character_count >= 0.2:
languages.append(language)
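
As a reading aid for the cd.py hunks above, here is a minimal, self-contained sketch of the two patterns they introduce: module-level name sets plus lru_cache() on the encoding lookup, and a comprehension for the per-language match count. The function names, the toy FREQUENCIES table and the printed example are illustrative placeholders, not the library's real data or API.

from functools import lru_cache
from typing import Dict, List, Set

# Placeholder constants mirroring KO_NAMES / ZH_NAMES (the real values live in constant.py).
ZH_NAMES = {"big5", "cp950", "big5hkscs", "hz"}  # type: Set[str]
KO_NAMES = {"johab", "cp949", "euc_kr"}  # type: Set[str]
# Toy frequency table; the real FREQUENCIES mapping is far larger.
FREQUENCIES = {"Toy": ["a", "b", "c", "d", "e"]}  # type: Dict[str, List[str]]


@lru_cache()
def guess_mb_languages(iana_name: str) -> List[str]:
    # Set membership is O(1) and the result is memoized per encoding name.
    if iana_name.startswith("gb") or iana_name in ZH_NAMES:
        return ["Chinese", "Classical Chinese"]
    if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
        return ["Korean"]
    return []


def guess_alphabet_languages(characters: List[str]) -> List[str]:
    languages = []  # type: List[str]
    for language, language_characters in FREQUENCIES.items():
        # Comprehension replaces the manual counter loop from the old code.
        character_match_count = len([c for c in language_characters if c in characters])
        if character_match_count / len(language_characters) >= 0.2:
            languages.append(language)
    return languages


print(guess_mb_languages("cp949"), guess_alphabet_languages(["a", "b", "z"]))  # ['Korean'] ['Toy']
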
27 changes: 26 additions & 1 deletion charset_normalizer/constant.py
@@ -2,7 +2,7 @@
from collections import OrderedDict
from encodings.aliases import aliases
from re import IGNORECASE, compile as re_compile
from typing import Dict, List, Union
from typing import Dict, List, Set, Union

# Contain for each eligible encoding a list of/item bytes SIG/BOM
ENCODING_MARKS = OrderedDict(
@@ -469,3 +469,28 @@
"cp1254": "Windows-1254",
"cp949": "CP949",
} # type: Dict[str, str]


COMMON_SAFE_ASCII_CHARACTERS = {
"<",
">",
"=",
":",
"/",
"&",
";",
"{",
"}",
"[",
"]",
",",
"|",
'"',
"-",
} # type: Set[str]


KO_NAMES = {"johab", "cp949", "euc_kr"} # type: Set[str]
ZH_NAMES = {"big5", "cp950", "big5hkscs", "hz"} # type: Set[str]

NOT_PRINTABLE_PATTERN = re_compile(r"[0-9\W\n\r\t]+")
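
A tiny illustration (not taken from the repository) of why NOT_PRINTABLE_PATTERN moves to constant.py: the regex is compiled once at import time and every caller reuses the compiled object instead of recompiling inside a method. The helper name and sample string below are made up.

from re import compile as re_compile, sub
from typing import List

# Compiled once at module import; shared by every call site.
NOT_PRINTABLE_PATTERN = re_compile(r"[0-9\W\n\r\t]+")


def printable_words(text: str) -> List[str]:
    # re.sub accepts a precompiled pattern, so no per-call compilation happens here.
    return sub(NOT_PRINTABLE_PATTERN, " ", text.lower()).split()


print(printable_words("Abc, def 123!"))  # expected: ['abc', 'def']
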
74 changes: 22 additions & 52 deletions charset_normalizer/md.py
@@ -1,7 +1,7 @@
from functools import lru_cache
from typing import List, Optional

from .constant import UNICODE_SECONDARY_RANGE_KEYWORD
from .constant import COMMON_SAFE_ASCII_CHARACTERS, UNICODE_SECONDARY_RANGE_KEYWORD
from .utils import (
is_accentuated,
is_ascii,
@@ -70,23 +70,10 @@ def eligible(self, character: str) -> bool:
def feed(self, character: str) -> None:
self._character_count += 1

if character != self._last_printable_char and character not in [
"<",
">",
"=",
":",
"/",
"&",
";",
"{",
"}",
"[",
"]",
",",
"|",
'"',
"-",
]:
if (
character != self._last_printable_char
and character not in COMMON_SAFE_ASCII_CHARACTERS
):
if is_punctuation(character):
self._punctuation_count += 1
elif (
@@ -153,10 +140,9 @@ def eligible(self, character: str) -> bool:

def feed(self, character: str) -> None:
if (
character not in {"\n", "\t", "\r", "\v"}
character.isspace() is False # includes \n \t \r \v
and character.isprintable() is False
and character.isspace() is False
and ord(character) != 0x1A # Why? Its the ASCII substitute character.
and character != "\x1A" # Why? Its the ASCII substitute character.
):
self._unprintable_count += 1
self._character_count += 1
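
A quick aside on the rewritten condition just above: str.isspace() already returns True for \n, \t, \r and \v, so the old literal set test folds into the single whitespace check. A hedged, standalone rendering of the new predicate (the helper name is ours, not the library's):

# Sanity check that isspace() covers the characters the old literal set listed.
assert all(c.isspace() for c in "\n\t\r\v")


def looks_unprintable(character: str) -> bool:
    # Mirrors the simplified condition: not whitespace, not printable,
    # and not the ASCII substitute character (0x1A).
    return (
        character.isspace() is False
        and character.isprintable() is False
        and character != "\x1a"
    )


print(looks_unprintable("\x00"), looks_unprintable(" "), looks_unprintable("\x1a"))  # True False False
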
@@ -223,24 +209,7 @@ def feed(self, character: str) -> None:
if (
character.isspace()
or is_punctuation(character)
or character
in [
"<",
">",
"=",
":",
"/",
"&",
";",
"{",
"}",
"[",
"]",
",",
"|",
'"',
"-",
]
or character in COMMON_SAFE_ASCII_CHARACTERS
):
self._last_printable_seen = None
return
@@ -495,18 +464,19 @@ def is_suspiciously_successive_range(
return False

# Japanese Exception
if unicode_range_a in ["Katakana", "Hiragana"] and unicode_range_b in [
"Katakana",
"Hiragana",
]:
return False

if unicode_range_a in ["Katakana", "Hiragana"] or unicode_range_b in [
"Katakana",
"Hiragana",
]:
range_a_jp_chars, range_b_jp_chars = (
unicode_range_a
in (
"Hiragana",
"Katakana",
),
unicode_range_b in ("Hiragana", "Katakana"),
)
if range_a_jp_chars or range_b_jp_chars:
if "CJK" in unicode_range_a or "CJK" in unicode_range_b:
return False
if range_a_jp_chars and range_b_jp_chars:
return False

if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b:
if "CJK" in unicode_range_a or "CJK" in unicode_range_b:
@@ -534,10 +504,10 @@ def mess_ratio(
"""
Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
"""
detectors = [] # type: List[MessDetectorPlugin]

for md_class in MessDetectorPlugin.__subclasses__():
detectors.append(md_class())
detectors = [
md_class() for md_class in MessDetectorPlugin.__subclasses__()
] # type: List[MessDetectorPlugin]

length = len(decoded_sequence) # type: int

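
And a small sketch of the plugin instantiation pattern used in mess_ratio() above: subclasses of a base class are discovered via __subclasses__() and instantiated in one comprehension. The two dummy plugin classes here are stand-ins for the real detectors.

from typing import List


class MessDetectorPlugin:
    """Minimal stand-in for the real plugin base class."""


class DummyPunctuationPlugin(MessDetectorPlugin):
    pass


class DummyUnprintablePlugin(MessDetectorPlugin):
    pass


# One comprehension instantiates every registered subclass, as in mess_ratio().
detectors = [
    md_class() for md_class in MessDetectorPlugin.__subclasses__()
]  # type: List[MessDetectorPlugin]

print([type(d).__name__ for d in detectors])  # ['DummyPunctuationPlugin', 'DummyUnprintablePlugin']
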
22 changes: 11 additions & 11 deletions charset_normalizer/models.py
@@ -3,10 +3,10 @@
from encodings.aliases import aliases
from hashlib import sha256
from json import dumps
from re import compile as re_compile, sub
from typing import Any, Dict, Iterator, List, Optional, Set, Tuple, Union
from re import sub
from typing import Any, Dict, Iterator, List, Optional, Tuple, Union

from .constant import TOO_BIG_SEQUENCE
from .constant import NOT_PRINTABLE_PATTERN, TOO_BIG_SEQUENCE
from .md import mess_ratio
from .utils import iana_name, is_multi_byte_encoding, unicode_range

@@ -102,8 +102,8 @@ def w_counter(self) -> Counter:
warnings.warn(
"w_counter is deprecated and will be removed in 3.0", DeprecationWarning
)
not_printable_pattern = re_compile(r"[0-9\W\n\r\t]+")
string_printable_only = sub(not_printable_pattern, " ", str(self).lower())

string_printable_only = sub(NOT_PRINTABLE_PATTERN, " ", str(self).lower())

return Counter(string_printable_only.split())

@@ -225,12 +225,12 @@ def has_submatch(self) -> bool:
def alphabets(self) -> List[str]:
if self._unicode_ranges is not None:
return self._unicode_ranges
detected_ranges = set() # type: Set[str]
for character in str(self):
detected_range = unicode_range(character) # type: Optional[str]
if detected_range:
detected_ranges.add(detected_range)
self._unicode_ranges = sorted(list(detected_ranges))
# list detected ranges
detected_ranges = [
unicode_range(char) for char in str(self)
] # type: List[Optional[str]]
# filter and sort
self._unicode_ranges = sorted([r for r in detected_ranges if r]) # type: ignore
return self._unicode_ranges

@property
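
Finally, a standalone sketch of the amended detected_ranges logic in alphabets(): map every character to its Unicode range, then filter out the None results and sort. The toy_unicode_range() lookup below is a rough stand-in for the library's unicode_range() helper and only models two ranges.

from typing import List, Optional


def toy_unicode_range(character: str) -> Optional[str]:
    # Very rough stand-in for utils.unicode_range(); only two ranges are modelled.
    code_point = ord(character)
    if 0x0000 <= code_point <= 0x007F:
        return "Basic Latin"
    if 0x0370 <= code_point <= 0x03FF:
        return "Greek and Coptic"
    return None  # unknown to this toy lookup


def detected_alphabets(text: str) -> List[str]:
    # Map every character to a range (possibly None) ...
    detected_ranges = [toy_unicode_range(char) for char in text]  # type: List[Optional[str]]
    # ... then drop the None entries and sort, as the amended property does.
    return sorted([r for r in detected_ranges if r])


print(detected_alphabets("aβ"))  # ['Basic Latin', 'Greek and Coptic']
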
