Skip to content

Commit

Permalink
Efficiency improvements in cd/alphabet_languages (#122)
Browse files Browse the repository at this point in the history
* cache features of the target languages

* add LANGUAGE_SUPPORTED_COUNT

in order to keep the naming style/convention consistent with the constants above

* ✔️ Add test cases
  • Loading branch information
adbar committed Oct 23, 2021
1 parent 38cfa45 commit f1cf425
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 10 deletions.
28 changes: 19 additions & 9 deletions charset_normalizer/cd.py
Expand Up @@ -5,7 +5,7 @@
from typing import Dict, List, Optional, Tuple

from .assets import FREQUENCIES
from .constant import KO_NAMES, TOO_SMALL_SEQUENCE, ZH_NAMES
from .constant import KO_NAMES, LANGUAGE_SUPPORTED_COUNT, TOO_SMALL_SEQUENCE, ZH_NAMES
from .md import is_suspiciously_successive_range
from .models import CoherenceMatches
from .utils import (
Expand Down Expand Up @@ -110,6 +110,23 @@ def mb_encoding_languages(iana_name: str) -> List[str]:
return []


@lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
def get_target_features(language: str) -> Tuple[bool, bool]:
    """
    Determine two main aspects of a supported language: whether it contains
    accentuated characters and whether it is pure Latin.
    Results are memoized per language (the frequency table never changes at runtime).
    """
    target_have_accents = False  # type: bool
    target_pure_latin = True  # type: bool

    for character in FREQUENCIES[language]:
        if not target_have_accents and is_accentuated(character):
            target_have_accents = True
        if target_pure_latin and is_latin(character) is False:
            target_pure_latin = False
        # Both flags are monotonic (accents can only flip to True, pure-Latin
        # only to False); once both have flipped, no further character can
        # change the outcome, so stop scanning early.
        if target_have_accents and not target_pure_latin:
            break

    return target_have_accents, target_pure_latin


def alphabet_languages(
characters: List[str], ignore_non_latin: bool = False
) -> List[str]:
Expand All @@ -127,14 +144,7 @@ def alphabet_languages(

for language, language_characters in FREQUENCIES.items():

target_have_accents = False # type: bool
target_pure_latin = True # type: bool

for language_character in language_characters:
if target_have_accents is False and is_accentuated(language_character):
target_have_accents = True
if target_pure_latin is True and is_latin(language_character) is False:
target_pure_latin = False
target_have_accents, target_pure_latin = get_target_features(language)

if ignore_non_latin and target_pure_latin is False:
continue
Expand Down
4 changes: 4 additions & 0 deletions charset_normalizer/constant.py
Expand Up @@ -4,6 +4,8 @@
from re import IGNORECASE, compile as re_compile
from typing import Dict, List, Set, Union

from .assets import FREQUENCIES

# Contain for each eligible encoding a list of/item bytes SIG/BOM
ENCODING_MARKS = OrderedDict(
[
Expand Down Expand Up @@ -493,3 +495,5 @@
# IANA encoding names treated as Chinese ("zh"); Korean counterpart is KO_NAMES.
ZH_NAMES = {"big5", "cp950", "big5hkscs", "hz"}  # type: Set[str]

# Matches runs of digits, non-word characters, and whitespace controls (\n \r \t).
NOT_PRINTABLE_PATTERN = re_compile(r"[0-9\W\n\r\t]+")

# Number of languages with a frequency table; used elsewhere as an lru_cache bound.
LANGUAGE_SUPPORTED_COUNT = len(FREQUENCIES)  # type: int
20 changes: 19 additions & 1 deletion tests/test_coherence_detection.py
@@ -1,5 +1,5 @@
import pytest
from charset_normalizer.cd import encoding_languages, mb_encoding_languages, is_multi_byte_encoding
from charset_normalizer.cd import encoding_languages, mb_encoding_languages, is_multi_byte_encoding, get_target_features


@pytest.mark.parametrize(
Expand All @@ -21,3 +21,21 @@ def test_infer_language_from_cp(iana_encoding, expected_languages):

for expected_language in expected_languages:
assert expected_language in languages, "Wrongly detected language for given code page"


@pytest.mark.parametrize(
    "language, expected_have_accents, expected_pure_latin",
    [
        ("English", False, True),
        ("French", True, True),
        ("Hebrew", False, False),
        ("Arabic", False, False),
        ("Vietnamese", True, True),
        ("Turkish", True, True)
    ]
)
def test_target_features(language, expected_have_accents, expected_pure_latin):
    """Each supported language must report the expected accent / pure-Latin features."""
    have_accents, pure_latin = get_target_features(language)

    assert have_accents is expected_have_accents
    assert pure_latin is expected_pure_latin

0 comments on commit f1cf425

Please sign in to comment.