Skip to content

Commit

Permalink
Efficiency improvements in cd/alphabet_languages (#122)
Browse files Browse the repository at this point in the history
* cache features of the target languages

* add LANGUAGE_SUPPORTED_COUNT

in order to keep the naming style/convention consistent with the constants above

* ✔️ Add test cases
  • Loading branch information
adbar committed Oct 23, 2021
1 parent 38cfa45 commit f1cf425
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 10 deletions.
28 changes: 19 additions & 9 deletions charset_normalizer/cd.py
Expand Up @@ -5,7 +5,7 @@
from typing import Dict, List, Optional, Tuple

from .assets import FREQUENCIES
from .constant import KO_NAMES, TOO_SMALL_SEQUENCE, ZH_NAMES
from .constant import KO_NAMES, LANGUAGE_SUPPORTED_COUNT, TOO_SMALL_SEQUENCE, ZH_NAMES
from .md import is_suspiciously_successive_range
from .models import CoherenceMatches
from .utils import (
Expand Down Expand Up @@ -110,6 +110,23 @@ def mb_encoding_languages(iana_name: str) -> List[str]:
return []


@lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
def get_target_features(language: str) -> Tuple[bool, bool]:
    """
    Determine two main aspects of a supported language: whether it contains
    accentuated characters and whether it is pure Latin.
    Results are memoized per language (the frequency table never changes at runtime).
    """
    target_have_accents = False  # type: bool
    target_pure_latin = True  # type: bool

    for character in FREQUENCIES[language]:
        if not target_have_accents and is_accentuated(character):
            target_have_accents = True
        if target_pure_latin and is_latin(character) is False:
            target_pure_latin = False
        # Both flags are monotonic (accents can only flip to True, pure-Latin
        # only to False); once both have flipped, no further character can
        # change the outcome, so stop scanning early.
        if target_have_accents and not target_pure_latin:
            break

    return target_have_accents, target_pure_latin


def alphabet_languages(
characters: List[str], ignore_non_latin: bool = False
) -> List[str]:
Expand All @@ -127,14 +144,7 @@ def alphabet_languages(

for language, language_characters in FREQUENCIES.items():

target_have_accents = False # type: bool
target_pure_latin = True # type: bool

for language_character in language_characters:
if target_have_accents is False and is_accentuated(language_character):
target_have_accents = True
if target_pure_latin is True and is_latin(language_character) is False:
target_pure_latin = False
target_have_accents, target_pure_latin = get_target_features(language)

if ignore_non_latin and target_pure_latin is False:
continue
Expand Down
4 changes: 4 additions & 0 deletions charset_normalizer/constant.py
Expand Up @@ -4,6 +4,8 @@
from re import IGNORECASE, compile as re_compile
from typing import Dict, List, Set, Union

from .assets import FREQUENCIES

# Contain for each eligible encoding a list of/item bytes SIG/BOM
ENCODING_MARKS = OrderedDict(
[
Expand Down Expand Up @@ -493,3 +495,5 @@
# IANA encoding names treated as Chinese ("zh"); Korean counterpart is KO_NAMES.
ZH_NAMES = {"big5", "cp950", "big5hkscs", "hz"}  # type: Set[str]

# Matches runs of digits, non-word characters, and whitespace controls (\n \r \t).
NOT_PRINTABLE_PATTERN = re_compile(r"[0-9\W\n\r\t]+")

# Number of languages with a frequency table; used elsewhere as an lru_cache bound.
LANGUAGE_SUPPORTED_COUNT = len(FREQUENCIES)  # type: int
20 changes: 19 additions & 1 deletion tests/test_coherence_detection.py
@@ -1,5 +1,5 @@
import pytest
from charset_normalizer.cd import encoding_languages, mb_encoding_languages, is_multi_byte_encoding
from charset_normalizer.cd import encoding_languages, mb_encoding_languages, is_multi_byte_encoding, get_target_features


@pytest.mark.parametrize(
Expand All @@ -21,3 +21,21 @@ def test_infer_language_from_cp(iana_encoding, expected_languages):

for expected_language in expected_languages:
assert expected_language in languages, "Wrongly detected language for given code page"


@pytest.mark.parametrize(
    "language, expected_have_accents, expected_pure_latin",
    [
        ("English", False, True),
        ("French", True, True),
        ("Hebrew", False, False),
        ("Arabic", False, False),
        ("Vietnamese", True, True),
        ("Turkish", True, True)
    ]
)
def test_target_features(language, expected_have_accents, expected_pure_latin):
    """Each supported language must report the expected accent / pure-Latin features."""
    have_accents, pure_latin = get_target_features(language)

    assert have_accents is expected_have_accents
    assert pure_latin is expected_pure_latin

0 comments on commit f1cf425

Please sign in to comment.