diff --git a/charset_normalizer/cd.py b/charset_normalizer/cd.py index a4512fbb..47044c03 100644 --- a/charset_normalizer/cd.py +++ b/charset_normalizer/cd.py @@ -5,7 +5,7 @@ from typing import Dict, List, Optional, Tuple from .assets import FREQUENCIES -from .constant import KO_NAMES, TOO_SMALL_SEQUENCE, ZH_NAMES +from .constant import KO_NAMES, LANGUAGE_SUPPORTED_COUNT, TOO_SMALL_SEQUENCE, ZH_NAMES from .md import is_suspiciously_successive_range from .models import CoherenceMatches from .utils import ( @@ -110,6 +110,23 @@ def mb_encoding_languages(iana_name: str) -> List[str]: return [] +@lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT) +def get_target_features(language: str) -> Tuple[bool, bool]: + """ + Determine the main aspects of a supported language: whether it contains accents and whether it is pure Latin. + """ + target_have_accents = False # type: bool + target_pure_latin = True # type: bool + + for character in FREQUENCIES[language]: + if target_have_accents is False and is_accentuated(character): + target_have_accents = True + if target_pure_latin is True and is_latin(character) is False: + target_pure_latin = False + + return target_have_accents, target_pure_latin + + def alphabet_languages( characters: List[str], ignore_non_latin: bool = False ) -> List[str]: @@ -127,14 +144,7 @@ def alphabet_languages( for language, language_characters in FREQUENCIES.items(): - target_have_accents = False # type: bool - target_pure_latin = True # type: bool - - for language_character in language_characters: - if target_have_accents is False and is_accentuated(language_character): - target_have_accents = True - if target_pure_latin is True and is_latin(language_character) is False: - target_pure_latin = False + target_have_accents, target_pure_latin = get_target_features(language) if ignore_non_latin and target_pure_latin is False: continue diff --git a/charset_normalizer/constant.py b/charset_normalizer/constant.py index d807ac6a..5f94108c 100644 --- a/charset_normalizer/constant.py 
+++ b/charset_normalizer/constant.py @@ -4,6 +4,8 @@ from re import IGNORECASE, compile as re_compile from typing import Dict, List, Set, Union +from .assets import FREQUENCIES + # Contain for each eligible encoding a list of/item bytes SIG/BOM ENCODING_MARKS = OrderedDict( [ @@ -493,3 +495,5 @@ ZH_NAMES = {"big5", "cp950", "big5hkscs", "hz"} # type: Set[str] NOT_PRINTABLE_PATTERN = re_compile(r"[0-9\W\n\r\t]+") + +LANGUAGE_SUPPORTED_COUNT = len(FREQUENCIES) # type: int diff --git a/tests/test_coherence_detection.py b/tests/test_coherence_detection.py index 7da21eea..6ad95927 100644 --- a/tests/test_coherence_detection.py +++ b/tests/test_coherence_detection.py @@ -1,5 +1,5 @@ import pytest -from charset_normalizer.cd import encoding_languages, mb_encoding_languages, is_multi_byte_encoding +from charset_normalizer.cd import encoding_languages, mb_encoding_languages, is_multi_byte_encoding, get_target_features @pytest.mark.parametrize( @@ -21,3 +21,21 @@ def test_infer_language_from_cp(iana_encoding, expected_languages): for expected_language in expected_languages: assert expected_language in languages, "Wrongly detected language for given code page" + + +@pytest.mark.parametrize( + "language, expected_have_accents, expected_pure_latin", + [ + ("English", False, True), + ("French", True, True), + ("Hebrew", False, False), + ("Arabic", False, False), + ("Vietnamese", True, True), + ("Turkish", True, True) + ] +) +def test_target_features(language, expected_have_accents, expected_pure_latin): + target_have_accents, target_pure_latin = get_target_features(language) + + assert target_have_accents is expected_have_accents + assert target_pure_latin is expected_pure_latin