Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Efficiency improvements in cd/alphabet_languages #122

Merged
merged 16 commits into from Oct 23, 2021
Merged
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
29 changes: 20 additions & 9 deletions charset_normalizer/cd.py
Expand Up @@ -110,6 +110,24 @@ def mb_encoding_languages(iana_name: str) -> List[str]:
return []


@lru_cache(maxsize=64)
def get_target_features(language: str) -> Tuple[bool, bool]:
    """
    Cache features (accents, latin) of the target language to be tested against
    the input string.

    The characters listed under ``FREQUENCIES[language]`` are scanned and two
    flags are returned as ``(target_have_accents, target_pure_latin)``:

    - ``target_have_accents`` is True when ``is_accentuated`` accepts at
      least one of the characters.
    - ``target_pure_latin`` is True unless ``is_latin`` returns False for at
      least one of the characters.

    Results are memoized by ``lru_cache``, so each language is scanned only
    once while its entry stays in the cache. A ``language`` that is not a key
    of ``FREQUENCIES`` propagates the lookup error from the subscript.
    """
    target_have_accents = False  # type: bool
    target_pure_latin = True  # type: bool

    for character in FREQUENCIES[language]:
        # Each flag can flip only once, so it is tested only while still unset.
        if target_have_accents is False and is_accentuated(character):
            target_have_accents = True
        if target_pure_latin is True and is_latin(character) is False:
            target_pure_latin = False
        # Both flags are final at this point; the remaining characters cannot
        # change the result (assumes the two helpers have no side effects).
        if target_have_accents and not target_pure_latin:
            break

    return target_have_accents, target_pure_latin


def alphabet_languages(
characters: List[str], ignore_non_latin: bool = False
) -> List[str]:
Expand All @@ -120,21 +138,14 @@ def alphabet_languages(

source_have_accents = False # type: bool

for character in characters:
for character in set(characters):
adbar marked this conversation as resolved.
Show resolved Hide resolved
if is_accentuated(character):
source_have_accents = True
break

for language, language_characters in FREQUENCIES.items():

target_have_accents = False # type: bool
target_pure_latin = True # type: bool

for language_character in language_characters:
if target_have_accents is False and is_accentuated(language_character):
target_have_accents = True
if target_pure_latin is True and is_latin(language_character) is False:
target_pure_latin = False
target_have_accents, target_pure_latin = get_target_features(language)

if ignore_non_latin and target_pure_latin is False:
continue
Expand Down