Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactoring for potential performance improvements in loops #113

Merged
merged 19 commits on Sep 24, 2021
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
17 changes: 10 additions & 7 deletions charset_normalizer/cd.py
Expand Up @@ -62,7 +62,7 @@ def unicode_range_languages(primary_range: str) -> List[str]:
return languages


@lru_cache()
@lru_cache(maxsize=4096) # up to 2999 IANA char sets
adbar marked this conversation as resolved.
Show resolved Hide resolved
def encoding_languages(iana_name: str) -> List[str]:
"""
Single-byte encoding language association. Some code page are heavily linked to particular language(s).
Expand All @@ -82,21 +82,26 @@ def encoding_languages(iana_name: str) -> List[str]:
return unicode_range_languages(primary_range)


# Code pages strongly associated with Chinese and Korean, respectively.
# Hoisted to module level so the sets are built once, not per call.
ZH_NAMES = {"big5", "cp950", "big5hkscs", "hz"}
KO_NAMES = {"johab", "cp949", "euc_kr"}


@lru_cache(maxsize=4096)  # up to 2999 IANA char sets
def mb_encoding_languages(iana_name: str) -> List[str]:
    """
    Multi-byte encoding language association. Some code pages are heavily
    linked to particular language(s). This function does the correspondence.

    :param iana_name: normalized IANA encoding name (e.g. "cp932", "euc_kr")
    :return: list of associated language names; empty list when no known
             association exists for this encoding.
    """
    # str.startswith accepts a tuple of prefixes: one call instead of a chain
    # of Python-level `or` tests.
    # todo: order conditions by general language/encoding frequency?
    if iana_name.startswith(("shift_", "iso2022_jp", "euc_j")) or iana_name == "cp932":
        return ["Japanese"]
    if iana_name.startswith("gb") or iana_name in ZH_NAMES:
        return ["Chinese", "Classical Chinese"]
    if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
        return ["Korean"]

    return []
Expand All @@ -109,12 +114,10 @@ def alphabet_languages(characters: List[str]) -> List[str]:
languages = [] # type: List[str]

for language, language_characters in FREQUENCIES.items():
character_match_count = 0 # type: int
character_count = len(language_characters) # type: int

for character in language_characters:
if character in characters:
character_match_count += 1
# type: int
adbar marked this conversation as resolved.
Show resolved Hide resolved
character_match_count = len([c for c in language_characters if c in characters])

if character_match_count / character_count >= 0.2:
languages.append(language)
Expand Down
76 changes: 29 additions & 47 deletions charset_normalizer/md.py
Expand Up @@ -21,6 +21,25 @@
)


# Characters frequently seen in markup/structured text rather than natural
# prose; shared by several mess-detection plugins. Built from a single string
# literal — same 15-element set as spelling each character out.
SUSPICIOUS = set('<>=:/&;{}[],|"-')


class MessDetectorPlugin:
"""
Base abstract class used for mess detection plugins.
Expand Down Expand Up @@ -70,23 +89,7 @@ def eligible(self, character: str) -> bool:
def feed(self, character: str) -> None:
self._character_count += 1

if character != self._last_printable_char and character not in [
"<",
">",
"=",
":",
"/",
"&",
";",
"{",
"}",
"[",
"]",
",",
"|",
'"',
"-",
]:
if character != self._last_printable_char and character not in SUSPICIOUS:
if is_punctuation(character):
self._punctuation_count += 1
elif (
Expand Down Expand Up @@ -153,9 +156,8 @@ def eligible(self, character: str) -> bool:

def feed(self, character: str) -> None:
if (
character not in {"\n", "\t", "\r", "\v"}
character.isspace() is False # includes \n \t \r \v
adbar marked this conversation as resolved.
Show resolved Hide resolved
Ousret marked this conversation as resolved.
Show resolved Hide resolved
and character.isprintable() is False
and character.isspace() is False
and ord(character) != 0x1A # Why? Its the ASCII substitute character.
adbar marked this conversation as resolved.
Show resolved Hide resolved
):
self._unprintable_count += 1
Expand Down Expand Up @@ -223,24 +225,7 @@ def feed(self, character: str) -> None:
if (
character.isspace()
or is_punctuation(character)
or character
in [
"<",
">",
"=",
":",
"/",
"&",
";",
"{",
"}",
"[",
"]",
",",
"|",
'"',
"-",
]
or character in SUSPICIOUS
):
self._last_printable_seen = None
return
Expand Down Expand Up @@ -495,18 +480,17 @@ def is_suspiciously_successive_range(
return False

# Japanese Exception
if unicode_range_a in ["Katakana", "Hiragana"] and unicode_range_b in [
"Katakana",
"Hiragana",
]:
return False

if unicode_range_a in ["Katakana", "Hiragana"] or unicode_range_b in [
"Katakana",
"Hiragana",
]:
if "CJK" in unicode_range_a or "CJK" in unicode_range_b:
return False
if unicode_range_a in ["Katakana", "Hiragana"] and unicode_range_b in [
"Katakana",
"Hiragana",
]:
adbar marked this conversation as resolved.
Show resolved Hide resolved
return False

if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b:
if "CJK" in unicode_range_a or "CJK" in unicode_range_b:
Expand All @@ -533,11 +517,9 @@ def mess_ratio(
) -> float:
"""
Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
"""
detectors = [] # type: List[MessDetectorPlugin]
"""

for md_class in MessDetectorPlugin.__subclasses__():
detectors.append(md_class())
detectors = [md_class() for md_class in MessDetectorPlugin.__subclasses__()] # type: List[MessDetectorPlugin]

length = len(decoded_sequence) # type: int

Expand Down
10 changes: 4 additions & 6 deletions charset_normalizer/models.py
Expand Up @@ -225,12 +225,10 @@ def has_submatch(self) -> bool:
def alphabets(self) -> List[str]:
    """
    Sorted list of the distinct Unicode range names detected in the decoded
    payload. Computed on first access, then cached on `self._unicode_ranges`.
    """
    if self._unicode_ranges is not None:
        return self._unicode_ranges
    # Deduplicate with a set: unicode_range() is called once per character,
    # so a plain list would hold one entry per character, not per range.
    # `r is not None` drops characters outside any known range — clearer and
    # safer than filter(None.__ne__, ...), which only works because the
    # NotImplemented it returns for non-None values happens to be truthy.
    detected_ranges = {
        r for r in (unicode_range(character) for character in str(self)) if r is not None
    }
    self._unicode_ranges = sorted(detected_ranges)
    return self._unicode_ranges

@property
Expand Down