Skip to content

Commit

Permalink
Fixing some performance bottlenecks (#183)
Browse files Browse the repository at this point in the history
* small performance correction
  • Loading branch information
deedy5 committed May 3, 2022
1 parent 95d4bea commit d50cd84
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 16 deletions.
17 changes: 7 additions & 10 deletions charset_normalizer/cd.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,9 +175,10 @@ def characters_popularity_compare(
raise ValueError("{} not available".format(language))

character_approved_count = 0 # type: int
FREQUENCIES_language_set = set(FREQUENCIES[language])

for character in ordered_characters:
if character not in FREQUENCIES[language]:
if character not in FREQUENCIES_language_set:
continue

characters_before_source = FREQUENCIES[language][
Expand All @@ -186,23 +187,19 @@ def characters_popularity_compare(
characters_after_source = FREQUENCIES[language][
FREQUENCIES[language].index(character) :
] # type: List[str]

characters_before = ordered_characters[
0 : ordered_characters.index(character)
] # type: List[str]
characters_after = ordered_characters[
ordered_characters.index(character) :
] # type: List[str]

before_match_count = [
e in characters_before for e in characters_before_source
].count(
True
before_match_count = len(
set(characters_before) & set(characters_before_source)
) # type: int
after_match_count = [
e in characters_after for e in characters_after_source
].count(
True

after_match_count = len(
set(characters_after) & set(characters_after_source)
) # type: int

if len(characters_before_source) == 0 and before_match_count <= 4:
Expand Down
10 changes: 4 additions & 6 deletions charset_normalizer/md.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
is_separator,
is_symbol,
is_thai,
is_unprintable,
remove_accent,
unicode_range,
)
Expand Down Expand Up @@ -139,11 +140,7 @@ def eligible(self, character: str) -> bool:
return True

def feed(self, character: str) -> None:
if (
character.isspace() is False # includes \n \t \r \v
and character.isprintable() is False
and character != "\x1A" # Why? Its the ASCII substitute character.
):
if is_unprintable(character):
self._unprintable_count += 1
self._character_count += 1

Expand Down Expand Up @@ -269,7 +266,7 @@ def eligible(self, character: str) -> bool:

def feed(self, character: str) -> None:
if character.isalpha():
self._buffer = "".join([self._buffer, character])
self._buffer += character
if is_accentuated(character):
self._buffer_accent_count += 1
if (
Expand Down Expand Up @@ -446,6 +443,7 @@ def ratio(self) -> float:
return self._successive_upper_lower_count_final / self._character_count


@lru_cache(maxsize=1024)
def is_suspiciously_successive_range(
unicode_range_a: Optional[str], unicode_range_b: Optional[str]
) -> bool:
Expand Down
10 changes: 10 additions & 0 deletions charset_normalizer/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ def is_latin(character: str) -> bool:
return "LATIN" in description


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_ascii(character: str) -> bool:
try:
character.encode("ascii")
Expand Down Expand Up @@ -197,6 +198,15 @@ def is_unicode_range_secondary(range_name: str) -> bool:
return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD)


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_unprintable(character: str) -> bool:
    """
    Tell whether the given character should be counted as unprintable.

    A character is reported as unprintable only when all of the following hold:
    it is not whitespace, str.isprintable() rejects it, and it is not the
    ASCII substitute character (0x1A).
    """
    # Whitespace (includes \n \t \r \v) is never reported as unprintable.
    if character.isspace():
        return False

    if character.isprintable():
        return False

    # The ASCII substitute character is deliberately excluded.
    return character != "\x1A"


def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional[str]:
"""
Extract using ASCII-only decoder any specified encoding in the first n-bytes.
Expand Down

0 comments on commit d50cd84

Please sign in to comment.