Refactoring for potential performance improvements in loops (#113)
* reviewed encoding language associations: caches and sets defined

* use list comprehension for language association (#111)

* use list comprehension and filter in char analysis (#111)

* refactored variable inits in md.py

* models: move regex compilation to constants

* detection of Japanese characters: simplify syntax

* amend detected_ranges semantics

Co-authored-by: Aarni Koskela <akx@iki.fi>
Co-authored-by: Ahmed TAHRI <ahmed.tahri@cloudnursery.dev>
3 people committed Sep 24, 2021
1 parent 59e48eb commit 42a7d3d
Showing 4 changed files with 66 additions and 70 deletions.
13 changes: 7 additions & 6 deletions charset_normalizer/cd.py
@@ -5,6 +5,7 @@
from typing import Dict, List, Optional, Tuple

from .assets import FREQUENCIES
from .constant import KO_NAMES, ZH_NAMES
from .md import is_suspiciously_successive_range
from .models import CoherenceMatches
from .utils import is_multi_byte_encoding, is_unicode_range_secondary, unicode_range
@@ -82,6 +83,7 @@ def encoding_languages(iana_name: str) -> List[str]:
return unicode_range_languages(primary_range)


@lru_cache()
def mb_encoding_languages(iana_name: str) -> List[str]:
"""
Multi-byte encoding language association. Some code page are heavily linked to particular language(s).
@@ -94,9 +96,9 @@ def mb_encoding_languages(iana_name: str) -> List[str]:
or iana_name == "cp932"
):
return ["Japanese"]
if iana_name.startswith("gb") or iana_name in {"big5", "cp950", "big5hkscs", "hz"}:
if iana_name.startswith("gb") or iana_name in ZH_NAMES:
return ["Chinese", "Classical Chinese"]
if iana_name.startswith("iso2022_kr") or iana_name in {"johab", "cp949", "euc_kr"}:
if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
return ["Korean"]

return []
@@ -109,12 +111,11 @@ def alphabet_languages(characters: List[str]) -> List[str]:
languages = [] # type: List[str]

for language, language_characters in FREQUENCIES.items():
character_match_count = 0 # type: int
character_count = len(language_characters) # type: int

for character in language_characters:
if character in characters:
character_match_count += 1
character_match_count = len(
[c for c in language_characters if c in characters]
) # type: int

if character_match_count / character_count >= 0.2:
languages.append(language)
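
As a reading aid for the cd.py hunks above, here is a minimal, self-contained sketch of the two patterns they introduce: module-level name sets plus lru_cache() on the encoding lookup, and a comprehension for the per-language match count. The function names, the toy FREQUENCIES table and the printed example are illustrative placeholders, not the library's real data or API.

from functools import lru_cache
from typing import Dict, List, Set

# Placeholder constants mirroring KO_NAMES / ZH_NAMES (the real values live in constant.py).
ZH_NAMES = {"big5", "cp950", "big5hkscs", "hz"}  # type: Set[str]
KO_NAMES = {"johab", "cp949", "euc_kr"}  # type: Set[str]
# Toy frequency table; the real FREQUENCIES mapping is far larger.
FREQUENCIES = {"Toy": ["a", "b", "c", "d", "e"]}  # type: Dict[str, List[str]]


@lru_cache()
def guess_mb_languages(iana_name: str) -> List[str]:
    # Set membership is O(1) and the result is memoized per encoding name.
    if iana_name.startswith("gb") or iana_name in ZH_NAMES:
        return ["Chinese", "Classical Chinese"]
    if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
        return ["Korean"]
    return []


def guess_alphabet_languages(characters: List[str]) -> List[str]:
    languages = []  # type: List[str]
    for language, language_characters in FREQUENCIES.items():
        # Comprehension replaces the manual counter loop from the old code.
        character_match_count = len([c for c in language_characters if c in characters])
        if character_match_count / len(language_characters) >= 0.2:
            languages.append(language)
    return languages


print(guess_mb_languages("cp949"), guess_alphabet_languages(["a", "b", "z"]))  # ['Korean'] ['Toy']
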
27 changes: 26 additions & 1 deletion charset_normalizer/constant.py
@@ -2,7 +2,7 @@
from collections import OrderedDict
from encodings.aliases import aliases
from re import IGNORECASE, compile as re_compile
from typing import Dict, List, Union
from typing import Dict, List, Set, Union

# Contain for each eligible encoding a list of/item bytes SIG/BOM
ENCODING_MARKS = OrderedDict(
@@ -469,3 +469,28 @@
"cp1254": "Windows-1254",
"cp949": "CP949",
} # type: Dict[str, str]


COMMON_SAFE_ASCII_CHARACTERS = {
"<",
">",
"=",
":",
"/",
"&",
";",
"{",
"}",
"[",
"]",
",",
"|",
'"',
"-",
} # type: Set[str]


KO_NAMES = {"johab", "cp949", "euc_kr"} # type: Set[str]
ZH_NAMES = {"big5", "cp950", "big5hkscs", "hz"} # type: Set[str]

NOT_PRINTABLE_PATTERN = re_compile(r"[0-9\W\n\r\t]+")
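
A tiny illustration (not taken from the repository) of why NOT_PRINTABLE_PATTERN moves to constant.py: the regex is compiled once at import time and every caller reuses the compiled object instead of recompiling inside a method. The helper name and sample string below are made up.

from re import compile as re_compile, sub
from typing import List

# Compiled once at module import; shared by every call site.
NOT_PRINTABLE_PATTERN = re_compile(r"[0-9\W\n\r\t]+")


def printable_words(text: str) -> List[str]:
    # re.sub accepts a precompiled pattern, so no per-call compilation happens here.
    return sub(NOT_PRINTABLE_PATTERN, " ", text.lower()).split()


print(printable_words("Abc, def 123!"))  # expected: ['abc', 'def']
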
74 changes: 22 additions & 52 deletions charset_normalizer/md.py
@@ -1,7 +1,7 @@
from functools import lru_cache
from typing import List, Optional

from .constant import UNICODE_SECONDARY_RANGE_KEYWORD
from .constant import COMMON_SAFE_ASCII_CHARACTERS, UNICODE_SECONDARY_RANGE_KEYWORD
from .utils import (
is_accentuated,
is_ascii,
@@ -70,23 +70,10 @@ def eligible(self, character: str) -> bool:
def feed(self, character: str) -> None:
self._character_count += 1

if character != self._last_printable_char and character not in [
"<",
">",
"=",
":",
"/",
"&",
";",
"{",
"}",
"[",
"]",
",",
"|",
'"',
"-",
]:
if (
character != self._last_printable_char
and character not in COMMON_SAFE_ASCII_CHARACTERS
):
if is_punctuation(character):
self._punctuation_count += 1
elif (
@@ -153,10 +140,9 @@ def eligible(self, character: str) -> bool:

def feed(self, character: str) -> None:
if (
character not in {"\n", "\t", "\r", "\v"}
character.isspace() is False # includes \n \t \r \v
and character.isprintable() is False
and character.isspace() is False
and ord(character) != 0x1A # Why? Its the ASCII substitute character.
and character != "\x1A" # Why? Its the ASCII substitute character.
):
self._unprintable_count += 1
self._character_count += 1
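
A quick aside on the rewritten condition just above: str.isspace() already returns True for \n, \t, \r and \v, so the old literal set test folds into the single whitespace check. A hedged, standalone rendering of the new predicate (the helper name is ours, not the library's):

# Sanity check that isspace() covers the characters the old literal set listed.
assert all(c.isspace() for c in "\n\t\r\v")


def looks_unprintable(character: str) -> bool:
    # Mirrors the simplified condition: not whitespace, not printable,
    # and not the ASCII substitute character (0x1A).
    return (
        character.isspace() is False
        and character.isprintable() is False
        and character != "\x1a"
    )


print(looks_unprintable("\x00"), looks_unprintable(" "), looks_unprintable("\x1a"))  # True False False
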
@@ -223,24 +209,7 @@ def feed(self, character: str) -> None:
if (
character.isspace()
or is_punctuation(character)
or character
in [
"<",
">",
"=",
":",
"/",
"&",
";",
"{",
"}",
"[",
"]",
",",
"|",
'"',
"-",
]
or character in COMMON_SAFE_ASCII_CHARACTERS
):
self._last_printable_seen = None
return
@@ -495,18 +464,19 @@ def is_suspiciously_successive_range(
return False

# Japanese Exception
if unicode_range_a in ["Katakana", "Hiragana"] and unicode_range_b in [
"Katakana",
"Hiragana",
]:
return False

if unicode_range_a in ["Katakana", "Hiragana"] or unicode_range_b in [
"Katakana",
"Hiragana",
]:
range_a_jp_chars, range_b_jp_chars = (
unicode_range_a
in (
"Hiragana",
"Katakana",
),
unicode_range_b in ("Hiragana", "Katakana"),
)
if range_a_jp_chars or range_b_jp_chars:
if "CJK" in unicode_range_a or "CJK" in unicode_range_b:
return False
if range_a_jp_chars and range_b_jp_chars:
return False

if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b:
if "CJK" in unicode_range_a or "CJK" in unicode_range_b:
@@ -534,10 +504,10 @@ def mess_ratio(
"""
Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
"""
detectors = [] # type: List[MessDetectorPlugin]

for md_class in MessDetectorPlugin.__subclasses__():
detectors.append(md_class())
detectors = [
md_class() for md_class in MessDetectorPlugin.__subclasses__()
] # type: List[MessDetectorPlugin]

length = len(decoded_sequence) # type: int

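
And a small sketch of the plugin instantiation pattern used in mess_ratio() above: subclasses of a base class are discovered via __subclasses__() and instantiated in one comprehension. The two dummy plugin classes here are stand-ins for the real detectors.

from typing import List


class MessDetectorPlugin:
    """Minimal stand-in for the real plugin base class."""


class DummyPunctuationPlugin(MessDetectorPlugin):
    pass


class DummyUnprintablePlugin(MessDetectorPlugin):
    pass


# One comprehension instantiates every registered subclass, as in mess_ratio().
detectors = [
    md_class() for md_class in MessDetectorPlugin.__subclasses__()
]  # type: List[MessDetectorPlugin]

print([type(d).__name__ for d in detectors])  # ['DummyPunctuationPlugin', 'DummyUnprintablePlugin']
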
22 changes: 11 additions & 11 deletions charset_normalizer/models.py
@@ -3,10 +3,10 @@
from encodings.aliases import aliases
from hashlib import sha256
from json import dumps
from re import compile as re_compile, sub
from typing import Any, Dict, Iterator, List, Optional, Set, Tuple, Union
from re import sub
from typing import Any, Dict, Iterator, List, Optional, Tuple, Union

from .constant import TOO_BIG_SEQUENCE
from .constant import NOT_PRINTABLE_PATTERN, TOO_BIG_SEQUENCE
from .md import mess_ratio
from .utils import iana_name, is_multi_byte_encoding, unicode_range

@@ -102,8 +102,8 @@ def w_counter(self) -> Counter:
warnings.warn(
"w_counter is deprecated and will be removed in 3.0", DeprecationWarning
)
not_printable_pattern = re_compile(r"[0-9\W\n\r\t]+")
string_printable_only = sub(not_printable_pattern, " ", str(self).lower())

string_printable_only = sub(NOT_PRINTABLE_PATTERN, " ", str(self).lower())

return Counter(string_printable_only.split())

@@ -225,12 +225,12 @@ def has_submatch(self) -> bool:
def alphabets(self) -> List[str]:
if self._unicode_ranges is not None:
return self._unicode_ranges
detected_ranges = set() # type: Set[str]
for character in str(self):
detected_range = unicode_range(character) # type: Optional[str]
if detected_range:
detected_ranges.add(detected_range)
self._unicode_ranges = sorted(list(detected_ranges))
# list detected ranges
detected_ranges = [
unicode_range(char) for char in str(self)
] # type: List[Optional[str]]
# filter and sort
self._unicode_ranges = sorted([r for r in detected_ranges if r]) # type: ignore
return self._unicode_ranges

@property
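
Finally, a standalone sketch of the amended detected_ranges logic in alphabets(): map every character to its Unicode range, then filter out the None results and sort. The toy_unicode_range() lookup below is a rough stand-in for the library's unicode_range() helper and only models two ranges.

from typing import List, Optional


def toy_unicode_range(character: str) -> Optional[str]:
    # Very rough stand-in for utils.unicode_range(); only two ranges are modelled.
    code_point = ord(character)
    if 0x0000 <= code_point <= 0x007F:
        return "Basic Latin"
    if 0x0370 <= code_point <= 0x03FF:
        return "Greek and Coptic"
    return None  # unknown to this toy lookup


def detected_alphabets(text: str) -> List[str]:
    # Map every character to a range (possibly None) ...
    detected_ranges = [toy_unicode_range(char) for char in text]  # type: List[Optional[str]]
    # ... then drop the None entries and sort, as the amended property does.
    return sorted([r for r in detected_ranges if r])


print(detected_alphabets("aβ"))  # ['Basic Latin', 'Greek and Coptic']
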
