Refactoring for potential performance improvements in loops #113

Merged
merged 19 commits, Sep 24, 2021
Changes from 17 commits
11 changes: 5 additions & 6 deletions charset_normalizer/cd.py
@@ -5,6 +5,7 @@
from typing import Dict, List, Optional, Tuple

from .assets import FREQUENCIES
+from .constant import KO_NAMES, ZH_NAMES
from .md import is_suspiciously_successive_range
from .models import CoherenceMatches
from .utils import is_multi_byte_encoding, is_unicode_range_secondary, unicode_range
@@ -82,6 +83,7 @@ def encoding_languages(iana_name: str) -> List[str]:
return unicode_range_languages(primary_range)


+@lru_cache()
def mb_encoding_languages(iana_name: str) -> List[str]:
"""
Multi-byte encoding language association. Some code page are heavily linked to particular language(s).
@@ -94,9 +96,9 @@ def mb_encoding_languages(iana_name: str) -> List[str]:
or iana_name == "cp932"
):
return ["Japanese"]
if iana_name.startswith("gb") or iana_name in {"big5", "cp950", "big5hkscs", "hz"}:
if iana_name.startswith("gb") or iana_name in ZH_NAMES:
return ["Chinese", "Classical Chinese"]
if iana_name.startswith("iso2022_kr") or iana_name in {"johab", "cp949", "euc_kr"}:
if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
return ["Korean"]

return []
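A note on the @lru_cache() addition above: mb_encoding_languages is a pure function (same input, same output), so memoizing it turns repeated lookups for the same IANA name into a dictionary hit. A minimal standalone sketch of the behavior, not part of the diff (the function name is made up):

    from functools import lru_cache

    @lru_cache()  # default maxsize=128, plenty for the small set of IANA names
    def classify(iana_name: str) -> str:
        # Imagine an expensive scan here; with lru_cache it runs once per distinct name.
        print("computing", iana_name)
        return iana_name.upper()

    classify("cp932")             # prints "computing cp932"
    classify("cp932")             # cache hit: no print, same result
    print(classify.cache_info())  # hits=1, misses=1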
@@ -109,12 +111,9 @@ def alphabet_languages(characters: List[str]) -> List[str]:
languages = [] # type: List[str]

for language, language_characters in FREQUENCIES.items():
-character_match_count = 0 # type: int
character_count = len(language_characters) # type: int

-for character in language_characters:
-if character in characters:
-character_match_count += 1
+character_match_count = len([c for c in language_characters if c in characters]) # type: int

if character_match_count / character_count >= 0.2:
languages.append(language)
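The rewrite above folds the counting loop into a single comprehension. A quick equivalence check with made-up data, plus a variant that avoids building the temporary list:

    characters = ["a", "b", "c"]
    language_characters = ["a", "c", "d", "e", "f"]

    # As in the diff: build a filtered list, then take its length
    count_list = len([c for c in language_characters if c in characters])

    # Equivalent result without materializing an intermediate list
    count_gen = sum(1 for c in language_characters if c in characters)

    assert count_list == count_gen == 2

Since characters is a list, each "c in characters" test is still a linear scan; converting it to a set once before the loop would make every test an average O(1) hash lookup.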
27 changes: 26 additions & 1 deletion charset_normalizer/constant.py
@@ -2,7 +2,7 @@
from collections import OrderedDict
from encodings.aliases import aliases
from re import IGNORECASE, compile as re_compile
-from typing import Dict, List, Union
+from typing import Dict, List, Set, Union

# Contain for each eligible encoding a list of/item bytes SIG/BOM
ENCODING_MARKS = OrderedDict(
@@ -469,3 +469,28 @@
"cp1254": "Windows-1254",
"cp949": "CP949",
} # type: Dict[str, str]


+COMMON_SAFE_ASCII_CHARACTERS = {
+"<",
+">",
+"=",
+":",
+"/",
+"&",
+";",
+"{",
+"}",
+"[",
+"]",
+",",
+"|",
+'"',
+"-",
+} # type: Set[str]
+
+
+KO_NAMES = {"johab", "cp949", "euc_kr"} # type: Set[str]
+ZH_NAMES = {"big5", "cp950", "big5hkscs", "hz"} # type: Set[str]
+
+NOT_PRINTABLE_PATTERN = re_compile(r"[0-9\W\n\r\t]+")
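Two things land in this file: the ad-hoc character lists from md.py become module-level constants, and those lists become sets. Set membership is an average O(1) hash lookup versus an O(n) scan of a list, and a module-level constant is built once at import rather than on every call. A rough micro-benchmark sketch (absolute numbers vary by machine):

    import timeit

    SAFE_LIST = ["<", ">", "=", ":", "/", "&", ";", "{", "}", "[", "]", ",", "|", '"', "-"]
    SAFE_SET = set(SAFE_LIST)

    # Worst case for the list: "-" is last, so the scan touches all 15 items.
    print(timeit.timeit('"-" in SAFE_LIST', globals=globals()))
    print(timeit.timeit('"-" in SAFE_SET', globals=globals()))   # typically several times faster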
65 changes: 12 additions & 53 deletions charset_normalizer/md.py
@@ -1,7 +1,7 @@
from functools import lru_cache
from typing import List, Optional

-from .constant import UNICODE_SECONDARY_RANGE_KEYWORD
+from .constant import COMMON_SAFE_ASCII_CHARACTERS, UNICODE_SECONDARY_RANGE_KEYWORD
from .utils import (
is_accentuated,
is_ascii,
@@ -70,23 +70,7 @@ def eligible(self, character: str) -> bool:
def feed(self, character: str) -> None:
self._character_count += 1

-if character != self._last_printable_char and character not in [
-"<",
-">",
-"=",
-":",
-"/",
-"&",
-";",
-"{",
-"}",
-"[",
-"]",
-",",
-"|",
-'"',
-"-",
-]:
+if character != self._last_printable_char and character not in COMMON_SAFE_ASCII_CHARACTERS:
if is_punctuation(character):
self._punctuation_count += 1
elif (
@@ -153,10 +137,9 @@ def eligible(self, character: str) -> bool:

def feed(self, character: str) -> None:
if (
-character not in {"\n", "\t", "\r", "\v"}
+character.isspace() is False  # includes \n \t \r \v
and character.isprintable() is False
-and character.isspace() is False
-and ord(character) != 0x1A  # Why? Its the ASCII substitute character.
+and character != "\x1A"  # Why? Its the ASCII substitute character.
):
self._unprintable_count += 1
self._character_count += 1
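A note on this hunk: the dropped set test was redundant, because the old condition also required character.isspace() to be False, and isspace() is already True for \n, \t, \r and \v. Removing it saves one membership test per character without changing behavior; likewise, comparing against the "\x1A" literal skips an ord() call. A standalone check, not from the repo:

    # isspace() subsumes the old {"\n", "\t", "\r", "\v"} membership test
    for ch in ["\n", "\t", "\r", "\v"]:
        assert ch.isspace()

    # chr(0x1A) is the ASCII substitute character, equal to the "\x1A" literal,
    # and it is neither printable nor whitespace, so it needs its own exemption.
    assert chr(0x1A) == "\x1a"
    assert not "\x1a".isprintable() and not "\x1a".isspace()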
@@ -223,24 +206,7 @@ def feed(self, character: str) -> None:
if (
character.isspace()
or is_punctuation(character)
-or character
-in [
-"<",
-">",
-"=",
-":",
-"/",
-"&",
-";",
-"{",
-"}",
-"[",
-"]",
-",",
-"|",
-'"',
-"-",
-]
+or character in COMMON_SAFE_ASCII_CHARACTERS
):
self._last_printable_seen = None
return
@@ -495,18 +461,13 @@ def is_suspiciously_successive_range(
return False

# Japanese Exception
if unicode_range_a in ["Katakana", "Hiragana"] and unicode_range_b in [
"Katakana",
"Hiragana",
]:
return False

if unicode_range_a in ["Katakana", "Hiragana"] or unicode_range_b in [
"Katakana",
"Hiragana",
]:
range_a_jp_chars, range_b_jp_chars = unicode_range_a in ("Hiragana", "Katakana"), \
unicode_range_b in ("Hiragana", "Katakana")
Ousret marked this conversation as resolved.
Show resolved Hide resolved
if range_a_jp_chars or range_b_jp_chars:
if "CJK" in unicode_range_a or "CJK" in unicode_range_b:
return False
+if range_a_jp_chars and range_b_jp_chars:
+return False

if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b:
if "CJK" in unicode_range_a or "CJK" in unicode_range_b:
@@ -533,11 +494,9 @@ def mess_ratio(
) -> float:
"""
Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
"""
detectors = [] # type: List[MessDetectorPlugin]
"""

-for md_class in MessDetectorPlugin.__subclasses__():
-detectors.append(md_class())
+detectors = [md_class() for md_class in MessDetectorPlugin.__subclasses__()] # type: List[MessDetectorPlugin]

length = len(decoded_sequence) # type: int
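The detectors comprehension above leans on Python's implicit subclass registry: defining a subclass of MessDetectorPlugin is the entire registration step. A self-contained sketch of the pattern (class names here are invented):

    class Plugin:
        """Base class; subclassing is all it takes to register."""

    class AlphaPlugin(Plugin):
        pass

    class BetaPlugin(Plugin):
        pass

    # Mirrors the mess_ratio() change: instantiate every direct subclass in one pass
    detectors = [cls() for cls in Plugin.__subclasses__()]
    print([type(d).__name__ for d in detectors])  # ['AlphaPlugin', 'BetaPlugin'] in CPython

Note that __subclasses__() returns direct subclasses only; picking up grandchildren would require a recursive walk.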

18 changes: 8 additions & 10 deletions charset_normalizer/models.py
@@ -3,10 +3,10 @@
from encodings.aliases import aliases
from hashlib import sha256
from json import dumps
-from re import compile as re_compile, sub
+from re import sub
from typing import Any, Dict, Iterator, List, Optional, Set, Tuple, Union

-from .constant import TOO_BIG_SEQUENCE
+from .constant import NOT_PRINTABLE_PATTERN, TOO_BIG_SEQUENCE
from .md import mess_ratio
from .utils import iana_name, is_multi_byte_encoding, unicode_range

@@ -102,8 +102,8 @@ def w_counter(self) -> Counter:
warnings.warn(
"w_counter is deprecated and will be removed in 3.0", DeprecationWarning
)
-not_printable_pattern = re_compile(r"[0-9\W\n\r\t]+")
-string_printable_only = sub(not_printable_pattern, " ", str(self).lower())
+
+string_printable_only = sub(NOT_PRINTABLE_PATTERN, " ", str(self).lower())

return Counter(string_printable_only.split())
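With the pattern hoisted into constant.py it is compiled once at import time, and re.sub accepts an already-compiled pattern, so the call site keeps the same shape. A minimal sketch (the helper name is illustrative):

    from re import compile as re_compile, sub

    NOT_PRINTABLE_PATTERN = re_compile(r"[0-9\W\n\r\t]+")  # compiled once, at import

    def printable_words(text: str) -> list:
        # sub() with a precompiled pattern skips even re's internal cache lookup
        return sub(NOT_PRINTABLE_PATTERN, " ", text.lower()).split()

    print(printable_words("Hello, World! 42"))  # ['hello', 'world']

The re module does cache compiled patterns internally, so the win here is modest: no per-call cache lookup in a deprecated code path that previously rebuilt the pattern on every invocation.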

@@ -225,12 +225,10 @@ def has_submatch(self) -> bool:
def alphabets(self) -> List[str]:
if self._unicode_ranges is not None:
return self._unicode_ranges
-detected_ranges = set() # type: Set[str]
-for character in str(self):
-detected_range = unicode_range(character) # type: Optional[str]
-if detected_range:
-detected_ranges.add(detected_range)
-self._unicode_ranges = sorted(list(detected_ranges))
+# list detected ranges
+detected_ranges = [unicode_range(char) for char in str(self)] # type: List[Optional[str]]
+# filter and sort
+self._unicode_ranges = sorted([r for r in detected_ranges if r]) # type: ignore
return self._unicode_ranges

@property
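One behavioral note on this last hunk as it stands at this commit: the old code deduplicated ranges through a set before sorting, while the new list comprehension keeps one entry per character, so duplicates survive into the sorted result. A set comprehension would restore the original semantics at comprehension speed; in the sketch below, unicode_range is an illustrative stand-in for charset_normalizer.utils.unicode_range:

    from typing import Optional

    def unicode_range(ch: str) -> Optional[str]:
        # Stand-in: the real helper maps a character to its Unicode block name
        return "Basic Latin" if ord(ch) < 0x80 else None

    text = "abcabc"
    detected = [unicode_range(c) for c in text]  # six entries, all "Basic Latin"
    ranges = sorted({r for r in detected if r})  # dedupe and drop None in one pass
    print(ranges)                                # ['Basic Latin']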