diff --git a/.github/workflows/chardet-bc.yml b/.github/workflows/chardet-bc.yml
index 97ffb6cd..4aa73d29 100644
--- a/.github/workflows/chardet-bc.yml
+++ b/.github/workflows/chardet-bc.yml
@@ -23,6 +23,7 @@ jobs:
         pip install -U pip setuptools
         pip install -r requirements.txt
         pip install -r ./bin/requirements.txt
+        pip uninstall -y charset-normalizer
     - name: Install the package
       run: |
         python setup.py install
diff --git a/.github/workflows/detector-coverage.yml b/.github/workflows/detector-coverage.yml
index 1db79121..b1e50dea 100644
--- a/.github/workflows/detector-coverage.yml
+++ b/.github/workflows/detector-coverage.yml
@@ -23,6 +23,7 @@ jobs:
         pip install -U pip setuptools
         pip install -r requirements.txt
         pip install -r ./bin/requirements.txt
+        pip uninstall -y charset-normalizer
     - name: Install the package
       run: |
         python setup.py install
diff --git a/.github/workflows/performance.yml b/.github/workflows/performance.yml
index bf1fcda6..1661bc56 100644
--- a/.github/workflows/performance.yml
+++ b/.github/workflows/performance.yml
@@ -23,6 +23,7 @@ jobs:
         pip install -U pip setuptools
         pip install -r requirements.txt
         pip install -r ./bin/requirements.txt
+        pip uninstall -y charset-normalizer
     - name: Install the package
       run: |
         python setup.py install
diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml
index 845bfd90..f37da668 100644
--- a/.github/workflows/run-tests.yml
+++ b/.github/workflows/run-tests.yml
@@ -25,6 +25,7 @@ jobs:
       run: |
         pip install -U pip setuptools
         pip install -r requirements.txt
+        pip uninstall -y charset-normalizer
     - name: Install the package
       run: |
         python setup.py install
diff --git a/README.md b/README.md
index aa236201..d4b545c6 100644
--- a/README.md
+++ b/README.md
@@ -55,13 +55,13 @@ This package offer better performance than its counterpart Chardet. Here are som
 
 | Package | Accuracy | Mean per file (ns) | File per sec (est) |
 | ------------- | :-------------: | :------------------: | :------------------: |
-| [chardet](https://github.com/chardet/chardet) | 93.0 % | 67 ms | 15.38 file/sec |
-| charset-normalizer | **95.0 %** | **37 ms** | 27.77 file/sec |
+| [chardet](https://github.com/chardet/chardet) | 93.0 % | 150 ms | 7 file/sec |
+| charset-normalizer | **95.0 %** | **36 ms** | 28 file/sec |
 
 | Package | 99th percentile | 95th percentile | 50th percentile |
 | ------------- | :-------------: | :------------------: | :------------------: |
-| [chardet](https://github.com/chardet/chardet) | 424 ms | 234 ms | 26 ms |
-| charset-normalizer | 335 ms | 186 ms | 17 ms |
+| [chardet](https://github.com/chardet/chardet) | 647 ms | 250 ms | 24 ms |
+| charset-normalizer | 354 ms | 202 ms | 16 ms |
 
 Chardet's performance on larger file (1MB+) are very poor. Expect huge difference on large payload.
diff --git a/charset_normalizer/api.py b/charset_normalizer/api.py
index 388e841b..fd023409 100644
--- a/charset_normalizer/api.py
+++ b/charset_normalizer/api.py
@@ -206,7 +206,7 @@ def from_bytes(
         multi_byte_bonus = is_multi_byte_decoder and decoded_payload is not None and len(decoded_payload) < length  # type: bool
 
         if multi_byte_bonus:
-            logger.info('Code page %s is a multi byte encoding table and it appear that at least one character was encoded using n-bytes. Should not be a coincidence. Priority +1 given.', encoding_iana)
+            logger.info('Code page %s is a multi byte encoding table and it appear that at least one character was encoded using n-bytes.', encoding_iana)
 
         max_chunk_gave_up = int(len(r_) / 4)  # type: int
@@ -331,7 +331,7 @@ def from_bytes(
                 logger.info(
                     "Using %s code page we detected the following languages: %s",
                     encoding_iana,
-                    results[-1]._languages
+                    results[encoding_iana]._languages
                 )
 
     if len(results) == 0:
diff --git a/charset_normalizer/cli/normalizer.py b/charset_normalizer/cli/normalizer.py
index 3c854be9..c2ae18ea 100644
--- a/charset_normalizer/cli/normalizer.py
+++ b/charset_normalizer/cli/normalizer.py
@@ -93,6 +93,8 @@ def cli_detect(argv=None):
             print('--threshold VALUE should be between 0. AND 1.', file=sys.stderr)
             return 1
 
+    x_ = []
+
     for my_file in args.files:
 
         matches = from_fp(
@@ -101,8 +103,6 @@ def cli_detect(argv=None):
             explain=args.verbose
         )
 
-        x_ = []
-
         if len(matches) == 0:
             print('Unable to identify originating encoding for "{}". {}'.format(my_file.name, 'Maybe try increasing maximum amount of chaos.' if args.threshold < 1. else ''), file=sys.stderr)
             x_.append(
@@ -202,7 +202,7 @@ def cli_detect(argv=None):
             dumps(
                 [
                     el.__dict__ for el in x_
-                ] if args.alternatives else x_[0].__dict__,
+                ] if len(x_) > 1 else x_[0].__dict__,
                 ensure_ascii=True,
                 indent=4
             )
diff --git a/charset_normalizer/md.py b/charset_normalizer/md.py
index 7d948e80..ad6d737d 100644
--- a/charset_normalizer/md.py
+++ b/charset_normalizer/md.py
@@ -3,7 +3,7 @@
 from charset_normalizer.constant import UNICODE_SECONDARY_RANGE_KEYWORD
 from charset_normalizer.utils import is_punctuation, is_symbol, unicode_range, is_accentuated, is_latin, \
-    remove_accent, is_separator, is_cjk
+    remove_accent, is_separator, is_cjk, is_case_variable, is_hangul, is_katakana, is_hiragana, is_ascii, is_thai
 
 
 class MessDetectorPlugin:
@@ -140,11 +140,15 @@ def __init__(self):
         self._last_latin_character = None  # type: Optional[str]
 
     def eligible(self, character: str) -> bool:
-        return is_latin(character)
+        return character.isalpha() and is_latin(character)
 
     def feed(self, character: str) -> None:
+        self._character_count += 1
         if self._last_latin_character is not None:
             if is_accentuated(character) and is_accentuated(self._last_latin_character):
+                if character.isupper() and self._last_latin_character.isupper():
+                    self._successive_count += 1
+                # Worse if its the same char duplicated with different accent.
                 if remove_accent(character) == remove_accent(self._last_latin_character):
                     self._successive_count += 1
         self._last_latin_character = character
@@ -175,14 +179,14 @@ def eligible(self, character: str) -> bool:
 
     def feed(self, character: str) -> None:
         self._character_count += 1
 
-        if self._last_printable_seen is None:
-            self._last_printable_seen = character
-            return
-
         if character.isspace() or is_punctuation(character):
             self._last_printable_seen = None
             return
 
+        if self._last_printable_seen is None:
+            self._last_printable_seen = character
+            return
+
         unicode_range_a = unicode_range(self._last_printable_seen)  # type: Optional[str]
         unicode_range_b = unicode_range(character)  # type: Optional[str]
@@ -215,6 +219,7 @@ def __init__(self):
         self._word_count = 0  # type: int
         self._bad_word_count = 0  # type: int
         self._is_current_word_bad = False  # type: bool
+        self._foreign_long_watch = False  # type: bool
 
         self._character_count = 0  # type: int
         self._bad_character_count = 0  # type: int
@@ -230,6 +235,8 @@ def feed(self, character: str) -> None:
             self._buffer = "".join([self._buffer, character])
             if is_accentuated(character):
                 self._buffer_accent_count += 1
+            if self._foreign_long_watch is False and is_latin(character) is False and is_cjk(character) is False and is_hangul(character) is False and is_katakana(character) is False and is_hiragana(character) is False and is_thai(character) is False:
+                self._foreign_long_watch = True
             return
         if not self._buffer:
             return
@@ -241,12 +248,15 @@ def feed(self, character: str) -> None:
 
             if buffer_length >= 4 and self._buffer_accent_count / buffer_length >= 0.3:
                 self._is_current_word_bad = True
+            if buffer_length >= 24 and self._foreign_long_watch:
+                self._is_current_word_bad = True
 
             if self._is_current_word_bad:
                 self._bad_word_count += 1
                 self._bad_character_count += len(self._buffer)
                 self._is_current_word_bad = False
 
+            self._foreign_long_watch = False
             self._buffer = ""
             self._buffer_accent_count = 0
         elif character not in {"<", ">", "-", "="} and character.isdigit() is False and is_symbol(character):
@@ -256,6 +266,7 @@ def feed(self, character: str) -> None:
     def reset(self) -> None:
         self._buffer = ""
         self._is_current_word_bad = False
+        self._foreign_long_watch = False
         self._bad_word_count = 0
         self._word_count = 0
         self._character_count = 0
@@ -263,7 +274,7 @@ def reset(self) -> None:
 
     @property
     def ratio(self) -> float:
-        if self._word_count <= 16:
+        if self._word_count <= 10:
            return 0.
 
         return self._bad_character_count / self._character_count
@@ -313,27 +324,43 @@ def __init__(self):
         self._character_count = 0  # type: int
 
         self._last_alpha_seen = None  # type: Optional[str]
+        self._current_ascii_only = True  # type: bool
 
     def eligible(self, character: str) -> bool:
-        return character.isspace() or character.isalpha()
+        return True
 
     def feed(self, character: str) -> None:
-        if is_separator(character):
-            if self._character_count_since_last_sep < 24:
+        is_concerned = character.isalpha() and is_case_variable(character)
+        chunk_sep = is_concerned is False
+
+        if chunk_sep and self._character_count_since_last_sep > 0:
+            if self._character_count_since_last_sep <= 64 and character.isdigit() is False and self._current_ascii_only is False:
                 self._successive_upper_lower_count_final += self._successive_upper_lower_count
 
+            self._successive_upper_lower_count = 0
             self._character_count_since_last_sep = 0
+            self._last_alpha_seen = None
+            self._buf = False
+            self._character_count += 1
+            self._current_ascii_only = True
+
+            return
+
+        if self._current_ascii_only is True and is_ascii(character) is False:
+            self._current_ascii_only = False
 
         if self._last_alpha_seen is not None:
             if (character.isupper() and self._last_alpha_seen.islower()) or (character.islower() and self._last_alpha_seen.isupper()):
                 if self._buf is True:
-                    self._successive_upper_lower_count += 1
+                    self._successive_upper_lower_count += 2
+                    self._buf = False
                 else:
                     self._buf = True
             else:
                 self._buf = False
 
         self._character_count += 1
+        self._character_count_since_last_sep += 1
         self._last_alpha_seen = character
 
     def reset(self) -> None:
@@ -342,13 +369,15 @@ def reset(self) -> None:
         self._successive_upper_lower_count = 0
         self._successive_upper_lower_count_final = 0
         self._last_alpha_seen = None
+        self._buf = False
+        self._current_ascii_only = True
 
     @property
     def ratio(self) -> float:
         if self._character_count == 0:
            return 0.
 
-        return (self._successive_upper_lower_count_final * 2) / self._character_count
+        return self._successive_upper_lower_count_final / self._character_count
 
 
 def is_suspiciously_successive_range(unicode_range_a: Optional[str], unicode_range_b: Optional[str]) -> bool:
diff --git a/charset_normalizer/models.py b/charset_normalizer/models.py
index 924f1613..bcea76ac 100644
--- a/charset_normalizer/models.py
+++ b/charset_normalizer/models.py
@@ -290,7 +290,7 @@ def append(self, item: CharsetMatch) -> None:
         # We should disable the submatch factoring when the input file is too heavy (conserve RAM usage)
         if len(item.raw) <= TOO_BIG_SEQUENCE:
             for match in self._results:
-                if match.fingerprint == item.fingerprint:
+                if match.fingerprint == item.fingerprint and match.chaos == item.chaos:
                     match.add_submatch(item)
                     return
         self._results.append(item)
diff --git a/charset_normalizer/utils.py b/charset_normalizer/utils.py
index db59739f..95bb2b1b 100644
--- a/charset_normalizer/utils.py
+++ b/charset_normalizer/utils.py
@@ -22,7 +22,7 @@ def is_accentuated(character: str) -> bool:
         description = unicodedata.name(character)  # type: str
     except ValueError:
         return False
-    return "WITH GRAVE" in description or "WITH ACUTE" in description or "WITH CEDILLA" in description
+    return "WITH GRAVE" in description or "WITH ACUTE" in description or "WITH CEDILLA" in description or "WITH DIAERESIS" in description or "WITH CIRCUMFLEX" in description
 
 
 @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
@@ -64,6 +64,13 @@ def is_latin(character: str) -> bool:
     return "LATIN" in description
 
 
+def is_ascii(character: str) -> bool:
+    try:
+        character.encode("ascii")
+    except UnicodeEncodeError:
+        return False
+    return True
+
+
 @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
 def is_punctuation(character: str) -> bool:
     character_category = unicodedata.category(character)  # type: str
@@ -96,7 +103,7 @@ def is_symbol(character: str) -> bool:
 
 @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
 def is_separator(character: str) -> bool:
-    if character.isspace() or character in ["|", "+"]:
+    if character.isspace() or character in ["|", "+", ",", ";", "<", ">"]:
         return True
 
     character_category = unicodedata.category(character)  # type: str
@@ -104,12 +111,18 @@ def is_separator(character: str) -> bool:
     return "Z" in character_category
 
 
+@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+def is_case_variable(character: str) -> bool:
+    return character.islower() != character.isupper()
+
+
 def is_private_use_only(character: str) -> bool:
     character_category = unicodedata.category(character)  # type: str
 
     return "Co" == character_category
 
 
+@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
 def is_cjk(character: str) -> bool:
     try:
         character_name = unicodedata.name(character)
@@ -119,6 +132,46 @@ def is_cjk(character: str) -> bool:
     return "CJK" in character_name
 
 
+@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+def is_hiragana(character: str) -> bool:
+    try:
+        character_name = unicodedata.name(character)
+    except ValueError:
+        return False
+
+    return "HIRAGANA" in character_name
+
+
+@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+def is_katakana(character: str) -> bool:
+    try:
+        character_name = unicodedata.name(character)
+    except ValueError:
+        return False
+
+    return "KATAKANA" in character_name
+
+
+@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+def is_hangul(character: str) -> bool:
+    try:
+        character_name = unicodedata.name(character)
+    except ValueError:
+        return False
+
+    return "HANGUL" in character_name
+
+
+@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+def is_thai(character: str) -> bool:
+    try:
+        character_name = unicodedata.name(character)
+    except ValueError:
+        return False
+
+    return "THAI" in character_name
+
+
 @lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
 def is_unicode_range_secondary(range_name: str) -> bool:
     for keyword in UNICODE_SECONDARY_RANGE_KEYWORD:
diff --git a/charset_normalizer/version.py b/charset_normalizer/version.py
index 12f4b340..112ac0cc 100644
--- a/charset_normalizer/version.py
+++ b/charset_normalizer/version.py
@@ -2,5 +2,5 @@
 Expose version
 """
 
-__version__ = "2.0.3"
+__version__ = "2.0.4"
 VERSION = __version__.split('.')
diff --git a/tests/test_probe_chaos.py b/tests/test_probe_chaos.py
index 96342408..57fe433d 100644
--- a/tests/test_probe_chaos.py
+++ b/tests/test_probe_chaos.py
@@ -40,7 +40,7 @@ def test_subtle_gibberish(self):
 
         self.assertGreater(
             mess_ratio("´Á¥½³ø§i -- ±i®Ìºû, ³¯·Ø©v"),
-            0.7
+            0.5
         )
 
         self.assertGreater(