🔖 Bump version to 2.0.4 (#72)
* 🔖 Bump version to 2.0.4

* 🩹 MD sensitivity adjustments (#76)

* 🩹 MD sensitivity adjustments 
* 📌 Make sure that the charset-normalizer dependency pulled in via requests does not shadow the current dev version

* 📝 Do not mislead: don't claim a priority bonus was given for multi-byte matches (logger, explain)

* 📝 🐛 Fix a tiny mistake when logging the languages detected for a specific code page (debug, explain)

* 🐛 Submatch factoring was incorrect in rare cases

* 📝 ⚡ Update performance claims

* 🐛 Passing multiple files to the CLI would not produce a JSON array (files after the first were omitted)
Ousret committed Jul 30, 2021
1 parent 8247f3a commit 558d1e2
Showing 12 changed files with 112 additions and 26 deletions.
1 change: 1 addition & 0 deletions .github/workflows/chardet-bc.yml
@@ -23,6 +23,7 @@ jobs:
pip install -U pip setuptools
pip install -r requirements.txt
pip install -r ./bin/requirements.txt
pip uninstall -y charset-normalizer
- name: Install the package
run: |
python setup.py install
1 change: 1 addition & 0 deletions .github/workflows/detector-coverage.yml
@@ -23,6 +23,7 @@ jobs:
pip install -U pip setuptools
pip install -r requirements.txt
pip install -r ./bin/requirements.txt
pip uninstall -y charset-normalizer
- name: Install the package
run: |
python setup.py install
1 change: 1 addition & 0 deletions .github/workflows/performance.yml
@@ -23,6 +23,7 @@ jobs:
pip install -U pip setuptools
pip install -r requirements.txt
pip install -r ./bin/requirements.txt
pip uninstall -y charset-normalizer
- name: Install the package
run: |
python setup.py install
1 change: 1 addition & 0 deletions .github/workflows/run-tests.yml
@@ -25,6 +25,7 @@ jobs:
run: |
pip install -U pip setuptools
pip install -r requirements.txt
pip uninstall -y charset-normalizer
- name: Install the package
run: |
python setup.py install
8 changes: 4 additions & 4 deletions README.md
@@ -55,13 +55,13 @@ This package offer better performance than its counterpart Chardet. Here are som

| Package | Accuracy | Mean per file (ns) | File per sec (est) |
| ------------- | :-------------: | :------------------: | :------------------: |
| [chardet](https://github.com/chardet/chardet) | 93.0 % | 67 ms | 15.38 file/sec |
| charset-normalizer | **95.0 %** | **37 ms** | 27.77 file/sec |
| [chardet](https://github.com/chardet/chardet) | 93.0 % | 150 ms | 7 file/sec |
| charset-normalizer | **95.0 %** | **36 ms** | 28 file/sec |

| Package | 99th percentile | 95th percentile | 50th percentile |
| ------------- | :-------------: | :------------------: | :------------------: |
| [chardet](https://github.com/chardet/chardet) | 424 ms | 234 ms | 26 ms |
| charset-normalizer | 335 ms | 186 ms | 17 ms |
| [chardet](https://github.com/chardet/chardet) | 647 ms | 250 ms | 24 ms |
| charset-normalizer | 354 ms | 202 ms | 16 ms |

Chardet's performance on larger files (1MB+) is very poor. Expect a huge difference on large payloads.

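
For context, a rough timing sketch along these lines can compare the two detectors on your own data; `sample.bin` is a placeholder path, and the published figures above come from the project's own benchmark tooling, not from this snippet.

```python
# Hedged sketch: time chardet vs charset-normalizer on one payload.
from time import perf_counter

import chardet
from charset_normalizer import from_bytes

with open("sample.bin", "rb") as fp:  # placeholder sample file
    payload = fp.read()

start = perf_counter()
chardet_result = chardet.detect(payload)
chardet_ms = (perf_counter() - start) * 1000

start = perf_counter()
best_match = from_bytes(payload).best()
normalizer_ms = (perf_counter() - start) * 1000

print("chardet:", chardet_result["encoding"], f"{chardet_ms:.1f} ms")
print("charset-normalizer:", best_match.encoding if best_match else None, f"{normalizer_ms:.1f} ms")
```
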
4 changes: 2 additions & 2 deletions charset_normalizer/api.py
@@ -206,7 +206,7 @@ def from_bytes(
multi_byte_bonus = is_multi_byte_decoder and decoded_payload is not None and len(decoded_payload) < length # type: bool

if multi_byte_bonus:
logger.info('Code page %s is a multi byte encoding table and it appear that at least one character was encoded using n-bytes. Should not be a coincidence. Priority +1 given.', encoding_iana)
logger.info('Code page %s is a multi byte encoding table and it appear that at least one character was encoded using n-bytes.', encoding_iana)

max_chunk_gave_up = int(len(r_) / 4) # type: int

@@ -331,7 +331,7 @@ def from_bytes(
logger.info(
"Using %s code page we detected the following languages: %s",
encoding_iana,
results[-1]._languages
results[encoding_iana]._languages
)

if len(results) == 0:
6 changes: 3 additions & 3 deletions charset_normalizer/cli/normalizer.py
@@ -93,6 +93,8 @@ def cli_detect(argv=None):
print('--threshold VALUE should be between 0. AND 1.', file=sys.stderr)
return 1

x_ = []

for my_file in args.files:

matches = from_fp(
@@ -101,8 +103,6 @@
explain=args.verbose
)

x_ = []

if len(matches) == 0:
print('Unable to identify originating encoding for "{}". {}'.format(my_file.name, 'Maybe try increasing maximum amount of chaos.' if args.threshold < 1. else ''), file=sys.stderr)
x_.append(
@@ -202,7 +202,7 @@ def cli_detect(argv=None):
dumps(
[
el.__dict__ for el in x_
] if args.alternatives else x_[0].__dict__,
] if len(x_) > 1 else x_[0].__dict__,
ensure_ascii=True,
indent=4
)
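
The gist of the CLI fix above: the accumulator `x_` must be created once, before the per-file loop, and the JSON output switches to an array only when more than one entry was collected. A stripped-down sketch of that pattern; the file names and the result dict shape are illustrative placeholders, not the real CLI objects.

```python
# Minimal sketch of the accumulation pattern used by the fixed CLI.
from json import dumps

results = []  # created once, before iterating over the files

for path in ["a.txt", "b.txt"]:  # placeholder file list
    results.append({"path": path, "encoding": "utf_8"})  # placeholder detection result

# A single file prints one JSON object; several files print a JSON array.
print(dumps(results if len(results) > 1 else results[0], ensure_ascii=True, indent=4))
```
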
53 changes: 41 additions & 12 deletions charset_normalizer/md.py
@@ -3,7 +3,7 @@

from charset_normalizer.constant import UNICODE_SECONDARY_RANGE_KEYWORD
from charset_normalizer.utils import is_punctuation, is_symbol, unicode_range, is_accentuated, is_latin, \
remove_accent, is_separator, is_cjk
remove_accent, is_separator, is_cjk, is_case_variable, is_hangul, is_katakana, is_hiragana, is_ascii, is_thai


class MessDetectorPlugin:
@@ -140,11 +140,15 @@ def __init__(self):
self._last_latin_character = None # type: Optional[str]

def eligible(self, character: str) -> bool:
return is_latin(character)
return character.isalpha() and is_latin(character)

def feed(self, character: str) -> None:
self._character_count += 1
if self._last_latin_character is not None:
if is_accentuated(character) and is_accentuated(self._last_latin_character):
if character.isupper() and self._last_latin_character.isupper():
self._successive_count += 1
# Worse if its the same char duplicated with different accent.
if remove_accent(character) == remove_accent(self._last_latin_character):
self._successive_count += 1
self._last_latin_character = character
@@ -175,14 +179,14 @@ def eligible(self, character: str) -> bool:
def feed(self, character: str) -> None:
self._character_count += 1

if self._last_printable_seen is None:
self._last_printable_seen = character
return

if character.isspace() or is_punctuation(character):
self._last_printable_seen = None
return

if self._last_printable_seen is None:
self._last_printable_seen = character
return

unicode_range_a = unicode_range(self._last_printable_seen) # type: Optional[str]
unicode_range_b = unicode_range(character) # type: Optional[str]

@@ -215,6 +219,7 @@ def __init__(self):
self._word_count = 0 # type: int
self._bad_word_count = 0 # type: int
self._is_current_word_bad = False # type: bool
self._foreign_long_watch = False # type: bool

self._character_count = 0 # type: int
self._bad_character_count = 0 # type: int
@@ -230,6 +235,8 @@ def feed(self, character: str) -> None:
self._buffer = "".join([self._buffer, character])
if is_accentuated(character):
self._buffer_accent_count += 1
if self._foreign_long_watch is False and is_latin(character) is False and is_cjk(character) is False and is_hangul(character) is False and is_katakana(character) is False and is_hiragana(character) is False and is_thai(character) is False:
self._foreign_long_watch = True
return
if not self._buffer:
return
@@ -241,12 +248,15 @@ def feed(self, character: str) -> None:

if buffer_length >= 4 and self._buffer_accent_count / buffer_length >= 0.3:
self._is_current_word_bad = True
if buffer_length >= 24 and self._foreign_long_watch:
self._is_current_word_bad = True

if self._is_current_word_bad:
self._bad_word_count += 1
self._bad_character_count += len(self._buffer)
self._is_current_word_bad = False

self._foreign_long_watch = False
self._buffer = ""
self._buffer_accent_count = 0
elif character not in {"<", ">", "-", "="} and character.isdigit() is False and is_symbol(character):
@@ -256,14 +266,15 @@ def feed(self, character: str) -> None:
def reset(self) -> None:
self._buffer = ""
self._is_current_word_bad = False
self._foreign_long_watch = False
self._bad_word_count = 0
self._word_count = 0
self._character_count = 0
self._bad_character_count = 0

@property
def ratio(self) -> float:
if self._word_count <= 16:
if self._word_count <= 10:
return 0.

return self._bad_character_count / self._character_count
@@ -313,27 +324,43 @@ def __init__(self):
self._character_count = 0 # type: int

self._last_alpha_seen = None # type: Optional[str]
self._current_ascii_only = True # type: bool

def eligible(self, character: str) -> bool:
return character.isspace() or character.isalpha()
return True

def feed(self, character: str) -> None:
if is_separator(character):
if self._character_count_since_last_sep < 24:
is_concerned = character.isalpha() and is_case_variable(character)
chunk_sep = is_concerned is False

if chunk_sep and self._character_count_since_last_sep > 0:
if self._character_count_since_last_sep <= 64 and character.isdigit() is False and self._current_ascii_only is False:
self._successive_upper_lower_count_final += self._successive_upper_lower_count

self._successive_upper_lower_count = 0
self._character_count_since_last_sep = 0
self._last_alpha_seen = None
self._buf = False
self._character_count += 1
self._current_ascii_only = True

return

if self._current_ascii_only is True and is_ascii(character) is False:
self._current_ascii_only = False

if self._last_alpha_seen is not None:
if (character.isupper() and self._last_alpha_seen.islower()) or (character.islower() and self._last_alpha_seen.isupper()):
if self._buf is True:
self._successive_upper_lower_count += 1
self._successive_upper_lower_count += 2
self._buf = False
else:
self._buf = True
else:
self._buf = False

self._character_count += 1
self._character_count_since_last_sep += 1
self._last_alpha_seen = character

def reset(self) -> None:
Expand All @@ -342,13 +369,15 @@ def reset(self) -> None:
self._successive_upper_lower_count = 0
self._successive_upper_lower_count_final = 0
self._last_alpha_seen = None
self._buf = False
self._current_ascii_only = True

@property
def ratio(self) -> float:
if self._character_count == 0:
return 0.

return (self._successive_upper_lower_count_final * 2) / self._character_count
return self._successive_upper_lower_count_final / self._character_count


def is_suspiciously_successive_range(unicode_range_a: Optional[str], unicode_range_b: Optional[str]) -> bool:
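
To make the new `_foreign_long_watch` logic above easier to follow, here is a self-contained, simplified sketch built on the same `feed`/`ratio` shape the mess-detector plugins use: a word becomes suspicious when it is very long and contains at least one character outside the "expected" scripts. The class name and everything except the 24-character threshold are illustrative; this does not subclass the real `MessDetectorPlugin`.

```python
# Standalone illustration of the foreign-long-word heuristic added in this diff.
import unicodedata

EXPECTED_KEYWORDS = ("LATIN", "CJK", "HANGUL", "KATAKANA", "HIRAGANA", "THAI")


def is_expected_script(character: str) -> bool:
    try:
        name = unicodedata.name(character)
    except ValueError:
        return False
    return any(keyword in name for keyword in EXPECTED_KEYWORDS)


class ForeignLongWordSketch:
    def __init__(self) -> None:
        self._buffer = ""
        self._foreign_seen = False
        self._bad_characters = 0
        self._characters = 0

    def feed(self, character: str) -> None:
        self._characters += 1
        if character.isalpha():
            self._buffer += character
            if not is_expected_script(character):
                self._foreign_seen = True
            return
        # Word boundary: flag very long words containing a foreign character.
        if len(self._buffer) >= 24 and self._foreign_seen:
            self._bad_characters += len(self._buffer)
        self._buffer = ""
        self._foreign_seen = False

    @property
    def ratio(self) -> float:
        return self._bad_characters / self._characters if self._characters else 0.0
```
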
2 changes: 1 addition & 1 deletion charset_normalizer/models.py
@@ -290,7 +290,7 @@ def append(self, item: CharsetMatch) -> None:
# We should disable the submatch factoring when the input file is too heavy (conserve RAM usage)
if len(item.raw) <= TOO_BIG_SEQUENCE:
for match in self._results:
if match.fingerprint == item.fingerprint:
if match.fingerprint == item.fingerprint and match.chaos == item.chaos:
match.add_submatch(item)
return
self._results.append(item)
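
The one-line change above tightens when two results are folded together: an incoming match is now recorded as a submatch only if it shares both the fingerprint and the chaos (mess) score of an existing result. A hedged sketch of that grouping logic; the `Match` dataclass and its fields are placeholders, not the real `CharsetMatch` model.

```python
# Illustrative sketch of the tightened submatch factoring.
from dataclasses import dataclass, field
from typing import List


@dataclass
class Match:
    encoding: str
    fingerprint: str
    chaos: float
    submatches: List["Match"] = field(default_factory=list)


def append(results: List[Match], item: Match) -> None:
    for match in results:
        # Fold only when the decoded output AND the mess score are identical.
        if match.fingerprint == item.fingerprint and match.chaos == item.chaos:
            match.submatches.append(item)
            return
    results.append(item)
```
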
57 changes: 55 additions & 2 deletions charset_normalizer/utils.py
@@ -22,7 +22,7 @@ def is_accentuated(character: str) -> bool:
description = unicodedata.name(character) # type: str
except ValueError:
return False
return "WITH GRAVE" in description or "WITH ACUTE" in description or "WITH CEDILLA" in description
return "WITH GRAVE" in description or "WITH ACUTE" in description or "WITH CEDILLA" in description or "WITH DIAERESIS" in description or "WITH CIRCUMFLEX" in description


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
@@ -64,6 +64,13 @@ def is_latin(character: str) -> bool:
return "LATIN" in description


def is_ascii(character: str) -> bool:
try:
character.encode("ascii")
except UnicodeEncodeError:
return False
return True

@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_punctuation(character: str) -> bool:
character_category = unicodedata.category(character) # type: str
@@ -96,20 +103,26 @@ def is_symbol(character: str) -> bool:

@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_separator(character: str) -> bool:
if character.isspace() or character in ["|", "+"]:
if character.isspace() or character in ["|", "+", ",", ";", "<", ">"]:
return True

character_category = unicodedata.category(character) # type: str

return "Z" in character_category


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_case_variable(character: str) -> bool:
return character.islower() != character.isupper()


def is_private_use_only(character: str) -> bool:
character_category = unicodedata.category(character) # type: str

return "Co" == character_category


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_cjk(character: str) -> bool:
try:
character_name = unicodedata.name(character)
@@ -119,6 +132,46 @@ def is_cjk(character: str) -> bool:
return "CJK" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hiragana(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError:
return False

return "HIRAGANA" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_katakana(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError:
return False

return "KATAKANA" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hangul(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError:
return False

return "HANGUL" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_thai(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError:
return False

return "THAI" in character_name


@lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
def is_unicode_range_secondary(range_name: str) -> bool:
for keyword in UNICODE_SECONDARY_RANGE_KEYWORD:
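
The helpers added above all follow the same pattern: look up the character's Unicode name (or ASCII encodability) and test a keyword. A quick usage sketch, assuming the package is installed; the functions are exactly those introduced in this diff.

```python
# Usage sketch of the single-character helpers added in this diff.
from charset_normalizer.utils import (
    is_ascii,
    is_case_variable,
    is_hangul,
    is_hiragana,
    is_katakana,
    is_thai,
)

assert is_ascii("a") and not is_ascii("é")
assert is_hiragana("ぁ") and not is_hiragana("カ")
assert is_katakana("カ")
assert is_hangul("한")
assert is_thai("ก")
assert is_case_variable("a") and not is_case_variable("3")
```
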
2 changes: 1 addition & 1 deletion charset_normalizer/version.py
@@ -2,5 +2,5 @@
Expose version
"""

__version__ = "2.0.3"
__version__ = "2.0.4"
VERSION = __version__.split('.')
2 changes: 1 addition & 1 deletion tests/test_probe_chaos.py
@@ -40,7 +40,7 @@ def test_subtle_gibberish(self):

self.assertGreater(
mess_ratio("´Á¥½³ø§i -- ±i®Ìºû, ³¯·Ø©v"),
0.7
0.5
)

self.assertGreater(
