Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MD improvement on trailing data and long foreign (non-pure Latin) data #124

Merged
Merged 5 commits on Oct 23, 2021
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
12 changes: 8 additions & 4 deletions charset_normalizer/md.py
Expand Up @@ -252,6 +252,8 @@ class SuperWeirdWordPlugin(MessDetectorPlugin):
def __init__(self) -> None:
self._word_count = 0 # type: int
self._bad_word_count = 0 # type: int
self._foreign_long_count = 0 # type: int

self._is_current_word_bad = False # type: bool
self._foreign_long_watch = False # type: bool

Expand All @@ -271,7 +273,7 @@ def feed(self, character: str) -> None:
self._buffer_accent_count += 1
if (
self._foreign_long_watch is False
and is_latin(character) is False
and (is_latin(character) is False or is_accentuated(character))
and is_cjk(character) is False
and is_hangul(character) is False
and is_katakana(character) is False
Expand All @@ -293,6 +295,7 @@ def feed(self, character: str) -> None:
if buffer_length >= 4 and self._buffer_accent_count / buffer_length > 0.34:
self._is_current_word_bad = True
if buffer_length >= 24 and self._foreign_long_watch:
self._foreign_long_count += 1
self._is_current_word_bad = True

if self._is_current_word_bad:
Expand All @@ -319,10 +322,11 @@ def reset(self) -> None: # pragma: no cover
self._word_count = 0
self._character_count = 0
self._bad_character_count = 0
self._foreign_long_count = 0

@property
def ratio(self) -> float:
if self._word_count <= 10:
if self._word_count <= 10 and self._foreign_long_count == 0:
return 0.0

return self._bad_character_count / self._character_count
Expand Down Expand Up @@ -509,7 +513,7 @@ def mess_ratio(
md_class() for md_class in MessDetectorPlugin.__subclasses__()
] # type: List[MessDetectorPlugin]

length = len(decoded_sequence) # type: int
length = len(decoded_sequence) + 1 # type: int

mean_mess_ratio = 0.0 # type: float

Expand All @@ -520,7 +524,7 @@ def mess_ratio(
else:
intermediary_mean_mess_ratio_calc = 128

for character, index in zip(decoded_sequence, range(0, length)):
for character, index in zip(decoded_sequence + "\n", range(0, length)):
for detector in detectors:
if detector.eligible(character):
detector.feed(character)
Expand Down