Skip to content

Commit

Permalink
MD improvement on trailing data and long foreign (non-pure latin) data (
Browse files Browse the repository at this point in the history
#124)

* ❇️ Always feed a final non-printable character to detect trailing mess

Some mess-detection (MD) plugins require a "separator" character before assessing the current buffer.

* ❇️ Never ignore non-pure-Latin (foreign) words that are too long
  • Loading branch information
Ousret committed Oct 23, 2021
1 parent 5c72742 commit b34d2e3
Showing 1 changed file with 8 additions and 4 deletions.
12 changes: 8 additions & 4 deletions charset_normalizer/md.py
Expand Up @@ -252,6 +252,8 @@ class SuperWeirdWordPlugin(MessDetectorPlugin):
def __init__(self) -> None:
self._word_count = 0 # type: int
self._bad_word_count = 0 # type: int
self._foreign_long_count = 0 # type: int

self._is_current_word_bad = False # type: bool
self._foreign_long_watch = False # type: bool

Expand All @@ -271,7 +273,7 @@ def feed(self, character: str) -> None:
self._buffer_accent_count += 1
if (
self._foreign_long_watch is False
and is_latin(character) is False
and (is_latin(character) is False or is_accentuated(character))
and is_cjk(character) is False
and is_hangul(character) is False
and is_katakana(character) is False
Expand All @@ -293,6 +295,7 @@ def feed(self, character: str) -> None:
if buffer_length >= 4 and self._buffer_accent_count / buffer_length > 0.34:
self._is_current_word_bad = True
if buffer_length >= 24 and self._foreign_long_watch:
self._foreign_long_count += 1
self._is_current_word_bad = True

if self._is_current_word_bad:
Expand All @@ -319,10 +322,11 @@ def reset(self) -> None: # pragma: no cover
self._word_count = 0
self._character_count = 0
self._bad_character_count = 0
self._foreign_long_count = 0

@property
def ratio(self) -> float:
if self._word_count <= 10:
if self._word_count <= 10 and self._foreign_long_count == 0:
return 0.0

return self._bad_character_count / self._character_count
Expand Down Expand Up @@ -515,7 +519,7 @@ def mess_ratio(
md_class() for md_class in MessDetectorPlugin.__subclasses__()
] # type: List[MessDetectorPlugin]

length = len(decoded_sequence) # type: int
length = len(decoded_sequence) + 1 # type: int

mean_mess_ratio = 0.0 # type: float

Expand All @@ -526,7 +530,7 @@ def mess_ratio(
else:
intermediary_mean_mess_ratio_calc = 128

for character, index in zip(decoded_sequence, range(0, length)):
for character, index in zip(decoded_sequence + "\n", range(0, length)):
for detector in detectors:
if detector.eligible(character):
detector.feed(character)
Expand Down

0 comments on commit b34d2e3

Please sign in to comment.