diff --git a/charset_normalizer/md.py b/charset_normalizer/md.py index cc83f14f..b55e95c4 100644 --- a/charset_normalizer/md.py +++ b/charset_normalizer/md.py @@ -293,8 +293,14 @@ def feed(self, character: str) -> None: self._character_count += buffer_length - if buffer_length >= 4 and self._buffer_accent_count / buffer_length > 0.34: - self._is_current_word_bad = True + if buffer_length >= 4: + if self._buffer_accent_count / buffer_length > 0.34: + self._is_current_word_bad = True + # Words/buffers ending with an upper-case accentuated letter are so rare + # that we consider them all suspicious. They get the same weight as foreign_long suspicious words. + if is_accentuated(self._buffer[-1]) and self._buffer[-1].isupper(): + self._foreign_long_count += 1 + self._is_current_word_bad = True if buffer_length >= 24 and self._foreign_long_watch: self._foreign_long_count += 1 self._is_current_word_bad = True diff --git a/charset_normalizer/version.py b/charset_normalizer/version.py index 78e2fab2..b06aeb39 100644 --- a/charset_normalizer/version.py +++ b/charset_normalizer/version.py @@ -2,5 +2,5 @@ Expose version """ -__version__ = "2.0.8.dev0" +__version__ = "2.0.8.dev1" VERSION = __version__.split(".") diff --git a/tests/test_mess_detection.py b/tests/test_mess_detection.py index 18d35f16..46eed879 100644 --- a/tests/test_mess_detection.py +++ b/tests/test_mess_detection.py @@ -13,7 +13,7 @@ ("ïstanbul, T■rkiye'nin en kalabal»k, iktisadi ve k■lt■rel aÓ»dan en —nemli", 0.1, 0.5), ("Parce que Óa, c'est la vÕritable histoire de la rencontre avec votre Tante Robin.", 0.01, 0.5), ("""ØĢØŠØģاØĶŲ„ Ų„Ųˆ ØĢŲ† اŲ„Ų†Ø§Øģ ŲŠŲˆŲ… Ų…ا ØģŲˆŲŲŠØŠØģاØĶŲ„ŲˆŲ†ØŒ ØŊØđŲ†Ø§ Ų†ØģŲ…Øđ ØđŲ† (ŲØąŲˆØŊŲˆ) ŲˆØ§Ų„ØŪا؊Ų…""", 0.8, 2.0), - ("""ÇáÚŞáíÉ , ÇáÊäæíã ÇáãÛäÇØíÓí æ / Ãæ ÇáÇŞÊÑÇÍ""", 0.8, 2.0), + ("""ÇáÚŞáíÉ , ÇáÊäæíã ÇáãÛäÇØíÓí æ / Ãæ ÇáÇŞÊÑÇÍ""", 0.8, 2.5), ("""hishamkoc@yahoo.com ุชุฑุฌู…ู€ู€ุฉ ู‡ู€ุดู€ู€ู€ุงู… ุงู„ู€ู‚ู€ู€ู€ู€ู„ุงูRadoZ ุชู€ู€ู€ุนู€ู€ู€ู€ุฏูŠู€ู€ู„ 
ุงู„ู€ู€ู€ุชู€ู€ู€ู€ูˆู‚ู€ู€ูŠู€ู€ู€ู€ุช ู…ู€ู€ู€ู† ู‚ู€ู€ุจู€ู€ู„""", 0.5, 2.0) ]