From 00ffea0534b01ad378d1f0830e34aa8710469809 Mon Sep 17 00:00:00 2001 From: TAHRI Ahmed R Date: Mon, 1 Nov 2021 17:42:29 +0100 Subject: [PATCH] =?UTF-8?q?=F0=9F=9B=A0=EF=B8=8F=20Minor=20adjustement=20o?= =?UTF-8?q?n=20the=20MD=20around=20european=20words=20(#133)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * :sparkle: Adjust the MD around obviously badly rendered EU words Partially close #130 * :bookmark: Bump version 2.0.8.dev1 --- charset_normalizer/md.py | 10 ++++++++-- charset_normalizer/version.py | 2 +- tests/test_mess_detection.py | 2 +- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/charset_normalizer/md.py b/charset_normalizer/md.py index cc83f14f..b55e95c4 100644 --- a/charset_normalizer/md.py +++ b/charset_normalizer/md.py @@ -293,8 +293,14 @@ def feed(self, character: str) -> None: self._character_count += buffer_length - if buffer_length >= 4 and self._buffer_accent_count / buffer_length > 0.34: - self._is_current_word_bad = True + if buffer_length >= 4: + if self._buffer_accent_count / buffer_length > 0.34: + self._is_current_word_bad = True + # Words/buffers ending with an upper-case accentuated letter are so rare + # that we consider them all suspicious. Same weight as a foreign_long suspicion. 
+ if is_accentuated(self._buffer[-1]) and self._buffer[-1].isupper(): + self._foreign_long_count += 1 + self._is_current_word_bad = True if buffer_length >= 24 and self._foreign_long_watch: self._foreign_long_count += 1 self._is_current_word_bad = True diff --git a/charset_normalizer/version.py b/charset_normalizer/version.py index 78e2fab2..b06aeb39 100644 --- a/charset_normalizer/version.py +++ b/charset_normalizer/version.py @@ -2,5 +2,5 @@ Expose version """ -__version__ = "2.0.8.dev0" +__version__ = "2.0.8.dev1" VERSION = __version__.split(".") diff --git a/tests/test_mess_detection.py b/tests/test_mess_detection.py index 18d35f16..46eed879 100644 --- a/tests/test_mess_detection.py +++ b/tests/test_mess_detection.py @@ -13,7 +13,7 @@ ("ïstanbul, T■rkiye'nin en kalabal»k, iktisadi ve k■lt■rel aÓ»dan en —nemli", 0.1, 0.5), ("Parce que Óa, c'est la vÕritable histoire de la rencontre avec votre Tante Robin.", 0.01, 0.5), ("""ØĢØŠØģاØĶŲ„ Ų„Ųˆ ØĢŲ† اŲ„Ų†Ø§Øģ ŲŠŲˆŲ… Ų…ا ØģŲˆŲŲŠØŠØģاØĶŲ„ŲˆŲ†ØŒ ØŊØđŲ†Ø§ Ų†ØģŲ…Øđ ØđŲ† (ŲØąŲˆØŊŲˆ) ŲˆØ§Ų„ØŪا؊Ų…""", 0.8, 2.0), - ("""ÇáÚŞáíÉ , ÇáÊäæíã ÇáãÛäÇØíÓí æ / Ãæ ÇáÇŞÊÑÇÍ""", 0.8, 2.0), + ("""ÇáÚŞáíÉ , ÇáÊäæíã ÇáãÛäÇØíÓí æ / Ãæ ÇáÇŞÊÑÇÍ""", 0.8, 2.5), ("""hishamkoc@yahoo.com ุชุฑุฌู…ู€ู€ุฉ ู‡ู€ุดู€ู€ู€ุงู… ุงู„ู€ู‚ู€ู€ู€ู€ู„ุงูRadoZ ุชู€ู€ู€ุนู€ู€ู€ู€ุฏูŠู€ู€ู„ ุงู„ู€ู€ู€ุชู€ู€ู€ู€ูˆู‚ู€ู€ูŠู€ู€ู€ู€ุช ู…ู€ู€ู€ู† ู‚ู€ู€ุจู€ู€ู„""", 0.5, 2.0) ]