From 89b30a0cd59e9f3460c77abeffd2e605b0972469 Mon Sep 17 00:00:00 2001 From: Ahmed TAHRI Date: Mon, 1 Nov 2021 17:24:09 +0100 Subject: [PATCH 1/3] :sparkle: Adjust the MD around obvious bad EU word rendered Partially close #130 --- charset_normalizer/md.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/charset_normalizer/md.py b/charset_normalizer/md.py index cc83f14f..b55e95c4 100644 --- a/charset_normalizer/md.py +++ b/charset_normalizer/md.py @@ -293,8 +293,14 @@ def feed(self, character: str) -> None: self._character_count += buffer_length - if buffer_length >= 4 and self._buffer_accent_count / buffer_length > 0.34: - self._is_current_word_bad = True + if buffer_length >= 4: + if self._buffer_accent_count / buffer_length > 0.34: + self._is_current_word_bad = True + # Words/buffers ending with an upper-case accentuated letter are so rare + # that we consider them all suspicious. They get the same weight as a suspicious foreign_long word. + if is_accentuated(self._buffer[-1]) and self._buffer[-1].isupper(): + self._foreign_long_count += 1 + self._is_current_word_bad = True if buffer_length >= 24 and self._foreign_long_watch: self._foreign_long_count += 1 self._is_current_word_bad = True From b73faf3ed1909fa7211e94620608ab6b6230c47d Mon Sep 17 00:00:00 2001 From: Ahmed TAHRI Date: Mon, 1 Nov 2021 17:24:26 +0100 Subject: [PATCH 2/3] :bookmark: Bump version 2.0.8.dev1 --- charset_normalizer/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charset_normalizer/version.py b/charset_normalizer/version.py index 78e2fab2..b06aeb39 100644 --- a/charset_normalizer/version.py +++ b/charset_normalizer/version.py @@ -2,5 +2,5 @@ Expose version """ -__version__ = "2.0.8.dev0" +__version__ = "2.0.8.dev1" VERSION = __version__.split(".") From 5fd3dd000db46c0807e426a26659472de8d11671 Mon Sep 17 00:00:00 2001 From: Ahmed TAHRI Date: Mon, 1 Nov 2021 17:29:30 +0100 Subject: [PATCH 3/3] :heavy_check_mark: Adjust MD tests on expected max ratio
--- tests/test_mess_detection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_mess_detection.py b/tests/test_mess_detection.py index 18d35f16..46eed879 100644 --- a/tests/test_mess_detection.py +++ b/tests/test_mess_detection.py @@ -13,7 +13,7 @@ ("ïstanbul, T■rkiye'nin en kalabal»k, iktisadi ve k■lt■rel aÓ»dan en —nemli", 0.1, 0.5), ("Parce que Óa, c'est la vÕritable histoire de la rencontre avec votre Tante Robin.", 0.01, 0.5), ("""ØĢØŠØģاØĶŲ„ Ų„Ųˆ ØĢŲ† اŲ„Ų†Ø§Øģ ŲŠŲˆŲ… Ų…ا ØģŲˆŲŲŠØŠØģاØĶŲ„ŲˆŲ†ØŒ ØŊØđŲ†Ø§ Ų†ØģŲ…Øđ ØđŲ† (ŲØąŲˆØŊŲˆ) ŲˆØ§Ų„ØŪا؊Ų…""", 0.8, 2.0), - ("""ÇáÚŞáíÉ , ÇáÊäæíã ÇáãÛäÇØíÓí æ / Ãæ ÇáÇŞÊÑÇÍ""", 0.8, 2.0), + ("""ÇáÚŞáíÉ , ÇáÊäæíã ÇáãÛäÇØíÓí æ / Ãæ ÇáÇŞÊÑÇÍ""", 0.8, 2.5), ("""hishamkoc@yahoo.com ุชุฑุฌู…ู€ู€ุฉ ู‡ู€ุดู€ู€ู€ุงู… ุงู„ู€ู‚ู€ู€ู€ู€ู„ุงูRadoZ ุชู€ู€ู€ุนู€ู€ู€ู€ุฏูŠู€ู€ู„ ุงู„ู€ู€ู€ุชู€ู€ู€ู€ูˆู‚ู€ู€ูŠู€ู€ู€ู€ุช ู…ู€ู€ู€ู† ู‚ู€ู€ุจู€ู€ู„""", 0.5, 2.0) ]