Skip to content

Commit

Permalink
🛠️ Minor adjustment on the MD around European words (#133)
Browse files Browse the repository at this point in the history
* ❇️ Adjust the MD around obviously badly rendered EU words
Partially close #130
* 🔖 Bump version 2.0.8.dev1
  • Loading branch information
Ousret committed Nov 1, 2021
1 parent 618cd8c commit 00ffea0
Show file tree
Hide file tree
Showing 3 changed files with 10 additions and 4 deletions.
10 changes: 8 additions & 2 deletions charset_normalizer/md.py
Expand Up @@ -293,8 +293,14 @@ def feed(self, character: str) -> None:

self._character_count += buffer_length

if buffer_length >= 4 and self._buffer_accent_count / buffer_length > 0.34:
self._is_current_word_bad = True
if buffer_length >= 4:
if self._buffer_accent_count / buffer_length > 0.34:
self._is_current_word_bad = True
# Words/buffers ending with an upper-case accentuated letter are so rare
# that we consider them all suspicious. They carry the same weight as a suspicious foreign_long word.
if is_accentuated(self._buffer[-1]) and self._buffer[-1].isupper():
self._foreign_long_count += 1
self._is_current_word_bad = True
if buffer_length >= 24 and self._foreign_long_watch:
self._foreign_long_count += 1
self._is_current_word_bad = True
Expand Down
2 changes: 1 addition & 1 deletion charset_normalizer/version.py
Expand Up @@ -2,5 +2,5 @@
Expose version
"""

__version__ = "2.0.8.dev0"
__version__ = "2.0.8.dev1"
VERSION = __version__.split(".")
2 changes: 1 addition & 1 deletion tests/test_mess_detection.py
Expand Up @@ -13,7 +13,7 @@
("ïstanbul, T■rkiye'nin en kalabal»k, iktisadi ve k■lt■rel aÓ»dan en —nemli", 0.1, 0.5),
("<i>Parce que Óa, c'est la vÕritable histoire de la rencontre avec votre Tante Robin.</i>", 0.01, 0.5),
("""ØĢØŠØģاØĶŲ„ Ų„Ųˆ ØĢŲ† اŲ„Ų†Ø§Øģ ŲŠŲˆŲ… Ų…ا ØģŲˆŲŲŠØŠØģاØĶŲ„ŲˆŲ†ØŒ ØŊØđŲ†Ø§ Ų†ØģŲ…Øđ ØđŲ† (ŲØąŲˆØŊŲˆ) ŲˆØ§Ų„ØŪا؊Ų…""", 0.8, 2.0),
("""ÇáÚŞáíÉ , ÇáÊäæíã ÇáãÛäÇØíÓí æ / Ãæ ÇáÇŞÊÑÇÍ""", 0.8, 2.0),
("""ÇáÚŞáíÉ , ÇáÊäæíã ÇáãÛäÇØíÓí æ / Ãæ ÇáÇŞÊÑÇÍ""", 0.8, 2.5),
("""hishamkoc@yahoo.com ุชุฑุฌู…ู€ู€ุฉ ู‡ู€ุดู€ู€ู€ุงู… ุงู„ู€ู‚ู€ู€ู€ู€ู„ุงูRadoZ ุชู€ู€ู€ุนู€ู€ู€ู€ุฏูŠู€ู€ู„ ุงู„ู€ู€ู€ุชู€ู€ู€ู€ูˆู‚ู€ู€ูŠู€ู€ู€ู€ุช ู…ู€ู€ู€ู† ู‚ู€ู€ุจู€ู€ู„""", 0.5, 2.0)
]
Expand Down

0 comments on commit 00ffea0

Please sign in to comment.