From 28b05f9286cea3634198e0d7170464469200ea87 Mon Sep 17 00:00:00 2001 From: Ahmed TAHRI Date: Sat, 23 Oct 2021 21:59:39 +0200 Subject: [PATCH] :sparkle: Improvement over Vietnamese detection Latin characters with combining diacritical marks were misdetected/seen as mess (MD). --- charset_normalizer/md.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/charset_normalizer/md.py b/charset_normalizer/md.py index 2146d61d..03a49fa5 100644 --- a/charset_normalizer/md.py +++ b/charset_normalizer/md.py @@ -453,6 +453,12 @@ def is_suspiciously_successive_range( if "Emoticons" in unicode_range_a or "Emoticons" in unicode_range_b: return False + # Latin characters can be accompanied with a combining diacritical mark + # eg. Vietnamese. + if "Latin" in unicode_range_a or "Latin" in unicode_range_b: + if "Combining" in unicode_range_a or "Combining" in unicode_range_b: + return False + keywords_range_a, keywords_range_b = unicode_range_a.split( " " ), unicode_range_b.split(" ")