❇️ Improvement over Vietnamese detection (#126)

Latin characters with a combining diacritical mark were misdetected, i.e. seen as mess (MD).
Ousret · Oct 23, 2021 · 5c72742 · 5c72742
1 parent 8b52c35
commit 5c72742
Showing 1 changed file with 6 additions and 0 deletions.
diff --git a/charset_normalizer/md.py b/charset_normalizer/md.py
@@ -453,6 +453,12 @@ def is_suspiciously_successive_range(
     if "Emoticons" in unicode_range_a or "Emoticons" in unicode_range_b:
         return False
 
+    # Latin characters can be accompanied by a combining diacritical mark,
+    # e.g. Vietnamese.
+    if "Latin" in unicode_range_a or "Latin" in unicode_range_b:
+        if "Combining" in unicode_range_a or "Combining" in unicode_range_b:
+            return False
+
     keywords_range_a, keywords_range_b = unicode_range_a.split(
         " "
     ), unicode_range_b.split(" ")