❇️ Various detection improvement (MD+CD) (#117)
* ❇️ Various detection improvement (MD+CD)

* ❇️ Ignore non-latin based language upon latin CPs

* 🔧 Using OrderedDict in alpha_unicode_split layers

* 🐛 Use OrderedDict in coherence merger results for py35 sake..
Ousret committed Sep 26, 2021
1 parent 42a7d3d commit cf1f76a
Showing 3 changed files with 52 additions and 14 deletions.
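Two of the bullets above swap a plain dict for collections.OrderedDict. CPython only guarantees insertion-ordered dicts from 3.7 onward (3.6 as an implementation detail), so on Python 3.5 any result built by iterating a plain dict could vary between runs. A minimal illustration of the property these changes rely on, using made-up sample data rather than anything from the diff:

from collections import OrderedDict

# Group characters by a coarse alphabet label while preserving first-seen
# order -- the ordering property the patched alpha_unicode_split needs
# from its `layers` mapping.
layers = OrderedDict()
for ch in "abc\u05d0\u05d1":
    key = "Hebrew" if "\u0590" <= ch <= "\u05ff" else "Basic Latin"
    layers[key] = layers.get(key, "") + ch

print(list(layers.items()))
# [('Basic Latin', 'abc'), ('Hebrew', 'אב')] -- stable even on Python 3.5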
61 changes: 49 additions & 12 deletions charset_normalizer/cd.py
@@ -1,14 +1,20 @@
 import importlib
 from codecs import IncrementalDecoder
-from collections import Counter
+from collections import Counter, OrderedDict
 from functools import lru_cache
 from typing import Dict, List, Optional, Tuple
 
 from .assets import FREQUENCIES
-from .constant import KO_NAMES, ZH_NAMES
+from .constant import KO_NAMES, TOO_SMALL_SEQUENCE, ZH_NAMES
 from .md import is_suspiciously_successive_range
 from .models import CoherenceMatches
-from .utils import is_multi_byte_encoding, is_unicode_range_secondary, unicode_range
+from .utils import (
+    is_accentuated,
+    is_latin,
+    is_multi_byte_encoding,
+    is_unicode_range_secondary,
+    unicode_range,
+)
 
 
 def encoding_unicode_range(iana_name: str) -> List[str]:
@@ -104,23 +110,52 @@ def mb_encoding_languages(iana_name: str) -> List[str]:
     return []
 
 
-def alphabet_languages(characters: List[str]) -> List[str]:
+def alphabet_languages(
+    characters: List[str], ignore_non_latin: bool = False
+) -> List[str]:
     """
     Return the languages associated with the given characters.
     """
-    languages = []  # type: List[str]
+    languages = []  # type: List[Tuple[str, float]]
+
+    source_have_accents = False  # type: bool
+
+    for character in characters:
+        if is_accentuated(character):
+            source_have_accents = True
+            break
 
     for language, language_characters in FREQUENCIES.items():
 
+        target_have_accents = False  # type: bool
+        target_pure_latin = True  # type: bool
+
+        for language_character in language_characters:
+            if target_have_accents is False and is_accentuated(language_character):
+                target_have_accents = True
+            if target_pure_latin is True and is_latin(language_character) is False:
+                target_pure_latin = False
+
+        if ignore_non_latin and target_pure_latin is False:
+            continue
+
+        if target_have_accents is False and source_have_accents:
+            continue
+
         character_count = len(language_characters)  # type: int
 
         character_match_count = len(
             [c for c in language_characters if c in characters]
         )  # type: int
 
-        if character_match_count / character_count >= 0.2:
-            languages.append(language)
+        ratio = character_match_count / character_count  # type: float
 
-    return languages
+        if ratio >= 0.2:
+            languages.append((language, ratio))
+
+    languages = sorted(languages, key=lambda x: x[1], reverse=True)
+
+    return [compatible_language[0] for compatible_language in languages]
 
 
 def characters_popularity_compare(
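The reworked alphabet_languages above boils down to three new rules: skip non-latin alphabets when the caller asks for latin-only candidates, skip accent-free languages when the input itself carries accents, and return candidates sorted by match ratio. A self-contained sketch of those rules, with rough hypothetical stand-ins for the is_latin/is_accentuated helpers and a toy table in place of the FREQUENCIES asset:

import unicodedata

def is_latin(ch):
    # rough stand-in for charset_normalizer.utils.is_latin
    return "LATIN" in unicodedata.name(ch, "")

def is_accentuated(ch):
    # rough stand-in: accented letters decompose (e.g. é -> e + combining acute)
    return unicodedata.decomposition(ch) != ""

TOY_FREQUENCIES = {
    "English": list("etaoinshrdlu"),
    "French": list("esaitnrulodé"),
    "Hebrew": list("איהונתםשלרב"),
}

def alphabet_languages(characters, ignore_non_latin=False):
    languages = []
    source_have_accents = any(is_accentuated(c) for c in characters)
    for language, language_characters in TOY_FREQUENCIES.items():
        target_pure_latin = all(is_latin(c) for c in language_characters)
        target_have_accents = any(is_accentuated(c) for c in language_characters)
        if ignore_non_latin and not target_pure_latin:
            continue  # rule 1: latin-only request drops Hebrew & co.
        if source_have_accents and not target_have_accents:
            continue  # rule 2: accented input cannot be accent-free English
        ratio = sum(c in characters for c in language_characters) / len(language_characters)
        if ratio >= 0.2:
            languages.append((language, ratio))
    languages.sort(key=lambda x: x[1], reverse=True)
    return [language for language, _ in languages]  # rule 3: best match first

print(alphabet_languages(list("étaisnru"), ignore_non_latin=True))  # ['French']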
@@ -189,7 +224,7 @@ def alpha_unicode_split(decoded_sequence: str) -> List[str]:
     Ex. a text containing English/Latin with a bit of Hebrew will return two items in the resulting list;
     one containing the Latin letters and the other the Hebrew.
     """
-    layers = {}  # type: Dict[str, str]
+    layers = OrderedDict()  # type: Dict[str, str]
 
     for character in decoded_sequence:
         if character.isalpha() is False:
@@ -227,7 +262,7 @@ def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches:
     This function merges results previously given by the function coherence_ratio.
     The return type is the same as coherence_ratio.
     """
-    per_language_ratios = {}  # type: Dict[str, List[float]]
+    per_language_ratios = OrderedDict()  # type: Dict[str, List[float]]
     merge = []  # type: CoherenceMatches
 
     for result in results:
@@ -264,13 +299,15 @@ def coherence_ratio(

     results = []  # type: List[Tuple[str, float]]
     lg_inclusion_list = []  # type: List[str]
+    ignore_non_latin = False  # type: bool
 
     sufficient_match_count = 0  # type: int
 
     if lg_inclusion is not None:
         lg_inclusion_list = lg_inclusion.split(",")
 
     if "Latin Based" in lg_inclusion_list:
+        ignore_non_latin = True
         lg_inclusion_list.remove("Latin Based")
 
     for layer in alpha_unicode_split(decoded_sequence):
@@ -279,13 +316,13 @@

         character_count = sum([o for c, o in most_common])  # type: int
 
-        if character_count <= 32:
+        if character_count <= TOO_SMALL_SEQUENCE:
             continue
 
         popular_character_ordered = [c for c, o in most_common]  # type: List[str]
 
         for language in lg_inclusion_list or alphabet_languages(
-            popular_character_ordered
+            popular_character_ordered, ignore_non_latin
         ):
             ratio = characters_popularity_compare(
                 language, popular_character_ordered
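In coherence_ratio, the change is pure plumbing: the special "Latin Based" token in the comma-separated lg_inclusion hint is now consumed as a flag rather than treated as a language name, then forwarded to alphabet_languages. A sketch of just that parsing step (the call shape follows the diff):

def parse_lg_inclusion(lg_inclusion=None):
    lg_inclusion_list = lg_inclusion.split(",") if lg_inclusion is not None else []
    ignore_non_latin = False
    if "Latin Based" in lg_inclusion_list:
        ignore_non_latin = True
        lg_inclusion_list.remove("Latin Based")  # sentinel token, not a language
    return lg_inclusion_list, ignore_non_latin

print(parse_lg_inclusion("Latin Based"))     # ([], True)
print(parse_lg_inclusion("English,French"))  # (['English', 'French'], False)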
2 changes: 1 addition & 1 deletion charset_normalizer/md.py
@@ -290,7 +290,7 @@ def feed(self, character: str) -> None:

         self._character_count += buffer_length
 
-        if buffer_length >= 4 and self._buffer_accent_count / buffer_length >= 0.3:
+        if buffer_length >= 4 and self._buffer_accent_count / buffer_length > 0.34:
             self._is_current_word_bad = True
         if buffer_length >= 24 and self._foreign_long_watch:
             self._is_current_word_bad = True
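The md.py tweak relaxes the accent-density heuristic: a word used to be marked bad at an accent ratio of 0.3 or more, and must now exceed 0.34. A quick worked comparison (the counting here is a simplified stand-in for the real mess-detector state):

def word_is_bad(buffer_length, accent_count, new_rule=True):
    if buffer_length < 4:
        return False
    ratio = accent_count / buffer_length
    return ratio > 0.34 if new_rule else ratio >= 0.3

# A 6-letter word with 2 accented letters: ratio = 0.333...
print(word_is_bad(6, 2, new_rule=False))  # True  -- old threshold flagged it
print(word_is_bad(6, 2, new_rule=True))   # False -- now tolerated

Legitimate words in accent-heavy Latin-script languages sit right around that band, which is presumably what motivated the looser bound.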
3 changes: 2 additions & 1 deletion charset_normalizer/models.py
@@ -54,9 +54,10 @@ def __lt__(self, other: object) -> bool:
             raise ValueError
 
         chaos_difference = abs(self.chaos - other.chaos)  # type: float
+        coherence_difference = abs(self.coherence - other.coherence)  # type: float
 
         # Below 1% difference --> Use Coherence
-        if chaos_difference < 0.01:
+        if chaos_difference < 0.01 and coherence_difference > 0.02:
             # When having a tough decision, use the result that decoded as many multi-byte as possible.
             if chaos_difference == 0.0 and self.coherence == other.coherence:
                 return self.multi_byte_usage > other.multi_byte_usage
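The models.py change tightens when coherence may override chaos in CharsetMatch ordering: chaos must be within 1% and the coherence gap must now also exceed 2%, otherwise chaos keeps deciding. A condensed, free-standing sketch of the comparison (class scaffolding omitted; field names follow the diff):

def beats(self_chaos, self_coherence, other_chaos, other_coherence,
          self_mb_usage=0.0, other_mb_usage=0.0):
    chaos_difference = abs(self_chaos - other_chaos)
    coherence_difference = abs(self_coherence - other_coherence)
    if chaos_difference < 0.01 and coherence_difference > 0.02:
        if chaos_difference == 0.0 and self_coherence == other_coherence:
            # dead heat: prefer the candidate that decoded more multi-byte
            return self_mb_usage > other_mb_usage
        return self_coherence > other_coherence
    return self_chaos < other_chaos

# Near-equal chaos but a negligible coherence gap: chaos now decides.
print(beats(0.010, 0.50, 0.012, 0.51))  # True (lower chaos wins)

Note that with the added guard the inner multi-byte tie-break looks unreachable, since a coherence gap above 2% rules out equal coherence; it appears to survive as a safeguard.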
