❇️ Various detection improvement (MD+CD) (#117)
* ❇️ Various detection improvement (MD+CD)

* ❇️ Ignore non-latin based language upon latin CPs

* 🔧 Using OrderedDict in alpha_unicode_split layers

* 🐛 Use OrderedDict in coherence merger results for py35 sake..
Ousret committed Sep 26, 2021
1 parent 42a7d3d commit cf1f76a
Showing 3 changed files with 52 additions and 14 deletions.
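Two of the bullets above swap a plain dict for collections.OrderedDict. CPython only guarantees insertion-ordered dicts from 3.7 onward (3.6 as an implementation detail), so on Python 3.5 any result built by iterating a plain dict could vary between runs. A minimal illustration of the property these changes rely on, using made-up sample data rather than anything from the diff:

from collections import OrderedDict

# Group characters by a coarse alphabet label while preserving first-seen
# order -- the ordering property the patched alpha_unicode_split needs
# from its `layers` mapping.
layers = OrderedDict()
for ch in "abc\u05d0\u05d1":
    key = "Hebrew" if "\u0590" <= ch <= "\u05ff" else "Basic Latin"
    layers[key] = layers.get(key, "") + ch

print(list(layers.items()))
# [('Basic Latin', 'abc'), ('Hebrew', 'אב')] -- stable even on Python 3.5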
61 changes: 49 additions & 12 deletions charset_normalizer/cd.py
@@ -1,14 +1,20 @@
 import importlib
 from codecs import IncrementalDecoder
-from collections import Counter
+from collections import Counter, OrderedDict
 from functools import lru_cache
 from typing import Dict, List, Optional, Tuple
 
 from .assets import FREQUENCIES
-from .constant import KO_NAMES, ZH_NAMES
+from .constant import KO_NAMES, TOO_SMALL_SEQUENCE, ZH_NAMES
 from .md import is_suspiciously_successive_range
 from .models import CoherenceMatches
-from .utils import is_multi_byte_encoding, is_unicode_range_secondary, unicode_range
+from .utils import (
+    is_accentuated,
+    is_latin,
+    is_multi_byte_encoding,
+    is_unicode_range_secondary,
+    unicode_range,
+)
 
 
 def encoding_unicode_range(iana_name: str) -> List[str]:
@@ -104,23 +110,52 @@ def mb_encoding_languages(iana_name: str) -> List[str]:
     return []
 
 
-def alphabet_languages(characters: List[str]) -> List[str]:
+def alphabet_languages(
+    characters: List[str], ignore_non_latin: bool = False
+) -> List[str]:
     """
     Return the languages associated with the given characters.
     """
-    languages = []  # type: List[str]
+    languages = []  # type: List[Tuple[str, float]]
+
+    source_have_accents = False  # type: bool
+
+    for character in characters:
+        if is_accentuated(character):
+            source_have_accents = True
+            break
 
     for language, language_characters in FREQUENCIES.items():
 
+        target_have_accents = False  # type: bool
+        target_pure_latin = True  # type: bool
+
+        for language_character in language_characters:
+            if target_have_accents is False and is_accentuated(language_character):
+                target_have_accents = True
+            if target_pure_latin is True and is_latin(language_character) is False:
+                target_pure_latin = False
+
+        if ignore_non_latin and target_pure_latin is False:
+            continue
+
+        if target_have_accents is False and source_have_accents:
+            continue
+
         character_count = len(language_characters)  # type: int
 
         character_match_count = len(
             [c for c in language_characters if c in characters]
         )  # type: int
 
-        if character_match_count / character_count >= 0.2:
-            languages.append(language)
+        ratio = character_match_count / character_count  # type: float
 
-    return languages
+        if ratio >= 0.2:
+            languages.append((language, ratio))
+
+    languages = sorted(languages, key=lambda x: x[1], reverse=True)
+
+    return [compatible_language[0] for compatible_language in languages]
 
 
 def characters_popularity_compare(
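The reworked alphabet_languages above boils down to three new rules: skip non-latin alphabets when the caller asks for latin-only candidates, skip accent-free languages when the input itself carries accents, and return candidates sorted by match ratio. A self-contained sketch of those rules, with rough hypothetical stand-ins for the is_latin/is_accentuated helpers and a toy table in place of the FREQUENCIES asset:

import unicodedata

def is_latin(ch):
    # rough stand-in for charset_normalizer.utils.is_latin
    return "LATIN" in unicodedata.name(ch, "")

def is_accentuated(ch):
    # rough stand-in: accented letters decompose (e.g. é -> e + combining acute)
    return unicodedata.decomposition(ch) != ""

TOY_FREQUENCIES = {
    "English": list("etaoinshrdlu"),
    "French": list("esaitnrulodé"),
    "Hebrew": list("איהונתםשלרב"),
}

def alphabet_languages(characters, ignore_non_latin=False):
    languages = []
    source_have_accents = any(is_accentuated(c) for c in characters)
    for language, language_characters in TOY_FREQUENCIES.items():
        target_pure_latin = all(is_latin(c) for c in language_characters)
        target_have_accents = any(is_accentuated(c) for c in language_characters)
        if ignore_non_latin and not target_pure_latin:
            continue  # rule 1: latin-only request drops Hebrew & co.
        if source_have_accents and not target_have_accents:
            continue  # rule 2: accented input cannot be accent-free English
        ratio = sum(c in characters for c in language_characters) / len(language_characters)
        if ratio >= 0.2:
            languages.append((language, ratio))
    languages.sort(key=lambda x: x[1], reverse=True)
    return [language for language, _ in languages]  # rule 3: best match first

print(alphabet_languages(list("étaisnru"), ignore_non_latin=True))  # ['French']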
@@ -189,7 +224,7 @@ def alpha_unicode_split(decoded_sequence: str) -> List[str]:
     Ex. a text containing English/Latin with a bit of Hebrew will return two items in the resulting list;
     one containing the Latin letters and the other the Hebrew.
     """
-    layers = {}  # type: Dict[str, str]
+    layers = OrderedDict()  # type: Dict[str, str]
 
     for character in decoded_sequence:
         if character.isalpha() is False:
@@ -227,7 +262,7 @@ def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches:
     This function merges results previously given by the function coherence_ratio.
     The return type is the same as coherence_ratio.
     """
-    per_language_ratios = {}  # type: Dict[str, List[float]]
+    per_language_ratios = OrderedDict()  # type: Dict[str, List[float]]
     merge = []  # type: CoherenceMatches
 
     for result in results:
@@ -264,13 +299,15 @@ def coherence_ratio(

     results = []  # type: List[Tuple[str, float]]
     lg_inclusion_list = []  # type: List[str]
+    ignore_non_latin = False  # type: bool
 
     sufficient_match_count = 0  # type: int
 
     if lg_inclusion is not None:
         lg_inclusion_list = lg_inclusion.split(",")
 
     if "Latin Based" in lg_inclusion_list:
+        ignore_non_latin = True
         lg_inclusion_list.remove("Latin Based")
 
     for layer in alpha_unicode_split(decoded_sequence):
@@ -279,13 +316,13 @@

         character_count = sum([o for c, o in most_common])  # type: int
 
-        if character_count <= 32:
+        if character_count <= TOO_SMALL_SEQUENCE:
             continue
 
         popular_character_ordered = [c for c, o in most_common]  # type: List[str]
 
         for language in lg_inclusion_list or alphabet_languages(
-            popular_character_ordered
+            popular_character_ordered, ignore_non_latin
         ):
             ratio = characters_popularity_compare(
                 language, popular_character_ordered
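In coherence_ratio, the change is pure plumbing: the special "Latin Based" token in the comma-separated lg_inclusion hint is now consumed as a flag rather than treated as a language name, then forwarded to alphabet_languages. A sketch of just that parsing step (the call shape follows the diff):

def parse_lg_inclusion(lg_inclusion=None):
    lg_inclusion_list = lg_inclusion.split(",") if lg_inclusion is not None else []
    ignore_non_latin = False
    if "Latin Based" in lg_inclusion_list:
        ignore_non_latin = True
        lg_inclusion_list.remove("Latin Based")  # sentinel token, not a language
    return lg_inclusion_list, ignore_non_latin

print(parse_lg_inclusion("Latin Based"))     # ([], True)
print(parse_lg_inclusion("English,French"))  # (['English', 'French'], False)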
2 changes: 1 addition & 1 deletion charset_normalizer/md.py
@@ -290,7 +290,7 @@ def feed(self, character: str) -> None:

         self._character_count += buffer_length
 
-        if buffer_length >= 4 and self._buffer_accent_count / buffer_length >= 0.3:
+        if buffer_length >= 4 and self._buffer_accent_count / buffer_length > 0.34:
             self._is_current_word_bad = True
         if buffer_length >= 24 and self._foreign_long_watch:
             self._is_current_word_bad = True
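The md.py tweak relaxes the accent-density heuristic: a word used to be marked bad at an accent ratio of 0.3 or more, and must now exceed 0.34. A quick worked comparison (the counting here is a simplified stand-in for the real mess-detector state):

def word_is_bad(buffer_length, accent_count, new_rule=True):
    if buffer_length < 4:
        return False
    ratio = accent_count / buffer_length
    return ratio > 0.34 if new_rule else ratio >= 0.3

# A 6-letter word with 2 accented letters: ratio = 0.333...
print(word_is_bad(6, 2, new_rule=False))  # True  -- old threshold flagged it
print(word_is_bad(6, 2, new_rule=True))   # False -- now tolerated

Legitimate words in accent-heavy Latin-script languages sit right around that band, which is presumably what motivated the looser bound.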
3 changes: 2 additions & 1 deletion charset_normalizer/models.py
@@ -54,9 +54,10 @@ def __lt__(self, other: object) -> bool:
             raise ValueError
 
         chaos_difference = abs(self.chaos - other.chaos)  # type: float
+        coherence_difference = abs(self.coherence - other.coherence)  # type: float
 
         # Below 1% difference --> Use Coherence
-        if chaos_difference < 0.01:
+        if chaos_difference < 0.01 and coherence_difference > 0.02:
             # When having a tough decision, use the result that decoded as many multi-byte as possible.
             if chaos_difference == 0.0 and self.coherence == other.coherence:
                 return self.multi_byte_usage > other.multi_byte_usage
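The models.py change tightens when coherence may override chaos in CharsetMatch ordering: chaos must be within 1% and the coherence gap must now also exceed 2%, otherwise chaos keeps deciding. A condensed, free-standing sketch of the comparison (class scaffolding omitted; field names follow the diff):

def beats(self_chaos, self_coherence, other_chaos, other_coherence,
          self_mb_usage=0.0, other_mb_usage=0.0):
    chaos_difference = abs(self_chaos - other_chaos)
    coherence_difference = abs(self_coherence - other_coherence)
    if chaos_difference < 0.01 and coherence_difference > 0.02:
        if chaos_difference == 0.0 and self_coherence == other_coherence:
            # dead heat: prefer the candidate that decoded more multi-byte
            return self_mb_usage > other_mb_usage
        return self_coherence > other_coherence
    return self_chaos < other_chaos

# Near-equal chaos but a negligible coherence gap: chaos now decides.
print(beats(0.010, 0.50, 0.012, 0.51))  # True (lower chaos wins)

Note that with the added guard the inner multi-byte tie-break looks unreachable, since a coherence gap above 2% rules out equal coherence; it appears to survive as a safeguard.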
