❇️ Various detection improvement (MD+CD) #117

Merged · 7 commits · Sep 26, 2021
61 changes: 49 additions & 12 deletions charset_normalizer/cd.py
@@ -1,14 +1,20 @@
import importlib
from codecs import IncrementalDecoder
-from collections import Counter
+from collections import Counter, OrderedDict
from functools import lru_cache
from typing import Dict, List, Optional, Tuple

from .assets import FREQUENCIES
-from .constant import KO_NAMES, ZH_NAMES
+from .constant import KO_NAMES, TOO_SMALL_SEQUENCE, ZH_NAMES
from .md import is_suspiciously_successive_range
from .models import CoherenceMatches
-from .utils import is_multi_byte_encoding, is_unicode_range_secondary, unicode_range
+from .utils import (
+    is_accentuated,
+    is_latin,
+    is_multi_byte_encoding,
+    is_unicode_range_secondary,
+    unicode_range,
+)


def encoding_unicode_range(iana_name: str) -> List[str]:
@@ -104,23 +110,52 @@ def mb_encoding_languages(iana_name: str) -> List[str]:
return []


-def alphabet_languages(characters: List[str]) -> List[str]:
+def alphabet_languages(
+    characters: List[str], ignore_non_latin: bool = False
+) -> List[str]:
"""
    Return the languages associated with the given characters.
"""
-    languages = []  # type: List[str]
+    languages = []  # type: List[Tuple[str, float]]

+    source_have_accents = False  # type: bool
+
+    for character in characters:
+        if is_accentuated(character):
+            source_have_accents = True
+            break
+
    for language, language_characters in FREQUENCIES.items():
+
+        target_have_accents = False  # type: bool
+        target_pure_latin = True  # type: bool
+
+        for language_character in language_characters:
+            if target_have_accents is False and is_accentuated(language_character):
+                target_have_accents = True
+            if target_pure_latin is True and is_latin(language_character) is False:
+                target_pure_latin = False
+
+        if ignore_non_latin and target_pure_latin is False:
+            continue
+
+        if target_have_accents is False and source_have_accents:
+            continue
+
character_count = len(language_characters) # type: int

character_match_count = len(
[c for c in language_characters if c in characters]
) # type: int

-        if character_match_count / character_count >= 0.2:
-            languages.append(language)
-
-    return languages
+        ratio = character_match_count / character_count  # type: float
+
+        if ratio >= 0.2:
+            languages.append((language, ratio))
+
+    languages = sorted(languages, key=lambda x: x[1], reverse=True)
+
+    return [compatible_language[0] for compatible_language in languages]


def characters_popularity_compare(
@@ -189,7 +224,7 @@ def alpha_unicode_split(decoded_sequence: str) -> List[str]:
    Ex. a text containing English/Latin with a bit of Hebrew will return two items in the resulting list;
    one containing the Latin letters and the other Hebrew.
"""
-    layers = {}  # type: Dict[str, str]
+    layers = OrderedDict()  # type: Dict[str, str]

for character in decoded_sequence:
if character.isalpha() is False:
@@ -227,7 +262,7 @@ def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches:
    This function merges results previously given by the function coherence_ratio.
The return type is the same as coherence_ratio.
"""
-    per_language_ratios = {}  # type: Dict[str, List[float]]
+    per_language_ratios = OrderedDict()  # type: Dict[str, List[float]]
merge = [] # type: CoherenceMatches

for result in results:
@@ -264,13 +299,15 @@ def coherence_ratio(

results = [] # type: List[Tuple[str, float]]
lg_inclusion_list = [] # type: List[str]
+    ignore_non_latin = False  # type: bool

sufficient_match_count = 0 # type: int

if lg_inclusion is not None:
lg_inclusion_list = lg_inclusion.split(",")

if "Latin Based" in lg_inclusion_list:
+        ignore_non_latin = True
lg_inclusion_list.remove("Latin Based")

for layer in alpha_unicode_split(decoded_sequence):
@@ -279,13 +316,13 @@

character_count = sum([o for c, o in most_common]) # type: int

-        if character_count <= 32:
+        if character_count <= TOO_SMALL_SEQUENCE:
continue

popular_character_ordered = [c for c, o in most_common] # type: List[str]

for language in lg_inclusion_list or alphabet_languages(
-            popular_character_ordered
+            popular_character_ordered, ignore_non_latin
):
ratio = characters_popularity_compare(
language, popular_character_ordered
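Taken together, the `coherence_ratio` changes mean a caller can pass the pseudo-language `"Latin Based"` in `lg_inclusion` to restrict the search to pure-Latin alphabets, and the magic number 32 is replaced by the named constant `TOO_SMALL_SEQUENCE`. A hedged usage sketch (the sample text and printed output are illustrative):

```python
from charset_normalizer.cd import coherence_ratio

text = "Ceci est un texte accentué, assez long pour dépasser le seuil minimal."

# "Latin Based" is stripped from the inclusion list and turned into
# ignore_non_latin=True for alphabet_languages(); layers shorter than
# TOO_SMALL_SEQUENCE characters are still skipped entirely.
for language, ratio in coherence_ratio(text, lg_inclusion="Latin Based"):
    print(language, ratio)  # e.g. "French 0.8" (illustrative)
```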
2 changes: 1 addition & 1 deletion charset_normalizer/md.py
@@ -290,7 +290,7 @@ def feed(self, character: str) -> None:

self._character_count += buffer_length

-        if buffer_length >= 4 and self._buffer_accent_count / buffer_length >= 0.3:
+        if buffer_length >= 4 and self._buffer_accent_count / buffer_length > 0.34:
self._is_current_word_bad = True
if buffer_length >= 24 and self._foreign_long_watch:
self._is_current_word_bad = True
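The accent-density rule in the mess detector is slightly relaxed: a word is now flagged only when strictly more than 34% of its letters are accentuated, where 30% or more was previously enough. A small worked example of the boundary this moves (plain Python, not library code):

```python
# Hypothetical buffer state: a 6-letter word with 2 accented letters.
buffer_length = 6
buffer_accent_count = 2
ratio = buffer_accent_count / buffer_length  # 0.333...

old_rule = buffer_length >= 4 and ratio >= 0.3   # True  -> word was flagged
new_rule = buffer_length >= 4 and ratio > 0.34   # False -> word now accepted
print(old_rule, new_rule)  # True False
```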
3 changes: 2 additions & 1 deletion charset_normalizer/models.py
@@ -54,9 +54,10 @@ def __lt__(self, other: object) -> bool:
raise ValueError

chaos_difference = abs(self.chaos - other.chaos) # type: float
+        coherence_difference = abs(self.coherence - other.coherence)  # type: float

        # Below 1% difference --> Use Coherence
-        if chaos_difference < 0.01:
+        if chaos_difference < 0.01 and coherence_difference > 0.02:
# When having a tough decision, use the result that decoded as many multi-byte as possible.
if chaos_difference == 0.0 and self.coherence == other.coherence:
return self.multi_byte_usage > other.multi_byte_usage
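In `CharsetMatch.__lt__`, coherence no longer wins on a chaos near-tie alone: the coherence gap must itself exceed 2% before it overrides chaos, which keeps noisy coherence scores from flipping otherwise-equivalent candidates (the unchanged exact-tie branch still prefers the result with higher multi-byte usage). A condensed reading of the rule, as a sketch rather than the library's actual class:

```python
from dataclasses import dataclass

@dataclass
class Candidate:
    chaos: float       # mess ratio: lower is better
    coherence: float   # language coherence: higher is better

def sorts_before(a: Candidate, b: Candidate) -> bool:
    """Condensed sketch of the post-PR ordering in CharsetMatch.__lt__."""
    chaos_difference = abs(a.chaos - b.chaos)
    coherence_difference = abs(a.coherence - b.coherence)

    # Coherence decides only when chaos is a near-tie (<1%) AND the
    # coherence gap is meaningful (>2%); otherwise chaos decides.
    if chaos_difference < 0.01 and coherence_difference > 0.02:
        return a.coherence > b.coherence
    return a.chaos < b.chaos

# Near-tie on chaos but only a 1.5% coherence gap: chaos still decides.
print(sorts_before(Candidate(0.020, 0.500), Candidate(0.025, 0.515)))  # True
```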