Skip to content

Commit

Permalink
❇️ Further improve inferring the language from a given CP (single-byte) (
Browse files Browse the repository at this point in the history
  • Loading branch information
Ousret committed Sep 20, 2021
1 parent 505b26c commit c415db7
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 7 deletions.
24 changes: 17 additions & 7 deletions charset_normalizer/cd.py
Expand Up @@ -2,7 +2,7 @@
from codecs import IncrementalDecoder
from collections import Counter
from functools import lru_cache
from typing import Dict, List, Optional, Set, Tuple
from typing import Dict, List, Optional, Tuple

from .assets import FREQUENCIES
from .md import is_suspiciously_successive_range
Expand All @@ -20,9 +20,10 @@ def encoding_unicode_range(iana_name: str) -> List[str]:
decoder = importlib.import_module("encodings.{}".format(iana_name)).IncrementalDecoder # type: ignore

p = decoder(errors="ignore") # type: IncrementalDecoder
seen_ranges = set() # type: Set[str]
seen_ranges = {} # type: Dict[str, int]
character_count = 0 # type: int

for i in range(48, 255):
for i in range(0x40, 0xFF):
chunk = p.decode(bytes([i])) # type: str

if chunk:
Expand All @@ -32,9 +33,18 @@ def encoding_unicode_range(iana_name: str) -> List[str]:
continue

if is_unicode_range_secondary(character_range) is False:
seen_ranges.add(character_range)
if character_range not in seen_ranges:
seen_ranges[character_range] = 0
seen_ranges[character_range] += 1
character_count += 1

return sorted(list(seen_ranges))
return sorted(
[
character_range
for character_range in seen_ranges
if seen_ranges[character_range] / character_count >= 0.15
]
)


def unicode_range_languages(primary_range: str) -> List[str]:
Expand Down Expand Up @@ -81,10 +91,10 @@ def mb_encoding_languages(iana_name: str) -> List[str]:
iana_name.startswith("shift_")
or iana_name.startswith("iso2022_jp")
or iana_name.startswith("euc_j")
or iana_name in {"cp932"}
or iana_name == "cp932"
):
return ["Japanese"]
if iana_name.startswith("gb") or iana_name in {"big5", "cp950", "big5hkscs"}:
if iana_name.startswith("gb") or iana_name in {"big5", "cp950", "big5hkscs", "hz"}:
return ["Chinese", "Classical Chinese"]
if iana_name.startswith("iso2022_kr") or iana_name in {"johab", "cp949", "euc_kr"}:
return ["Korean"]
Expand Down
23 changes: 23 additions & 0 deletions tests/test_coherence_detection.py
@@ -0,0 +1,23 @@
import pytest
from charset_normalizer.cd import encoding_languages, mb_encoding_languages, is_multi_byte_encoding


@pytest.mark.parametrize(
    "iana_encoding, expected_languages",
    [
        ("cp864", ["Arabic", "Farsi"]),
        ("cp862", ["Hebrew"]),
        ("cp737", ["Greek"]),
        ("cp424", ["Hebrew"]),
        ("cp273", ["Latin Based"]),
        ("johab", ["Korean"]),
        ("shift_jis", ["Japanese"]),
        ("mac_greek", ["Greek"]),
        ("iso2022_jp", ["Japanese"]),
    ],
)
def test_infer_language_from_cp(iana_encoding, expected_languages):
    """Check that each expected language is inferred for the given code page.

    The language list comes from ``mb_encoding_languages`` when
    ``is_multi_byte_encoding`` reports the encoding as multi-byte, and from
    ``encoding_languages`` otherwise. The inferred list may contain more
    languages than expected; only presence of the expected ones is checked.
    """
    # Pick the inference routine matching the kind of encoding under test.
    if is_multi_byte_encoding(iana_encoding):
        detected = mb_encoding_languages(iana_encoding)
    else:
        detected = encoding_languages(iana_encoding)

    for language in expected_languages:
        assert language in detected, "Wrongly detected language for given code page"

0 comments on commit c415db7

Please sign in to comment.