Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

❇️ Further improve inferring the language from a given CP #112

Merged
merged 1 commit into from Sep 20, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
24 changes: 17 additions & 7 deletions charset_normalizer/cd.py
Expand Up @@ -2,7 +2,7 @@
from codecs import IncrementalDecoder
from collections import Counter
from functools import lru_cache
from typing import Dict, List, Optional, Set, Tuple
from typing import Dict, List, Optional, Tuple

from .assets import FREQUENCIES
from .md import is_suspiciously_successive_range
Expand All @@ -20,9 +20,10 @@ def encoding_unicode_range(iana_name: str) -> List[str]:
decoder = importlib.import_module("encodings.{}".format(iana_name)).IncrementalDecoder # type: ignore

p = decoder(errors="ignore") # type: IncrementalDecoder
seen_ranges = set() # type: Set[str]
seen_ranges = {} # type: Dict[str, int]
character_count = 0 # type: int

for i in range(48, 255):
for i in range(0x40, 0xFF):
Ousret marked this conversation as resolved.
Show resolved Hide resolved
chunk = p.decode(bytes([i])) # type: str

if chunk:
Expand All @@ -32,9 +33,18 @@ def encoding_unicode_range(iana_name: str) -> List[str]:
continue

if is_unicode_range_secondary(character_range) is False:
seen_ranges.add(character_range)
if character_range not in seen_ranges:
seen_ranges[character_range] = 0
seen_ranges[character_range] += 1
character_count += 1

return sorted(list(seen_ranges))
return sorted(
[
character_range
for character_range in seen_ranges
if seen_ranges[character_range] / character_count >= 0.15
Ousret marked this conversation as resolved.
Show resolved Hide resolved
]
)


def unicode_range_languages(primary_range: str) -> List[str]:
Expand Down Expand Up @@ -81,10 +91,10 @@ def mb_encoding_languages(iana_name: str) -> List[str]:
iana_name.startswith("shift_")
or iana_name.startswith("iso2022_jp")
or iana_name.startswith("euc_j")
or iana_name in {"cp932"}
or iana_name == "cp932"
):
return ["Japanese"]
if iana_name.startswith("gb") or iana_name in {"big5", "cp950", "big5hkscs"}:
if iana_name.startswith("gb") or iana_name in {"big5", "cp950", "big5hkscs", "hz"}:
return ["Chinese", "Classical Chinese"]
if iana_name.startswith("iso2022_kr") or iana_name in {"johab", "cp949", "euc_kr"}:
return ["Korean"]
Expand Down
23 changes: 23 additions & 0 deletions tests/test_coherence_detection.py
@@ -0,0 +1,23 @@
import pytest
from charset_normalizer.cd import encoding_languages, mb_encoding_languages, is_multi_byte_encoding


@pytest.mark.parametrize(
    "iana_encoding, expected_languages",
    [
        ("cp864", ["Arabic", "Farsi"]),
        ("cp862", ["Hebrew"]),
        ("cp737", ["Greek"]),
        ("cp424", ["Hebrew"]),
        ("cp273", ["Latin Based"]),
        ("johab", ["Korean"]),
        ("shift_jis", ["Japanese"]),
        ("mac_greek", ["Greek"]),
        ("iso2022_jp", ["Japanese"]),
    ],
)
def test_infer_language_from_cp(iana_encoding, expected_languages):
    """Check that every expected language is inferred for the code page.

    Multi-byte encodings are resolved through ``mb_encoding_languages``;
    all other encodings go through ``encoding_languages``. The inferred
    list may contain additional languages: only the presence of each
    expected one is asserted.
    """
    # Pick the inference helper that matches the encoding family.
    if is_multi_byte_encoding(iana_encoding):
        inferred = mb_encoding_languages(iana_encoding)
    else:
        inferred = encoding_languages(iana_encoding)

    for language in expected_languages:
        assert language in inferred, "Wrongly detected language for given code page"