Skip to content

Commit

Permalink
add prober for Johab Korean
Browse files Browse the repository at this point in the history
  • Loading branch information
grizlupo authored and dan-blanchard committed Dec 12, 2020
1 parent 73b9174 commit 4ba36a1
Show file tree
Hide file tree
Showing 9 changed files with 4,218 additions and 1 deletion.
2 changes: 1 addition & 1 deletion README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ Detects
- ASCII, UTF-8, UTF-16 (2 variants), UTF-32 (4 variants)
- Big5, GB2312, EUC-TW, HZ-GB-2312, ISO-2022-CN (Traditional and Simplified Chinese)
- EUC-JP, SHIFT_JIS, CP932, ISO-2022-JP (Japanese)
- EUC-KR, ISO-2022-KR (Korean)
- EUC-KR, ISO-2022-KR, Johab (Korean)
- KOI8-R, MacCyrillic, IBM855, IBM866, ISO-8859-5, windows-1251 (Cyrillic)
- ISO-8859-5, windows-1251 (Bulgarian)
- ISO-8859-1, windows-1252 (Western European languages)
Expand Down
16 changes: 16 additions & 0 deletions chardet/chardistribution.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
JIS_TABLE_SIZE,
JIS_TYPICAL_DISTRIBUTION_RATIO,
)
from .johabfreq import JOHAB_TO_EUCKR_ORDER_TABLE


class CharDistributionAnalysis:
Expand Down Expand Up @@ -164,6 +165,21 @@ def get_order(self, byte_str):
return -1


class JOHABDistributionAnalysis(CharDistributionAnalysis):
    """Character-distribution analyser for two-byte Johab (Korean) input.

    The analyser is configured with the EUC-KR frequency-order table, table
    size and typical distribution ratio; Johab character codes are mapped
    to an order value through ``JOHAB_TO_EUCKR_ORDER_TABLE``.
    """

    def __init__(self):
        super().__init__()
        # Johab has no frequency data of its own here: reuse the EUC-KR set.
        self.typical_distribution_ratio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
        self._table_size = EUCKR_TABLE_SIZE
        self._char_to_freq_order = EUCKR_CHAR_TO_FREQ_ORDER

    def get_order(self, byte_str):
        """Return the order value for the character at the start of *byte_str*.

        Only lead bytes in the range 0x88 (inclusive) to 0xD4 (exclusive)
        are looked up. The lead and trail bytes are combined into a single
        16-bit code used as the key into ``JOHAB_TO_EUCKR_ORDER_TABLE``.
        Returns -1 when the lead byte is out of range or the code has no
        entry in the table.
        """
        lead = byte_str[0]
        if not (0x88 <= lead < 0xD4):
            return -1
        # The trail byte is read only after the lead byte has been accepted.
        key = 256 * lead + byte_str[1]
        return JOHAB_TO_EUCKR_ORDER_TABLE.get(key, -1)


class GB2312DistributionAnalysis(CharDistributionAnalysis):
def __init__(self):
super().__init__()
Expand Down

0 comments on commit 4ba36a1

Please sign in to comment.