add prober for Johab Korean

chardet · Dec 12, 2020 · 4ba36a1 · 4ba36a1
1 parent 73b9174
commit 4ba36a1
Show file tree

Hide file tree

Showing 9 changed files with 4,218 additions and 1 deletion.
diff --git a/README.rst b/README.rst
@@ -20,7 +20,7 @@ Detects
  - ASCII, UTF-8, UTF-16 (2 variants), UTF-32 (4 variants)
  - Big5, GB2312, EUC-TW, HZ-GB-2312, ISO-2022-CN (Traditional and Simplified Chinese)
  - EUC-JP, SHIFT_JIS, CP932, ISO-2022-JP (Japanese)
- - EUC-KR, ISO-2022-KR (Korean)
+ - EUC-KR, ISO-2022-KR, Johab (Korean)
  - KOI8-R, MacCyrillic, IBM855, IBM866, ISO-8859-5, windows-1251 (Cyrillic)
  - ISO-8859-5, windows-1251 (Bulgarian)
  - ISO-8859-1, windows-1252 (Western European languages)

diff --git a/chardet/chardistribution.py b/chardet/chardistribution.py
@@ -50,6 +50,7 @@
     JIS_TABLE_SIZE,
     JIS_TYPICAL_DISTRIBUTION_RATIO,
 )
+from .johabfreq import JOHAB_TO_EUCKR_ORDER_TABLE
 
 
 class CharDistributionAnalysis:
@@ -164,6 +165,21 @@ def get_order(self, byte_str):
             return -1
 
 
+class JOHABDistributionAnalysis(CharDistributionAnalysis):
+    def __init__(self):
+        super().__init__()
+        self._char_to_freq_order = EUCKR_CHAR_TO_FREQ_ORDER
+        self._table_size = EUCKR_TABLE_SIZE
+        self.typical_distribution_ratio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
+
+    def get_order(self, byte_str):
+        first_char = byte_str[0]
+        if 0x88 <= first_char < 0xD4:
+            code = first_char * 256 + byte_str[1]
+            return JOHAB_TO_EUCKR_ORDER_TABLE.get(code, -1)
+        return -1
+
+
 class GB2312DistributionAnalysis(CharDistributionAnalysis):
     def __init__(self):
         super().__init__()