Skip to content

Commit

Permalink
add prober for Johab Korean
Browse files Browse the repository at this point in the history
  • Loading branch information
grizlupo authored and dan-blanchard committed Dec 12, 2020
1 parent 73b9174 commit 756ff88
Show file tree
Hide file tree
Showing 9 changed files with 2,159 additions and 1 deletion.
2 changes: 1 addition & 1 deletion README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ Detects
- ASCII, UTF-8, UTF-16 (2 variants), UTF-32 (4 variants)
- Big5, GB2312, EUC-TW, HZ-GB-2312, ISO-2022-CN (Traditional and Simplified Chinese)
- EUC-JP, SHIFT_JIS, CP932, ISO-2022-JP (Japanese)
- EUC-KR, ISO-2022-KR (Korean)
- EUC-KR, ISO-2022-KR, Johab (Korean)
- KOI8-R, MacCyrillic, IBM855, IBM866, ISO-8859-5, windows-1251 (Cyrillic)
- ISO-8859-5, windows-1251 (Bulgarian)
- ISO-8859-1, windows-1252 (Western European languages)
Expand Down
16 changes: 16 additions & 0 deletions chardet/chardistribution.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
JIS_TABLE_SIZE,
JIS_TYPICAL_DISTRIBUTION_RATIO,
)
from .johabfreq import JOHAB_TO_EUCKR_ORDER_TABLE


class CharDistributionAnalysis:
Expand Down Expand Up @@ -164,6 +165,21 @@ def get_order(self, byte_str):
return -1


class JOHABDistributionAnalysis(CharDistributionAnalysis):
    """Character-distribution analysis for Johab-encoded text.

    Reuses the EUC-KR frequency-order table, table size and typical
    distribution ratio; two-byte Johab codes are translated to an order
    through ``JOHAB_TO_EUCKR_ORDER_TABLE``.
    """

    def __init__(self):
        super().__init__()
        self._char_to_freq_order = EUCKR_CHAR_TO_FREQ_ORDER
        self._table_size = EUCKR_TABLE_SIZE
        self.typical_distribution_ratio = EUCKR_TYPICAL_DISTRIBUTION_RATIO

    def get_order(self, byte_str):
        """Return the order of the two-byte character in *byte_str*.

        Returns -1 when the lead byte is outside ``0x88 <= lead < 0xD4``
        or when the combined 16-bit code has no entry in
        ``JOHAB_TO_EUCKR_ORDER_TABLE``.
        """
        lead = byte_str[0]
        if not 0x88 <= lead < 0xD4:
            return -1
        # Combine lead and trail byte into the 16-bit key used by the table.
        johab_code = lead * 256 + byte_str[1]
        return JOHAB_TO_EUCKR_ORDER_TABLE.get(johab_code, -1)


class GB2312DistributionAnalysis(CharDistributionAnalysis):
def __init__(self):
super().__init__()
Expand Down
326 changes: 326 additions & 0 deletions chardet/johabfreq.py

Large diffs are not rendered by default.

47 changes: 47 additions & 0 deletions chardet/johabprober.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
######################## BEGIN LICENSE BLOCK ########################
# The Original Code is mozilla.org code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 1998
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Mark Pilgrim - port to Python
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################

from .mbcharsetprober import MultiByteCharSetProber
from .codingstatemachine import CodingStateMachine
from .chardistribution import JOHABDistributionAnalysis
from .mbcssm import JOHAB_SM_MODEL


class JOHABProber(MultiByteCharSetProber):
    """Multi-byte charset prober for the Johab encoding.

    Combines the ``JOHAB_SM_MODEL`` coding state machine with
    ``JOHABDistributionAnalysis`` and reports the charset name "Johab"
    and the language "Korean".
    """

    def __init__(self):
        # Python 3 zero-argument form; equivalent to the explicit
        # super(JOHABProber, self) spelling it replaces.
        super().__init__()
        self.coding_sm = CodingStateMachine(JOHAB_SM_MODEL)
        self.distribution_analyzer = JOHABDistributionAnalysis()
        # reset() runs last, after both collaborators have been assigned.
        self.reset()

    @property
    def charset_name(self):
        """Name of the charset this prober detects."""
        return "Johab"

    @property
    def language(self):
        """Language associated with the detected charset."""
        return "Korean"
2 changes: 2 additions & 0 deletions chardet/mbcsgroupprober.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
from .euckrprober import EUCKRProber
from .euctwprober import EUCTWProber
from .gb2312prober import GB2312Prober
from .johabprober import JOHABProber
from .sjisprober import SJISProber
from .utf8prober import UTF8Prober

Expand All @@ -50,5 +51,6 @@ def __init__(self, lang_filter=None):
CP949Prober(),
Big5Prober(),
EUCTWProber(),
JOHABProber(),
]
self.reset()
54 changes: 54 additions & 0 deletions chardet/mbcssm.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,60 @@
"name": "EUC-KR",
}

# JOHAB

# Byte-class table for Johab: 256 entries, where entry i is the class
# number (0-9) assigned to byte value i. Each row covers eight consecutive
# byte values; the trailing comment gives the range in hex.
# The class numbers are the column indices of JOHAB_ST.
# Class 0 is assigned to 0x0e, 0x0f, 0x1b and 0xff; JOHAB_ST maps class 0
# to ERROR from the START state.
JOHAB_CLS = (
4,4,4,4,4,4,4,4, # 00 - 07
4,4,4,4,4,4,0,0, # 08 - 0f
4,4,4,4,4,4,4,4, # 10 - 17
4,4,4,0,4,4,4,4, # 18 - 1f
4,4,4,4,4,4,4,4, # 20 - 27
4,4,4,4,4,4,4,4, # 28 - 2f
4,3,3,3,3,3,3,3, # 30 - 37
3,3,3,3,3,3,3,3, # 38 - 3f
3,1,1,1,1,1,1,1, # 40 - 47
1,1,1,1,1,1,1,1, # 48 - 4f
1,1,1,1,1,1,1,1, # 50 - 57
1,1,1,1,1,1,1,1, # 58 - 5f
1,1,1,1,1,1,1,1, # 60 - 67
1,1,1,1,1,1,1,1, # 68 - 6f
1,1,1,1,1,1,1,1, # 70 - 77
1,1,1,1,1,1,1,2, # 78 - 7f
6,6,6,6,8,8,8,8, # 80 - 87
8,8,8,8,8,8,8,8, # 88 - 8f
8,7,7,7,7,7,7,7, # 90 - 97
7,7,7,7,7,7,7,7, # 98 - 9f
7,7,7,7,7,7,7,7, # a0 - a7
7,7,7,7,7,7,7,7, # a8 - af
7,7,7,7,7,7,7,7, # b0 - b7
7,7,7,7,7,7,7,7, # b8 - bf
7,7,7,7,7,7,7,7, # c0 - c7
7,7,7,7,7,7,7,7, # c8 - cf
7,7,7,7,5,5,5,5, # d0 - d7
5,9,9,9,9,9,9,5, # d8 - df
9,9,9,9,9,9,9,9, # e0 - e7
9,9,9,9,9,9,9,9, # e8 - ef
9,9,9,9,9,9,9,9, # f0 - f7
9,9,5,5,5,5,5,0 # f8 - ff
)

# State-transition table for Johab: one row per current state (named in the
# trailing comment) and one column per byte class 0-9, so ten entries per row.
# From START: classes 1-4 stay in START, classes 7 and 8 go to state 3,
# class 9 goes to state 4, and classes 0, 5 and 6 are errors.
# States 3 and 4 each lead back to START or to ERROR depending on the class
# of the next byte.
# NOTE(review): presumably looked up as state * class_factor + class by
# CodingStateMachine -- confirm against that class.
JOHAB_ST = (
# cls = 0 1 2 3 4 5 6 7 8 9
MachineState.ERROR ,MachineState.START ,MachineState.START ,MachineState.START ,MachineState.START ,MachineState.ERROR ,MachineState.ERROR ,3 ,3 ,4 , # MachineState.START
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME, # MachineState.ITS_ME
MachineState.ERROR ,MachineState.ERROR ,MachineState.ERROR ,MachineState.ERROR ,MachineState.ERROR ,MachineState.ERROR ,MachineState.ERROR ,MachineState.ERROR ,MachineState.ERROR ,MachineState.ERROR , # MachineState.ERROR
MachineState.ERROR ,MachineState.START ,MachineState.START ,MachineState.ERROR ,MachineState.ERROR ,MachineState.START ,MachineState.START ,MachineState.START ,MachineState.START ,MachineState.START , # 3
MachineState.ERROR ,MachineState.START ,MachineState.ERROR ,MachineState.START ,MachineState.ERROR ,MachineState.START ,MachineState.ERROR ,MachineState.START ,MachineState.ERROR ,MachineState.START , # 4
)

# Ten entries, in the same 0-9 order as the byte classes of JOHAB_CLS.
# NOTE(review): presumably the character length in bytes for a character
# starting with a byte of that class -- confirm against CodingStateMachine.
JOHAB_CHAR_LEN_TABLE = (
    0,           # class 0
    1, 1, 1, 1,  # classes 1-4
    0, 0,        # classes 5-6
    2, 2, 2,     # classes 7-9
)

# State-machine model bundle for Johab; class_factor matches the ten
# class columns of JOHAB_ST.
JOHAB_SM_MODEL = {
    "class_table": JOHAB_CLS,
    "class_factor": 10,
    "state_table": JOHAB_ST,
    "char_len_table": JOHAB_CHAR_LEN_TABLE,
    "name": "Johab",
}

# EUC-TW
# fmt: off
EUCTW_CLS = (
Expand Down
33 changes: 33 additions & 0 deletions tests/Johab/hlpro-readme.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
To read this document, use common combined Hangul code or KSSM code.

<<< �e�a�a�� 3.0 >>>

�A�b ������

Copyright (C) 1990, 1997 by Inkeon Lim
All rights reserved


o ��b�� �ŝe �A�� �a�w
* �� CD�A ͡�q�E ���e �e�a�a�� �ŝe �a���i�� ��b���e �A�b�a�� ���A�����a.
* �� CD�A ͡�q�E ���e �e�a�a�� �ŝe �a���i�e �A�b�a�� ��a ���� ����e
�wȁ���e ���� ���e ���A�i ���a�a�a �a���A�A ��͡�i �� ���s���a.
(��: ɷ���w�A ���� ���a, �a �a���a�១�a �a���a���A ͡�q ���a)
* �a�w�a�e �e�a�a���� ���a��, ���a�E ��З �a��(.EXE �a��) �� ��З�A ϩ����a��
�a���A�e ���� �a��(�i�� �a�� �w)�e�i �a�A���A ��͡�i �� ���a�a, �a �w���A
�e�a�a �e�a�a�� �w�w �a���a���i �w��ⷥ �w������ ��͡�i �� ���s���a.
* ���a š�a�A ���e �a�w�e hlprosrc.doc�i �q���a�� �a�s���a.

o �e�a�a���e ���a�w �e�i �a���a�១�����a. �a�w�a�a ���a�e �a��Ϣ �a�w�a
�����A���a(GUI)�i �A�b�a�� ���e �����i �A���s���a.

o �e�a�a���� ���e ���� �a��(.DOC �a��)�e �w�w ���s�w š�a(KSSM š�a)�� �b���A��
���s���a.

o �e�a�a���� ��á �w��e install.doc �a���A ��w�A�� ���a�a, ���A ��á�e ��á
�a���� install.bat�i ��З���a ��З�s���a. ��á�a�� ��A �e�a�� install.doc
�a���i ���ᥡ�� �e�a�a���i ��á�a�� �a�s���a.

o 286 š�a�� �A�������a BC++ 3.0�a�� ���a���E �a���a�១ �a���i�� HLIB\BCLIB
���Bɡ���A �A���S���a. BC++ 3.0 ���w �a�w�a�e �a�w�a �e����A ��s�E HLIB\LIB
�����A HLIB\BCLIB ���Bɡ���i ���a���� ���e�A ����a�� �a�s���a.

0 comments on commit 756ff88

Please sign in to comment.