New language models added; old inaccurate models were rebuilt. Hungarian test files changed. Script for building language models added #52

Closed
wants to merge 65 commits
Changes from 61 commits
Commits (65)
a50b784
Fix Latin5TurkishModel key names.
dan-blanchard Jan 11, 2015
1db438b
Badges!
dan-blanchard Jan 27, 2015
184761d
HTTPS shields badges
dan-blanchard Jan 27, 2015
e1f1e53
First commit
helour Feb 15, 2015
cff4b4d
First commit
helour Feb 15, 2015
da0e3d5
Update saraspatak.hu.txt
helour Feb 15, 2015
c1840be
Delete my_test.py
helour Feb 15, 2015
d1d9258
Removed print
helour Feb 16, 2015
adfd0b8
Update create_language_model.py
helour Feb 16, 2015
56d8efa
Update test.py
helour Feb 16, 2015
6f9d07f
Delete saraspatak.hu.txt
helour Feb 16, 2015
28e68e9
Changed value from 254 to 253: SYMBOL_CAT_ORDER = 253
helour Feb 17, 2015
80428f0
Delete shamalt.uw.hu.txt
helour Feb 18, 2015
46841a1
Delete auto-apro.hu.txt
helour Feb 18, 2015
c7e69fa
Delete cigartower.hu.txt
helour Feb 18, 2015
4ca74f9
Delete hirtv.hu.txt
helour Feb 18, 2015
b85d055
Delete honositomuhely.hu.txt
helour Feb 18, 2015
83903ee
Delete shamalt.uw.hu.mr.txt
helour Feb 18, 2015
5e79c30
Delete bbc.co.uk.hu.forum.txt
helour Feb 18, 2015
360c82c
Delete bbc.co.uk.hu.learningenglish.txt
helour Feb 18, 2015
ae3cc6a
Delete bbc.co.uk.hu.pressreview.txt
helour Feb 18, 2015
b8b9917
Delete bbc.co.uk.hu.txt
helour Feb 18, 2015
b19c7c0
Delete objektivhir.hu.txt
helour Feb 18, 2015
ff38fc0
chmod -x
helour Feb 18, 2015
8adf4fb
Update
helour Mar 1, 2015
d96229a
Changed charmap list name from C to python style.
helour Mar 4, 2015
2249a0b
Added ISO-8859-1 German language model.
helour Mar 4, 2015
19f7e7e
Renamed ISO-8859-7 Greek language model.
helour Mar 4, 2015
f511d49
Added ISO-8859-2 Hungarian language model.
helour Mar 4, 2015
1c8a86e
Changed charmap list name from C to python style.
helour Mar 4, 2015
b6d48a7
Added ISO-8859-2 Romanian language model.
helour Mar 4, 2015
874711d
Changed charmap list name from C to python style.
helour Mar 4, 2015
88747f0
Added ISO-8859-9 Turkish language model.
helour Mar 4, 2015
0edf7bf
Increased coefficient in the confidence calculation.
helour Mar 4, 2015
89e6a51
Changed formula for confidence value calculation.
helour Mar 4, 2015
9f672e5
Added ISO Hungarian, Romanian, German language models into probers.
helour Mar 4, 2015
55e56ce
Added function which can distinguish between ISO and Windows charsets.
helour Mar 4, 2015
5dbf64d
Removed test for equivalent encodings.
helour Mar 4, 2015
ca8a019
Renamed folder from alias to true ISO name.
helour Mar 4, 2015
f9d2977
Renamed folder from alias to true ISO name.
helour Mar 4, 2015
fe76cff
Updated test text.
helour Mar 4, 2015
36693a0
Added character/symbol which enables distinguishing between ISO and W…
helour Mar 4, 2015
935e269
Added character/symbol which enables distinguishing between ISO and W…
helour Mar 4, 2015
99ff367
Added character/symbol which enables distinguishing between ISO and W…
helour Mar 4, 2015
05ae54a
Added character/symbol which enables distinguishing between ISO and W…
helour Mar 4, 2015
13d8d25
Added character/symbol which enables distinguishing between ISO and W…
helour Mar 4, 2015
94a4246
Added character/symbol which enables distinguishing between ISO and W…
helour Mar 4, 2015
06c9012
Added character/symbol which enables distinguishing between ISO and W…
helour Mar 4, 2015
0bea16f
Added character/symbol which enables distinguishing between ISO and W…
helour Mar 4, 2015
acf12e0
Unnecessary command byte_str += '\n' removed
helour Mar 10, 2015
6166cfe
Added optional flag for text cleanup in the API
helour Mar 11, 2015
28291e8
Improved detection for Portuguese and Spanish texts.
helour Mar 11, 2015
24d6bd1
Changed test for minimum number of total sequences
helour Mar 11, 2015
1a2dd6c
Updated/shortened text test files. Added new text files.
helour Mar 11, 2015
88b686f
Test text updated
helour Mar 12, 2015
0c9fdb1
Updated to be Python 3.0 compatible
helour Mar 12, 2015
a0c14a7
Changed tested/input string from 'str' to 'bytearray' (python 2.x).
helour Mar 16, 2015
86ee587
Update README.rst
helour Mar 16, 2015
69e7e47
Update NOTES.rst
helour Mar 16, 2015
e129f60
Converted string from 'str' to 'bytearray'. Removed compat.py
helour Mar 17, 2015
1d11c98
Update
helour Mar 17, 2015
1c27668
Rewritten to be Python 3.x compatible
helour Mar 18, 2015
c423b41
chmod 755 to 644
helour Mar 18, 2015
a052ebf
Removed unnecessary brackets
helour Mar 18, 2015
0b3a6d3
Optimized for speed
helour Mar 18, 2015
780 changes: 780 additions & 0 deletions CharsetsTabs.txt

Large diffs are not rendered by default.

9 changes: 8 additions & 1 deletion NOTES.rst
@@ -64,11 +64,19 @@ Bigram files
 - ``hebrewprober.py``
 - ``jpcntxprober.py``
 - ``langbulgarianmodel.py``
+- ``langcroatianmodel.py``
 - ``langcyrillicmodel.py``
+- ``langczechmodel.py``
+- ``langgermanmodel.py``
 - ``langgreekmodel.py``
 - ``langhebrewmodel.py``
 - ``langhungarianmodel.py``
+- ``langpolishmodel.py``
+- ``langromanianmodel.py``
+- ``langslovakmodel.py``
+- ``langslovenemodel.py``
 - ``langthaimodel.py``
 - ``langturkishmodel.py``
 - ``latin1prober.py``
 - ``sbcharsetprober.py``
 - ``sbcsgroupprober.py``

@@ -111,7 +119,6 @@ Misc files
 ----------

 - ``__init__.py`` (currently has ``detect`` function in it)
-- ``compat.py``
 - ``enums.py``
 - ``universaldetector.py``
 - ``version.py``
27 changes: 22 additions & 5 deletions README.rst
@@ -1,22 +1,39 @@
 Chardet: The Universal Character Encoding Detector
 --------------------------------------------------

+.. image:: https://img.shields.io/travis/chardet/chardet/stable.svg
+   :alt: Build status
+   :target: https://travis-ci.org/chardet/chardet
+
+.. image:: https://img.shields.io/coveralls/chardet/chardet/stable.svg
+   :target: https://coveralls.io/r/chardet/chardet
+
+.. image:: https://img.shields.io/pypi/dm/chardet.svg
+   :target: https://warehouse.python.org/project/chardet/
+   :alt: PyPI downloads
+
+.. image:: https://img.shields.io/pypi/v/chardet.svg
+   :target: https://warehouse.python.org/project/chardet/
+   :alt: Latest version on PyPI
+
+.. image:: https://img.shields.io/pypi/l/chardet.svg
+   :alt: License
+

 Detects
  - ASCII, UTF-8, UTF-16 (2 variants), UTF-32 (4 variants)
  - Big5, GB2312, EUC-TW, HZ-GB-2312, ISO-2022-CN (Traditional and Simplified Chinese)
  - EUC-JP, SHIFT_JIS, CP932, ISO-2022-JP (Japanese)
  - EUC-KR, ISO-2022-KR (Korean)
  - KOI8-R, MacCyrillic, IBM855, IBM866, ISO-8859-5, windows-1251 (Cyrillic)
+ - ISO-8859-2, windows-1250 (Czech, Croatian, Hungarian, Polish, Romanian, Slovak, Slovene)
  - ISO-8859-5, windows-1251 (Bulgarian)
- - windows-1252 (English)
+ - ISO-8859-1, windows-1252 (Dutch, English, Finnish, French, German, Italian, Portuguese, Spanish)
  - ISO-8859-7, windows-1253 (Greek)
  - ISO-8859-8, windows-1255 (Visual and Logical Hebrew)
+ - ISO-8859-9, windows-1254 (Turkish)
  - TIS-620 (Thai)

-.. note::
-   Our ISO-8859-2 and windows-1250 (Hungarian) probers have been temporarily
-   disabled until we can retrain the models.
-
 Requires Python 2.6 or later

 Installation
17 changes: 8 additions & 9 deletions chardet/__init__.py
@@ -16,18 +16,17 @@
 ######################### END LICENSE BLOCK #########################


-from .compat import PY2, PY3
+import sys
 from .universaldetector import UniversalDetector
 from .version import __version__, VERSION


-def detect(byte_str):
-    if (PY2 and isinstance(byte_str, unicode)) or (PY3 and
-                                                   not isinstance(byte_str,
-                                                                  bytes)):
+def detect(byte_str, txt_cleanup=True):
+    PY_VER = 2 if sys.version_info < (3, 0) else 3
+    if ((PY_VER == 2 and isinstance(byte_str, unicode)) or
+            (PY_VER == 3 and not isinstance(byte_str, bytes))):
         raise ValueError('Expected a bytes object, not a unicode object')

+    if PY_VER == 2:
+        byte_str = bytearray(byte_str)
     u = UniversalDetector()
-    u.feed(byte_str)
+    u.feed(byte_str, txt_cleanup)
     u.close()
     return u.result
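
A minimal usage sketch of the API change above; the sample text and printed result are illustrative, and per commit 6166cfe the new txt_cleanup flag toggles the optional text-cleanup pass:

import chardet

# Hypothetical input; any bytes object works.
raw = u'Ez egy magyar szöveg.'.encode('iso-8859-2')

print(chardet.detect(raw))                     # default: cleanup enabled
print(chardet.detect(raw, txt_cleanup=False))  # feed the raw bytes as-is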
4 changes: 4 additions & 0 deletions chardet/big5prober.py
@@ -41,3 +41,7 @@ def __init__(self):
     @property
     def charset_name(self):
         return "Big5"
+
+    @property
+    def language(self):
+        return "Chinese"
19 changes: 9 additions & 10 deletions chardet/chardistribution.py
@@ -35,7 +35,6 @@
                         BIG5_TYPICAL_DISTRIBUTION_RATIO)
 from .jisfreq import (JIS_CHAR_TO_FREQ_ORDER, JIS_TABLE_SIZE,
                       JIS_TYPICAL_DISTRIBUTION_RATIO)
-from .compat import wrap_ord


 class CharDistributionAnalysis(object):

@@ -123,9 +122,9 @@ def get_order(self, byte_str):
         # first byte range: 0xc4 -- 0xfe
         # second byte range: 0xa1 -- 0xfe
         # no validation needed here. State machine has done that
-        first_char = wrap_ord(byte_str[0])
+        first_char = byte_str[0]
         if first_char >= 0xC4:
-            return 94 * (first_char - 0xC4) + wrap_ord(byte_str[1]) - 0xA1
+            return 94 * (first_char - 0xC4) + byte_str[1] - 0xA1
         else:
             return -1

@@ -142,9 +141,9 @@ def get_order(self, byte_str):
         # first byte range: 0xb0 -- 0xfe
         # second byte range: 0xa1 -- 0xfe
         # no validation needed here. State machine has done that
-        first_char = wrap_ord(byte_str[0])
+        first_char = byte_str[0]
         if first_char >= 0xB0:
-            return 94 * (first_char - 0xB0) + wrap_ord(byte_str[1]) - 0xA1
+            return 94 * (first_char - 0xB0) + byte_str[1] - 0xA1
         else:
             return -1

@@ -161,7 +160,7 @@ def get_order(self, byte_str):
         # first byte range: 0xb0 -- 0xfe
         # second byte range: 0xa1 -- 0xfe
         # no validation needed here. State machine has done that
-        first_char, second_char = wrap_ord(byte_str[0]), wrap_ord(byte_str[1])
+        first_char, second_char = byte_str[0], byte_str[1]
         if (first_char >= 0xB0) and (second_char >= 0xA1):
             return 94 * (first_char - 0xB0) + second_char - 0xA1
         else:

@@ -180,7 +179,7 @@ def get_order(self, byte_str):
         # first byte range: 0xa4 -- 0xfe
         # second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
         # no validation needed here. State machine has done that
-        first_char, second_char = wrap_ord(byte_str[0]), wrap_ord(byte_str[1])
+        first_char, second_char = byte_str[0], byte_str[1]
         if first_char >= 0xA4:
             if second_char >= 0xA1:
                 return 157 * (first_char - 0xA4) + second_char - 0xA1 + 63

@@ -202,7 +201,7 @@ def get_order(self, byte_str):
         # first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
         # second byte range: 0x40 -- 0x7e, 0x81 -- oxfe
         # no validation needed here. State machine has done that
-        first_char, second_char = wrap_ord(byte_str[0]), wrap_ord(byte_str[1])
+        first_char, second_char = byte_str[0], byte_str[1]
         if (first_char >= 0x81) and (first_char <= 0x9F):
             order = 188 * (first_char - 0x81)
         elif (first_char >= 0xE0) and (first_char <= 0xEF):

@@ -227,8 +226,8 @@ def get_order(self, byte_str):
         # first byte range: 0xa0 -- 0xfe
         # second byte range: 0xa1 -- 0xfe
         # no validation needed here. State machine has done that
-        char = wrap_ord(byte_str[0])
+        char = byte_str[0]
         if char >= 0xA0:
-            return 94 * (char - 0xA1) + wrap_ord(byte_str[1]) - 0xa1
+            return 94 * (char - 0xA1) + byte_str[1] - 0xa1
         else:
             return -1
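
For clarity, each two-byte order formula above maps a lead/trail byte pair onto an index into the corresponding character-frequency table; a standalone sketch of the EUC-JP case (not part of the chardet API):

def eucjp_order(first_byte, second_byte):
    # 94 cells per row; rows keyed off lead byte 0xA1, cells off trail
    # byte 0xA1, mirroring the formula in the diff above
    if first_byte >= 0xA0:
        return 94 * (first_byte - 0xA1) + second_byte - 0xA1
    return -1

print(eucjp_order(0xA4, 0xA2))  # 94 * 3 + 1 = 283 (hiragana "a")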
10 changes: 9 additions & 1 deletion chardet/charsetgroupprober.py
@@ -54,6 +54,14 @@ def charset_name(self):
             return None
         return self._best_guess_prober.charset_name

+    @property
+    def language(self):
+        if not self._best_guess_prober:
+            self.get_confidence()
+        if not self._best_guess_prober:
+            return None
+        return self._best_guess_prober.language
+
     def feed(self, byte_str):
         for prober in self.probers:
             if not prober:

@@ -89,7 +97,7 @@ def get_confidence(self):
                 self.logger.debug('%s not active', prober.charset_name)
                 continue
             conf = prober.get_confidence()
-            self.logger.debug('%s confidence = %s', prober.charset_name, conf)
+            self.logger.debug('%s %s confidence = %s', prober.charset_name, prober.language, conf)
             if best_conf < conf:
                 best_conf = conf
                 self._best_guess_prober = prober
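
The group prober now also surfaces the winning sub-prober's language. A usage sketch, assuming (as elsewhere in this branch) that every single-byte prober exposes a language property; the sample text and printed result are illustrative:

from chardet.sbcsgroupprober import SBCSGroupProber

prober = SBCSGroupProber()
prober.feed(bytearray(u'Příliš žluťoučký kůň úpěl ďábelské ódy.'.encode('windows-1250')))
print(prober.charset_name, prober.language, prober.get_confidence())
# e.g. windows-1250 Czech 0.9...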
18 changes: 8 additions & 10 deletions chardet/charsetprober.py
@@ -28,7 +28,6 @@

 import logging
 import re
-from io import BytesIO

 from .enums import ProbingState

@@ -79,16 +78,16 @@ def filter_international_words(buf):

         This filter applies to all scripts which do not use English characters.
         """
-        filtered = BytesIO()
+        out = b''

Member:
bytes are not mutable in Python, so switching this to using bytes and then concatenating to it with += means creating lots of temporary strings. BytesIO should be faster (although, feel free to prove me wrong).

Author:
You are wrong, please try this:

import time

c = []
for i in range(0, 256):
    c.append(bytearray(i))

start = time.time()
from io import BytesIO
filtered = BytesIO()
for i in range(0, 10000000):
    filtered.write(c[i % 255])
ret = filtered.getvalue()
print time.time() - start

start = time.time()
s = b''
for i in range(0, 10000000):
    s += c[i % 255]
ret = s
print time.time() - start

Member:
Umm... I just ran this and it completely confirmed my suspicions.

The BytesIO part (with Python 3) finished in 3.5 seconds, and the part using bytes with concatenation was running for over 5 minutes before I killed it.

Author:
Your suspicion is right for Python 3 but wrong for Python 2.7.
It is not a good idea to develop one project for several Python versions; there are many compatibility and speed-optimization problems.

Member:
I agree supporting both Python versions is difficult, but I'm not quite willing to leave Python 2 users completely in the dust yet, since there are so many of them. Especially when the hard work for maintaining compatibility has mostly been done already.

That said, I'll definitely target Python 3 for optimizations.

Author:
Maybe this new piece of code (Python 2 and 3 compatible) is what you need:

import time

c = []
for i in range(0, 256):
    c.append(bytearray(i))

start = time.time()
from io import BytesIO
filtered = BytesIO()
for i in range(0, 10000000):
    filtered.write(c[i % 255])
ret = filtered.getvalue()
print(time.time() - start)

start = time.time()
s = bytearray()
for i in range(0, 10000000):
    s.extend(c[i % 255])
ret = s
print(time.time() - start)

BTW the second part is still the winner because it doesn't use a stream :D

Member:
Nice! Thanks for the suggestion. 👍

         # This regex expression filters out only words that have at-least one
         # international character. The word may include one marker character at
         # the end.
         words = re.findall(
             b'[a-zA-Z]*[\x80-\xFF]+[a-zA-Z]*[^a-zA-Z\x80-\xFF]?', buf)

         for word in words:
-            filtered.write(word[:-1])
+            out += word[:-1]

             # If the last character in the word is a marker, replace it with a
             # space as markers shouldn't affect our analysis (they are used

@@ -97,9 +96,9 @@ def filter_international_words(buf):
             last_char = word[-1:]
             if not last_char.isalpha() and last_char < b'\x80':
                 last_char = b' '
-            filtered.write(last_char)
+            out += last_char

-        return filtered.getvalue()
+        return out

     @staticmethod
     def filter_with_english_letters(buf):

@@ -113,7 +112,6 @@ def filter_with_english_letters(buf):
         characters and extended ASCII characters, but is currently only used by
         ``Latin1Prober``.
         """
-        filtered = BytesIO()
         in_tag = False
         prev = 0

@@ -132,15 +130,15 @@ def filter_with_english_letters(buf):
                 if curr > prev and not in_tag:
                     # Keep everything after last non-extended-ASCII,
                     # non-alphabetic character
-                    filtered.write(buf[prev:curr])
+                    out += buf[prev:curr]
                     # Output a space to delimit stretch we kept
-                    filtered.write(b' ')
+                    out += b' '
                     prev = curr + 1

         # If we're not in a tag...
         if not in_tag:
             # Keep everything after last non-extended-ASCII, non-alphabetic
             # character
-            filtered.write(buf[prev:])
+            out += buf[prev:]

-        return filtered.getvalue()
+        return out
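
To make the word filter's behavior concrete, here is a standalone run of the same regex and marker logic on made-up input (not chardet code):

import re

buf = b'caf\xe9 and <tag> r\xe9sum\xe9, plain ascii'
words = re.findall(b'[a-zA-Z]*[\x80-\xFF]+[a-zA-Z]*[^a-zA-Z\x80-\xFF]?', buf)
print(words)  # [b'caf\xe9 ', b'r\xe9sum', b'\xe9,']

out = b''
for word in words:
    out += word[:-1]
    last_char = word[-1:]
    # replace a non-alphabetic ASCII marker with a space
    if not last_char.isalpha() and last_char < b'\x80':
        last_char = b' '
    out += last_char
print(out)  # b'caf\xe9 r\xe9sum\xe9 ' -- only the international words survive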
11 changes: 6 additions & 5 deletions chardet/cli/chardetect.py
@@ -19,12 +19,11 @@
 import sys
 from io import open

-from chardet import __version__
-from chardet.compat import PY2
+from chardet.version import __version__
 from chardet.universaldetector import UniversalDetector


+PY_VER = 2 if sys.version_info < (3, 0) else 3
Member:
it's common practice in Python to include checks like this in a compat.py module, which is what we were doing before. Please change this back to just using PY2 instead of PY_VER == 2.


 def description_of(lines, name='stdin'):
     """

@@ -38,10 +37,12 @@ def description_of(lines, name='stdin'):
     """
     u = UniversalDetector()
     for line in lines:
+        if PY_VER == 2:
+            line = bytearray(line)
         u.feed(line)
     u.close()
     result = u.result
-    if PY2:
+    if PY_VER == 2:
         name = name.decode(sys.getfilesystemencoding(), 'ignore')
     if result['encoding']:
         return '{0}: {1} with confidence {2}'.format(name, result['encoding'],

@@ -66,7 +67,7 @@ def main(argv=None):
                         help='File whose encoding we would like to determine. \
                               (default: stdin)',
                         type=argparse.FileType('rb'), nargs='*',
-                        default=[sys.stdin if PY2 else sys.stdin.buffer])
+                        default=[sys.stdin if PY_VER == 2 else sys.stdin.buffer])
     parser.add_argument('--version', action='version',
                         version='%(prog)s {0}'.format(__version__))
     args = parser.parse_args(argv)
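
For reference, the helper above can also be driven directly from Python; a hedged sketch (file name is a placeholder):

from chardet.cli.chardetect import description_of

with open('sample.txt', 'rb') as f:
    print(description_of(f, f.name))
    # e.g. "sample.txt: utf-8 with confidence 0.99"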
3 changes: 1 addition & 2 deletions chardet/codingstatemachine.py
@@ -28,7 +28,6 @@
 import logging

 from .enums import MachineState
-from .compat import wrap_ord


 class CodingStateMachine(object):

@@ -67,7 +66,7 @@ def reset(self):
     def next_state(self, c):
         # for each byte we get its class
         # if it is first byte, we also get byte length
-        byte_class = self._model['class_table'][wrap_ord(c)]
+        byte_class = self._model['class_table'][c]
         if self._curr_state == MachineState.start:
             self._curr_byte_pos = 0
             self._curr_char_len = self._model['char_len_table'][byte_class]
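
Since wrap_ord() is gone, the state machine now expects plain integer byte values; a minimal sketch of feeding it some (the model name comes from chardet's escsm module):

from chardet.codingstatemachine import CodingStateMachine
from chardet.escsm import HZ_SM_MODEL

sm = CodingStateMachine(HZ_SM_MODEL)
for byte in bytearray(b'~{'):      # bytearray yields ints on Python 2 and 3
    print(sm.next_state(byte))     # no ord() wrapping needed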
41 changes: 0 additions & 41 deletions chardet/compat.py

This file was deleted.

4 changes: 4 additions & 0 deletions chardet/cp949prober.py
@@ -43,3 +43,7 @@ def __init__(self):
     @property
     def charset_name(self):
         return "CP949"
+
+    @property
+    def language(self):
+        return "Korean"
6 changes: 3 additions & 3 deletions chardet/escprober.py
@@ -27,7 +27,6 @@

 from .charsetprober import CharSetProber
 from .codingstatemachine import CodingStateMachine
-from .compat import wrap_ord
 from .enums import LanguageFilter, ProbingState, MachineState
 from .escsm import (HZ_SM_MODEL, ISO2022CN_SM_MODEL, ISO2022JP_SM_MODEL,
                     ISO2022KR_SM_MODEL)

@@ -76,11 +75,12 @@ def get_confidence(self):
         return 0.00

     def feed(self, byte_str):
-        for c in byte_str:
+        num_bytes = len(byte_str)
+        for i in range(0, num_bytes):
             for coding_sm in self.coding_sm:
                 if not coding_sm or not coding_sm.active:
                     continue
-                coding_state = coding_sm.next_state(wrap_ord(c))
+                coding_state = coding_sm.next_state(byte_str[i])
                 if coding_state == MachineState.error:
                     coding_sm.active = False
                     self.active_sm_count -= 1
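
The switch from iterating byte_str to indexing it matters because iteration semantics differ between Python versions; a standalone illustration (not chardet code):

data = bytearray(b'\x1b$B')   # ESC $ B -- an ISO-2022-JP escape sequence
for i in range(len(data)):
    print(data[i])            # 27, 36, 66: ints on both Python 2 and 3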