
Merge pull request #140 from chardet/master
Release 4.0.0
dan-blanchard committed Dec 10, 2020
2 parents 9b8c5c2 + 53854fb commit a808ed1
Showing 29 changed files with 33,412 additions and 1,686 deletions.
9 changes: 0 additions & 9 deletions .coveragerc

This file was deleted.

26 changes: 26 additions & 0 deletions .github/workflows/test.yml
@@ -0,0 +1,26 @@
name: Python package

on: [push]

jobs:
  build:

    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: [2.7, 3.5, 3.6, 3.7, 3.8, 3.9, pypy2, pypy3]

    steps:
    - uses: actions/checkout@v2
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v2
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip wheel
        pip install --upgrade --upgrade-strategy only-if-needed pytest pytest-catchlog
        pip install .
    - name: Test with pytest
      run: |
        pytest
34 changes: 0 additions & 34 deletions .travis.yml

This file was deleted.

2 changes: 2 additions & 0 deletions MANIFEST.in
@@ -4,3 +4,5 @@ include requirements.txt
include test.py
recursive-include docs *
recursive-include tests *
global-exclude *.pyc
global-exclude __pycache__
4 changes: 2 additions & 2 deletions NOTES.rst
@@ -133,8 +133,8 @@ might be useful in the future:


.. _BOM by Encoding:
https://en.wikipedia.org/wiki/Byte_order_mark#Representations_of_byte_order_marks_by_encoding
https://en.wikipedia.org/wiki/Byte_order_mark#Byte_order_marks_by_encoding
.. _A Composite Approach to Language/Encoding Detection:
http://www-archive.mozilla.org/projects/intl/UniversalCharsetDetection.html
.. _What Every Programmer Absolutely...: http://kunststube.net/encoding/
.. _source: https://mxr.mozilla.org/mozilla/source/intl/chardet/
.. _source: https://dxr.mozilla.org/mozilla/source/intl/chardet/
6 changes: 3 additions & 3 deletions README.rst
@@ -32,12 +32,12 @@ Detects
Our ISO-8859-2 and windows-1250 (Hungarian) probers have been temporarily
disabled until we can retrain the models.

Requires Python 2.6, 2.7, or 3.3+.
Requires Python 2.7 or 3.5+.

Installation
------------

Install from `PyPI <https://pypi.python.org/pypi/chardet>`_::
Install from `PyPI <https://pypi.org/project/chardet/>`_::

pip install chardet

@@ -63,6 +63,6 @@ This is a continuation of Mark Pilgrim's excellent chardet. Previously, two
versions needed to be maintained: one that supported python 2.x and one that
supported python 3.x. We've recently merged with `Ian Cordasco <https://github.com/sigmavirus24>`_'s
`charade <https://github.com/sigmavirus24/charade>`_ fork, so now we have one
coherent version that works for Python 2.6+.
coherent version that works for Python 2.7+ and 3.4+.

:maintainer: Dan Blanchard
146 changes: 146 additions & 0 deletions bench.py
@@ -0,0 +1,146 @@
"""
Run chardet on a bunch of documents and see that we get the correct encodings.
:author: Dan Blanchard
:author: Ian Cordasco
"""

from __future__ import print_function, with_statement

import argparse
import sys
import time
from collections import defaultdict
from io import open
from os import listdir
from os.path import dirname, isdir, join, realpath, relpath, splitext

import chardet

try:
import cchardet
HAVE_CCHARDET = True
except:
HAVE_CCHARDET = False


# TODO: Restore Hungarian encodings (iso-8859-2 and windows-1250) after we
# retrain model.
MISSING_ENCODINGS = {'iso-8859-2', 'iso-8859-6', 'windows-1250',
'windows-1254', 'windows-1256'}
EXPECTED_FAILURES = {'tests/iso-8859-7-greek/disabled.gr.xml',
'tests/iso-8859-9-turkish/divxplanet.com.xml',
'tests/iso-8859-9-turkish/subtitle.srt',
'tests/iso-8859-9-turkish/wikitop_tr_ISO-8859-9.txt'}

def get_py_impl():
"""Return what kind of Python this is"""
if hasattr(sys, 'pypy_version_info'):
pyimpl = 'PyPy'
elif sys.platform.startswith('java'):
pyimpl = 'Jython'
elif sys.platform == 'cli':
pyimpl = 'IronPython'
else:
pyimpl = 'CPython'
return pyimpl


def get_test_files():
"""Yields filenames to use for timing chardet.detect"""
base_path = relpath(join(dirname(realpath(__file__)), 'tests'))
for encoding in listdir(base_path):
path = join(base_path, encoding)
# Skip files in tests directory
if not isdir(path):
continue
# Remove language suffixes from encoding if pressent
encoding = encoding.lower()
for postfix in ['-arabic', '-bulgarian', '-cyrillic', '-greek',
'-hebrew', '-hungarian', '-turkish']:
if encoding.endswith(postfix):
encoding = encoding.rpartition(postfix)[0]
break
# Skip directories for encodings we don't handle yet.
if encoding in MISSING_ENCODINGS:
continue
# Test encoding detection for each file we have of encoding for
for file_name in listdir(path):
ext = splitext(file_name)[1].lower()
if ext not in ['.html', '.txt', '.xml', '.srt']:
continue
full_path = join(path, file_name)
if full_path in EXPECTED_FAILURES:
continue
yield full_path, encoding


def benchmark(chardet_mod=chardet, verbose=False, num_iters=10):
print('Benchmarking {} {} on {} {}'.format(chardet_mod.__name__,
chardet_mod.__version__,
get_py_impl(),
sys.version))
print('-' * 80)
total_time = 0
num_files = 0
encoding_times = defaultdict(float)
encoding_num_files = defaultdict(int)
for full_path, encoding in get_test_files():
num_files += 1
with open(full_path, 'rb') as f:
input_bytes = f.read()
start = time.time()
for _ in range(num_iters):
chardet_mod.detect(input_bytes)
bench_time = time.time() - start
if verbose:
print('Average time for {}: {}s'.format(full_path,
bench_time / num_iters))
else:
print('.', end='')
sys.stdout.flush()
total_time += bench_time
encoding_times[encoding] += bench_time
encoding_num_files[encoding] += 1

print('\nCalls per second for each encoding:')
for encoding in sorted(encoding_times.keys()):
print('{}: {}'.format(encoding,
num_iters * encoding_num_files[encoding] /
encoding_times[encoding]))

print('\nTotal time: {}s ({} calls per second)'.format(total_time,
num_iters * num_files /
total_time))


def main():
parser = argparse.ArgumentParser(
description='Times how long it takes to process each file in test set '
'multiple times.',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('-c', '--cchardet',
action='store_true',
help='Run benchmarks for cChardet instead of chardet, '
'if it is installed.')
parser.add_argument('-i', '--iterations',
help='Number of times to process each file',
type=int,
default=10)
parser.add_argument('-v', '--verbose',
help='Prints out the timing for each individual file.',
action='store_true')
args = parser.parse_args()

if args.cchardet and not HAVE_CCHARDET:
print('You must pip install cchardet if you want to benchmark it.')
sys.exit(1)


benchmark(chardet_mod=cchardet if args.cchardet else chardet,
verbose=args.verbose,
num_iters=args.iterations)


if __name__ == '__main__':
main()
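
A minimal usage sketch for the new benchmark script (not part of the diff); the flag values and iteration count below are illustrative, and it assumes a checkout with the repository's tests/ directory present:

# From the command line, using the flags defined in main() above:
#   python bench.py --iterations 5 --verbose
#   python bench.py --cchardet        # requires cchardet to be installed
# Or driven programmatically:
import chardet
from bench import benchmark

benchmark(chardet_mod=chardet, verbose=True, num_iters=5)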
48 changes: 46 additions & 2 deletions chardet/__init__.py
@@ -16,11 +16,14 @@
######################### END LICENSE BLOCK #########################


from .compat import PY2, PY3
from .universaldetector import UniversalDetector
from .enums import InputState
from .version import __version__, VERSION


__all__ = ['UniversalDetector', 'detect', 'detect_all', '__version__', 'VERSION']


def detect(byte_str):
"""
Detect the encoding of the given byte string.
@@ -31,9 +34,50 @@ def detect(byte_str):
    if not isinstance(byte_str, bytearray):
        if not isinstance(byte_str, bytes):
            raise TypeError('Expected object of type bytes or bytearray, got: '
                            '{0}'.format(type(byte_str)))
                            '{}'.format(type(byte_str)))
        else:
            byte_str = bytearray(byte_str)
    detector = UniversalDetector()
    detector.feed(byte_str)
    return detector.close()


def detect_all(byte_str):
    """
    Detect all the possible encodings of the given byte string.

    :param byte_str: The byte sequence to examine.
    :type byte_str: ``bytes`` or ``bytearray``
    """
    if not isinstance(byte_str, bytearray):
        if not isinstance(byte_str, bytes):
            raise TypeError('Expected object of type bytes or bytearray, got: '
                            '{}'.format(type(byte_str)))
        else:
            byte_str = bytearray(byte_str)

    detector = UniversalDetector()
    detector.feed(byte_str)
    detector.close()

    if detector._input_state == InputState.HIGH_BYTE:
        results = []
        for prober in detector._charset_probers:
            if prober.get_confidence() > detector.MINIMUM_THRESHOLD:
                charset_name = prober.charset_name
                lower_charset_name = prober.charset_name.lower()
                # Use Windows encoding name instead of ISO-8859 if we saw any
                # extra Windows-specific bytes
                if lower_charset_name.startswith('iso-8859'):
                    if detector._has_win_bytes:
                        charset_name = detector.ISO_WIN_MAP.get(lower_charset_name,
                                                                charset_name)
                results.append({
                    'encoding': charset_name,
                    'confidence': prober.get_confidence(),
                    'language': prober.language,
                })
        if len(results) > 0:
            return sorted(results, key=lambda result: -result['confidence'])

    return [detector.result]
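
A hedged usage sketch of the new detect_all API introduced above (not part of the diff; the sample bytes are hypothetical):

import chardet

data = u'Dépôt légal'.encode('windows-1252')  # hypothetical sample bytes
print(chardet.detect(data))      # single best guess: {'encoding': ..., 'confidence': ..., 'language': ...}
print(chardet.detect_all(data))  # all candidates above MINIMUM_THRESHOLD, sorted by descending confidence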
1 change: 1 addition & 0 deletions chardet/charsetgroupprober.py
@@ -73,6 +73,7 @@ def feed(self, byte_str):
                continue
            if state == ProbingState.FOUND_IT:
                self._best_guess_prober = prober
                self._state = ProbingState.FOUND_IT
                return self.state
            elif state == ProbingState.NOT_ME:
                prober.active = False
7 changes: 3 additions & 4 deletions chardet/cli/chardetect.py 100755 → 100644
@@ -1,4 +1,3 @@
#!/usr/bin/env python
"""
Script which takes one or more file paths and reports on their detected
encodings
@@ -45,10 +44,10 @@ def description_of(lines, name='stdin'):
    if PY2:
        name = name.decode(sys.getfilesystemencoding(), 'ignore')
    if result['encoding']:
        return '{0}: {1} with confidence {2}'.format(name, result['encoding'],
        return '{}: {} with confidence {}'.format(name, result['encoding'],
                                                   result['confidence'])
    else:
        return '{0}: no result'.format(name)
        return '{}: no result'.format(name)


def main(argv=None):
@@ -69,7 +68,7 @@ def main(argv=None):
                        type=argparse.FileType('rb'), nargs='*',
                        default=[sys.stdin if PY2 else sys.stdin.buffer])
    parser.add_argument('--version', action='version',
                        version='%(prog)s {0}'.format(__version__))
                        version='%(prog)s {}'.format(__version__))
    args = parser.parse_args(argv)

    for f in args.input:
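
For illustration only (not part of the diff), description_of can also be called directly from Python; the file name here is hypothetical:

from chardet.cli.chardetect import description_of

with open('unknown.txt', 'rb') as f:  # hypothetical input file, opened in binary mode
    print(description_of(f, name='unknown.txt'))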
6 changes: 4 additions & 2 deletions chardet/compat.py
@@ -25,10 +25,12 @@
if sys.version_info < (3, 0):
    PY2 = True
    PY3 = False
    base_str = (str, unicode)
    string_types = (str, unicode)
    text_type = unicode
    iteritems = dict.iteritems
else:
    PY2 = False
    PY3 = True
    base_str = (bytes, str)
    string_types = (bytes, str)
    text_type = str
    iteritems = dict.items
