
Merge pull request #140 from chardet/master
Release 4.0.0
dan-blanchard committed Dec 10, 2020
2 parents 9b8c5c2 + 53854fb commit a808ed1
Showing 29 changed files with 33,412 additions and 1,686 deletions.
9 changes: 0 additions & 9 deletions .coveragerc

This file was deleted.

26 changes: 26 additions & 0 deletions .github/workflows/test.yml
@@ -0,0 +1,26 @@
name: Python package

on: [push]

jobs:
  build:

    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: [2.7, 3.5, 3.6, 3.7, 3.8, 3.9, pypy2, pypy3]

    steps:
    - uses: actions/checkout@v2
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v2
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip wheel
        pip install --upgrade --upgrade-strategy only-if-needed pytest pytest-catchlog
        pip install .
    - name: Test with pytest
      run: |
        pytest
34 changes: 0 additions & 34 deletions .travis.yml

This file was deleted.

2 changes: 2 additions & 0 deletions MANIFEST.in
@@ -4,3 +4,5 @@ include requirements.txt
include test.py
recursive-include docs *
recursive-include tests *
global-exclude *.pyc
global-exclude __pycache__
4 changes: 2 additions & 2 deletions NOTES.rst
@@ -133,8 +133,8 @@ might be useful in the future:


.. _BOM by Encoding:
https://en.wikipedia.org/wiki/Byte_order_mark#Representations_of_byte_order_marks_by_encoding
https://en.wikipedia.org/wiki/Byte_order_mark#Byte_order_marks_by_encoding
.. _A Composite Approach to Language/Encoding Detection:
http://www-archive.mozilla.org/projects/intl/UniversalCharsetDetection.html
.. _What Every Programmer Absolutely...: http://kunststube.net/encoding/
.. _source: https://mxr.mozilla.org/mozilla/source/intl/chardet/
.. _source: https://dxr.mozilla.org/mozilla/source/intl/chardet/
6 changes: 3 additions & 3 deletions README.rst
@@ -32,12 +32,12 @@ Detects
Our ISO-8859-2 and windows-1250 (Hungarian) probers have been temporarily
disabled until we can retrain the models.

Requires Python 2.6, 2.7, or 3.3+.
Requires Python 2.7 or 3.5+.

Installation
------------

Install from `PyPI <https://pypi.python.org/pypi/chardet>`_::
Install from `PyPI <https://pypi.org/project/chardet/>`_::

pip install chardet

@@ -63,6 +63,6 @@ This is a continuation of Mark Pilgrim's excellent chardet. Previously, two
versions needed to be maintained: one that supported python 2.x and one that
supported python 3.x. We've recently merged with `Ian Cordasco <https://github.com/sigmavirus24>`_'s
`charade <https://github.com/sigmavirus24/charade>`_ fork, so now we have one
coherent version that works for Python 2.6+.
coherent version that works for Python 2.7+ and 3.4+.

:maintainer: Dan Blanchard
146 changes: 146 additions & 0 deletions bench.py
@@ -0,0 +1,146 @@
"""
Run chardet on a bunch of documents and see that we get the correct encodings.
:author: Dan Blanchard
:author: Ian Cordasco
"""

from __future__ import print_function, with_statement

import argparse
import sys
import time
from collections import defaultdict
from io import open
from os import listdir
from os.path import dirname, isdir, join, realpath, relpath, splitext

import chardet

try:
import cchardet
HAVE_CCHARDET = True
except:
HAVE_CCHARDET = False


# TODO: Restore Hungarian encodings (iso-8859-2 and windows-1250) after we
# retrain model.
MISSING_ENCODINGS = {'iso-8859-2', 'iso-8859-6', 'windows-1250',
'windows-1254', 'windows-1256'}
EXPECTED_FAILURES = {'tests/iso-8859-7-greek/disabled.gr.xml',
'tests/iso-8859-9-turkish/divxplanet.com.xml',
'tests/iso-8859-9-turkish/subtitle.srt',
'tests/iso-8859-9-turkish/wikitop_tr_ISO-8859-9.txt'}

def get_py_impl():
"""Return what kind of Python this is"""
if hasattr(sys, 'pypy_version_info'):
pyimpl = 'PyPy'
elif sys.platform.startswith('java'):
pyimpl = 'Jython'
elif sys.platform == 'cli':
pyimpl = 'IronPython'
else:
pyimpl = 'CPython'
return pyimpl


def get_test_files():
"""Yields filenames to use for timing chardet.detect"""
base_path = relpath(join(dirname(realpath(__file__)), 'tests'))
for encoding in listdir(base_path):
path = join(base_path, encoding)
# Skip files in tests directory
if not isdir(path):
continue
# Remove language suffixes from encoding if pressent
encoding = encoding.lower()
for postfix in ['-arabic', '-bulgarian', '-cyrillic', '-greek',
'-hebrew', '-hungarian', '-turkish']:
if encoding.endswith(postfix):
encoding = encoding.rpartition(postfix)[0]
break
# Skip directories for encodings we don't handle yet.
if encoding in MISSING_ENCODINGS:
continue
# Test encoding detection for each file we have of encoding for
for file_name in listdir(path):
ext = splitext(file_name)[1].lower()
if ext not in ['.html', '.txt', '.xml', '.srt']:
continue
full_path = join(path, file_name)
if full_path in EXPECTED_FAILURES:
continue
yield full_path, encoding


def benchmark(chardet_mod=chardet, verbose=False, num_iters=10):
print('Benchmarking {} {} on {} {}'.format(chardet_mod.__name__,
chardet_mod.__version__,
get_py_impl(),
sys.version))
print('-' * 80)
total_time = 0
num_files = 0
encoding_times = defaultdict(float)
encoding_num_files = defaultdict(int)
for full_path, encoding in get_test_files():
num_files += 1
with open(full_path, 'rb') as f:
input_bytes = f.read()
start = time.time()
for _ in range(num_iters):
chardet_mod.detect(input_bytes)
bench_time = time.time() - start
if verbose:
print('Average time for {}: {}s'.format(full_path,
bench_time / num_iters))
else:
print('.', end='')
sys.stdout.flush()
total_time += bench_time
encoding_times[encoding] += bench_time
encoding_num_files[encoding] += 1

print('\nCalls per second for each encoding:')
for encoding in sorted(encoding_times.keys()):
print('{}: {}'.format(encoding,
num_iters * encoding_num_files[encoding] /
encoding_times[encoding]))

print('\nTotal time: {}s ({} calls per second)'.format(total_time,
num_iters * num_files /
total_time))


def main():
parser = argparse.ArgumentParser(
description='Times how long it takes to process each file in test set '
'multiple times.',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('-c', '--cchardet',
action='store_true',
help='Run benchmarks for cChardet instead of chardet, '
'if it is installed.')
parser.add_argument('-i', '--iterations',
help='Number of times to process each file',
type=int,
default=10)
parser.add_argument('-v', '--verbose',
help='Prints out the timing for each individual file.',
action='store_true')
args = parser.parse_args()

if args.cchardet and not HAVE_CCHARDET:
print('You must pip install cchardet if you want to benchmark it.')
sys.exit(1)


benchmark(chardet_mod=cchardet if args.cchardet else chardet,
verbose=args.verbose,
num_iters=args.iterations)


if __name__ == '__main__':
main()
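
A minimal usage sketch for the new benchmark script (not part of the diff); the flag values and iteration count below are illustrative, and it assumes a checkout with the repository's tests/ directory present:

# From the command line, using the flags defined in main() above:
#   python bench.py --iterations 5 --verbose
#   python bench.py --cchardet        # requires cchardet to be installed
# Or driven programmatically:
import chardet
from bench import benchmark

benchmark(chardet_mod=chardet, verbose=True, num_iters=5)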
48 changes: 46 additions & 2 deletions chardet/__init__.py
@@ -16,11 +16,14 @@
######################### END LICENSE BLOCK #########################


from .compat import PY2, PY3
from .universaldetector import UniversalDetector
from .enums import InputState
from .version import __version__, VERSION


__all__ = ['UniversalDetector', 'detect', 'detect_all', '__version__', 'VERSION']


def detect(byte_str):
"""
Detect the encoding of the given byte string.
@@ -31,9 +34,50 @@ def detect(byte_str):
    if not isinstance(byte_str, bytearray):
        if not isinstance(byte_str, bytes):
            raise TypeError('Expected object of type bytes or bytearray, got: '
                            '{0}'.format(type(byte_str)))
                            '{}'.format(type(byte_str)))
        else:
            byte_str = bytearray(byte_str)
    detector = UniversalDetector()
    detector.feed(byte_str)
    return detector.close()


def detect_all(byte_str):
    """
    Detect all the possible encodings of the given byte string.

    :param byte_str: The byte sequence to examine.
    :type byte_str: ``bytes`` or ``bytearray``
    """
    if not isinstance(byte_str, bytearray):
        if not isinstance(byte_str, bytes):
            raise TypeError('Expected object of type bytes or bytearray, got: '
                            '{}'.format(type(byte_str)))
        else:
            byte_str = bytearray(byte_str)

    detector = UniversalDetector()
    detector.feed(byte_str)
    detector.close()

    if detector._input_state == InputState.HIGH_BYTE:
        results = []
        for prober in detector._charset_probers:
            if prober.get_confidence() > detector.MINIMUM_THRESHOLD:
                charset_name = prober.charset_name
                lower_charset_name = prober.charset_name.lower()
                # Use Windows encoding name instead of ISO-8859 if we saw any
                # extra Windows-specific bytes
                if lower_charset_name.startswith('iso-8859'):
                    if detector._has_win_bytes:
                        charset_name = detector.ISO_WIN_MAP.get(lower_charset_name,
                                                                charset_name)
                results.append({
                    'encoding': charset_name,
                    'confidence': prober.get_confidence(),
                    'language': prober.language,
                })
        if len(results) > 0:
            return sorted(results, key=lambda result: -result['confidence'])

    return [detector.result]
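
A hedged usage sketch of the new detect_all API introduced above (not part of the diff; the sample bytes are hypothetical):

import chardet

data = u'Dépôt légal'.encode('windows-1252')  # hypothetical sample bytes
print(chardet.detect(data))      # single best guess: {'encoding': ..., 'confidence': ..., 'language': ...}
print(chardet.detect_all(data))  # all candidates above MINIMUM_THRESHOLD, sorted by descending confidence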
1 change: 1 addition & 0 deletions chardet/charsetgroupprober.py
@@ -73,6 +73,7 @@ def feed(self, byte_str):
                continue
            if state == ProbingState.FOUND_IT:
                self._best_guess_prober = prober
                self._state = ProbingState.FOUND_IT
                return self.state
            elif state == ProbingState.NOT_ME:
                prober.active = False
7 changes: 3 additions & 4 deletions chardet/cli/chardetect.py 100755 → 100644
@@ -1,4 +1,3 @@
#!/usr/bin/env python
"""
Script which takes one or more file paths and reports on their detected
encodings
@@ -45,10 +44,10 @@ def description_of(lines, name='stdin'):
    if PY2:
        name = name.decode(sys.getfilesystemencoding(), 'ignore')
    if result['encoding']:
        return '{0}: {1} with confidence {2}'.format(name, result['encoding'],
        return '{}: {} with confidence {}'.format(name, result['encoding'],
                                                   result['confidence'])
    else:
        return '{0}: no result'.format(name)
        return '{}: no result'.format(name)


def main(argv=None):
@@ -69,7 +68,7 @@ def main(argv=None):
                        type=argparse.FileType('rb'), nargs='*',
                        default=[sys.stdin if PY2 else sys.stdin.buffer])
    parser.add_argument('--version', action='version',
                        version='%(prog)s {0}'.format(__version__))
                        version='%(prog)s {}'.format(__version__))
    args = parser.parse_args(argv)

    for f in args.input:
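
For illustration only (not part of the diff), description_of can also be called directly from Python; the file name here is hypothetical:

from chardet.cli.chardetect import description_of

with open('unknown.txt', 'rb') as f:  # hypothetical input file, opened in binary mode
    print(description_of(f, name='unknown.txt'))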
6 changes: 4 additions & 2 deletions chardet/compat.py
@@ -25,10 +25,12 @@
if sys.version_info < (3, 0):
    PY2 = True
    PY3 = False
    base_str = (str, unicode)
    string_types = (str, unicode)
    text_type = unicode
    iteritems = dict.iteritems
else:
    PY2 = False
    PY3 = True
    base_str = (bytes, str)
    string_types = (bytes, str)
    text_type = str
    iteritems = dict.items
