Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #140 from chardet/master
Release 4.0.0
- Loading branch information
Showing
29 changed files
with
33,412 additions
and
1,686 deletions.
There are no files selected for viewing
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
# CI workflow: run the chardet test suite on every push, across all
# supported CPython and PyPy versions.
name: Python package

on: [push]

jobs:
  build:

    runs-on: ubuntu-latest
    strategy:
      matrix:
        # Full supported-interpreter matrix (CPython 2.7–3.9 plus PyPy).
        python-version: [2.7, 3.5, 3.6, 3.7, 3.8, 3.9, pypy2, pypy3]

    steps:
    - uses: actions/checkout@v2
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v2
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip wheel
        # NOTE(review): pytest-catchlog was merged into pytest 3.3+ and is
        # deprecated — confirm the suite still needs it before removing.
        pip install --upgrade --upgrade-strategy only-if-needed pytest pytest-catchlog
        pip install .
    - name: Test with pytest
      run: |
        pytest
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,146 @@ | ||
""" | ||
Run chardet on a bunch of documents and see that we get the correct encodings. | ||
:author: Dan Blanchard | ||
:author: Ian Cordasco | ||
""" | ||
|
||
from __future__ import print_function, with_statement | ||
|
||
import argparse | ||
import sys | ||
import time | ||
from collections import defaultdict | ||
from io import open | ||
from os import listdir | ||
from os.path import dirname, isdir, join, realpath, relpath, splitext | ||
|
||
import chardet | ||
|
||
# cChardet is an optional, faster C implementation used only for comparison
# benchmarks.  Catch ImportError specifically: the original bare `except:`
# would also swallow KeyboardInterrupt/SystemExit and mask unrelated bugs
# raised while importing cchardet.
try:
    import cchardet
    HAVE_CCHARDET = True
except ImportError:
    HAVE_CCHARDET = False
|
||
|
||
# TODO: Restore Hungarian encodings (iso-8859-2 and windows-1250) after we
# retrain model.
# Encodings whose whole test directories are skipped because the current
# models cannot detect them yet.
MISSING_ENCODINGS = {'iso-8859-2', 'iso-8859-6', 'windows-1250',
                     'windows-1254', 'windows-1256'}
# Individual files known to be misdetected; excluded so benchmark results
# reflect only the supported cases.
EXPECTED_FAILURES = {'tests/iso-8859-7-greek/disabled.gr.xml',
                     'tests/iso-8859-9-turkish/divxplanet.com.xml',
                     'tests/iso-8859-9-turkish/subtitle.srt',
                     'tests/iso-8859-9-turkish/wikitop_tr_ISO-8859-9.txt'}
|
||
def get_py_impl():
    """Return the name of the Python implementation currently running.

    Distinguishes PyPy, Jython, IronPython and CPython, in that order of
    checks; anything not matching the first three is reported as CPython.
    """
    if hasattr(sys, 'pypy_version_info'):
        return 'PyPy'
    if sys.platform.startswith('java'):
        return 'Jython'
    if sys.platform == 'cli':
        return 'IronPython'
    return 'CPython'
|
||
|
||
def get_test_files():
    """Yield ``(file_path, encoding)`` pairs for every benchmarkable file.

    Walks the ``tests`` directory next to this script; each subdirectory is
    named after an encoding (optionally with a language suffix).  Encodings
    in ``MISSING_ENCODINGS`` and files in ``EXPECTED_FAILURES`` are skipped.
    """
    tests_dir = relpath(join(dirname(realpath(__file__)), 'tests'))
    language_suffixes = ('-arabic', '-bulgarian', '-cyrillic', '-greek',
                         '-hebrew', '-hungarian', '-turkish')
    usable_extensions = {'.html', '.txt', '.xml', '.srt'}
    for entry in listdir(tests_dir):
        encoding_dir = join(tests_dir, entry)
        # Plain files sitting directly under tests/ are not encoding dirs.
        if not isdir(encoding_dir):
            continue
        # Strip any language suffix to recover the bare encoding name.
        encoding = entry.lower()
        for suffix in language_suffixes:
            if encoding.endswith(suffix):
                encoding = encoding.rpartition(suffix)[0]
                break
        # Skip directories for encodings we don't handle yet.
        if encoding in MISSING_ENCODINGS:
            continue
        for file_name in listdir(encoding_dir):
            if splitext(file_name)[1].lower() not in usable_extensions:
                continue
            full_path = join(encoding_dir, file_name)
            # Known-bad files are excluded from the benchmark entirely.
            if full_path not in EXPECTED_FAILURES:
                yield full_path, encoding
|
||
|
||
def benchmark(chardet_mod=chardet, verbose=False, num_iters=10):
    """Time ``chardet_mod.detect()`` over every test file and print stats.

    :param chardet_mod: module exposing a chardet-style ``detect`` function
                        (``chardet`` by default, or ``cchardet``)
    :param verbose: if True, print the average time per file instead of a
                    progress dot per file
    :param num_iters: number of ``detect`` calls made per file
    """
    print('Benchmarking {} {} on {} {}'.format(chardet_mod.__name__,
                                               chardet_mod.__version__,
                                               get_py_impl(),
                                               sys.version))
    print('-' * 80)
    elapsed_total = 0
    file_count = 0
    time_by_encoding = defaultdict(float)
    files_by_encoding = defaultdict(int)
    for full_path, encoding in get_test_files():
        file_count += 1
        with open(full_path, 'rb') as f:
            input_bytes = f.read()
        start = time.time()
        for _ in range(num_iters):
            chardet_mod.detect(input_bytes)
        bench_time = time.time() - start
        if verbose:
            print('Average time for {}: {}s'.format(full_path,
                                                    bench_time / num_iters))
        else:
            # Progress indicator; flush so the dots show up immediately.
            print('.', end='')
            sys.stdout.flush()
        elapsed_total += bench_time
        time_by_encoding[encoding] += bench_time
        files_by_encoding[encoding] += 1

    print('\nCalls per second for each encoding:')
    for encoding in sorted(time_by_encoding):
        rate = (num_iters * files_by_encoding[encoding] /
                time_by_encoding[encoding])
        print('{}: {}'.format(encoding, rate))

    print('\nTotal time: {}s ({} calls per second)'.format(
        elapsed_total, num_iters * file_count / elapsed_total))
|
||
|
||
def main():
    """Parse command-line options and run the benchmark.

    Exits with status 1 if ``--cchardet`` is requested but cChardet is not
    installed.
    """
    parser = argparse.ArgumentParser(
        description='Times how long it takes to process each file in test set '
                    'multiple times.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-c', '--cchardet',
                        action='store_true',
                        help='Run benchmarks for cChardet instead of chardet, '
                             'if it is installed.')
    parser.add_argument('-i', '--iterations',
                        type=int,
                        default=10,
                        help='Number of times to process each file')
    parser.add_argument('-v', '--verbose',
                        action='store_true',
                        help='Prints out the timing for each individual file.')
    args = parser.parse_args()

    # Fail fast when the optional cChardet backend was requested but absent.
    if args.cchardet and not HAVE_CCHARDET:
        print('You must pip install cchardet if you want to benchmark it.')
        sys.exit(1)

    module_under_test = cchardet if args.cchardet else chardet
    benchmark(chardet_mod=module_under_test,
              verbose=args.verbose,
              num_iters=args.iterations)
|
||
|
||
# Allow this module to be executed directly as a benchmarking script.
if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.