diff --git a/chardet/__init__.py b/chardet/__init__.py index 4c7edddc..eca92a69 100644 --- a/chardet/__init__.py +++ b/chardet/__init__.py @@ -24,12 +24,15 @@ __all__ = ['UniversalDetector', 'detect', 'detect_all', '__version__', 'VERSION'] -def detect(byte_str): +def detect(byte_str, chunk_size=100000): """ Detect the encoding of the given byte string. :param byte_str: The byte sequence to examine. :type byte_str: ``bytes`` or ``bytearray`` + :param chunk_size: Number of bytes to feed into the underlying + UniversalDetector at a time. + :type chunk_size: ``int`` """ if not isinstance(byte_str, bytearray): if not isinstance(byte_str, bytes): @@ -38,7 +41,10 @@ def detect(byte_str): else: byte_str = bytearray(byte_str) detector = UniversalDetector() - detector.feed(byte_str) + for i in range(len(byte_str) // chunk_size + 1): + detector.feed(byte_str[i * chunk_size: (i + 1) * chunk_size]) + if detector.done: + break return detector.close() @@ -55,7 +61,6 @@ def detect_all(byte_str): '{}'.format(type(byte_str))) else: byte_str = bytearray(byte_str) - detector = UniversalDetector() detector.feed(byte_str) detector.close() @@ -74,7 +79,8 @@ def detect_all(byte_str): charset_name) results.append({ 'encoding': charset_name, - 'confidence': prober.get_confidence() + 'confidence': prober.get_confidence(), + 'language': prober.language, }) if len(results) > 0: return sorted(results, key=lambda result: -result['confidence'])