Skip to content

Commit

Permalink
Add language to detect_all output
Browse files Browse the repository at this point in the history
  • Loading branch information
dan-blanchard committed Dec 10, 2020
1 parent 1e208b7 commit 9a754c9
Showing 1 changed file with 10 additions and 4 deletions.
14 changes: 10 additions & 4 deletions chardet/__init__.py
Expand Up @@ -24,12 +24,15 @@
__all__ = ['UniversalDetector', 'detect', 'detect_all', '__version__', 'VERSION']


def detect(byte_str):
def detect(byte_str, chunk_size=100000):
"""
Detect the encoding of the given byte string.
:param byte_str: The byte sequence to examine.
:type byte_str: ``bytes`` or ``bytearray``
:param chunk_size: Number of bytes to feed into the underlying
UniversalDetector at a time.
:type chunk_size: ``int``
"""
if not isinstance(byte_str, bytearray):
if not isinstance(byte_str, bytes):
Expand All @@ -38,7 +41,10 @@ def detect(byte_str):
else:
byte_str = bytearray(byte_str)
detector = UniversalDetector()
detector.feed(byte_str)
for i in range(len(byte_str) // chunk_size + 1):
detector.feed(byte_str[i * chunk_size: (i + 1) * chunk_size])
if detector.done:
break
return detector.close()


Expand All @@ -55,7 +61,6 @@ def detect_all(byte_str):
'{}'.format(type(byte_str)))
else:
byte_str = bytearray(byte_str)

detector = UniversalDetector()
detector.feed(byte_str)
detector.close()
Expand All @@ -74,7 +79,8 @@ def detect_all(byte_str):
charset_name)
results.append({
'encoding': charset_name,
'confidence': prober.get_confidence()
'confidence': prober.get_confidence(),
'language': prober.language,
})
if len(results) > 0:
return sorted(results, key=lambda result: -result['confidence'])
Expand Down

0 comments on commit 9a754c9

Please sign in to comment.