Skip to content

Commit

Permalink
Add should_rename_legacy flag to allow W3C renaming
Browse files Browse the repository at this point in the history
  • Loading branch information
dan-blanchard committed Jun 29, 2022
1 parent c4f7057 commit f29184c
Show file tree
Hide file tree
Showing 4 changed files with 100 additions and 9 deletions.
25 changes: 21 additions & 4 deletions chardet/__init__.py
Expand Up @@ -27,26 +27,34 @@
__all__ = ["UniversalDetector", "detect", "detect_all", "__version__", "VERSION"]


def detect(
    byte_str: Union[bytes, bytearray], should_rename_legacy: bool = False
) -> ResultDict:
    """
    Detect the encoding of the given byte string.

    :param byte_str: The byte sequence to examine.
    :type byte_str: ``bytes`` or ``bytearray``
    :param should_rename_legacy: Should we rename legacy encodings
                                 (as determined by W3C) to their more
                                 modern equivalents?
    :type should_rename_legacy: ``bool``
    :raises TypeError: if ``byte_str`` is neither ``bytes`` nor ``bytearray``.
    """
    if not isinstance(byte_str, bytearray):
        if not isinstance(byte_str, bytes):
            raise TypeError(
                f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
            )
        # Probers operate on bytearray input, so normalize bytes up front.
        byte_str = bytearray(byte_str)
    # The detector owns the legacy-renaming behavior; pass the flag through.
    detector = UniversalDetector(should_rename_legacy=should_rename_legacy)
    detector.feed(byte_str)
    return detector.close()


def detect_all(
byte_str: Union[bytes, bytearray], ignore_threshold: bool = False
byte_str: Union[bytes, bytearray],
ignore_threshold: bool = False,
should_rename_legacy: bool = False,
) -> List[ResultDict]:
"""
Detect all the possible encodings of the given byte string.
Expand All @@ -57,6 +65,10 @@ def detect_all(
``UniversalDetector.MINIMUM_THRESHOLD``
in results.
:type ignore_threshold: ``bool``
:param should_rename_legacy: Should we rename legacy encodings
(as determined by W3C) to their more
modern equivalents?
:type should_rename_legacy: ``bool``
"""
if not isinstance(byte_str, bytearray):
if not isinstance(byte_str, bytes):
Expand All @@ -65,7 +77,7 @@ def detect_all(
)
byte_str = bytearray(byte_str)

detector = UniversalDetector()
detector = UniversalDetector(should_rename_legacy=should_rename_legacy)
detector.feed(byte_str)
detector.close()

Expand All @@ -87,6 +99,11 @@ def detect_all(
charset_name = detector.ISO_WIN_MAP.get(
lower_charset_name, charset_name
)
# Rename legacy encodings with superset encodings if asked
if should_rename_legacy:
charset_name = detector.W3C_LEGACY_MAP.get(
charset_name.lower(), charset_name
)
results.append(
{
"encoding": charset_name,
Expand Down
23 changes: 20 additions & 3 deletions chardet/cli/chardetect.py
Expand Up @@ -22,7 +22,10 @@


def description_of(
lines: Iterable[bytes], name: str = "stdin", minimal: bool = False
lines: Iterable[bytes],
name: str = "stdin",
minimal: bool = False,
should_rename_legacy: bool = False,
) -> Optional[str]:
"""
Return a string describing the probable encoding of a file or
Expand All @@ -32,8 +35,12 @@ def description_of(
:type lines: Iterable of bytes
:param name: Name of file or collection of lines
:type name: str
:param should_rename_legacy: Should we rename legacy encodings
(as determined by W3C) to their more
modern equivalents?
:type should_rename_legacy: ``bool``
"""
u = UniversalDetector()
u = UniversalDetector(should_rename_legacy=should_rename_legacy)
for line in lines:
line = bytearray(line)
u.feed(line)
Expand Down Expand Up @@ -75,6 +82,12 @@ def main(argv: Optional[List[str]] = None) -> None:
help="Print only the encoding to standard output",
action="store_true",
)
parser.add_argument(
"-l",
"--legacy",
help="Rename legacy encodings to more modern ones according to W3C standard.",
action="store_true",
)
parser.add_argument(
"--version", action="version", version=f"%(prog)s {__version__}"
)
Expand All @@ -89,7 +102,11 @@ def main(argv: Optional[List[str]] = None) -> None:
"--help\n",
file=sys.stderr,
)
print(description_of(f, f.name, minimal=args.minimal))
print(
description_of(
f, f.name, minimal=args.minimal, should_rename_legacy=args.legacy
)
)


if __name__ == "__main__":
Expand Down
19 changes: 18 additions & 1 deletion chardet/universaldetector.py
Expand Up @@ -84,8 +84,24 @@ class UniversalDetector:
"iso-8859-9": "Windows-1254",
"iso-8859-13": "Windows-1257",
}
# Based on https://encoding.spec.whatwg.org/#names-and-labels
# Maps a lowercase detected-encoding name to the superset encoding the
# W3C/WHATWG Encoding Standard treats it as in practice on the web.
# Applied only when should_rename_legacy is True.
W3C_LEGACY_MAP = {
"ascii": "Windows-1252",
"iso-8859-1": "Windows-1252",
"tis-620": "CP874",
"iso-8859-11": "CP874",
"iso-8859-9": "Windows-1254",
"gb2312": "GBK",
"cp932": "Shift_JIS",
"cp949": "EUC-KR",
"utf-16le": "UTF-16",
}

def __init__(self, lang_filter: LanguageFilter = LanguageFilter.ALL) -> None:
def __init__(
self,
lang_filter: LanguageFilter = LanguageFilter.ALL,
should_rename_legacy: bool = False,
) -> None:
self._esc_charset_prober: Optional[EscCharSetProber] = None
self._utf1632_prober: Optional[UTF1632Prober] = None
self._charset_probers: List[CharSetProber] = []
Expand All @@ -101,6 +117,7 @@ def __init__(self, lang_filter: LanguageFilter = LanguageFilter.ALL) -> None:
self.lang_filter = lang_filter
self.logger = logging.getLogger(__name__)
self._has_win_bytes = False
self.should_rename_legacy = should_rename_legacy
self.reset()

@property
Expand Down
42 changes: 41 additions & 1 deletion test.py
Expand Up @@ -34,7 +34,6 @@
"windows-1256",
}
EXPECTED_FAILURES = {
"tests/iso-8859-7-greek/disabled.gr.xml",
"tests/iso-8859-9-turkish/_ude_1.txt",
"tests/iso-8859-9-turkish/_ude_2.txt",
"tests/iso-8859-9-turkish/divxplanet.com.xml",
Expand Down Expand Up @@ -114,6 +113,47 @@ def test_encoding_detection(file_name, encoding):
)


@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection_rename_legacy(file_name, encoding):
    """Check detection for each test file with legacy-encoding renaming enabled."""
    with open(file_name, "rb") as fp:
        raw = fp.read()
    result = chardet.detect(raw, should_rename_legacy=True)
    # Decode with the expected encoding; an unknown codec yields empty text.
    try:
        expected_unicode = raw.decode(encoding)
    except LookupError:
        expected_unicode = ""
    # Decode with the detected encoding; tolerate missing/None/undecodable results.
    try:
        detected_unicode = raw.decode(result["encoding"])
    except (LookupError, UnicodeDecodeError, TypeError):
        detected_unicode = ""
    if result:
        encoding_match = (result["encoding"] or "").lower() == encoding
    else:
        encoding_match = False
    # A name mismatch only counts as a failure when it would actually
    # produce different decoded text.
    if encoding_match or expected_unicode == detected_unicode:
        diff = ""
        encoding_match = True
        all_encodings = [result]
    else:
        wrapped_expected = "\n".join(textwrap.wrap(expected_unicode, 100)) + "\n"
        wrapped_detected = "\n".join(textwrap.wrap(detected_unicode, 100)) + "\n"
        diff_lines = ndiff(
            wrapped_expected.splitlines(True), wrapped_detected.splitlines(True)
        )
        # Keep only the first 20 diff lines so failure output stays readable.
        diff = "".join(list(diff_lines)[:20])
        all_encodings = chardet.detect_all(raw, ignore_threshold=True)
    assert encoding_match, (
        f"Expected {encoding}, but got {result} for {file_name}. First 20 "
        f"lines of character differences: \n{diff}\n"
        f"All encodings: {pformat(all_encodings)}"
    )


if HAVE_HYPOTHESIS:

class JustALengthIssue(Exception):
Expand Down

0 comments on commit f29184c

Please sign in to comment.