diff --git a/chardet/__init__.py b/chardet/__init__.py
index 2112ee14..03888ee3 100644
--- a/chardet/__init__.py
+++ b/chardet/__init__.py
@@ -27,12 +27,18 @@
 __all__ = ["UniversalDetector", "detect", "detect_all", "__version__", "VERSION"]
 
 
-def detect(byte_str: Union[bytes, bytearray]) -> ResultDict:
+def detect(
+    byte_str: Union[bytes, bytearray], should_rename_legacy: bool = False
+) -> ResultDict:
     """
     Detect the encoding of the given byte string.
 
     :param byte_str:     The byte sequence to examine.
     :type byte_str:      ``bytes`` or ``bytearray``
+    :param should_rename_legacy:  Should we rename legacy encodings
+                                  (as determined by W3C) to their more
+                                  modern equivalents?
+    :type should_rename_legacy:   ``bool``
     """
     if not isinstance(byte_str, bytearray):
         if not isinstance(byte_str, bytes):
@@ -40,13 +46,15 @@ def detect(byte_str: Union[bytes, bytearray]) -> ResultDict:
             f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
         )
         byte_str = bytearray(byte_str)
-    detector = UniversalDetector()
+    detector = UniversalDetector(should_rename_legacy=should_rename_legacy)
     detector.feed(byte_str)
     return detector.close()
 
 
 def detect_all(
-    byte_str: Union[bytes, bytearray], ignore_threshold: bool = False
+    byte_str: Union[bytes, bytearray],
+    ignore_threshold: bool = False,
+    should_rename_legacy: bool = False,
 ) -> List[ResultDict]:
     """
     Detect all the possible encodings of the given byte string.
@@ -57,6 +65,10 @@ def detect_all(
                               ``UniversalDetector.MINIMUM_THRESHOLD``
                               in results.
     :type ignore_threshold:   ``bool``
+    :param should_rename_legacy:  Should we rename legacy encodings
+                                  (as determined by W3C) to their more
+                                  modern equivalents?
+    :type should_rename_legacy:   ``bool``
     """
     if not isinstance(byte_str, bytearray):
         if not isinstance(byte_str, bytes):
@@ -65,7 +77,7 @@ def detect_all(
             )
         byte_str = bytearray(byte_str)
 
-    detector = UniversalDetector()
+    detector = UniversalDetector(should_rename_legacy=should_rename_legacy)
     detector.feed(byte_str)
     detector.close()
 
@@ -87,6 +99,11 @@ def detect_all(
                     charset_name = detector.ISO_WIN_MAP.get(
                         lower_charset_name, charset_name
                     )
+                # Rename legacy encodings with superset encodings if asked
+                if should_rename_legacy:
+                    charset_name = detector.W3C_LEGACY_MAP.get(
+                        charset_name.lower(), charset_name
+                    )
                 results.append(
                     {
                         "encoding": charset_name,
diff --git a/chardet/cli/chardetect.py b/chardet/cli/chardetect.py
index 94076a4b..6ff6ad43 100644
--- a/chardet/cli/chardetect.py
+++ b/chardet/cli/chardetect.py
@@ -22,7 +22,10 @@
 
 
 def description_of(
-    lines: Iterable[bytes], name: str = "stdin", minimal: bool = False
+    lines: Iterable[bytes],
+    name: str = "stdin",
+    minimal: bool = False,
+    should_rename_legacy: bool = False,
 ) -> Optional[str]:
     """
     Return a string describing the probable encoding of a file or
@@ -32,8 +35,12 @@ def description_of(
     :type lines: Iterable of bytes
     :param name: Name of file or collection of lines
    :type name: str
+    :param should_rename_legacy:  Should we rename legacy encodings
+                                  (as determined by W3C) to their more
+                                  modern equivalents?
+    :type should_rename_legacy:   ``bool``
     """
-    u = UniversalDetector()
+    u = UniversalDetector(should_rename_legacy=should_rename_legacy)
     for line in lines:
         line = bytearray(line)
         u.feed(line)
@@ -75,6 +82,12 @@ def main(argv: Optional[List[str]] = None) -> None:
         help="Print only the encoding to standard output",
         action="store_true",
     )
+    parser.add_argument(
+        "-l",
+        "--legacy",
+        help="Rename legacy encodings to more modern ones according to the W3C standard.",
+        action="store_true",
+    )
     parser.add_argument(
         "--version", action="version", version=f"%(prog)s {__version__}"
     )
@@ -89,7 +102,11 @@ def main(argv: Optional[List[str]] = None) -> None:
                 "--help\n",
                 file=sys.stderr,
             )
-        print(description_of(f, f.name, minimal=args.minimal))
+        print(
+            description_of(
+                f, f.name, minimal=args.minimal, should_rename_legacy=args.legacy
+            )
+        )
 
 
 if __name__ == "__main__":
diff --git a/chardet/universaldetector.py b/chardet/universaldetector.py
index f94961e6..37edea16 100644
--- a/chardet/universaldetector.py
+++ b/chardet/universaldetector.py
@@ -84,8 +84,24 @@ class UniversalDetector:
         "iso-8859-9": "Windows-1254",
         "iso-8859-13": "Windows-1257",
     }
+    # Based on https://encoding.spec.whatwg.org/#names-and-labels
+    W3C_LEGACY_MAP = {
+        "ascii": "Windows-1252",
+        "iso-8859-1": "Windows-1252",
+        "tis-620": "CP874",
+        "iso-8859-11": "CP874",
+        "iso-8859-9": "Windows-1254",
+        "gb2312": "GBK",
+        "cp932": "Shift_JIS",
+        "cp949": "EUC-KR",
+        "utf-16le": "UTF-16",
+    }
 
-    def __init__(self, lang_filter: LanguageFilter = LanguageFilter.ALL) -> None:
+    def __init__(
+        self,
+        lang_filter: LanguageFilter = LanguageFilter.ALL,
+        should_rename_legacy: bool = False,
+    ) -> None:
         self._esc_charset_prober: Optional[EscCharSetProber] = None
         self._utf1632_prober: Optional[UTF1632Prober] = None
         self._charset_probers: List[CharSetProber] = []
@@ -101,6 +117,7 @@ def __init__(self, lang_filter: LanguageFilter = LanguageFilter.ALL) -> None:
         self.lang_filter = lang_filter
         self.logger = logging.getLogger(__name__)
         self._has_win_bytes = False
+        self.should_rename_legacy = should_rename_legacy
         self.reset()
 
     @property
diff --git a/test.py b/test.py
index 8f6e1897..009250f9 100644
--- a/test.py
+++ b/test.py
@@ -34,7 +34,6 @@
     "windows-1256",
 }
 EXPECTED_FAILURES = {
-    "tests/iso-8859-7-greek/disabled.gr.xml",
     "tests/iso-8859-9-turkish/_ude_1.txt",
     "tests/iso-8859-9-turkish/_ude_2.txt",
     "tests/iso-8859-9-turkish/divxplanet.com.xml",
@@ -114,6 +113,47 @@ def test_encoding_detection(file_name, encoding):
     )
 
 
+@pytest.mark.parametrize("file_name, encoding", gen_test_params())
+def test_encoding_detection_rename_legacy(file_name, encoding):
+    with open(file_name, "rb") as f:
+        input_bytes = f.read()
+        result = chardet.detect(input_bytes, should_rename_legacy=True)
+        try:
+            expected_unicode = input_bytes.decode(encoding)
+        except LookupError:
+            expected_unicode = ""
+        try:
+            detected_unicode = input_bytes.decode(result["encoding"])
+        except (LookupError, UnicodeDecodeError, TypeError):
+            detected_unicode = ""
+    if result:
+        encoding_match = (result["encoding"] or "").lower() == encoding
+    else:
+        encoding_match = False
+    # Only care about mismatches that would actually result in different
+    # behavior when decoding
+    if not encoding_match and expected_unicode != detected_unicode:
+        wrapped_expected = "\n".join(textwrap.wrap(expected_unicode, 100)) + "\n"
+        wrapped_detected = "\n".join(textwrap.wrap(detected_unicode, 100)) + "\n"
+        diff = "".join(
+            list(
+                ndiff(
+                    wrapped_expected.splitlines(True), wrapped_detected.splitlines(True)
+                )
+            )[:20]
+        )
+        all_encodings = chardet.detect_all(input_bytes, ignore_threshold=True)
+    else:
+        diff = ""
+        encoding_match = True
+        all_encodings = [result]
+    assert encoding_match, (
+        f"Expected {encoding}, but got {result} for {file_name}. First 20 "
+        f"lines of character differences: \n{diff}\n"
+        f"All encodings: {pformat(all_encodings)}"
+    )
+
+
 if HAVE_HYPOTHESIS:
 
     class JustALengthIssue(Exception):
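
For reviewers, a minimal usage sketch of the API surface this patch adds. The sample bytes are illustrative only; actual detection results depend on the input.

    import chardet

    # Bytes that decode under a single-byte "legacy" encoding.
    data = "Déjà vu, naïveté".encode("iso-8859-1")

    # Default behavior is unchanged.
    print(chardet.detect(data))

    # With should_rename_legacy=True, a top guess that is a key of
    # UniversalDetector.W3C_LEGACY_MAP (e.g. "ascii" or "iso-8859-1")
    # is reported as its superset encoding (Windows-1252 for those two).
    print(chardet.detect(data, should_rename_legacy=True))

    # detect_all accepts the same keyword alongside ignore_threshold.
    print(chardet.detect_all(data, ignore_threshold=True, should_rename_legacy=True))

The CLI equivalent is the new -l/--legacy switch, e.g. `chardetect --legacy somefile.txt`.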