Skip to content

Commit

Permalink
Add should_rename_legacy flag to allow W3C renaming
Browse files Browse the repository at this point in the history
  • Loading branch information
dan-blanchard committed Jun 29, 2022
1 parent c4f7057 commit f29184c
Show file tree
Hide file tree
Showing 4 changed files with 100 additions and 9 deletions.
25 changes: 21 additions & 4 deletions chardet/__init__.py
Expand Up @@ -27,26 +27,34 @@
__all__ = ["UniversalDetector", "detect", "detect_all", "__version__", "VERSION"]


def detect(
    byte_str: Union[bytes, bytearray], should_rename_legacy: bool = False
) -> ResultDict:
    """
    Detect the encoding of the given byte string.

    :param byte_str: The byte sequence to examine.
    :type byte_str: ``bytes`` or ``bytearray``
    :param should_rename_legacy: Should we rename legacy encodings
                                 (as determined by W3C) to their more
                                 modern equivalents?
    :type should_rename_legacy: ``bool``
    :raises TypeError: if ``byte_str`` is neither ``bytes`` nor ``bytearray``.
    """
    if not isinstance(byte_str, bytearray):
        if not isinstance(byte_str, bytes):
            raise TypeError(
                f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
            )
        # Probers operate on bytearray input, so normalize bytes up front.
        byte_str = bytearray(byte_str)
    # The detector owns the legacy-renaming behavior; pass the flag through.
    detector = UniversalDetector(should_rename_legacy=should_rename_legacy)
    detector.feed(byte_str)
    return detector.close()


def detect_all(
byte_str: Union[bytes, bytearray], ignore_threshold: bool = False
byte_str: Union[bytes, bytearray],
ignore_threshold: bool = False,
should_rename_legacy: bool = False,
) -> List[ResultDict]:
"""
Detect all the possible encodings of the given byte string.
Expand All @@ -57,6 +65,10 @@ def detect_all(
``UniversalDetector.MINIMUM_THRESHOLD``
in results.
:type ignore_threshold: ``bool``
:param should_rename_legacy: Should we rename legacy encodings
(as determined by W3C) to their more
modern equivalents?
:type should_rename_legacy: ``bool``
"""
if not isinstance(byte_str, bytearray):
if not isinstance(byte_str, bytes):
Expand All @@ -65,7 +77,7 @@ def detect_all(
)
byte_str = bytearray(byte_str)

detector = UniversalDetector()
detector = UniversalDetector(should_rename_legacy=should_rename_legacy)
detector.feed(byte_str)
detector.close()

Expand All @@ -87,6 +99,11 @@ def detect_all(
charset_name = detector.ISO_WIN_MAP.get(
lower_charset_name, charset_name
)
# Rename legacy encodings with superset encodings if asked
if should_rename_legacy:
charset_name = detector.W3C_LEGACY_MAP.get(
charset_name.lower(), charset_name
)
results.append(
{
"encoding": charset_name,
Expand Down
23 changes: 20 additions & 3 deletions chardet/cli/chardetect.py
Expand Up @@ -22,7 +22,10 @@


def description_of(
lines: Iterable[bytes], name: str = "stdin", minimal: bool = False
lines: Iterable[bytes],
name: str = "stdin",
minimal: bool = False,
should_rename_legacy: bool = False,
) -> Optional[str]:
"""
Return a string describing the probable encoding of a file or
Expand All @@ -32,8 +35,12 @@ def description_of(
:type lines: Iterable of bytes
:param name: Name of file or collection of lines
:type name: str
:param should_rename_legacy: Should we rename legacy encodings
(as determined by W3C) to their more
modern equivalents?
:type should_rename_legacy: ``bool``
"""
u = UniversalDetector()
u = UniversalDetector(should_rename_legacy=should_rename_legacy)
for line in lines:
line = bytearray(line)
u.feed(line)
Expand Down Expand Up @@ -75,6 +82,12 @@ def main(argv: Optional[List[str]] = None) -> None:
help="Print only the encoding to standard output",
action="store_true",
)
parser.add_argument(
"-l",
"--legacy",
help="Rename legacy encodings to more modern ones according to W3C standard.",
action="store_true",
)
parser.add_argument(
"--version", action="version", version=f"%(prog)s {__version__}"
)
Expand All @@ -89,7 +102,11 @@ def main(argv: Optional[List[str]] = None) -> None:
"--help\n",
file=sys.stderr,
)
print(description_of(f, f.name, minimal=args.minimal))
print(
description_of(
f, f.name, minimal=args.minimal, should_rename_legacy=args.legacy
)
)


if __name__ == "__main__":
Expand Down
19 changes: 18 additions & 1 deletion chardet/universaldetector.py
Expand Up @@ -84,8 +84,24 @@ class UniversalDetector:
"iso-8859-9": "Windows-1254",
"iso-8859-13": "Windows-1257",
}
# Based on https://encoding.spec.whatwg.org/#names-and-labels
# Maps a lowercase detected-encoding name to the superset encoding the
# W3C/WHATWG Encoding Standard treats it as in practice on the web.
# Applied only when should_rename_legacy is True.
W3C_LEGACY_MAP = {
"ascii": "Windows-1252",
"iso-8859-1": "Windows-1252",
"tis-620": "CP874",
"iso-8859-11": "CP874",
"iso-8859-9": "Windows-1254",
"gb2312": "GBK",
"cp932": "Shift_JIS",
"cp949": "EUC-KR",
"utf-16le": "UTF-16",
}

def __init__(self, lang_filter: LanguageFilter = LanguageFilter.ALL) -> None:
def __init__(
self,
lang_filter: LanguageFilter = LanguageFilter.ALL,
should_rename_legacy: bool = False,
) -> None:
self._esc_charset_prober: Optional[EscCharSetProber] = None
self._utf1632_prober: Optional[UTF1632Prober] = None
self._charset_probers: List[CharSetProber] = []
Expand All @@ -101,6 +117,7 @@ def __init__(self, lang_filter: LanguageFilter = LanguageFilter.ALL) -> None:
self.lang_filter = lang_filter
self.logger = logging.getLogger(__name__)
self._has_win_bytes = False
self.should_rename_legacy = should_rename_legacy
self.reset()

@property
Expand Down
42 changes: 41 additions & 1 deletion test.py
Expand Up @@ -34,7 +34,6 @@
"windows-1256",
}
EXPECTED_FAILURES = {
"tests/iso-8859-7-greek/disabled.gr.xml",
"tests/iso-8859-9-turkish/_ude_1.txt",
"tests/iso-8859-9-turkish/_ude_2.txt",
"tests/iso-8859-9-turkish/divxplanet.com.xml",
Expand Down Expand Up @@ -114,6 +113,47 @@ def test_encoding_detection(file_name, encoding):
)


@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection_rename_legacy(file_name, encoding):
    """Check detection for each test file with legacy-encoding renaming enabled."""
    with open(file_name, "rb") as fp:
        raw = fp.read()
    result = chardet.detect(raw, should_rename_legacy=True)
    # Decode with the expected encoding; an unknown codec yields empty text.
    try:
        expected_unicode = raw.decode(encoding)
    except LookupError:
        expected_unicode = ""
    # Decode with the detected encoding; tolerate missing/None/undecodable results.
    try:
        detected_unicode = raw.decode(result["encoding"])
    except (LookupError, UnicodeDecodeError, TypeError):
        detected_unicode = ""
    if result:
        encoding_match = (result["encoding"] or "").lower() == encoding
    else:
        encoding_match = False
    # A name mismatch only counts as a failure when it would actually
    # produce different decoded text.
    if encoding_match or expected_unicode == detected_unicode:
        diff = ""
        encoding_match = True
        all_encodings = [result]
    else:
        wrapped_expected = "\n".join(textwrap.wrap(expected_unicode, 100)) + "\n"
        wrapped_detected = "\n".join(textwrap.wrap(detected_unicode, 100)) + "\n"
        diff_lines = ndiff(
            wrapped_expected.splitlines(True), wrapped_detected.splitlines(True)
        )
        # Keep only the first 20 diff lines so failure output stays readable.
        diff = "".join(list(diff_lines)[:20])
        all_encodings = chardet.detect_all(raw, ignore_threshold=True)
    assert encoding_match, (
        f"Expected {encoding}, but got {result} for {file_name}. First 20 "
        f"lines of character differences: \n{diff}\n"
        f"All encodings: {pformat(all_encodings)}"
    )


if HAVE_HYPOTHESIS:

class JustALengthIssue(Exception):
Expand Down

0 comments on commit f29184c

Please sign in to comment.