formats: Switch to charset-normalizer from chardet

charset-normalizer is better maintained and detects encodings more reliably than chardet. Switching avoids cases where chardet mistakenly reports UTF-8 content as windows-1252; see chardet/chardet#185.
nijel · Feb 12, 2022 · 385680c · 385680c
1 parent c607195
commit 385680c
Show file tree

Hide file tree

Showing 3 changed files with 23 additions and 6 deletions.
diff --git a/requirements/optional.txt b/requirements/optional.txt
@@ -5,7 +5,7 @@ aeidon==1.10.1        # Subtitles
 # Format support
 BeautifulSoup4>=4.3  # Trados
 # Encoding detection
-chardet==4.0.0              # chardet
+charset-normalizer==2.0.11   # chardet
 # Tmserver backend
 cheroot==8.6.0       # tmserver
 # Format support

diff --git a/translate/storage/base.py b/translate/storage/base.py
@@ -23,6 +23,7 @@
 import pickle
 from collections import OrderedDict
 from io import BytesIO
+from typing import List, Optional, Tuple
 
 from translate.misc.multistring import multistring
 from translate.storage.placeables import StringElem, parse as rich_parse
@@ -781,21 +782,25 @@ def fallback_detection(text):
                 return {"encoding": encoding, "confidence": 1.0}
         return None
 
-    def detect_encoding(self, text, default_encodings=None):
+    def detect_encoding(
+        self, text: bytes, default_encodings: Optional[List[str]] = None
+    ) -> Tuple[str, str]:
         """
         Try to detect a file encoding from `text`, using either the chardet lib
         or by trying to decode the file.
         """
         if not default_encodings:
             default_encodings = ["utf-8"]
         try:
-            import chardet
+            from charset_normalizer import detect
         except ImportError:
             detected_encoding = self.fallback_detection(text)
         else:
-            # many false complaints with ellipse (…) (see bug 1825)
-            detected_encoding = chardet.detect(text.replace(b"\xe2\x80\xa6", b""))
-            if detected_encoding["confidence"] < 0.48:
+            detected_encoding = detect(text)
+            if (
+                detected_encoding["confidence"] is None
+                or detected_encoding["confidence"] < 0.48
+            ):
                 detected_encoding = None
             elif detected_encoding["encoding"] == "ascii":
                 detected_encoding["encoding"] = self.encoding

diff --git a/translate/storage/test_csvl10n.py b/translate/storage/test_csvl10n.py
@@ -103,3 +103,15 @@ def test_parse_sample(self):
         assert store.units[0].source == "te\\nst"
         assert store.units[0].target == "ot\\nher"
         assert bytes(store) == content
+
+    def test_utf_8_detection(self):
+        content = (
+            """"location","source","target","id","fuzzy","context","translator_comments","developer_comments"\r\n"""
+            """"","Second","秒","","False","00029.00002","","# Filter Order|IDE_2ND_ORDER_FILTER"\r\n"""
+        )
+        store = self.StoreClass()
+        store.parse(content.encode())
+        assert len(store.units) == 1
+        assert store.units[0].source == "Second"
+        assert store.units[0].target == "秒"
+        assert bytes(store).decode() == content