diff --git a/requirements/optional.txt b/requirements/optional.txt index 001b5c05fd..002b3a0e8a 100644 --- a/requirements/optional.txt +++ b/requirements/optional.txt @@ -5,7 +5,7 @@ aeidon==1.10.1 # Subtitles # Format support BeautifulSoup4>=4.3 # Trados # Encoding detection -chardet==4.0.0 # chardet +charset-normalizer==2.0.11 # chardet # Tmserver backend cheroot==8.6.0 # tmserver # Format support diff --git a/translate/storage/base.py b/translate/storage/base.py index 4409439b3a..bf2b0da76b 100644 --- a/translate/storage/base.py +++ b/translate/storage/base.py @@ -23,6 +23,7 @@ import pickle from collections import OrderedDict from io import BytesIO +from typing import List, Optional, Tuple from translate.misc.multistring import multistring from translate.storage.placeables import StringElem, parse as rich_parse @@ -781,7 +782,9 @@ def fallback_detection(text): return {"encoding": encoding, "confidence": 1.0} return None - def detect_encoding(self, text, default_encodings=None): + def detect_encoding( + self, text: bytes, default_encodings: Optional[List[str]] = None + ) -> Tuple[str, str]: """ Try to detect a file encoding from `text`, using either the chardet lib or by trying to decode the file. @@ -789,13 +792,15 @@ def detect_encoding(self, text, default_encodings=None): if not default_encodings: default_encodings = ["utf-8"] try: - import chardet + from charset_normalizer import detect except ImportError: detected_encoding = self.fallback_detection(text) else: - # many false complaints with ellipse (…) (see bug 1825) - detected_encoding = chardet.detect(text.replace(b"\xe2\x80\xa6", b"")) - if detected_encoding["confidence"] < 0.48: + detected_encoding = detect(text) + if ( + detected_encoding["confidence"] is None + or detected_encoding["confidence"] < 0.48 + ): detected_encoding = None elif detected_encoding["encoding"] == "ascii": detected_encoding["encoding"] = self.encoding diff --git a/translate/storage/test_csvl10n.py b/translate/storage/test_csvl10n.py index 20ce869af2..43519e8e3c 100644 --- a/translate/storage/test_csvl10n.py +++ b/translate/storage/test_csvl10n.py @@ -103,3 +103,15 @@ def test_parse_sample(self): assert store.units[0].source == "te\\nst" assert store.units[0].target == "ot\\nher" assert bytes(store) == content + + def test_utf_8_detection(self): + content = ( + """"location","source","target","id","fuzzy","context","translator_comments","developer_comments"\r\n""" + """"","Second","秒","","False","00029.00002","","# Filter Order|IDE_2ND_ORDER_FILTER"\r\n""" + ) + store = self.StoreClass() + store.parse(content.encode()) + assert len(store.units) == 1 + assert store.units[0].source == "Second" + assert store.units[0].target == "秒" + assert bytes(store).decode() == content