Skip to content

Commit

Permalink
formats: Switch to charset-normalizer from chardet
Browse files Browse the repository at this point in the history
charset-normalizer is better maintained and provides more reliable detection.

This avoids issues with chardet mistakenly reporting UTF-8 content as
windows-1252; see chardet/chardet#185.
  • Loading branch information
nijel committed Feb 12, 2022
1 parent c607195 commit 385680c
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 6 deletions.
2 changes: 1 addition & 1 deletion requirements/optional.txt
Expand Up @@ -5,7 +5,7 @@ aeidon==1.10.1 # Subtitles
# Format support
BeautifulSoup4>=4.3 # Trados
# Encoding detection
chardet==4.0.0 # chardet
charset-normalizer==2.0.11 # chardet
# Tmserver backend
cheroot==8.6.0 # tmserver
# Format support
Expand Down
15 changes: 10 additions & 5 deletions translate/storage/base.py
Expand Up @@ -23,6 +23,7 @@
import pickle
from collections import OrderedDict
from io import BytesIO
from typing import List, Optional, Tuple

from translate.misc.multistring import multistring
from translate.storage.placeables import StringElem, parse as rich_parse
Expand Down Expand Up @@ -781,21 +782,25 @@ def fallback_detection(text):
return {"encoding": encoding, "confidence": 1.0}
return None

def detect_encoding(self, text, default_encodings=None):
def detect_encoding(
self, text: bytes, default_encodings: Optional[List[str]] = None
) -> Tuple[str, str]:
"""
Try to detect a file encoding from `text`, using either the chardet lib
or by trying to decode the file.
"""
if not default_encodings:
default_encodings = ["utf-8"]
try:
import chardet
from charset_normalizer import detect
except ImportError:
detected_encoding = self.fallback_detection(text)
else:
# many false complaints with ellipse (…) (see bug 1825)
detected_encoding = chardet.detect(text.replace(b"\xe2\x80\xa6", b""))
if detected_encoding["confidence"] < 0.48:
detected_encoding = detect(text)
if (
detected_encoding["confidence"] is None
or detected_encoding["confidence"] < 0.48
):
detected_encoding = None
elif detected_encoding["encoding"] == "ascii":
detected_encoding["encoding"] = self.encoding
Expand Down
12 changes: 12 additions & 0 deletions translate/storage/test_csvl10n.py
Expand Up @@ -103,3 +103,15 @@ def test_parse_sample(self):
assert store.units[0].source == "te\\nst"
assert store.units[0].target == "ot\\nher"
assert bytes(store) == content

def test_utf_8_detection(self):
    """Parse UTF-8 encoded CSV with a non-ASCII target and check it round-trips."""
    header = """"location","source","target","id","fuzzy","context","translator_comments","developer_comments"\r\n"""
    row = """"","Second","秒","","False","00029.00002","","# Filter Order|IDE_2ND_ORDER_FILTER"\r\n"""
    content = header + row

    # Feed the store raw bytes so it has to work out the encoding itself.
    raw = content.encode()
    store = self.StoreClass()
    store.parse(raw)

    # Exactly one data row is expected; the header line is not a unit.
    assert len(store.units) == 1
    unit = store.units[0]
    assert unit.source == "Second"
    assert unit.target == "秒"

    # Serialising the store must reproduce the original text unchanged.
    assert bytes(store).decode() == content

0 comments on commit 385680c

Please sign in to comment.