🔖 Version 2.0.7 (#119)
* 📝 Update claims

* 🔖 Bump version to 2.0.7

* 🐛 Fix regression from PR #113: List instead of Set for alphabets property

* Fix type output in alphabets property

* ✔️ Add test case to ensure non-regression upon 28c3ae1

* ✔️ Add tests and ignore old legacy methods in coverage

* ❇️ Add autofix script for black and isort linters

* 📝 Update contrib.md

* 🔧 Python 3.10 (using public release) tests
Ousret committed Oct 11, 2021
1 parent ccf77d2 commit ea44bd7
Showing 11 changed files with 86 additions and 19 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/run-tests.yml
@@ -9,7 +9,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
        python-version: [3.5, 3.6, 3.7, 3.8, 3.9, "3.10.0-rc.2"]
        python-version: [3.5, 3.6, 3.7, 3.8, 3.9, "3.10"]
        os: [ubuntu-latest]

    steps:
4 changes: 3 additions & 1 deletion CONTRIBUTING.md
@@ -56,7 +56,7 @@ the backward-compatibility.
## What PR may be doomed?

- Dropping EOL Python 3.5
> We are waiting upon the right moment to drop it. Hint, wait for requests to drop it first.
> Scheduled for the 3.0 milestone.
- Adding support for a charset/encoding not supported by Python
> If you looked carefully at the project, you would see that it aims to be generic whenever possible. So adding a specific prober is out of the question.
@@ -71,3 +71,5 @@ the backward-compatibility.

It is essential that you run the mandatory checks prior to any submission.
Run the script `./bin/run_checks.sh` to verify that your modifications are not breaking anything.

Also, make sure to run `./bin/run_autofix.sh` to comply with the code style and import sorting.
11 changes: 6 additions & 5 deletions README.md
@@ -48,20 +48,21 @@ This project offers you an alternative to **Universal Charset Encoding Detector*

This package offers better performance than its counterpart, Chardet. Here are some numbers.

| Package | Accuracy | Mean per file (ns) | File per sec (est) |
| Package | Accuracy | Mean per file (ms) | File per sec (est) |
| ------------- | :-------------: | :------------------: | :------------------: |
| [chardet](https://github.com/chardet/chardet) | 92.0 % | 220 ms | 5 file/sec |
| charset-normalizer | **97.0 %** | **40 ms** | 25 file/sec |
| [chardet](https://github.com/chardet/chardet) | 92 % | 220 ms | 5 file/sec |
| charset-normalizer | **98 %** | **40 ms** | 25 file/sec |

| Package | 99th percentile | 95th percentile | 50th percentile |
| ------------- | :-------------: | :------------------: | :------------------: |
| [chardet](https://github.com/chardet/chardet) | 888 ms | 300 ms | 27 ms |
| charset-normalizer | 430 ms | 220 ms | 18 ms |
| [chardet](https://github.com/chardet/chardet) | 1115 ms | 300 ms | 27 ms |
| charset-normalizer | 460 ms | 240 ms | 18 ms |

Chardet's performance on larger files (1MB+) is very poor. Expect a huge difference on large payloads.

> Stats are generated using 400+ files with default parameters. For more details on the files used, see the GHA workflows.
> And yes, these results might change at any time. The dataset can be updated to include more files.
> The actual delays depend heavily on your CPU capabilities. The factors should remain the same.
[cchardet](https://github.com/PyYoshi/cChardet) is a non-native (cpp binding) faster alternative. If speed is the most important factor,
you should try it.
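For readers weighing the switch, here is a minimal usage sketch (not part of this diff) of the Chardet-compatible entry point; the sample string and printed values are illustrative only:

```python
from charset_normalizer import detect

# Drop-in replacement for chardet.detect(); returns the same dict shape.
result = detect("Bonjour tout le monde, comment ça va ?".encode("utf_8"))
print(result)  # e.g. {'encoding': 'utf_8', 'language': ..., 'confidence': ...}
```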
11 changes: 11 additions & 0 deletions bin/run_autofix.sh
@@ -0,0 +1,11 @@
#!/bin/sh -e

export PREFIX=""
if [ -d 'venv' ] ; then
export PREFIX="venv/bin/"
fi

set -x

${PREFIX}black --diff --target-version=py35 charset_normalizer
${PREFIX}isort --diff charset_normalizer
18 changes: 9 additions & 9 deletions charset_normalizer/legacy.py
@@ -17,7 +17,7 @@ def detect(byte_str: bytes) -> Dict[str, Optional[Union[str, float]]]:
    :param byte_str: The byte sequence to examine.
    """
    if not isinstance(byte_str, (bytearray, bytes)):
        raise TypeError(
        raise TypeError(  # pragma: nocover
            "Expected object of type bytes or bytearray, got: "
            "{0}".format(type(byte_str))
        )
@@ -52,39 +52,39 @@ class CharsetNormalizerMatch(CharsetMatch):
class CharsetNormalizerMatches(CharsetMatches):
    @staticmethod
    def from_fp(*args, **kwargs):  # type: ignore
        warnings.warn(
        warnings.warn(  # pragma: nocover
            "staticmethod from_fp, from_bytes, from_path and normalize are deprecated "
            "and scheduled to be removed in 3.0",
            DeprecationWarning,
        )
        return from_fp(*args, **kwargs)
        return from_fp(*args, **kwargs)  # pragma: nocover

    @staticmethod
    def from_bytes(*args, **kwargs):  # type: ignore
        warnings.warn(
        warnings.warn(  # pragma: nocover
            "staticmethod from_fp, from_bytes, from_path and normalize are deprecated "
            "and scheduled to be removed in 3.0",
            DeprecationWarning,
        )
        return from_bytes(*args, **kwargs)
        return from_bytes(*args, **kwargs)  # pragma: nocover

    @staticmethod
    def from_path(*args, **kwargs):  # type: ignore
        warnings.warn(
        warnings.warn(  # pragma: nocover
            "staticmethod from_fp, from_bytes, from_path and normalize are deprecated "
            "and scheduled to be removed in 3.0",
            DeprecationWarning,
        )
        return from_path(*args, **kwargs)
        return from_path(*args, **kwargs)  # pragma: nocover

    @staticmethod
    def normalize(*args, **kwargs):  # type: ignore
        warnings.warn(
        warnings.warn(  # pragma: nocover
            "staticmethod from_fp, from_bytes, from_path and normalize are deprecated "
            "and scheduled to be removed in 3.0",
            DeprecationWarning,
        )
        return normalize(*args, **kwargs)
        return normalize(*args, **kwargs)  # pragma: nocover


class CharsetDetector(CharsetNormalizerMatches):
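The hunk above only adds coverage pragmas; the deprecation itself is unchanged. As a migration sketch (not part of the commit), the legacy static methods simply delegate to the module-level functions, so callers can switch directly:

```python
from charset_normalizer import from_bytes

payload = "Ceci est un exemple accentué : éàç.".encode("utf_8")

# Deprecated since 2.x, scheduled for removal in 3.0 (emits DeprecationWarning):
#   CharsetNormalizerMatches.from_bytes(payload)

# Preferred equivalent, which the deprecated wrapper delegates to:
best_guess = from_bytes(payload).best()
print(best_guess.encoding)  # e.g. "utf_8"
```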
2 changes: 1 addition & 1 deletion charset_normalizer/models.py
@@ -231,7 +231,7 @@ def alphabets(self) -> List[str]:
            unicode_range(char) for char in str(self)
        ]  # type: List[Optional[str]]
        # filter and sort
        self._unicode_ranges = sorted([r for r in detected_ranges if r])  # type: ignore
        self._unicode_ranges = sorted(list({r for r in detected_ranges if r}))
        return self._unicode_ranges

    @property
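To make the regression fix concrete, here is a small illustrative snippet (not from the commit) of the pattern now used above: deduplicate with a set comprehension, then materialise a sorted list so `alphabets` keeps its `List[str]` contract:

```python
from typing import List, Optional

# Hypothetical per-character lookups; duplicates and None mimic what
# unicode_range() can yield over a real decoded string.
detected_ranges = ["Basic Latin", "Basic Latin", None, "Emoticons range(Emoji)"]  # type: List[Optional[str]]

unique_sorted = sorted(list({r for r in detected_ranges if r}))
print(unique_sorted)  # ['Basic Latin', 'Emoticons range(Emoji)']
```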
2 changes: 1 addition & 1 deletion charset_normalizer/version.py
@@ -2,5 +2,5 @@
Expose version
"""

__version__ = "2.0.6"
__version__ = "2.0.7"
VERSION = __version__.split(".")
2 changes: 1 addition & 1 deletion docs/why_migrate.rst
@@ -4,7 +4,7 @@ Why should I migrate to Charset-Normalizer?
There are so many reasons to migrate your current project. Here are some of them:

- Remove ANY license ambiguity/restriction for projects bundling Chardet (even indirectly).
- X4 faster than Chardet (average) AND support X3 more encoding.
- X5 faster than Chardet on average and X2 faster in 99% of the cases AND support 3 times more encodings.
- Never return an encoding if it is not suited for the given decoder. E.g. you will never get UnicodeDecodeError!
- Actively maintained, open to contributors.
- Have the backward-compatible function ``detect`` that comes from Chardet.
10 changes: 10 additions & 0 deletions tests/test_base_detection.py
@@ -105,3 +105,13 @@ def test_mb_cutting_chk():

    assert len(guesses) == 1, "cp isolation is set and given seq should be clear CP949!"
    assert best_guess.encoding == "cp949"


def test_alphabets_property():
    best_guess = from_bytes(
        "😀 Hello World! How affairs are going? 😀".encode("utf_8")
    ).best()

    assert "Basic Latin" in best_guess.alphabets
    assert "Emoticons range(Emoji)" in best_guess.alphabets
    assert best_guess.alphabets.count("Basic Latin") == 1
23 changes: 23 additions & 0 deletions tests/test_cli.py
@@ -114,6 +114,29 @@ def test_non_existent_file(self):

        self.assertEqual(cm.exception.code, 2)

    def test_replace_without_normalize(self):

        self.assertEqual(
            cli_detect(
                [
                    './data/sample.1.ar.srt',
                    '--replace'
                ]
            ),
            1
        )

    def test_force_replace_without_replace(self):
        self.assertEqual(
            cli_detect(
                [
                    './data/sample.1.ar.srt',
                    '--force'
                ]
            ),
            1
        )


if __name__ == '__main__':
    unittest.main()
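The two new tests pin down the CLI flag preconditions. Below is a short sketch of the same checks outside the unittest class (the `cli_detect` import path is assumed to match what this test module already uses):

```python
from charset_normalizer.cli.normalizer import cli_detect  # import path assumed

# --replace without --normalize is rejected: exit code 1.
assert cli_detect(["./data/sample.1.ar.srt", "--replace"]) == 1
# --force without --replace is rejected as well: exit code 1.
assert cli_detect(["./data/sample.1.ar.srt", "--force"]) == 1
```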
20 changes: 20 additions & 0 deletions tests/test_normalize_fp.py
@@ -0,0 +1,20 @@
import pytest
from charset_normalizer import normalize
from os.path import exists
from os import unlink


def test_normalize_fp_creation():
    guesses = normalize(
        "./data/sample.1.ar.srt"
    )

    predicted_path = "./data/sample.1.ar-{}.srt".format(guesses.best().encoding)
    path_exist = exists(
        "./data/sample.1.ar-{}.srt".format(guesses.best().encoding)
    )

    assert path_exist is True

    if path_exist:
        unlink(predicted_path)
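This test relies on the side effect of `normalize()`: it writes a re-encoded copy next to the input, named after the detected encoding. A brief usage sketch (the detected encoding shown is only an example):

```python
from charset_normalizer import normalize

matches = normalize("./data/sample.1.ar.srt")  # also writes the converted file
encoding = matches.best().encoding
print("./data/sample.1.ar-{}.srt".format(encoding))  # e.g. ./data/sample.1.ar-cp1256.srt
```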
