From 85d0e8b42db22ad77c4e6d3f968dd6b5cc2177ab Mon Sep 17 00:00:00 2001
From: Ahmed TAHRI <ahmed.tahri@cloudnursery.dev>
Date: Mon, 12 Jul 2021 21:33:45 +0200
Subject: [PATCH 1/4] Always strip the SIG in raw bytes except on u16/u32

---
 charset_normalizer/api.py | 14 ++++++++++++--
 tests/test_on_byte.py     | 34 ++++++++++++++++++++++++++++++++++
 2 files changed, 46 insertions(+), 2 deletions(-)

diff --git a/charset_normalizer/api.py b/charset_normalizer/api.py
index 0161b8ee..6a246a57 100644
--- a/charset_normalizer/api.py
+++ b/charset_normalizer/api.py
@@ -37,6 +37,16 @@ def from_bytes(
     """
     Given a raw bytes sequence, return the best possibles charset usable to render str objects.
     If there is no results, it is a strong indicator that the source is binary/not text.
+    By default, the process extracts 5 blocks of 512 bytes each to assess the mess and coherence of a given sequence,
+    and gives up on a particular code page after 20% of measured mess. Those criteria are customizable at will.
+
+    The preemptive behavior DOES NOT replace the traditional detection workflow; it prioritizes a particular code page
+    but never takes it for granted. It can improve performance.
+
+    You may want to focus on some code pages and/or exclude others; use cp_isolation and cp_exclusion for that
+    purpose.
+
+    This function strips the SIG from the payload/sequence every time, except for UTF-16 and UTF-32.
     """
 
     if not explain:
@@ -152,7 +162,7 @@ def from_bytes(
         try:
             if is_too_large_sequence and is_multi_byte_decoder is False:
                 str(
-                    sequences[:int(50e4)],
+                    sequences[:int(50e4)] if strip_sig_or_bom is False else sequences[len(sig_payload):int(50e4)],
                     encoding=encoding_iana
                 )
             else:
@@ -273,7 +283,7 @@ def from_bytes(
 
         results.append(
             CharsetMatch(
-                sequences,
+                sequences if strip_sig_or_bom is False else sequences[len(sig_payload):],
                 encoding_iana,
                 mean_mess_ratio,
                 bom_or_sig_available,
diff --git a/tests/test_on_byte.py b/tests/test_on_byte.py
index 945fff90..9444f6b7 100644
--- a/tests/test_on_byte.py
+++ b/tests/test_on_byte.py
@@ -27,6 +27,40 @@ def test_empty_bytes(self):
             len(r.alphabets)
         )
 
+    def test_empty_str_with_sig_gb18030(self):
+        r = from_bytes('\uFEFF'.encode('gb18030')).best()
+
+        self.assertIsNotNone(r)
+        self.assertEqual(
+            "",
+            str(r)
+        )
+        self.assertEqual(
+            "gb18030",
+            r.encoding
+        )
+        self.assertEqual(
+            0,
+            len(r.raw)
+        )
+
+    def test_empty_str_with_sig_utf8(self):
+        r = from_bytes(b'\xef\xbb\xbf').best()
+
+        self.assertIsNotNone(r)
+        self.assertEqual(
+            "",
+            str(r)
+        )
+        self.assertEqual(
+            "utf_8",
+            r.encoding
+        )
+        self.assertEqual(
+            0,
+            len(r.raw)
+        )
+
     def test_bom_detection(self):
         with self.subTest('GB18030 UNAVAILABLE SIG'):
             self.assertFalse(

From 84c2bb9cb5e1ca238662f340141d27cf471c6bdf Mon Sep 17 00:00:00 2001
From: Ahmed TAHRI <ahmed.tahri@cloudnursery.dev>
Date: Wed, 14 Jul 2021 22:03:28 +0200
Subject: [PATCH 2/4] =?UTF-8?q?=E2=8F=AA=EF=B8=8F=20Revert=20backward=20in?=
 =?UTF-8?q?compatible=20change=20in=20resulting=20CharsetMatch?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The 'raw' property should remain untouched no matter what.
---
 charset_normalizer/api.py | 2 +-
 tests/test_on_byte.py     | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/charset_normalizer/api.py b/charset_normalizer/api.py
index 4bae8163..bf2dcf07 100644
--- a/charset_normalizer/api.py
+++ b/charset_normalizer/api.py
@@ -283,7 +283,7 @@ def from_bytes(
 
         results.append(
             CharsetMatch(
-                sequences if strip_sig_or_bom is False else sequences[len(sig_payload):],
+                sequences,
                 encoding_iana,
                 mean_mess_ratio,
                 bom_or_sig_available,
diff --git a/tests/test_on_byte.py b/tests/test_on_byte.py
index 9a0845ae..096f50bb 100644
--- a/tests/test_on_byte.py
+++ b/tests/test_on_byte.py
@@ -40,7 +40,7 @@ def test_empty_str_with_sig_gb18030(self):
             r.encoding
         )
         self.assertEqual(
-            0,
+            4,
             len(r.raw)
         )
 
@@ -57,7 +57,7 @@ def test_empty_str_with_sig_utf8(self):
             r.encoding
         )
         self.assertEqual(
-            0,
+            3,
             len(r.raw)
         )
 

From 103b2c64ad45ada0ce6cab3c7cbfec9ced44be2b Mon Sep 17 00:00:00 2001
From: Ahmed TAHRI <ahmed.tahri@cloudnursery.dev>
Date: Wed, 14 Jul 2021 22:17:19 +0200
Subject: [PATCH 3/4] :heavy_check_mark: Add additional test case scenario

---
 tests/test_on_byte.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/tests/test_on_byte.py b/tests/test_on_byte.py
index 096f50bb..99518b7a 100644
--- a/tests/test_on_byte.py
+++ b/tests/test_on_byte.py
@@ -1,6 +1,7 @@
 import unittest
 
 from charset_normalizer.api import from_bytes
+from charset_normalizer.constant import TOO_BIG_SEQUENCE
 
 
 class TestBytes(unittest.TestCase):
@@ -61,6 +62,23 @@ def test_empty_str_with_sig_utf8(self):
             len(r.raw)
         )
 
+    def test_empty_str_with_large_sig_utf8(self):
+        r = from_bytes(b'\xef\xbb\xbf' + (b'0' * TOO_BIG_SEQUENCE)).best()
+
+        self.assertIsNotNone(r)
+        self.assertEqual(
+            '0' * TOO_BIG_SEQUENCE,
+            str(r)
+        )
+        self.assertEqual(
+            "utf_8",
+            r.encoding
+        )
+        self.assertEqual(
+            TOO_BIG_SEQUENCE + 3,
+            len(r.raw)
+        )
+
     def test_on_empty_json(self):
 
         with self.subTest("Detecting empty JSON as ASCII"):

From 044eed309f6fba3afef7a9e3910bee51b5a5b2d4 Mon Sep 17 00:00:00 2001
From: Ahmed TAHRI <ahmed.tahri@cloudnursery.dev>
Date: Wed, 14 Jul 2021 22:20:34 +0200
Subject: [PATCH 4/4] :pencil: Fix typo in docs CharsetMatch instead of
 CharsetNormalizerMatch

---
 docs/handling_result.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/handling_result.rst b/docs/handling_result.rst
index f161ffb8..067771ee 100755
--- a/docs/handling_result.rst
+++ b/docs/handling_result.rst
@@ -15,7 +15,7 @@ When initiating search upon a buffer, bytes or file you can assign the return va
 
     print(result.encoding)  # gb18030
 
-Using CharsetNormalizerMatch
+Using CharsetMatch
 ----------------------------
 
 Here, ``result`` is a ``CharsetMatch`` object or ``None``.