✔️ Add some additional test case + ✏️ Complete docs #51

Merged
merged 5 commits on Jul 14, 2021
12 changes: 11 additions & 1 deletion charset_normalizer/api.py
@@ -37,6 +37,16 @@ def from_bytes(
"""
Given a raw bytes sequence, return the best possible charsets usable to render str objects.
If there are no results, it is a strong indicator that the source is binary/not text.
By default, the process will extract 5 blocks of 512 bytes each to assess the mess and coherence of a given sequence,
and will give up on a particular code page once the measured mess exceeds 20%. Those criteria are customizable at will.

The preemptive behavior DOES NOT replace the traditional detection workflow; it prioritizes a particular code page
but never takes it for granted. It can improve performance.

Should you want to focus on some code pages and/or exclude others, use cp_isolation and cp_exclusion for that
purpose.

This function will strip the SIG (signature/BOM) from the payload/sequence every time, except for UTF-16 and UTF-32.
"""

if not explain:
@@ -152,7 +162,7 @@ def from_bytes(
try:
    if is_too_large_sequence and is_multi_byte_decoder is False:
        str(
-           sequences[:int(50e4)],
+           sequences[:int(50e4)] if strip_sig_or_bom is False else sequences[len(sig_payload):int(50e4)],
            encoding=encoding_iana
        )
    else:
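For context, a minimal usage sketch of the tunables the docstring above describes. The parameter names (steps, chunk_size, threshold, cp_isolation) follow the from_bytes signature this PR touches; the sample payload is made up for illustration:

    from charset_normalizer.api import from_bytes

    payload = "Bonjour, ceci est un essai.".encode("utf_8")

    # Defaults: 5 chunks of 512 bytes each; give up past 20% measured mess.
    matches = from_bytes(payload)

    # The same call with the criteria spelled out, restricted to two code pages.
    matches = from_bytes(
        payload,
        steps=5,                            # number of chunks to extract
        chunk_size=512,                     # size of each chunk, in bytes
        threshold=0.20,                     # maximum tolerated mess ratio
        cp_isolation=["utf_8", "latin_1"],  # only consider these code pages
    )

    best_match = matches.best()
    if best_match is not None:              # best() yields None when nothing fits
        print(best_match.encoding)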
2 changes: 1 addition & 1 deletion docs/handling_result.rst
@@ -15,7 +15,7 @@ When initiating search upon a buffer, bytes or file you can assign the return va

print(result.encoding) # gb18030

-Using CharsetNormalizerMatch
+Using CharsetMatch
----------------------------

Here, ``result`` is a ``CharsetMatch`` object or ``None``.
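Tying the corrected heading to the surrounding text: ``result`` is a ``CharsetMatch``, and its attributes can be read directly. A short sketch, reusing only what appears in this PR's docs and tests (encoding, raw, str()); the byte payload is illustrative:

    from charset_normalizer.api import from_bytes

    result = from_bytes("我没有埋怨，磋砣的只是一些时间。".encode("gb18030")).best()

    if result is not None:        # best() returns None when no match was found
        print(result.encoding)    # gb18030, as in the docs example above
        print(str(result))        # the decoded unicode form of the payload
        print(len(result.raw))    # the original byte sequence, untouched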
52 changes: 52 additions & 0 deletions tests/test_on_byte.py
@@ -1,6 +1,7 @@
import unittest

from charset_normalizer.api import from_bytes
from charset_normalizer.constant import TOO_BIG_SEQUENCE


class TestBytes(unittest.TestCase):
@@ -27,6 +28,57 @@ def test_empty_bytes(self):
            len(r.alphabets)
        )

    def test_empty_str_with_sig_gb18030(self):
        r = from_bytes('\uFEFF'.encode('gb18030')).best()

        self.assertIsNotNone(r)
        self.assertEqual(
            "",
            str(r)
        )
        self.assertEqual(
            "gb18030",
            r.encoding
        )
        self.assertEqual(
            4,
            len(r.raw)
        )

    def test_empty_str_with_sig_utf8(self):
        r = from_bytes(b'\xef\xbb\xbf').best()

        self.assertIsNotNone(r)
        self.assertEqual(
            "",
            str(r)
        )
        self.assertEqual(
            "utf_8",
            r.encoding
        )
        self.assertEqual(
            3,
            len(r.raw)
        )

    def test_empty_str_with_large_sig_utf8(self):
        r = from_bytes(b'\xef\xbb\xbf' + (b'0' * TOO_BIG_SEQUENCE)).best()

        self.assertIsNotNone(r)
        self.assertEqual(
            '0' * TOO_BIG_SEQUENCE,
            str(r)
        )
        self.assertEqual(
            "utf_8",
            r.encoding
        )
        self.assertEqual(
            TOO_BIG_SEQUENCE + 3,
            len(r.raw)
        )

    def test_on_empty_json(self):

        with self.subTest("Detecting empty JSON as ASCII"):
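The raw-length assertions in the new tests (4, 3, and TOO_BIG_SEQUENCE + 3) follow from the size of the encoded signature character itself; a quick standalone check in plain Python:

    # U+FEFF, the signature/BOM character, encodes to different byte counts:
    assert '\ufeff'.encode('gb18030') == b'\x84\x31\x95\x33'  # 4 bytes -> len(r.raw) == 4
    assert '\ufeff'.encode('utf_8') == b'\xef\xbb\xbf'        # 3 bytes -> len(r.raw) == 3

    # In test_empty_str_with_large_sig_utf8 the BOM precedes the payload,
    # hence the expected raw length of TOO_BIG_SEQUENCE + 3.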