scrapy · elacuesta · Aug 4, 2021 · Jul 2, 2021 · Jul 5, 2021 · Jul 5, 2021
diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -3,10 +3,12 @@
 from unittest import mock
 
 from xtractmime._utils import (
-    is_archive,
-    is_audio_video,
-    is_font,
-    is_image,
+    get_archive_mime,
+    get_audio_video_mime,
+    get_extra_mime,
+    get_font_mime,
+    get_image_mime,
+    get_text_mime,
     is_mp3_non_ID3_signature,
     is_mp4_signature,
     is_webm_signature,
@@ -133,16 +135,16 @@ def test_audio_video(self, input_bytes, expected):
         if isinstance(input_bytes, str):
             with open(f"tests/files/{input_bytes}", "rb") as input_file:
                 input_bytes = input_file.read()
-        assert is_audio_video(input_bytes) == expected
+        assert get_audio_video_mime(input_bytes) == expected
 
     def test_image(self):
-        assert is_image(self.body_gif) == "image/gif"
-        assert is_image(b"\x00\x00\x00\x00") is None
+        assert get_image_mime(self.body_gif) == "image/gif"
+        assert get_image_mime(b"\x00\x00\x00\x00") is None
 
     def test_font(self):
-        assert is_font(self.body_ttf) == "font/ttf"
-        assert is_font(b"\x00\x00\x00\x00") is None
+        assert get_font_mime(self.body_ttf) == "font/ttf"
+        assert get_font_mime(b"\x00\x00\x00\x00") is None
 
     def test_archive(self):
-        assert is_archive(self.body_zip) == "application/zip"
-        assert is_archive(b"\x00\x00\x00\x00") is None
+        assert get_archive_mime(self.body_zip) == "application/zip"
+        assert get_archive_mime(b"\x00\x00\x00\x00") is None
diff --git a/xtractmime/__init__.py b/xtractmime/__init__.py
@@ -1,10 +1,13 @@
 __version__ = "0.0.0"
 from typing import Optional, Set, Tuple
 from xtractmime._utils import (
-    is_archive,
-    is_audio_video,
-    is_font,
-    is_image,
+    contains_binary,
+    get_archive_mime,
+    get_audio_video_mime,
+    get_extra_mime,
+    get_font_mime,
+    get_image_mime,
+    get_text_mime,
 )
 
 _APACHE_TYPES = [
@@ -16,6 +19,7 @@
 WHITESPACE_BYTES = {b"\t", b"\r", b"\x0c", b"\n", b" "}
 
 
+
 def _is_match_mime_pattern(
     input_bytes: bytes, byte_pattern: bytes, pattern_mask: bytes, lstrip: Set[bytes] = None
 ) -> bool:
@@ -45,19 +49,95 @@ def _is_match_mime_pattern(
     return True
 
 
-def _find_unknown_mimetype(input_bytes: bytes, sniff_scriptable: bool):
-    # TODO
-    pass
+def _find_unknown_mimetype(input_bytes: bytes, sniff_scriptable: bool, extra_types: Optional[Tuple[Tuple[bytes, bytes, Set[bytes], str], ...]]) -> str:
+    """Identify a resource whose supplied MIME type is missing or unknown.
+
+    ``get_text_mime`` is only consulted when ``sniff_scriptable`` is set.
+    The remaining sniffers run in order and the first match wins; without a
+    match the result is "text/plain" for input free of binary bytes and
+    "application/octet-stream" otherwise.
+    """
+    if sniff_scriptable:
+        matched_type = get_text_mime(input_bytes)
+        # A failed scriptable match must not end the search: fall through.
+        if matched_type:
+            return matched_type
+
+    matched_type = get_extra_mime(input_bytes, extra_types=extra_types)
+    if matched_type:
+        return matched_type
+
+    for sniffer in (get_image_mime, get_audio_video_mime, get_archive_mime):
+        matched_type = sniffer(input_bytes)
+        if matched_type:
+            return matched_type
+
+    return "application/octet-stream" if contains_binary(input_bytes) else "text/plain"
+
+
+def _sniff_mislabled_binary(input_bytes: bytes) -> str:
+    """Decide between "text/plain" and "application/octet-stream".
+
+    A leading UTF-16 or UTF-8 byte order mark, or the absence of binary
+    bytes, yields "text/plain".
+    """
+    # Slice and copy first: callers may pass a memoryview, which has no startswith().
+    has_bom = bytes(input_bytes[:3]).startswith((b"\xfe\xff", b"\xff\xfe", b"\xef\xbb\xbf"))
+    is_text = has_bom or not contains_binary(input_bytes)
+    return "text/plain" if is_text else "application/octet-stream"
+
+
+def _sniff_mislabled_feed(input_bytes: bytes, supplied_type: str) -> str:
+    """Tell RSS and Atom feeds apart from other documents served as HTML.
+
+    Follows the "sniffing a mislabeled feed" rules of the WHATWG MIME
+    Sniffing standard: after an optional UTF-8 byte order mark, comments,
+    "<!...>" declarations and "<?...?>" processing instructions are skipped,
+    and the first element name that follows decides the result.
+
+    Returns "application/rss+xml" for an "rss" element or an "rdf:RDF"
+    element that mentions both the RSS 1.0 and the RDF syntax namespaces,
+    "application/atom+xml" for a "feed" element, and ``supplied_type`` in
+    every other case, including truncated markup.
+    """
+    data = bytes(input_bytes)  # callers may pass a memoryview
+    size = len(data)
+    index = 3 if data.startswith(b"\xef\xbb\xbf") else 0
+
+    while index < size:
+        # Only whitespace may precede the next "<".
+        while data[index:index + 1] in WHITESPACE_BYTES:
+            index += 1
+        if data[index:index + 1] != b"<":
+            return supplied_type
+        index += 1
+
+        # Comments, declarations and processing instructions are skipped
+        # whole; "!--" has to be tested before the shorter "!".
+        for opening, closing in ((b"!--", b"-->"), (b"!", b">"), (b"?", b"?>")):
+            if data.startswith(opening, index):
+                break
+        else:
+            # First element name: it alone decides the type.
+            if data.startswith(b"rss", index):
+                return "application/rss+xml"
+            if data.startswith(b"feed", index):
+                return "application/atom+xml"
+            if data.startswith(b"rdf:RDF", index):
+                rest = data[index + 7:]
+                rss_ns = b"http://purl.org/rss/1.0/"
+                rdf_ns = b"http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+                if rss_ns in rest and rdf_ns in rest:
+                    return "application/rss+xml"
+            return supplied_type
+
+        end = data.find(closing, index + len(opening))
+        if end == -1:
+            # Unterminated markup: nothing more can be learned.
+            return supplied_type
+        index = end + len(closing)
+
+    return supplied_type
-
-
-def _sniff_mislabled_binary(input_bytes: bytes):
-    # TODO
-    pass
-
-
-def _compare_feed_html(input_bytes: bytes):
-    # TODO
-    pass
 
 
 def extract_mime(
@@ -67,35 +147,47 @@ def extract_mime(
     http_origin: bool = True,
     no_sniff: bool = False,
     extra_types: Optional[Tuple[Tuple[bytes, bytes, Set[bytes], str], ...]] = None,
+    supported_types: Set[str] = None,
 ) -> str:
+    """Determine the MIME type of ``body`` from its first 1445 bytes.
+
+    The last entry of ``content_types`` is the supplied type.  A missing or
+    unknown supplied type is resolved from the content alone; any other
+    supplied type is returned unchanged when ``no_sniff`` is set.  Otherwise
+    the supplied type selects the sniffing routine, and an image or
+    audio/video match only replaces it when listed in ``supported_types``.
+    """
     extra_types = extra_types or tuple()
+    # Normalise to an empty set so the membership tests below accept the default.
+    supported_types = supported_types or set()
     supplied_type = content_types[-1] if content_types else None
     check_for_apache = http_origin and supplied_type in _APACHE_TYPES
-    resource_header = memoryview(body) if len(body) < 1445 else memoryview(body)[:1445]
+    resource_header = memoryview(body)[:1445]
 
     if supplied_type in (None, "unknown/unknown", "application/unknown", "*/*"):
-        _find_unknown_mimetype(body, not no_sniff)
+        return _find_unknown_mimetype(resource_header, not no_sniff, extra_types)
 
     if no_sniff:
         return supplied_type
 
     if check_for_apache:
-        return _sniff_mislabled_binary(body)
+        return _sniff_mislabled_binary(resource_header)
 
-    if supplied_type[-4:] is "+xml" or supplied_type in ("text/xml", "application/xml"):
+    if supplied_type.endswith("+xml") or supplied_type in {"text/xml", "application/xml"}:
         return supplied_type
 
-    if supplied_type is "text/html":
-        return _compare_feed_html(body)
-
-    matched_type = is_image(resource_header)
-
-    if matched_type:
-        return matched_type
+    if supplied_type.startswith("text/html"):
+        return _sniff_mislabled_feed(resource_header, supplied_type)
 
-    matched_type = is_audio_video(resource_header)
+    if supplied_type.startswith("image/"):
+        matched_type = get_image_mime(resource_header)
+        if matched_type in supported_types:
+            return matched_type
 
-    if matched_type:
-        return matched_type
+    video_types = ("audio/", "video/")
+    if supplied_type.startswith(video_types) or supplied_type.startswith("application/ogg"):
+        matched_type = get_audio_video_mime(resource_header)
+        if matched_type in supported_types:
+            return matched_type
 
     return supplied_type
diff --git a/xtractmime/_patterns.py b/xtractmime/_patterns.py
@@ -1,3 +1,37 @@
+from xtractmime import WHITESPACE_BYTES
+
+#: Section 3
+#: https://mimesniff.spec.whatwg.org/commit-snapshots/609a3a3c935fbb805b46cf3d90768d695a1dcff2/#terminology  # noqa: E501
+BINARY_BYTES = (
+    b"\x00",
+    b"\x01",
+    b"\x02",
+    b"\x03",
+    b"\x04",
+    b"\x05",
+    b"\x06",
+    b"\x07",
+    b"\x08",
+    b"\x0b",
+    b"\x0e",
+    b"\x0f",
+    b"\x10",
+    b"\x11",
+    b"\x12",
+    b"\x13",
+    b"\x14",
+    b"\x15",
+    b"\x16",
+    b"\x17",
+    b"\x18",
+    b"\x19",
+    b"\x1a",
+    b"\x1c",
+    b"\x1d",
+    b"\x1e",
+    b"\x1f",
+)
+
 #: Section 6.1, step 1
 #: https://mimesniff.spec.whatwg.org/commit-snapshots/609a3a3c935fbb805b46cf3d90768d695a1dcff2/#matching-an-image-type-pattern  # noqa: E501
 IMAGE_PATTERNS = (
@@ -75,6 +109,44 @@
 
 #: Section 7.1, step 1
 #: https://mimesniff.spec.whatwg.org/commit-snapshots/609a3a3c935fbb805b46cf3d90768d695a1dcff2/#identifying-a-resource-with-an-unknown-mime-type  # noqa: E501
-TEXT_PATTERNS = (
-
+# Tag patterns get a tag-terminating byte appended; "<!--" is matched as is.
+TEXT_PATTERNS_1 = tuple(
+    (prefix + suffix, mask, WHITESPACE_BYTES, "text/html")
+    for prefix, mask, in (
+        (b"<!DOCTYPE HTML", b'\xff\xff\xdf\xdf\xdf\xdf\xdf\xdf\xdf\xff\xdf\xdf\xdf\xdf\xff'),
+        (b'<HTML', b'\xff\xdf\xdf\xdf\xdf\xff'),
+        (b'<HEAD', b'\xff\xdf\xdf\xdf\xdf\xff'),
+        (b'<SCRIPT', b'\xff\xdf\xdf\xdf\xdf\xdf\xdf\xff'),
+        (b'<IFRAME', b'\xff\xdf\xdf\xdf\xdf\xdf\xdf\xff'),
+        (b'<H1', b'\xff\xdf\xff\xff'),
+        (b'<DIV', b'\xff\xdf\xdf\xdf\xff'),
+        (b'<FONT', b'\xff\xdf\xdf\xdf\xdf\xff'),
+        (b'<TABLE', b'\xff\xdf\xdf\xdf\xdf\xdf\xff'),
+        (b'<A', b'\xff\xdf\xff'),
+        (b'<STYLE', b'\xff\xdf\xdf\xdf\xdf\xdf\xff'),
+        (b'<TITLE', b'\xff\xdf\xdf\xdf\xdf\xdf\xff'),
+        (b'<B', b'\xff\xdf\xff'),
+        (b'<BODY', b'\xff\xdf\xdf\xdf\xdf\xff'),
+        (b'<BR', b'\xff\xdf\xdf\xff'),
+        (b'<P', b'\xff\xdf\xff'),
+    )
+    for suffix in (b"\x20", b"\x3E")
+) + ((b"<!--", b"\xff\xff\xff\xff", WHITESPACE_BYTES, "text/html"),)
+TEXT_PATTERNS_2 = (
+    (b'<?xml', b'\xff\xff\xff\xff\xff', WHITESPACE_BYTES, "text/xml"),
+    (b'%PDF-', b'\xff\xff\xff\xff\xff', None, "application/pdf")
     )
+
+#: Section 7.1, step 2
+#: https://mimesniff.spec.whatwg.org/commit-snapshots/609a3a3c935fbb805b46cf3d90768d695a1dcff2/#identifying-a-resource-with-an-unknown-mime-type  # noqa: E501
+EXTRA_PATTERNS = (
+    (
+        b"%!PS-Adobe-",
+        b"\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff",
+        None,
+        "application/postscript",
+    ),
+    (b"\xfe\xff\x00\x00", b"\xff\xff\x00\x00", None, "text/plain"),
+    (b"\xff\xfe\x00\x00", b"\xff\xff\x00\x00", None, "text/plain"),
+    (b"\xef\xbb\xbf\x00", b"\xff\xff\xff\x00", None, "text/plain"),
+)
diff --git a/xtractmime/_utils.py b/xtractmime/_utils.py
@@ -1,12 +1,16 @@
 from struct import unpack
-from typing import Tuple, Union
+from typing import Optional, Set, Tuple, Union
 
 from xtractmime import _is_match_mime_pattern
 from xtractmime._patterns import (
     ARCHIVE_PATTERNS,
     AUDIO_VIDEO_PATTERNS,
+    BINARY_BYTES,
+    EXTRA_PATTERNS,
     FONT_PATTERNS,
     IMAGE_PATTERNS,
+    TEXT_PATTERNS_1,
+    TEXT_PATTERNS_2,
 )
 
 SAMPLE_RATES = (44100, 48000, 32000)
@@ -228,15 +232,15 @@ def is_mp3_non_ID3_signature(input_bytes: bytes) -> bool:
         return False
 
 
-def is_image(input_bytes: bytes) -> Union[str, None]:
+def get_image_mime(input_bytes: bytes) -> Union[str, None]:
     for pattern in IMAGE_PATTERNS:
         if _is_match_mime_pattern(input_bytes, pattern[0], pattern[1], pattern[2]):
             return pattern[3]
 
     return None
 
 
-def is_audio_video(input_bytes: bytes) -> Union[str, None]:
+def get_audio_video_mime(input_bytes: bytes) -> Union[str, None]:
     for pattern in AUDIO_VIDEO_PATTERNS:
         if _is_match_mime_pattern(input_bytes, pattern[0], pattern[1], pattern[2]):
             return pattern[3]
@@ -253,17 +257,49 @@ def is_audio_video(input_bytes: bytes) -> Union[str, None]:
     return None
 
 
-def is_font(input_bytes: bytes) -> Union[str, None]:
+def get_font_mime(input_bytes: bytes) -> Union[str, None]:
     for pattern in FONT_PATTERNS:
         if _is_match_mime_pattern(input_bytes, pattern[0], pattern[1], pattern[2]):
             return pattern[3]
 
     return None
 
 
-def is_archive(input_bytes: bytes) -> Union[str, None]:
+def get_archive_mime(input_bytes: bytes) -> Union[str, None]:
     for pattern in ARCHIVE_PATTERNS:
         if _is_match_mime_pattern(input_bytes, pattern[0], pattern[1], pattern[2]):
             return pattern[3]
 
     return None
+
+
+def get_text_mime(input_bytes: bytes) -> Union[str, None]:
+    """Return the MIME type of the first matching text pattern, else None.
+
+    TEXT_PATTERNS_1 is searched before TEXT_PATTERNS_2.
+    """
+    for byte_pattern, pattern_mask, lstrip, mime_type in TEXT_PATTERNS_1 + TEXT_PATTERNS_2:
+        if _is_match_mime_pattern(input_bytes, byte_pattern, pattern_mask, lstrip):
+            return mime_type
+
+    return None
+
+def get_extra_mime(input_bytes: bytes, extra_types: Optional[Tuple[Tuple[bytes, bytes, Set[bytes], str], ...]]) -> Union[str, None]:
+    """Return the MIME type of the first matching extra pattern, else None.
+
+    The built-in EXTRA_PATTERNS are tried first, followed by the
+    caller-supplied ``extra_types`` when that argument is not empty.
+    """
+    for candidates in (EXTRA_PATTERNS, extra_types or ()):
+        for candidate in candidates:
+            if _is_match_mime_pattern(input_bytes, *candidate[:3]):
+                return candidate[3]
+    return None
+
+
+def contains_binary(input_bytes: bytes) -> bool:
+    """Return True when any byte of ``input_bytes`` is listed in BINARY_BYTES."""
+    # Iterating bytes (or a memoryview) yields ints, while BINARY_BYTES holds
+    # one-byte bytes objects, so compare ordinals instead of the raw items.
+    binary_ordinals = {entry[0] for entry in BINARY_BYTES}
+    return any(byte in binary_ordinals for byte in input_bytes)