Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Determining the computed MIME type of a resource #4

Merged
merged 26 commits into from Aug 4, 2021
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
24 changes: 13 additions & 11 deletions tests/test_utils.py
Expand Up @@ -3,10 +3,12 @@
from unittest import mock

from xtractmime._utils import (
is_archive,
is_audio_video,
is_font,
is_image,
get_archive_mime,
get_audio_video_mime,
get_extra_mime,
get_font_mime,
get_image_mime,
get_text_mime,
is_mp3_non_ID3_signature,
is_mp4_signature,
is_webm_signature,
Expand Down Expand Up @@ -133,16 +135,16 @@ def test_audio_video(self, input_bytes, expected):
if isinstance(input_bytes, str):
with open(f"tests/files/{input_bytes}", "rb") as input_file:
input_bytes = input_file.read()
assert is_audio_video(input_bytes) == expected
assert get_audio_video_mime(input_bytes) == expected

def test_image(self):
assert is_image(self.body_gif) == "image/gif"
assert is_image(b"\x00\x00\x00\x00") is None
assert get_image_mime(self.body_gif) == "image/gif"
assert get_image_mime(b"\x00\x00\x00\x00") is None

def test_font(self):
assert is_font(self.body_ttf) == "font/ttf"
assert is_font(b"\x00\x00\x00\x00") is None
assert get_font_mime(self.body_ttf) == "font/ttf"
assert get_font_mime(b"\x00\x00\x00\x00") is None

def test_archive(self):
assert is_archive(self.body_zip) == "application/zip"
assert is_archive(b"\x00\x00\x00\x00") is None
assert get_archive_mime(self.body_zip) == "application/zip"
assert get_archive_mime(b"\x00\x00\x00\x00") is None
136 changes: 109 additions & 27 deletions xtractmime/__init__.py
@@ -1,10 +1,13 @@
__version__ = "0.0.0"
from typing import Optional, Set, Tuple
from xtractmime._utils import (
is_archive,
is_audio_video,
is_font,
is_image,
contains_binary,
get_archive_mime,
get_audio_video_mime,
get_extra_mime,
get_font_mime,
get_image_mime,
get_text_mime,
)

_APACHE_TYPES = [
Expand All @@ -16,6 +19,7 @@
WHITESPACE_BYTES = {b"\t", b"\r", b"\x0c", b"\n", b" "}



def _is_match_mime_pattern(
input_bytes: bytes, byte_pattern: bytes, pattern_mask: bytes, lstrip: Set[bytes] = None
) -> bool:
Expand Down Expand Up @@ -45,19 +49,95 @@ def _is_match_mime_pattern(
return True


def _find_unknown_mimetype(input_bytes: bytes, sniff_scriptable: bool):
# TODO
pass
def _find_unknown_mimetype(
    input_bytes: bytes,
    sniff_scriptable: bool,
    extra_types: Optional[Tuple[Tuple[bytes, bytes, Set[bytes], str], ...]] = None,
) -> str:
    """Identify a resource whose supplied MIME type is missing or unknown.

    The checks run in order and the first match wins: scriptable/text
    patterns (only when ``sniff_scriptable`` is true), extra patterns
    (built-in plus caller-supplied ``extra_types``), images, audio/video,
    archives. If nothing matches, the result is ``"text/plain"`` when the
    input holds no binary data bytes and ``"application/octet-stream"``
    otherwise.

    :param input_bytes: the resource header to sniff.
    :param sniff_scriptable: whether scriptable types (HTML, XML, ...) may be
        detected.
    :param extra_types: optional additional ``(pattern, mask, ignored, mime)``
        entries forwarded to ``get_extra_mime``.
    :return: the computed MIME type; never ``None``.
    """
    if sniff_scriptable:
        matched_type = get_text_mime(input_bytes)
        # Only stop here on an actual match. Returning the raw result would
        # hand ``None`` back to the caller and skip every later check.
        if matched_type:
            return matched_type

    matched_type = get_extra_mime(input_bytes, extra_types=extra_types)
    if matched_type:
        return matched_type

    for sniffer in (get_image_mime, get_audio_video_mime, get_archive_mime):
        matched_type = sniffer(input_bytes)
        if matched_type:
            return matched_type

    if not contains_binary(input_bytes):
        return "text/plain"

    return "application/octet-stream"


def _sniff_mislabled_binary(input_bytes: bytes) -> str:
input_size = len(input_bytes)

if input_size >= 2 and input_bytes.startswith((b"\xfe\xff",b"\xff\xfe",b"\xef\xbb\xbf")):
return "text/plain"

if not contains_binary(input_bytes):
return "text/plain"

return "application/octet-stream"


def _sniff_mislabled_feed(input_bytes: bytes, supplied_type: Optional[Tuple[bytes]]) -> str:
input_size = len(input_bytes)
index = 0

if input_bytes[:3] == b"\xef\xbb\xbf":
index += 3

while index < input_size:
while True:
if not input_bytes[index:index+1]:
return supplied_type
Gallaecio marked this conversation as resolved.
Show resolved Hide resolved

if input_bytes[index:index+1] == b"<":
index += 1
break

if input_bytes[index:index+1] not in WHITESPACE_BYTES:
return supplied_type

index += 1

while True:
if not input_bytes[index:index+1]:
return supplied_type

if input_bytes[index:index+3] == b"!--":
index += 3
while True:
if not input_bytes[index:index+1]:
return supplied_type

if input_bytes[index:index+3] == b"-->":
index += 3
return supplied_type
akshaysharmajs marked this conversation as resolved.
Show resolved Hide resolved

def _sniff_mislabled_binary(input_bytes: bytes):
# TODO
pass
index += 1

if input_bytes[index:index+1] == b"!":
index += 1
while True:
if not input_bytes[index:index+1]:
return supplied_type

def _compare_feed_html(input_bytes: bytes):
# TODO
pass
if input_bytes[index:index+3] == b">":
index += 1
return supplied_type
index += 1

# Completed Till Section 7.3 5.2.3.3

return ""


def extract_mime(
Expand All @@ -67,35 +147,37 @@ def extract_mime(
http_origin: bool = True,
no_sniff: bool = False,
extra_types: Optional[Tuple[Tuple[bytes, bytes, Set[bytes], str], ...]] = None,
supported_types: Set[str] = None,
) -> str:
extra_types = extra_types or tuple()
supplied_type = content_types[-1] if content_types else None
check_for_apache = http_origin and supplied_type in _APACHE_TYPES
resource_header = memoryview(body) if len(body) < 1445 else memoryview(body)[:1445]
resource_header = memoryview(body)[:1445]

if supplied_type in (None, "unknown/unknown", "application/unknown", "*/*"):
_find_unknown_mimetype(body, not no_sniff)
_find_unknown_mimetype(resource_header, not no_sniff, extra_types)

if no_sniff:
return supplied_type

if check_for_apache:
return _sniff_mislabled_binary(body)
return _sniff_mislabled_binary(resource_header)

if supplied_type[-4:] is "+xml" or supplied_type in ("text/xml", "application/xml"):
if supplied_type.endswith("+xml") or supplied_type in {"text/xml", "application/xml"}:
return supplied_type

if supplied_type is "text/html":
return _compare_feed_html(body)

matched_type = is_image(resource_header)

if matched_type:
return matched_type
if supplied_type.startswith("text/html"):
akshaysharmajs marked this conversation as resolved.
Show resolved Hide resolved
return _sniff_mislabled_feed(resource_header)

matched_type = is_audio_video(resource_header)
if supplied_type.startswith("image/"):
matched_type = get_image_mime(resource_header)
if matched_type in supported_types:
return matched_type

if matched_type:
return matched_type
video_types = ("audio/","video/")
if supplied_type.startswith(video_types) or supplied_type.startswith("application/ogg"):
akshaysharmajs marked this conversation as resolved.
Show resolved Hide resolved
matched_type = get_audio_video_mime(resource_header)
if matched_type in supported_types:
return matched_type

return supplied_type
76 changes: 74 additions & 2 deletions xtractmime/_patterns.py
@@ -1,3 +1,37 @@
from xtractmime import WHITESPACE_BYTES

#: Section 3
#: https://mimesniff.spec.whatwg.org/commit-snapshots/609a3a3c935fbb805b46cf3d90768d695a1dcff2/#terminology # noqa: E501
# Binary data bytes: 0x00-0x08, 0x0B, 0x0E-0x1A and 0x1C-0x1F, each stored as
# a one-byte ``bytes`` object.
BINARY_BYTES = tuple(
    bytes((value,))
    for value in (
        *range(0x00, 0x09),
        0x0B,
        *range(0x0E, 0x1B),
        *range(0x1C, 0x20),
    )
)

#: Section 6.1, step 1
#: https://mimesniff.spec.whatwg.org/commit-snapshots/609a3a3c935fbb805b46cf3d90768d695a1dcff2/#matching-an-image-type-pattern # noqa: E501
IMAGE_PATTERNS = (
Expand Down Expand Up @@ -75,6 +109,44 @@

#: Section 7.1, step 1
#: https://mimesniff.spec.whatwg.org/commit-snapshots/609a3a3c935fbb805b46cf3d90768d695a1dcff2/#identifying-a-resource-with-an-unknown-mime-type # noqa: E501
TEXT_PATTERNS = (

#: HTML signatures as ``(pattern, mask, ignored leading bytes, MIME type)``.
#: Every tag name is expanded into two patterns, one ending in a space and one
#: ending in ">" (the tag-terminating byte); the ``0xDF`` mask bytes make the
#: letters case-insensitive.
#: The comment opener "<!--" is appended separately because it is matched
#: as-is, without a tag-terminating byte, so that e.g. "<!--comment" matches.
TEXT_PATTERNS_1 = tuple(
    (prefix + terminator, mask, WHITESPACE_BYTES, "text/html")
    for prefix, mask in (
        (b"<!DOCTYPE HTML", b"\xff\xff\xdf\xdf\xdf\xdf\xdf\xdf\xdf\xff\xdf\xdf\xdf\xdf\xff"),
        (b"<HTML", b"\xff\xdf\xdf\xdf\xdf\xff"),
        (b"<HEAD", b"\xff\xdf\xdf\xdf\xdf\xff"),
        (b"<SCRIPT", b"\xff\xdf\xdf\xdf\xdf\xdf\xdf\xff"),
        (b"<IFRAME", b"\xff\xdf\xdf\xdf\xdf\xdf\xdf\xff"),
        (b"<H1", b"\xff\xdf\xff\xff"),
        (b"<DIV", b"\xff\xdf\xdf\xdf\xff"),
        (b"<FONT", b"\xff\xdf\xdf\xdf\xdf\xff"),
        (b"<TABLE", b"\xff\xdf\xdf\xdf\xdf\xdf\xff"),
        (b"<A", b"\xff\xdf\xff"),
        (b"<STYLE", b"\xff\xdf\xdf\xdf\xdf\xdf\xff"),
        (b"<TITLE", b"\xff\xdf\xdf\xdf\xdf\xdf\xff"),
        (b"<B", b"\xff\xdf\xff"),
        (b"<BODY", b"\xff\xdf\xdf\xdf\xdf\xff"),
        (b"<BR", b"\xff\xdf\xdf\xff"),
        (b"<P", b"\xff\xdf\xff"),
    )
    for terminator in (b"\x20", b"\x3E")
) + (
    (b"<!--", b"\xff\xff\xff\xff", WHITESPACE_BYTES, "text/html"),
)
#: Second table tried by ``get_text_mime`` once ``TEXT_PATTERNS_1`` has no
#: match. Entries are ``(pattern, mask, ignored leading bytes, MIME type)``:
#: the XML declaration (WHITESPACE_BYTES passed as the ignorable set) and the
#: PDF signature (nothing ignorable).
#: NOTE(review): ``get_text_mime`` is only reached when scriptable sniffing is
#: enabled, so PDF detection is gated on that flag too -- confirm this is
#: intended.
TEXT_PATTERNS_2 = (
(b'<?xml', b'\xff\xff\xff\xff\xff', WHITESPACE_BYTES, "text/xml"),
(b'%PDF-', b'\xff\xff\xff\xff\xff', None, "application/pdf")
)

#: Section 7.1, step 2
#: https://mimesniff.spec.whatwg.org/commit-snapshots/609a3a3c935fbb805b46cf3d90768d695a1dcff2/#identifying-a-resource-with-an-unknown-mime-type # noqa: E501
# Built-in patterns checked first by ``get_extra_mime``; each entry is
# ``(pattern, mask, ignored leading bytes, MIME type)``.
EXTRA_PATTERNS = (
# PostScript signature.
(
b"%!PS-Adobe-",
b"\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff",
None,
"application/postscript",
),
# Byte order marks. The 0x00 mask bytes presumably make the trailing pattern
# bytes wildcards -- confirm against ``_is_match_mime_pattern``.
(b"\xfe\xff\x00\x00", b"\xff\xff\x00\x00", None, "text/plain"),  # UTF-16BE BOM
(b"\xff\xfe\x00\x00", b"\xff\xff\x00\x00", None, "text/plain"),  # UTF-16LE BOM
(b"\xef\xbb\xbf\x00", b"\xff\xff\xff\x00", None, "text/plain"),  # UTF-8 BOM
)
46 changes: 41 additions & 5 deletions xtractmime/_utils.py
@@ -1,12 +1,16 @@
from struct import unpack
from typing import Tuple, Union
from typing import Optional, Set, Tuple, Union

from xtractmime import _is_match_mime_pattern
from xtractmime._patterns import (
ARCHIVE_PATTERNS,
AUDIO_VIDEO_PATTERNS,
BINARY_BYTES,
EXTRA_PATTERNS,
FONT_PATTERNS,
IMAGE_PATTERNS,
TEXT_PATTERNS_1,
TEXT_PATTERNS_2,
)

SAMPLE_RATES = (44100, 48000, 32000)
Expand Down Expand Up @@ -228,15 +232,15 @@ def is_mp3_non_ID3_signature(input_bytes: bytes) -> bool:
return False


def is_image(input_bytes: bytes) -> Union[str, None]:
def get_image_mime(input_bytes: bytes) -> Union[str, None]:
    """Return the MIME type of the first ``IMAGE_PATTERNS`` entry matching
    ``input_bytes``, or ``None`` when no entry matches."""
    return next(
        (
            entry[3]
            for entry in IMAGE_PATTERNS
            if _is_match_mime_pattern(input_bytes, entry[0], entry[1], entry[2])
        ),
        None,
    )


def is_audio_video(input_bytes: bytes) -> Union[str, None]:
def get_audio_video_mime(input_bytes: bytes) -> Union[str, None]:
for pattern in AUDIO_VIDEO_PATTERNS:
if _is_match_mime_pattern(input_bytes, pattern[0], pattern[1], pattern[2]):
return pattern[3]
Expand All @@ -253,17 +257,49 @@ def is_audio_video(input_bytes: bytes) -> Union[str, None]:
return None


def is_font(input_bytes: bytes) -> Union[str, None]:
def get_font_mime(input_bytes: bytes) -> Union[str, None]:
    """Return the MIME type of the first ``FONT_PATTERNS`` entry matching
    ``input_bytes``, or ``None`` when no entry matches."""
    return next(
        (
            entry[3]
            for entry in FONT_PATTERNS
            if _is_match_mime_pattern(input_bytes, entry[0], entry[1], entry[2])
        ),
        None,
    )


def is_archive(input_bytes: bytes) -> Union[str, None]:
def get_archive_mime(input_bytes: bytes) -> Union[str, None]:
    """Return the MIME type of the first ``ARCHIVE_PATTERNS`` entry matching
    ``input_bytes``, or ``None`` when no entry matches."""
    return next(
        (
            entry[3]
            for entry in ARCHIVE_PATTERNS
            if _is_match_mime_pattern(input_bytes, entry[0], entry[1], entry[2])
        ),
        None,
    )


def get_text_mime(input_bytes: bytes) -> Union[str, None]:
    """Return the MIME type of the first text pattern matching ``input_bytes``.

    ``TEXT_PATTERNS_1`` is searched completely before ``TEXT_PATTERNS_2``;
    ``None`` is returned when neither table has a match.
    """
    for table in (TEXT_PATTERNS_1, TEXT_PATTERNS_2):
        for entry in table:
            if _is_match_mime_pattern(input_bytes, entry[0], entry[1], entry[2]):
                return entry[3]

    return None

def get_extra_mime(
    input_bytes: bytes,
    extra_types: Optional[Tuple[Tuple[bytes, bytes, Set[bytes], str], ...]],
) -> Union[str, None]:
    """Return the MIME type of the first extra pattern matching ``input_bytes``.

    The built-in ``EXTRA_PATTERNS`` are tried first, then the caller-supplied
    ``extra_types`` (skipped when empty or ``None``). ``None`` is returned
    when nothing matches.
    """
    for table in (EXTRA_PATTERNS, extra_types or ()):
        for entry in table:
            if _is_match_mime_pattern(input_bytes, entry[0], entry[1], entry[2]):
                return entry[3]

    return None


def contains_binary(input_bytes: bytes) -> bool:
    """Return ``True`` if ``input_bytes`` contains any byte from ``BINARY_BYTES``.

    :param input_bytes: the data to inspect; ``bytes``, ``bytearray`` and
        ``memoryview`` are all accepted.
    :return: ``True`` on the first binary data byte found, ``False`` otherwise
        (including for empty input).
    """
    # Iterating a bytes-like object yields ints, while BINARY_BYTES stores
    # one-byte ``bytes`` objects. Testing an int against that tuple is always
    # False, so compare integer values with integer values instead.
    binary_values = {single[0] for single in BINARY_BYTES}
    return any(value in binary_values for value in input_bytes)