Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Determining the computed MIME type of a resource #4

Merged
merged 26 commits into from Aug 4, 2021
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
24 changes: 13 additions & 11 deletions tests/test_utils.py
Expand Up @@ -3,10 +3,12 @@
from unittest import mock

from xtractmime._utils import (
is_archive,
is_audio_video,
is_font,
is_image,
get_archive_mime,
get_audio_video_mime,
get_extra_mime,
get_font_mime,
get_image_mime,
get_text_mime,
is_mp3_non_ID3_signature,
is_mp4_signature,
is_webm_signature,
Expand Down Expand Up @@ -133,16 +135,16 @@ def test_audio_video(self, input_bytes, expected):
if isinstance(input_bytes, str):
with open(f"tests/files/{input_bytes}", "rb") as input_file:
input_bytes = input_file.read()
assert is_audio_video(input_bytes) == expected
assert get_audio_video_mime(input_bytes) == expected

def test_image(self):
    # Only the renamed helper is exercised; the duplicate assertions that
    # went through the pre-rename ``is_image`` name were dropped.
    # ``self.body_gif`` is provided elsewhere in the test class.
    assert get_image_mime(self.body_gif) == "image/gif"
    # Four NUL bytes are expected to match no image signature.
    assert get_image_mime(b"\x00\x00\x00\x00") is None

def test_font(self):
    # Only the renamed helper is exercised; the duplicate assertions that
    # went through the pre-rename ``is_font`` name were dropped.
    # ``self.body_ttf`` is provided elsewhere in the test class.
    assert get_font_mime(self.body_ttf) == "font/ttf"
    # Four NUL bytes are expected to match no font signature.
    assert get_font_mime(b"\x00\x00\x00\x00") is None

def test_archive(self):
    # Only the renamed helper is exercised; the duplicate assertions that
    # went through the pre-rename ``is_archive`` name were dropped.
    # ``self.body_zip`` is provided elsewhere in the test class.
    assert get_archive_mime(self.body_zip) == "application/zip"
    # Four NUL bytes are expected to match no archive signature.
    assert get_archive_mime(b"\x00\x00\x00\x00") is None
136 changes: 132 additions & 4 deletions xtractmime/__init__.py
@@ -1,5 +1,14 @@
__version__ = "0.0.0"
from typing import Optional, Set, Tuple
from xtractmime._utils import (
contains_binary,
get_archive_mime,
get_audio_video_mime,
get_extra_mime,
get_font_mime,
get_image_mime,
get_text_mime,
)

_APACHE_TYPES = [
b"text/plain",
Expand All @@ -10,6 +19,7 @@
# Single-byte values treated as whitespace by the sniffing code in this module:
# tab, carriage return, form feed, line feed and space.
WHITESPACE_BYTES = {b"\t", b"\r", b"\x0c", b"\n", b" "}



def _is_match_mime_pattern(
input_bytes: bytes, byte_pattern: bytes, pattern_mask: bytes, lstrip: Set[bytes] = None
) -> bool:
Expand Down Expand Up @@ -39,17 +49,135 @@ def _is_match_mime_pattern(
return True


def _find_unknown_mimetype(
    input_bytes: bytes,
    sniff_scriptable: bool,
    extra_types: Optional[Tuple[Tuple[bytes, bytes, Set[bytes], str], ...]],
) -> str:
    """Guess the MIME type of a resource whose supplied type is missing or unknown.

    The sniffers are tried in a fixed order and the first match wins:
    scriptable text types (only when ``sniff_scriptable`` is true), the
    caller-supplied ``extra_types`` patterns, images, audio/video, archives.
    If nothing matches, the result is ``"text/plain"`` when
    ``contains_binary`` reports no binary bytes and
    ``"application/octet-stream"`` otherwise.

    :param input_bytes: leading bytes of the resource to inspect.
    :param sniff_scriptable: whether scriptable types may be detected.
    :param extra_types: additional patterns forwarded to ``get_extra_mime``.
    :return: the detected MIME type.
    """
    if sniff_scriptable:
        # Only stop here on an actual match. Returning the helper's result
        # unconditionally would hand a "no match" value back to the caller and
        # skip every fallback below.
        # NOTE(review): assumes get_text_mime returns a falsy value when
        # nothing matches, like the other get_*_mime helpers - confirm.
        matched_type = get_text_mime(input_bytes)
        if matched_type:
            return matched_type

    matched_type = get_extra_mime(input_bytes, extra_types=extra_types)
    if matched_type:
        return matched_type

    for sniffer in (get_image_mime, get_audio_video_mime, get_archive_mime):
        matched_type = sniffer(input_bytes)
        if matched_type:
            return matched_type

    if not contains_binary(input_bytes):
        return "text/plain"

    return "application/octet-stream"


def _sniff_mislabled_binary(input_bytes: bytes) -> str:
input_size = len(input_bytes)

if input_size >= 2 and input_bytes.startswith((b"\xfe\xff",b"\xff\xfe",b"\xef\xbb\xbf")):
return "text/plain"

if not contains_binary(input_bytes):
return "text/plain"

return "application/octet-stream"


def _sniff_mislabled_feed(input_bytes: bytes, supplied_type: Optional[Tuple[bytes]]) -> str:
input_size = len(input_bytes)
index = 0

if input_bytes[:3] == b"\xef\xbb\xbf":
index += 3

while index < input_size:
while True:
if not input_bytes[index:index+1]:
return supplied_type
Gallaecio marked this conversation as resolved.
Show resolved Hide resolved

if input_bytes[index:index+1] == b"<":
index += 1
break

if input_bytes[index:index+1] not in WHITESPACE_BYTES:
return supplied_type

index += 1

while True:
if not input_bytes[index:index+1]:
return supplied_type

if input_bytes[index:index+3] == b"!--":
index += 3
while True:
if not input_bytes[index:index+1]:
return supplied_type

if input_bytes[index:index+3] == b"-->":
index += 3
return supplied_type
akshaysharmajs marked this conversation as resolved.
Show resolved Hide resolved

index += 1

if input_bytes[index:index+1] == b"!":
index += 1
while True:
if not input_bytes[index:index+1]:
return supplied_type

if input_bytes[index:index+3] == b">":
index += 1
return supplied_type
index += 1

# Completed Till Section 7.3 5.2.3.3

return ""


def extract_mime(
    body: bytes,
    *,
    content_types: Optional[Tuple[bytes]] = None,
    http_origin: bool = True,
    no_sniff: bool = False,
    extra_types: Optional[Tuple[Tuple[bytes, bytes, Set[bytes], str], ...]] = None,
    supported_types: Set[str] = None,
) -> str:
    """Determine the computed MIME type of a resource.

    :param body: the resource content; only its first 1445 bytes are inspected.
    :param content_types: supplied content types; the last entry is used as
        the supplied MIME type.
    :param http_origin: whether the resource came over HTTP; enables the
        text-vs-binary check for the types listed in ``_APACHE_TYPES``.
    :param no_sniff: when true, a known supplied type is returned as is and
        scriptable types are not sniffed for unknown ones.
    :param extra_types: additional patterns used when the type is unknown.
    :param supported_types: image/audio/video types that may be returned from
        sniffing.  ``None`` places no restriction on the sniffed type
        (NOTE(review): assumed intent of the ``None`` default - confirm).
    :return: the sniffed MIME type, or the supplied type unchanged when
        sniffing does not apply or finds nothing better.
    """
    extra_types = extra_types or tuple()
    supplied_type = content_types[-1] if content_types else None
    check_for_apache = http_origin and supplied_type in _APACHE_TYPES
    # Copy the (small) header into real bytes: the helpers rely on bytes-only
    # methods such as ``startswith`` that a memoryview does not provide.
    resource_header = bytes(memoryview(body)[:1445])

    # Text form of the supplied type, used only for the comparisons below so
    # that both bytes and str content types are classified the same way.
    if isinstance(supplied_type, (bytes, bytearray)):
        mime = bytes(supplied_type).decode("latin-1")
    else:
        mime = supplied_type

    if mime in (None, "unknown/unknown", "application/unknown", "*/*"):
        return _find_unknown_mimetype(resource_header, not no_sniff, extra_types)

    if no_sniff:
        return supplied_type

    if check_for_apache:
        return _sniff_mislabled_binary(resource_header)

    if mime.endswith("+xml") or mime in {"text/xml", "application/xml"}:
        return supplied_type

    if mime.startswith("text/html"):
        return _sniff_mislabled_feed(resource_header, supplied_type)

    def is_supported(candidate: Optional[str]) -> bool:
        # A sniffed type is usable when something matched and the caller did
        # not exclude it.
        return bool(candidate) and (supported_types is None or candidate in supported_types)

    if mime.startswith("image/"):
        matched_type = get_image_mime(resource_header)
        if is_supported(matched_type):
            return matched_type

    if mime.startswith(("audio/", "video/", "application/ogg")):
        matched_type = get_audio_video_mime(resource_header)
        if is_supported(matched_type):
            return matched_type

    return supplied_type