Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Determining the computed MIME type of a resource #4

Merged
merged 26 commits into from Aug 4, 2021
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
54 changes: 50 additions & 4 deletions xtractmime/__init__.py
@@ -1,5 +1,11 @@
__version__ = "0.0.0"
from typing import Optional, Set, Tuple
from xtractmime._utils import (
is_archive,
is_audio_video,
is_font,
is_image,
)

_APACHE_TYPES = [
b"text/plain",
Expand Down Expand Up @@ -39,6 +45,21 @@ def _is_match_mime_pattern(
return True


def _find_unknown_mimetype(input_bytes: bytes, sniff_scriptable: bool):
    """Placeholder for sniffing a resource whose MIME type is unknown.

    Not implemented yet: both arguments are currently ignored and the
    function always returns ``None``.

    Args:
        input_bytes: Raw bytes of the resource to inspect.
        sniff_scriptable: Flag supplied by the caller; presumably controls
            whether scriptable types may be sniffed (confirm once the
            implementation lands).
    """
    # TODO: implement the actual sniffing logic.
    return None


def _sniff_mislabled_binary(input_bytes: bytes):
    """Placeholder for the mislabeled-binary check.

    Not implemented yet: ``input_bytes`` is currently ignored and the
    function always returns ``None``.

    NOTE: the function name keeps its existing spelling ("mislabled") so
    that current callers continue to work.

    Args:
        input_bytes: Raw bytes of the resource to inspect.
    """
    # TODO: implement the actual check.
    return None


def _compare_feed_html(input_bytes: bytes):
    """Placeholder for telling a feed apart from HTML.

    Not implemented yet: ``input_bytes`` is currently ignored and the
    function always returns ``None``. The intended behavior is inferred from
    the name only; confirm when the implementation is written.

    Args:
        input_bytes: Raw bytes of the resource to inspect.
    """
    # TODO: implement the actual comparison.
    return None


def extract_mime(
body: bytes,
*,
Expand All @@ -48,8 +69,33 @@ def extract_mime(
extra_types: Optional[Tuple[Tuple[bytes, bytes, Set[bytes], str], ...]] = None,
) -> str:
extra_types = extra_types or tuple()
# supplied_type = content_types[-1] if content_types else None
supplied_type = content_types[-1] if content_types else None
check_for_apache = http_origin and supplied_type in _APACHE_TYPES
resource_header = memoryview(body) if len(body) < 1445 else memoryview(body)[:1445]
akshaysharmajs marked this conversation as resolved.
Show resolved Hide resolved

if supplied_type in (None, "unknown/unknown", "application/unknown", "*/*"):
_find_unknown_mimetype(body, not no_sniff)
akshaysharmajs marked this conversation as resolved.
Show resolved Hide resolved

if no_sniff:
return supplied_type

if check_for_apache:
return _sniff_mislabled_binary(body)

if supplied_type[-4:] is "+xml" or supplied_type in ("text/xml", "application/xml"):
Gallaecio marked this conversation as resolved.
Show resolved Hide resolved
return supplied_type

if supplied_type is "text/html":
return _compare_feed_html(body)

matched_type = is_image(resource_header)

if matched_type:
return matched_type
akshaysharmajs marked this conversation as resolved.
Show resolved Hide resolved

matched_type = is_audio_video(resource_header)

if matched_type:
return matched_type

# check_for_apache = http_origin and supplied_type in _APACHE_TYPES
# resource_header = memoryview(body) if len(body) < 1445 else memoryview(body)[:1445]
return "mimetype"
return supplied_type
114 changes: 25 additions & 89 deletions xtractmime/_patterns.py
@@ -1,80 +1,40 @@
#: Section 6.1, step 1
#: https://mimesniff.spec.whatwg.org/commit-snapshots/609a3a3c935fbb805b46cf3d90768d695a1dcff2/#matching-an-image-type-pattern # noqa: E501
IMAGE_PATTERNS = (
    # Each entry is a 4-tuple; presumably (byte pattern, pattern mask,
    # leading bytes to ignore, MIME type) as in the MIME Sniffing Standard
    # pattern tables -- confirm against the matcher. The third field is None
    # for every image entry.
    #
    # A Windows Icon signature
    (b"\x00\x00\x01\x00", b"\xff\xff\xff\xff", None, "image/x-icon"),
    # A Windows Cursor signature
    (b"\x00\x00\x02\x00", b"\xff\xff\xff\xff", None, "image/x-icon"),
    # The string "BM", a BMP signature
    (b"BM", b"\xff\xff", None, "image/bmp"),
    # The string "GIF87a", a GIF signature
    (b"GIF87a", b"\xff\xff\xff\xff\xff\xff", None, "image/gif"),
    # The string "GIF89a", a GIF signature
    (b"GIF89a", b"\xff\xff\xff\xff\xff\xff", None, "image/gif"),
    # The string "RIFF" followed by four bytes followed by the string "WEBPVP";
    # the zeroed mask bytes make the four middle bytes "don't care".
    (
        b"RIFF\x00\x00\x00\x00WEBPVP",
        b"\xff\xff\xff\xff\x00\x00\x00\x00\xff\xff\xff\xff\xff\xff",
        None,
        "image/webp",
    ),
    # An error-checking byte followed by the string "PNG" followed by
    # CR LF SUB LF, the PNG signature
    (b"\x89PNG\r\n\x1a\n", b"\xff\xff\xff\xff\xff\xff\xff\xff", None, "image/png"),
    # The JPEG Start of Image marker followed by the indicator byte of
    # another marker
    (b"\xff\xd8\xff", b"\xff\xff\xff", None, "image/jpeg"),
)

#: Section 6.2, step 1
#: https://mimesniff.spec.whatwg.org/commit-snapshots/609a3a3c935fbb805b46cf3d90768d695a1dcff2/#matching-an-audio-or-video-type-pattern # noqa: E501
AUDIO_VIDEO_PATTERNS = (
(
b".snd",
b"\xff\xff\xff\xff",
None,
"audio/basic",
), # The string ".snd", the basic audio signature
# The string "FORM" followed by four bytes followed by the string "AIFF"
# the AIFF signature
(b".snd", b"\xff\xff\xff\xff", None, "audio/basic",),
(
b"FORM\x00\x00\x00\x00AIFF",
b"\xff\xff\xff\xff\x00\x00\x00\x00\xff\xff\xff\xff",
None,
"audio/aiff",
),
(
b"ID3",
b"\xff\xff\xff",
None,
"audio/mpeg",
), # The string "ID3", the ID3v2-tagged MP3 signature
(
b"OggS\x00",
b"\xff\xff\xff\xff\xff",
None,
"application/ogg",
), # The string "OggS" followed by NUL, the Ogg container signature
# The string "MThd" followed by four bytes representing the number 6 in 32 bits
# (big-endian), the MIDI signature
(b"ID3", b"\xff\xff\xff", None, "audio/mpeg",),
(b"OggS\x00", b"\xff\xff\xff\xff\xff", None, "application/ogg",),
(b"MThd\x00\x00\x00\x06", b"\xff\xff\xff\xff\xff\xff\xff\xff", None, "audio/midi",),
# The string "RIFF" followed by four bytes followed by the string "AVI "
# the AVI signature
(
b"RIFF\x00\x00\x00\x00AVI ",
b"\xff\xff\xff\xff\x00\x00\x00\x00\xff\xff\xff\xff",
None,
"video/avi",
),
# The string "RIFF" followed by four bytes followed by the string "WAVE"
# the WAVE signature
(
b"RIFF\x00\x00\x00\x00WAVE",
b"\xff\xff\xff\xff\x00\x00\x00\x00\xff\xff\xff\xff",
Expand All @@ -97,48 +57,24 @@
),
None,
"application/vnd.ms-fontobject",
), # 34 bytes followed by the string "LP", the Embedded OpenType signature
(
b"\x00\x01\x00\x00",
b"\xff\xff\xff\xff",
None,
"font/ttf",
), # 4 bytes representing the version number 1.0, a TrueType signature
(b"OTTO", b"\xff\xff\xff\xff", None, "font/otf"), # The string "OTTO", the OpenType signature
(
b"ttcf",
b"\xff\xff\xff\xff",
None,
"font/collection",
), # The string "ttcf", the TrueType Collection signature
(
b"wOFF",
b"\xff\xff\xff\xff",
None,
"font/woff",
), # The string "wOFF", the Web Open Font Format 1.0 signature
(
b"wOF2",
b"\xff\xff\xff\xff",
None,
"font/woff2",
), # The string "wOF2", the Web Open Font Format 2.0 signature
),
(b"\x00\x01\x00\x00", b"\xff\xff\xff\xff", None, "font/ttf",),
(b"OTTO", b"\xff\xff\xff\xff", None, "font/otf"),
(b"ttcf", b"\xff\xff\xff\xff", None, "font/collection",),
(b"wOFF", b"\xff\xff\xff\xff", None, "font/woff",),
(b"wOF2", b"\xff\xff\xff\xff", None, "font/woff2",),
)

#: Section 6.4, step 1
#: https://mimesniff.spec.whatwg.org/commit-snapshots/609a3a3c935fbb805b46cf3d90768d695a1dcff2/#matching-an-archive-type-pattern # noqa: E501
ARCHIVE_PATTERNS = (
    # Each entry is a 4-tuple; presumably (byte pattern, pattern mask,
    # leading bytes to ignore, MIME type) as in the MIME Sniffing Standard
    # pattern tables -- confirm against the matcher. The third field is None
    # for every archive entry.
    #
    # The GZIP archive signature
    (b"\x1f\x8b\x08", b"\xff\xff\xff", None, "application/x-gzip"),
    # The string "PK" followed by ETX EOT, the ZIP archive signature
    (b"PK\x03\x04", b"\xff\xff\xff\xff", None, "application/zip"),
    # The string "Rar " followed by SUB BEL NUL, the RAR archive signature
    (
        b"Rar \x1a\x07\x00",
        b"\xff\xff\xff\xff\xff\xff\xff",
        None,
        "application/x-rar-compressed",
    ),
)

#: Section 7.1, step 1
#: https://mimesniff.spec.whatwg.org/commit-snapshots/609a3a3c935fbb805b46cf3d90768d695a1dcff2/#identifying-a-resource-with-an-unknown-mime-type # noqa: E501
# Placeholder: no text patterns are defined yet, so this is an empty tuple.
TEXT_PATTERNS = ()