Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Determining the computed MIME type of a resource #4

Merged
merged 26 commits into from Aug 4, 2021
Merged
Changes from 1 commit
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
69 changes: 64 additions & 5 deletions xtractmime/__init__.py
Expand Up @@ -109,6 +109,7 @@ def _sniff_mislabled_feed(input_bytes: bytes, supplied_type: Optional[Tuple[byte
index += 1

while True:
loop_break = False
if not input_bytes[index:index+1]:
return supplied_type

Expand All @@ -120,10 +121,14 @@ def _sniff_mislabled_feed(input_bytes: bytes, supplied_type: Optional[Tuple[byte

if input_bytes[index:index+3] == b"-->":
index += 3
return supplied_type
loop_break = True
break

index += 1

if loop_break:
break

if input_bytes[index:index+1] == b"!":
index += 1
while True:
Expand All @@ -132,12 +137,66 @@ def _sniff_mislabled_feed(input_bytes: bytes, supplied_type: Optional[Tuple[byte

if input_bytes[index:index+3] == b">":
index += 1
loop_break = True
break

index += 1

if loop_break:
break

if input_bytes[index:index+1] == b"?":
index += 1
while True:
if not input_bytes[index:index+1]:
return supplied_type

if input_bytes[index:index+2] == "?>":
index += 2
loop_break = True
Gallaecio marked this conversation as resolved.
Show resolved Hide resolved
break

index += 1

if input_bytes[index:index+3] == b"rss":
return b"application/rss+xml"

if input_bytes[index:index+4] == b"feed":
return b"application/atom+xml"

if input_bytes[index:index+7] == b"rdf:RDF":
index += 7
while True:
if not input_bytes[index:index+1]:
return supplied_type

if input_bytes[index:index+24] == b"http://purl.org/rss/1.0/":
index += 24
while True:
if not input_bytes[index:index+1]:
return supplied_type

if input_bytes[index:index+43] == b"http://www.w3.org/1999/02/22-rdf-syntax-ns#":
return b"application/rss+xml"

index += 1

if input_bytes[index:index+43] == b"http://www.w3.org/1999/02/22-rdf-syntax-ns#":
index += 43
while True:
if not input_bytes[index:index+1]:
return supplied_type

if input_bytes[index:index+24] == b"http://purl.org/rss/1.0/":
return b"application/rss+xml"

index += 1

index += 1

# Completed Till Section 7.3 5.2.3.3
return supplied_type

return ""
return supplied_type


def extract_mime(
Expand Down Expand Up @@ -166,7 +225,7 @@ def extract_mime(
if supplied_type.endswith("+xml") or supplied_type in {"text/xml", "application/xml"}:
return supplied_type

if supplied_type.startswith("text/html"):
if supplied_type == "text/html":
return _sniff_mislabled_feed(resource_header)

if supplied_type.startswith("image/"):
Expand All @@ -175,7 +234,7 @@ def extract_mime(
return matched_type

video_types = ("audio/","video/")
if supplied_type.startswith(video_types) or supplied_type.startswith("application/ogg"):
if supplied_type.startswith(video_types) or supplied_type == "application/ogg":
matched_type = get_audio_video_mime(resource_header)
if matched_type in supported_types:
return matched_type
Expand Down