Skip to content

Commit

Permalink
ROB: improve inline image extraction
Browse files Browse the repository at this point in the history
  • Loading branch information
pubpub-zz committed May 3, 2024
1 parent e92b20e commit b449664
Show file tree
Hide file tree
Showing 3 changed files with 294 additions and 59 deletions.
99 changes: 40 additions & 59 deletions pypdf/generic/_data_structures.py
Expand Up @@ -49,7 +49,6 @@

from .._protocols import PdfReaderProtocol, PdfWriterProtocol, XmpInformationProtocol
from .._utils import (
WHITESPACES,
StreamType,
b_,
deprecate_no_replacement,
Expand Down Expand Up @@ -81,6 +80,13 @@
TextStringObject,
)
from ._fit import Fit
from ._image_inline import (
extract_inline_A85,
extract_inline_AHex,
extract_inline_DCT,
extract_inline_default,
extract_inline_RL,
)
from ._utils import read_hex_string_from_stream, read_string_from_stream

if sys.version_info >= (3, 11):
Expand Down Expand Up @@ -1152,65 +1158,40 @@ def _read_inline_image(self, stream: StreamType) -> Dict[str, Any]:
# left at beginning of ID
tmp = stream.read(3)
assert tmp[:2] == b"ID"
data = BytesIO()
# Read the inline image, while checking for EI (End Image) operator.
while True:
# Read 8 kB at a time and check if the chunk contains the E operator.
buf = stream.read(8192)
# We have reached the end of the stream, but haven't found the EI operator.
if not buf:
raise PdfReadError("Unexpected end of stream")
loc = buf.find(
b"E"
) # we can not look straight for "EI" because it may not have been loaded in the buffer

if loc == -1:
data.write(buf)
filtr = settings.get("/F", "not set")
# print("inline", stream.tell(),filtr,"*",settings)
if isinstance(filtr, list):
filtr = filtr[0] # used forencoding
if filtr == "AHx":
data = extract_inline_AHex(stream)
elif filtr == "A85":
data = extract_inline_A85(stream)
elif filtr == "RL":
data = extract_inline_RL(stream)
elif filtr == "DCT":
data = extract_inline_DCT(stream)
elif filtr == "not set":
cs = settings["/CS"]
if cs == "/I" or cs == "/G":
lcs = 1
elif cs == "/RGB":
lcs = 3
elif cs == "/CMYK":
lcs = 4
else:
# Write out everything before the E.
data.write(buf[0:loc])

# Seek back in the stream to read the E next.
stream.seek(loc - len(buf), 1)
tok = stream.read(1) # E of "EI"
# Check for End Image
tok2 = stream.read(1) # I of "EI"
if tok2 != b"I":
stream.seek(-1, 1)
data.write(tok)
continue
# for further debug : print("!!!!",buf[loc-1:loc+10])
info = tok + tok2
tok3 = stream.read(
1
) # possible space after "EI" may not been loaded in buf
if tok3 not in WHITESPACES:
stream.seek(-2, 1) # to step back on I
data.write(tok)
elif buf[loc - 1 : loc] in WHITESPACES: # and tok3 in WHITESPACES:
# Data can contain [\s]EI[\s]: 4 chars sufficient, checking Q operator not required.
while tok3 in WHITESPACES:
# needed ???? : info += tok3
tok3 = stream.read(1)
stream.seek(-1, 1)
# we do not insert EI
break
else: # buf[loc - 1 : loc] not in WHITESPACES and tok3 in WHITESPACES:
# Data can contain [!\s]EI[\s], so check for Q or EMC operator is required to have 4 chars.
while tok3 in WHITESPACES:
info += tok3
tok3 = stream.read(1)
stream.seek(-1, 1)
if tok3 == b"Q":
break
elif tok3 == b"E":
ope = stream.read(3)
stream.seek(-3, 1)
if ope == b"EMC":
break
else:
data.write(info)
return {"settings": settings, "data": data.getvalue()}
raise PdfReadError("Invalid CS value:", cs)
data = stream.read(
cast(int, settings["/W"]) * cast(int, settings["/H"]) * lcs
)
ei = read_non_whitespace(stream)
ei += stream.read(1)
stream.seek(-2, 1)
else:
data = extract_inline_default(stream)

ei = stream.read(2)
assert ei == b"EI"
return {"settings": settings, "data": data}

# This overrides the parent method:
def get_data(self) -> bytes:
Expand Down
242 changes: 242 additions & 0 deletions pypdf/generic/_image_inline.py
@@ -0,0 +1,242 @@
# Copyright (c) 2024, PubPub-ZZ
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# * The name of the author may not be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

import logging
from io import BytesIO

from .._utils import (
WHITESPACES,
StreamType,
read_non_whitespace,
)
from ..errors import PdfReadError

logger = logging.getLogger(__name__)

BUFFER_SIZE = 8192


def extract_inline_AHex(stream: StreamType) -> bytes:
    """
    Extract the ASCIIHex-encoded data of an inline image.

    Data is read up to and including the ``>`` end-of-data marker. If a
    chunk contains no ``>`` but does contain ``EI``, the ``EI`` operator is
    used as a fallback terminator and the whitespace preceding it is dropped.
    On success the stream is left positioned on the ``E`` of ``EI``.

    Args:
        stream: Seekable binary stream positioned at the start of the data.

    Returns:
        The encoded bytes (including the ``>`` marker when one was found).

    Raises:
        PdfReadError: If the stream ends before a terminator is found, or if
            the data is not followed by ``EI``.
    """
    data: bytes = b""
    while True:
        buf = stream.read(BUFFER_SIZE)
        if not buf:
            raise PdfReadError("Unexpected end of stream")
        loc = buf.find(b">")
        if loc >= 0:  # found >
            data += buf[: loc + 1]
            # Step back relative to the current position, by what was really
            # read: the last chunk is usually shorter than BUFFER_SIZE.
            stream.seek(loc + 1 - len(buf), 1)
            break
        loc = buf.find(b"EI")
        if loc >= 0:  # found EI: fallback terminator
            data += buf[:loc]
            # Drop the whitespace separating the data from EI; it may extend
            # into bytes appended from a previous chunk.
            while data and data[-1:] in WHITESPACES:
                data = data[:-1]
            stream.seek(loc - len(buf), 1)  # back onto the E of EI
            break
        if len(buf) == 1:
            # Only the re-read overlap byte is left: no progress is possible.
            raise PdfReadError("Unexpected end of stream")
        # Neither > nor EI found: keep the last byte for the next round so an
        # "EI" split across two chunks is still detected.
        data += buf[:-1]
        stream.seek(-1, 1)

    # Peek at the next two non-whitespace bytes without consuming them.
    ei = read_non_whitespace(stream)
    ei += stream.read(1)
    stream.seek(-2, 1)
    if ei != b"EI":
        raise PdfReadError("EI stream not found")
    return data


def extract_inline_A85(stream: StreamType) -> bytes:
    """
    Extract the ASCII85-encoded data of an inline image.

    Data is read up to and including the ``~>`` end-of-data marker
    (see §3.3.2 of the PDF reference 1.7). On success the stream is left
    positioned on the ``E`` of the following ``EI`` operator.

    Args:
        stream: Seekable binary stream positioned at the start of the data.

    Returns:
        The encoded bytes, including the trailing ``~>``.

    Raises:
        PdfReadError: If the stream ends before ``~>`` is found, or if the
            data is not followed by ``EI``.
    """
    data: bytes = b""
    while True:
        buf = stream.read(BUFFER_SIZE)
        if not buf:
            raise PdfReadError("Unexpected end of stream")
        loc = buf.find(b"~>")
        if loc >= 0:  # found!
            data += buf[: loc + 2]
            # Rewind by the number of bytes actually read past the marker;
            # the last chunk is usually shorter than BUFFER_SIZE.
            stream.seek(loc + 2 - len(buf), 1)
            break
        if len(buf) == 1:
            # Only the re-read overlap byte is left: no progress is possible.
            raise PdfReadError("Unexpected end of stream")
        # Keep the last byte for the next round in case the chunk boundary
        # falls in the middle of "~>".
        data += buf[:-1]
        stream.seek(-1, 1)

    # Peek at the next two non-whitespace bytes without consuming them.
    ei = read_non_whitespace(stream)
    ei += stream.read(1)
    stream.seek(-2, 1)
    if ei != b"EI":
        raise PdfReadError("EI stream not found")
    return data


def extract_inline_RL(stream: StreamType) -> bytes:
    """
    Extract the RunLength-encoded data of an inline image.

    Data is read up to and including the first ``0x80`` byte, which is taken
    as the end-of-data marker (see §3.3.4 of the PDF reference 1.7). On
    success the stream is left positioned on the ``E`` of the following
    ``EI`` operator.

    Args:
        stream: Seekable binary stream positioned at the start of the data.

    Returns:
        The encoded bytes, including the trailing ``0x80``.

    Raises:
        PdfReadError: If the stream ends before ``0x80`` is found, or if the
            data is not followed by ``EI``.
    """
    # NOTE(review): the scan stops at the first 0x80 byte without decoding
    # the runs; a 0x80 occurring inside a run's payload would presumably end
    # the data early - the EI check below would then most likely fail.
    data: bytes = b""
    while True:
        buf = stream.read(BUFFER_SIZE)
        if not buf:
            raise PdfReadError("Unexpected end of stream")
        loc = buf.find(b"\x80")
        if loc >= 0:  # found
            # Append (not assign) so earlier chunks are kept.
            data += buf[: loc + 1]
            # Rewind by the number of bytes actually read past the marker;
            # the last chunk is usually shorter than BUFFER_SIZE.
            stream.seek(loc + 1 - len(buf), 1)
            break
        # Single-byte marker: it cannot straddle two chunks, keep everything.
        data += buf

    # Peek at the next two non-whitespace bytes without consuming them.
    ei = read_non_whitespace(stream)
    ei += stream.read(1)
    stream.seek(-2, 1)
    if ei != b"EI":
        raise PdfReadError("EI stream not found")
    return data


def extract_inline_DCT(stream: StreamType) -> bytes:
    """
    Extract the DCT (JPEG) data of an inline image.

    The JPEG byte stream is walked marker by marker (ID / size / payload)
    until the ``FF D9`` end marker, see
    https://www.digicamsoft.com/itu/itu-t81-36.html
    On success the stream is left positioned on the ``E`` of the following
    ``EI`` operator.

    Args:
        stream: Seekable binary stream positioned at the start of the data.

    Returns:
        The JPEG bytes, up to and including ``FF D9``.

    Raises:
        PdfReadError: If the stream ends before ``FF D9`` is found, or if the
            data is not followed by ``EI``.
    """
    # Markers that are followed by a two-byte big-endian segment length.
    sized_markers = (
        b"\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc9\xca\xcb\xcc\xcd\xce\xcf"
        b"\xda\xdb\xdc\xdd\xde\xdf"
        b"\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xfe"
    )
    # bytearray keeps the byte-by-byte accumulation linear.
    data = bytearray()
    while True:
        c = stream.read(1)
        if not c:
            # Without this guard an exhausted stream would spin forever.
            raise PdfReadError("Unexpected end of stream")
        data += c
        if c != b"\xff":
            continue
        c = stream.read(1)
        if not c:
            raise PdfReadError("Unexpected end of stream")
        if c == b"\xff":
            # Repeated FF: re-read it as the start of the next marker.
            stream.seek(-1, 1)
        elif c == b"\xd9":  # end
            data += c
            break
        elif c in sized_markers:
            data += c
            size_field = stream.read(2)
            if len(size_field) < 2:
                raise PdfReadError("Unexpected end of stream")
            data += size_field
            # Indexing bytes already yields ints on Python 3.
            sz = size_field[0] * 256 + size_field[1]
            # The length counts its own two bytes; never pass a negative
            # size to read(), which would consume the rest of the stream.
            data += stream.read(max(sz - 2, 0))
        else:
            # Stuffed byte (FF 00) or a marker without payload: copy as is.
            data += c

    # Peek at the next two non-whitespace bytes without consuming them.
    ei = read_non_whitespace(stream)
    ei += stream.read(1)
    stream.seek(-2, 1)
    if ei != b"EI":
        raise PdfReadError("EI stream not found")
    return bytes(data)


def extract_inline_default(stream: StreamType) -> bytes:
    """
    Extract inline image data by searching for the ``EI`` operator.

    Legacy method, used by default when no filter-specific extractor
    applies. An ``EI`` is accepted as the end of the image when it is
    followed by whitespace and either preceded by whitespace, or followed
    (after the whitespace) by ``Q`` or ``EMC``. On success the stream is left
    positioned on the ``E`` of the terminating ``EI``.

    Args:
        stream: Seekable binary stream positioned at the start of the data.

    Returns:
        The bytes located before the terminating ``EI``.

    Raises:
        PdfReadError: If the stream ends before a terminating ``EI`` is found.
    """
    data = BytesIO()
    # Read the inline image, while checking for EI (End Image) operator.
    while True:
        buf = stream.read(BUFFER_SIZE)
        if not buf:
            raise PdfReadError("Unexpected end of stream")
        # We can not look straight for "EI" because the "I" may not have
        # been loaded in the buffer.
        loc = buf.find(b"E")
        if loc == -1:
            data.write(buf)
            continue

        # Write out everything before the E.
        data.write(buf[0:loc])

        # Seek back in the stream to read the E next.
        stream.seek(loc - len(buf), 1)
        saved_pos = stream.tell()
        tok = stream.read(1)  # E of "EI"
        tok2 = stream.read(1)  # I of "EI"
        if tok2 != b"I":
            # Not EI: keep the E and resume right after it. An absolute seek
            # stays correct even when the read above hit the end of stream.
            stream.seek(saved_pos + 1, 0)
            data.write(tok)
            continue

        info = tok + tok2
        # The possible space after "EI" may not have been loaded in buf.
        tok3 = stream.read(1)
        if tok3 not in WHITESPACES:
            # "EI" is part of the data: keep the E and resume on the I.
            stream.seek(saved_pos + 1, 0)
            data.write(tok)
            continue

        if buf[loc - 1 : loc] in WHITESPACES:
            # Data can contain [\s]EI[\s]: 4 chars sufficient, checking the
            # Q operator is not required. EI is not inserted in the data, and
            # the position is restored after the loop, so there is no need to
            # consume the trailing whitespace here.
            break

        # Data can contain [!\s]EI[\s], so a following Q or EMC operator is
        # required to have 4 chars.
        while tok3 and tok3 in WHITESPACES:
            info += tok3
            tok3 = stream.read(1)
        if tok3:
            stream.seek(-1, 1)  # step back onto the first non-whitespace byte
        if tok3 == b"Q":
            break
        if tok3 == b"E":
            ope_pos = stream.tell()
            ope = stream.read(3)
            stream.seek(ope_pos, 0)  # exact rewind, even after a short read
            if ope == b"EMC":
                break
        # Not a terminator: the consumed "EI" and whitespace are image data
        # and must be kept, also when the next token starts with "E".
        data.write(info)

    stream.seek(saved_pos, 0)
    return data.getvalue()
12 changes: 12 additions & 0 deletions tests/test_images.py
Expand Up @@ -346,3 +346,15 @@ def test_corrupted_jpeg_iss2266(pdf, pdf_name, images, images_name, filtr):
print(fn) # noqa: T201
img = Image.open(BytesIO(zf.read(fn)))
assert image_similarity(reader.pages[p].images[i].image, img) >= 0.99


@pytest.mark.enable_socket()
def test_inline_image_extraction():
    """Text extraction works on pages holding inline images (cf. #2598)."""
    pdf_bytes = get_data_from_url(
        "https://github.com/py-pdf/pypdf/files/14982414/lebo102.pdf",
        name="iss2598.pdf",
    )
    reader = PdfReader(BytesIO(pdf_bytes))
    # Nothing to assert: the test passes as long as extraction does not raise.
    for page_number in (1, 2, 3):
        reader.pages[page_number].extract_text()

0 comments on commit b449664

Please sign in to comment.