ROB: improve inline image extraction

closes py-pdf#2598
pubpub-zz · May 3, 2024 · b449664 · b449664
1 parent e92b20e
commit b449664
Show file tree

Hide file tree

Showing 3 changed files with 294 additions and 59 deletions.
diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py
@@ -49,7 +49,6 @@
 
 from .._protocols import PdfReaderProtocol, PdfWriterProtocol, XmpInformationProtocol
 from .._utils import (
-    WHITESPACES,
     StreamType,
     b_,
     deprecate_no_replacement,
@@ -81,6 +80,13 @@
     TextStringObject,
 )
 from ._fit import Fit
+from ._image_inline import (
+    extract_inline_A85,
+    extract_inline_AHex,
+    extract_inline_DCT,
+    extract_inline_default,
+    extract_inline_RL,
+)
 from ._utils import read_hex_string_from_stream, read_string_from_stream
 
 if sys.version_info >= (3, 11):
@@ -1152,65 +1158,40 @@ def _read_inline_image(self, stream: StreamType) -> Dict[str, Any]:
         # left at beginning of ID
         tmp = stream.read(3)
         assert tmp[:2] == b"ID"
-        data = BytesIO()
-        # Read the inline image, while checking for EI (End Image) operator.
-        while True:
-            # Read 8 kB at a time and check if the chunk contains the E operator.
-            buf = stream.read(8192)
-            # We have reached the end of the stream, but haven't found the EI operator.
-            if not buf:
-                raise PdfReadError("Unexpected end of stream")
-            loc = buf.find(
-                b"E"
-            )  # we can not look straight for "EI" because it may not have been loaded in the buffer
-
-            if loc == -1:
-                data.write(buf)
+        filtr = settings.get("/F", "not set")
+        # print("inline", stream.tell(),filtr,"*",settings)
+        if isinstance(filtr, list):
+            filtr = filtr[0]  # used forencoding
+        if filtr == "AHx":
+            data = extract_inline_AHex(stream)
+        elif filtr == "A85":
+            data = extract_inline_A85(stream)
+        elif filtr == "RL":
+            data = extract_inline_RL(stream)
+        elif filtr == "DCT":
+            data = extract_inline_DCT(stream)
+        elif filtr == "not set":
+            cs = settings["/CS"]
+            if cs == "/I" or cs == "/G":
+                lcs = 1
+            elif cs == "/RGB":
+                lcs = 3
+            elif cs == "/CMYK":
+                lcs = 4
             else:
-                # Write out everything before the E.
-                data.write(buf[0:loc])
-
-                # Seek back in the stream to read the E next.
-                stream.seek(loc - len(buf), 1)
-                tok = stream.read(1)  # E of "EI"
-                # Check for End Image
-                tok2 = stream.read(1)  # I of "EI"
-                if tok2 != b"I":
-                    stream.seek(-1, 1)
-                    data.write(tok)
-                    continue
-                # for further debug : print("!!!!",buf[loc-1:loc+10])
-                info = tok + tok2
-                tok3 = stream.read(
-                    1
-                )  # possible space after "EI" may not been loaded  in buf
-                if tok3 not in WHITESPACES:
-                    stream.seek(-2, 1)  # to step back on I
-                    data.write(tok)
-                elif buf[loc - 1 : loc] in WHITESPACES:  # and tok3 in WHITESPACES:
-                    # Data can contain [\s]EI[\s]: 4 chars sufficient, checking Q operator not required.
-                    while tok3 in WHITESPACES:
-                        # needed ???? : info += tok3
-                        tok3 = stream.read(1)
-                    stream.seek(-1, 1)
-                    # we do not insert EI
-                    break
-                else:  # buf[loc - 1 : loc] not in WHITESPACES and tok3 in WHITESPACES:
-                    # Data can contain [!\s]EI[\s],  so check for Q or EMC operator is required to have 4 chars.
-                    while tok3 in WHITESPACES:
-                        info += tok3
-                        tok3 = stream.read(1)
-                    stream.seek(-1, 1)
-                    if tok3 == b"Q":
-                        break
-                    elif tok3 == b"E":
-                        ope = stream.read(3)
-                        stream.seek(-3, 1)
-                        if ope == b"EMC":
-                            break
-                    else:
-                        data.write(info)
-        return {"settings": settings, "data": data.getvalue()}
+                raise PdfReadError("Invalid CS value:", cs)
+            data = stream.read(
+                cast(int, settings["/W"]) * cast(int, settings["/H"]) * lcs
+            )
+            ei = read_non_whitespace(stream)
+            ei += stream.read(1)
+            stream.seek(-2, 1)
+        else:
+            data = extract_inline_default(stream)
+
+        ei = stream.read(2)
+        assert ei == b"EI"
+        return {"settings": settings, "data": data}
 
     # This overrides the parent method:
     def get_data(self) -> bytes:

diff --git a/pypdf/generic/_image_inline.py b/pypdf/generic/_image_inline.py
@@ -0,0 +1,242 @@
+# Copyright (c) 2024, PubPub-ZZ
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# * Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# * The name of the author may not be used to endorse or promote products
+# derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+import logging
+from io import BytesIO
+
+from .._utils import (
+    WHITESPACES,
+    StreamType,
+    read_non_whitespace,
+)
+from ..errors import PdfReadError
+
+logger = logging.getLogger(__name__)
+
+BUFFER_SIZE = 8192
+
+
+def extract_inline_AHex(stream: StreamType) -> bytes:
+    """
+    Extract the ASCIIHex-encoded data of an inline image.
+
+    The data is read up to and including the ``>`` end-of-data marker.  When
+    ``EI`` shows up before any ``>``, it is used as a backup delimiter and the
+    whitespace separating it from the data is dropped.
+
+    Args:
+        stream: stream positioned on the first byte of the image data.
+
+    Returns:
+        The encoded data (including the ``>`` marker when it was found).
+
+    Raises:
+        PdfReadError: the stream ended before a delimiter was found, or the
+            data is not followed by ``EI``.
+
+    On return the stream is positioned on the ``EI`` operator.
+    """
+    data: bytes = b""
+    while True:
+        buf = stream.read(BUFFER_SIZE)
+        if not buf:
+            raise PdfReadError("Unexpected end of stream")
+        gt_loc = buf.find(b">")
+        ei_loc = buf.find(b"EI")
+        # "EI" is not valid hex data, so a ">" located after it belongs to
+        # the rest of the content stream, not to this image.
+        if gt_loc >= 0 and (ei_loc < 0 or gt_loc < ei_loc):  # found >
+            data += buf[: gt_loc + 1]
+            # Step back relative to what was actually read: the last chunk
+            # is usually shorter than BUFFER_SIZE.
+            stream.seek(gt_loc + 1 - len(buf), 1)
+            break
+        if ei_loc >= 0:  # found EI first: backup delimiter
+            data += buf[:ei_loc]
+            # Drop the whitespace between the data and EI, even when it was
+            # read as part of a previous chunk.
+            end = len(data)
+            while end > 0 and data[end - 1 : end] in WHITESPACES:
+                end -= 1
+            data = data[:end]
+            stream.seek(ei_loc - len(buf), 1)
+            break
+        if len(buf) == 1:
+            # Only the byte carried over from the previous chunk is left:
+            # stepping back once more would loop forever.
+            raise PdfReadError("Unexpected end of stream")
+        # Neither > nor EI found: re-read the last byte with the next chunk
+        # in case it is the E of an EI split across two chunks.
+        data += buf[:-1]
+        stream.seek(-1, 1)
+
+    ei = read_non_whitespace(stream)
+    ei += stream.read(1)
+    stream.seek(-2, 1)
+    if ei != b"EI":
+        raise PdfReadError("EI stream not found")
+    return data
+
+
+def extract_inline_A85(stream: StreamType) -> bytes:
+    """
+    Extract the ASCII85-encoded data of an inline image.
+
+    The data is read up to and including the ``~>`` end-of-data marker
+    (see §3.3.2 from PDF ref 1.7).
+
+    Args:
+        stream: stream positioned on the first byte of the image data.
+
+    Returns:
+        The encoded data, ``~>`` marker included.
+
+    Raises:
+        PdfReadError: the stream ended before ``~>`` was found, or the data
+            is not followed by ``EI``.
+
+    On return the stream is positioned on the ``EI`` operator.
+    """
+    data: bytes = b""
+    while True:
+        buf = stream.read(BUFFER_SIZE)
+        if not buf:
+            raise PdfReadError("Unexpected end of stream")
+        loc = buf.find(b"~>")
+        if loc >= 0:  # found!
+            data += buf[: loc + 2]
+            # Step back relative to what was actually read: the last chunk
+            # is usually shorter than BUFFER_SIZE.
+            stream.seek(loc + 2 - len(buf), 1)
+            break
+        if len(buf) == 1:
+            # Only the byte carried over from the previous chunk is left:
+            # stepping back once more would loop forever.
+            raise PdfReadError("Unexpected end of stream")
+        # Re-read the last byte with the next chunk in case the chunk
+        # boundary falls in the middle of ~>
+        data += buf[:-1]
+        stream.seek(-1, 1)
+
+    ei = read_non_whitespace(stream)
+    ei += stream.read(1)
+    stream.seek(-2, 1)
+    if ei != b"EI":
+        raise PdfReadError("EI stream not found")
+    return data
+
+
+def extract_inline_RL(stream: StreamType) -> bytes:
+    """
+    Extract the RunLength-encoded data of an inline image.
+
+    The data is read up to and including the first byte 128 (``0x80``),
+    the end-of-data marker (see §3.3.4 from PDF ref 1.7).
+
+    Args:
+        stream: stream positioned on the first byte of the image data.
+
+    Returns:
+        The encoded data, end-of-data byte included.
+
+    Raises:
+        PdfReadError: the stream ended before the marker was found, or the
+            data is not followed by ``EI``.
+
+    On return the stream is positioned on the ``EI`` operator.
+    """
+    data: bytes = b""
+    # NOTE(review): the first 0x80 byte is taken as the end-of-data marker;
+    # presumably the same value can also occur inside the encoded runs, in
+    # which case the EI check below fails - confirm against the spec.
+    while True:
+        buf = stream.read(BUFFER_SIZE)
+        if not buf:
+            raise PdfReadError("Unexpected end of stream")
+        loc = buf.find(b"\x80")
+        if loc >= 0:  # found
+            # Append to (not replace) what the previous chunks provided.
+            data += buf[: loc + 1]
+            # Step back relative to what was actually read: the last chunk
+            # is usually shorter than BUFFER_SIZE.
+            stream.seek(loc + 1 - len(buf), 1)
+            break
+        # The marker is a single byte, so it cannot be split across chunks.
+        data += buf
+
+    ei = read_non_whitespace(stream)
+    ei += stream.read(1)
+    stream.seek(-2, 1)
+    if ei != b"EI":
+        raise PdfReadError("EI stream not found")
+    return data
+
+
+def extract_inline_DCT(stream: StreamType) -> bytes:
+    """
+    Extract the DCT (JPEG) data of an inline image.
+
+    The JPEG structure is walked marker by marker (ID/size/data blocks) up
+    to the end-of-image marker ``FF D9``;
+    see https://www.digicamsoft.com/itu/itu-t81-36.html
+
+    Args:
+        stream: stream positioned on the first byte of the image data.
+
+    Returns:
+        The JPEG data, ``FF D9`` marker included.
+
+    Raises:
+        PdfReadError: the stream ended before ``FF D9`` was found, or the
+            data is not followed by ``EI``.
+
+    On return the stream is positioned on the ``EI`` operator.
+    """
+    # Markers that are followed by a 2-byte big-endian segment length.
+    segment_markers = (
+        b"\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc9\xca\xcb\xcc\xcd\xce\xcf"
+        b"\xda\xdb\xdc\xdd\xde\xdf"
+        b"\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xfe"
+    )
+    data = bytearray()
+    while True:
+        c = stream.read(1)
+        if not c:
+            raise PdfReadError("Unexpected end of stream")
+        data += c
+        if c != b"\xff":
+            continue
+        c = stream.read(1)
+        if not c:
+            # Must be checked first: an empty read would otherwise match
+            # the `in segment_markers` test below.
+            raise PdfReadError("Unexpected end of stream")
+        if c == b"\xff":
+            # Repeated FF: examine it again as the start of a marker.
+            stream.seek(-1, 1)
+        elif c == b"\x00":  # stuffing
+            data += c
+        elif c == b"\xd9":  # end
+            data += c
+            break
+        elif c in segment_markers:
+            data += c
+            size = stream.read(2)
+            if len(size) != 2:
+                raise PdfReadError("Unexpected end of stream")
+            data += size
+            # The length counts its own two bytes.
+            sz = int.from_bytes(size, "big")
+            # A corrupted length below 2 must not become read(-n), which
+            # would swallow the rest of the stream.
+            data += stream.read(max(sz - 2, 0))
+        else:
+            # Marker without a length field: copied as is.
+            data += c
+
+    ei = read_non_whitespace(stream)
+    ei += stream.read(1)
+    stream.seek(-2, 1)
+    if ei != b"EI":
+        raise PdfReadError("EI stream not found")
+    return bytes(data)
+
+
+def extract_inline_default(stream: StreamType) -> bytes:
+    """
+    Extract the data of an inline image by searching for the EI operator.
+
+    Legacy method, used by default.
+
+    The stream is scanned chunk by chunk for an ``E``; the surrounding bytes
+    then decide whether it starts the real ``EI`` operator or is image data.
+
+    Args:
+        stream: stream positioned on the first byte of the image data.
+
+    Returns:
+        The bytes collected before the accepted ``EI``.
+
+    Raises:
+        PdfReadError: the stream ended before an ``EI`` was accepted.
+
+    On return the stream is positioned on the ``E`` of the accepted ``EI``.
+    """
+    data = BytesIO()
+    # Read the inline image, while checking for EI (End Image) operator.
+    while True:
+        buf = stream.read(BUFFER_SIZE)
+        if not buf:
+            raise PdfReadError("Unexpected end of stream")
+        loc = buf.find(
+            b"E"
+        )  # we can not look straight for "EI" because it may not have been loaded in the buffer
+
+        if loc == -1:
+            data.write(buf)
+        else:
+            # Write out everything before the E.
+            data.write(buf[0:loc])
+
+            # Seek back in the stream to read the E next.
+            stream.seek(loc - len(buf), 1)
+            # Remember where this E is: the stream is moved back here once
+            # the loop is left.
+            saved_pos = stream.tell()
+            tok = stream.read(1)  # E of "EI"
+            # Check for End Image
+            tok2 = stream.read(1)  # I of "EI"
+            if tok2 != b"I":
+                # Plain data: keep the E and resume right after it.
+                stream.seek(-1, 1)
+                data.write(tok)
+                continue
+            # for further debug : print("!!!!",buf[loc-1:loc+10])
+            info = tok + tok2
+            tok3 = stream.read(
+                1
+            )  # possible space after "EI" may not been loaded  in buf
+            if tok3 not in WHITESPACES:
+                # "EI" not followed by whitespace is data: keep the E and
+                # resume on the I.
+                stream.seek(-2, 1)  # to step back on I
+                data.write(tok)
+            elif buf[loc - 1 : loc] in WHITESPACES:  # and tok3 in WHITESPACES:
+                # Data can contain [\s]EI[\s]: 4 chars sufficient, checking Q operator not required.
+                while tok3 in WHITESPACES:
+                    # needed ???? : info += tok3
+                    tok3 = stream.read(1)
+                stream.seek(-1, 1)
+                # we do not insert EI
+                break
+            else:  # buf[loc - 1 : loc] not in WHITESPACES and tok3 in WHITESPACES:
+                # Data can contain [!\s]EI[\s],  so check for Q or EMC operator is required to have 4 chars.
+                while tok3 in WHITESPACES:
+                    info += tok3
+                    tok3 = stream.read(1)
+                stream.seek(-1, 1)
+                if tok3 == b"Q":
+                    break
+                elif tok3 == b"E":
+                    ope = stream.read(3)
+                    stream.seek(-3, 1)
+                    if ope == b"EMC":
+                        break
+                    # NOTE(review): when the E does not start "EMC", `info`
+                    # ("EI" + whitespace) is neither written to data nor
+                    # re-read - looks like these bytes are lost; confirm.
+                else:
+                    data.write(info)
+    # Leave the stream on the E of the accepted EI.
+    stream.seek(saved_pos, 0)
+    return data.getvalue()
diff --git a/tests/test_images.py b/tests/test_images.py
@@ -346,3 +346,15 @@ def test_corrupted_jpeg_iss2266(pdf, pdf_name, images, images_name, filtr):
             print(fn)  # noqa: T201
             img = Image.open(BytesIO(zf.read(fn)))
             assert image_similarity(reader.pages[p].images[i].image, img) >= 0.99
+
+
+@pytest.mark.enable_socket()
+def test_inline_image_extraction():
+    """Regression test for #2598: text extraction on pages with inline images."""
+    pdf_bytes = get_data_from_url(
+        "https://github.com/py-pdf/pypdf/files/14982414/lebo102.pdf",
+        name="iss2598.pdf",
+    )
+    reader = PdfReader(BytesIO(pdf_bytes))
+    # Extraction must complete without raising: the inline images are parsed
+    # correctly.
+    for page_index in (1, 2, 3):
+        reader.pages[page_index].extract_text()