Skip to content

Commit

Permalink
ROB: Cope with some issues in pillow (#2595)
Browse files Browse the repository at this point in the history
Closes #2265.
Closes #2266.
  • Loading branch information
pubpub-zz committed Apr 16, 2024
1 parent 2d8347b commit b171422
Show file tree
Hide file tree
Showing 3 changed files with 78 additions and 12 deletions.
18 changes: 9 additions & 9 deletions pypdf/_xobj_image_helpers.py
Expand Up @@ -73,9 +73,7 @@ def _get_imagemode(
color_components = cast(int, icc_profile["/N"])
color_space = icc_profile.get("/Alternate", "")
elif color_space[0] == "/Indexed":
color_space = color_space[1]
if isinstance(color_space, IndirectObject):
color_space = color_space.get_object()
color_space = color_space[1].get_object()
mode2, invert_color = _get_imagemode(
color_space, color_components, prev_mode, depth + 1
)
Expand Down Expand Up @@ -291,13 +289,15 @@ def _handle_jpx(
if img1.mode == "RGBA" and mode == "RGB":
mode = "RGBA"
# we need to convert to the good mode
try:
if img1.mode != mode:
img = Image.frombytes(mode, img1.size, img1.tobytes())
else:
img = img1
except OSError:
if img1.mode == mode or {img1.mode, mode} == {"L", "P"}: # compare (unordered) sets
# L,P are indexed modes which should not be changed.
img = img1
elif {img1.mode, mode} == {"RGBA", "CMYK"}:
# RGBA and CMYK are both 4-bytes-per-pixel encodings, so the raw
# bytes are kept and only the mode they are read with is corrected
img = Image.frombytes(mode, img1.size, img1.tobytes())
else: # pragma: no cover
img = img1.convert(mode)
# for CMYK conversion :
# https://stackoverflow.com/questions/38855022/conversion-from-cmyk-to-rgb-with-pillow-is-different-from-that-of-photoshop
# not implemented for the moment as I need to get properly the ICC
Expand Down
9 changes: 6 additions & 3 deletions pypdf/filters.py
Expand Up @@ -894,10 +894,13 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes,
img_byte_arr = BytesIO()
try:
img.save(img_byte_arr, format=image_format)
except OSError: # pragma: no cover
# odd error
except OSError: # pragma: no cover # covered with pillow 10.3
# in case of an error, we convert to RGBA and then save as PNG
img1 = img.convert("RGBA")
image_format = "PNG"
extension = ".png"
img_byte_arr = BytesIO()
img.save(img_byte_arr, format=image_format)
img1.save(img_byte_arr, format=image_format)
data = img_byte_arr.getvalue()

try: # temporary try/except until other fixes of images
Expand Down
63 changes: 63 additions & 0 deletions tests/test_images.py
Expand Up @@ -8,6 +8,7 @@
from io import BytesIO
from pathlib import Path
from typing import Union
from zipfile import ZipFile

import pytest
from PIL import Image, ImageChops, ImageDraw
Expand Down Expand Up @@ -283,3 +284,65 @@ def test_data_with_lf():
name = "iss2343b0.png"
img = Image.open(BytesIO(get_data_from_url(url, name=name)))
assert image_similarity(reader.pages[8].images[9].image, img) == 1.0


@pytest.mark.enable_socket()
def test_oserror():
    """
    Regression test for #2265: accessing the image must not raise.

    Pillow may mistranslate this image, so the decoded picture is not
    guaranteed to be correct. For that reason the result is deliberately
    not compared to a reference with `image_similarity`; the test passes
    as soon as the image can be accessed without an exception.
    """
    pdf_bytes = get_data_from_url(
        "https://github.com/py-pdf/pypdf/files/13127130/Binance.discovery.responses.2.gov.uscourts.dcd.256060.140.1.pdf",
        name="iss2265.pdf",
    )
    third_page = PdfReader(BytesIO(pdf_bytes)).pages[2]
    # Bare expression on purpose: only the access itself is under test.
    third_page.images[1]


@pytest.mark.parametrize(
    ("pdf", "pdf_name", "images", "images_name", "filtr"),
    [
        (
            "https://github.com/py-pdf/pypdf/files/13127197/FTX.Claim.SC30.01072023101624File595287144.pdf",
            "iss2266a.pdf",
            "https://github.com/py-pdf/pypdf/files/14967061/iss2266a_images.zip",
            "iss2266a_images.zip",
            # arbitrary (page, image) sample to keep the test fast
            ((0, 0), (1, 0), (4, 0), (9, 0)),
        ),
        (
            "https://github.com/py-pdf/pypdf/files/13127242/FTX.Claim.Skybridge.Capital.30062023113350File971325116.pdf",
            "iss2266b.pdf",
            "https://github.com/py-pdf/pypdf/files/14967099/iss2266b_images.zip",
            "iss2266b_images.zip",
            # arbitrary (page, image) sample to keep the test fast
            ((0, 0), (1, 0), (4, 0), (9, 0)),
        ),
    ],
)
@pytest.mark.enable_socket()
def test_corrupted_jpeg_iss2266(pdf, pdf_name, images, images_name, filtr):
    """
    Compare images extracted from a PDF with reference images (cf. #2266).

    The reference images are read from a zip archive whose member names
    follow ``image_<page>_<index>_<name>``; the page number and image index
    are parsed back from each member name. When ``filtr`` is not ``None``,
    only the ``(page, index)`` pairs it contains are checked.

    Code to create the zip file::

        import zipfile
        from io import BytesIO

        import pypdf

        with pypdf.PdfReader("____inputfile___") as r:
            with zipfile.ZipFile("__outputzip___", "w") as z:
                for p in r.pages:
                    for ii, i in enumerate(p.images):
                        print(i.name)
                        b = BytesIO()
                        i.image.save(b, "JPEG")
                        z.writestr(
                            f"image_{p.page_number}_{ii}_{i.name}",
                            b.getbuffer(),
                        )
    """
    reader = PdfReader(BytesIO(get_data_from_url(pdf, name=pdf_name)))
    print(pdf_name, images_name)  # noqa: T201
    archive = BytesIO(get_data_from_url(images, name=images_name))
    with ZipFile(archive, "r") as zf:
        for member in zf.namelist():
            parts = member.split("_")
            page_no, image_no = int(parts[1]), int(parts[2])
            if filtr is None or (page_no, image_no) in filtr:
                print(member)  # noqa: T201
                expected = Image.open(BytesIO(zf.read(member)))
                extracted = reader.pages[page_no].images[image_no].image
                assert image_similarity(extracted, expected) >= 0.99

0 comments on commit b171422

Please sign in to comment.