change: encode non-JPEG images as PNGs instead of JPEG2000 images

This uses Pillow to re-encode any non-JPEG image as a PNG, then inlines that image's IDAT chunks as a FlateDecode-filtered stream, which allows us to reuse the work from the PNG encoder. This means we'll re-encode some PNGs we could have passed through directly, but that could be changed later. Alpha layers continue to be handled separately, as appears to be required by the PDF spec.
Kozea · Oct 29, 2021 · 730d4f3 · 730d4f3
1 parent a149af9
commit 730d4f3
Showing 1 changed file with 50 additions and 16 deletions.
diff --git a/weasyprint/document.py b/weasyprint/document.py
@@ -10,6 +10,7 @@
 import io
 import math
 import shutil
+import struct
 import zlib
 from os.path import basename
 from urllib.parse import unquote, urlsplit
@@ -255,19 +256,34 @@ def add_group(self, bounding_box):
         self._x_objects[group.id] = group
         return group
 
-    def _save_jpeg2000(self, pillow_image, optimize):
+    def _save_png(self, pillow_image, optimize):
         image_file = io.BytesIO()
-        try:
-            pillow_image.save(image_file, format='JPEG2000', optimize=optimize)
-        except OSError:
-            # Set number of resolutions to 1 because of
-            # https://github.com/uclouvain/openjpeg/issues/215
-            image_file.seek(0)
-            pillow_image.save(
-                image_file, format='JPEG2000', optimize=optimize,
-                num_resolutions=1)
+        pillow_image.save(image_file, format='PNG', optimize=optimize)
         return image_file
 
+    def _get_png_data(self, image_file):
+        """Return the joined payload of all IDAT chunks in a PNG file."""
+        # Skip the 8-byte PNG signature. It isn't checked because the file is
+        # output from Pillow, otherwise we should actually validate it.
+        image_file.seek(8)
+
+        idat_chunks = []
+        raw_chunk_len = image_file.read(4)
+        # PNG files consist of a series of chunks.
+        while raw_chunk_len:
+            # Each chunk begins with its data length (four bytes, may be zero),
+            # then its type (four ASCII characters), then the data, then four
+            # bytes of a CRC.
+            chunk_len, = struct.unpack('!I', raw_chunk_len)
+            chunk_type = image_file.read(4)
+            chunk_data = image_file.read(chunk_len)
+            if chunk_type == b'IDAT':
+                idat_chunks.append(chunk_data)
+            # Skip the CRC without checking it, we assume this is a valid PNG.
+            image_file.read(4)
+            raw_chunk_len = image_file.read(4)
+        return b''.join(idat_chunks)
+
     def add_image(self, pillow_image, image_rendering, optimize_size):
         image_name = f'i{pillow_image.id}'
         self._x_objects[image_name] = None  # Set by write_pdf
@@ -306,24 +322,42 @@ def add_image(self, pillow_image, image_rendering, optimize_size):
             extra['Filter'] = '/DCTDecode'
             image_file = io.BytesIO()
             pillow_image.save(image_file, format='JPEG', optimize=optimize)
+            stream = [image_file.getvalue()]
         else:
-            extra['Filter'] = '/JPXDecode'
+            extra['Filter'] = '/FlateDecode'
+            extra['DecodeParms'] = pydyf.Dictionary({
+                # Predictor 15 specifies that we're providing PNG data,
+                # ostensibly using an "optimum predictor", but doesn't actually
+                # matter as long as the predictor value is 10+ according to the
+                # spec. (Other PNG predictor values assert that we're using
+                # specific predictors that we don't want to commit to, but
+                # "optimum" can vary.)
+                'Predictor': 15,
+                'Columns': pillow_image.width,
+            })
+            if pillow_image.mode in ('RGB', 'RGBA'):
+                # Defaults to 1.
+                extra['DecodeParms']['Colors'] = 3
             if pillow_image.mode in ('RGBA', 'LA'):
                 alpha = pillow_image.getchannel('A')
                 pillow_image = pillow_image.convert(pillow_image.mode[:-1])
-                alpha_file = self._save_jpeg2000(alpha, optimize)
-                extra['SMask'] = pydyf.Stream([alpha_file.getvalue()], extra={
-                    'Filter': '/JPXDecode',
+                alpha_file = self._save_png(alpha, optimize)
+                extra['SMask'] = pydyf.Stream([self._get_png_data(alpha_file)], extra={
+                    'Filter': '/FlateDecode',
                     'Type': '/XObject',
                     'Subtype': '/Image',
+                    'DecodeParms': pydyf.Dictionary({
+                        'Predictor': 15,
+                        'Columns': pillow_image.width,
+                    }),
                     'Width': pillow_image.width,
                     'Height': pillow_image.height,
                     'ColorSpace': '/DeviceGray',
                     'BitsPerComponent': 8,
                     'Interpolate': interpolate,
                 })
-            image_file = self._save_jpeg2000(pillow_image, optimize)
-        stream = [image_file.getvalue()]
+            image_file = self._save_png(pillow_image, optimize)
+            stream = [self._get_png_data(image_file)]
 
         xobject = pydyf.Stream(stream, extra=extra)
         self._images[image_name] = xobject