Skip to content

Commit

Permalink
TST: Increase Test coverage (#756)
Browse files Browse the repository at this point in the history
Adding unit Tests:

* xmp
* ConvertFunctionsToVirtualList
* PyPDF2.utils.hexStr
* Page operations with encoded file
* merging encrypted
* images

DOC: Comments to docstrings
STY: Remove vim comments

BUG: CCITTFaxDecode decodeParms can be an ArrayObject. 
          I don't know what a good solution would look like. For now it no longer throws an error, but the result might be wrong.
BUG: struct was not imported for Python 2.X
  • Loading branch information
MartinThoma committed Apr 15, 2022
1 parent 9d53ee8 commit 012709f
Show file tree
Hide file tree
Showing 20 changed files with 296 additions and 119 deletions.
12 changes: 9 additions & 3 deletions PyPDF2/filters.py
@@ -1,5 +1,3 @@
# vim: sw=4:expandtab:foldmethod=marker
#
# Copyright (c) 2006, Mathieu Fenniak
# All rights reserved.
#
Expand Down Expand Up @@ -40,7 +38,7 @@
from cStringIO import StringIO
else:
from io import StringIO
import struct
import struct

try:
import zlib
Expand Down Expand Up @@ -356,6 +354,10 @@ def decode(data, decodeParms=None):
class CCITTFaxDecode(object):
def decode(data, decodeParms=None, height=0):
if decodeParms:
from PyPDF2.generic import ArrayObject
if isinstance(decodeParms, ArrayObject):
if len(decodeParms) == 1:
decodeParms = decodeParms[0]
if decodeParms.get("/K", 1) == -1:
CCITTgroup = 4
else:
Expand Down Expand Up @@ -451,6 +453,10 @@ def _xobj_to_image(x_object_obj):
img_byte_arr = io.BytesIO()
img.save(img_byte_arr, format="PNG")
data = img_byte_arr.getvalue()
elif x_object_obj["/Filter"] in (["/LZWDecode"], ['/ASCII85Decode'], ['/CCITTFaxDecode']):
from PyPDF2.utils import b_
extension = ".png"
data = b_(data)
elif x_object_obj["/Filter"] == "/DCTDecode":
extension = ".jpg"
elif x_object_obj["/Filter"] == "/JPXDecode":
Expand Down
49 changes: 25 additions & 24 deletions PyPDF2/generic.py
Expand Up @@ -44,6 +44,8 @@
import decimal
import codecs

from PyPDF2.utils import ERR_STREAM_TRUNCATED_PREMATURELY

ObjectPrefix = b_('/<[tf(n%')
NumberSigns = b_('+-')
IndirectPattern = re.compile(b_(r"[+-]?(\d+)\s+(\d+)\s+R[^a-zA-Z]"))
Expand Down Expand Up @@ -199,17 +201,15 @@ def readFromStream(stream, pdf):
while True:
tok = stream.read(1)
if not tok:
# stream has truncated prematurely
raise PdfStreamError("Stream has ended unexpectedly")
raise PdfStreamError(ERR_STREAM_TRUNCATED_PREMATURELY)
if tok.isspace():
break
idnum += tok
generation = b_("")
while True:
tok = stream.read(1)
if not tok:
# stream has truncated prematurely
raise PdfStreamError("Stream has ended unexpectedly")
raise PdfStreamError(ERR_STREAM_TRUNCATED_PREMATURELY)
if tok.isspace():
if not generation:
continue
Expand Down Expand Up @@ -273,10 +273,11 @@ def readFromStream(stream):
readFromStream = staticmethod(readFromStream)


##
# Given a string (either a "str" or "unicode"), create a ByteStringObject or a
# TextStringObject to represent the string.
def createStringObject(string):
"""
Given a string (either a "str" or "unicode"), create a ByteStringObject or a
TextStringObject to represent the string.
"""
if isinstance(string, utils.string_type):
return TextStringObject(string)
elif isinstance(string, utils.bytes_type):
Expand Down Expand Up @@ -306,8 +307,7 @@ def readHexStringFromStream(stream):
while True:
tok = readNonWhitespace(stream)
if not tok:
# stream has truncated prematurely
raise PdfStreamError("Stream has ended unexpectedly")
raise PdfStreamError(ERR_STREAM_TRUNCATED_PREMATURELY)
if tok == b_(">"):
break
x += tok
Expand All @@ -328,8 +328,7 @@ def readStringFromStream(stream):
while True:
tok = stream.read(1)
if not tok:
# stream has truncated prematurely
raise PdfStreamError("Stream has ended unexpectedly")
raise PdfStreamError(ERR_STREAM_TRUNCATED_PREMATURELY)
if tok == b_("("):
parens += 1
elif tok == b_(")"):
Expand Down Expand Up @@ -392,16 +391,17 @@ def readStringFromStream(stream):
return createStringObject(txt)


##
# Represents a string object where the text encoding could not be determined.
# This occurs quite often, as the PDF spec doesn't provide an alternate way to
# represent strings -- for example, the encryption data stored in files (like
# /O) is clearly not text, but is still stored in a "String" object.
class ByteStringObject(utils.bytes_type, PdfObject):
"""
Represents a string object where the text encoding could not be determined.
This occurs quite often, as the PDF spec doesn't provide an alternate way to
represent strings -- for example, the encryption data stored in files (like
/O) is clearly not text, but is still stored in a "String" object.
"""

##
# For compatibility with TextStringObject.original_bytes. This method
# returns self.
# self.
original_bytes = property(lambda self: self)

def writeToStream(self, stream, encryption_key):
Expand All @@ -413,12 +413,14 @@ def writeToStream(self, stream, encryption_key):
stream.write(b_(">"))


##
# Represents a string object that has been decoded into a real unicode string.
# If read from a PDF document, this string appeared to match the
# PDFDocEncoding, or contained a UTF-16BE BOM mark to cause UTF-16 decoding to
# occur.
class TextStringObject(utils.string_type, PdfObject):
"""
Represents a string object that has been decoded into a real unicode string.
If read from a PDF document, this string appeared to match the
PDFDocEncoding, or contained a UTF-16BE BOM mark to cause UTF-16 decoding to
occur.
"""

autodetect_pdfdocencoding = False
autodetect_utf16 = False

Expand Down Expand Up @@ -569,8 +571,7 @@ def readFromStream(stream, pdf):
skipOverComment(stream)
continue
if not tok:
# stream has truncated prematurely
raise PdfStreamError("Stream has ended unexpectedly")
raise PdfStreamError(ERR_STREAM_TRUNCATED_PREMATURELY)

if debug: print(("Tok:", tok))
if tok == b_(">"):
Expand Down
2 changes: 0 additions & 2 deletions PyPDF2/merger.py
@@ -1,5 +1,3 @@
# vim: sw=4:expandtab:foldmethod=marker
#
# Copyright (c) 2006, Mathieu Fenniak
# All rights reserved.
#
Expand Down
9 changes: 1 addition & 8 deletions PyPDF2/pdf.py
@@ -1,7 +1,5 @@
# -*- coding: utf-8 -*-
#
# vim: sw=4:expandtab:foldmethod=marker
#
# Copyright (c) 2006, Mathieu Fenniak
# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com>
#
Expand Down Expand Up @@ -1637,7 +1635,7 @@ def _getObjectFromStream(self, indirectReference):
streamData.seek(0, 0)
lines = streamData.readlines()
for i in range(0, len(lines)):
print((lines[i]))
print(lines[i])
streamData.seek(pos, 0)
try:
obj = readObject(streamData, self)
Expand Down Expand Up @@ -2588,11 +2586,6 @@ def mergeRotatedScaledTranslatedPage(self, page2, rotation, scale, tx, ty, expan
ctm[1][0], ctm[1][1],
ctm[2][0], ctm[2][1]], expand)

##
# Applys a transformation matrix the page.
#
# @param ctm A 6 elements tuple containing the operands of the
# transformation matrix
def addTransformation(self, ctm):
"""
Applies a transformation matrix to the page.
Expand Down
7 changes: 3 additions & 4 deletions PyPDF2/utils.py
Expand Up @@ -39,7 +39,7 @@
except ImportError: # Py3
import builtins


ERR_STREAM_TRUNCATED_PREMATURELY = "Stream has ended unexpectedly"
xrange_fn = getattr(builtins, "xrange", range)
_basestring = getattr(builtins, "basestring", str)

Expand Down Expand Up @@ -122,7 +122,7 @@ def skipOverComment(stream):
def readUntilRegex(stream, regex, ignore_eof=False):
"""
Reads until the regular expression pattern matched (ignore the match)
Raise PdfStreamError on premature end-of-file.
:raises PdfStreamError: on premature end-of-file
:param bool ignore_eof: If true, ignore end-of-file and return immediately
"""
name = b_('')
Expand All @@ -133,7 +133,7 @@ def readUntilRegex(stream, regex, ignore_eof=False):
if ignore_eof:
return name
else:
raise PdfStreamError("Stream has ended unexpectedly")
raise PdfStreamError(ERR_STREAM_TRUNCATED_PREMATURELY)
m = regex.search(tok)
if m is not None:
name += tok[:m.start()]
Expand Down Expand Up @@ -242,7 +242,6 @@ def b_(s):
bc[s] = r
return r
except Exception:
print(s)
r = s.encode('utf-8')
if len(s) < 2:
bc[s] = r
Expand Down
Binary file added Resources/imagemagick-ASCII85Decode.pdf
Binary file not shown.
Binary file added Resources/imagemagick-CCITTFaxDecode.pdf
Binary file not shown.
Binary file added Resources/imagemagick-images.pdf
Binary file not shown.
Binary file added Resources/imagemagick-lzw.pdf
Binary file not shown.
Binary file added Resources/metadata.pdf
Binary file not shown.
34 changes: 17 additions & 17 deletions Tests/test_basic_features.py
Expand Up @@ -2,60 +2,60 @@

import pytest

from PyPDF2 import PdfFileWriter, PdfFileReader
from PyPDF2.utils import PdfReadError
from PyPDF2 import PdfFileReader, PdfFileWriter
from PyPDF2.pdf import convertToInt
from PyPDF2.utils import PdfReadError

TESTS_ROOT = os.path.abspath(os.path.dirname(__file__))
PROJECT_ROOT = os.path.dirname(TESTS_ROOT)
RESOURCE_ROOT = os.path.join(PROJECT_ROOT, "Resources")


def test_basic_features():
output = PdfFileWriter()
document1 = os.path.join(RESOURCE_ROOT, "crazyones.pdf")
input1 = PdfFileReader(document1)
writer = PdfFileWriter()
pdf_path = os.path.join(RESOURCE_ROOT, "crazyones.pdf")
reader = PdfFileReader(pdf_path)

# print how many pages input1 has:
print("document1.pdf has %d pages." % input1.getNumPages())
print("document1.pdf has %d pages." % reader.getNumPages())

# add page 1 from input1 to output document, unchanged
output.addPage(input1.getPage(0))
writer.addPage(reader.getPage(0))

# add page 2 from input1, but rotated clockwise 90 degrees
output.addPage(input1.getPage(0).rotateClockwise(90))
writer.addPage(reader.getPage(0).rotateClockwise(90))

# add page 3 from input1, rotated the other way:
output.addPage(input1.getPage(0).rotateCounterClockwise(90))
writer.addPage(reader.getPage(0).rotateCounterClockwise(90))
# alt: output.addPage(input1.getPage(0).rotateClockwise(270))

# add page 4 from input1, but first add a watermark from another PDF:
page4 = input1.getPage(0)
watermark_pdf = document1
page4 = reader.getPage(0)
watermark_pdf = pdf_path
watermark = PdfFileReader(watermark_pdf)
page4.mergePage(watermark.getPage(0))
output.addPage(page4)
writer.addPage(page4)

# add page 5 from input1, but crop it to half size:
page5 = input1.getPage(0)
page5 = reader.getPage(0)
page5.mediaBox.upperRight = (
page5.mediaBox.getUpperRight_x() / 2,
page5.mediaBox.getUpperRight_y() / 2,
)
output.addPage(page5)
writer.addPage(page5)

# add some Javascript to launch the print window on opening this PDF.
# the password dialog may prevent the print dialog from being shown,
# comment out the encryption lines, if that's the case, to try this out
output.addJS("this.print({bUI:true,bSilent:false,bShrinkToFit:true});")
writer.addJS("this.print({bUI:true,bSilent:false,bShrinkToFit:true});")

# encrypt your new PDF and add a password
password = "secret"
output.encrypt(password)
writer.encrypt(password)

# finally, write "output" to PyPDF2-output.pdf
with open("PyPDF2-output.pdf", "wb") as outputStream:
output.write(outputStream)
writer.write(outputStream)


def test_convertToInt():
Expand Down
36 changes: 21 additions & 15 deletions Tests/test_javascript.py
@@ -1,4 +1,5 @@
import os

import pytest

from PyPDF2 import PdfFileReader, PdfFileWriter
Expand All @@ -8,21 +9,28 @@
PROJECT_ROOT = os.path.dirname(TESTS_ROOT)
RESOURCE_ROOT = os.path.join(PROJECT_ROOT, "Resources")


@pytest.fixture
def pdf_file_writer():
ipdf = PdfFileReader(os.path.join(RESOURCE_ROOT, "crazyones.pdf"))
reader = PdfFileReader(os.path.join(RESOURCE_ROOT, "crazyones.pdf"))
pdf_file_writer = PdfFileWriter()
pdf_file_writer.appendPagesFromReader(ipdf)
pdf_file_writer.appendPagesFromReader(reader)
yield pdf_file_writer


def test_add_js(pdf_file_writer):
pdf_file_writer.addJS(
"this.print({bUI:true,bSilent:false,bShrinkToFit:true});"
)
pdf_file_writer.addJS("this.print({bUI:true,bSilent:false,bShrinkToFit:true});")

assert (
"/Names" in pdf_file_writer._root_object
), "addJS should add a name catalog in the root object."
assert (
"/JavaScript" in pdf_file_writer._root_object["/Names"]
), "addJS should add a JavaScript name tree under the name catalog."
assert (
"/OpenAction" in pdf_file_writer._root_object
), "addJS should add an OpenAction to the catalog."

assert "/Names" in pdf_file_writer._root_object, "addJS should add a name catalog in the root object."
assert "/JavaScript" in pdf_file_writer._root_object["/Names"], "addJS should add a JavaScript name tree under the name catalog."
assert "/OpenAction" in pdf_file_writer._root_object, "addJS should add an OpenAction to the catalog."

def test_overwrite_js(pdf_file_writer):
def get_javascript_name():
Expand All @@ -31,14 +39,12 @@ def get_javascript_name():
assert "/Names" in pdf_file_writer._root_object["/Names"]["/JavaScript"]
return pdf_file_writer._root_object["/Names"]["/JavaScript"]["/Names"][0]

pdf_file_writer.addJS(
"this.print({bUI:true,bSilent:false,bShrinkToFit:true});"
)
pdf_file_writer.addJS("this.print({bUI:true,bSilent:false,bShrinkToFit:true});")
first_js = get_javascript_name()

pdf_file_writer.addJS(
"this.print({bUI:true,bSilent:false,bShrinkToFit:true});"
)
pdf_file_writer.addJS("this.print({bUI:true,bSilent:false,bShrinkToFit:true});")
second_js = get_javascript_name()

assert first_js != second_js, "addJS should overwrite the previous script in the catalog."
assert (
first_js != second_js
), "addJS should overwrite the previous script in the catalog."

0 comments on commit 012709f

Please sign in to comment.