Tests/tests.py

import os
import sys
import unittest
import binascii

from PyPDF2 import PdfFileReader, PdfFileWriter


# Configure path environment
TESTS_ROOT = os.path.abspath(os.path.dirname(__file__))
PROJECT_ROOT = os.path.dirname(TESTS_ROOT)
RESOURCE_ROOT = os.path.join(PROJECT_ROOT, "Resources")

sys.path.append(PROJECT_ROOT)


if sys.version_info[0] < 3:

    def u_(s):
        return s.decode("utf-8")

else:

    def u_(s):
        return s


class PdfReaderTestCases(unittest.TestCase):
    def test_PdfReaderFileLoad(self):
        """
        Test loading and parsing of a file. Extract text of the file and compare to expected
        textual output. Expected outcome: file loads, text matches expected.
        """

        with open(os.path.join(RESOURCE_ROOT, "crazyones.pdf"), "rb") as inputfile:
            # Load PDF file from file
            ipdf = PdfFileReader(inputfile)
            ipdf_p1 = ipdf.getPage(0)

            # Retrieve the text of the PDF
            with open(
                os.path.join(RESOURCE_ROOT, "crazyones.txt"), "rb"
            ) as pdftext_file:
                pdftext = pdftext_file.read()

            ipdf_p1_text = ipdf_p1.extractText().replace("\n", "").encode("utf-8")

            # Compare the text of the PDF to a known source
            self.assertEqual(
                ipdf_p1_text,
                pdftext,
                msg="PDF extracted text differs from expected value.\n\nExpected:\n\n%r\n\nExtracted:\n\n%r\n\n"
                % (pdftext, ipdf_p1_text),
            )

    def test_PdfReaderJpegImage(self):
        """
        Test loading and parsing of a file. Extract the image of the file and compare to expected
        textual output. Expected outcome: file loads, image matches expected.
        """

        with open(os.path.join(RESOURCE_ROOT, "jpeg.pdf"), "rb") as inputfile:
            # Load PDF file from file
            ipdf = PdfFileReader(inputfile)

            # Retrieve the text of the image
            with open(os.path.join(RESOURCE_ROOT, "jpeg.txt"), "r") as pdftext_file:
                imagetext = pdftext_file.read()

            ipdf_p0 = ipdf.getPage(0)
            xObject = ipdf_p0["/Resources"]["/XObject"].getObject()
            data = xObject["/Im4"].getData()

            # Compare the text of the PDF to a known source
            self.assertEqual(
                binascii.hexlify(data).decode(),
                imagetext,
                msg="PDF extracted image differs from expected value.\n\nExpected:\n\n%r\n\nExtracted:\n\n%r\n\n"
                % (imagetext, binascii.hexlify(data).decode()),
            )


class AddJsTestCase(unittest.TestCase):
    def setUp(self):
        ipdf = PdfFileReader(os.path.join(RESOURCE_ROOT, "crazyones.pdf"))
        self.pdf_file_writer = PdfFileWriter()
        self.pdf_file_writer.appendPagesFromReader(ipdf)

    def test_add(self):

        self.pdf_file_writer.addJS(
            "this.print({bUI:true,bSilent:false,bShrinkToFit:true});"
        )

        self.assertIn(
            "/Names",
            self.pdf_file_writer._root_object,
            "addJS should add a name catalog in the root object.",
        )
        self.assertIn(
            "/JavaScript",
            self.pdf_file_writer._root_object["/Names"],
            "addJS should add a JavaScript name tree under the name catalog.",
        )
        self.assertIn(
            "/OpenAction",
            self.pdf_file_writer._root_object,
            "addJS should add an OpenAction to the catalog.",
        )

    def test_overwrite(self):

        self.pdf_file_writer.addJS(
            "this.print({bUI:true,bSilent:false,bShrinkToFit:true});"
        )
        first_js = self.get_javascript_name()

        self.pdf_file_writer.addJS(
            "this.print({bUI:true,bSilent:false,bShrinkToFit:true});"
        )
        second_js = self.get_javascript_name()

        self.assertNotEqual(
            first_js,
            second_js,
            "addJS should overwrite the previous script in the catalog.",
        )

    def get_javascript_name(self):
        self.assertIn("/Names", self.pdf_file_writer._root_object)
        self.assertIn("/JavaScript", self.pdf_file_writer._root_object["/Names"])
        self.assertIn(
            "/Names", self.pdf_file_writer._root_object["/Names"]["/JavaScript"]
        )
        return self.pdf_file_writer._root_object["/Names"]["/JavaScript"]["/Names"][0]