From 7698e4eb7a50e10ffcae72dc3c8b3237ca69b7cc Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Tue, 29 Nov 2022 23:15:02 +0100 Subject: [PATCH] [ENH] miscellaneous enhancement in Reader and test main change : get_object(int) now available --- PyPDF2/_reader.py | 45 ++++++++++++++++++++++++++++---------------- tests/test_reader.py | 41 +++++++++++++++++++++++++++++++++++++++- tests/test_writer.py | 1 + 3 files changed, 70 insertions(+), 17 deletions(-) diff --git a/PyPDF2/_reader.py b/PyPDF2/_reader.py index 655278b8d..65586d7d5 100644 --- a/PyPDF2/_reader.py +++ b/PyPDF2/_reader.py @@ -863,14 +863,18 @@ def getDestinationPageNumber( def _build_destination( self, title: str, - array: List[Union[NumberObject, IndirectObject, NullObject, DictionaryObject]], + array: Optional[ + List[ + Union[NumberObject, IndirectObject, None, NullObject, DictionaryObject] + ] + ], ) -> Destination: page, typ = None, None # handle outline items with missing or invalid destination if ( - isinstance(array, (type(None), NullObject)) + isinstance(array, (NullObject, str)) or (isinstance(array, ArrayObject) and len(array) == 0) - or (isinstance(array, str)) + or array is None ): page = NullObject() @@ -898,7 +902,7 @@ def _build_outline_item(self, node: DictionaryObject) -> Optional[Destination]: # title required for valid outline # PDF Reference 1.7: TABLE 8.4 Entries in an outline item dictionary try: - title = node["/Title"] + title = cast("str", node["/Title"]) except KeyError: if self.strict: raise PdfReadError(f"Outline Entry Missing /Title attribute: {node!r}") @@ -918,9 +922,10 @@ def _build_outline_item(self, node: DictionaryObject) -> Optional[Destination]: dest = dest["/D"] if isinstance(dest, ArrayObject): - outline_item = self._build_destination(title, dest) # type: ignore + outline_item = self._build_destination(title, dest) elif isinstance(dest, str): # named destination, addresses NameObject Issue #193 + # TODO : keep named destination instead of replacing it ? try: outline_item = self._build_destination( title, self._namedDests[dest].dest_array @@ -928,13 +933,18 @@ def _build_outline_item(self, node: DictionaryObject) -> Optional[Destination]: except KeyError: # named destination not found in Name Dict outline_item = self._build_destination(title, None) - elif isinstance(dest, type(None)): + elif dest is None: # outline item not required to have destination or action # PDFv1.7 Table 153 - outline_item = self._build_destination(title, dest) # type: ignore + outline_item = self._build_destination(title, dest) else: if self.strict: raise PdfReadError(f"Unexpected destination {dest!r}") + else: + logger_warning( + f"Removed unexpected destination {dest!r} from destination", + __name__, + ) outline_item = self._build_destination(title, None) # type: ignore # if outline item created, add color, format, and child count if present @@ -950,7 +960,6 @@ def _build_outline_item(self, node: DictionaryObject) -> Optional[Destination]: # absolute value = num. visible children # positive = open/unfolded, negative = closed/folded outline_item[NameObject("/Count")] = node["/Count"] - return outline_item @property @@ -1154,7 +1163,18 @@ def _get_object_from_stream( raise PdfReadError("This is a fatal error in strict mode.") return NullObject() - def get_object(self, indirect_reference: IndirectObject) -> Optional[PdfObject]: + def _get_indirect_object(self, num: int, gen: int) -> Optional[PdfObject]: + """ + used to ease development + equivalent to generic.IndirectObject(num,gen,self).get_object() + """ + return IndirectObject(num, gen, self).get_object() + + def get_object( + self, indirect_reference: Union[int, IndirectObject] + ) -> Optional[PdfObject]: + if isinstance(indirect_reference, int): + indirect_reference = IndirectObject(indirect_reference, 0, self) retval = self.cache_get_indirect_object( indirect_reference.generation, indirect_reference.idnum ) @@ -1928,13 +1948,6 @@ def xfa(self) -> Optional[Dict[str, Any]]: retval[tag] = es return retval - def _get_indirect_object(self, num: int, gen: int) -> Optional[PdfObject]: - """ - used to ease development - equivalent to generic.IndirectObject(num,gen,self).get_object() - """ - return IndirectObject(num, gen, self).get_object() - class PdfFileReader(PdfReader): # pragma: no cover def __init__(self, *args: Any, **kwargs: Any) -> None: diff --git a/tests/test_reader.py b/tests/test_reader.py index 192825f16..0338d6eb2 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -17,7 +17,13 @@ PdfReadWarning, WrongPasswordError, ) -from PyPDF2.generic import Destination +from PyPDF2.generic import ( + Destination, + DictionaryObject, + NameObject, + NumberObject, + TextStringObject, +) from . import get_pdf_from_url, normalize_warnings @@ -755,6 +761,12 @@ def test_iss925(): annot.get_object() +def test_get_object(): + reader = PdfReader(RESOURCE_ROOT / "hello-world.pdf") + assert reader.get_object(22)["/Type"] == "/Catalog" + assert reader._get_indirect_object(22, 0)["/Type"] == "/Catalog" + + @pytest.mark.xfail(reason="#591") def test_extract_text_hello_world(): reader = PdfReader(RESOURCE_ROOT / "hello-world.pdf") @@ -1179,3 +1191,30 @@ def test_zeroing_xref(): name = "UTA_OSHA.pdf" reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) len(reader.pages) + + +def test_build_outline_item(caplog): + url = "https://github.com/py-pdf/PyPDF2/files/9464742/shiv_resume.pdf" + name = "shiv_resume.pdf" + reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + outline = reader._build_outline_item( + DictionaryObject( + { + NameObject("/Title"): TextStringObject("Toto"), + NameObject("/Dest"): NumberObject(2), + } + ) + ) + assert "Removed unexpected destination 2 from destination" in caplog.text + assert outline["/Title"] == "Toto" + reader.strict = True + with pytest.raises(PdfReadError) as exc: + reader._build_outline_item( + DictionaryObject( + { + NameObject("/Title"): TextStringObject("Toto"), + NameObject("/Dest"): NumberObject(2), + } + ) + ) + assert "Unexpected destination 2" in exc.value.args[0] diff --git a/tests/test_writer.py b/tests/test_writer.py index 9ab514672..70adf3ba2 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -343,6 +343,7 @@ def test_write_metadata(): reader = PdfReader(pdf_path) writer = PdfWriter() + writer.add_page(reader.pages[0]) for page in reader.pages: writer.add_page(page)