From 0dc5d8646a4ad03ffeb11d6bee782e1d131feaf2 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 28 Nov 2022 19:04:29 +0100 Subject: [PATCH 1/3] ENH: add threads access in PdfWriter. This currently returns only an empty list. Pending PR#1371 --- PyPDF2/_writer.py | 23 +++++++++++++++++++++++ PyPDF2/constants.py | 1 + tests/test_writer.py | 8 ++++++++ 3 files changed, 32 insertions(+) diff --git a/PyPDF2/_writer.py b/PyPDF2/_writer.py index 12539900d..3ad538da9 100644 --- a/PyPDF2/_writer.py +++ b/PyPDF2/_writer.py @@ -1098,6 +1098,29 @@ def get_outline_root(self) -> TreeObject: return outline + def get_threads_root(self) -> ArrayObject: + """ + The list of threads; see §8.3.2 from PDF 1.7 spec + + :return: an Array (possibly empty) of Dictionaries with "/F" and "/I" properties + """ + if CO.THREADS in self._root_object: + # TABLE 3.25 Entries in the catalog dictionary + threads = cast(ArrayObject, self._root_object[CO.THREADS]) + else: + threads = ArrayObject() + self._root_object[NameObject(CO.THREADS)] = threads + return threads + + @property + def threads(self) -> ArrayObject: + """ + Read-only property for the list of threads; see §8.3.2 from PDF 1.7 spec + + :return: an Array (possibly empty) of Dictionaries with "/F" and "/I" properties + """ + return self.get_threads_root() + def getOutlineRoot(self) -> TreeObject: # pragma: no cover """ ..
deprecated:: 1.28.0 diff --git a/PyPDF2/constants.py b/PyPDF2/constants.py index f8d3faf8f..a2f8c49ed 100644 --- a/PyPDF2/constants.py +++ b/PyPDF2/constants.py @@ -16,6 +16,7 @@ class Core: """Keywords that don't quite belong anywhere else.""" OUTLINES = "/Outlines" + THREADS = "/Threads" PAGE = "/Page" PAGES = "/Pages" CATALOG = "/Catalog" diff --git a/tests/test_writer.py b/tests/test_writer.py index 9ab514672..3b7f1039b 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -7,6 +7,7 @@ from PyPDF2 import PageObject, PdfMerger, PdfReader, PdfWriter from PyPDF2.errors import PageSizeNotDefinedError from PyPDF2.generic import ( + ArrayObject, IndirectObject, NameObject, NumberObject, @@ -855,3 +856,10 @@ def test_startup_dest(): pdf_file_writer.open_destination = None assert "/OpenAction" not in pdf_file_writer._root_object pdf_file_writer.open_destination = None + + +def test_threads_empty(): + writer = PdfWriter() + thr = writer.threads + assert isinstance(thr, ArrayObject) + assert len(thr) == 0 From ba25ef0b1712e2fd4b48c54019bc84ce3ffb170b Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 28 Nov 2022 19:20:57 +0100 Subject: [PATCH 2/3] test extended --- tests/test_writer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_writer.py b/tests/test_writer.py index 3b7f1039b..5b02d1265 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -863,3 +863,5 @@ def test_threads_empty(): thr = writer.threads assert isinstance(thr, ArrayObject) assert len(thr) == 0 + thr2 = writer.threads + assert thr == thr2 From 16a1e25ff1ca59756160948e4a71399a50fb27b9 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 28 Nov 2022 23:33:33 +0100 Subject: [PATCH 3/3] add threads for reader --- PyPDF2/_reader.py | 14 ++++++++++++++ tests/test_reader.py | 14 +++++++++++++- 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/PyPDF2/_reader.py 
b/PyPDF2/_reader.py index 655278b8d..b0d5d649a 100644 --- a/PyPDF2/_reader.py +++ b/PyPDF2/_reader.py @@ -800,6 +800,20 @@ def getOutlines( deprecate_with_replacement("getOutlines", "outline") return self._get_outline(node, outline) + @property + def threads(self) -> Optional[ArrayObject]: + """ + Read-only property for the list of threads; see §8.3.2 from PDF 1.7 spec + + :return: an Array of Dictionaries with "/F" and "/I" properties + or None if no articles. + """ + catalog = cast(DictionaryObject, self.trailer[TK.ROOT]) + if CO.THREADS in catalog: + return cast("ArrayObject", catalog[CO.THREADS]) + else: + return None + def _get_page_number_by_indirect( self, indirect_ref: Union[None, int, NullObject, IndirectObject] ) -> int: diff --git a/tests/test_reader.py b/tests/test_reader.py index 192825f16..16c17b7e6 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -17,7 +17,7 @@ PdfReadWarning, WrongPasswordError, ) -from PyPDF2.generic import Destination +from PyPDF2.generic import ArrayObject, Destination from . import get_pdf_from_url, normalize_warnings @@ -1179,3 +1179,15 @@ def test_zeroing_xref(): name = "UTA_OSHA.pdf" reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) len(reader.pages) + + +def test_thread(): + url = "https://github.com/py-pdf/PyPDF2/files/9066120/UTA_OSHA_3115_Fall_Protection_Training_09162021_.pdf" + name = "UTA_OSHA.pdf" + reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + assert reader.threads is None + url = "https://corpora.tika.apache.org/base/docs/govdocs1/924/924666.pdf" + name = "tika-924666.pdf" + reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + assert isinstance(reader.threads, ArrayObject) + assert len(reader.threads) >= 1