From 3a1dcff0daf5dbf7d88826746d97c90aa612a15e Mon Sep 17 00:00:00 2001 From: Joseph Hale Date: Mon, 12 Dec 2022 13:31:24 -0700 Subject: [PATCH 1/4] DEV: Add in-project virtual envs to .gitignore Many developers (like myself) like to use virtual environments included within the current project. These virtual environments are local development constructs and should not be checked into source control. This commit adds two common virtual environment directory names to the .gitignore to avoid accidental commits by future developers. --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index 6449fe86b..4dfd2ef51 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,10 @@ build dist/* __pycache__/ +# in-project virtual environments +venv/ +.venv/ + # .mutmut-cache mutmut-results.* From 6b7e055c21851d5114e9ac30c8bd458e8d8871be Mon Sep 17 00:00:00 2001 From: Joseph Hale Date: Mon, 12 Dec 2022 13:44:57 -0700 Subject: [PATCH 2/4] DEV: Include `pillow` in `requirements/dev.in` The current contribution instructions in `docs/dev/intro.md` direct new code contributors to install the `dev` requirements.
After following that instruction, the minimal test suite fails with the following errors: ``` python -m venv .venv source .venv/bin/activate pip install -r requirements/dev.txt pytest -m "not external" -m "not samples" -m "not slow" ``` =================================================================================================== short test summary info ==================================================================================================== FAILED tests/test_reader.py::test_get_images[pdflatex-outline.pdf-expected_images0] - ModuleNotFoundError: No module named 'PIL' FAILED tests/test_reader.py::test_get_images[crazyones.pdf-expected_images1] - ModuleNotFoundError: No module named 'PIL' FAILED tests/test_reader.py::test_get_images[git.pdf-expected_images2] - ModuleNotFoundError: No module named 'PIL' FAILED tests/test_reader.py::test_get_images[imagemagick-CCITTFaxDecode.pdf-expected_images5] - ModuleNotFoundError: No module named 'PIL' FAILED tests/test_reader.py::test_get_images[src6-expected_images6] - ModuleNotFoundError: No module named 'PIL' FAILED tests/test_workflows.py::test_image_extraction[https://corpora.tika.apache.org/base/docs/govdocs1/994/994636.pdf-tika-994636.pdf] - ImportError: pillow is required to do image extraction. It can be installed via 'pip install PyPDF2[image]' FAILED tests/test_workflows.py::test_image_extraction[https://corpora.tika.apache.org/base/docs/govdocs1/952/952133.pdf-tika-952133.pdf] - ImportError: pillow is required to do image extraction. It can be installed via 'pip install PyPDF2[image]' FAILED tests/test_workflows.py::test_image_extraction[https://corpora.tika.apache.org/base/docs/govdocs1/914/914568.pdf-tika-914568.pdf] - ImportError: pillow is required to do image extraction. 
It can be installed via 'pip install PyPDF2[image]' FAILED tests/test_workflows.py::test_image_extraction[https://corpora.tika.apache.org/base/docs/govdocs1/952/952016.pdf-tika-952016.pdf] - ImportError: pillow is required to do image extraction. It can be installed via 'pip install PyPDF2[image]' FAILED tests/test_workflows.py::test_image_extraction[https://corpora.tika.apache.org/base/docs/govdocs1/965/965118.pdf-tika-952016.pdf] - ImportError: pillow is required to do image extraction. It can be installed via 'pip install PyPDF2[image]' FAILED tests/test_workflows.py::test_image_extraction[https://corpora.tika.apache.org/base/docs/govdocs1/959/959184.pdf-tika-959184.pdf] - ImportError: pillow is required to do image extraction. It can be installed via 'pip install PyPDF2[image]' FAILED tests/test_workflows.py::test_image_extraction[https://corpora.tika.apache.org/base/docs/govdocs1/958/958496.pdf-tika-958496.pdf] - ImportError: pillow is required to do image extraction. It can be installed via 'pip install PyPDF2[image]' FAILED tests/test_workflows.py::test_image_extraction[https://corpora.tika.apache.org/base/docs/govdocs1/972/972174.pdf-tika-972174.pdf] - ImportError: pillow is required to do image extraction. It can be installed via 'pip install PyPDF2[image]' FAILED tests/test_workflows.py::test_image_extraction[https://corpora.tika.apache.org/base/docs/govdocs1/972/972243.pdf-tika-972243.pdf] - ImportError: pillow is required to do image extraction. It can be installed via 'pip install PyPDF2[image]' FAILED tests/test_workflows.py::test_image_extraction[https://corpora.tika.apache.org/base/docs/govdocs1/969/969502.pdf-tika-969502.pdf] - ImportError: pillow is required to do image extraction. It can be installed via 'pip install PyPDF2[image]' FAILED tests/test_workflows.py::test_image_extraction[https://arxiv.org/pdf/2201.00214.pdf-arxiv-2201.00214.pdf] - ImportError: pillow is required to do image extraction. 
It can be installed via 'pip install PyPDF2[image]' FAILED tests/test_workflows.py::test_image_extraction_strict - ImportError: pillow is required to do image extraction. It can be installed via 'pip install PyPDF2[image]' FAILED tests/test_workflows.py::test_image_extraction2[https://corpora.tika.apache.org/base/docs/govdocs1/977/977609.pdf-tika-977609.pdf] - ImportError: pillow is required to do image extraction. It can be installed via 'pip install PyPDF2[image]' ======================================================================= 18 failed, 536 passed, 5 skipped, 53 deselected, 5 xfailed in 146.94s (0:02:26) ======================================================================== This commit adds `pillow` to `requirements/dev.in` so that the minimal test suite passes on the first try and new code contributors can start implementing improvements with confidence. --- requirements/dev.in | 1 + requirements/dev.txt | 21 +++++---------------- 2 files changed, 6 insertions(+), 16 deletions(-) diff --git a/requirements/dev.in b/requirements/dev.in index 374d81236..6229bd533 100644 --- a/requirements/dev.in +++ b/requirements/dev.in @@ -1,4 +1,5 @@ black +pillow pip-tools pre-commit<2.18.0 pytest-cov diff --git a/requirements/dev.txt b/requirements/dev.txt index 82b8d6fdd..0062323bc 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -1,6 +1,6 @@ # -# This file is autogenerated by pip-compile with python 3.7 -# To update, run: +# This file is autogenerated by pip-compile with Python 3.8 +# by the following command: # # pip-compile requirements/dev.in # @@ -38,13 +38,6 @@ identify==2.5.9 # via pre-commit idna==3.4 # via requests -importlib-metadata==5.1.0 - # via - # build - # click - # pre-commit - # pytest - # virtualenv iniconfig==1.1.1 # via pytest mypy-extensions==0.4.3 @@ -59,6 +52,8 @@ pathspec==0.10.3 # via black pep517==0.13.0 # via build +pillow==9.3.0 + # via -r requirements/dev.in pip-tools==6.11.0 # via -r requirements/dev.in
platformdirs==2.6.0 @@ -87,12 +82,8 @@ tomli==2.0.1 # pytest tomli-w==1.0.0 # via flit -typed-ast==1.5.4 - # via black typing-extensions==4.4.0 - # via - # black - # importlib-metadata + # via black urllib3==1.26.13 # via requests virtualenv==20.17.1 @@ -101,8 +92,6 @@ wheel==0.38.4 # via # -r requirements/dev.in # pip-tools -zipp==3.11.0 - # via importlib-metadata # The following packages are considered to be unsafe in a requirements file: # pip From f25e817f82b20934de0e3969fbafd57b6a6c7974 Mon Sep 17 00:00:00 2001 From: Joseph Hale Date: Mon, 12 Dec 2022 14:23:17 -0700 Subject: [PATCH 3/4] STY: Use official `IO` type for file streams The Python standard library provides the `IO` type for file streams. (Source: https://docs.python.org/3/library/typing.html#typing.IO) This commit replaces the complex Union type of the `IO` implementations with the official `IO` type. This will improve the accuracy of type checking in users' IDEs. --- PyPDF2/_utils.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/PyPDF2/_utils.py b/PyPDF2/_utils.py index 2dd91e533..947233919 100644 --- a/PyPDF2/_utils.py +++ b/PyPDF2/_utils.py @@ -34,15 +34,10 @@ import warnings from codecs import getencoder from dataclasses import dataclass -from io import ( - DEFAULT_BUFFER_SIZE, - BufferedReader, - BufferedWriter, - BytesIO, - FileIO, -) +from io import DEFAULT_BUFFER_SIZE from os import SEEK_CUR from typing import ( + IO, Any, Callable, Dict, @@ -68,7 +63,7 @@ float, float, float, float, float, float ] -StreamType = Union[BytesIO, BufferedReader, BufferedWriter, FileIO] +StreamType = IO StrByteType = Union[str, StreamType] DEPR_MSG_NO_REPLACEMENT = "{} is deprecated and will be removed in PyPDF2 {}." 
From c9e7ec37787a6933139fb51e8d603d74a87da8f0 Mon Sep 17 00:00:00 2001 From: Joseph Hale Date: Mon, 12 Dec 2022 15:39:57 -0700 Subject: [PATCH 4/4] STY: Use standard `IO` type hint for writers The CI system flagged some additional conflicts with the `IO` type in the writer classes. This commit changes the writer classes to use the standard `IO` type instead of the union of IO implementations. --- PyPDF2/_protocols.py | 7 ++----- PyPDF2/_writer.py | 18 +++++++++++------- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/PyPDF2/_protocols.py b/PyPDF2/_protocols.py index b83db961b..89c80f9a5 100644 --- a/PyPDF2/_protocols.py +++ b/PyPDF2/_protocols.py @@ -1,8 +1,7 @@ """Helpers for working with PDF types.""" -from io import BufferedReader, BufferedWriter, BytesIO, FileIO from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import IO, Any, Dict, List, Optional, Tuple, Union try: # Python 3.8+: https://peps.python.org/pep-0586 @@ -59,7 +58,5 @@ class PdfWriterProtocol(Protocol): # pragma: no cover def get_object(self, indirect_reference: Any) -> Optional[PdfObjectProtocol]: ... - def write( - self, stream: Union[Path, StrByteType] - ) -> Tuple[bool, Union[FileIO, BytesIO, BufferedReader, BufferedWriter]]: + def write(self, stream: Union[Path, StrByteType]) -> Tuple[bool, IO]: ... 
diff --git a/PyPDF2/_writer.py b/PyPDF2/_writer.py index db5e394ac..dcb5ffee0 100644 --- a/PyPDF2/_writer.py +++ b/PyPDF2/_writer.py @@ -38,10 +38,11 @@ import uuid import warnings from hashlib import md5 -from io import BufferedReader, BufferedWriter, BytesIO, FileIO, IOBase +from io import BytesIO, FileIO, IOBase from pathlib import Path from types import TracebackType from typing import ( + IO, Any, Callable, Deque, @@ -962,9 +963,7 @@ def write_stream(self, stream: StreamType) -> None: self._write_trailer(stream) stream.write(b_(f"\nstartxref\n{xref_location}\n%%EOF\n")) # eof - def write( - self, stream: Union[Path, StrByteType] - ) -> Tuple[bool, Union[FileIO, BytesIO, BufferedReader, BufferedWriter]]: + def write(self, stream: Union[Path, StrByteType]) -> Tuple[bool, IO]: """ Write the collection of pages added to this object out as a PDF file. @@ -1289,7 +1288,7 @@ def add_outline_item_destination( page_destination: Union[None, PageObject, TreeObject] = None, parent: Union[None, TreeObject, IndirectObject] = None, before: Union[None, TreeObject, IndirectObject] = None, - dest: Union[None, PageObject, TreeObject] = None, # deprecated + dest: Union[None, PageObject, TreeObject] = None, # deprecated ) -> IndirectObject: if page_destination is not None and dest is not None: # deprecated raise ValueError( @@ -2483,7 +2482,9 @@ def _add_articles_thread( pag_obj = cast("PageObject", pag.get_object()) if "/B" not in pag_obj: pag_obj[NameObject("/B")] = ArrayObject() - cast("ArrayObject", pag_obj["/B"]).append(new_article.indirect_reference) + cast("ArrayObject", pag_obj["/B"]).append( + new_article.indirect_reference + ) current_article = cast("DictionaryObject", current_article["/N"]) if current_article == first_article: new_article[NameObject("/N")] = new_first.indirect_reference # type: ignore @@ -2674,7 +2675,10 @@ def find_outline_item( i = 0 while o is not None: - if o.indirect_reference == outline_item or o.get("/Title", None) == outline_item: + if ( + 
o.indirect_reference == outline_item + or o.get("/Title", None) == outline_item + ): return [i] else: if "/First" in o: