Skip to content

Commit

Permalink
ENH: Add Cloning (#1371)
Browse files Browse the repository at this point in the history
The method `.clone(pdf_dest,[force_duplicate])` clones the objects and all referenced objects.

If an object has already been cloned, the existing clone is returned (unless `force_duplicate` is set).
This is mainly for internal use, but it can also be used on a page.
For PageObject/DictionaryObject/[Encoded/Decoded/Content]Stream, an extra parameter `ignore_fields` provides the list of fields that should not be cloned.

When available, the pointer to an object is stored in its `indirect_reference` attribute.

New API for `add_page`/`insert_page`, which:

* returns the cloned page object
* ignore_fields can be provided as a parameter.

## Others

* file is closed at the end of PdfWriter.write when a filename is provided
* Breaking Change: `add_outline_item` now has a parameter `before`, which is not the last parameter

## Update
* The public API of PdfMerger has been added to PdfWriter (ready to make PdfMerger an alias of it)
* Properly process outline merging
* Properly process named destinations

Deals with #1194, #1322, #471, #1337
  • Loading branch information
pubpub-zz committed Dec 11, 2022
1 parent 9674f5f commit 74b8a63
Show file tree
Hide file tree
Showing 13 changed files with 1,695 additions and 160 deletions.
1 change: 1 addition & 0 deletions PyPDF2/_merger.py
Expand Up @@ -702,6 +702,7 @@ def add_outline_item(
title,
page_number,
parent,
None,
color,
bold,
italic,
Expand Down
8 changes: 5 additions & 3 deletions PyPDF2/_page.py
Expand Up @@ -46,6 +46,7 @@
)

from ._cmap import build_char_map, unknown_char_map
from ._protocols import PdfReaderProtocol
from ._utils import (
CompressedTransformationMatrix,
File,
Expand Down Expand Up @@ -288,16 +289,17 @@ class PageObject(DictionaryObject):
this object in its source PDF
"""

original_page: "PageObject" # very local use in writer when appending

def __init__(
self,
pdf: Optional[Any] = None, # PdfReader
pdf: Optional[PdfReaderProtocol] = None,
indirect_reference: Optional[IndirectObject] = None,
indirect_ref: Optional[IndirectObject] = None,
) -> None:
from ._reader import PdfReader

DictionaryObject.__init__(self)
self.pdf: Optional[PdfReader] = pdf
self.pdf: Optional[PdfReaderProtocol] = pdf
if indirect_ref is not None: # deprecated
warnings.warn(
"Use indirect_reference instead of indirect_ref.", DeprecationWarning
Expand Down
65 changes: 65 additions & 0 deletions PyPDF2/_protocols.py
@@ -0,0 +1,65 @@
"""Helpers for working with PDF types."""

from io import BufferedReader, BufferedWriter, BytesIO, FileIO
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union

try:
# Python 3.8+: https://peps.python.org/pep-0544
from typing import Protocol # type: ignore[attr-defined]
except ImportError:
from typing_extensions import Protocol # type: ignore[misc]

from ._utils import StrByteType


class PdfObjectProtocol(Protocol):  # pragma: no cover
    """Structural type describing a PDF object that can be cloned.

    The method bodies are stubs that are never executed, so the class is
    excluded from coverage measurement, like the other protocols in this
    module.
    """

    # Pointer to this object, when one is available; typed loosely as ``Any``.
    indirect_reference: Any

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Union[Tuple[str, ...], List[str], None] = (),
    ) -> Any:
        """Clone this object, and the objects it references, into ``pdf_dest``.

        Args:
            pdf_dest: Destination that receives the clone.
            force_duplicate: When false, an object that was already cloned
                is not duplicated; the existing clone is returned instead.
            ignore_fields: Names of the fields that should not be cloned.

        Returns:
            The cloned object.
        """

    def _reference_clone(self, clone: Any, pdf_dest: Any) -> Any:
        """Internal helper paired with :meth:`clone`.

        NOTE(review): presumably registers ``clone`` inside ``pdf_dest`` and
        returns it -- confirm against the concrete implementations.
        """

    def get_object(self) -> Optional["PdfObjectProtocol"]:
        """Return a PDF object, or ``None``.

        NOTE(review): presumably resolves indirect references to the object
        they point to -- confirm against the concrete implementations.
        """


class PdfReaderProtocol(Protocol):  # pragma: no cover
    """Structural type listing the reader members other modules rely on.

    Annotating against this protocol avoids naming a concrete reader class.
    """

    @property
    def pdf_header(self) -> str:
        """Header of the PDF, as a string."""

    @property
    def strict(self) -> bool:
        """Boolean ``strict`` setting of the reader."""

    @property
    def xref(self) -> Dict[int, Dict[int, Any]]:
        """Cross-reference data as a two-level mapping keyed by integers.

        NOTE(review): the meaning of each key level is not visible here --
        confirm against the reader implementation.
        """

    @property
    def pages(self) -> List[Any]:
        """Pages of the document, as a list."""

    def get_object(self, indirect_reference: Any) -> Optional[PdfObjectProtocol]:
        """Return the object designated by ``indirect_reference``, or ``None``."""


class PdfWriterProtocol(Protocol):  # pragma: no cover
    """Structural type listing the writer members other modules rely on."""

    # List of the writer's objects.
    _objects: List[Any]
    # Two-level integer mapping ending in an integer.
    # NOTE(review): presumably maps source object ids to ids in this writer --
    # confirm against the writer implementation.
    _id_translated: Dict[int, Dict[int, int]]

    def get_object(self, indirect_reference: Any) -> Optional[PdfObjectProtocol]:
        """Return the object designated by ``indirect_reference``, or ``None``."""

    def write(
        self,
        stream: Union[Path, StrByteType],
    ) -> Tuple[bool, Union[FileIO, BytesIO, BufferedReader, BufferedWriter]]:
        """Write the document to ``stream``.

        Args:
            stream: A path or a stream-like target.

        Returns:
            A ``(flag, stream)`` pair.
            NOTE(review): the meaning of the boolean flag is not visible
            here -- confirm against the writer implementation.
        """
3 changes: 3 additions & 0 deletions PyPDF2/_reader.py
Expand Up @@ -972,6 +972,7 @@ def _build_outline_item(self, node: DictionaryObject) -> Optional[Destination]:
# absolute value = num. visible children
# positive = open/unfolded, negative = closed/folded
outline_item[NameObject("/Count")] = node["/Count"]
outline_item.node = node
return outline_item

@property
Expand Down Expand Up @@ -1389,6 +1390,8 @@ def cache_indirect_object(
raise PdfReadError(msg)
logger_warning(msg, __name__)
self.resolved_objects[(generation, idnum)] = obj
if obj is not None:
obj.indirect_reference = IndirectObject(idnum, generation, self)
return obj

def cacheIndirectObject(
Expand Down

0 comments on commit 74b8a63

Please sign in to comment.