Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: Ambiguous translated references #2558

Draft pull request: wants to merge 3 commits into base branch `main`.
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 2 additions & 1 deletion pypdf/_protocols.py
Expand Up @@ -3,6 +3,7 @@
from abc import abstractmethod
from pathlib import Path
from typing import IO, Any, Dict, List, Optional, Tuple, Union
from weakref import WeakKeyDictionary

try:
# Python 3.8+: https://peps.python.org/pep-0586
Expand Down Expand Up @@ -78,7 +79,7 @@ def trailer(self) -> Dict[str, Any]:

class PdfWriterProtocol(PdfCommonDocProtocol, Protocol):
_objects: List[Any]
_id_translated: Dict[int, Dict[int, int]]
_id_translated: "WeakKeyDictionary[PdfReaderProtocol, Dict[int, int]]"

@abstractmethod
def write(self, stream: Union[Path, StrByteType]) -> Tuple[bool, IO[Any]]:
Expand Down
16 changes: 9 additions & 7 deletions pypdf/_writer.py
Expand Up @@ -52,7 +52,9 @@
Union,
cast,
)
from weakref import WeakKeyDictionary

from ._protocols import PdfReaderProtocol
from ._cmap import build_char_map_from_dict
from ._doc_common import PdfDocCommon
from ._encryption import EncryptAlgorithm, Encryption
Expand Down Expand Up @@ -178,7 +180,7 @@ def __init__(
self._idnum_hash: Dict[bytes, IndirectObject] = {}
"""Maps hash values of indirect objects to their IndirectObject instances."""

self._id_translated: Dict[int, Dict[int, int]] = {}
self._id_translated: "WeakKeyDictionary[PdfReaderProtocol, dict[int, int]]" = WeakKeyDictionary()

# The root of our page tree node.
pages = DictionaryObject()
Expand Down Expand Up @@ -389,7 +391,7 @@ def _add_page(
# page, we need to create a new dictionary for the page, however the
# objects below (including content) are not duplicated:
try: # delete an already existing page
del self._id_translated[id(page_org.indirect_reference.pdf)][ # type: ignore
del self._id_translated[page_org.indirect_reference.pdf][ # type: ignore
page_org.indirect_reference.idnum # type: ignore
]
except Exception:
Expand Down Expand Up @@ -2451,7 +2453,7 @@ def merge(
ArrayObject,
cast(DictionaryObject, self._root_object["/AcroForm"])["/Fields"],
)
trslat = self._id_translated[id(reader)]
trslat = self._id_translated[reader]
try:
for f in reader.root_object["/AcroForm"]["/Fields"]: # type: ignore
try:
Expand Down Expand Up @@ -2564,7 +2566,7 @@ def add_filtered_articles(
else:
thr = thr.get_object()
if thr.indirect_reference.idnum not in self._id_translated[
id(reader)
reader
] and fltr.search((thr["/I"] if "/I" in thr else {}).get("/Title", "")):
self._add_articles_thread(thr, pages, reader)

Expand Down Expand Up @@ -2780,15 +2782,15 @@ def reset_translation(
if set to None or omitted, all tables will be reset.
"""
if reader is None:
self._id_translated = {}
self._id_translated = WeakKeyDictionary()
elif isinstance(reader, PdfReader):
try:
del self._id_translated[id(reader)]
del self._id_translated[reader]
except Exception:
pass
elif isinstance(reader, IndirectObject):
try:
del self._id_translated[id(reader.pdf)]
del self._id_translated[reader.pdf]
except Exception:
pass
else:
Expand Down
19 changes: 9 additions & 10 deletions pypdf/generic/_base.py
Expand Up @@ -124,19 +124,18 @@ def _reference_clone(
return clone
i = len(pdf_dest._objects) + 1
if ind is not None:
if id(ind.pdf) not in pdf_dest._id_translated:
pdf_dest._id_translated[id(ind.pdf)] = {}
pdf_dest._id_translated[id(ind.pdf)]["PreventGC"] = ind.pdf # type: ignore
if ind.pdf not in pdf_dest._id_translated:
pdf_dest._id_translated[ind.pdf] = {}
if (
not force_duplicate
and ind.idnum in pdf_dest._id_translated[id(ind.pdf)]
and ind.idnum in pdf_dest._id_translated[ind.pdf]
):
obj = pdf_dest.get_object(
pdf_dest._id_translated[id(ind.pdf)][ind.idnum]
pdf_dest._id_translated[ind.pdf][ind.idnum]
)
assert obj is not None
return obj
pdf_dest._id_translated[id(ind.pdf)][ind.idnum] = i
pdf_dest._id_translated[ind.pdf][ind.idnum] = i
pdf_dest._objects.append(clone)
clone.indirect_reference = IndirectObject(i, 0, pdf_dest)
return clone
Expand Down Expand Up @@ -250,11 +249,11 @@ def clone(
if self.pdf == pdf_dest and not force_duplicate:
# Already duplicated and no extra duplication required
return self
if id(self.pdf) not in pdf_dest._id_translated:
pdf_dest._id_translated[id(self.pdf)] = {}
if self.pdf not in pdf_dest._id_translated:
pdf_dest._id_translated[self.pdf] = {}

if self.idnum in pdf_dest._id_translated[id(self.pdf)]:
dup = pdf_dest.get_object(pdf_dest._id_translated[id(self.pdf)][self.idnum])
if self.idnum in pdf_dest._id_translated[self.pdf]:
dup = pdf_dest.get_object(pdf_dest._id_translated[self.pdf][self.idnum])
if force_duplicate:
assert dup is not None
assert dup.indirect_reference is not None
Expand Down