py-pdf · pubpub-zz · Sep 12, 2023 · Sep 16, 2023 · Sep 17, 2023 · Oct 3, 2023
diff --git a/pypdf/_protocols.py b/pypdf/_protocols.py
@@ -76,6 +76,9 @@ def write(self, stream: Union[Path, StrByteType]) -> Tuple[bool, IO]:
     def _add_object(self, obj: Any) -> Any:
         ...
 
+    def _replace_object(self, indirect_reference: Any, obj: Any) -> Any:
+        ...
+
     @property
     def pages(self) -> List[Any]:
         ...

diff --git a/pypdf/_reader.py b/pypdf/_reader.py
@@ -39,9 +39,7 @@
     Callable,
     Dict,
     Iterable,
-    Iterator,
     List,
-    Mapping,
     Optional,
     Tuple,
     Union,
@@ -87,6 +85,7 @@
 )
 from .generic import (
     ArrayObject,
+    AttachmentBytesDictionary,
     BooleanObject,
     ContentStream,
     DecodedStreamObject,
@@ -98,6 +97,7 @@
     FloatObject,
     IndirectObject,
     NameObject,
+    NameTree,
     NullObject,
     NumberObject,
     PdfObject,
@@ -2206,107 +2206,50 @@ def rename_form_topname(self, name: str) -> Optional[DictionaryObject]:
         interim[NameObject("/T")] = TextStringObject(name)
         return interim
 
-    @property
-    def attachments(self) -> Mapping[str, List[bytes]]:
-        return LazyDict(
-            {
-                name: (self._get_attachment_list, name)
-                for name in self._list_attachments()
-            }
-        )
-
-    def _list_attachments(self) -> List[str]:
+    def _get_embedded_files_root(self) -> Optional[NameTree]:
         """
-        Retrieves the list of filenames of file attachments.
-
-        Returns:
-            list of filenames
+        Return the /EmbeddedFiles root wrapped in a NameTree object,
+        or None if the catalog has no /Names or no /EmbeddedFiles entry.
         """
         catalog = cast(DictionaryObject, self.trailer["/Root"])
-        # From the catalog get the embedded file names
-        try:
-            filenames = cast(
-                ArrayObject,
-                cast(
-                    DictionaryObject,
-                    cast(DictionaryObject, catalog["/Names"])["/EmbeddedFiles"],
-                )["/Names"],
-            )
-        except KeyError:
-            return []
-        attachments_names = [f for f in filenames if isinstance(f, str)]
-        return attachments_names
-
-    def _get_attachment_list(self, name: str) -> List[bytes]:
-        out = self._get_attachments(name)[name]
-        if isinstance(out, list):
-            return out
-        return [out]
-
-    def _get_attachments(
-        self, filename: Optional[str] = None
-    ) -> Dict[str, Union[bytes, List[bytes]]]:
+        if "/Names" not in catalog:
+            return None
+        ef = cast(DictionaryObject, catalog["/Names"]).get("/EmbeddedFiles", None)
+        if ef is None:
+            return None
+        efo = ef.get_object()
+        # The string below is disabled code (in-place NameTree conversion): not for reader
         """
-        Retrieves all or selected file attachments of the PDF as a dictionary of file names
-        and the file data as a bytestring.
-
-        Args:
-            filename: If filename is None, then a dictionary of all attachments
-                will be returned, where the key is the filename and the value
-                is the content. Otherwise, a dictionary with just a single key
-                - the filename - and its content will be returned.
+            if not isinstance(efo,NameTree):
+            if isinstance(ef,IndirectObject):
+                ef.replace_object(efo)
+            else:
+                cast(DictionaryObject,catalog["/Names"])[
+                    NameObject("/EmbeddedFiles")] = NameTree(efo)
+        """
+        return NameTree(efo)
 
+    @property
+    def attachments_names(self) -> List[str]:
+        """
         Returns:
-            dictionary of filename -> Union[bytestring or List[ByteString]]
-            if the filename exists multiple times a List of the different version will be provided
+            List of the names of the embedded files (a new list on each access)
         """
-        catalog = cast(DictionaryObject, self.trailer["/Root"])
-        # From the catalog get the embedded file names
-        try:
-            filenames = cast(
-                ArrayObject,
-                cast(
-                    DictionaryObject,
-                    cast(DictionaryObject, catalog["/Names"])["/EmbeddedFiles"],
-                )["/Names"],
-            )
-        except KeyError:
-            return {}
-        attachments: Dict[str, Union[bytes, List[bytes]]] = {}
-        # Loop through attachments
-        for i in range(len(filenames)):
-            f = filenames[i]
-            if isinstance(f, str):
-                if filename is not None and f != filename:
-                    continue
-                name = f
-                f_dict = filenames[i + 1].get_object()
-                f_data = f_dict["/EF"]["/F"].get_data()
-                if name in attachments:
-                    if not isinstance(attachments[name], list):
-                        attachments[name] = [attachments[name]]  # type:ignore
-                    attachments[name].append(f_data)  # type:ignore
-                else:
-                    attachments[name] = f_data
-        return attachments
+        return list(self.attachments.keys())
 
+    @property
+    def attachments(self) -> AttachmentBytesDictionary:
+        """
+        Extract the /EF entries as bytes from the embedded files.
+        Returns:
+            Dictionary with the filenames as keys and the file content as bytes,
+            extra data can be accessed with Attachmentbytes extra properties (.name,
+            .list_rf_names(), .get_embeddedfile(), .all_files)
 
-class LazyDict(Mapping):
-    def __init__(self, *args: Any, **kw: Any) -> None:
-        self._raw_dict = dict(*args, **kw)
-
-    def __getitem__(self, key: str) -> Any:
-        func, arg = self._raw_dict.__getitem__(key)
-        return func(arg)
-
-    def __iter__(self) -> Iterator[Any]:
-        return iter(self._raw_dict)
-
-    def __len__(self) -> int:
-        return len(self._raw_dict)
-
-    def __str__(self) -> str:
-        return f"LazyDict(keys={list(self.keys())})"
+        Note:
+            If you want to access /RF, see the extra properties above (e.g. .list_rf_names()) - TODO confirm
+        """
+        return AttachmentBytesDictionary(self._get_embedded_files_root())
 
 
 class PdfFileReader(PdfReader):  # deprecated

diff --git a/pypdf/_writer.py b/pypdf/_writer.py
@@ -95,6 +95,7 @@
 from .generic import (
     PAGE_FIT,
     ArrayObject,
+    AttachmentBytesDictionary,
     BooleanObject,
     ByteStringObject,
     ContentStream,
@@ -105,6 +106,7 @@
     FloatObject,
     IndirectObject,
     NameObject,
+    NameTree,
     NullObject,
     NumberObject,
     PdfObject,
@@ -702,7 +704,71 @@
         deprecation_with_replacement("addJS", "add_js", "3.0.0")
         return self.add_js(javascript)
 
-    def add_attachment(self, filename: str, data: Union[str, bytes]) -> None:
+    def _get_embedded_files_root(self) -> Optional[NameTree]:
+        """
+        Return the /EmbeddedFiles root as a NameTree (None if it is absent);
+        a root that is not yet a NameTree is wrapped and stored back in place.
+        """
+        catalog = self._root_object
+        if "/Names" not in catalog:
+            return None
+        names = cast(DictionaryObject, catalog["/Names"])
+        raw = names.get("/EmbeddedFiles", None)
+        if raw is None:
+            return None
+        tree = raw.get_object()
+        if isinstance(tree, NameTree):
+            return tree
+        tree = NameTree(tree)
+        if isinstance(raw, IndirectObject):
+            raw.replace_object(tree)
+        else:
+            names[NameObject("/EmbeddedFiles")] = tree
+        return tree
+
+    def _create_attachment_root(self) -> NameTree:
+        """Return the /EmbeddedFiles name tree, creating any missing level."""
+        root = self._root_object
+        if "/Names" not in root:
+            root[NameObject("/Names")] = self._add_object(DictionaryObject())
+        names = cast(DictionaryObject, root["/Names"])
+        if "/EmbeddedFiles" not in names:
+            names[NameObject("/EmbeddedFiles")] = self._add_object(NameTree())
+        tree = cast(NameTree, names["/EmbeddedFiles"])
+        if not ("/Kids" in tree or "/Names" in tree):
+            tree[NameObject("/Names")] = ArrayObject()
+        return tree
+
+    @property
+    def attachments_names(self) -> List[str]:
+        """
+        Returns:
+            List of the names of the embedded files (a new list on each access)
+        """
+        return list(self.attachments.keys())
+
+    @property
+    def attachments(self) -> AttachmentBytesDictionary:
+        """
+        Extract the /EF entries as bytes from the embedded files.
+        Returns:
+            Dictionary with the filenames as keys and the file content as bytes,
+            extra data can be accessed with Attachmentbytes extra properties (.name,
+            .list_rf_names(), .get_embeddedfile(), .all_files)
+
+        Note:
+            If you want to access /RF, see the extra properties above (e.g. .list_rf_names()) - TODO confirm
+        """
+        return AttachmentBytesDictionary(self._get_embedded_files_root())
+
+    def add_attachment(
+        self,
+        filename: str,
+        data: Union[str, bytes, List[Tuple[str, bytes]]],
+        overwrite: bool = True,
+        fname: Optional[str] = None,
+        desc: str = "",
+    ) -> Optional[DictionaryObject]:
         """
         Embed a file inside the PDF.
 
@@ -711,9 +777,22 @@
         Section 7.11.3
 
         Args:
-            filename: The filename to display.
+            filename: The filename to display (in UTF-16).
             data: The data in the file.
+                a list of (name, content) tuples is stored as related files (/RF)
+            fname: an old style name for "/F" entry (should be ansi). if None, it is derived from filename
+            desc: a description string
+
+        Returns:
+            The filespec DictionaryObject; None if the name already exists and overwrite is False
         """
+        if not overwrite and filename in self.attachments_names:
+            return None
+        if fname is None:
+            st = filename.replace("/", "\\/").replace("\\\\/", "\\/")
+            # "xmlcharrefreplace" only works when encoding: non-ASCII -> &#NNN;
+            fname = st.encode("ascii", errors="xmlcharrefreplace").decode("ascii")
+
         # We need three entries:
         # * The file's data
         # * The /Filespec entry
@@ -731,9 +810,22 @@
         # endstream
         # endobj
 
-        file_entry = DecodedStreamObject()
-        file_entry.set_data(b_(data))
-        file_entry.update({NameObject(PA.TYPE): NameObject("/EmbeddedFile")})
+        if isinstance(data, list):
+            ef_entry = DictionaryObject()
+            a = ArrayObject()
+            ef_entry.update({NameObject("/F"): self._add_object(a)})
+            for fn, da in data:
+                a.append(TextStringObject(fn))
+                file_entry = DecodedStreamObject()
+                file_entry.set_data(b_(da))
+                file_entry.update({NameObject(PA.TYPE): NameObject("/EmbeddedFile")})
+                a.append(self._add_object(file_entry))
+        else:
+            file_entry = DecodedStreamObject()
+            file_entry.set_data(b_(data))
+            file_entry.update({NameObject(PA.TYPE): NameObject("/EmbeddedFile")})
+            ef_entry = DictionaryObject()
+            ef_entry.update({NameObject("/F"): self._add_object(file_entry)})
 
         # The Filespec entry
         # Sample:
@@ -744,51 +836,29 @@
         #  /EF << /F 8 0 R >>
         # >>
 
-        ef_entry = DictionaryObject()
-        ef_entry.update({NameObject("/F"): self._add_object(file_entry)})
-
         filespec = DictionaryObject()
         filespec.update(
             {
                 NameObject(PA.TYPE): NameObject("/Filespec"),
-                NameObject(FileSpecificationDictionaryEntries.F): create_string_object(
+                NameObject(FileSpecificationDictionaryEntries.UF): TextStringObject(
                     filename
-                ),  # Perhaps also try TextStringObject
-                NameObject(FileSpecificationDictionaryEntries.EF): ef_entry,
+                ),
+                NameObject(FileSpecificationDictionaryEntries.F): TextStringObject(
+                    fname
+                ),
+                NameObject(FileSpecificationDictionaryEntries.DESC): TextStringObject(
+                    desc
+                ),
             }
         )
-
-        # Then create the entry for the root, as it needs
-        # a reference to the Filespec
-        # Sample:
-        # 1 0 obj
-        # <<
-        #  /Type /Catalog
-        #  /Outlines 2 0 R
-        #  /Pages 3 0 R
-        #  /Names << /EmbeddedFiles << /Names [(hello.txt) 7 0 R] >> >>
-        # >>
-        # endobj
-
-        if CA.NAMES not in self._root_object:
-            self._root_object[NameObject(CA.NAMES)] = self._add_object(
-                DictionaryObject()
-            )
-        if "/EmbeddedFiles" not in cast(DictionaryObject, self._root_object[CA.NAMES]):
-            embedded_files_names_dictionary = DictionaryObject(
-                {NameObject(CA.NAMES): ArrayObject()}
-            )
-            cast(DictionaryObject, self._root_object[CA.NAMES])[
-                NameObject("/EmbeddedFiles")
-            ] = self._add_object(embedded_files_names_dictionary)
+        if isinstance(data, list):
+            filespec[NameObject(FileSpecificationDictionaryEntries.RF)] = ef_entry
         else:
-            embedded_files_names_dictionary = cast(
-                DictionaryObject,
-                cast(DictionaryObject, self._root_object[CA.NAMES])["/EmbeddedFiles"],
-            )
-        cast(ArrayObject, embedded_files_names_dictionary[CA.NAMES]).extend(
-            [create_string_object(filename), filespec]
-        )
+            filespec[NameObject(FileSpecificationDictionaryEntries.EF)] = ef_entry
+
+        nm = self._get_embedded_files_root() or self._create_attachment_root()
+        nm.list_add(filename, filespec, overwrite=True)
+        return filespec
 
     def addAttachment(self, fname: str, fdata: Union[str, bytes]) -> None:  # deprecated
         """
@@ -797,7 +867,7 @@
         .. deprecated:: 1.28.0
         """
         deprecation_with_replacement("addAttachment", "add_attachment", "3.0.0")
-        return self.add_attachment(fname, fdata)
+        self.add_attachment(fname, fdata)
 
     def append_pages_from_reader(
         self,