Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: NameObject doesn't handle all values correctly #2601

Open
wants to merge 19 commits into
base: main
Choose a base branch
from
68 changes: 24 additions & 44 deletions pypdf/generic/_base.py
Expand Up @@ -43,7 +43,12 @@
read_until_regex,
str_,
)
from ..errors import STREAM_TRUNCATED_PREMATURELY, PdfReadError, PdfStreamError
from ..errors import (
STREAM_TRUNCATED_PREMATURELY,
PdfReadError,
PdfStreamError,
PyPdfError,
)

__author__ = "Mathieu Fenniak"
__author_email__ = "biziqe@mathieu.fenniak.net"
Expand Down Expand Up @@ -580,15 +585,14 @@ def write_to_stream(


class NameObject(str, PdfObject): # noqa: SLOT000
encoding = "utf8"
delimiter_pattern = re.compile(rb"\s+|[\(\)<>\[\]{}/%]")
surfix = b"/"
renumber_table: ClassVar[Dict[str, bytes]] = {
"#": b"#23",
"(": b"#28",
")": b"#29",
"/": b"#2F",
"%": b"#25",
**{chr(i): f"#{i:02X}".encode() for i in range(33)},
renumber_table: ClassVar[Dict[int, bytes]] = {
**{i: f"#{i:02X}".encode() for i in range(0x21)},
**{i: bytes([i]) for i in range(0x21, 0x7F)},
**{i: f"#{i:02X}".encode() for i in b"#()<>[]{}/%"},
**{i: f"#{i:02X}".encode() for i in range(0x7F, 0x100)},
}

def clone(
Expand All @@ -610,24 +614,19 @@ def write_to_stream(
deprecate_no_replacement(
"the encryption_key parameter of write_to_stream", "5.0.0"
)
if "\0" in self:
raise PyPdfError("Null character is not allowed in NameObject")
stream.write(self.renumber())

def renumber(self) -> bytes:
out = self[0].encode("utf-8")
if out != b"/":
out = self.surfix
if self[0] != "/":
deprecate_no_replacement(
f"Incorrect first char in NameObject, should start with '/': ({self})",
"6.0.0",
)
for c in self[1:]:
if c > "~":
for x in c.encode("utf-8"):
out += f"#{x:02X}".encode()
else:
try:
out += self.renumber_table[c]
except KeyError:
out += c.encode("utf-8")
for c in self[1:].encode(self.encoding):
out += self.renumber_table[c]
return out

@staticmethod
Expand All @@ -643,38 +642,19 @@ def unnumber(sin: bytes) -> bytes:
i = i + 1
return sin

CHARSETS = ("utf-8", "gbk", "latin1")

@staticmethod
def read_from_stream(stream: StreamType, pdf: Any) -> "NameObject": # PdfReader
name = stream.read(1)
if name != NameObject.surfix:
raise PdfReadError("name read error")
name += read_until_regex(stream, NameObject.delimiter_pattern)
name = NameObject.unnumber(name)
try:
# Name objects should represent irregular characters
# with a '#' followed by the symbol's hex number
name = NameObject.unnumber(name)
for enc in NameObject.CHARSETS:
try:
ret = name.decode(enc)
return NameObject(ret)
except Exception:
pass
raise UnicodeDecodeError("", name, 0, 0, "Code Not Found")
except (UnicodeEncodeError, UnicodeDecodeError) as e:
if not pdf.strict:
logger_warning(
f"Illegal character in NameObject ({name!r}), "
"you may need to adjust NameObject.CHARSETS",
__name__,
)
return NameObject(name.decode("charmap"))
else:
raise PdfReadError(
f"Illegal character in NameObject ({name!r}). "
"You may need to adjust NameObject.CHARSETS.",
) from e
name = NameObject(name.decode(NameObject.encoding))
except UnicodeDecodeError:
name = NameObject(name.decode("latin1"))
name.encoding = "latin1"
return name


def encode_pdfdocencoding(unicode_string: str) -> bytes:
Expand Down
60 changes: 47 additions & 13 deletions tests/test_generic.py
Expand Up @@ -9,7 +9,7 @@

from pypdf import PdfMerger, PdfReader, PdfWriter
from pypdf.constants import CheckboxRadioButtonAttributes
from pypdf.errors import PdfReadError, PdfStreamError
from pypdf.errors import PdfReadError, PdfStreamError, PyPdfError
from pypdf.generic import (
AnnotationBuilder,
ArrayObject,
Expand Down Expand Up @@ -185,17 +185,6 @@ def test_name_object(caplog):
with pytest.raises(PdfReadError) as exc:
NameObject.read_from_stream(stream, None)
assert exc.value.args[0] == "name read error"
assert (
NameObject.read_from_stream(
BytesIO(b"/A;Name_With-Various***Characters?"), None
)
== "/A;Name_With-Various***Characters?"
)
assert (
NameObject.read_from_stream(BytesIO(b"/paired#28#29parentheses"), None)
== "/paired()parentheses"
)
assert NameObject.read_from_stream(BytesIO(b"/A#42"), None) == "/AB"

assert (
NameObject.read_from_stream(
Expand Down Expand Up @@ -223,7 +212,6 @@ def test_name_object(caplog):
NameObject("/hello").write_to_stream(b)
assert bytes(b.getbuffer()) == b"/hello"

caplog.clear()
b = BytesIO()
with pytest.raises(DeprecationWarning):
NameObject("hello").write_to_stream(b)
Expand All @@ -239,6 +227,52 @@ def test_name_object(caplog):
assert bytes(b.getbuffer()) == b"/#E4#BD#A0#E5#A5#BD#E4#B8#96#E7#95#8C#20#28#25#29"
assert caplog.text == ""

# ISO/DIS 32000-2 Table 4: Examples of literal names
for b, n in (
(b"/Name1", "Name1"),
(b"/ASomewhatLongerName", "ASomewhatLongerName"),
(b"/A;Name_With-Various***Characters?", "A;Name_With-Various***Characters?"),
(b"/1.2", "1.2"),
(b"/$$", "$$"),
(b"/@pattern", "@pattern"),
(b"/.notdef", ".notdef"),
(b"/Lime#20Green", "Lime Green"),
(b"/paired#28#29parentheses", "paired()parentheses"),
(b"/The_Key_of_F#23_Minor", "The_Key_of_F#_Minor"),
):
assert (NameObject.read_from_stream(BytesIO(b), None)) == "/" + n
bio = BytesIO()
NameObject("/" + n).write_to_stream(bio)
assert bio.getbuffer() == b
assert (NameObject.read_from_stream(BytesIO(b"/A#42"), None)) == "/" + "AB"

with pytest.raises(PyPdfError):
NameObject("/\0").write_to_stream(BytesIO())

# Round-trip every allowed character (0x01-0xFF; NUL is rejected above) through write and read
NameObject.encoding = "latin1"
Rak424 marked this conversation as resolved.
Show resolved Hide resolved
value = "/" + bytes(range(1, 0x100)).decode("latin1")
bio = BytesIO()
NameObject(value).write_to_stream(bio)
bio.seek(0)
assert (
bio.read()
== b'/#01#02#03#04#05#06#07#08#09#0A#0B#0C#0D#0E#0F#10#11#12#13#14#15#16#17#18#19#1A#1B#1C#1D#1E#1F#20!"#23$#25'
b"&'#28#29*+,-.#2F0123456789:;#3C=#3E?@ABCDEFGHIJKLMNOPQRSTUVWXYZ#5B\\#5D^_`abcdefghijklmnopqrstuvwxyz#7B|#7D~#"
b"7F#80#81#82#83#84#85#86#87#88#89#8A#8B#8C#8D#8E#8F#90#91#92#93#94#95#96#97#98#99#9A#9B#9C#9D#9E#9F#A0#A1#A2#A"
b"3#A4#A5#A6#A7#A8#A9#AA#AB#AC#AD#AE#AF#B0#B1#B2#B3#B4#B5#B6#B7#B8#B9#BA#BB#BC#BD#BE#BF#C0#C1#C2#C3#C4#C5#C6#C7"
b"#C8#C9#CA#CB#CC#CD#CE#CF#D0#D1#D2#D3#D4#D5#D6#D7#D8#D9#DA#DB#DC#DD#DE#DF#E0#E1#E2#E3#E4#E5#E6#E7#E8#E9#EA#EB#"
b"EC#ED#EE#EF#F0#F1#F2#F3#F4#F5#F6#F7#F8#F9#FA#FB#FC#FD#FE#FF"
)
bio.seek(0)
assert NameObject.read_from_stream(bio, None) == value
NameObject.encoding = "utf8"
bio.seek(0)
name = NameObject.read_from_stream(bio, None)
assert name == value
assert name.encoding == "latin1"
assert NameObject.encoding == "utf8"


def test_destination_fit_r():
d = Destination(
Expand Down