Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: NameObject doesn't handle all values correctly #2601

Open
wants to merge 19 commits into
base: main
Choose a base branch
from
35 changes: 16 additions & 19 deletions pypdf/generic/_base.py
Expand Up @@ -585,12 +585,14 @@ def write_to_stream(


class NameObject(str, PdfObject): # noqa: SLOT000
encoding = "utf8"
delimiter_pattern = re.compile(rb"\s+|[\(\)<>\[\]{}/%]")
surfix = b"/"
renumber_table: ClassVar[Dict[str, bytes]] = {
**{chr(i): f"#{i:02X}".encode() for i in range(0x21)},
**{chr(i): f"#{i:02X}".encode() for i in b"#()<>[]{}/%"},
**{chr(i): f"#{i:02X}".encode() for i in range(0x7F, 0x100)},
renumber_table: ClassVar[Dict[int, bytes]] = {
**{i: f"#{i:02X}".encode() for i in range(0x21)},
**{i: bytes([i]) for i in range(0x21, 0x7F)},
**{i: f"#{i:02X}".encode() for i in b"#()<>[]{}/%"},
**{i: f"#{i:02X}".encode() for i in range(0x7F, 0x100)},
}

def clone(
Expand All @@ -617,25 +619,14 @@ def write_to_stream(
stream.write(self.renumber())

def renumber(self) -> bytes:
out = b"/"
out = self.surfix
if self[0] != "/":
deprecate_no_replacement(
f"Incorrect first char in NameObject, should start with '/': ({self})",
"6.0.0",
)
for c in self[1:]:
try:
out += self.renumber_table[c]
except KeyError:
try:
out += c.encode("latin1")
except UnicodeEncodeError:
deprecate_no_replacement(
f"Only 8-bit characters are allowed by specs in NameObject: ({self})",
"6.0.0",
)
for x in c.encode("utf-8"):
out += f"#{x:02X}".encode()
for c in self[1:].encode(self.encoding):
out += self.renumber_table[c]
return out

@staticmethod
Expand All @@ -657,7 +648,13 @@ def read_from_stream(stream: StreamType, pdf: Any) -> "NameObject": # PdfReader
if name != NameObject.surfix:
raise PdfReadError("name read error")
name += read_until_regex(stream, NameObject.delimiter_pattern)
return NameObject(NameObject.unnumber(name).decode("latin1"))
name = NameObject.unnumber(name)
try:
name = NameObject(name.decode(NameObject.encoding))
except UnicodeDecodeError:
name = NameObject(name.decode("latin1"))
name.encoding = "latin1"
return name


def encode_pdfdocencoding(unicode_string: str) -> bytes:
Expand Down
39 changes: 20 additions & 19 deletions tests/test_generic.py
@@ -1,5 +1,5 @@
"""Test the pypdf.generic module."""
import warnings

from copy import deepcopy
from io import BytesIO
from pathlib import Path
Expand Down Expand Up @@ -185,17 +185,6 @@ def test_name_object(caplog):
with pytest.raises(PdfReadError) as exc:
NameObject.read_from_stream(stream, None)
assert exc.value.args[0] == "name read error"
assert (
NameObject.read_from_stream(
BytesIO(b"/A;Name_With-Various***Characters?"), None
)
== "/A;Name_With-Various***Characters?"
)
assert (
NameObject.read_from_stream(BytesIO(b"/paired#28#29parentheses"), None)
== "/paired()parentheses"
)
assert NameObject.read_from_stream(BytesIO(b"/A#42"), None) == "/AB"

assert (
NameObject.read_from_stream(
Expand All @@ -211,7 +200,7 @@ def test_name_object(caplog):
NameObject.read_from_stream(
BytesIO(b"/#e4#bd#a0#e5#a5#bd#e4#b8#96#e7#95#8c"), None
)
) == "/ä½\xa0好ä¸\x96ç\x95\x8c"
) == "/你好世界"

# to test latin-1 (ISO 8859-1) decoding
assert (
Expand All @@ -223,7 +212,6 @@ def test_name_object(caplog):
NameObject("/hello").write_to_stream(b)
assert bytes(b.getbuffer()) == b"/hello"

caplog.clear()
b = BytesIO()
with pytest.raises(DeprecationWarning):
NameObject("hello").write_to_stream(b)
Expand All @@ -234,16 +222,11 @@ def test_name_object(caplog):
assert bytes(b.getbuffer()) == b"/DIJMAC+Arial#20Black#231"
assert caplog.text == ""

warnings.filterwarnings("ignore", category=DeprecationWarning)
b = BytesIO()
NameObject("/你好世界 (%)").write_to_stream(b)
assert bytes(b.getbuffer()) == b"/#E4#BD#A0#E5#A5#BD#E4#B8#96#E7#95#8C#20#28#25#29"
assert caplog.text == ""

warnings.filterwarnings("error", category=DeprecationWarning)
with pytest.raises(DeprecationWarning):
NameObject("/你好世界 (%)").write_to_stream(b)

# ISO/DIS 32000-2 Table 4: Examples of literal names
for b, n in (
(b"/Name1", "Name1"),
Expand All @@ -266,11 +249,29 @@ def test_name_object(caplog):
with pytest.raises(PyPdfError):
NameObject("/\0").write_to_stream(BytesIO())

# Test that every allowed byte value (0x01-0xFF) round-trips through write_to_stream / read_from_stream
NameObject.encoding = "latin1"
Rak424 marked this conversation as resolved.
Show resolved Hide resolved
value = "/" + bytes(range(1, 0x100)).decode("latin1")
bio = BytesIO()
NameObject(value).write_to_stream(bio)
bio.seek(0)
assert (
bio.read()
== b'/#01#02#03#04#05#06#07#08#09#0A#0B#0C#0D#0E#0F#10#11#12#13#14#15#16#17#18#19#1A#1B#1C#1D#1E#1F#20!"#23$#25'
b"&'#28#29*+,-.#2F0123456789:;#3C=#3E?@ABCDEFGHIJKLMNOPQRSTUVWXYZ#5B\\#5D^_`abcdefghijklmnopqrstuvwxyz#7B|#7D~#"
b"7F#80#81#82#83#84#85#86#87#88#89#8A#8B#8C#8D#8E#8F#90#91#92#93#94#95#96#97#98#99#9A#9B#9C#9D#9E#9F#A0#A1#A2#A"
b"3#A4#A5#A6#A7#A8#A9#AA#AB#AC#AD#AE#AF#B0#B1#B2#B3#B4#B5#B6#B7#B8#B9#BA#BB#BC#BD#BE#BF#C0#C1#C2#C3#C4#C5#C6#C7"
b"#C8#C9#CA#CB#CC#CD#CE#CF#D0#D1#D2#D3#D4#D5#D6#D7#D8#D9#DA#DB#DC#DD#DE#DF#E0#E1#E2#E3#E4#E5#E6#E7#E8#E9#EA#EB#"
b"EC#ED#EE#EF#F0#F1#F2#F3#F4#F5#F6#F7#F8#F9#FA#FB#FC#FD#FE#FF"
)
bio.seek(0)
assert NameObject.read_from_stream(bio, None) == value
NameObject.encoding = "utf8"
bio.seek(0)
name = NameObject.read_from_stream(bio, None)
assert name == value
assert name.encoding == "latin1"
assert NameObject.encoding == "utf8"


def test_destination_fit_r():
Expand Down