Skip to content

Commit

Permalink
ENH: Allow multiple charsets for NameObject.read_from_stream (#2585)
Browse files: browse the repository at this point in the history
Closes #2323
  • Loading branch information
pubpub-zz committed Apr 6, 2024
1 parent 0f7c8fe commit 956fd03
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 14 deletions.
18 changes: 14 additions & 4 deletions pypdf/generic/_base.py
Expand Up @@ -615,7 +615,10 @@ def write_to_stream(
def renumber(self) -> bytes:
out = self[0].encode("utf-8")
if out != b"/":
deprecate_no_replacement(f"Incorrect first char in NameObject, should start with '/': ({self})", "6.0.0")
deprecate_no_replacement(
f"Incorrect first char in NameObject, should start with '/': ({self})",
"6.0.0",
)
for c in self[1:]:
if c > "~":
for x in c.encode("utf-8"):
Expand All @@ -640,6 +643,8 @@ def unnumber(sin: bytes) -> bytes:
i = i + 1
return sin

CHARSETS = ("utf-8", "gbk", "latin1")

@staticmethod
def read_from_stream(stream: StreamType, pdf: Any) -> "NameObject": # PdfReader
name = stream.read(1)
Expand All @@ -650,7 +655,7 @@ def read_from_stream(stream: StreamType, pdf: Any) -> "NameObject": # PdfReader
# Name objects should represent irregular characters
# with a '#' followed by the symbol's hex number
name = NameObject.unnumber(name)
for enc in ("utf-8", "gbk"):
for enc in NameObject.CHARSETS:
try:
ret = name.decode(enc)
return NameObject(ret)
Expand All @@ -659,11 +664,16 @@ def read_from_stream(stream: StreamType, pdf: Any) -> "NameObject": # PdfReader
raise UnicodeDecodeError("", name, 0, 0, "Code Not Found")
except (UnicodeEncodeError, UnicodeDecodeError) as e:
if not pdf.strict:
logger_warning(f"Illegal character in Name Object ({name!r})", __name__)
logger_warning(
f"Illegal character in NameObject ({name!r}), "
"you may need to adjust NameObject.CHARSETS",
__name__,
)
return NameObject(name.decode("charmap"))
else:
raise PdfReadError(
f"Illegal character in Name Object ({name!r})"
f"Illegal character in NameObject ({name!r}). "
"You may need to adjust NameObject.CHARSETS.",
) from e


Expand Down
30 changes: 20 additions & 10 deletions tests/test_generic.py
@@ -1,5 +1,6 @@
"""Test the pypdf.generic module."""

from copy import deepcopy
from io import BytesIO
from pathlib import Path
from unittest.mock import patch
Expand Down Expand Up @@ -212,6 +213,11 @@ def test_name_object(caplog):
)
) == "/你好世界"

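# to test the latin-1 (ISO 8859-1) fallback: byte 0xAE must decode to "®" (note: this is not PDF StandardEncoding, where 0xAE is the "fi" ligature)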
assert (
NameObject.read_from_stream(BytesIO(b"/DocuSign\xae"), None)
) == "/DocuSign®"

# test write
b = BytesIO()
NameObject("/hello").write_to_stream(b)
Expand Down Expand Up @@ -1036,16 +1042,20 @@ def test_checkboxradiobuttonattributes_opt():


def test_name_object_invalid_decode():
stream = BytesIO(b"/\x80\x02\x03")

# strict:
with pytest.raises(PdfReadError) as exc:
NameObject.read_from_stream(stream, ReaderDummy(strict=True))
assert "Illegal character in Name Object" in exc.value.args[0]

# non-strict:
stream.seek(0)
NameObject.read_from_stream(stream, ReaderDummy(strict=False))
charsets = deepcopy(NameObject.CHARSETS)
try:
NameObject.CHARSETS = ("utf-8",)
stream = BytesIO(b"/\x80\x02\x03")
# strict:
with pytest.raises(PdfReadError) as exc:
NameObject.read_from_stream(stream, ReaderDummy(strict=True))
assert "Illegal character in NameObject " in exc.value.args[0]

# non-strict:
stream.seek(0)
NameObject.read_from_stream(stream, ReaderDummy(strict=False))
finally:
NameObject.CHARSETS = charsets


def test_indirect_object_invalid_read():
Expand Down

0 comments on commit 956fd03

Please sign in to comment.