Skip to content

Commit

Permalink
ROB: Tolerate "truncated" xref (#2580)
Browse files Browse the repository at this point in the history
Closes #2575.
  • Loading branch information
pubpub-zz committed Apr 5, 2024
1 parent 6152893 commit 0f7c8fe
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 0 deletions.
11 changes: 11 additions & 0 deletions pypdf/_reader.py
Expand Up @@ -677,6 +677,14 @@ def _read_standard_xref_table(self, stream: StreamType) -> None:
read_non_whitespace(stream)
stream.seek(-1, 1)
size = cast(int, read_object(stream, self))
if not isinstance(size, int):
logger_warning(
"Invalid/Truncated xref table. Rebuilding it.",
__name__,
)
self._rebuild_xref_table(stream)
stream.read()
return
read_non_whitespace(stream)
stream.seek(-1, 1)
cnt = 0
Expand Down Expand Up @@ -815,6 +823,9 @@ def _read_xref_tables_and_trailers(

def _read_xref(self, stream: StreamType) -> Optional[int]:
self._read_standard_xref_table(stream)
if stream.read(1) == b"":
return None
stream.seek(-1, 1)
read_non_whitespace(stream)
stream.seek(-1, 1)
new_trailer = cast(Dict[str, Any], read_object(stream, self))
Expand Down
8 changes: 8 additions & 0 deletions tests/test_reader.py
Expand Up @@ -1508,3 +1508,11 @@ def test_corrupted_xref():
name = "iss2516.pdf"
reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
assert reader.root_object["/Type"] == "/Catalog"


@pytest.mark.enable_socket()
def test_truncated_xref(caplog):
    """Check that a PDF with a truncated xref table still opens with a warning.

    Regression test for issue #2575: the sample file is fetched through
    ``get_data_from_url`` and handed to ``PdfReader``; the reader is expected
    to log that it is rebuilding the xref table rather than fail.
    """
    pdf_bytes = get_data_from_url(
        "https://github.com/py-pdf/pypdf/files/14843553/002-trivial-libre-office-writer-broken.pdf",
        name="iss2575.pdf",
    )
    # Only construction matters here: the warning must be emitted while the
    # reader is being built, so the instance itself is discarded.
    PdfReader(BytesIO(pdf_bytes))
    expected_warning = "Invalid/Truncated xref table. Rebuilding it."
    assert expected_warning in caplog.text

0 comments on commit 0f7c8fe

Please sign in to comment.