Skip to content

Commit 3a6e4d0

Browse files
authored
ENH: Tolerate PDF with invalid xref pointed objects (#2335)
Closes #2326
1 parent 7883580 commit 3a6e4d0

File tree

3 files changed

+38
-12
lines changed

3 files changed

+38
-12
lines changed

pypdf/_reader.py

+19
Original file line number | Diff line number | Diff line change
@@ -573,6 +573,25 @@ def read(self, stream: StreamType) -> None:
573573
# non-zero-index is actually correct
574574
stream.seek(loc, 0) # return to where it was
575575

576+
# Remove xref entries whose offset does not point to a valid object header (cf. #2326)
577+
if not self.strict:
578+
loc = stream.tell()
579+
for gen, xref_entry in self.xref.items():
580+
if gen == 65535:
581+
continue
582+
ids = list(xref_entry.keys())
583+
for id in ids:
584+
stream.seek(xref_entry[id], 0)
585+
try:
586+
self.read_object_header(stream)
587+
except ValueError:
588+
logger_warning(
589+
f"Ignoring wrong pointing object {id} {gen} (offset {xref_entry[id]})",
590+
__name__,
591+
)
592+
del xref_entry[id] # we can delete the id, we are parsing ids
593+
stream.seek(loc, 0) # return to where it was
594+
576595
def _basic_validation(self, stream: StreamType) -> None:
577596
"""Ensure file is not empty. Read at most 5 bytes."""
578597
stream.seek(0, os.SEEK_SET)

tests/test_reader.py

+14-11
Original file line number | Diff line number | Diff line change
@@ -268,7 +268,9 @@ def test_get_images(src, expected_images):
268268
False,
269269
-1,
270270
False,
271-
["startxref on same line as offset"],
271+
[
272+
"startxref on same line as offset",
273+
],
272274
),
273275
(
274276
False,
@@ -322,11 +324,12 @@ def test_get_images_raw(
322324
b"%%%%EOF"
323325
)
324326
pdf_data = pdf_data % (
325-
pdf_data.find(b"1 0 obj"),
326-
pdf_data.find(b"2 0 obj"),
327-
pdf_data.find(b"3 0 obj"),
328-
pdf_data.find(b"4 0 obj"),
329-
pdf_data.find(b"5 0 obj"),
327+
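# Subtract 1 from each find() result below: the double % at the beginning of the template becomes a single % after formatting, shifting every offset by one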
328+
pdf_data.find(b"1 0 obj") - 1,
329+
pdf_data.find(b"2 0 obj") - 1,
330+
pdf_data.find(b"3 0 obj") - 1,
331+
pdf_data.find(b"4 0 obj") - 1,
332+
pdf_data.find(b"5 0 obj") - 1,
330333
b"/Prev 0 " if with_prev_0 else b"",
331334
# startx_correction should be -1 due to double % at the beginning
332335
# inducing an error on startxref computation
@@ -593,11 +596,11 @@ def test_read_unknown_zero_pages(caplog):
593596
b"%%%%EOF"
594597
)
595598
pdf_data = pdf_data % (
596-
pdf_data.find(b"1 0 obj"),
597-
pdf_data.find(b"2 0 obj"),
598-
pdf_data.find(b"3 0 obj"),
599-
pdf_data.find(b"4 0 obj"),
600-
pdf_data.find(b"5 0 obj"),
599+
pdf_data.find(b"1 0 obj") - 1,
600+
pdf_data.find(b"2 0 obj") - 1,
601+
pdf_data.find(b"3 0 obj") - 1,
602+
pdf_data.find(b"4 0 obj") - 1,
603+
pdf_data.find(b"5 0 obj") - 1,
601604
pdf_data.find(b"xref") - 1,
602605
)
603606
pdf_stream = io.BytesIO(pdf_data)

tests/test_workflows.py

+5-1
Original file line number | Diff line number | Diff line change
@@ -547,7 +547,11 @@ def test_get_fields_warns(tmp_path, caplog, url, name):
547547
retrieved_fields = reader.get_fields(fileobj=fp)
548548

549549
assert retrieved_fields == {}
550-
assert normalize_warnings(caplog.text) == ["Object 2 0 not defined."]
550+
assert normalize_warnings(caplog.text) == [
551+
"Ignoring wrong pointing object 1 65536 (offset 0)",
552+
"Ignoring wrong pointing object 2 65536 (offset 0)",
553+
"Object 2 0 not defined.",
554+
]
551555

552556

553557
@pytest.mark.enable_socket()

0 commit comments

Comments
 (0)