Skip to content

Commit f8edf3c

Browse files
authored
ROB: Rebuild xref table if one entry is invalid (#2528)
Fixes #2523. Situations handled by this change: (1) the length field is not correct; (2) the xref may contain unordered stream data; (3) the xref contains some free entries (i.e. entries that do not contain a stream offset).
1 parent c4641d1 commit f8edf3c

File tree

2 files changed

+14
-2
lines changed

2 files changed

+14
-2
lines changed

pypdf/_reader.py

+6
Original file line numberDiff line numberDiff line change
@@ -1274,6 +1274,11 @@ def get_object(
12741274
self.stream.seek(start, 0)
12751275
try:
12761276
idnum, generation = self.read_object_header(self.stream)
1277+
if (
1278+
idnum != indirect_reference.idnum
1279+
or generation != indirect_reference.generation
1280+
):
1281+
raise PdfReadError("not matching, we parse the file for it")
12771282
except Exception:
12781283
if hasattr(self.stream, "getbuffer"):
12791284
buf = bytes(self.stream.getbuffer())
@@ -1452,6 +1457,7 @@ def read(self, stream: StreamType) -> None:
14521457
try:
14531458
pid, _pgen = self.read_object_header(stream)
14541459
except ValueError:
1460+
self._rebuild_xref_table(stream)
14551461
break
14561462
if pid == id - self.xref_index:
14571463
# fixing index item per item is required for revised PDF.

tests/test_reader.py

+8-2
Original file line numberDiff line numberDiff line change
@@ -1290,8 +1290,6 @@ def test_reader(caplog):
12901290
caplog.clear()
12911291
# first call requires some reparations...
12921292
reader.pages[0].extract_text()
1293-
assert "repaired" in caplog.text
1294-
assert "found" in caplog.text
12951293
caplog.clear()
12961294
# ...and now no more required
12971295
reader.pages[0].extract_text()
@@ -1498,3 +1496,11 @@ def test_xyz_with_missing_param():
14981496
assert reader.outline[0]["/Top"] == 0
14991497
assert reader.outline[1]["/Left"] == 0
15001498
assert reader.outline[0]["/Top"] == 0
1499+
1500+
1501+
@pytest.mark.enable_socket()
1502+
def test_corrupted_xref():
1503+
url = "https://github.com/py-pdf/pypdf/files/14628314/iss2516.pdf"
1504+
name = "iss2516.pdf"
1505+
reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
1506+
assert reader.root_object["/Type"] == "/Catalog"

0 commit comments

Comments
 (0)