ENH: Tolerate PDFs whose xref entries point to invalid objects (#2335)

pubpub-zz · web-flow · commit 3a6e4d0d4235 · 2024-03-30T10:00:06.000+01:00
Closes #2326
diff --git a/pypdf/_reader.py b/pypdf/_reader.py
@@ -573,6 +573,25 @@ def read(self, stream: StreamType) -> None:
                     # non-zero-index is actually correct
             stream.seek(loc, 0)  # return to where it was
 
+        # drop xref entries whose offset does not point to a valid object header - cf #2326
+        if not self.strict:
+            loc = stream.tell()
+            for gen, xref_entry in self.xref.items():
+                if gen == 65535:
+                    continue
+                ids = list(xref_entry.keys())
+                for id in ids:
+                    stream.seek(xref_entry[id], 0)
+                    try:
+                        self.read_object_header(stream)
+                    except ValueError:
+                        logger_warning(
+                            f"Ignoring wrong pointing object {id} {gen} (offset {xref_entry[id]})",
+                            __name__,
+                        )
+                        del xref_entry[id]  # we can delete the id, we are parsing ids
+            stream.seek(loc, 0)  # return to where it was
+
     def _basic_validation(self, stream: StreamType) -> None:
         """Ensure file is not empty. Read at most 5 bytes."""
         stream.seek(0, os.SEEK_SET)
diff --git a/tests/test_reader.py b/tests/test_reader.py
@@ -268,7 +268,9 @@ def test_get_images(src, expected_images):
             False,
             -1,
             False,
-            ["startxref on same line as offset"],
+            [
+                "startxref on same line as offset",
+            ],
         ),
         (
             False,
@@ -322,11 +324,12 @@ def test_get_images_raw(
         b"%%%%EOF"
     )
     pdf_data = pdf_data % (
-        pdf_data.find(b"1 0 obj"),
-        pdf_data.find(b"2 0 obj"),
-        pdf_data.find(b"3 0 obj"),
-        pdf_data.find(b"4 0 obj"),
-        pdf_data.find(b"5 0 obj"),
+        # subtract 1 from each find() offset: the leading "%%" collapses to a single "%" once formatted
+        pdf_data.find(b"1 0 obj") - 1,
+        pdf_data.find(b"2 0 obj") - 1,
+        pdf_data.find(b"3 0 obj") - 1,
+        pdf_data.find(b"4 0 obj") - 1,
+        pdf_data.find(b"5 0 obj") - 1,
         b"/Prev 0 " if with_prev_0 else b"",
         # startx_correction should be -1 due to double % at the beginning
         # inducing an error on startxref computation
@@ -593,11 +596,11 @@ def test_read_unknown_zero_pages(caplog):
         b"%%%%EOF"
     )
     pdf_data = pdf_data % (
-        pdf_data.find(b"1 0 obj"),
-        pdf_data.find(b"2 0 obj"),
-        pdf_data.find(b"3 0 obj"),
-        pdf_data.find(b"4 0 obj"),
-        pdf_data.find(b"5 0 obj"),
+        pdf_data.find(b"1 0 obj") - 1,
+        pdf_data.find(b"2 0 obj") - 1,
+        pdf_data.find(b"3 0 obj") - 1,
+        pdf_data.find(b"4 0 obj") - 1,
+        pdf_data.find(b"5 0 obj") - 1,
         pdf_data.find(b"xref") - 1,
     )
     pdf_stream = io.BytesIO(pdf_data)
diff --git a/tests/test_workflows.py b/tests/test_workflows.py
@@ -547,7 +547,11 @@ def test_get_fields_warns(tmp_path, caplog, url, name):
         retrieved_fields = reader.get_fields(fileobj=fp)
 
     assert retrieved_fields == {}
-    assert normalize_warnings(caplog.text) == ["Object 2 0 not defined."]
+    assert normalize_warnings(caplog.text) == [
+        "Ignoring wrong pointing object 1 65536 (offset 0)",
+        "Ignoring wrong pointing object 2 65536 (offset 0)",
+        "Object 2 0 not defined.",
+    ]
 
 
 @pytest.mark.enable_socket()