BUG: Fix stream position on inline image fallback extraction (#3120)

stefan6419846 · web-flow · commit be414c1b8ec9 · 2025-02-12T14:22:48.000+01:00
When the inline image extraction was fixed in #3002, the fallback behavior was accidentally broken: it left the stream positioned before the `EI` marker. This went unnoticed because the tests were not strict enough. In practice, it led to the error: "b'EI' operator met whereas not expected, please share usecase with pypdf dev team"
diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py
@@ -1385,9 +1385,18 @@ def _read_inline_image(self, stream: StreamType) -> Dict[str, Any]:
         ei = stream.read(3)
         stream.seek(-1, 1)
         if ei[0:2] != b"EI" or ei[2:3] not in WHITESPACES:
-            # Deal with wrong/missing `EI` tags.
+            # Deal with wrong/missing `EI` tags. Example: Wrong dimensions specified above.
             stream.seek(savpos, 0)
             data = extract_inline_default(stream)
+            ei = stream.read(3)
+            stream.seek(-1, 1)
+            if ei[0:2] != b"EI" or ei[2:3] not in WHITESPACES:  # pragma: no cover
+                # Check the same condition again. This should never fail as
+                # edge cases are covered by `extract_inline_default` above,
+                # but check this to make sure that we are behind the `EI` afterwards.
+                raise PdfStreamError(
+                    f"Could not extract inline image, even using fallback. Expected 'EI', got {ei!r}"
+                )
         return {"settings": settings, "data": data}
 
     # This overrides the parent method:
diff --git a/tests/test_images.py b/tests/test_images.py
@@ -8,13 +8,14 @@
 from io import BytesIO
 from pathlib import Path
 from typing import Union
+from unittest import mock
 from zipfile import ZipFile
 
 import pytest
 from PIL import Image, ImageChops, ImageDraw
 
 from pypdf import PageObject, PdfReader, PdfWriter
-from pypdf.generic import NameObject, NullObject
+from pypdf.generic import ContentStream, NameObject, NullObject
 
 from . import get_data_from_url
 
@@ -484,3 +485,28 @@ def test_no_filter_with_colorspace_as_list():
 
     page = reader.pages[0]
     page.images.items()
+
+
+def test_contentstream__read_inline_image__fallback_is_successful():
+    stream = ContentStream(stream=None, pdf=None)
+    stream.set_data(
+        b"""Q
+q 9.6 0 0 4.8 5523.6 1031 cm
+BI
+/CS /RGB
+/W 2
+/H 1
+/BPC 8
+ID \x8b\x8b\x8b\xfe\xfe\xfe
+EI Q
+/R413 gs
+        """
+    )
+    page = PageObject(pdf=None)
+    with mock.patch.object(page, "get_contents", return_value=stream):
+        images = page._get_inline_images()
+        assert list(images) == ["~0~"]
+        assert images["~0~"].data == (
+            b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x02\x00\x00\x00\x01\x08\x02\x00\x00\x00{@\xe8\xdd\x00\x00\x00\x0f"
+            b"IDATx\x9cc\xe8\xee\xee\xfe\xf7\xef\x1f\x00\x0e \x04\x9cpr_\x96\x00\x00\x00\x00IEND\xaeB`\x82"
+        )