Skip to content

Commit be414c1

Browse files
BUG: Fix stream position on inline image fallback extraction (#3120)
The fix for inline image extraction in #3002 accidentally broke the fallback behavior by leaving the stream positioned before the EI marker. This went unnoticed because the tests were not strict enough, and it led to the error: "b'EI' operator met whereas not expected, please share usecase with pypdf dev team"
1 parent 55b0a89 commit be414c1

File tree

2 files changed

+37
-2
lines changed

2 files changed

+37
-2
lines changed

pypdf/generic/_data_structures.py

+10-1
Original file line numberDiff line numberDiff line change
@@ -1385,9 +1385,18 @@ def _read_inline_image(self, stream: StreamType) -> Dict[str, Any]:
13851385
ei = stream.read(3)
13861386
stream.seek(-1, 1)
13871387
if ei[0:2] != b"EI" or ei[2:3] not in WHITESPACES:
1388-
# Deal with wrong/missing `EI` tags.
1388+
# Deal with wrong/missing `EI` tags. Example: Wrong dimensions specified above.
13891389
stream.seek(savpos, 0)
13901390
data = extract_inline_default(stream)
1391+
ei = stream.read(3)
1392+
stream.seek(-1, 1)
1393+
if ei[0:2] != b"EI" or ei[2:3] not in WHITESPACES: # pragma: no cover
1394+
# Check the same condition again. This should never fail as
1395+
# edge cases are covered by `extract_inline_default` above,
1396+
# but check this to make sure that we are behind the `EI` afterwards.
1397+
raise PdfStreamError(
1398+
f"Could not extract inline image, even using fallback. Expected 'EI', got {ei!r}"
1399+
)
13911400
return {"settings": settings, "data": data}
13921401

13931402
# This overrides the parent method:

tests/test_images.py

+27-1
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,14 @@
88
from io import BytesIO
99
from pathlib import Path
1010
from typing import Union
11+
from unittest import mock
1112
from zipfile import ZipFile
1213

1314
import pytest
1415
from PIL import Image, ImageChops, ImageDraw
1516

1617
from pypdf import PageObject, PdfReader, PdfWriter
17-
from pypdf.generic import NameObject, NullObject
18+
from pypdf.generic import ContentStream, NameObject, NullObject
1819

1920
from . import get_data_from_url
2021

@@ -484,3 +485,28 @@ def test_no_filter_with_colorspace_as_list():
484485

485486
page = reader.pages[0]
486487
page.images.items()
488+
489+
490+
def test_contentstream__read_inline_image__fallback_is_successful():
491+
stream = ContentStream(stream=None, pdf=None)
492+
stream.set_data(
493+
b"""Q
494+
q 9.6 0 0 4.8 5523.6 1031 cm
495+
BI
496+
/CS /RGB
497+
/W 2
498+
/H 1
499+
/BPC 8
500+
ID \x8b\x8b\x8b\xfe\xfe\xfe
501+
EI Q
502+
/R413 gs
503+
"""
504+
)
505+
page = PageObject(pdf=None)
506+
with mock.patch.object(page, "get_contents", return_value=stream):
507+
images = page._get_inline_images()
508+
assert list(images) == ["~0~"]
509+
assert images["~0~"].data == (
510+
b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x02\x00\x00\x00\x01\x08\x02\x00\x00\x00{@\xe8\xdd\x00\x00\x00\x0f"
511+
b"IDATx\x9cc\xe8\xee\xee\xfe\xf7\xef\x1f\x00\x0e \x04\x9cpr_\x96\x00\x00\x00\x00IEND\xaeB`\x82"
512+
)

0 commit comments

Comments
 (0)