Skip to content

Commit e35df5a

Browse files
authored Mar 29, 2024
FIX: Extract text in layout mode without finding resources (py-pdf#2555)
Closes py-pdf#2533
1 parent 253cde4 commit e35df5a

File tree

2 files changed

+32
-21
lines changed

2 files changed

+32
-21
lines changed
 

‎pypdf/_page.py

+23 -21
Original file line number | Diff line number | Diff line change
@@ -1891,28 +1891,30 @@ def _layout_mode_fonts(self) -> Dict[str, _layout_mode.Font]:
18911891
"""
18921892
# Font retrieval logic adapted from pypdf.PageObject._extract_text()
18931893
objr: Any = self
1894-
while NameObject(PG.RESOURCES) not in objr:
1895-
objr = objr["/Parent"].get_object()
1896-
resources_dict: Any = objr[PG.RESOURCES]
18971894
fonts: Dict[str, _layout_mode.Font] = {}
1898-
if "/Font" in resources_dict and self.pdf is not None:
1899-
for font_name in resources_dict["/Font"]:
1900-
*cmap, font_dict_obj = build_char_map(font_name, 200.0, self)
1901-
font_dict = {
1902-
k: self.pdf.get_object(v)
1903-
if isinstance(v, IndirectObject)
1904-
else [
1905-
self.pdf.get_object(_v)
1906-
if isinstance(_v, IndirectObject)
1907-
else _v
1908-
for _v in v
1909-
]
1910-
if isinstance(v, ArrayObject)
1911-
else v
1912-
for k, v in font_dict_obj.items()
1913-
}
1914-
# mypy really sucks at unpacking
1915-
fonts[font_name] = _layout_mode.Font(*cmap, font_dict) # type: ignore[call-arg,arg-type]
1895+
while objr is not None:
1896+
try:
1897+
resources_dict: Any = objr[PG.RESOURCES]
1898+
except KeyError:
1899+
resources_dict = {}
1900+
if "/Font" in resources_dict and self.pdf is not None:
1901+
for font_name in resources_dict["/Font"]:
1902+
*cmap, font_dict_obj = build_char_map(font_name, 200.0, self)
1903+
font_dict = {
1904+
k: v.get_object()
1905+
if isinstance(v, IndirectObject)
1906+
else [_v.get_object() for _v in v]
1907+
if isinstance(v, ArrayObject)
1908+
else v
1909+
for k, v in font_dict_obj.items()
1910+
}
1911+
# mypy really sucks at unpacking
1912+
fonts[font_name] = _layout_mode.Font(*cmap, font_dict) # type: ignore[call-arg,arg-type]
1913+
try:
1914+
objr = objr["/Parent"].get_object()
1915+
except KeyError:
1916+
objr = None
1917+
19161918
return fonts
19171919

19181920
def _layout_mode_text(

‎tests/test_workflows.py

+9
Original file line number | Diff line number | Diff line change
@@ -1272,3 +1272,12 @@ def test_get_page_showing_field():
12721272
writer._root_object["/AcroForm"]["/Fields"][-1]
12731273
)
12741274
] == []
1275+
1276+
1277+
@pytest.mark.enable_socket()
1278+
def test_extract_empty_page():
1279+
"""Cf #2533"""
1280+
url = "https://github.com/py-pdf/pypdf/files/14718318/test.pdf"
1281+
name = "iss2533.pdf"
1282+
reader = PdfReader(BytesIO(get_data_from_url(url, name)))
1283+
assert reader.pages[1].extract_text(extraction_mode="layout") == ""

0 commit comments

Comments
 (0)
Please sign in to comment.