Skip to content

Commit bc29901

Browse files
ENH: Add support for /Kids in page labels (#2562)
* ENH: Add support for /Kids in page labels --------- Co-authored-by: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
1 parent 4bdca16 commit bc29901

File tree

2 files changed

+119
-44
lines changed

2 files changed

+119
-44
lines changed

pypdf/_page_labels.py

+68-44
Original file line numberDiff line numberDiff line change
@@ -57,11 +57,11 @@
5757
aa to zz for the next 26, and so on)
5858
"""
5959

60-
from typing import Iterator, Optional, Tuple, cast
60+
from typing import Iterator, List, Optional, Tuple, cast
6161

6262
from ._protocols import PdfCommonDocProtocol
6363
from ._utils import logger_warning
64-
from .generic import ArrayObject, DictionaryObject, NumberObject
64+
from .generic import ArrayObject, DictionaryObject, NullObject, NumberObject
6565

6666

6767
def number2uppercase_roman_numeral(num: int) -> str:
@@ -116,6 +116,42 @@ def number2lowercase_letter(number: int) -> str:
116116
return number2uppercase_letter(number).lower()
117117

118118

119+
def get_label_from_nums(dictionary_object: DictionaryObject, index: int) -> str:
    """
    Return the page label for a page index using a ``/Nums`` array.

    [Nums] shall be an array of the form
    [ key 1 value 1 key 2 value 2 ... key n value n ]
    where each key_i is an integer and the corresponding
    value_i shall be the object associated with that key.
    The keys shall be sorted in numerical order,
    analogously to the arrangement of keys in a name tree
    as described in 7.9.6, "Name Trees."

    Args:
        dictionary_object: Dictionary holding the ``/Nums`` array to search.
        index: Zero-based page index.

    Returns:
        The ``/P`` prefix followed by the number rendered in the ``/S`` style.
        ``str(index + 1)`` is returned as a fallback when ``/Nums`` is empty,
        when the selected value is not a dictionary, or when ``/S`` names a
        style that is not known here.
    """
    nums = cast(ArrayObject, dictionary_object["/Nums"])
    fallback = str(index + 1)
    value = None
    start_index = 0
    i = 0
    # Requiring ``i + 1 < len(nums)`` skips a dangling key without a value
    # (odd-length array) instead of raising an IndexError.
    while i + 1 < len(nums):
        start_index = nums[i]
        value = nums[i + 1].get_object()
        # Stop at the last complete pair, or as soon as the next range
        # starts behind the requested index.
        if i + 2 >= len(nums) or nums[i + 2] > index:
            break
        i += 2
    m = {
        None: lambda n: "",
        "/D": lambda n: str(n),
        "/R": number2uppercase_roman_numeral,
        "/r": number2lowercase_roman_numeral,
        "/A": number2uppercase_letter,
        "/a": number2lowercase_letter,
    }
    # if /Nums array is not following the specification or if /Nums is empty
    if not isinstance(value, dict):
        return fallback
    formatter = m.get(value.get("/S"))
    if formatter is None:
        # Unknown numbering style: use the same fallback as for other
        # malformed input rather than failing with a KeyError.
        return fallback
    start = value.get("/St", 1)
    prefix = value.get("/P", "")
    return prefix + formatter(index - start_index + start)
153+
154+
119155
def index2label(reader: PdfCommonDocProtocol, index: int) -> str:
120156
"""
121157
See 7.9.7 "Number Trees".
@@ -132,49 +168,37 @@ def index2label(reader: PdfCommonDocProtocol, index: int) -> str:
132168
return str(index + 1) # Fallback
133169
number_tree = cast(DictionaryObject, root["/PageLabels"].get_object())
134170
if "/Nums" in number_tree:
135-
# [Nums] shall be an array of the form
136-
# [ key 1 value 1 key 2 value 2 ... key n value n ]
137-
# where each key_i is an integer and the corresponding
138-
# value_i shall be the object associated with that key.
139-
# The keys shall be sorted in numerical order,
140-
# analogously to the arrangement of keys in a name tree
141-
# as described in 7.9.6, "Name Trees."
142-
nums = cast(ArrayObject, number_tree["/Nums"])
143-
i = 0
144-
value = None
145-
start_index = 0
146-
while i < len(nums):
147-
start_index = nums[i]
148-
value = nums[i + 1].get_object()
149-
if i + 2 == len(nums):
171+
return get_label_from_nums(number_tree, index)
172+
if "/Kids" in number_tree and not isinstance(number_tree["/Kids"], NullObject):
173+
# number_tree = {'/Kids': [IndirectObject(7333, 0, 140132998195856), ...]}
174+
# Limit maximum depth.
175+
level = 0
176+
while level < 100:
177+
kids = cast(List[DictionaryObject], number_tree["/Kids"])
178+
for kid in kids:
179+
# kid = {'/Limits': [0, 63], '/Nums': [0, {'/P': 'C1'}, ...]}
180+
limits = cast(List[int], kid["/Limits"])
181+
if limits[0] <= index <= limits[1]:
182+
if kid.get("/Kids", None) is not None:
183+
# Recursive definition.
184+
level += 1
185+
if level == 100: # pragma: no cover
186+
raise NotImplementedError("Too deep nesting is not supported.")
187+
number_tree = kid
188+
# Exit the inner `for` loop and continue at the next level with the
189+
# next iteration of the `while` loop.
190+
break
191+
return get_label_from_nums(kid, index)
192+
else:
193+
# When there are no kids, make sure to exit the `while` loop directly
194+
# and continue with the fallback.
150195
break
151-
if nums[i + 2] > index:
152-
break
153-
i += 2
154-
m = {
155-
None: lambda n: "",
156-
"/D": lambda n: str(n),
157-
"/R": number2uppercase_roman_numeral,
158-
"/r": number2lowercase_roman_numeral,
159-
"/A": number2uppercase_letter,
160-
"/a": number2lowercase_letter,
161-
}
162-
# if /Nums array is not following the specification or if /Nums is empty
163-
if not isinstance(value, dict):
164-
return str(index + 1) # Fallback
165-
start = value.get("/St", 1)
166-
prefix = value.get("/P", "")
167-
return prefix + m[value.get("/S")](index - start_index + start)
168-
if "/Kids" in number_tree or "/Limits" in number_tree:
169-
logger_warning(
170-
(
171-
"/Kids or /Limits found in PageLabels. "
172-
"This is not yet supported."
173-
),
174-
__name__,
175-
)
176-
# TODO: Implement /Kids and /Limits for number tree
177-
return str(index + 1) # Fallback if /Nums is not in the number_tree
196+
197+
logger_warning(
198+
f"Could not reliably determine page label for {index}.",
199+
__name__
200+
)
201+
return str(index + 1) # Fallback if neither /Nums nor /Kids is in the number_tree
178202

179203

180204
def nums_insert(

tests/test_page_labels.py

+51
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11
"""Test the pypdf._page_labels module."""
22
from io import BytesIO
3+
from pathlib import Path
34

45
import pytest
56

67
from pypdf import PdfReader
78
from pypdf._page_labels import (
9+
get_label_from_nums,
810
index2label,
911
number2lowercase_letter,
1012
number2lowercase_roman_numeral,
@@ -15,6 +17,7 @@
1517
nums_next,
1618
)
1719
from pypdf.generic import (
20+
ArrayObject,
1821
DictionaryObject,
1922
NameObject,
2023
NullObject,
@@ -23,6 +26,10 @@
2326

2427
from . import get_data_from_url
2528

29+
TESTS_ROOT = Path(__file__).parent.resolve()
30+
PROJECT_ROOT = TESTS_ROOT.parent
31+
RESOURCE_ROOT = PROJECT_ROOT / "resources"
32+
2633

2734
@pytest.mark.parametrize(
2835
("number", "expected"),
@@ -103,3 +110,47 @@ def test_index2label(caplog):
103110
r.trailer["/Root"]["/PageLabels"][NameObject("/Kids")] = NullObject()
104111
assert index2label(r, 1) == "2"
105112
assert caplog.text != ""
113+
114+
115+
@pytest.mark.enable_socket()
def test_index2label_kids():
    """Compare ``page_labels`` of a downloaded document with the expected list."""
    url = "https://www.bk.admin.ch/dam/bk/de/dokumente/terminologie/publikation_25_jahre_rtd.pdf.download.pdf/Terminologie_Epochen,%20Schwerpunkte,%20Umsetzungen.pdf"  # noqa: E501
    pdf_bytes = get_data_from_url(url=url, name="index2label_kids.pdf")
    reader = PdfReader(BytesIO(pdf_bytes))

    cover = ["C1"]
    roman_labels = [
        "I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX", "X",
        "XI", "XII", "XIII", "XIV", "XV", "XVI", "XVII",
    ]
    # Some page labels are unused. Filtering them out is still easier than
    # spelling out the whole list here.
    unused = {"20", "44", "58", "82", "94", "116", "154", "166", "192", "224", "250"}
    decimal_labels = [
        label for label in map(str, range(1, 284)) if label not in unused
    ]

    assert reader.page_labels == cover + roman_labels + decimal_labels
129+
130+
131+
@pytest.mark.enable_socket()
def test_index2label_kids__recursive(caplog):
    """Compare ``page_labels`` of a downloaded document and expect log output."""
    url = "https://github.com/py-pdf/pypdf/files/14842446/tt1.pdf"
    pdf_bytes = get_data_from_url(url=url, name="index2label_kids_recursive.pdf")
    reader = PdfReader(BytesIO(pdf_bytes))

    # Sixteen single-letter labels followed by three decimal ones.
    letter_labels = list("ABCDEFGHIJKLMNOP")
    decimal_labels = ["17", "18", "19"]

    assert reader.page_labels == letter_labels + decimal_labels
    # Something must have been logged while computing the labels.
    assert caplog.text != ""
141+
142+
143+
def test_get_label_from_nums__empty_nums_list():
    """An empty ``/Nums`` array yields the one-based page number as label."""
    node = DictionaryObject()
    node[NameObject("/Nums")] = ArrayObject()

    label = get_label_from_nums(node, 13)

    assert label == "14"
147+
148+
149+
def test_index2label__empty_kids_list():
    """An empty ``/Kids`` array yields the one-based page number as label."""
    reader = PdfReader(RESOURCE_ROOT / "crazyones.pdf")

    # Replace the document's page label tree with one that only has empty /Kids.
    empty_tree = DictionaryObject()
    empty_tree[NameObject("/Kids")] = ArrayObject()
    reader.root_object[NameObject("/PageLabels")] = empty_tree

    label = index2label(reader, 42)

    assert label == "43"

0 commit comments

Comments
 (0)