Skip to content

Commit fb1f5df

Browse files
authored
MAINT: Add root_object, _info and _ID to PdfReader (#2495)
Use common functionality between PdfReader and PdfWriter.
1 parent 6cf47c5 commit fb1f5df

File tree

5 files changed

+101
-63
lines changed

5 files changed

+101
-63
lines changed

pypdf/_page_labels.py

+7-7
Original file line numberDiff line numberDiff line change
@@ -11,12 +11,12 @@
1111
Example 1
1212
---------
1313
14-
>>> reader.trailer["/Root"]["/PageLabels"]["/Nums"]
14+
>>> reader.root_object["/PageLabels"]["/Nums"]
1515
[0, IndirectObject(18, 0, 139929798197504),
1616
8, IndirectObject(19, 0, 139929798197504)]
17-
>>> reader.get_object(reader.trailer["/Root"]["/PageLabels"]["/Nums"][1])
17+
>>> reader.get_object(reader.root_object["/PageLabels"]["/Nums"][1])
1818
{'/S': '/r'}
19-
>>> reader.get_object(reader.trailer["/Root"]["/PageLabels"]["/Nums"][3])
19+
>>> reader.get_object(reader.root_object["/PageLabels"]["/Nums"][3])
2020
{'/S': '/D'}
2121
2222
Example 2
@@ -57,7 +57,7 @@
5757
aa to zz for the next 26, and so on)
5858
"""
5959

60-
from typing import Iterator, Optional, Tuple
60+
from typing import Iterator, Optional, Tuple, cast
6161

6262
from ._protocols import PdfReaderProtocol
6363
from ._utils import logger_warning
@@ -127,10 +127,10 @@ def index2label(reader: PdfReaderProtocol, index: int) -> str:
127127
Returns:
128128
The label of the page, e.g. "iv" or "4".
129129
"""
130-
root = reader.trailer["/Root"]
130+
root = cast(DictionaryObject, reader.root_object)
131131
if "/PageLabels" not in root:
132132
return str(index + 1) # Fallback
133-
number_tree = root["/PageLabels"]
133+
number_tree = cast(DictionaryObject, root["/PageLabels"].get_object())
134134
if "/Nums" in number_tree:
135135
# [Nums] shall be an array of the form
136136
# [ key 1 value 1 key 2 value 2 ... key n value n ]
@@ -139,7 +139,7 @@ def index2label(reader: PdfReaderProtocol, index: int) -> str:
139139
# The keys shall be sorted in numerical order,
140140
# analogously to the arrangement of keys in a name tree
141141
# as described in 7.9.6, "Name Trees."
142-
nums = number_tree["/Nums"]
142+
nums = cast(ArrayObject, number_tree["/Nums"])
143143
i = 0
144144
value = None
145145
start_index = 0

pypdf/_protocols.py

+8
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,10 @@ def pages(self) -> List[Any]:
5959
def trailer(self) -> Dict[str, Any]:
6060
...
6161

62+
@property
63+
def root_object(self) -> PdfObjectProtocol:
64+
...
65+
6266
def get_object(self, indirect_reference: Any) -> Optional[PdfObjectProtocol]:
6367
...
6468

@@ -67,6 +71,10 @@ class PdfWriterProtocol(Protocol): # deprecated
6771
_objects: List[Any]
6872
_id_translated: Dict[int, Dict[int, int]]
6973

74+
@property
75+
def root_object(self) -> PdfObjectProtocol:
76+
...
77+
7078
def get_object(self, indirect_reference: Any) -> Optional[PdfObjectProtocol]:
7179
...
7280

pypdf/_reader.py

+47-24
Original file line numberDiff line numberDiff line change
@@ -282,9 +282,7 @@ class PdfReader:
282282
@property
283283
def viewer_preferences(self) -> Optional[ViewerPreferences]:
284284
"""Returns the existing ViewerPreferences as an overloaded dictionary."""
285-
o = cast(DictionaryObject, self.trailer["/Root"]).get(
286-
CD.VIEWER_PREFERENCES, None
287-
)
285+
o = self.root_object.get(CD.VIEWER_PREFERENCES, None)
288286
if o is None:
289287
return None
290288
o = o.get_object()
@@ -344,6 +342,33 @@ def __init__(
344342
elif password is not None:
345343
raise PdfReadError("Not encrypted file")
346344

345+
@property
346+
def root_object(self) -> DictionaryObject:
347+
"""Provide access to "/Root". standardized with PdfWriter."""
348+
return cast(DictionaryObject, self.trailer[TK.ROOT].get_object())
349+
350+
@property
351+
def _info(self) -> Optional[DictionaryObject]:
352+
"""
353+
Provide access to "/Info". Standardized with PdfWriter.
354+
355+
Returns:
356+
/Info dictionary; None if the entry does not exist
357+
"""
358+
info = self.trailer.get(TK.INFO, None)
359+
return None if info is None else cast(DictionaryObject, info.get_object())
360+
361+
@property
362+
def _ID(self) -> Optional[ArrayObject]:
363+
"""
364+
Provide access to "/ID". Standardized with PdfWriter.
365+
366+
Returns:
367+
/ID array; None if the entry does not exist
368+
"""
369+
id = self.trailer.get(TK.ID, None)
370+
return None if id is None else cast(ArrayObject, id.get_object())
371+
347372
def _repr_mimebundle_(
348373
self,
349374
include: Union[None, Iterable[str]] = None,
@@ -400,21 +425,20 @@ def metadata(self) -> Optional[DocumentInformation]:
400425
"""
401426
if TK.INFO not in self.trailer:
402427
return None
403-
obj = self.trailer[TK.INFO]
404428
retval = DocumentInformation()
405-
if isinstance(obj, type(None)):
429+
if isinstance(self._info, type(None)):
406430
raise PdfReadError(
407431
"trailer not found or does not point to document information directory"
408432
)
409-
retval.update(obj) # type: ignore
433+
retval.update(self._info) # type: ignore
410434
return retval
411435

412436
@property
413437
def xmp_metadata(self) -> Optional[XmpInformation]:
414438
"""XMP (Extensible Metadata Platform) data."""
415439
try:
416440
self._override_encryption = True
417-
return self.trailer[TK.ROOT].xmp_metadata # type: ignore
441+
return self.root_object.xmp_metadata # type: ignore
418442
finally:
419443
self._override_encryption = False
420444

@@ -433,7 +457,7 @@ def _get_num_pages(self) -> int:
433457
# the PDF file's page count is used in this case. Otherwise,
434458
# the original method (flattened page count) is used.
435459
if self.is_encrypted:
436-
return self.trailer[TK.ROOT]["/Pages"]["/Count"] # type: ignore
460+
return self.root_object["/Pages"]["/Count"] # type: ignore
437461
else:
438462
if self.flattened_pages is None:
439463
self._flatten()
@@ -493,7 +517,7 @@ def get_fields(
493517
field_attributes.update(CheckboxRadioButtonAttributes.attributes_dict())
494518
if retval is None:
495519
retval = {}
496-
catalog = cast(DictionaryObject, self.trailer[TK.ROOT])
520+
catalog = self.root_object
497521
# get the AcroForm tree
498522
if CD.ACRO_FORM in catalog:
499523
tree = cast(Optional[TreeObject], catalog[CD.ACRO_FORM])
@@ -755,7 +779,7 @@ def _get_named_destinations(
755779
"""
756780
if retval is None:
757781
retval = {}
758-
catalog = cast(DictionaryObject, self.trailer[TK.ROOT])
782+
catalog = self.root_object
759783

760784
# get the name tree
761785
if CA.DESTS in catalog:
@@ -822,7 +846,7 @@ def _get_outline(
822846
) -> OutlineType:
823847
if outline is None:
824848
outline = []
825-
catalog = cast(DictionaryObject, self.trailer[TK.ROOT])
849+
catalog = self.root_object
826850

827851
# get the outline dictionary and named destinations
828852
if CO.OUTLINES in catalog:
@@ -868,7 +892,7 @@ def threads(self) -> Optional[ArrayObject]:
868892
It's an array of dictionaries with "/F" and "/I" properties or
869893
None if there are no articles.
870894
"""
871-
catalog = cast(DictionaryObject, self.trailer[TK.ROOT])
895+
catalog = self.root_object
872896
if CO.THREADS in catalog:
873897
return cast("ArrayObject", catalog[CO.THREADS])
874898
else:
@@ -1071,9 +1095,8 @@ def page_layout(self) -> Optional[str]:
10711095
* - /TwoPageRight
10721096
- Show two pages at a time, odd-numbered pages on the right
10731097
"""
1074-
trailer = cast(DictionaryObject, self.trailer[TK.ROOT])
1075-
if CD.PAGE_LAYOUT in trailer:
1076-
return cast(NameObject, trailer[CD.PAGE_LAYOUT])
1098+
if CD.PAGE_LAYOUT in self.root_object:
1099+
return cast(NameObject, self.root_object[CD.PAGE_LAYOUT])
10771100
return None
10781101

10791102
@property
@@ -1098,7 +1121,7 @@ def page_mode(self) -> Optional[PagemodeType]:
10981121
- Show attachments panel
10991122
"""
11001123
try:
1101-
return self.trailer[TK.ROOT]["/PageMode"] # type: ignore
1124+
return self.root_object["/PageMode"] # type: ignore
11021125
except KeyError:
11031126
return None
11041127

@@ -1119,12 +1142,12 @@ def _flatten(
11191142
if pages is None:
11201143
# Fix issue 327: set flattened_pages attribute only for
11211144
# decrypted file
1122-
catalog = self.trailer[TK.ROOT].get_object()
1123-
pages = catalog["/Pages"].get_object() # type: ignore
1145+
catalog = self.root_object
1146+
pages = cast(DictionaryObject, catalog["/Pages"].get_object())
11241147
self.flattened_pages = []
11251148

11261149
if PA.TYPE in pages:
1127-
t = pages[PA.TYPE]
1150+
t = cast(str, pages[PA.TYPE])
11281151
# if pdf has no type, considered as a page if /Kids is missing
11291152
elif PA.KIDS not in pages:
11301153
t = "/Page"
@@ -1925,7 +1948,7 @@ def is_encrypted(self) -> bool:
19251948
def xfa(self) -> Optional[Dict[str, Any]]:
19261949
tree: Optional[TreeObject] = None
19271950
retval: Dict[str, Any] = {}
1928-
catalog = cast(DictionaryObject, self.trailer[TK.ROOT])
1951+
catalog = self.root_object
19291952

19301953
if "/AcroForm" not in catalog or not catalog["/AcroForm"]:
19311954
return None
@@ -1955,7 +1978,7 @@ def add_form_topname(self, name: str) -> Optional[DictionaryObject]:
19551978
Returns:
19561979
The created object. ``None`` means no object was created.
19571980
"""
1958-
catalog = cast(DictionaryObject, self.trailer[TK.ROOT])
1981+
catalog = self.root_object
19591982

19601983
if "/AcroForm" not in catalog or not isinstance(
19611984
catalog["/AcroForm"], DictionaryObject
@@ -1997,7 +2020,7 @@ def rename_form_topname(self, name: str) -> Optional[DictionaryObject]:
19972020
Returns:
19982021
The modified object. ``None`` means no object was modified.
19992022
"""
2000-
catalog = cast(DictionaryObject, self.trailer[TK.ROOT])
2023+
catalog = self.root_object
20012024

20022025
if "/AcroForm" not in catalog or not isinstance(
20032026
catalog["/AcroForm"], DictionaryObject
@@ -2030,7 +2053,7 @@ def _list_attachments(self) -> List[str]:
20302053
Returns:
20312054
list of filenames
20322055
"""
2033-
catalog = cast(DictionaryObject, self.trailer["/Root"])
2056+
catalog = self.root_object
20342057
# From the catalog get the embedded file names
20352058
try:
20362059
filenames = cast(
@@ -2068,7 +2091,7 @@ def _get_attachments(
20682091
dictionary of filename -> Union[bytestring or List[ByteString]]
20692092
if the filename exists multiple times, a list of the different versions will be provided
20702093
"""
2071-
catalog = cast(DictionaryObject, self.trailer["/Root"])
2094+
catalog = self.root_object
20722095
# From the catalog get the embedded file names
20732096
try:
20742097
filenames = cast(

pypdf/_writer.py

+17-7
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,16 @@ def __init__(
211211
self._encrypt_entry: Optional[DictionaryObject] = None
212212
self._ID: Union[ArrayObject, None] = None
213213

214+
@property
215+
def root_object(self) -> DictionaryObject:
216+
"""
217+
Provide direct access to the PDF structure.
218+
219+
Note:
220+
Recommended to be used only for read access.
221+
"""
222+
return self._root_object
223+
214224
def __enter__(self) -> "PdfWriter":
215225
"""Store that writer is initialized by 'with'."""
216226
self.with_as_usage = True
@@ -1084,7 +1094,7 @@ def clone_reader_document_root(self, reader: PdfReader) -> None:
10841094
reader: PdfReader from which the document root should be copied.
10851095
"""
10861096
self._objects.clear()
1087-
self._root_object = cast(DictionaryObject, reader.trailer[TK.ROOT].clone(self))
1097+
self._root_object = reader.root_object.clone(self)
10881098
self._root = self._root_object.indirect_reference # type: ignore[assignment]
10891099
self._pages = self._root_object.raw_get("/Pages")
10901100
self._flatten()
@@ -1165,10 +1175,10 @@ def clone_document_from_reader(
11651175
"""
11661176
self.clone_reader_document_root(reader)
11671177
if TK.INFO in reader.trailer:
1168-
self._info = reader.trailer[TK.INFO].clone(self).indirect_reference # type: ignore
1178+
self._info = reader._info.clone(self).indirect_reference # type: ignore
11691179
try:
1170-
self._ID = cast(ArrayObject, reader.trailer[TK.ID].clone(self))
1171-
except KeyError:
1180+
self._ID = cast(ArrayObject, reader._ID).clone(self)
1181+
except AttributeError:
11721182
pass
11731183
if callable(after_page_append):
11741184
for page in cast(
@@ -2546,7 +2556,7 @@ def merge(
25462556
else:
25472557
outline_item_typ = self.get_outline_root()
25482558

2549-
_ro = cast("DictionaryObject", reader.trailer[TK.ROOT])
2559+
_ro = reader.root_object
25502560
if import_outline and CO.OUTLINES in _ro:
25512561
outline = self._get_filtered_outline(
25522562
_ro.get(CO.OUTLINES, None), srcpages, reader
@@ -2569,7 +2579,7 @@ def merge(
25692579
self._root_object[NameObject("/AcroForm")] = self._add_object(
25702580
cast(
25712581
DictionaryObject,
2572-
cast(DictionaryObject, reader.trailer["/Root"])["/AcroForm"],
2582+
reader.root_object["/AcroForm"],
25732583
).clone(self, False, ("/Fields",))
25742584
)
25752585
arr = ArrayObject()
@@ -2580,7 +2590,7 @@ def merge(
25802590
)
25812591
trslat = self._id_translated[id(reader)]
25822592
try:
2583-
for f in reader.trailer["/Root"]["/AcroForm"]["/Fields"]: # type: ignore
2593+
for f in reader.root_object["/AcroForm"]["/Fields"]: # type: ignore
25842594
try:
25852595
ind = IndirectObject(trslat[f.idnum], 0, self)
25862596
if ind not in arr:

0 commit comments

Comments
 (0)