Skip to content

Commit 956fd03

Browse files
authored
ENH: Allow multiple charsets for NameObject.read_from_stream (#2585)
Closes #2323
1 parent 0f7c8fe commit 956fd03

File tree

2 files changed

+34
-14
lines changed

2 files changed

+34
-14
lines changed

pypdf/generic/_base.py

+14-4
Original file line number | Diff line number | Diff line change
@@ -615,7 +615,10 @@ def write_to_stream(
615615
def renumber(self) -> bytes:
616616
out = self[0].encode("utf-8")
617617
if out != b"/":
618-
deprecate_no_replacement(f"Incorrect first char in NameObject, should start with '/': ({self})", "6.0.0")
618+
deprecate_no_replacement(
619+
f"Incorrect first char in NameObject, should start with '/': ({self})",
620+
"6.0.0",
621+
)
619622
for c in self[1:]:
620623
if c > "~":
621624
for x in c.encode("utf-8"):
@@ -640,6 +643,8 @@ def unnumber(sin: bytes) -> bytes:
640643
i = i + 1
641644
return sin
642645

646+
CHARSETS = ("utf-8", "gbk", "latin1")
647+
643648
@staticmethod
644649
def read_from_stream(stream: StreamType, pdf: Any) -> "NameObject": # PdfReader
645650
name = stream.read(1)
@@ -650,7 +655,7 @@ def read_from_stream(stream: StreamType, pdf: Any) -> "NameObject": # PdfReader
650655
# Name objects should represent irregular characters
651656
# with a '#' followed by the symbol's hex number
652657
name = NameObject.unnumber(name)
653-
for enc in ("utf-8", "gbk"):
658+
for enc in NameObject.CHARSETS:
654659
try:
655660
ret = name.decode(enc)
656661
return NameObject(ret)
@@ -659,11 +664,16 @@ def read_from_stream(stream: StreamType, pdf: Any) -> "NameObject": # PdfReader
659664
raise UnicodeDecodeError("", name, 0, 0, "Code Not Found")
660665
except (UnicodeEncodeError, UnicodeDecodeError) as e:
661666
if not pdf.strict:
662-
logger_warning(f"Illegal character in Name Object ({name!r})", __name__)
667+
logger_warning(
668+
f"Illegal character in NameObject ({name!r}), "
669+
"you may need to adjust NameObject.CHARSETS",
670+
__name__,
671+
)
663672
return NameObject(name.decode("charmap"))
664673
else:
665674
raise PdfReadError(
666-
f"Illegal character in Name Object ({name!r})"
675+
f"Illegal character in NameObject ({name!r}). "
676+
"You may need to adjust NameObject.CHARSETS.",
667677
) from e
668678

669679

tests/test_generic.py

+20-10
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
"""Test the pypdf.generic module."""
22

3+
from copy import deepcopy
34
from io import BytesIO
45
from pathlib import Path
56
from unittest.mock import patch
@@ -212,6 +213,11 @@ def test_name_object(caplog):
212213
)
213214
) == "/你好世界"
214215

216+
# to test latin-1 aka stdencoding
217+
assert (
218+
NameObject.read_from_stream(BytesIO(b"/DocuSign\xae"), None)
219+
) == "/DocuSign®"
220+
215221
# test write
216222
b = BytesIO()
217223
NameObject("/hello").write_to_stream(b)
@@ -1036,16 +1042,20 @@ def test_checkboxradiobuttonattributes_opt():
10361042

10371043

10381044
def test_name_object_invalid_decode():
1039-
stream = BytesIO(b"/\x80\x02\x03")
1040-
1041-
# strict:
1042-
with pytest.raises(PdfReadError) as exc:
1043-
NameObject.read_from_stream(stream, ReaderDummy(strict=True))
1044-
assert "Illegal character in Name Object" in exc.value.args[0]
1045-
1046-
# non-strict:
1047-
stream.seek(0)
1048-
NameObject.read_from_stream(stream, ReaderDummy(strict=False))
1045+
charsets = deepcopy(NameObject.CHARSETS)
1046+
try:
1047+
NameObject.CHARSETS = ("utf-8",)
1048+
stream = BytesIO(b"/\x80\x02\x03")
1049+
# strict:
1050+
with pytest.raises(PdfReadError) as exc:
1051+
NameObject.read_from_stream(stream, ReaderDummy(strict=True))
1052+
assert "Illegal character in NameObject " in exc.value.args[0]
1053+
1054+
# non-strict:
1055+
stream.seek(0)
1056+
NameObject.read_from_stream(stream, ReaderDummy(strict=False))
1057+
finally:
1058+
NameObject.CHARSETS = charsets
10491059

10501060

10511061
def test_indirect_object_invalid_read():

0 commit comments

Comments
 (0)