Skip to content

Commit abb62ac

Browse files
authored
MAINT: Unnecessary character mapping process (#2888)
This fixes a problem introduced by the change in #2882. The string length was checked after conversion by the cmap, but after cmap conversion a single character code can become more than one character, so the length cannot be measured accurately. This change is necessary, for example, when deciding whether to measure the distance from the ligature or from the base characters corresponding to the ligature while fixing #1351. The change to handle_tj was made because the code otherwise does not pass Ruff's check (error: "PLR0915 Too many statements (nnn > 176)"). The following code is only used to get the character code for a space; however, I think it would be better to split the code for obtaining the character code into its own part. Style changes will be considered in another PR.
1 parent e825ac0 commit abb62ac

File tree

3 files changed

+157
-109
lines changed

3 files changed

+157
-109
lines changed

pypdf/_cmap.py

+10-30
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ def build_char_map_from_dict(
5050
Font sub-type, space_width criteria(50% of width), encoding, map character-map.
5151
The font-dictionary itself is suitable for the curious.
5252
"""
53-
font_type: str = cast(str, ft["/Subtype"])
53+
font_type = cast(str, ft["/Subtype"].get_object())
5454

5555
space_code = 32
5656
encoding, space_code = parse_encoding(ft, space_code)
@@ -75,21 +75,12 @@ def build_char_map_from_dict(
7575
for x in int_entry:
7676
if x <= 255:
7777
encoding[x] = chr(x)
78-
# I consider the space_code is available on one byte
7978
if isinstance(space_code, str):
80-
try: # one byte
81-
sp = space_code.encode("charmap")[0]
82-
except Exception:
83-
sp = space_code.encode("utf-16-be")
84-
sp = sp[0] + 256 * sp[1]
85-
try:
86-
sp = ord(map_dict[chr(sp)])
87-
except KeyError:
88-
pass
89-
else:
9079
sp = space_code
91-
font_width_map = build_font_width_map(ft, map_dict, space_width * 2.0)
92-
half_space_width = compute_space_width(font_width_map, chr(sp)) / 2.0
80+
else:
81+
sp = chr(space_code)
82+
font_width_map = build_font_width_map(ft, space_width * 2.0)
83+
half_space_width = compute_space_width(font_width_map, sp) / 2.0
9384

9485
return (
9586
font_type,
@@ -403,17 +394,14 @@ def parse_bfchar(line: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) ->
403394

404395

405396
def build_font_width_map(
406-
ft: Union[DictionaryObject, None], map_dict: Dict[Any, Any], default_font_width: float
397+
ft: DictionaryObject, default_font_width: float
407398
) -> Dict[Any, float]:
408399
font_width_map: Dict[Any, float] = {}
409400
st: int = 0
410401
en: int = 0
411-
if ft is None:
412-
font_width_map["default"] = default_font_width
413-
return font_width_map
414402
try:
415-
default_font_width = _default_fonts_space_width[cast(str, ft["/BaseFont"])] * 2.0
416-
except Exception:
403+
default_font_width = _default_fonts_space_width[cast(str, ft["/BaseFont"].get_object)] * 2.0
404+
except KeyError:
417405
pass
418406
if "/DescendantFonts" in ft: # ft["/Subtype"].startswith("/CIDFontType"):
419407
# §9.7.4.3 of the 1.7 reference ("Glyph Metrics in CIDFonts")
@@ -435,21 +423,13 @@ def build_font_width_map(
435423
# C_first C_last same_W
436424
en = second
437425
for c_code in range(st, en + 1):
438-
try:
439-
conversion_char = map_dict[chr(c_code)]
440-
font_width_map[conversion_char] = w[2]
441-
except KeyError:
442-
pass
426+
font_width_map[chr(c_code)] = w[2]
443427
w = w[3:]
444428
elif isinstance(second, list):
445429
# Starting_C [W1 W2 ... Wn]
446430
c_code = st
447431
for width in second:
448-
try:
449-
conversion_char = map_dict[chr(c_code)]
450-
font_width_map[conversion_char] = width
451-
except KeyError:
452-
pass
432+
font_width_map[chr(c_code)] = width
453433
c_code += 1
454434
w = w[2:]
455435
else:

pypdf/_page.py

+86-23
Original file line numberDiff line numberDiff line change
@@ -49,13 +49,21 @@
4949
overload,
5050
)
5151

52-
from ._cmap import build_char_map, build_font_width_map, compute_font_width, unknown_char_map
52+
from ._cmap import (
53+
build_char_map,
54+
build_font_width_map,
55+
compute_font_width,
56+
parse_encoding,
57+
parse_to_unicode,
58+
unknown_char_map,
59+
)
5360
from ._protocols import PdfCommonDocProtocol
5461
from ._text_extraction import (
5562
OrientationNotFoundError,
5663
_layout_mode,
5764
crlf_space_check,
58-
handle_tj,
65+
get_display_str,
66+
get_text_operands,
5967
mult,
6068
)
6169
from ._utils import (
@@ -84,6 +92,7 @@
8492
PdfObject,
8593
RectangleObject,
8694
StreamObject,
95+
TextStringObject,
8796
is_null_or_none,
8897
)
8998

@@ -496,7 +505,7 @@ def __init__(
496505
if not is_null_or_none(indirect_reference):
497506
assert indirect_reference is not None, "mypy"
498507
self.update(cast(DictionaryObject, indirect_reference.get_object()))
499-
self._font_width_maps: Dict[str, Dict[str, float]] = {}
508+
self._font_width_maps: Dict[str, Tuple[Dict[str, float], str, float]] = {}
500509

501510
def hash_bin(self) -> int:
502511
"""
@@ -1722,19 +1731,78 @@ def _get_acutual_font_widths(
17221731
cmap: Tuple[
17231732
Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
17241733
],
1725-
add_text: str,
1734+
text_operands: str,
17261735
font_size: float,
1727-
default_space_width: float
1736+
space_width: float
17281737
) -> Tuple[float, float, float]:
17291738
font_widths: float = 0
17301739
font_name: str = cmap[2]
17311740
if font_name not in self._font_width_maps:
1732-
self._font_width_maps[font_name] = build_font_width_map(cmap[3], cmap[1], default_space_width * 2)
1733-
font_width_map: Dict[Any, float] = self._font_width_maps[font_name]
1734-
if add_text:
1735-
for char in add_text:
1741+
if cmap[3] is None:
1742+
font_width_map: Dict[Any, float] = {}
1743+
space_char = " "
1744+
actual_space_width: float = space_width
1745+
font_width_map["default"] = actual_space_width * 2
1746+
else:
1747+
space_code = 32
1748+
_, space_code = parse_encoding(cmap[3], space_code)
1749+
_, space_code, _ = parse_to_unicode(cmap[3], space_code)
1750+
if isinstance(space_code, str):
1751+
space_char = space_code
1752+
else:
1753+
space_char = chr(space_code)
1754+
font_width_map = build_font_width_map(cmap[3], space_width * 2)
1755+
actual_space_width = compute_font_width(font_width_map, space_char)
1756+
if actual_space_width == 0:
1757+
actual_space_width = space_width
1758+
self._font_width_maps[font_name] = (font_width_map, space_char, actual_space_width)
1759+
font_width_map = self._font_width_maps[font_name][0]
1760+
space_char = self._font_width_maps[font_name][1]
1761+
actual_space_width = self._font_width_maps[font_name][2]
1762+
1763+
if text_operands:
1764+
for char in text_operands:
1765+
if char == space_char:
1766+
font_widths += actual_space_width
1767+
continue
17361768
font_widths += compute_font_width(font_width_map, char)
1737-
return (font_widths * font_size, default_space_width * font_size, font_size)
1769+
return (font_widths * font_size, space_width * font_size, font_size)
1770+
1771+
def _handle_tj(
1772+
self,
1773+
text: str,
1774+
operands: List[Union[str, TextStringObject]],
1775+
cm_matrix: List[float],
1776+
tm_matrix: List[float],
1777+
cmap: Tuple[
1778+
Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
1779+
],
1780+
orientations: Tuple[int, ...],
1781+
font_size: float,
1782+
rtl_dir: bool,
1783+
visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
1784+
space_width: float,
1785+
actual_str_size: Dict[str, float]
1786+
) -> Tuple[str, bool, Dict[str, float]]:
1787+
text_operands, is_str_operands = get_text_operands(
1788+
operands, cm_matrix, tm_matrix, cmap, orientations)
1789+
if is_str_operands:
1790+
text += text_operands
1791+
else:
1792+
text, rtl_dir = get_display_str(
1793+
text,
1794+
cm_matrix,
1795+
tm_matrix, # text matrix
1796+
cmap,
1797+
text_operands,
1798+
font_size,
1799+
rtl_dir,
1800+
visitor_text)
1801+
font_widths, actual_str_size["space_width"], actual_str_size["str_height"] = (
1802+
self._get_acutual_font_widths(cmap, text_operands, font_size, space_width))
1803+
actual_str_size["str_widths"] += font_widths
1804+
1805+
return text, rtl_dir, actual_str_size
17381806

17391807
def _extract_text(
17401808
self,
@@ -1818,11 +1886,8 @@ def _extract_text(
18181886
TL = 0.0
18191887
font_size = 12.0 # init just in case of
18201888

1821-
def current_spacewidth() -> float:
1822-
return _space_width / 1000.0
1823-
1824-
def current_strwidths() -> float:
1825-
return _actual_str_size["str_widths"] / 1000.0
1889+
def compute_strwidths(str_widths: float) -> float:
1890+
return str_widths / 1000.0
18261891

18271892
def process_operation(operator: bytes, operands: List[Any]) -> None:
18281893
nonlocal cm_matrix, cm_stack, tm_matrix, cm_prev, tm_prev, memo_cm, memo_tm
@@ -1945,7 +2010,7 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
19452010
ty = float(operands[1])
19462011
tm_matrix[4] += tx * tm_matrix[0] + ty * tm_matrix[2]
19472012
tm_matrix[5] += tx * tm_matrix[1] + ty * tm_matrix[3]
1948-
str_widths = current_strwidths()
2013+
str_widths = compute_strwidths(_actual_str_size["str_widths"])
19492014
_actual_str_size["str_widths"] = 0.0
19502015
elif operator == b"Tm":
19512016
check_crlf_space = True
@@ -1957,28 +2022,26 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
19572022
float(operands[4]),
19582023
float(operands[5]),
19592024
]
1960-
str_widths = current_strwidths()
2025+
str_widths = compute_strwidths(_actual_str_size["str_widths"])
19612026
_actual_str_size["str_widths"] = 0.0
19622027
elif operator == b"T*":
19632028
check_crlf_space = True
19642029
tm_matrix[5] -= TL
19652030
elif operator == b"Tj":
19662031
check_crlf_space = True
1967-
text, rtl_dir, add_text = handle_tj(
2032+
text, rtl_dir, _actual_str_size = self._handle_tj(
19682033
text,
19692034
operands,
19702035
cm_matrix,
19712036
tm_matrix, # text matrix
19722037
cmap,
19732038
orientations,
1974-
output,
19752039
font_size,
19762040
rtl_dir,
19772041
visitor_text,
2042+
_space_width,
2043+
_actual_str_size,
19782044
)
1979-
current_font_widths, _actual_str_size["space_width"], _actual_str_size["str_height"] = (
1980-
self._get_acutual_font_widths(cmap, add_text, font_size, current_spacewidth()))
1981-
_actual_str_size["str_widths"] += current_font_widths
19822045
else:
19832046
return None
19842047
if check_crlf_space:
@@ -1994,7 +2057,7 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
19942057
font_size,
19952058
visitor_text,
19962059
str_widths,
1997-
_actual_str_size["space_width"],
2060+
compute_strwidths(_actual_str_size["space_width"]),
19982061
_actual_str_size["str_height"]
19992062
)
20002063
if text == "":

0 commit comments

Comments
 (0)