49
49
overload ,
50
50
)
51
51
52
- from ._cmap import build_char_map , build_font_width_map , compute_font_width , unknown_char_map
52
+ from ._cmap import (
53
+ build_char_map ,
54
+ build_font_width_map ,
55
+ compute_font_width ,
56
+ parse_encoding ,
57
+ parse_to_unicode ,
58
+ unknown_char_map ,
59
+ )
53
60
from ._protocols import PdfCommonDocProtocol
54
61
from ._text_extraction import (
55
62
OrientationNotFoundError ,
56
63
_layout_mode ,
57
64
crlf_space_check ,
58
- handle_tj ,
65
+ get_display_str ,
66
+ get_text_operands ,
59
67
mult ,
60
68
)
61
69
from ._utils import (
84
92
PdfObject ,
85
93
RectangleObject ,
86
94
StreamObject ,
95
+ TextStringObject ,
87
96
is_null_or_none ,
88
97
)
89
98
@@ -496,7 +505,7 @@ def __init__(
496
505
if not is_null_or_none (indirect_reference ):
497
506
assert indirect_reference is not None , "mypy"
498
507
self .update (cast (DictionaryObject , indirect_reference .get_object ()))
499
- self ._font_width_maps : Dict [str , Dict [str , float ]] = {}
508
+ self ._font_width_maps : Dict [str , Tuple [ Dict [str , float ], str , float ]] = {}
500
509
501
510
def hash_bin (self ) -> int :
502
511
"""
@@ -1722,19 +1731,78 @@ def _get_acutual_font_widths(
1722
1731
cmap : Tuple [
1723
1732
Union [str , Dict [int , str ]], Dict [str , str ], str , Optional [DictionaryObject ]
1724
1733
],
1725
- add_text : str ,
1734
+ text_operands : str ,
1726
1735
font_size : float ,
1727
- default_space_width : float
1736
+ space_width : float
1728
1737
) -> Tuple [float , float , float ]:
1729
1738
font_widths : float = 0
1730
1739
font_name : str = cmap [2 ]
1731
1740
if font_name not in self ._font_width_maps :
1732
- self ._font_width_maps [font_name ] = build_font_width_map (cmap [3 ], cmap [1 ], default_space_width * 2 )
1733
- font_width_map : Dict [Any , float ] = self ._font_width_maps [font_name ]
1734
- if add_text :
1735
- for char in add_text :
1741
+ if cmap [3 ] is None :
1742
+ font_width_map : Dict [Any , float ] = {}
1743
+ space_char = " "
1744
+ actual_space_width : float = space_width
1745
+ font_width_map ["default" ] = actual_space_width * 2
1746
+ else :
1747
+ space_code = 32
1748
+ _ , space_code = parse_encoding (cmap [3 ], space_code )
1749
+ _ , space_code , _ = parse_to_unicode (cmap [3 ], space_code )
1750
+ if isinstance (space_code , str ):
1751
+ space_char = space_code
1752
+ else :
1753
+ space_char = chr (space_code )
1754
+ font_width_map = build_font_width_map (cmap [3 ], space_width * 2 )
1755
+ actual_space_width = compute_font_width (font_width_map , space_char )
1756
+ if actual_space_width == 0 :
1757
+ actual_space_width = space_width
1758
+ self ._font_width_maps [font_name ] = (font_width_map , space_char , actual_space_width )
1759
+ font_width_map = self ._font_width_maps [font_name ][0 ]
1760
+ space_char = self ._font_width_maps [font_name ][1 ]
1761
+ actual_space_width = self ._font_width_maps [font_name ][2 ]
1762
+
1763
+ if text_operands :
1764
+ for char in text_operands :
1765
+ if char == space_char :
1766
+ font_widths += actual_space_width
1767
+ continue
1736
1768
font_widths += compute_font_width (font_width_map , char )
1737
- return (font_widths * font_size , default_space_width * font_size , font_size )
1769
+ return (font_widths * font_size , space_width * font_size , font_size )
1770
+
1771
+ def _handle_tj (
1772
+ self ,
1773
+ text : str ,
1774
+ operands : List [Union [str , TextStringObject ]],
1775
+ cm_matrix : List [float ],
1776
+ tm_matrix : List [float ],
1777
+ cmap : Tuple [
1778
+ Union [str , Dict [int , str ]], Dict [str , str ], str , Optional [DictionaryObject ]
1779
+ ],
1780
+ orientations : Tuple [int , ...],
1781
+ font_size : float ,
1782
+ rtl_dir : bool ,
1783
+ visitor_text : Optional [Callable [[Any , Any , Any , Any , Any ], None ]],
1784
+ space_width : float ,
1785
+ actual_str_size : Dict [str , float ]
1786
+ ) -> Tuple [str , bool , Dict [str , float ]]:
1787
+ text_operands , is_str_operands = get_text_operands (
1788
+ operands , cm_matrix , tm_matrix , cmap , orientations )
1789
+ if is_str_operands :
1790
+ text += text_operands
1791
+ else :
1792
+ text , rtl_dir = get_display_str (
1793
+ text ,
1794
+ cm_matrix ,
1795
+ tm_matrix , # text matrix
1796
+ cmap ,
1797
+ text_operands ,
1798
+ font_size ,
1799
+ rtl_dir ,
1800
+ visitor_text )
1801
+ font_widths , actual_str_size ["space_width" ], actual_str_size ["str_height" ] = (
1802
+ self ._get_acutual_font_widths (cmap , text_operands , font_size , space_width ))
1803
+ actual_str_size ["str_widths" ] += font_widths
1804
+
1805
+ return text , rtl_dir , actual_str_size
1738
1806
1739
1807
def _extract_text (
1740
1808
self ,
@@ -1818,11 +1886,8 @@ def _extract_text(
1818
1886
TL = 0.0
1819
1887
font_size = 12.0 # init just in case of
1820
1888
1821
- def current_spacewidth () -> float :
1822
- return _space_width / 1000.0
1823
-
1824
- def current_strwidths () -> float :
1825
- return _actual_str_size ["str_widths" ] / 1000.0
1889
+ def compute_strwidths (str_widths : float ) -> float :
1890
+ return str_widths / 1000.0
1826
1891
1827
1892
def process_operation (operator : bytes , operands : List [Any ]) -> None :
1828
1893
nonlocal cm_matrix , cm_stack , tm_matrix , cm_prev , tm_prev , memo_cm , memo_tm
@@ -1945,7 +2010,7 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
1945
2010
ty = float (operands [1 ])
1946
2011
tm_matrix [4 ] += tx * tm_matrix [0 ] + ty * tm_matrix [2 ]
1947
2012
tm_matrix [5 ] += tx * tm_matrix [1 ] + ty * tm_matrix [3 ]
1948
- str_widths = current_strwidths ( )
2013
+ str_widths = compute_strwidths ( _actual_str_size [ "str_widths" ] )
1949
2014
_actual_str_size ["str_widths" ] = 0.0
1950
2015
elif operator == b"Tm" :
1951
2016
check_crlf_space = True
@@ -1957,28 +2022,26 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
1957
2022
float (operands [4 ]),
1958
2023
float (operands [5 ]),
1959
2024
]
1960
- str_widths = current_strwidths ( )
2025
+ str_widths = compute_strwidths ( _actual_str_size [ "str_widths" ] )
1961
2026
_actual_str_size ["str_widths" ] = 0.0
1962
2027
elif operator == b"T*" :
1963
2028
check_crlf_space = True
1964
2029
tm_matrix [5 ] -= TL
1965
2030
elif operator == b"Tj" :
1966
2031
check_crlf_space = True
1967
- text , rtl_dir , add_text = handle_tj (
2032
+ text , rtl_dir , _actual_str_size = self . _handle_tj (
1968
2033
text ,
1969
2034
operands ,
1970
2035
cm_matrix ,
1971
2036
tm_matrix , # text matrix
1972
2037
cmap ,
1973
2038
orientations ,
1974
- output ,
1975
2039
font_size ,
1976
2040
rtl_dir ,
1977
2041
visitor_text ,
2042
+ _space_width ,
2043
+ _actual_str_size ,
1978
2044
)
1979
- current_font_widths , _actual_str_size ["space_width" ], _actual_str_size ["str_height" ] = (
1980
- self ._get_acutual_font_widths (cmap , add_text , font_size , current_spacewidth ()))
1981
- _actual_str_size ["str_widths" ] += current_font_widths
1982
2045
else :
1983
2046
return None
1984
2047
if check_crlf_space :
@@ -1994,7 +2057,7 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
1994
2057
font_size ,
1995
2058
visitor_text ,
1996
2059
str_widths ,
1997
- _actual_str_size ["space_width" ],
2060
+ compute_strwidths ( _actual_str_size ["space_width" ]) ,
1998
2061
_actual_str_size ["str_height" ]
1999
2062
)
2000
2063
if text == "" :
0 commit comments