Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: Issue in text extraction (spaces) (#1153) #2882

Merged
merged 62 commits into from
Oct 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
62 commits
Select commit Hold shift + click to select a range
5400f5a
BUG: Missing spaces in extract_text() method (#1328)
ssjkamei Sep 24, 2024
aac0436
Revert "BUG: Missing spaces in extract_text() method (#1328)"
ssjkamei Sep 24, 2024
64b1c92
BUG: Missing spaces in extract_text() method (#1328)
ssjkamei Sep 24, 2024
70e9b38
BUG: Missing spaces in extract_text() method (#1328) add test
ssjkamei Sep 24, 2024
65224e1
Revert "BUG: Missing spaces in extract_text() method (#1328)"
ssjkamei Sep 24, 2024
788d56d
Merge branch 'main' of https://github.com/ssjkamei/pypdf
ssjkamei Sep 24, 2024
f6dcb43
BUG: Missing spaces in extract_text() method (#1328) Convert font siz…
ssjkamei Sep 24, 2024
fd1c489
Correction to new file URL.
ssjkamei Sep 24, 2024
2873b9e
BUG: Missing spaces in extract_text() method (py-pdf#1328) calculatio…
ssjkamei Sep 24, 2024
7597704
BUG: Missing spaces in extract_text() method (py-pdf#1328) Simplify t…
ssjkamei Sep 24, 2024
4a2afe9
Merge branch 'py-pdf:main' into main
ssjkamei Sep 28, 2024
fb4de41
BUG: Issue in text extraction (spaces) (#1153)
ssjkamei Sep 28, 2024
373eaec
BUG: Issue in text extraction (spaces) (#1153) add test
ssjkamei Sep 28, 2024
066f594
style: Correcting code style issues
ssjkamei Sep 28, 2024
d406e23
Text position return support
ssjkamei Sep 28, 2024
d338e18
Add code for CIDFont
ssjkamei Sep 28, 2024
f7c4236
Added horizontal CIDFont calculation code
ssjkamei Sep 28, 2024
a32fbc9
Style: Correcting code style issues
ssjkamei Sep 28, 2024
a237f2d
Integrate font width calculation and space width calculation
ssjkamei Sep 29, 2024
e159e4d
Font width map and space width acquisition process separation
ssjkamei Sep 29, 2024
a19a8f4
Revert to original adjustment space width
ssjkamei Sep 29, 2024
6dbda50
Supports diagonal travel distance
ssjkamei Sep 29, 2024
34efe52
Font size defaults to twice the space
ssjkamei Sep 29, 2024
52aa7ac
Get the default space width from the argument
ssjkamei Sep 29, 2024
7a028bb
fix self-made bugs
ssjkamei Sep 29, 2024
f02fa23
Style: Correcting code style issues
ssjkamei Sep 29, 2024
980d831
Style: Correcting code style issues
ssjkamei Sep 29, 2024
5e6a0dd
fix self-made bugs
ssjkamei Sep 29, 2024
8078ac1
Style: Correcting code style issues
ssjkamei Sep 29, 2024
b842cee
Compliant with PDF1.7 specifications
ssjkamei Sep 30, 2024
d1c54db
fix self-made bugs
ssjkamei Sep 30, 2024
328d22b
font_map efficiency
ssjkamei Sep 30, 2024
e392416
fix self-made bugs
ssjkamei Sep 30, 2024
9e6d2ce
fix self-made bugs
ssjkamei Sep 30, 2024
1fe5285
style: Correcting code style issues
ssjkamei Sep 30, 2024
2110372
Merge branch 'py-pdf:main' into main
ssjkamei Oct 1, 2024
0292b13
Style: Correcting code style issues
ssjkamei Oct 1, 2024
444bef8
BUG: Changed timing of font size calculation to before font switching
ssjkamei Oct 1, 2024
ce36f48
STY: Correcting code style issues
ssjkamei Oct 1, 2024
68862dc
BUG: Modify space calculation results to match original code
ssjkamei Oct 1, 2024
4bcfac3
BUG: If there is no default value for font, set to argument
ssjkamei Oct 1, 2024
6d7f75e
BUG: Mistakes in Multiplication
ssjkamei Oct 1, 2024
3e79f20
Update pypdf/_page.py
ssjkamei Oct 1, 2024
e33b65f
Update pypdf/_text_extraction/__init__.py
ssjkamei Oct 1, 2024
bef7862
Typo
ssjkamei Oct 1, 2024
2b0e530
Modifying a comment
ssjkamei Oct 1, 2024
cb5bf4a
More detailed error codes
ssjkamei Oct 1, 2024
c8ce234
Allow list conversion of /Widths other than ArrayObject
ssjkamei Oct 1, 2024
fd82bde
Exception code omitted
ssjkamei Oct 1, 2024
d79da5b
Explicit description of type
ssjkamei Oct 1, 2024
98ccb3a
style: Correcting code style issues
ssjkamei Oct 1, 2024
b13b97f
Convert character map keys from int(ord) to str
ssjkamei Oct 1, 2024
ef73315
Style: Correcting code style issues
ssjkamei Oct 1, 2024
f884160
Update pypdf/_cmap.py
ssjkamei Oct 1, 2024
20a6883
Update pypdf/_text_extraction/__init__.py
ssjkamei Oct 1, 2024
e6132fa
Exception code omitted
ssjkamei Oct 2, 2024
9a82eb8
Style: Correcting code style issues
ssjkamei Oct 2, 2024
d4f1835
Style: Correcting code style issues
ssjkamei Oct 2, 2024
96fcf7c
fix self-made bugs
ssjkamei Oct 2, 2024
780a632
fix self-made bugs
ssjkamei Oct 2, 2024
ce11d0d
Insufficient height consideration for front and rear fonts
ssjkamei Oct 2, 2024
03eb1cb
style: Correcting code style issues
ssjkamei Oct 2, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
155 changes: 100 additions & 55 deletions pypdf/_cmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,28 +75,28 @@ def build_char_map_from_dict(
for x in int_entry:
if x <= 255:
encoding[x] = chr(x)
try:
# override space_width with new params
space_width = _default_fonts_space_width[cast(str, ft["/BaseFont"])]
except Exception:
pass
# I consider the space_code is available on one byte
if isinstance(space_code, str):
try: # one byte
sp = space_code.encode("charmap")[0]
except Exception:
sp = space_code.encode("utf-16-be")
sp = sp[0] + 256 * sp[1]
try:
sp = ord(map_dict[chr(sp)])
except KeyError:
pass
else:
sp = space_code
sp_width = compute_space_width(ft, sp, space_width)
font_width_map = build_font_width_map(ft, map_dict, space_width * 2.0)
half_space_width = compute_space_width(font_width_map, chr(sp)) / 2.0

return (
font_type,
float(sp_width / 2),
half_space_width,
encoding,
# https://github.com/python/mypy/issues/4374
map_dict,
map_dict
)


Expand Down Expand Up @@ -402,78 +402,123 @@ def parse_bfchar(line: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) ->
lst = lst[2:]


def compute_space_width(
ft: DictionaryObject, space_code: int, space_width: float
) -> float:
sp_width: float = space_width * 2.0 # default value
w = []
w1 = {}
def build_font_width_map(
ft: Union[DictionaryObject, None], map_dict: Dict[Any, Any], default_font_width: float
) -> Dict[Any, float]:
font_width_map: Dict[Any, float] = {}
st: int = 0
en: int = 0
if ft is None:
font_width_map["default"] = default_font_width
return font_width_map
try:
default_font_width = _default_fonts_space_width[cast(str, ft["/BaseFont"])] * 2.0
except Exception:
pass
if "/DescendantFonts" in ft: # ft["/Subtype"].startswith("/CIDFontType"):
# §9.7.4.3 of the 1.7 reference ("Glyph Metrics in CIDFonts")
# Widths for a CIDFont are defined using the DW and W entries.
# DW2 and W2 are for vertical use. Vertical type is not implemented.
ft1 = ft["/DescendantFonts"][0].get_object() # type: ignore
try:
w1[-1] = cast(float, ft1["/DW"])
font_width_map["default"] = cast(float, ft1["/DW"])
except Exception:
w1[-1] = 1000.0
font_width_map["default"] = default_font_width
if "/W" in ft1:
w = list(ft1["/W"])
w = ft1["/W"].get_object()
else:
w = []
while len(w) > 0:
st = w[0] if isinstance(w[0], int) else w[0].get_object()
second = w[1].get_object()
if isinstance(second, int):
for x in range(st, second):
w1[x] = w[2]
# C_first C_last same_W
en = second
for c_code in range(st, en + 1):
try:
conversion_char = map_dict[chr(c_code)]
font_width_map[conversion_char] = w[2]
except KeyError:
pass
w = w[3:]
elif isinstance(second, list):
for y in second:
w1[st] = y
st += 1
# Starting_C [W1 W2 ... Wn]
c_code = st
for width in second:
try:
conversion_char = map_dict[chr(c_code)]
font_width_map[conversion_char] = width
except KeyError:
pass
c_code += 1
w = w[2:]
else:
logger_warning(
"unknown widths : \n" + (ft1["/W"]).__repr__(),
__name__,
)
break
try:
sp_width = w1[space_code]
except Exception:
sp_width = (
w1[-1] / 2.0
) # if using default we consider space will be only half size
elif "/Widths" in ft:
w = list(ft["/Widths"]) # type: ignore
try:
st = cast(int, ft["/FirstChar"])
en: int = cast(int, ft["/LastChar"])
if st > space_code or en < space_code:
raise Exception("Not in range")
if w[space_code - st].get_object() == 0:
raise Exception("null width")
sp_width = w[space_code - st].get_object()
except Exception:
if "/FontDescriptor" in ft and "/MissingWidth" in cast(
DictionaryObject, ft["/FontDescriptor"]
):
sp_width = ft["/FontDescriptor"]["/MissingWidth"].get_object() # type: ignore
else:
# will consider width of char as avg(width)/2
m = 0
cpt = 0
for xx in w:
xx = xx.get_object()
if xx > 0:
m += xx
cpt += 1
sp_width = m / max(1, cpt) / 2

if is_null_or_none(sp_width):
sp_width = 0.0
w = ft["/Widths"].get_object()
if "/FontDescriptor" in ft and "/MissingWidth" in cast(
DictionaryObject, ft["/FontDescriptor"]
):
font_width_map["default"] = ft["/FontDescriptor"]["/MissingWidth"].get_object() # type: ignore
else:
# will consider width of char as avg(width)
m = 0
cpt = 0
for xx in w:
xx = xx.get_object()
if xx > 0:
m += xx
cpt += 1
font_width_map["default"] = m / max(1, cpt)
st = cast(int, ft["/FirstChar"])
en = cast(int, ft["/LastChar"])
for c_code in range(st, en + 1):
try:
width = w[c_code - st].get_object()
font_width_map[chr(c_code)] = width
except (IndexError, KeyError):
# The PDF structure is invalid. The array is too small
# for the specified font width.
pass
if is_null_or_none(font_width_map.get("default")):
font_width_map["default"] = default_font_width if default_font_width else 0.0
return font_width_map


def compute_space_width(
    font_width_map: Dict[Any, float], space_char: str
) -> float:
    """
    Return the width to use for the space character of a font.

    Args:
        font_width_map: Mapping of character to glyph width. It must contain
            a ``"default"`` entry, which is used as the fallback.
        space_char: The character that represents a space in this font.

    Returns:
        The width stored for ``space_char``. If the map holds no usable
        entry for it (missing, zero, or not a number), half of the
        ``"default"`` width is returned instead.

    Raises:
        KeyError: If the fallback is needed and ``"default"`` is missing.
    """
    sp_width = font_width_map.get(space_char)
    # A null / non-numeric entry cannot be used in the arithmetic done by
    # callers, so it is treated exactly like a missing or zero-width one.
    if not isinstance(sp_width, (int, float)) or sp_width == 0:
        # if using default we consider space will be only half size
        return font_width_map["default"] / 2.0
    return sp_width


def compute_font_width(
    font_width_map: Dict[Any, float],
    char: str
) -> float:
    """
    Return the glyph width of a single character.

    Args:
        font_width_map: Mapping of character to glyph width. It must contain
            a ``"default"`` entry, which is used as the fallback.
        char: The character to look up.

    Returns:
        The width stored for ``char``. If the map holds no numeric entry for
        it, the ``"default"`` width is returned. A stored width of zero is
        returned unchanged.

    Raises:
        KeyError: If the fallback is needed and ``"default"`` is missing.
    """
    char_width = font_width_map.get(char)
    # Callers add the result up; a null / non-numeric entry would break that
    # sum, so it is handled like a character without an entry.
    if not isinstance(char_width, (int, float)):
        return font_width_map["default"]
    return char_width


def type1_alternative(
ft: DictionaryObject,
map_dict: Dict[Any, Any],
Expand Down
45 changes: 39 additions & 6 deletions pypdf/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@
overload,
)

from ._cmap import build_char_map, unknown_char_map
from ._cmap import build_char_map, build_font_width_map, compute_font_width, unknown_char_map
from ._protocols import PdfCommonDocProtocol
from ._text_extraction import (
OrientationNotFoundError,
Expand Down Expand Up @@ -496,6 +496,7 @@ def __init__(
if not is_null_or_none(indirect_reference):
assert indirect_reference is not None, "mypy"
self.update(cast(DictionaryObject, indirect_reference.get_object()))
self._font_width_maps: Dict[str, Dict[str, float]] = {}

def hash_bin(self) -> int:
"""
Expand Down Expand Up @@ -1716,6 +1717,25 @@ def _debug_for_extract(self) -> str: # pragma: no cover
out += "No Font\n"
return out

def _get_acutual_font_widths(
    self,
    cmap: Tuple[
        Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
    ],
    add_text: str,
    font_size: float,
    default_space_width: float
) -> Tuple[float, float, float]:
    """
    Measure a piece of text with the width map of the given font.

    The width map is built once per font name and kept in
    ``self._font_width_maps`` so later calls for the same name reuse it.

    Args:
        cmap: Font information; index 1 is the character map, index 2 the
            font name and index 3 the font dictionary.
        add_text: The text to measure. An empty value measures as zero.
        font_size: Factor applied to the summed widths and the space width.
        default_space_width: Space width to report; twice this value is
            handed to ``build_font_width_map`` as the default font width.

    Returns:
        A tuple ``(text width * font_size, default_space_width * font_size,
        font_size)``.
    """
    name: str = cmap[2]
    width_map = self._font_width_maps.get(name)
    if width_map is None:
        # First time this font name is seen: build and remember its map.
        width_map = build_font_width_map(cmap[3], cmap[1], default_space_width * 2)
        self._font_width_maps[name] = width_map
    total_width = sum(
        compute_font_width(width_map, character) for character in (add_text or "")
    )
    scaled_text_width = total_width * font_size
    scaled_space_width = default_space_width * font_size
    return (scaled_text_width, scaled_space_width, font_size)

def _extract_text(
self,
obj: Any,
Expand Down Expand Up @@ -1793,19 +1813,25 @@ def _extract_text(
char_scale = 1.0
space_scale = 1.0
_space_width: float = 500.0 # will be set correctly at first Tf
_actual_str_size: Dict[str, float] = {
"str_widths": 0.0, "space_width": 0.0, "str_height": 0.0} # will be set to string length calculation result
TL = 0.0
font_size = 12.0 # init just in case of

def current_spacewidth() -> float:
return _space_width / 1000.0

def current_strwidths() -> float:
return _actual_str_size["str_widths"] / 1000.0

def process_operation(operator: bytes, operands: List[Any]) -> None:
nonlocal cm_matrix, cm_stack, tm_matrix, cm_prev, tm_prev, memo_cm, memo_tm
nonlocal char_scale, space_scale, _space_width, TL, font_size, cmap
nonlocal orientations, rtl_dir, visitor_text, output, text
nonlocal orientations, rtl_dir, visitor_text, output, text, _actual_str_size
global CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS

check_crlf_space: bool = False
str_widths: float = 0.0
# Table 5.4 page 405
if operator == b"BT":
tm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
Expand Down Expand Up @@ -1919,6 +1945,8 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
ty = float(operands[1])
tm_matrix[4] += tx * tm_matrix[0] + ty * tm_matrix[2]
tm_matrix[5] += tx * tm_matrix[1] + ty * tm_matrix[3]
str_widths = current_strwidths()
_actual_str_size["str_widths"] = 0.0
elif operator == b"Tm":
check_crlf_space = True
tm_matrix = [
Expand All @@ -1929,13 +1957,14 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
float(operands[4]),
float(operands[5]),
]
str_widths = current_strwidths()
_actual_str_size["str_widths"] = 0.0
elif operator == b"T*":
check_crlf_space = True
tm_matrix[5] -= TL

elif operator == b"Tj":
check_crlf_space = True
text, rtl_dir = handle_tj(
text, rtl_dir, add_text = handle_tj(
text,
operands,
cm_matrix,
Expand All @@ -1947,6 +1976,9 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
rtl_dir,
visitor_text,
)
current_font_widths, _actual_str_size["space_width"], _actual_str_size["str_height"] = (
self._get_acutual_font_widths(cmap, add_text, font_size, current_spacewidth()))
_actual_str_size["str_widths"] += current_font_widths
else:
return None
if check_crlf_space:
Expand All @@ -1961,7 +1993,9 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
output,
font_size,
visitor_text,
current_spacewidth(),
str_widths,
_actual_str_size["space_width"],
_actual_str_size["str_height"]
)
if text == "":
memo_cm = cm_matrix.copy()
Expand Down Expand Up @@ -2042,7 +2076,6 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
text = ""
memo_cm = cm_matrix.copy()
memo_tm = tm_matrix.copy()

else:
process_operation(operator, operands)
if visitor_operand_after is not None:
Expand Down
Loading
Loading