From 5400f5ada933b7b4f54ba9eb7c9db53729ef551c Mon Sep 17 00:00:00 2001 From: Ryo Kamei Date: Tue, 24 Sep 2024 13:07:59 +0900 Subject: [PATCH 01/59] BUG: Missing spaces in extract_text() method (#1328) --- pypdf/_page.py | 2 +- tests/test_text_extraction.py | 17 +++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index e4ec053c8..8e9dbc21e 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -1989,7 +1989,7 @@ def process_operation(operator: bytes, operands: List[Any]) -> None: if isinstance(op, (str, bytes)): process_operation(b"Tj", [op]) if isinstance(op, (int, float, NumberObject, FloatObject)) and ( - (abs(float(op)) >= _space_width) + (math.ceil(abs(float(op))) >= _space_width) and (len(text) > 0) and (text[-1] != " ") ): diff --git a/tests/test_text_extraction.py b/tests/test_text_extraction.py index 2f0eaad1d..faef6d980 100644 --- a/tests/test_text_extraction.py +++ b/tests/test_text_extraction.py @@ -189,3 +189,20 @@ def test_layout_mode_warnings(mock_logger_warning): mock_logger_warning.assert_called_with( "Argument visitor_text is ignored in layout mode", "pypdf._page" ) + + +@pytest.mark.enable_socket() +def test_space_with_one_unit_smaller_than_font_width(): + """Tests for #1328""" + url = "https://github.com/py-pdf/PyPDF2/files/9498481/0004.pdf" + name = "iss1328.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + page = reader.pages[0] + extracted = page.extract_text().split("Description:")[1].split("8/11/22")[0].strip() + assert """Reporting crude oil leak. +Leak was isolated to well +pad. Segment of line was +immediately isolated, now +estimated at 5 barrels of oil +spilt. 
Root cause still +unknown at this time.""" == extracted From aac04364611818571fc24a53f36e325849f0371a Mon Sep 17 00:00:00 2001 From: Ryo Kamei Date: Tue, 24 Sep 2024 13:42:47 +0900 Subject: [PATCH 02/59] Revert "BUG: Missing spaces in extract_text() method (#1328)" This reverts commit 5400f5ada933b7b4f54ba9eb7c9db53729ef551c. --- pypdf/_page.py | 2 +- tests/test_text_extraction.py | 17 ----------------- 2 files changed, 1 insertion(+), 18 deletions(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index 8e9dbc21e..e4ec053c8 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -1989,7 +1989,7 @@ def process_operation(operator: bytes, operands: List[Any]) -> None: if isinstance(op, (str, bytes)): process_operation(b"Tj", [op]) if isinstance(op, (int, float, NumberObject, FloatObject)) and ( - (math.ceil(abs(float(op))) >= _space_width) + (abs(float(op)) >= _space_width) and (len(text) > 0) and (text[-1] != " ") ): diff --git a/tests/test_text_extraction.py b/tests/test_text_extraction.py index faef6d980..2f0eaad1d 100644 --- a/tests/test_text_extraction.py +++ b/tests/test_text_extraction.py @@ -189,20 +189,3 @@ def test_layout_mode_warnings(mock_logger_warning): mock_logger_warning.assert_called_with( "Argument visitor_text is ignored in layout mode", "pypdf._page" ) - - -@pytest.mark.enable_socket() -def test_space_with_one_unit_smaller_than_font_width(): - """Tests for #1328""" - url = "https://github.com/py-pdf/PyPDF2/files/9498481/0004.pdf" - name = "iss1328.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - page = reader.pages[0] - extracted = page.extract_text().split("Description:")[1].split("8/11/22")[0].strip() - assert """Reporting crude oil leak. -Leak was isolated to well -pad. Segment of line was -immediately isolated, now -estimated at 5 barrels of oil -spilt. 
Root cause still -unknown at this time.""" == extracted From 64b1c92abec2d72d90086c0a074c3712ff86249d Mon Sep 17 00:00:00 2001 From: Ryo Kamei Date: Tue, 24 Sep 2024 13:44:42 +0900 Subject: [PATCH 03/59] BUG: Missing spaces in extract_text() method (#1328) --- pypdf/_page.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index e4ec053c8..8e9dbc21e 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -1989,7 +1989,7 @@ def process_operation(operator: bytes, operands: List[Any]) -> None: if isinstance(op, (str, bytes)): process_operation(b"Tj", [op]) if isinstance(op, (int, float, NumberObject, FloatObject)) and ( - (abs(float(op)) >= _space_width) + (math.ceil(abs(float(op))) >= _space_width) and (len(text) > 0) and (text[-1] != " ") ): From 70e9b386a409b7dbe559fc1db4759eb866746d82 Mon Sep 17 00:00:00 2001 From: Ryo Kamei Date: Tue, 24 Sep 2024 13:48:01 +0900 Subject: [PATCH 04/59] BUG: Missing spaces in extract_text() method (#1328) add test --- tests/test_text_extraction.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tests/test_text_extraction.py b/tests/test_text_extraction.py index 2f0eaad1d..93082349a 100644 --- a/tests/test_text_extraction.py +++ b/tests/test_text_extraction.py @@ -189,3 +189,20 @@ def test_layout_mode_warnings(mock_logger_warning): mock_logger_warning.assert_called_with( "Argument visitor_text is ignored in layout mode", "pypdf._page" ) + + +@pytest.mark.enable_socket() +def test_space_with_one_unit_smaller_than_font_width(): + """Tests for #1328""" + url = "https://github.com/py-pdf/PyPDF2/files/9498481/0004.pdf" + name = "iss1328.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + page = reader.pages[0] + extracted = page.extract_text().split("Description:")[1].split("8/11/22")[0].strip() + assert extracted == """Reporting crude oil leak. +Leak was isolated to well +pad. 
Segment of line was +immediately isolated, now +estimated at 5 barrels of oil +spilt. Root cause still +unknown at this time.""" From 65224e1f2dc85da56beedddd25179f22ccacf6be Mon Sep 17 00:00:00 2001 From: Ryo Kamei Date: Tue, 24 Sep 2024 13:42:47 +0900 Subject: [PATCH 05/59] Revert "BUG: Missing spaces in extract_text() method (#1328)" This reverts commit 5400f5ada933b7b4f54ba9eb7c9db53729ef551c. BUG: Missing spaces in extract_text() method (#1328) BUG: Missing spaces in extract_text() method (#1328) add test --- tests/test_text_extraction.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_text_extraction.py b/tests/test_text_extraction.py index faef6d980..93082349a 100644 --- a/tests/test_text_extraction.py +++ b/tests/test_text_extraction.py @@ -199,10 +199,10 @@ def test_space_with_one_unit_smaller_than_font_width(): reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) page = reader.pages[0] extracted = page.extract_text().split("Description:")[1].split("8/11/22")[0].strip() - assert """Reporting crude oil leak. + assert extracted == """Reporting crude oil leak. Leak was isolated to well pad. Segment of line was immediately isolated, now estimated at 5 barrels of oil spilt. 
Root cause still -unknown at this time.""" == extracted +unknown at this time.""" From f6dcb439a3e667802558440d79e6b388307a6fed Mon Sep 17 00:00:00 2001 From: Ryo Kamei Date: Tue, 24 Sep 2024 14:26:45 +0900 Subject: [PATCH 06/59] BUG: Missing spaces in extract_text() method (#1328) Convert font size comparison to ratio --- pypdf/_page.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index 8e9dbc21e..ff4ef30c4 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -1988,8 +1988,9 @@ def process_operation(operator: bytes, operands: List[Any]) -> None: for op in operands[0]: if isinstance(op, (str, bytes)): process_operation(b"Tj", [op]) + # The space width may be smaller than the font width, so the width should be 95%. if isinstance(op, (int, float, NumberObject, FloatObject)) and ( - (math.ceil(abs(float(op))) >= _space_width) + (abs(float(op) / 0.95) >= _space_width) and (len(text) > 0) and (text[-1] != " ") ): From fd1c48930683bafe4f656b1fe742827a29d4feae Mon Sep 17 00:00:00 2001 From: Ryo Kamei <44630192+ssjkamei@users.noreply.github.com> Date: Tue, 24 Sep 2024 18:39:31 +0900 Subject: [PATCH 07/59] Correction to new file URL. 
Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com> --- tests/test_text_extraction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_text_extraction.py b/tests/test_text_extraction.py index 93082349a..08c4fc4f1 100644 --- a/tests/test_text_extraction.py +++ b/tests/test_text_extraction.py @@ -194,7 +194,7 @@ def test_layout_mode_warnings(mock_logger_warning): @pytest.mark.enable_socket() def test_space_with_one_unit_smaller_than_font_width(): """Tests for #1328""" - url = "https://github.com/py-pdf/PyPDF2/files/9498481/0004.pdf" + url = "https://github.com/py-pdf/pypdf/files/9498481/0004.pdf" name = "iss1328.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) page = reader.pages[0] From 2873b9eb9130a6529cc4e7003e37c3655bc3a7b6 Mon Sep 17 00:00:00 2001 From: Ryo Kamei Date: Tue, 24 Sep 2024 18:55:45 +0900 Subject: [PATCH 08/59] BUG: Missing spaces in extract_text() method (py-pdf#1328) calculation efficiency --- pypdf/_page.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index ff4ef30c4..87b914ce2 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -1985,12 +1985,13 @@ def process_operation(operator: bytes, operands: List[Any]) -> None: process_operation(b"TL", [-operands[1]]) process_operation(b"Td", operands) elif operator == b"TJ": + # The space width may be smaller than the font width, so the width should be 95%. + _confirm_space_width = _space_width * 0.95 for op in operands[0]: if isinstance(op, (str, bytes)): process_operation(b"Tj", [op]) - # The space width may be smaller than the font width, so the width should be 95%. 
if isinstance(op, (int, float, NumberObject, FloatObject)) and ( - (abs(float(op) / 0.95) >= _space_width) + (abs(float(op)) >= _confirm_space_width) and (len(text) > 0) and (text[-1] != " ") ): From 7597704eaa466934690853b370c850e2f6253e09 Mon Sep 17 00:00:00 2001 From: ryo kamei Date: Tue, 24 Sep 2024 21:14:02 +0900 Subject: [PATCH 09/59] BUG: Missing spaces in extract_text() method (py-pdf#1328) Simplify the assertion process --- tests/test_text_extraction.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/tests/test_text_extraction.py b/tests/test_text_extraction.py index 08c4fc4f1..8bfa1809e 100644 --- a/tests/test_text_extraction.py +++ b/tests/test_text_extraction.py @@ -198,11 +198,5 @@ def test_space_with_one_unit_smaller_than_font_width(): name = "iss1328.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) page = reader.pages[0] - extracted = page.extract_text().split("Description:")[1].split("8/11/22")[0].strip() - assert extracted == """Reporting crude oil leak. -Leak was isolated to well -pad. Segment of line was -immediately isolated, now -estimated at 5 barrels of oil -spilt. 
Root cause still -unknown at this time.""" + extracted = page.extract_text() + assert "Reporting crude oil leak.\n" in extracted From fb4de4105aa3d508e1d049aa2dce5b3b1a76461f Mon Sep 17 00:00:00 2001 From: Ryo Kamei Date: Sat, 28 Sep 2024 19:00:47 +0900 Subject: [PATCH 10/59] BUG: Issue in text extraction (spaces) (#1153) --- pypdf/_cmap.py | 27 ++++++++ pypdf/_page.py | 22 +++++- pypdf/_text_extraction/__init__.py | 108 +++++++++-------------------- 3 files changed, 77 insertions(+), 80 deletions(-) diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index 4cc112552..e4ad87a0f 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -474,6 +474,33 @@ def compute_space_width( return sp_width +def compute_font_width( + ft: DictionaryObject, char_code: int, font_width: float +) -> float: + if "/Widths" not in ft: + return font_width + + w = list(ft["/Widths"]) + try: + st = cast(int, ft["/FirstChar"]) + en: int = cast(int, ft["/LastChar"]) + if st > char_code or en < char_code: + raise Exception("Not in range") + if w[char_code - st].get_object() == 0: + raise Exception("null width") + char_width = w[char_code - st].get_object() + except Exception: + if "/FontDescriptor" in ft and "/MissingWidth" in cast( + DictionaryObject, ft["/FontDescriptor"] + ): + char_width = ft["/FontDescriptor"]["/MissingWidth"].get_object() # type: ignore + else: + return font_width + if is_null_or_none(char_width): + char_width = None + return char_width + + def type1_alternative( ft: DictionaryObject, map_dict: Dict[Any, Any], diff --git a/pypdf/_page.py b/pypdf/_page.py index 87b914ce2..e3cc74031 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -49,7 +49,7 @@ overload, ) -from ._cmap import build_char_map, unknown_char_map +from ._cmap import build_char_map, unknown_char_map, compute_font_width from ._protocols import PdfCommonDocProtocol from ._text_extraction import ( OrientationNotFoundError, @@ -1793,18 +1793,23 @@ def _extract_text( char_scale = 1.0 space_scale = 1.0 _space_width: float = 
500.0 # will be set correctly at first Tf + _font_widths: float = 0.0 TL = 0.0 font_size = 12.0 # init just in case of def current_spacewidth() -> float: return _space_width / 1000.0 + def current_fontwidths() -> float: + return _font_widths / 1000.0 + def process_operation(operator: bytes, operands: List[Any]) -> None: nonlocal cm_matrix, cm_stack, tm_matrix, cm_prev, tm_prev, memo_cm, memo_tm nonlocal char_scale, space_scale, _space_width, TL, font_size, cmap - nonlocal orientations, rtl_dir, visitor_text, output, text + nonlocal orientations, rtl_dir, visitor_text, output, text, _font_widths global CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS + add_text: str = "" check_crlf_space: bool = False # Table 5.4 page 405 if operator == b"BT": @@ -1935,7 +1940,7 @@ def process_operation(operator: bytes, operands: List[Any]) -> None: elif operator == b"Tj": check_crlf_space = True - text, rtl_dir = handle_tj( + text, rtl_dir, add_text = handle_tj( text, operands, cm_matrix, @@ -1947,6 +1952,16 @@ def process_operation(operator: bytes, operands: List[Any]) -> None: rtl_dir, visitor_text, ) + _font_widths = 0 + if add_text: + for char in add_text: + font_code = ord(char) + if cmap[3]: + font_width = compute_font_width(cmap[3], font_code, _space_width) + if font_width: + _font_widths = _font_widths + font_width + else: + _font_widths = current_spacewidth() else: return None if check_crlf_space: @@ -1962,6 +1977,7 @@ def process_operation(operator: bytes, operands: List[Any]) -> None: font_size, visitor_text, current_spacewidth(), + current_fontwidths() ) if text == "": memo_cm = cm_matrix.copy() diff --git a/pypdf/_text_extraction/__init__.py b/pypdf/_text_extraction/__init__.py index 3b1d687ea..75a0848ad 100644 --- a/pypdf/_text_extraction/__init__.py +++ b/pypdf/_text_extraction/__init__.py @@ -99,6 +99,7 @@ def crlf_space_check( font_size: float, visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]], spacewidth: float, + font_width: float ) -> 
Tuple[str, str, List[float], List[float]]: cm_prev = cmtm_prev[0] tm_prev = cmtm_prev[1] @@ -115,85 +116,34 @@ def crlf_space_check( k = math.sqrt(abs(m[0] * m[3]) + abs(m[1] * m[2])) f = font_size * k cm_prev = m + if orientation not in orientations: raise OrientationNotFoundError + if orientation in (0, 180): + moved_height: float = delta_y + moved_width: float = delta_x + elif orientation in (90, 270): + moved_height: float = delta_x + moved_width: float = delta_y try: - if orientation == 0: - if delta_y < -0.8 * f: - if (output + text)[-1] != "\n": - output += text + "\n" - if visitor_text is not None: - visitor_text( - text + "\n", - memo_cm, - memo_tm, - cmap[3], - font_size, - ) - text = "" - elif ( - abs(delta_y) < f * 0.3 - and abs(delta_x) > spacewidth * f * 15 - and (output + text)[-1] != " " - ): - text += " " - elif orientation == 180: - if delta_y > 0.8 * f: - if (output + text)[-1] != "\n": - output += text + "\n" - if visitor_text is not None: - visitor_text( - text + "\n", - memo_cm, - memo_tm, - cmap[3], - font_size, - ) - text = "" - elif ( - abs(delta_y) < f * 0.3 - and abs(delta_x) > spacewidth * f * 15 - and (output + text)[-1] != " " - ): - text += " " - elif orientation == 90: - if delta_x > 0.8 * f: - if (output + text)[-1] != "\n": - output += text + "\n" - if visitor_text is not None: - visitor_text( - text + "\n", - memo_cm, - memo_tm, - cmap[3], - font_size, - ) - text = "" - elif ( - abs(delta_x) < f * 0.3 - and abs(delta_y) > spacewidth * f * 15 - and (output + text)[-1] != " " - ): - text += " " - elif orientation == 270: - if delta_x < -0.8 * f: - if (output + text)[-1] != "\n": - output += text + "\n" - if visitor_text is not None: - visitor_text( - text + "\n", - memo_cm, - memo_tm, - cmap[3], - font_size, - ) + if abs(moved_height) > 0.8 * f: + if (output + text)[-1] != "\n": + output += text + "\n" + if visitor_text is not None: + visitor_text( + text + "\n", + memo_cm, + memo_tm, + cmap[3], + font_size, + ) text = "" - elif ( - 
abs(delta_x) < f * 0.3 - and abs(delta_y) > spacewidth * f * 15 - and (output + text)[-1] != " " - ): - text += " " + elif ( + abs(moved_height) < f * 0.3 + and abs(moved_width) > (spacewidth + font_width) * f + and (output + text)[-1] != " " + ): + text += " " except Exception: pass tm_prev = tm_matrix.copy() @@ -214,12 +164,14 @@ def handle_tj( font_size: float, rtl_dir: bool, visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]], -) -> Tuple[str, bool]: +) -> Tuple[str, bool, str]: + add_text = "" m = mult(tm_matrix, cm_matrix) orientation = orient(m) if orientation in orientations and len(operands) > 0: if isinstance(operands[0], str): text += operands[0] + add_text = operands[0] else: t: str = "" tt: bytes = ( @@ -272,6 +224,7 @@ def handle_tj( visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) text = "" text = x + text + add_text = x + add_text else: # left-to-right # print(">",xx,x,end="") if rtl_dir: @@ -281,5 +234,6 @@ def handle_tj( visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) text = "" text = text + x + add_text += x # fmt: on - return text, rtl_dir + return text, rtl_dir, add_text From 373eaec1cd013c43db3407372fa40743e5147839 Mon Sep 17 00:00:00 2001 From: Ryo Kamei Date: Sat, 28 Sep 2024 19:06:09 +0900 Subject: [PATCH 11/59] BUG: Issue in text extraction (spaces) (#1153) add test --- tests/test_text_extraction.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tests/test_text_extraction.py b/tests/test_text_extraction.py index 8bfa1809e..ff318f9fe 100644 --- a/tests/test_text_extraction.py +++ b/tests/test_text_extraction.py @@ -200,3 +200,14 @@ def test_space_with_one_unit_smaller_than_font_width(): page = reader.pages[0] extracted = page.extract_text() assert "Reporting crude oil leak.\n" in extracted + + +@pytest.mark.enable_socket() +def test_space_position_calculation(): + """Tests for #1153""" + url = "https://github.com/py-pdf/pypdf/files/9164743/file-0.pdf" + name = "iss1153.pdf" + reader = 
PdfReader(BytesIO(get_data_from_url(url, name=name))) + page = reader.pages[3] + extracted = page.extract_text() + assert "Shortly after the Geneva BOF session, the" in extracted From 066f594c23f754f25e65b1e09fd40f7c9315dac1 Mon Sep 17 00:00:00 2001 From: Ryo Kamei Date: Sat, 28 Sep 2024 20:01:26 +0900 Subject: [PATCH 12/59] style: Correcting code style issues --- pypdf/_page.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index e3cc74031..4b74790e5 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -49,7 +49,7 @@ overload, ) -from ._cmap import build_char_map, unknown_char_map, compute_font_width +from ._cmap import build_char_map, compute_font_width, unknown_char_map from ._protocols import PdfCommonDocProtocol from ._text_extraction import ( OrientationNotFoundError, @@ -1716,6 +1716,19 @@ def _debug_for_extract(self) -> str: # pragma: no cover out += "No Font\n" return out + def _get_font_widths(self, add_text: str, cmap: Tuple, default_width: float) -> float: + font_widths: float = 0 + if add_text: + for char in add_text: + font_code = ord(char) + if cmap[3]: + font_width = compute_font_width(cmap[3], font_code, default_width) + if font_width: + font_widths = font_widths + font_width + else: + font_widths = default_width + return font_widths + def _extract_text( self, obj: Any, @@ -1952,16 +1965,7 @@ def process_operation(operator: bytes, operands: List[Any]) -> None: rtl_dir, visitor_text, ) - _font_widths = 0 - if add_text: - for char in add_text: - font_code = ord(char) - if cmap[3]: - font_width = compute_font_width(cmap[3], font_code, _space_width) - if font_width: - _font_widths = _font_widths + font_width - else: - _font_widths = current_spacewidth() + _font_widths = self._get_font_widths(add_text, cmap, _space_width) else: return None if check_crlf_space: From d406e23e005b2dd0c48f7059d9fd55ef159ab6b9 Mon Sep 17 00:00:00 2001 From: Ryo Kamei Date: Sat, 28 Sep 2024 
21:18:43 +0900 Subject: [PATCH 13/59] Text position return support --- pypdf/_text_extraction/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/_text_extraction/__init__.py b/pypdf/_text_extraction/__init__.py index 75a0848ad..56137dbe1 100644 --- a/pypdf/_text_extraction/__init__.py +++ b/pypdf/_text_extraction/__init__.py @@ -140,7 +140,7 @@ def crlf_space_check( text = "" elif ( abs(moved_height) < f * 0.3 - and abs(moved_width) > (spacewidth + font_width) * f + and moved_width > (spacewidth + font_width) * f and (output + text)[-1] != " " ): text += " " From d338e18c743b629bc6ffd720807ec2c60d556fe5 Mon Sep 17 00:00:00 2001 From: Ryo Kamei Date: Sun, 29 Sep 2024 00:07:09 +0900 Subject: [PATCH 14/59] Add code for CIDFont --- pypdf/_cmap.py | 77 ++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 59 insertions(+), 18 deletions(-) diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index e4ad87a0f..cef105015 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -477,25 +477,66 @@ def compute_space_width( def compute_font_width( ft: DictionaryObject, char_code: int, font_width: float ) -> float: - if "/Widths" not in ft: - return font_width - - w = list(ft["/Widths"]) - try: - st = cast(int, ft["/FirstChar"]) - en: int = cast(int, ft["/LastChar"]) - if st > char_code or en < char_code: - raise Exception("Not in range") - if w[char_code - st].get_object() == 0: - raise Exception("null width") - char_width = w[char_code - st].get_object() - except Exception: - if "/FontDescriptor" in ft and "/MissingWidth" in cast( - DictionaryObject, ft["/FontDescriptor"] - ): - char_width = ft["/FontDescriptor"]["/MissingWidth"].get_object() # type: ignore + char_width: float = font_width * 2.0 # default value + w = [] + char_code_width = {} + st: int = 0 + # p271 PDF32000_2008 9.7.4.3 Glyph Metrics in CIDFonts + # Widths for a CIDFont are defined using the DW and W entries. + # DW2 and W2 are for vertical use. 
Vertical type is not implemented. + if "/DescendantFonts" in ft: # ft["/Subtype"].startswith("/CIDFontType"): + ft1 = ft["/DescendantFonts"][0].get_object() # type: ignore + try: + char_code_width["default"] = cast(float, ft1["/DW"]) + except Exception: + char_code_width["default"] = 1000.0 # Default font width is 0.1 + if "/W" in ft1: + # Starting C [W1 W2 ... Wn] + # C_first - C_last same W + w = list(ft1["/W"]) else: - return font_width + w = [] + while len(w) > 0: + st = w[0] if isinstance(w[0], int) else w[0].get_object() + second = w[1].get_object() + if isinstance(second, int): + for x in range(st, second): + char_code_width[x] = w[2] + w = w[3:] + elif isinstance(second, list): + for y in second: + char_code_width[st] = y + st += 1 + w = w[2:] + else: + logger_warning( + "unknown widths : \n" + (ft1["/W"]).__repr__(), + __name__, + ) + break + try: + char_width = char_code_width[char_code] + except Exception: + char_width = ( + char_code_width["default"] + ) + elif "/Widths" in ft: + w = list(ft["/Widths"]) + try: + st = cast(int, ft["/FirstChar"]) + en: int = cast(int, ft["/LastChar"]) + if st > char_code or en < char_code: + raise Exception("Not in range") + if w[char_code - st].get_object() == 0: + raise Exception("null width") + char_width = w[char_code - st].get_object() + except Exception: + if "/FontDescriptor" in ft and "/MissingWidth" in cast( + DictionaryObject, ft["/FontDescriptor"] + ): + char_width = ft["/FontDescriptor"]["/MissingWidth"].get_object() # type: ignore + else: + return font_width if is_null_or_none(char_width): char_width = None return char_width From f7c4236e034c744d4b77c4bd26ad0d3a01bf4241 Mon Sep 17 00:00:00 2001 From: ryo kamei Date: Sun, 29 Sep 2024 06:01:24 +0900 Subject: [PATCH 15/59] Added horizontal CIDFont calculation code --- pypdf/_cmap.py | 20 ++++++++++++++------ pypdf/_page.py | 4 +++- pypdf/_text_extraction/__init__.py | 1 + 3 files changed, 18 insertions(+), 7 deletions(-) diff --git a/pypdf/_cmap.py 
b/pypdf/_cmap.py index cef105015..6cb3680ef 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -475,8 +475,10 @@ def compute_space_width( def compute_font_width( - ft: DictionaryObject, char_code: int, font_width: float + cmap: Tuple, char_code: int, font_width: float ) -> float: + ft: DictionaryObject = cmap[3] + char_code_map: dict = cmap[1] char_width: float = font_width * 2.0 # default value w = [] char_code_width = {} @@ -491,8 +493,6 @@ def compute_font_width( except Exception: char_code_width["default"] = 1000.0 # Default font width is 0.1 if "/W" in ft1: - # Starting C [W1 W2 ... Wn] - # C_first - C_last same W w = list(ft1["/W"]) else: w = [] @@ -500,12 +500,20 @@ def compute_font_width( st = w[0] if isinstance(w[0], int) else w[0].get_object() second = w[1].get_object() if isinstance(second, int): - for x in range(st, second): - char_code_width[x] = w[2] + # C_first - C_last same W + for x in range(st, second + 1): + try: + char_code_width[ord(char_code_map[chr(x)])] = w[2] + except Exception: + char_code_width[x] = w[2] w = w[3:] elif isinstance(second, list): + # Starting C [W1 W2 ... 
Wn] for y in second: - char_code_width[st] = y + try: + char_code_width[ord(char_code_map[chr(st)])] = y + except Exception: + char_code_width[st] = y st += 1 w = w[2:] else: diff --git a/pypdf/_page.py b/pypdf/_page.py index 4b74790e5..19d17716c 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -1722,7 +1722,7 @@ def _get_font_widths(self, add_text: str, cmap: Tuple, default_width: float) -> for char in add_text: font_code = ord(char) if cmap[3]: - font_width = compute_font_width(cmap[3], font_code, default_width) + font_width = compute_font_width(cmap, font_code, default_width) if font_width: font_widths = font_widths + font_width else: @@ -1965,6 +1965,8 @@ def process_operation(operator: bytes, operands: List[Any]) -> None: rtl_dir, visitor_text, ) + if "San" in add_text: + pass _font_widths = self._get_font_widths(add_text, cmap, _space_width) else: return None diff --git a/pypdf/_text_extraction/__init__.py b/pypdf/_text_extraction/__init__.py index 56137dbe1..a4d0c3c70 100644 --- a/pypdf/_text_extraction/__init__.py +++ b/pypdf/_text_extraction/__init__.py @@ -211,6 +211,7 @@ def handle_tj( or xx in CUSTOM_RTL_SPECIAL_CHARS # customized.... 
): text = x + text if rtl_dir else text + x + add_text = x if rtl_dir else add_text + x elif ( # right-to-left characters set 0x0590 <= xx <= 0x08FF or 0xFB1D <= xx <= 0xFDFF From a32fbc9aaffca39056d2630e2540912d59b1036b Mon Sep 17 00:00:00 2001 From: ryo kamei Date: Sun, 29 Sep 2024 06:36:05 +0900 Subject: [PATCH 16/59] Style: Correcting code style issues --- pypdf/_cmap.py | 24 ++++++++++++++---------- pypdf/_page.py | 9 ++++++++- pypdf/_text_extraction/__init__.py | 4 ++-- 3 files changed, 24 insertions(+), 13 deletions(-) diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index 6cb3680ef..8b6aa544c 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -1,6 +1,6 @@ from binascii import unhexlify from math import ceil -from typing import Any, Dict, List, Tuple, Union, cast +from typing import Any, Dict, List, Optional, Tuple, Union, cast from ._codecs import adobe_glyphs, charset_encoding from ._utils import logger_error, logger_warning @@ -475,10 +475,14 @@ def compute_space_width( def compute_font_width( - cmap: Tuple, char_code: int, font_width: float + cmap: Tuple[ + Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject] + ], + char_code: int, + font_width: float ) -> float: - ft: DictionaryObject = cmap[3] - char_code_map: dict = cmap[1] + ft = cmap[3] + char_code_map = cmap[1] char_width: float = font_width * 2.0 # default value w = [] char_code_width = {} @@ -503,17 +507,17 @@ def compute_font_width( # C_first - C_last same W for x in range(st, second + 1): try: - char_code_width[ord(char_code_map[chr(x)])] = w[2] + char_code_width[str(ord(char_code_map[chr(x)]))] = w[2] except Exception: - char_code_width[x] = w[2] + char_code_width[str(x)] = w[2] w = w[3:] elif isinstance(second, list): # Starting C [W1 W2 ... 
Wn] for y in second: try: - char_code_width[ord(char_code_map[chr(st)])] = y + char_code_width[str(ord(char_code_map[chr(st)]))] = y except Exception: - char_code_width[st] = y + char_code_width[str(st)] = y st += 1 w = w[2:] else: @@ -523,7 +527,7 @@ def compute_font_width( ) break try: - char_width = char_code_width[char_code] + char_width = char_code_width[str(char_code)] except Exception: char_width = ( char_code_width["default"] @@ -546,7 +550,7 @@ def compute_font_width( else: return font_width if is_null_or_none(char_width): - char_width = None + char_width = char_code_width["default"] return char_width diff --git a/pypdf/_page.py b/pypdf/_page.py index 19d17716c..53196a14f 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -1716,7 +1716,14 @@ def _debug_for_extract(self) -> str: # pragma: no cover out += "No Font\n" return out - def _get_font_widths(self, add_text: str, cmap: Tuple, default_width: float) -> float: + def _get_font_widths( + self, + add_text: str, + cmap: Tuple[ + Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject] + ], + default_width: float + ) -> float: font_widths: float = 0 if add_text: for char in add_text: diff --git a/pypdf/_text_extraction/__init__.py b/pypdf/_text_extraction/__init__.py index a4d0c3c70..c29aa4c1d 100644 --- a/pypdf/_text_extraction/__init__.py +++ b/pypdf/_text_extraction/__init__.py @@ -123,8 +123,8 @@ def crlf_space_check( moved_height: float = delta_y moved_width: float = delta_x elif orientation in (90, 270): - moved_height: float = delta_x - moved_width: float = delta_y + moved_height = delta_x + moved_width = delta_y try: if abs(moved_height) > 0.8 * f: if (output + text)[-1] != "\n": From a237f2dd81ae06d41be182733a0678c66dd298ab Mon Sep 17 00:00:00 2001 From: ryo kamei Date: Sun, 29 Sep 2024 16:17:16 +0900 Subject: [PATCH 17/59] Integrate font width calculation and space width calculation --- pypdf/_cmap.py | 198 +++++++++++++++++++------------------------------ pypdf/_page.py | 20 
+++-- 2 files changed, 86 insertions(+), 132 deletions(-) diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index 8b6aa544c..12bccd4b9 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -1,6 +1,6 @@ from binascii import unhexlify from math import ceil -from typing import Any, Dict, List, Optional, Tuple, Union, cast +from typing import Any, Dict, List, Tuple, Union, cast from ._codecs import adobe_glyphs, charset_encoding from ._utils import logger_error, logger_warning @@ -75,11 +75,6 @@ def build_char_map_from_dict( for x in int_entry: if x <= 255: encoding[x] = chr(x) - try: - # override space_width with new params - space_width = _default_fonts_space_width[cast(str, ft["/BaseFont"])] - except Exception: - pass # I consider the space_code is available on one byte if isinstance(space_code, str): try: # one byte @@ -87,16 +82,17 @@ def build_char_map_from_dict( except Exception: sp = space_code.encode("utf-16-be") sp = sp[0] + 256 * sp[1] + sp = ord(map_dict[chr(sp)]) else: sp = space_code - sp_width = compute_space_width(ft, sp, space_width) + sp_width, font_width_map = compute_space_width(ft, sp, map_dict) return ( font_type, - float(sp_width / 2), + sp_width, encoding, # https://github.com/python/mypy/issues/4374 - map_dict, + map_dict ) @@ -403,18 +399,25 @@ def parse_bfchar(line: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) -> def compute_space_width( - ft: DictionaryObject, space_code: int, space_width: float -) -> float: - sp_width: float = space_width * 2.0 # default value - w = [] - w1 = {} + ft: DictionaryObject, sp: int, map_dict: Dict[Any, Any] +) -> Tuple[float, Dict[Any, float]]: + char_code_width = {} st: int = 0 + en: int = 0 + try: + default_font_width = _default_fonts_space_width[cast(str, ft["/BaseFont"])] * 2 + except Exception: + default_font_width = 2000.0 # Default font width is 0.2 + sp_width: float = default_font_width # default value if "/DescendantFonts" in ft: # ft["/Subtype"].startswith("/CIDFontType"): + # p271 PDF32000_2008 
9.7.4.3 Glyph Metrics in CIDFonts + # Widths for a CIDFont are defined using the DW and W entries. + # DW2 and W2 are for vertical use. Vertical type is not implemented. ft1 = ft["/DescendantFonts"][0].get_object() # type: ignore try: - w1[-1] = cast(float, ft1["/DW"]) + char_code_width["default"] = cast(float, ft1["/DW"]) except Exception: - w1[-1] = 1000.0 + char_code_width["default"] = default_font_width if "/W" in ft1: w = list(ft1["/W"]) else: @@ -423,13 +426,25 @@ def compute_space_width( st = w[0] if isinstance(w[0], int) else w[0].get_object() second = w[1].get_object() if isinstance(second, int): - for x in range(st, second): - w1[x] = w[2] + # C_first C_last same_W + en = second + for c_code in range(st, en + 1): + try: + conversion_char = map_dict[chr(c_code)] + char_code_width[ord(conversion_char)] = w[2] + except Exception: + pass w = w[3:] elif isinstance(second, list): - for y in second: - w1[st] = y - st += 1 + # Starting_C [W1 W2 ... Wn] + c_code = st + for width in second: + try: + conversion_char = map_dict[chr(c_code)] + char_code_width[ord(conversion_char)] = width + except Exception: + pass + c_code += 1 w = w[2:] else: logger_warning( @@ -438,119 +453,60 @@ def compute_space_width( ) break try: - sp_width = w1[space_code] + sp_width = char_code_width[sp] except Exception: sp_width = ( - w1[-1] / 2.0 + char_code_width["default"] / 2.0 ) # if using default we consider space will be only half size elif "/Widths" in ft: - w = list(ft["/Widths"]) # type: ignore + w = list(ft["/Widths"]) + if "/FontDescriptor" in ft and "/MissingWidth" in cast( + DictionaryObject, ft["/FontDescriptor"] + ): + char_code_width["default"] = ft["/FontDescriptor"]["/MissingWidth"].get_object() # type: ignore + else: + # will consider width of char as avg(width) + m = 0 + cpt = 0 + for xx in w: + xx = xx.get_object() + if xx > 0: + m += xx + cpt += 1 + char_code_width["default"] = m / max(1, cpt) try: st = cast(int, ft["/FirstChar"]) - en: int = cast(int, 
ft["/LastChar"]) - if st > space_code or en < space_code: - raise Exception("Not in range") - if w[space_code - st].get_object() == 0: - raise Exception("null width") - sp_width = w[space_code - st].get_object() + en = cast(int, ft["/LastChar"]) + if st > sp or en < sp: + raise Exception("There is no space character code in the font range") + for c_code in range(st, en + 1): + width = w[c_code - st].get_object() + if width == 0: + raise Exception("The PDF structure is invalid. The array is too " + "small for the specified font width.") + char_code_width[c_code] = width except Exception: - if "/FontDescriptor" in ft and "/MissingWidth" in cast( - DictionaryObject, ft["/FontDescriptor"] - ): - sp_width = ft["/FontDescriptor"]["/MissingWidth"].get_object() # type: ignore - else: - # will consider width of char as avg(width)/2 - m = 0 - cpt = 0 - for xx in w: - xx = xx.get_object() - if xx > 0: - m += xx - cpt += 1 - sp_width = m / max(1, cpt) / 2 - + if "default" in char_code_width: + sp_width = char_code_width["default"] + if not sp_width: + sp_width = char_code_width[sp].get_object() if is_null_or_none(sp_width): sp_width = 0.0 - return sp_width + return sp_width, char_code_width def compute_font_width( - cmap: Tuple[ - Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject] - ], - char_code: int, - font_width: float + font_width_map: Dict[Any, float], + char_code: int ) -> float: - ft = cmap[3] - char_code_map = cmap[1] - char_width: float = font_width * 2.0 # default value - w = [] - char_code_width = {} - st: int = 0 - # p271 PDF32000_2008 9.7.4.3 Glyph Metrics in CIDFonts - # Widths for a CIDFont are defined using the DW and W entries. - # DW2 and W2 are for vertical use. Vertical type is not implemented. 
- if "/DescendantFonts" in ft: # ft["/Subtype"].startswith("/CIDFontType"): - ft1 = ft["/DescendantFonts"][0].get_object() # type: ignore - try: - char_code_width["default"] = cast(float, ft1["/DW"]) - except Exception: - char_code_width["default"] = 1000.0 # Default font width is 0.1 - if "/W" in ft1: - w = list(ft1["/W"]) - else: - w = [] - while len(w) > 0: - st = w[0] if isinstance(w[0], int) else w[0].get_object() - second = w[1].get_object() - if isinstance(second, int): - # C_first - C_last same W - for x in range(st, second + 1): - try: - char_code_width[str(ord(char_code_map[chr(x)]))] = w[2] - except Exception: - char_code_width[str(x)] = w[2] - w = w[3:] - elif isinstance(second, list): - # Starting C [W1 W2 ... Wn] - for y in second: - try: - char_code_width[str(ord(char_code_map[chr(st)]))] = y - except Exception: - char_code_width[str(st)] = y - st += 1 - w = w[2:] - else: - logger_warning( - "unknown widths : \n" + (ft1["/W"]).__repr__(), - __name__, - ) - break - try: - char_width = char_code_width[str(char_code)] - except Exception: - char_width = ( - char_code_width["default"] - ) - elif "/Widths" in ft: - w = list(ft["/Widths"]) - try: - st = cast(int, ft["/FirstChar"]) - en: int = cast(int, ft["/LastChar"]) - if st > char_code or en < char_code: - raise Exception("Not in range") - if w[char_code - st].get_object() == 0: - raise Exception("null width") - char_width = w[char_code - st].get_object() - except Exception: - if "/FontDescriptor" in ft and "/MissingWidth" in cast( - DictionaryObject, ft["/FontDescriptor"] - ): - char_width = ft["/FontDescriptor"]["/MissingWidth"].get_object() # type: ignore - else: - return font_width - if is_null_or_none(char_width): - char_width = char_code_width["default"] + char_width: float = 0.0 + try: + char_width = font_width_map[char_code] + except Exception: + char_width = ( + font_width_map["default"] + ) + return char_width diff --git a/pypdf/_page.py b/pypdf/_page.py index 53196a14f..6835e5673 100644 --- 
a/pypdf/_page.py +++ b/pypdf/_page.py @@ -49,7 +49,7 @@ overload, ) -from ._cmap import build_char_map, compute_font_width, unknown_char_map +from ._cmap import build_char_map, compute_font_width, compute_space_width, unknown_char_map from ._protocols import PdfCommonDocProtocol from ._text_extraction import ( OrientationNotFoundError, @@ -1719,21 +1719,16 @@ def _debug_for_extract(self) -> str: # pragma: no cover def _get_font_widths( self, add_text: str, - cmap: Tuple[ - Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject] - ], + font_width_map: Dict[Any, float], default_width: float ) -> float: font_widths: float = 0 if add_text: for char in add_text: - font_code = ord(char) - if cmap[3]: - font_width = compute_font_width(cmap, font_code, default_width) - if font_width: - font_widths = font_widths + font_width + if font_width_map: + font_widths += compute_font_width(font_width_map, ord(char)) else: - font_widths = default_width + font_widths += default_width return font_widths def _extract_text( @@ -1974,7 +1969,10 @@ def process_operation(operator: bytes, operands: List[Any]) -> None: ) if "San" in add_text: pass - _font_widths = self._get_font_widths(add_text, cmap, _space_width) + if add_text == "l": + pass + _, font_width_map = compute_space_width(cmap[3], 32, cmap[1]) + _font_widths = self._get_font_widths(add_text, font_width_map, _space_width) else: return None if check_crlf_space: From e159e4dd7c29a90db4b85b6e3e1b443e355f31ec Mon Sep 17 00:00:00 2001 From: ryo kamei Date: Sun, 29 Sep 2024 16:47:18 +0900 Subject: [PATCH 18/59] Font width map and space width acquisition process separation --- pypdf/_cmap.py | 74 +++++++++++++++++++++++++------------------------- pypdf/_page.py | 8 ++---- 2 files changed, 39 insertions(+), 43 deletions(-) diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index 12bccd4b9..703eb6524 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -85,7 +85,8 @@ def build_char_map_from_dict( sp = ord(map_dict[chr(sp)]) 
else: sp = space_code - sp_width, font_width_map = compute_space_width(ft, sp, map_dict) + font_width_map = build_font_width_map(ft, map_dict) + sp_width = compute_space_width(font_width_map, sp) return ( font_type, @@ -398,26 +399,25 @@ def parse_bfchar(line: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) -> lst = lst[2:] -def compute_space_width( - ft: DictionaryObject, sp: int, map_dict: Dict[Any, Any] -) -> Tuple[float, Dict[Any, float]]: - char_code_width = {} +def build_font_width_map( + ft: DictionaryObject, map_dict: Dict[Any, Any] +) -> Dict[Any, float]: + font_width_map = {} st: int = 0 en: int = 0 try: default_font_width = _default_fonts_space_width[cast(str, ft["/BaseFont"])] * 2 except Exception: default_font_width = 2000.0 # Default font width is 0.2 - sp_width: float = default_font_width # default value if "/DescendantFonts" in ft: # ft["/Subtype"].startswith("/CIDFontType"): # p271 PDF32000_2008 9.7.4.3 Glyph Metrics in CIDFonts # Widths for a CIDFont are defined using the DW and W entries. # DW2 and W2 are for vertical use. Vertical type is not implemented. 
ft1 = ft["/DescendantFonts"][0].get_object() # type: ignore try: - char_code_width["default"] = cast(float, ft1["/DW"]) + font_width_map["default"] = cast(float, ft1["/DW"]) except Exception: - char_code_width["default"] = default_font_width + font_width_map["default"] = default_font_width if "/W" in ft1: w = list(ft1["/W"]) else: @@ -431,7 +431,7 @@ def compute_space_width( for c_code in range(st, en + 1): try: conversion_char = map_dict[chr(c_code)] - char_code_width[ord(conversion_char)] = w[2] + font_width_map[ord(conversion_char)] = w[2] except Exception: pass w = w[3:] @@ -441,7 +441,7 @@ def compute_space_width( for width in second: try: conversion_char = map_dict[chr(c_code)] - char_code_width[ord(conversion_char)] = width + font_width_map[ord(conversion_char)] = width except Exception: pass c_code += 1 @@ -452,18 +452,12 @@ def compute_space_width( __name__, ) break - try: - sp_width = char_code_width[sp] - except Exception: - sp_width = ( - char_code_width["default"] / 2.0 - ) # if using default we consider space will be only half size elif "/Widths" in ft: w = list(ft["/Widths"]) if "/FontDescriptor" in ft and "/MissingWidth" in cast( DictionaryObject, ft["/FontDescriptor"] ): - char_code_width["default"] = ft["/FontDescriptor"]["/MissingWidth"].get_object() # type: ignore + font_width_map["default"] = ft["/FontDescriptor"]["/MissingWidth"].get_object() # type: ignore else: # will consider width of char as avg(width) m = 0 @@ -473,26 +467,32 @@ def compute_space_width( if xx > 0: m += xx cpt += 1 - char_code_width["default"] = m / max(1, cpt) - try: - st = cast(int, ft["/FirstChar"]) - en = cast(int, ft["/LastChar"]) - if st > sp or en < sp: - raise Exception("There is no space character code in the font range") - for c_code in range(st, en + 1): - width = w[c_code - st].get_object() - if width == 0: - raise Exception("The PDF structure is invalid. 
The array is too " - "small for the specified font width.") - char_code_width[c_code] = width - except Exception: - if "default" in char_code_width: - sp_width = char_code_width["default"] - if not sp_width: - sp_width = char_code_width[sp].get_object() - if is_null_or_none(sp_width): - sp_width = 0.0 - return sp_width, char_code_width + font_width_map["default"] = m / max(1, cpt) + st = cast(int, ft["/FirstChar"]) + en = cast(int, ft["/LastChar"]) + for c_code in range(st, en + 1): + width = w[c_code - st].get_object() + if is_null_or_none(width): + # The PDF structure is invalid. The array is too small + # for the specified font width. + pass + font_width_map[c_code] = width + if "defalut" not in font_width_map: + font_width_map["default"] = default_font_width + return font_width_map + + +def compute_space_width( + font_width_map: Dict[Any, float], sp: int +) -> float: + try: + sp_width = font_width_map[sp] + except Exception: + sp_width = ( + font_width_map["default"] / 2.0 + ) # if using default we consider space will be only half size + + return sp_width def compute_font_width( diff --git a/pypdf/_page.py b/pypdf/_page.py index 6835e5673..9a1fbe96e 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -49,7 +49,7 @@ overload, ) -from ._cmap import build_char_map, compute_font_width, compute_space_width, unknown_char_map +from ._cmap import build_char_map, build_font_width_map, compute_font_width, unknown_char_map from ._protocols import PdfCommonDocProtocol from ._text_extraction import ( OrientationNotFoundError, @@ -1967,11 +1967,7 @@ def process_operation(operator: bytes, operands: List[Any]) -> None: rtl_dir, visitor_text, ) - if "San" in add_text: - pass - if add_text == "l": - pass - _, font_width_map = compute_space_width(cmap[3], 32, cmap[1]) + font_width_map = build_font_width_map(cmap[3], cmap[1]) _font_widths = self._get_font_widths(add_text, font_width_map, _space_width) else: return None From a19a8f4dba7c8455848fb8c1ca6e80ce7c3c592f Mon Sep 17 
00:00:00 2001 From: ryo kamei Date: Sun, 29 Sep 2024 19:03:47 +0900 Subject: [PATCH 19/59] Revert to original adjustment space width --- pypdf/_cmap.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index 703eb6524..51eff91ed 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -86,11 +86,11 @@ def build_char_map_from_dict( else: sp = space_code font_width_map = build_font_width_map(ft, map_dict) - sp_width = compute_space_width(font_width_map, sp) + half_space_width = compute_space_width(font_width_map, sp) / 2.0 return ( font_type, - sp_width, + half_space_width, encoding, # https://github.com/python/mypy/issues/4374 map_dict @@ -408,7 +408,7 @@ def build_font_width_map( try: default_font_width = _default_fonts_space_width[cast(str, ft["/BaseFont"])] * 2 except Exception: - default_font_width = 2000.0 # Default font width is 0.2 + default_font_width = 1000.0 # Default font width is 0.1 if "/DescendantFonts" in ft: # ft["/Subtype"].startswith("/CIDFontType"): # p271 PDF32000_2008 9.7.4.3 Glyph Metrics in CIDFonts # Widths for a CIDFont are defined using the DW and W entries. 
From 6dbda504671dabab218e2288d3688796f1425e41 Mon Sep 17 00:00:00 2001 From: ryo kamei Date: Sun, 29 Sep 2024 19:06:18 +0900 Subject: [PATCH 20/59] Supports diagonal travel distance --- pypdf/_text_extraction/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pypdf/_text_extraction/__init__.py b/pypdf/_text_extraction/__init__.py index c29aa4c1d..accfcc691 100644 --- a/pypdf/_text_extraction/__init__.py +++ b/pypdf/_text_extraction/__init__.py @@ -139,8 +139,8 @@ def crlf_space_check( ) text = "" elif ( - abs(moved_height) < f * 0.3 - and moved_width > (spacewidth + font_width) * f + (math.sqrt(moved_width * moved_width + moved_height * moved_height) + > (spacewidth + font_width) * f) and (output + text)[-1] != " " ): text += " " From 34efe522fba284ab9fa23add13f0f1035435164c Mon Sep 17 00:00:00 2001 From: ryo kamei Date: Sun, 29 Sep 2024 19:24:34 +0900 Subject: [PATCH 21/59] Font size defaults to twice the space --- pypdf/_page.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index 9a1fbe96e..cf7c60379 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -1968,7 +1968,7 @@ def process_operation(operator: bytes, operands: List[Any]) -> None: visitor_text, ) font_width_map = build_font_width_map(cmap[3], cmap[1]) - _font_widths = self._get_font_widths(add_text, font_width_map, _space_width) + _font_widths = self._get_font_widths(add_text, font_width_map, _space_width * 2.0) else: return None if check_crlf_space: From 52aa7ac2d5738ea7a5d9ff9318437ec5e7ff36b2 Mon Sep 17 00:00:00 2001 From: ryo kamei Date: Sun, 29 Sep 2024 19:31:25 +0900 Subject: [PATCH 22/59] Get the default space width from the argument --- pypdf/_cmap.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index 51eff91ed..3bda18e2e 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -86,7 +86,8 @@ def build_char_map_from_dict( else: sp = space_code 
font_width_map = build_font_width_map(ft, map_dict) - half_space_width = compute_space_width(font_width_map, sp) / 2.0 + half_space_width = compute_space_width( + font_width_map, sp, space_width) / 2.0 return ( font_type, @@ -402,11 +403,11 @@ def parse_bfchar(line: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) -> def build_font_width_map( ft: DictionaryObject, map_dict: Dict[Any, Any] ) -> Dict[Any, float]: - font_width_map = {} + font_width_map: Dict[Any, float] = {} st: int = 0 en: int = 0 try: - default_font_width = _default_fonts_space_width[cast(str, ft["/BaseFont"])] * 2 + default_font_width: float = _default_fonts_space_width[cast(str, ft["/BaseFont"])] * 2.0 except Exception: default_font_width = 1000.0 # Default font width is 0.1 if "/DescendantFonts" in ft: # ft["/Subtype"].startswith("/CIDFontType"): @@ -483,14 +484,17 @@ def build_font_width_map( def compute_space_width( - font_width_map: Dict[Any, float], sp: int + font_width_map: Dict[Any, float], sp: int, default_space_width: float ) -> float: try: sp_width = font_width_map[sp] except Exception: - sp_width = ( - font_width_map["default"] / 2.0 - ) # if using default we consider space will be only half size + if default_space_width: + sp_width = default_space_width + else: + sp_width = ( + font_width_map["default"] / 2.0 + ) # if using default we consider space will be only half size return sp_width From 7a028bbab8e4c77153e80707e37d701657ed6717 Mon Sep 17 00:00:00 2001 From: ryo kamei Date: Sun, 29 Sep 2024 20:58:42 +0900 Subject: [PATCH 23/59] fix self-made bugs --- pypdf/_cmap.py | 9 +++++++-- pypdf/_text_extraction/__init__.py | 3 ++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index 3bda18e2e..ce5b1aea2 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -82,7 +82,10 @@ def build_char_map_from_dict( except Exception: sp = space_code.encode("utf-16-be") sp = sp[0] + 256 * sp[1] - sp = ord(map_dict[chr(sp)]) + try: + sp = 
ord(map_dict[chr(sp)]) + except Exception: + pass else: sp = space_code font_width_map = build_font_width_map(ft, map_dict) @@ -401,7 +404,7 @@ def parse_bfchar(line: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) -> def build_font_width_map( - ft: DictionaryObject, map_dict: Dict[Any, Any] + ft: Union[DictionaryObject, None], map_dict: Dict[Any, Any] ) -> Dict[Any, float]: font_width_map: Dict[Any, float] = {} st: int = 0 @@ -488,6 +491,8 @@ def compute_space_width( ) -> float: try: sp_width = font_width_map[sp] + if sp_width == 0: + raise Exception("Zero width") except Exception: if default_space_width: sp_width = default_space_width diff --git a/pypdf/_text_extraction/__init__.py b/pypdf/_text_extraction/__init__.py index accfcc691..c87631579 100644 --- a/pypdf/_text_extraction/__init__.py +++ b/pypdf/_text_extraction/__init__.py @@ -137,10 +137,11 @@ def crlf_space_check( cmap[3], font_size, ) - text = "" + text = "" elif ( (math.sqrt(moved_width * moved_width + moved_height * moved_height) > (spacewidth + font_width) * f) + and (moved_width >= 0) # The string are not back. 
and (output + text)[-1] != " " ): text += " " From f02fa23c4409c17b25bec1df3c99e044966cbff7 Mon Sep 17 00:00:00 2001 From: ryo kamei Date: Sun, 29 Sep 2024 21:03:18 +0900 Subject: [PATCH 24/59] Style: Correcting code style issues --- pypdf/_cmap.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index ce5b1aea2..d5b44e13d 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -413,6 +413,9 @@ def build_font_width_map( default_font_width: float = _default_fonts_space_width[cast(str, ft["/BaseFont"])] * 2.0 except Exception: default_font_width = 1000.0 # Default font width is 0.1 + if ft is None: + font_width_map["default"] = default_font_width + return font_width_map if "/DescendantFonts" in ft: # ft["/Subtype"].startswith("/CIDFontType"): # p271 PDF32000_2008 9.7.4.3 Glyph Metrics in CIDFonts # Widths for a CIDFont are defined using the DW and W entries. From 980d8316075f4160bdad2d0ced82e4f90c105ce2 Mon Sep 17 00:00:00 2001 From: ryo kamei Date: Sun, 29 Sep 2024 21:05:45 +0900 Subject: [PATCH 25/59] Style: Correcting code style issues --- pypdf/_cmap.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index d5b44e13d..d229841b1 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -409,13 +409,14 @@ def build_font_width_map( font_width_map: Dict[Any, float] = {} st: int = 0 en: int = 0 - try: - default_font_width: float = _default_fonts_space_width[cast(str, ft["/BaseFont"])] * 2.0 - except Exception: - default_font_width = 1000.0 # Default font width is 0.1 + default_font_width = 1000.0 # Default font width is 0.1 if ft is None: font_width_map["default"] = default_font_width return font_width_map + try: + default_font_width: float = _default_fonts_space_width[cast(str, ft["/BaseFont"])] * 2.0 + except Exception: + pass if "/DescendantFonts" in ft: # ft["/Subtype"].startswith("/CIDFontType"): # p271 PDF32000_2008 9.7.4.3 Glyph Metrics in CIDFonts # Widths for a CIDFont are 
defined using the DW and W entries. From 5e6a0dd52853ef424bc54699a1ba4444b0dd9e95 Mon Sep 17 00:00:00 2001 From: ryo kamei Date: Sun, 29 Sep 2024 21:25:57 +0900 Subject: [PATCH 26/59] fix self-made bugs --- pypdf/_cmap.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index d229841b1..15f49419c 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -479,12 +479,13 @@ def build_font_width_map( st = cast(int, ft["/FirstChar"]) en = cast(int, ft["/LastChar"]) for c_code in range(st, en + 1): - width = w[c_code - st].get_object() - if is_null_or_none(width): + try: + width = w[c_code - st].get_object() + font_width_map[c_code] = width + except Exception: # The PDF structure is invalid. The array is too small # for the specified font width. pass - font_width_map[c_code] = width if "defalut" not in font_width_map: font_width_map["default"] = default_font_width return font_width_map From 8078ac14b85c56768d05edebb95af91ea4c831eb Mon Sep 17 00:00:00 2001 From: ryo kamei Date: Sun, 29 Sep 2024 21:33:18 +0900 Subject: [PATCH 27/59] Style: Correcting code style issues --- pypdf/_cmap.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index 15f49419c..9f66c9c2c 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -409,12 +409,12 @@ def build_font_width_map( font_width_map: Dict[Any, float] = {} st: int = 0 en: int = 0 - default_font_width = 1000.0 # Default font width is 0.1 + default_font_width: float = 1000.0 # Default font width is 0.1 if ft is None: font_width_map["default"] = default_font_width return font_width_map try: - default_font_width: float = _default_fonts_space_width[cast(str, ft["/BaseFont"])] * 2.0 + default_font_width = _default_fonts_space_width[cast(str, ft["/BaseFont"])] * 2.0 except Exception: pass if "/DescendantFonts" in ft: # ft["/Subtype"].startswith("/CIDFontType"): From b842ceed2f1a7b2a825c72fd3d4168414fc993c2 Mon Sep 17 00:00:00 2001 From: Ryo 
Kamei Date: Mon, 30 Sep 2024 17:44:02 +0900 Subject: [PATCH 28/59] Compliant with PDF1.7 specifications --- pypdf/_text_extraction/__init__.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/pypdf/_text_extraction/__init__.py b/pypdf/_text_extraction/__init__.py index c87631579..e2adc15b6 100644 --- a/pypdf/_text_extraction/__init__.py +++ b/pypdf/_text_extraction/__init__.py @@ -113,8 +113,9 @@ def crlf_space_check( orientation = orient(m) delta_x = m[4] - m_prev[4] delta_y = m[5] - m_prev[5] - k = math.sqrt(abs(m[0] * m[3]) + abs(m[1] * m[2])) - f = font_size * k + # PDF 32000-1:2008 p249 Table 108 Text positioning operators + scale_x = math.sqrt(cm_matrix[0]**2 + cm_matrix[1]**2) + scale_y = math.sqrt(cm_matrix[2]**2 + cm_matrix[3]**2) cm_prev = m if orientation not in orientations: @@ -126,7 +127,7 @@ def crlf_space_check( moved_height = delta_x moved_width = delta_y try: - if abs(moved_height) > 0.8 * f: + if abs(moved_height) > 0.8 * font_size * scale_y: if (output + text)[-1] != "\n": output += text + "\n" if visitor_text is not None: @@ -139,9 +140,7 @@ def crlf_space_check( ) text = "" elif ( - (math.sqrt(moved_width * moved_width + moved_height * moved_height) - > (spacewidth + font_width) * f) - and (moved_width >= 0) # The string are not back. 
+ (moved_width >= (spacewidth + font_width) * font_size * scale_x) and (output + text)[-1] != " " ): text += " " From d1c54dbed6f566d58258fabafb74d7a78c2bcf9b Mon Sep 17 00:00:00 2001 From: Ryo Kamei Date: Mon, 30 Sep 2024 20:49:05 +0900 Subject: [PATCH 29/59] fix self-made bugs --- pypdf/_text_extraction/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pypdf/_text_extraction/__init__.py b/pypdf/_text_extraction/__init__.py index e2adc15b6..d2ed22fae 100644 --- a/pypdf/_text_extraction/__init__.py +++ b/pypdf/_text_extraction/__init__.py @@ -114,8 +114,8 @@ def crlf_space_check( delta_x = m[4] - m_prev[4] delta_y = m[5] - m_prev[5] # PDF 32000-1:2008 p249 Table 108 Text positioning operators - scale_x = math.sqrt(cm_matrix[0]**2 + cm_matrix[1]**2) - scale_y = math.sqrt(cm_matrix[2]**2 + cm_matrix[3]**2) + scale_x = math.sqrt(tm_matrix[0]**2 + tm_matrix[1]**2) + scale_y = math.sqrt(tm_matrix[2]**2 + tm_matrix[3]**2) cm_prev = m if orientation not in orientations: From 328d22be9d2c10ff0888f64e777464ffcc617aff Mon Sep 17 00:00:00 2001 From: Ryo Kamei Date: Mon, 30 Sep 2024 20:50:10 +0900 Subject: [PATCH 30/59] font_map efficiency --- pypdf/_page.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index cf7c60379..502c4478c 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -1759,6 +1759,7 @@ def _extract_text( str, float, Union[str, Dict[int, str]], Dict[str, str], DictionaryObject ], ] = {} + font_width_maps: Dict[str, Dict[str, float]] = {} try: objr = obj while NameObject(PG.RESOURCES) not in objr: @@ -1822,10 +1823,12 @@ def process_operation(operator: bytes, operands: List[Any]) -> None: nonlocal cm_matrix, cm_stack, tm_matrix, cm_prev, tm_prev, memo_cm, memo_tm nonlocal char_scale, space_scale, _space_width, TL, font_size, cmap nonlocal orientations, rtl_dir, visitor_text, output, text, _font_widths + nonlocal font_width_maps global CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, 
CUSTOM_RTL_SPECIAL_CHARS add_text: str = "" check_crlf_space: bool = False + font_widths: float = 0.0 # Table 5.4 page 405 if operator == b"BT": tm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] @@ -1939,6 +1942,8 @@ def process_operation(operator: bytes, operands: List[Any]) -> None: ty = float(operands[1]) tm_matrix[4] += tx * tm_matrix[0] + ty * tm_matrix[2] tm_matrix[5] += tx * tm_matrix[1] + ty * tm_matrix[3] + font_widths = current_fontwidths() + _font_widths = 0.0 elif operator == b"Tm": check_crlf_space = True tm_matrix = [ @@ -1967,8 +1972,10 @@ def process_operation(operator: bytes, operands: List[Any]) -> None: rtl_dir, visitor_text, ) - font_width_map = build_font_width_map(cmap[3], cmap[1]) - _font_widths = self._get_font_widths(add_text, font_width_map, _space_width * 2.0) + if cmap[2] not in font_width_maps: + font_width_maps[cmap[2]] = build_font_width_map(cmap[3], cmap[1]) + font_width_map = font_width_maps[cmap[2]] + _font_widths += self._get_font_widths(add_text, font_width_map, _space_width * 2.0) else: return None if check_crlf_space: @@ -1984,7 +1991,7 @@ def process_operation(operator: bytes, operands: List[Any]) -> None: font_size, visitor_text, current_spacewidth(), - current_fontwidths() + font_widths ) if text == "": memo_cm = cm_matrix.copy() From e3924167820927cec09ad1ccaa0aa9737f0fa9d1 Mon Sep 17 00:00:00 2001 From: Ryo Kamei Date: Mon, 30 Sep 2024 21:24:46 +0900 Subject: [PATCH 31/59] fix self-made bugs --- pypdf/_cmap.py | 2 +- pypdf/_page.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index 9f66c9c2c..b67f1495d 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -486,7 +486,7 @@ def build_font_width_map( # The PDF structure is invalid. The array is too small # for the specified font width. 
pass - if "defalut" not in font_width_map: + if "default" not in font_width_map: font_width_map["default"] = default_font_width return font_width_map diff --git a/pypdf/_page.py b/pypdf/_page.py index 502c4478c..e5432e5b3 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -1954,6 +1954,8 @@ def process_operation(operator: bytes, operands: List[Any]) -> None: float(operands[4]), float(operands[5]), ] + font_widths = current_fontwidths() + _font_widths = 0.0 elif operator == b"T*": check_crlf_space = True tm_matrix[5] -= TL From 9e6d2cea7b592fe37c601270ae3b2d80357a71cf Mon Sep 17 00:00:00 2001 From: Ryo Kamei Date: Mon, 30 Sep 2024 21:37:35 +0900 Subject: [PATCH 32/59] fix self-made bugs --- pypdf/_cmap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index b67f1495d..9a75e0c53 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -486,7 +486,7 @@ def build_font_width_map( # The PDF structure is invalid. The array is too small # for the specified font width. 
pass - if "default" not in font_width_map: + if is_null_or_none(font_width_map.get("default")): font_width_map["default"] = default_font_width return font_width_map From 1fe5285d22ba4de0b659f8cd734451d2c94eaac7 Mon Sep 17 00:00:00 2001 From: Ryo Kamei Date: Mon, 30 Sep 2024 23:14:36 +0900 Subject: [PATCH 33/59] style: Correcting code style issues --- pypdf/_page.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index e5432e5b3..9458bd3e6 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -496,6 +496,7 @@ def __init__( if not is_null_or_none(indirect_reference): assert indirect_reference is not None, "mypy" self.update(cast(DictionaryObject, indirect_reference.get_object())) + self._font_width_maps: Dict[str, Dict[str, float]] = {} def hash_bin(self) -> int: """ @@ -1718,11 +1719,17 @@ def _debug_for_extract(self) -> str: # pragma: no cover def _get_font_widths( self, + cmap: Tuple[ + Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject] + ], add_text: str, - font_width_map: Dict[Any, float], default_width: float ) -> float: font_widths: float = 0 + font_name: str = cmap[2] + if font_name not in self._font_width_maps: + self._font_width_maps[font_name] = build_font_width_map(cmap[3], cmap[1]) + font_width_map: Dict[Any, float] = self._font_width_maps[font_name] if add_text: for char in add_text: if font_width_map: @@ -1759,7 +1766,6 @@ def _extract_text( str, float, Union[str, Dict[int, str]], Dict[str, str], DictionaryObject ], ] = {} - font_width_maps: Dict[str, Dict[str, float]] = {} try: objr = obj while NameObject(PG.RESOURCES) not in objr: @@ -1823,10 +1829,8 @@ def process_operation(operator: bytes, operands: List[Any]) -> None: nonlocal cm_matrix, cm_stack, tm_matrix, cm_prev, tm_prev, memo_cm, memo_tm nonlocal char_scale, space_scale, _space_width, TL, font_size, cmap nonlocal orientations, rtl_dir, visitor_text, output, text, _font_widths - nonlocal 
font_width_maps global CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS - add_text: str = "" check_crlf_space: bool = False font_widths: float = 0.0 # Table 5.4 page 405 @@ -1959,7 +1963,6 @@ def process_operation(operator: bytes, operands: List[Any]) -> None: elif operator == b"T*": check_crlf_space = True tm_matrix[5] -= TL - elif operator == b"Tj": check_crlf_space = True text, rtl_dir, add_text = handle_tj( @@ -1974,10 +1977,7 @@ def process_operation(operator: bytes, operands: List[Any]) -> None: rtl_dir, visitor_text, ) - if cmap[2] not in font_width_maps: - font_width_maps[cmap[2]] = build_font_width_map(cmap[3], cmap[1]) - font_width_map = font_width_maps[cmap[2]] - _font_widths += self._get_font_widths(add_text, font_width_map, _space_width * 2.0) + _font_widths += self._get_font_widths(cmap, add_text, _space_width * 2.0) else: return None if check_crlf_space: @@ -2074,7 +2074,6 @@ def process_operation(operator: bytes, operands: List[Any]) -> None: text = "" memo_cm = cm_matrix.copy() memo_tm = tm_matrix.copy() - else: process_operation(operator, operands) if visitor_operand_after is not None: From 0292b13f32895054e1d4dfab5a898da21613ce26 Mon Sep 17 00:00:00 2001 From: ryo kamei Date: Tue, 1 Oct 2024 10:06:41 +0900 Subject: [PATCH 34/59] Style: Correcting code style issues --- pypdf/_cmap.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index 9a75e0c53..80e9b1bec 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -5,6 +5,7 @@ from ._codecs import adobe_glyphs, charset_encoding from ._utils import logger_error, logger_warning from .generic import ( + ArrayObject, DecodedStreamObject, DictionaryObject, StreamObject, @@ -461,7 +462,7 @@ def build_font_width_map( ) break elif "/Widths" in ft: - w = list(ft["/Widths"]) + w = list(ft["/Widths"]) if isinstance(ft["/Widths"], ArrayObject) else [] if "/FontDescriptor" in ft and "/MissingWidth" in cast( DictionaryObject, ft["/FontDescriptor"] ): From 
444bef8e4c87f407d3c732e60add868784107992 Mon Sep 17 00:00:00 2001 From: Ryo Kamei Date: Tue, 1 Oct 2024 18:02:50 +0900 Subject: [PATCH 35/59] BUG: Changed timing of font size calculation to before font switching --- pypdf/_page.py | 37 +++++++++++++++++------------- pypdf/_text_extraction/__init__.py | 11 +++++---- 2 files changed, 27 insertions(+), 21 deletions(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index 9458bd3e6..25842cb76 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -1717,13 +1717,14 @@ def _debug_for_extract(self) -> str: # pragma: no cover out += "No Font\n" return out - def _get_font_widths( + def _get_acutual_font_widths( self, cmap: Tuple[ Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject] ], add_text: str, - default_width: float + font_size: float, + default_space_width: float ) -> float: font_widths: float = 0 font_name: str = cmap[2] @@ -1735,8 +1736,8 @@ def _get_font_widths( if font_width_map: font_widths += compute_font_width(font_width_map, ord(char)) else: - font_widths += default_width - return font_widths + font_widths += default_space_width * 2 + return (font_widths * font_size, default_space_width * font_size, font_size) def _extract_text( self, @@ -1815,24 +1816,25 @@ def _extract_text( char_scale = 1.0 space_scale = 1.0 _space_width: float = 500.0 # will be set correctly at first Tf - _font_widths: float = 0.0 + _actual_str_size: Dict[str, float] = { + "str_widths": 0.0, "space_width": 0.0, "str_height": 0.0} # will be set string length calculation result TL = 0.0 font_size = 12.0 # init just in case of def current_spacewidth() -> float: return _space_width / 1000.0 - def current_fontwidths() -> float: - return _font_widths / 1000.0 + def current_strwidths() -> float: + return _actual_str_size["str_widths"] / 1000.0 def process_operation(operator: bytes, operands: List[Any]) -> None: nonlocal cm_matrix, cm_stack, tm_matrix, cm_prev, tm_prev, memo_cm, memo_tm nonlocal char_scale, space_scale, 
_space_width, TL, font_size, cmap - nonlocal orientations, rtl_dir, visitor_text, output, text, _font_widths + nonlocal orientations, rtl_dir, visitor_text, output, text, _actual_str_size global CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS check_crlf_space: bool = False - font_widths: float = 0.0 + str_widths: float = 0.0 # Table 5.4 page 405 if operator == b"BT": tm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] @@ -1946,8 +1948,8 @@ def process_operation(operator: bytes, operands: List[Any]) -> None: ty = float(operands[1]) tm_matrix[4] += tx * tm_matrix[0] + ty * tm_matrix[2] tm_matrix[5] += tx * tm_matrix[1] + ty * tm_matrix[3] - font_widths = current_fontwidths() - _font_widths = 0.0 + str_widths = current_strwidths() + _actual_str_size["str_widths"] = 0.0 elif operator == b"Tm": check_crlf_space = True tm_matrix = [ @@ -1958,8 +1960,8 @@ def process_operation(operator: bytes, operands: List[Any]) -> None: float(operands[4]), float(operands[5]), ] - font_widths = current_fontwidths() - _font_widths = 0.0 + str_widths = current_strwidths() + _actual_str_size["str_widths"] = 0.0 elif operator == b"T*": check_crlf_space = True tm_matrix[5] -= TL @@ -1977,7 +1979,9 @@ def process_operation(operator: bytes, operands: List[Any]) -> None: rtl_dir, visitor_text, ) - _font_widths += self._get_font_widths(cmap, add_text, _space_width * 2.0) + current_font_widths, _actual_str_size["space_width"], _actual_str_size["str_height"] = ( + self._get_acutual_font_widths(cmap, add_text, font_size, current_spacewidth())) + _actual_str_size["str_widths"] += current_font_widths else: return None if check_crlf_space: @@ -1992,8 +1996,9 @@ def process_operation(operator: bytes, operands: List[Any]) -> None: output, font_size, visitor_text, - current_spacewidth(), - font_widths + str_widths, + _actual_str_size["space_width"], + _actual_str_size["str_height"] ) if text == "": memo_cm = cm_matrix.copy() diff --git a/pypdf/_text_extraction/__init__.py 
b/pypdf/_text_extraction/__init__.py index d2ed22fae..6fbec903f 100644 --- a/pypdf/_text_extraction/__init__.py +++ b/pypdf/_text_extraction/__init__.py @@ -98,8 +98,9 @@ def crlf_space_check( output: str, font_size: float, visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]], + str_widhts: float, spacewidth: float, - font_width: float + str_height: float, ) -> Tuple[str, str, List[float], List[float]]: cm_prev = cmtm_prev[0] tm_prev = cmtm_prev[1] @@ -114,8 +115,8 @@ def crlf_space_check( delta_x = m[4] - m_prev[4] delta_y = m[5] - m_prev[5] # PDF 32000-1:2008 p249 Table 108 Text positioning operators - scale_x = math.sqrt(tm_matrix[0]**2 + tm_matrix[1]**2) - scale_y = math.sqrt(tm_matrix[2]**2 + tm_matrix[3]**2) + scale_x = math.sqrt(tm_prev[0]**2 + tm_prev[1]**2) + scale_y = math.sqrt(tm_prev[2]**2 + tm_prev[3]**2) cm_prev = m if orientation not in orientations: @@ -127,7 +128,7 @@ def crlf_space_check( moved_height = delta_x moved_width = delta_y try: - if abs(moved_height) > 0.8 * font_size * scale_y: + if abs(moved_height) > 0.8 * str_height * scale_y: if (output + text)[-1] != "\n": output += text + "\n" if visitor_text is not None: @@ -140,7 +141,7 @@ def crlf_space_check( ) text = "" elif ( - (moved_width >= (spacewidth + font_width) * font_size * scale_x) + (moved_width >= spacewidth + str_widhts * scale_x) and (output + text)[-1] != " " ): text += " " From ce36f4886d45eef0ab7c471afc4952d7d1745ddd Mon Sep 17 00:00:00 2001 From: Ryo Kamei Date: Tue, 1 Oct 2024 18:07:57 +0900 Subject: [PATCH 36/59] STY: Correcting code style issues --- pypdf/_page.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index 25842cb76..13ac88512 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -1725,7 +1725,7 @@ def _get_acutual_font_widths( add_text: str, font_size: float, default_space_width: float - ) -> float: + ) -> Tuple[float, float, float]: font_widths: float = 0 font_name: str = cmap[2] if font_name not 
in self._font_width_maps: From 68862dc9ba44478df061e0a0a5f763b5e5cf50bf Mon Sep 17 00:00:00 2001 From: Ryo Kamei Date: Tue, 1 Oct 2024 19:30:00 +0900 Subject: [PATCH 37/59] BUG: Modify space calculation results to match original code --- pypdf/_cmap.py | 21 ++++++++------------- pypdf/_page.py | 2 +- 2 files changed, 9 insertions(+), 14 deletions(-) diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index 80e9b1bec..8bb75a4ca 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -89,9 +89,8 @@ def build_char_map_from_dict( pass else: sp = space_code - font_width_map = build_font_width_map(ft, map_dict) - half_space_width = compute_space_width( - font_width_map, sp, space_width) / 2.0 + font_width_map = build_font_width_map(ft, map_dict, space_width * 2.0) + half_space_width = compute_space_width(font_width_map, sp) / 2.0 return ( font_type, @@ -405,12 +404,11 @@ def parse_bfchar(line: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) -> def build_font_width_map( - ft: Union[DictionaryObject, None], map_dict: Dict[Any, Any] + ft: Union[DictionaryObject, None], map_dict: Dict[Any, Any], default_font_width: float ) -> Dict[Any, float]: font_width_map: Dict[Any, float] = {} st: int = 0 en: int = 0 - default_font_width: float = 1000.0 # Default font width is 0.1 if ft is None: font_width_map["default"] = default_font_width return font_width_map @@ -488,24 +486,21 @@ def build_font_width_map( # for the specified font width. 
pass if is_null_or_none(font_width_map.get("default")): - font_width_map["default"] = default_font_width + font_width_map["default"] = 0.0 return font_width_map def compute_space_width( - font_width_map: Dict[Any, float], sp: int, default_space_width: float + font_width_map: Dict[Any, float], sp: int ) -> float: try: sp_width = font_width_map[sp] if sp_width == 0: raise Exception("Zero width") except Exception: - if default_space_width: - sp_width = default_space_width - else: - sp_width = ( - font_width_map["default"] / 2.0 - ) # if using default we consider space will be only half size + sp_width = ( + font_width_map["default"] / 2.0 + ) # if using default we consider space will be only half size return sp_width diff --git a/pypdf/_page.py b/pypdf/_page.py index 13ac88512..322721b56 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -1729,7 +1729,7 @@ def _get_acutual_font_widths( font_widths: float = 0 font_name: str = cmap[2] if font_name not in self._font_width_maps: - self._font_width_maps[font_name] = build_font_width_map(cmap[3], cmap[1]) + self._font_width_maps[font_name] = build_font_width_map(cmap[3], cmap[1], default_space_width * 2) font_width_map: Dict[Any, float] = self._font_width_maps[font_name] if add_text: for char in add_text: From 4bcfac34b741c3bd17a05191193566fc33f6cab0 Mon Sep 17 00:00:00 2001 From: Ryo Kamei Date: Tue, 1 Oct 2024 20:19:36 +0900 Subject: [PATCH 38/59] BUG: If there is no default value for font, set to argument --- pypdf/_cmap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index 8bb75a4ca..41512b747 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -486,7 +486,7 @@ def build_font_width_map( # for the specified font width. 
pass if is_null_or_none(font_width_map.get("default")): - font_width_map["default"] = 0.0 + font_width_map["default"] = default_font_width if default_font_width else 0.0 return font_width_map From 6d7f75e823ca9f457182d763e70b63b038ef4555 Mon Sep 17 00:00:00 2001 From: Ryo Kamei Date: Tue, 1 Oct 2024 20:20:33 +0900 Subject: [PATCH 39/59] BUG: Mistakes in Multiplication --- pypdf/_text_extraction/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/_text_extraction/__init__.py b/pypdf/_text_extraction/__init__.py index 6fbec903f..9781a07e8 100644 --- a/pypdf/_text_extraction/__init__.py +++ b/pypdf/_text_extraction/__init__.py @@ -141,7 +141,7 @@ def crlf_space_check( ) text = "" elif ( - (moved_width >= spacewidth + str_widhts * scale_x) + (moved_width >= (spacewidth + str_widhts) * scale_x) and (output + text)[-1] != " " ): text += " " From 3e79f20ebf9790adee0ba0fd50cd40524f120913 Mon Sep 17 00:00:00 2001 From: Ryo Kamei <44630192+ssjkamei@users.noreply.github.com> Date: Tue, 1 Oct 2024 20:53:53 +0900 Subject: [PATCH 40/59] Update pypdf/_page.py Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com> --- pypdf/_page.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index 322721b56..35bbb7529 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -1817,7 +1817,7 @@ def _extract_text( space_scale = 1.0 _space_width: float = 500.0 # will be set correctly at first Tf _actual_str_size: Dict[str, float] = { - "str_widths": 0.0, "space_width": 0.0, "str_height": 0.0} # will be set string length calculation result + "str_widths": 0.0, "space_width": 0.0, "str_height": 0.0} # will be set to string length calculation result TL = 0.0 font_size = 12.0 # init just in case of From e33b65fc7455f74c06484466ef5f256ccc35f92b Mon Sep 17 00:00:00 2001 From: Ryo Kamei <44630192+ssjkamei@users.noreply.github.com> Date: Tue, 1 Oct 2024 21:08:44 +0900 Subject: [PATCH 41/59] Update 
pypdf/_text_extraction/__init__.py Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com> --- pypdf/_text_extraction/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/_text_extraction/__init__.py b/pypdf/_text_extraction/__init__.py index 9781a07e8..36e4914be 100644 --- a/pypdf/_text_extraction/__init__.py +++ b/pypdf/_text_extraction/__init__.py @@ -98,7 +98,7 @@ def crlf_space_check( output: str, font_size: float, visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]], - str_widhts: float, + str_widths: float, spacewidth: float, str_height: float, ) -> Tuple[str, str, List[float], List[float]]: From bef78627d714003bc39bf42ddf9aa5feeeddea09 Mon Sep 17 00:00:00 2001 From: Ryo Kamei Date: Tue, 1 Oct 2024 21:23:02 +0900 Subject: [PATCH 42/59] Typo --- pypdf/_text_extraction/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/_text_extraction/__init__.py b/pypdf/_text_extraction/__init__.py index 36e4914be..7f86d7807 100644 --- a/pypdf/_text_extraction/__init__.py +++ b/pypdf/_text_extraction/__init__.py @@ -141,7 +141,7 @@ def crlf_space_check( ) text = "" elif ( - (moved_width >= (spacewidth + str_widhts) * scale_x) + (moved_width >= (spacewidth + str_widths) * scale_x) and (output + text)[-1] != " " ): text += " " From 2b0e5305a317bf637a4e17dc5949364876a61efe Mon Sep 17 00:00:00 2001 From: Ryo Kamei Date: Tue, 1 Oct 2024 21:25:36 +0900 Subject: [PATCH 43/59] Modifying a comment --- pypdf/_cmap.py | 2 +- pypdf/_text_extraction/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index 41512b747..c6202c208 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -417,7 +417,7 @@ def build_font_width_map( except Exception: pass if "/DescendantFonts" in ft: # ft["/Subtype"].startswith("/CIDFontType"): - # p271 PDF32000_2008 9.7.4.3 Glyph Metrics in CIDFonts + # PDF ref 1.7 9.7.4.3 Glyph Metrics in CIDFonts # Widths for a 
CIDFont are defined using the DW and W entries. # DW2 and W2 are for vertical use. Vertical type is not implemented. ft1 = ft["/DescendantFonts"][0].get_object() # type: ignore diff --git a/pypdf/_text_extraction/__init__.py b/pypdf/_text_extraction/__init__.py index 7f86d7807..9eca070e0 100644 --- a/pypdf/_text_extraction/__init__.py +++ b/pypdf/_text_extraction/__init__.py @@ -114,7 +114,7 @@ def crlf_space_check( orientation = orient(m) delta_x = m[4] - m_prev[4] delta_y = m[5] - m_prev[5] - # PDF 32000-1:2008 p249 Table 108 Text positioning operators + # PDF ref 1.7 Table 108 Text positioning operators scale_x = math.sqrt(tm_prev[0]**2 + tm_prev[1]**2) scale_y = math.sqrt(tm_prev[2]**2 + tm_prev[3]**2) cm_prev = m From cb5bf4a265a9dfae6659a7c3d22b98751f7fae3a Mon Sep 17 00:00:00 2001 From: Ryo Kamei Date: Tue, 1 Oct 2024 21:36:20 +0900 Subject: [PATCH 44/59] More detailed error codes --- pypdf/_cmap.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index c6202c208..38fff47c7 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -85,7 +85,7 @@ def build_char_map_from_dict( sp = sp[0] + 256 * sp[1] try: sp = ord(map_dict[chr(sp)]) - except Exception: + except KeyError: pass else: sp = space_code @@ -439,7 +439,7 @@ def build_font_width_map( try: conversion_char = map_dict[chr(c_code)] font_width_map[ord(conversion_char)] = w[2] - except Exception: + except KeyError: pass w = w[3:] elif isinstance(second, list): @@ -449,7 +449,7 @@ def build_font_width_map( try: conversion_char = map_dict[chr(c_code)] font_width_map[ord(conversion_char)] = width - except Exception: + except KeyError: pass c_code += 1 w = w[2:] @@ -481,7 +481,7 @@ def build_font_width_map( try: width = w[c_code - st].get_object() font_width_map[c_code] = width - except Exception: + except KeyError: # The PDF structure is invalid. The array is too small # for the specified font width. 
pass @@ -496,8 +496,8 @@ def compute_space_width( try: sp_width = font_width_map[sp] if sp_width == 0: - raise Exception("Zero width") - except Exception: + raise ValueError("Zero width") + except ValueError: sp_width = ( font_width_map["default"] / 2.0 ) # if using default we consider space will be only half size @@ -512,7 +512,7 @@ def compute_font_width( char_width: float = 0.0 try: char_width = font_width_map[char_code] - except Exception: + except KeyError: char_width = ( font_width_map["default"] ) From c8ce234be8eb62b199dabc1956e2125d7b90b33c Mon Sep 17 00:00:00 2001 From: Ryo Kamei Date: Tue, 1 Oct 2024 21:49:11 +0900 Subject: [PATCH 45/59] Allow list conversion of /Widths other than ArrayObject --- pypdf/_cmap.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index 38fff47c7..699f6c5de 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -5,7 +5,6 @@ from ._codecs import adobe_glyphs, charset_encoding from ._utils import logger_error, logger_warning from .generic import ( - ArrayObject, DecodedStreamObject, DictionaryObject, StreamObject, @@ -460,7 +459,10 @@ def build_font_width_map( ) break elif "/Widths" in ft: - w = list(ft["/Widths"]) if isinstance(ft["/Widths"], ArrayObject) else [] + try: + w = cast(list, ft["/Widths"].get_object()) + except Exception: + w = [] if "/FontDescriptor" in ft and "/MissingWidth" in cast( DictionaryObject, ft["/FontDescriptor"] ): From fd82bde465fd001e02ad392435e7508131ecdedb Mon Sep 17 00:00:00 2001 From: Ryo Kamei Date: Tue, 1 Oct 2024 21:57:39 +0900 Subject: [PATCH 46/59] Exception code omitted --- pypdf/_cmap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index 699f6c5de..69b3d15f2 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -499,7 +499,7 @@ def compute_space_width( sp_width = font_width_map[sp] if sp_width == 0: raise ValueError("Zero width") - except ValueError: + except (KeyError, ValueError): 
sp_width = ( font_width_map["default"] / 2.0 ) # if using default we consider space will be only half size From d79da5b815e58ad32ca93135818336c69d9f7d82 Mon Sep 17 00:00:00 2001 From: Ryo Kamei Date: Tue, 1 Oct 2024 22:00:16 +0900 Subject: [PATCH 47/59] Explicit description of type --- pypdf/_cmap.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index 69b3d15f2..7cd8b7dee 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -7,6 +7,7 @@ from .generic import ( DecodedStreamObject, DictionaryObject, + PdfObject, StreamObject, is_null_or_none, ) @@ -460,7 +461,7 @@ def build_font_width_map( break elif "/Widths" in ft: try: - w = cast(list, ft["/Widths"].get_object()) + w: List[PdfObject] = cast(list, ft["/Widths"].get_object()) except Exception: w = [] if "/FontDescriptor" in ft and "/MissingWidth" in cast( From 98ccb3a8521bb8e267caab8e9071bfbcbc130bbf Mon Sep 17 00:00:00 2001 From: Ryo Kamei Date: Tue, 1 Oct 2024 22:06:21 +0900 Subject: [PATCH 48/59] style: Correcting code style issues --- pypdf/_cmap.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index 7cd8b7dee..110fd6786 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -421,6 +421,7 @@ def build_font_width_map( # Widths for a CIDFont are defined using the DW and W entries. # DW2 and W2 are for vertical use. Vertical type is not implemented. 
ft1 = ft["/DescendantFonts"][0].get_object() # type: ignore + w: List[PdfObject] = [] try: font_width_map["default"] = cast(float, ft1["/DW"]) except Exception: @@ -461,7 +462,7 @@ def build_font_width_map( break elif "/Widths" in ft: try: - w: List[PdfObject] = cast(list, ft["/Widths"].get_object()) + w = cast(list, ft["/Widths"].get_object()) except Exception: w = [] if "/FontDescriptor" in ft and "/MissingWidth" in cast( From b13b97fc125e839d628863e9cf0f46fc9158a76f Mon Sep 17 00:00:00 2001 From: Ryo Kamei Date: Tue, 1 Oct 2024 22:35:05 +0900 Subject: [PATCH 49/59] Convert character map keys from int(ord) to str --- pypdf/_cmap.py | 12 ++++++------ pypdf/_page.py | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index 110fd6786..74b2227f8 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -90,7 +90,7 @@ def build_char_map_from_dict( else: sp = space_code font_width_map = build_font_width_map(ft, map_dict, space_width * 2.0) - half_space_width = compute_space_width(font_width_map, sp) / 2.0 + half_space_width = compute_space_width(font_width_map, chr(sp)) / 2.0 return ( font_type, @@ -449,7 +449,7 @@ def build_font_width_map( for width in second: try: conversion_char = map_dict[chr(c_code)] - font_width_map[ord(conversion_char)] = width + font_width_map[conversion_char] = width except KeyError: pass c_code += 1 @@ -484,8 +484,8 @@ def build_font_width_map( for c_code in range(st, en + 1): try: width = w[c_code - st].get_object() - font_width_map[c_code] = width - except KeyError: + font_width_map[chr(c_code)] = width + except IndexError: # The PDF structure is invalid. The array is too small # for the specified font width. 
pass @@ -511,11 +511,11 @@ def compute_space_width( def compute_font_width( font_width_map: Dict[Any, float], - char_code: int + char: str ) -> float: char_width: float = 0.0 try: - char_width = font_width_map[char_code] + char_width = font_width_map[char] except KeyError: char_width = ( font_width_map["default"] diff --git a/pypdf/_page.py b/pypdf/_page.py index 35bbb7529..6349cc62f 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -1734,7 +1734,7 @@ def _get_acutual_font_widths( if add_text: for char in add_text: if font_width_map: - font_widths += compute_font_width(font_width_map, ord(char)) + font_widths += compute_font_width(font_width_map, char) else: font_widths += default_space_width * 2 return (font_widths * font_size, default_space_width * font_size, font_size) From ef7331522f9a8f17e6641f2358b85f1f2054a6e6 Mon Sep 17 00:00:00 2001 From: Ryo Kamei Date: Tue, 1 Oct 2024 23:01:49 +0900 Subject: [PATCH 50/59] Style: Correcting code style issues --- pypdf/_cmap.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index 74b2227f8..68623419d 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -421,7 +421,7 @@ def build_font_width_map( # Widths for a CIDFont are defined using the DW and W entries. # DW2 and W2 are for vertical use. Vertical type is not implemented. ft1 = ft["/DescendantFonts"][0].get_object() # type: ignore - w: List[PdfObject] = [] + w: List[Union[int, PdfObject]] = [] try: font_width_map["default"] = cast(float, ft1["/DW"]) except Exception: @@ -483,7 +483,11 @@ def build_font_width_map( en = cast(int, ft["/LastChar"]) for c_code in range(st, en + 1): try: - width = w[c_code - st].get_object() + width_obj = w[c_code - st].get_object() + if is_null_or_none(width_obj): + width = 0.0 + else: + width = float(width_obj) font_width_map[chr(c_code)] = width except IndexError: # The PDF structure is invalid. 
The array is too small @@ -495,10 +499,10 @@ def build_font_width_map( def compute_space_width( - font_width_map: Dict[Any, float], sp: int + font_width_map: Dict[Any, float], space_char: str ) -> float: try: - sp_width = font_width_map[sp] + sp_width = font_width_map[space_char] if sp_width == 0: raise ValueError("Zero width") except (KeyError, ValueError): From f8841602ad4631e57cd8d2c3c23a3b16dbdf58ad Mon Sep 17 00:00:00 2001 From: Ryo Kamei <44630192+ssjkamei@users.noreply.github.com> Date: Tue, 1 Oct 2024 23:07:16 +0900 Subject: [PATCH 51/59] Update pypdf/_cmap.py Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com> --- pypdf/_cmap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index 68623419d..0aaa27dc0 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -417,7 +417,7 @@ def build_font_width_map( except Exception: pass if "/DescendantFonts" in ft: # ft["/Subtype"].startswith("/CIDFontType"): - # PDF ref 1.7 9.7.4.3 Glyph Metrics in CIDFonts + # §9.7.4.3 of the 1.7 reference ("Glyph Metrics in CIDFonts") # Widths for a CIDFont are defined using the DW and W entries. # DW2 and W2 are for vertical use. Vertical type is not implemented. 
ft1 = ft["/DescendantFonts"][0].get_object() # type: ignore From 20a6883394f6807b0c6d22063c707d6e45e6ed50 Mon Sep 17 00:00:00 2001 From: Ryo Kamei <44630192+ssjkamei@users.noreply.github.com> Date: Tue, 1 Oct 2024 23:07:38 +0900 Subject: [PATCH 52/59] Update pypdf/_text_extraction/__init__.py Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com> --- pypdf/_text_extraction/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/_text_extraction/__init__.py b/pypdf/_text_extraction/__init__.py index 9eca070e0..89cdb0f2a 100644 --- a/pypdf/_text_extraction/__init__.py +++ b/pypdf/_text_extraction/__init__.py @@ -114,7 +114,7 @@ def crlf_space_check( orientation = orient(m) delta_x = m[4] - m_prev[4] delta_y = m[5] - m_prev[5] - # PDF ref 1.7 Table 108 Text positioning operators + # Table 108 of the 1.7 reference ("Text positioning operators") scale_x = math.sqrt(tm_prev[0]**2 + tm_prev[1]**2) scale_y = math.sqrt(tm_prev[2]**2 + tm_prev[3]**2) cm_prev = m From e6132faf1ae93f9156e1a63ec9f7fd94832dd796 Mon Sep 17 00:00:00 2001 From: Ryo Kamei Date: Wed, 2 Oct 2024 13:25:15 +0900 Subject: [PATCH 53/59] Exception code omitted --- pypdf/_cmap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index 0aaa27dc0..ed8ebb462 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -489,7 +489,7 @@ def build_font_width_map( else: width = float(width_obj) font_width_map[chr(c_code)] = width - except IndexError: + except (IndexError, KeyError): # The PDF structure is invalid. The array is too small # for the specified font width. 
pass From 9a82eb8f7517dd63ff116e8a1280ae5b46b8f78b Mon Sep 17 00:00:00 2001 From: ryo kamei Date: Wed, 2 Oct 2024 20:42:34 +0900 Subject: [PATCH 54/59] Style: Correcting code style issues --- pypdf/_cmap.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index ed8ebb462..7d24ab82d 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -421,15 +421,12 @@ def build_font_width_map( # Widths for a CIDFont are defined using the DW and W entries. # DW2 and W2 are for vertical use. Vertical type is not implemented. ft1 = ft["/DescendantFonts"][0].get_object() # type: ignore - w: List[Union[int, PdfObject]] = [] try: font_width_map["default"] = cast(float, ft1["/DW"]) except Exception: font_width_map["default"] = default_font_width if "/W" in ft1: - w = list(ft1["/W"]) - else: - w = [] + w = ft1["/W"].get_object() while len(w) > 0: st = w[0] if isinstance(w[0], int) else w[0].get_object() second = w[1].get_object() @@ -461,10 +458,7 @@ def build_font_width_map( ) break elif "/Widths" in ft: - try: - w = cast(list, ft["/Widths"].get_object()) - except Exception: - w = [] + w = ft["/Widths"].get_object() if "/FontDescriptor" in ft and "/MissingWidth" in cast( DictionaryObject, ft["/FontDescriptor"] ): From d4f183534f39867806a4a0059e16432117bbf558 Mon Sep 17 00:00:00 2001 From: ryo kamei Date: Wed, 2 Oct 2024 20:45:13 +0900 Subject: [PATCH 55/59] Style: Correcting code style issues --- pypdf/_cmap.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index 7d24ab82d..bf119b268 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -7,7 +7,6 @@ from .generic import ( DecodedStreamObject, DictionaryObject, - PdfObject, StreamObject, is_null_or_none, ) From 96fcf7c995c3508dc47eee81650e0e2e1c4b7423 Mon Sep 17 00:00:00 2001 From: ryo kamei Date: Wed, 2 Oct 2024 21:22:39 +0900 Subject: [PATCH 56/59] fix self-made bugs --- pypdf/_cmap.py | 2 ++ 1 file changed, 2 insertions(+) diff --git 
a/pypdf/_cmap.py b/pypdf/_cmap.py index bf119b268..5731e8eb1 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -426,6 +426,8 @@ def build_font_width_map( font_width_map["default"] = default_font_width if "/W" in ft1: w = ft1["/W"].get_object() + else: + w = [] while len(w) > 0: st = w[0] if isinstance(w[0], int) else w[0].get_object() second = w[1].get_object() From 780a6321c2250da69737ae16a95d23a5c74fb3a4 Mon Sep 17 00:00:00 2001 From: ryo kamei Date: Wed, 2 Oct 2024 22:47:32 +0900 Subject: [PATCH 57/59] fix self-made bugs --- pypdf/_cmap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index 5731e8eb1..bf990a344 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -437,7 +437,7 @@ def build_font_width_map( for c_code in range(st, en + 1): try: conversion_char = map_dict[chr(c_code)] - font_width_map[ord(conversion_char)] = w[2] + font_width_map[conversion_char] = w[2] except KeyError: pass w = w[3:] From ce11d0de01cbdd19e89e4d0ede3c19d36c83573b Mon Sep 17 00:00:00 2001 From: ryo kamei Date: Wed, 2 Oct 2024 23:07:29 +0900 Subject: [PATCH 58/59] Insufficient height consideration for front and rear fonts --- pypdf/_text_extraction/__init__.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pypdf/_text_extraction/__init__.py b/pypdf/_text_extraction/__init__.py index 89cdb0f2a..a1c0d1d91 100644 --- a/pypdf/_text_extraction/__init__.py +++ b/pypdf/_text_extraction/__init__.py @@ -115,8 +115,9 @@ def crlf_space_check( delta_x = m[4] - m_prev[4] delta_y = m[5] - m_prev[5] # Table 108 of the 1.7 reference ("Text positioning operators") - scale_x = math.sqrt(tm_prev[0]**2 + tm_prev[1]**2) - scale_y = math.sqrt(tm_prev[2]**2 + tm_prev[3]**2) + scale_prev_x = math.sqrt(tm_prev[0]**2 + tm_prev[1]**2) + scale_prev_y = math.sqrt(tm_prev[2]**2 + tm_prev[3]**2) + scale_y = math.sqrt(tm_matrix[2]**2 + tm_matrix[3]**2) cm_prev = m if orientation not in orientations: @@ -128,7 +129,7 @@ def 
crlf_space_check( moved_height = delta_x moved_width = delta_y try: - if abs(moved_height) > 0.8 * str_height * scale_y: + if abs(moved_height) > 0.8 * min(str_height * scale_prev_y, font_size * scale_y): if (output + text)[-1] != "\n": output += text + "\n" if visitor_text is not None: @@ -141,7 +142,7 @@ def crlf_space_check( ) text = "" elif ( - (moved_width >= (spacewidth + str_widths) * scale_x) + (moved_width >= (spacewidth + str_widths) * scale_prev_x) and (output + text)[-1] != " " ): text += " " From 03eb1cb444097fc532c9fc4fd243fc221dcc3cc1 Mon Sep 17 00:00:00 2001 From: ryo kamei Date: Wed, 2 Oct 2024 23:30:17 +0900 Subject: [PATCH 59/59] style: Correcting code style issues --- pypdf/_cmap.py | 6 +----- pypdf/_page.py | 5 +---- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index bf990a344..fde795b01 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -478,11 +478,7 @@ def build_font_width_map( en = cast(int, ft["/LastChar"]) for c_code in range(st, en + 1): try: - width_obj = w[c_code - st].get_object() - if is_null_or_none(width_obj): - width = 0.0 - else: - width = float(width_obj) + width = w[c_code - st].get_object() font_width_map[chr(c_code)] = width except (IndexError, KeyError): # The PDF structure is invalid. The array is too small diff --git a/pypdf/_page.py b/pypdf/_page.py index 6349cc62f..c49a68c33 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -1733,10 +1733,7 @@ def _get_acutual_font_widths( font_width_map: Dict[Any, float] = self._font_width_maps[font_name] if add_text: for char in add_text: - if font_width_map: - font_widths += compute_font_width(font_width_map, char) - else: - font_widths += default_space_width * 2 + font_widths += compute_font_width(font_width_map, char) return (font_widths * font_size, default_space_width * font_size, font_size) def _extract_text(