From 5400f5ada933b7b4f54ba9eb7c9db53729ef551c Mon Sep 17 00:00:00 2001
From: Ryo Kamei <r-kamei@sixsquare.co.jp>
Date: Tue, 24 Sep 2024 13:07:59 +0900
Subject: [PATCH 01/59] BUG: Missing spaces in extract_text() method (#1328)

---
 pypdf/_page.py                |  2 +-
 tests/test_text_extraction.py | 17 +++++++++++++++++
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/pypdf/_page.py b/pypdf/_page.py
index e4ec053c8..8e9dbc21e 100644
--- a/pypdf/_page.py
+++ b/pypdf/_page.py
@@ -1989,7 +1989,7 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
                     if isinstance(op, (str, bytes)):
                         process_operation(b"Tj", [op])
                     if isinstance(op, (int, float, NumberObject, FloatObject)) and (
-                        (abs(float(op)) >= _space_width)
+                        (math.ceil(abs(float(op))) >= _space_width)
                         and (len(text) > 0)
                         and (text[-1] != " ")
                     ):
diff --git a/tests/test_text_extraction.py b/tests/test_text_extraction.py
index 2f0eaad1d..faef6d980 100644
--- a/tests/test_text_extraction.py
+++ b/tests/test_text_extraction.py
@@ -189,3 +189,20 @@ def test_layout_mode_warnings(mock_logger_warning):
     mock_logger_warning.assert_called_with(
         "Argument visitor_text is ignored in layout mode", "pypdf._page"
     )
+
+
+@pytest.mark.enable_socket()
+def test_space_with_one_unit_smaller_than_font_width():
+    """Tests for #1328"""
+    url = "https://github.com/py-pdf/PyPDF2/files/9498481/0004.pdf"
+    name = "iss1328.pdf"
+    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
+    page = reader.pages[0]
+    extracted = page.extract_text().split("Description:")[1].split("8/11/22")[0].strip()
+    assert """Reporting crude oil leak.
+Leak was isolated to well
+pad. Segment of line was
+immediately isolated, now
+estimated at 5 barrels of oil
+spilt. Root cause still
+unknown at this time.""" == extracted

From aac04364611818571fc24a53f36e325849f0371a Mon Sep 17 00:00:00 2001
From: Ryo Kamei <r-kamei@sixsquare.co.jp>
Date: Tue, 24 Sep 2024 13:42:47 +0900
Subject: [PATCH 02/59] Revert "BUG: Missing spaces in extract_text() method
 (#1328)"

This reverts commit 5400f5ada933b7b4f54ba9eb7c9db53729ef551c.
---
 pypdf/_page.py                |  2 +-
 tests/test_text_extraction.py | 17 -----------------
 2 files changed, 1 insertion(+), 18 deletions(-)

diff --git a/pypdf/_page.py b/pypdf/_page.py
index 8e9dbc21e..e4ec053c8 100644
--- a/pypdf/_page.py
+++ b/pypdf/_page.py
@@ -1989,7 +1989,7 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
                     if isinstance(op, (str, bytes)):
                         process_operation(b"Tj", [op])
                     if isinstance(op, (int, float, NumberObject, FloatObject)) and (
-                        (math.ceil(abs(float(op))) >= _space_width)
+                        (abs(float(op)) >= _space_width)
                         and (len(text) > 0)
                         and (text[-1] != " ")
                     ):
diff --git a/tests/test_text_extraction.py b/tests/test_text_extraction.py
index faef6d980..2f0eaad1d 100644
--- a/tests/test_text_extraction.py
+++ b/tests/test_text_extraction.py
@@ -189,20 +189,3 @@ def test_layout_mode_warnings(mock_logger_warning):
     mock_logger_warning.assert_called_with(
         "Argument visitor_text is ignored in layout mode", "pypdf._page"
     )
-
-
-@pytest.mark.enable_socket()
-def test_space_with_one_unit_smaller_than_font_width():
-    """Tests for #1328"""
-    url = "https://github.com/py-pdf/PyPDF2/files/9498481/0004.pdf"
-    name = "iss1328.pdf"
-    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
-    page = reader.pages[0]
-    extracted = page.extract_text().split("Description:")[1].split("8/11/22")[0].strip()
-    assert """Reporting crude oil leak.
-Leak was isolated to well
-pad. Segment of line was
-immediately isolated, now
-estimated at 5 barrels of oil
-spilt. Root cause still
-unknown at this time.""" == extracted

From 64b1c92abec2d72d90086c0a074c3712ff86249d Mon Sep 17 00:00:00 2001
From: Ryo Kamei <r-kamei@sixsquare.co.jp>
Date: Tue, 24 Sep 2024 13:44:42 +0900
Subject: [PATCH 03/59] BUG: Missing spaces in extract_text() method (#1328)

---
 pypdf/_page.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pypdf/_page.py b/pypdf/_page.py
index e4ec053c8..8e9dbc21e 100644
--- a/pypdf/_page.py
+++ b/pypdf/_page.py
@@ -1989,7 +1989,7 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
                     if isinstance(op, (str, bytes)):
                         process_operation(b"Tj", [op])
                     if isinstance(op, (int, float, NumberObject, FloatObject)) and (
-                        (abs(float(op)) >= _space_width)
+                        (math.ceil(abs(float(op))) >= _space_width)
                         and (len(text) > 0)
                         and (text[-1] != " ")
                     ):

From 70e9b386a409b7dbe559fc1db4759eb866746d82 Mon Sep 17 00:00:00 2001
From: Ryo Kamei <r-kamei@sixsquare.co.jp>
Date: Tue, 24 Sep 2024 13:48:01 +0900
Subject: [PATCH 04/59] BUG: Missing spaces in extract_text() method (#1328)
 add test

---
 tests/test_text_extraction.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/tests/test_text_extraction.py b/tests/test_text_extraction.py
index 2f0eaad1d..93082349a 100644
--- a/tests/test_text_extraction.py
+++ b/tests/test_text_extraction.py
@@ -189,3 +189,20 @@ def test_layout_mode_warnings(mock_logger_warning):
     mock_logger_warning.assert_called_with(
         "Argument visitor_text is ignored in layout mode", "pypdf._page"
     )
+
+
+@pytest.mark.enable_socket()
+def test_space_with_one_unit_smaller_than_font_width():
+    """Tests for #1328"""
+    url = "https://github.com/py-pdf/PyPDF2/files/9498481/0004.pdf"
+    name = "iss1328.pdf"
+    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
+    page = reader.pages[0]
+    extracted = page.extract_text().split("Description:")[1].split("8/11/22")[0].strip()
+    assert extracted == """Reporting crude oil leak.
+Leak was isolated to well
+pad. Segment of line was
+immediately isolated, now
+estimated at 5 barrels of oil
+spilt. Root cause still
+unknown at this time."""

From 65224e1f2dc85da56beedddd25179f22ccacf6be Mon Sep 17 00:00:00 2001
From: Ryo Kamei <r-kamei@sixsquare.co.jp>
Date: Tue, 24 Sep 2024 13:42:47 +0900
Subject: [PATCH 05/59] Revert "BUG: Missing spaces in extract_text() method
 (#1328)"

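This reverts commit 5400f5ada933b7b4f54ba9eb7c9db53729ef551c.

BUG: Missing spaces in extract_text() method (#1328)

BUG: Missing spaces in extract_text() method (#1328) add test

Note: the net diff of this patch touches tests/test_text_extraction.py only.
It rewrites the assertion so that the extracted text is the left-hand operand
of the comparison (`assert extracted == """..."""`); pypdf/_page.py is unchanged.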
---
 tests/test_text_extraction.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_text_extraction.py b/tests/test_text_extraction.py
index faef6d980..93082349a 100644
--- a/tests/test_text_extraction.py
+++ b/tests/test_text_extraction.py
@@ -199,10 +199,10 @@ def test_space_with_one_unit_smaller_than_font_width():
     reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
     page = reader.pages[0]
     extracted = page.extract_text().split("Description:")[1].split("8/11/22")[0].strip()
-    assert """Reporting crude oil leak.
+    assert extracted == """Reporting crude oil leak.
 Leak was isolated to well
 pad. Segment of line was
 immediately isolated, now
 estimated at 5 barrels of oil
 spilt. Root cause still
-unknown at this time.""" == extracted
+unknown at this time."""

From f6dcb439a3e667802558440d79e6b388307a6fed Mon Sep 17 00:00:00 2001
From: Ryo Kamei <r-kamei@sixsquare.co.jp>
Date: Tue, 24 Sep 2024 14:26:45 +0900
Subject: [PATCH 06/59] BUG: Missing spaces in extract_text() method (#1328)
 Convert font size comparison to ratio

---
 pypdf/_page.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pypdf/_page.py b/pypdf/_page.py
index 8e9dbc21e..ff4ef30c4 100644
--- a/pypdf/_page.py
+++ b/pypdf/_page.py
@@ -1988,8 +1988,9 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
                 for op in operands[0]:
                     if isinstance(op, (str, bytes)):
                         process_operation(b"Tj", [op])
+                    # The space width may be smaller than the font width, so the width should be 95%.
                     if isinstance(op, (int, float, NumberObject, FloatObject)) and (
-                        (math.ceil(abs(float(op))) >= _space_width)
+                        (abs(float(op) / 0.95) >= _space_width)
                         and (len(text) > 0)
                         and (text[-1] != " ")
                     ):

From fd1c48930683bafe4f656b1fe742827a29d4feae Mon Sep 17 00:00:00 2001
From: Ryo Kamei <44630192+ssjkamei@users.noreply.github.com>
Date: Tue, 24 Sep 2024 18:39:31 +0900
Subject: [PATCH 07/59] Correct the test file URL (py-pdf/PyPDF2 -> py-pdf/pypdf)

Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com>
---
 tests/test_text_extraction.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_text_extraction.py b/tests/test_text_extraction.py
index 93082349a..08c4fc4f1 100644
--- a/tests/test_text_extraction.py
+++ b/tests/test_text_extraction.py
@@ -194,7 +194,7 @@ def test_layout_mode_warnings(mock_logger_warning):
 @pytest.mark.enable_socket()
 def test_space_with_one_unit_smaller_than_font_width():
     """Tests for #1328"""
-    url = "https://github.com/py-pdf/PyPDF2/files/9498481/0004.pdf"
+    url = "https://github.com/py-pdf/pypdf/files/9498481/0004.pdf"
     name = "iss1328.pdf"
     reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
     page = reader.pages[0]

From 2873b9eb9130a6529cc4e7003e37c3655bc3a7b6 Mon Sep 17 00:00:00 2001
From: Ryo Kamei <r-kamei@sixsquare.co.jp>
Date: Tue, 24 Sep 2024 18:55:45 +0900
Subject: [PATCH 08/59] BUG: Missing spaces in extract_text() method
 (py-pdf#1328) calculation efficiency

---
 pypdf/_page.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/pypdf/_page.py b/pypdf/_page.py
index ff4ef30c4..87b914ce2 100644
--- a/pypdf/_page.py
+++ b/pypdf/_page.py
@@ -1985,12 +1985,13 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
                 process_operation(b"TL", [-operands[1]])
                 process_operation(b"Td", operands)
             elif operator == b"TJ":
+                # The space width may be smaller than the font width, so the width should be 95%.
+                _confirm_space_width = _space_width * 0.95
                 for op in operands[0]:
                     if isinstance(op, (str, bytes)):
                         process_operation(b"Tj", [op])
-                    # The space width may be smaller than the font width, so the width should be 95%.
                     if isinstance(op, (int, float, NumberObject, FloatObject)) and (
-                        (abs(float(op) / 0.95) >= _space_width)
+                        (abs(float(op)) >= _confirm_space_width)
                         and (len(text) > 0)
                         and (text[-1] != " ")
                     ):

From 7597704eaa466934690853b370c850e2f6253e09 Mon Sep 17 00:00:00 2001
From: ryo kamei <r-kamei@sixsquare.co.jp>
Date: Tue, 24 Sep 2024 21:14:02 +0900
Subject: [PATCH 09/59] BUG: Missing spaces in extract_text() method
 (py-pdf#1328) Simplify the assertion process

---
 tests/test_text_extraction.py | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/tests/test_text_extraction.py b/tests/test_text_extraction.py
index 08c4fc4f1..8bfa1809e 100644
--- a/tests/test_text_extraction.py
+++ b/tests/test_text_extraction.py
@@ -198,11 +198,5 @@ def test_space_with_one_unit_smaller_than_font_width():
     name = "iss1328.pdf"
     reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
     page = reader.pages[0]
-    extracted = page.extract_text().split("Description:")[1].split("8/11/22")[0].strip()
-    assert extracted == """Reporting crude oil leak.
-Leak was isolated to well
-pad. Segment of line was
-immediately isolated, now
-estimated at 5 barrels of oil
-spilt. Root cause still
-unknown at this time."""
+    extracted = page.extract_text()
+    assert "Reporting crude oil leak.\n" in extracted

From fb4de4105aa3d508e1d049aa2dce5b3b1a76461f Mon Sep 17 00:00:00 2001
From: Ryo Kamei <r-kamei@sixsquare.co.jp>
Date: Sat, 28 Sep 2024 19:00:47 +0900
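Subject: [PATCH 10/59] BUG: Issue in text extraction (spaces) (#1153)

* pypdf/_cmap.py: add compute_font_width(), which looks up the width of a
  character code in the font's /Widths array and falls back to
  /FontDescriptor /MissingWidth or to the caller-supplied default width.
* pypdf/_text_extraction: handle_tj() now also returns the text it appended;
  crlf_space_check() takes a new font_width argument, folds the four
  per-orientation branches into a single one, and inserts a space when the
  displacement exceeds (spacewidth + font_width) * f instead of
  spacewidth * f * 15.
* pypdf/_page.py: sum the widths of the text added by the last Tj and pass
  the total to crlf_space_check().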
---
 pypdf/_cmap.py                     |  27 ++++++++
 pypdf/_page.py                     |  22 +++++-
 pypdf/_text_extraction/__init__.py | 108 +++++++++--------------------
 3 files changed, 77 insertions(+), 80 deletions(-)

diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py
index 4cc112552..e4ad87a0f 100644
--- a/pypdf/_cmap.py
+++ b/pypdf/_cmap.py
@@ -474,6 +474,33 @@ def compute_space_width(
     return sp_width
 
 
+def compute_font_width(
+    ft: DictionaryObject, char_code: int, font_width: float
+) -> float:
+    if "/Widths" not in ft:
+        return font_width
+
+    w = list(ft["/Widths"])
+    try:
+        st = cast(int, ft["/FirstChar"])
+        en: int = cast(int, ft["/LastChar"])
+        if st > char_code or en < char_code:
+            raise Exception("Not in range")
+        if w[char_code - st].get_object() == 0:
+            raise Exception("null width")
+        char_width = w[char_code - st].get_object()
+    except Exception:
+        if "/FontDescriptor" in ft and "/MissingWidth" in cast(
+            DictionaryObject, ft["/FontDescriptor"]
+        ):
+            char_width = ft["/FontDescriptor"]["/MissingWidth"].get_object()  # type: ignore
+        else:
+            return font_width
+    if is_null_or_none(char_width):
+        char_width = None
+    return char_width
+
+
 def type1_alternative(
     ft: DictionaryObject,
     map_dict: Dict[Any, Any],
diff --git a/pypdf/_page.py b/pypdf/_page.py
index 87b914ce2..e3cc74031 100644
--- a/pypdf/_page.py
+++ b/pypdf/_page.py
@@ -49,7 +49,7 @@
     overload,
 )
 
-from ._cmap import build_char_map, unknown_char_map
+from ._cmap import build_char_map, unknown_char_map, compute_font_width
 from ._protocols import PdfCommonDocProtocol
 from ._text_extraction import (
     OrientationNotFoundError,
@@ -1793,18 +1793,23 @@ def _extract_text(
         char_scale = 1.0
         space_scale = 1.0
         _space_width: float = 500.0  # will be set correctly at first Tf
+        _font_widths: float = 0.0
         TL = 0.0
         font_size = 12.0  # init just in case of
 
         def current_spacewidth() -> float:
             return _space_width / 1000.0
 
+        def current_fontwidths() -> float:
+            return _font_widths / 1000.0
+
         def process_operation(operator: bytes, operands: List[Any]) -> None:
             nonlocal cm_matrix, cm_stack, tm_matrix, cm_prev, tm_prev, memo_cm, memo_tm
             nonlocal char_scale, space_scale, _space_width, TL, font_size, cmap
-            nonlocal orientations, rtl_dir, visitor_text, output, text
+            nonlocal orientations, rtl_dir, visitor_text, output, text, _font_widths
             global CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS
 
+            add_text: str = ""
             check_crlf_space: bool = False
             # Table 5.4 page 405
             if operator == b"BT":
@@ -1935,7 +1940,7 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
 
             elif operator == b"Tj":
                 check_crlf_space = True
-                text, rtl_dir = handle_tj(
+                text, rtl_dir, add_text = handle_tj(
                     text,
                     operands,
                     cm_matrix,
@@ -1947,6 +1952,16 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
                     rtl_dir,
                     visitor_text,
                 )
+                _font_widths = 0
+                if add_text:
+                    for char in add_text:
+                        font_code = ord(char)
+                        if cmap[3]:
+                            font_width = compute_font_width(cmap[3], font_code, _space_width)
+                            if font_width:
+                                _font_widths = _font_widths + font_width
+                        else:
+                            _font_widths = current_spacewidth()
             else:
                 return None
             if check_crlf_space:
@@ -1962,6 +1977,7 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
                         font_size,
                         visitor_text,
                         current_spacewidth(),
+                        current_fontwidths()
                     )
                     if text == "":
                         memo_cm = cm_matrix.copy()
diff --git a/pypdf/_text_extraction/__init__.py b/pypdf/_text_extraction/__init__.py
index 3b1d687ea..75a0848ad 100644
--- a/pypdf/_text_extraction/__init__.py
+++ b/pypdf/_text_extraction/__init__.py
@@ -99,6 +99,7 @@ def crlf_space_check(
     font_size: float,
     visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
     spacewidth: float,
+    font_width: float
 ) -> Tuple[str, str, List[float], List[float]]:
     cm_prev = cmtm_prev[0]
     tm_prev = cmtm_prev[1]
@@ -115,85 +116,34 @@ def crlf_space_check(
     k = math.sqrt(abs(m[0] * m[3]) + abs(m[1] * m[2]))
     f = font_size * k
     cm_prev = m
+
     if orientation not in orientations:
         raise OrientationNotFoundError
+    if orientation in (0, 180):
+        moved_height: float = delta_y
+        moved_width: float = delta_x
+    elif orientation in (90, 270):
+        moved_height: float = delta_x
+        moved_width: float = delta_y
     try:
-        if orientation == 0:
-            if delta_y < -0.8 * f:
-                if (output + text)[-1] != "\n":
-                    output += text + "\n"
-                    if visitor_text is not None:
-                        visitor_text(
-                            text + "\n",
-                            memo_cm,
-                            memo_tm,
-                            cmap[3],
-                            font_size,
-                        )
-                    text = ""
-            elif (
-                abs(delta_y) < f * 0.3
-                and abs(delta_x) > spacewidth * f * 15
-                and (output + text)[-1] != " "
-            ):
-                text += " "
-        elif orientation == 180:
-            if delta_y > 0.8 * f:
-                if (output + text)[-1] != "\n":
-                    output += text + "\n"
-                    if visitor_text is not None:
-                        visitor_text(
-                            text + "\n",
-                            memo_cm,
-                            memo_tm,
-                            cmap[3],
-                            font_size,
-                        )
-                    text = ""
-            elif (
-                abs(delta_y) < f * 0.3
-                and abs(delta_x) > spacewidth * f * 15
-                and (output + text)[-1] != " "
-            ):
-                text += " "
-        elif orientation == 90:
-            if delta_x > 0.8 * f:
-                if (output + text)[-1] != "\n":
-                    output += text + "\n"
-                    if visitor_text is not None:
-                        visitor_text(
-                            text + "\n",
-                            memo_cm,
-                            memo_tm,
-                            cmap[3],
-                            font_size,
-                        )
-                    text = ""
-            elif (
-                abs(delta_x) < f * 0.3
-                and abs(delta_y) > spacewidth * f * 15
-                and (output + text)[-1] != " "
-            ):
-                text += " "
-        elif orientation == 270:
-            if delta_x < -0.8 * f:
-                if (output + text)[-1] != "\n":
-                    output += text + "\n"
-                    if visitor_text is not None:
-                        visitor_text(
-                            text + "\n",
-                            memo_cm,
-                            memo_tm,
-                            cmap[3],
-                            font_size,
-                        )
+        if abs(moved_height) > 0.8 * f:
+            if (output + text)[-1] != "\n":
+                output += text + "\n"
+                if visitor_text is not None:
+                    visitor_text(
+                        text + "\n",
+                        memo_cm,
+                        memo_tm,
+                        cmap[3],
+                        font_size,
+                    )
                     text = ""
-            elif (
-                abs(delta_x) < f * 0.3
-                and abs(delta_y) > spacewidth * f * 15
-                and (output + text)[-1] != " "
-            ):
-                text += " "
+        elif (
+            abs(moved_height) < f * 0.3
+            and abs(moved_width) > (spacewidth + font_width) * f
+            and (output + text)[-1] != " "
+        ):
+            text += " "
     except Exception:
         pass
     tm_prev = tm_matrix.copy()
@@ -214,12 +164,14 @@ def handle_tj(
     font_size: float,
     rtl_dir: bool,
     visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
-) -> Tuple[str, bool]:
+) -> Tuple[str, bool, str]:
+    add_text = ""
     m = mult(tm_matrix, cm_matrix)
     orientation = orient(m)
     if orientation in orientations and len(operands) > 0:
         if isinstance(operands[0], str):
             text += operands[0]
+            add_text = operands[0]
         else:
             t: str = ""
             tt: bytes = (
@@ -272,6 +224,7 @@ def handle_tj(
                             visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
                         text = ""
                     text = x + text
+                    add_text = x + add_text
                 else:  # left-to-right
                     # print(">",xx,x,end="")
                     if rtl_dir:
@@ -281,5 +234,6 @@ def handle_tj(
                             visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
                         text = ""
                     text = text + x
+                    add_text += x
                 # fmt: on
-    return text, rtl_dir
+    return text, rtl_dir, add_text

From 373eaec1cd013c43db3407372fa40743e5147839 Mon Sep 17 00:00:00 2001
From: Ryo Kamei <r-kamei@sixsquare.co.jp>
Date: Sat, 28 Sep 2024 19:06:09 +0900
Subject: [PATCH 11/59] BUG: Issue in text extraction (spaces) (#1153) add test

---
 tests/test_text_extraction.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/tests/test_text_extraction.py b/tests/test_text_extraction.py
index 8bfa1809e..ff318f9fe 100644
--- a/tests/test_text_extraction.py
+++ b/tests/test_text_extraction.py
@@ -200,3 +200,14 @@ def test_space_with_one_unit_smaller_than_font_width():
     page = reader.pages[0]
     extracted = page.extract_text()
     assert "Reporting crude oil leak.\n" in extracted
+
+
+@pytest.mark.enable_socket()
+def test_space_position_calculation():
+    """Tests for #1153"""
+    url = "https://github.com/py-pdf/pypdf/files/9164743/file-0.pdf"
+    name = "iss1153.pdf"
+    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
+    page = reader.pages[3]
+    extracted = page.extract_text()
+    assert "Shortly after the Geneva BOF session, the" in extracted

From 066f594c23f754f25e65b1e09fd40f7c9315dac1 Mon Sep 17 00:00:00 2001
From: Ryo Kamei <r-kamei@sixsquare.co.jp>
Date: Sat, 28 Sep 2024 20:01:26 +0900
Subject: [PATCH 12/59] style: Correcting code style issues

---
 pypdf/_page.py | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/pypdf/_page.py b/pypdf/_page.py
index e3cc74031..4b74790e5 100644
--- a/pypdf/_page.py
+++ b/pypdf/_page.py
@@ -49,7 +49,7 @@
     overload,
 )
 
-from ._cmap import build_char_map, unknown_char_map, compute_font_width
+from ._cmap import build_char_map, compute_font_width, unknown_char_map
 from ._protocols import PdfCommonDocProtocol
 from ._text_extraction import (
     OrientationNotFoundError,
@@ -1716,6 +1716,19 @@ def _debug_for_extract(self) -> str:  # pragma: no cover
             out += "No Font\n"
         return out
 
+    def _get_font_widths(self, add_text: str, cmap: Tuple, default_width: float) -> float:
+        font_widths: float = 0
+        if add_text:
+            for char in add_text:
+                font_code = ord(char)
+                if cmap[3]:
+                    font_width = compute_font_width(cmap[3], font_code, default_width)
+                    if font_width:
+                        font_widths = font_widths + font_width
+                else:
+                    font_widths = default_width
+        return font_widths
+
     def _extract_text(
         self,
         obj: Any,
@@ -1952,16 +1965,7 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
                     rtl_dir,
                     visitor_text,
                 )
-                _font_widths = 0
-                if add_text:
-                    for char in add_text:
-                        font_code = ord(char)
-                        if cmap[3]:
-                            font_width = compute_font_width(cmap[3], font_code, _space_width)
-                            if font_width:
-                                _font_widths = _font_widths + font_width
-                        else:
-                            _font_widths = current_spacewidth()
+                _font_widths = self._get_font_widths(add_text, cmap, _space_width)
             else:
                 return None
             if check_crlf_space:

From d406e23e005b2dd0c48f7059d9fd55ef159ab6b9 Mon Sep 17 00:00:00 2001
From: Ryo Kamei <r-kamei@sixsquare.co.jp>
Date: Sat, 28 Sep 2024 21:18:43 +0900
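Subject: [PATCH 13/59] Text position return support

crlf_space_check() now compares the signed moved_width, not abs(moved_width),
against (spacewidth + font_width) * f, so a negative displacement of the text
position no longer inserts a space.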
---
 pypdf/_text_extraction/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pypdf/_text_extraction/__init__.py b/pypdf/_text_extraction/__init__.py
index 75a0848ad..56137dbe1 100644
--- a/pypdf/_text_extraction/__init__.py
+++ b/pypdf/_text_extraction/__init__.py
@@ -140,7 +140,7 @@ def crlf_space_check(
                     text = ""
         elif (
             abs(moved_height) < f * 0.3
-            and abs(moved_width) > (spacewidth + font_width) * f
+            and moved_width > (spacewidth + font_width) * f
             and (output + text)[-1] != " "
         ):
             text += " "

From d338e18c743b629bc6ffd720807ec2c60d556fe5 Mon Sep 17 00:00:00 2001
From: Ryo Kamei <r-kamei@sixsquare.co.jp>
Date: Sun, 29 Sep 2024 00:07:09 +0900
Subject: [PATCH 14/59] Add code for CIDFont

---
 pypdf/_cmap.py | 77 ++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 59 insertions(+), 18 deletions(-)

diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py
index e4ad87a0f..cef105015 100644
--- a/pypdf/_cmap.py
+++ b/pypdf/_cmap.py
@@ -477,25 +477,66 @@ def compute_space_width(
 def compute_font_width(
     ft: DictionaryObject, char_code: int, font_width: float
 ) -> float:
-    if "/Widths" not in ft:
-        return font_width
-
-    w = list(ft["/Widths"])
-    try:
-        st = cast(int, ft["/FirstChar"])
-        en: int = cast(int, ft["/LastChar"])
-        if st > char_code or en < char_code:
-            raise Exception("Not in range")
-        if w[char_code - st].get_object() == 0:
-            raise Exception("null width")
-        char_width = w[char_code - st].get_object()
-    except Exception:
-        if "/FontDescriptor" in ft and "/MissingWidth" in cast(
-            DictionaryObject, ft["/FontDescriptor"]
-        ):
-            char_width = ft["/FontDescriptor"]["/MissingWidth"].get_object()  # type: ignore
+    char_width: float = font_width * 2.0  # default value
+    w = []
+    char_code_width = {}
+    st: int = 0
+    # p271 PDF32000_2008 9.7.4.3 Glyph Metrics in CIDFonts
+    # Widths for a CIDFont are defined using the DW and W entries.
+    # DW2 and W2 are for vertical use. Vertical type is not implemented.
+    if "/DescendantFonts" in ft:  # ft["/Subtype"].startswith("/CIDFontType"):
+        ft1 = ft["/DescendantFonts"][0].get_object()  # type: ignore
+        try:
+            char_code_width["default"] = cast(float, ft1["/DW"])
+        except Exception:
+            char_code_width["default"] = 1000.0  # Default font width is 0.1
+        if "/W" in ft1:
+            # Starting C [W1 W2 ... Wn]
+            # C_first - C_last same W
+            w = list(ft1["/W"])
         else:
-            return font_width
+            w = []
+        while len(w) > 0:
+            st = w[0] if isinstance(w[0], int) else w[0].get_object()
+            second = w[1].get_object()
+            if isinstance(second, int):
+                for x in range(st, second):
+                    char_code_width[x] = w[2]
+                w = w[3:]
+            elif isinstance(second, list):
+                for y in second:
+                    char_code_width[st] = y
+                    st += 1
+                w = w[2:]
+            else:
+                logger_warning(
+                    "unknown widths : \n" + (ft1["/W"]).__repr__(),
+                    __name__,
+                )
+                break
+        try:
+            char_width = char_code_width[char_code]
+        except Exception:
+            char_width = (
+                char_code_width["default"]
+            )
+    elif "/Widths" in ft:
+        w = list(ft["/Widths"])
+        try:
+            st = cast(int, ft["/FirstChar"])
+            en: int = cast(int, ft["/LastChar"])
+            if st > char_code or en < char_code:
+                raise Exception("Not in range")
+            if w[char_code - st].get_object() == 0:
+                raise Exception("null width")
+            char_width = w[char_code - st].get_object()
+        except Exception:
+            if "/FontDescriptor" in ft and "/MissingWidth" in cast(
+                DictionaryObject, ft["/FontDescriptor"]
+            ):
+                char_width = ft["/FontDescriptor"]["/MissingWidth"].get_object()  # type: ignore
+            else:
+                return font_width
     if is_null_or_none(char_width):
         char_width = None
     return char_width

From f7c4236e034c744d4b77c4bd26ad0d3a01bf4241 Mon Sep 17 00:00:00 2001
From: ryo kamei <r-kamei@sixsquare.co.jp>
Date: Sun, 29 Sep 2024 06:01:24 +0900
Subject: [PATCH 15/59] Added horizontal CIDFont calculation code

---
 pypdf/_cmap.py                     | 20 ++++++++++++++------
 pypdf/_page.py                     |  4 +++-
 pypdf/_text_extraction/__init__.py |  1 +
 3 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py
index cef105015..6cb3680ef 100644
--- a/pypdf/_cmap.py
+++ b/pypdf/_cmap.py
@@ -475,8 +475,10 @@ def compute_space_width(
 
 
 def compute_font_width(
-    ft: DictionaryObject, char_code: int, font_width: float
+    cmap: Tuple, char_code: int, font_width: float
 ) -> float:
+    ft: DictionaryObject = cmap[3]
+    char_code_map: dict = cmap[1]
     char_width: float = font_width * 2.0  # default value
     w = []
     char_code_width = {}
@@ -491,8 +493,6 @@ def compute_font_width(
         except Exception:
             char_code_width["default"] = 1000.0  # Default font width is 0.1
         if "/W" in ft1:
-            # Starting C [W1 W2 ... Wn]
-            # C_first - C_last same W
             w = list(ft1["/W"])
         else:
             w = []
@@ -500,12 +500,20 @@ def compute_font_width(
             st = w[0] if isinstance(w[0], int) else w[0].get_object()
             second = w[1].get_object()
             if isinstance(second, int):
-                for x in range(st, second):
-                    char_code_width[x] = w[2]
+                # C_first - C_last same W
+                for x in range(st, second + 1):
+                    try:
+                        char_code_width[ord(char_code_map[chr(x)])] = w[2]
+                    except Exception:
+                        char_code_width[x] = w[2]
                 w = w[3:]
             elif isinstance(second, list):
+                # Starting C [W1 W2 ... Wn]
                 for y in second:
-                    char_code_width[st] = y
+                    try:
+                        char_code_width[ord(char_code_map[chr(st)])] = y
+                    except Exception:
+                        char_code_width[st] = y
                     st += 1
                 w = w[2:]
             else:
diff --git a/pypdf/_page.py b/pypdf/_page.py
index 4b74790e5..19d17716c 100644
--- a/pypdf/_page.py
+++ b/pypdf/_page.py
@@ -1722,7 +1722,7 @@ def _get_font_widths(self, add_text: str, cmap: Tuple, default_width: float) ->
             for char in add_text:
                 font_code = ord(char)
                 if cmap[3]:
-                    font_width = compute_font_width(cmap[3], font_code, default_width)
+                    font_width = compute_font_width(cmap, font_code, default_width)
                     if font_width:
                         font_widths = font_widths + font_width
                 else:
@@ -1965,6 +1965,8 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
                     rtl_dir,
                     visitor_text,
                 )
+                if "San" in add_text:
+                    pass
                 _font_widths = self._get_font_widths(add_text, cmap, _space_width)
             else:
                 return None
diff --git a/pypdf/_text_extraction/__init__.py b/pypdf/_text_extraction/__init__.py
index 56137dbe1..a4d0c3c70 100644
--- a/pypdf/_text_extraction/__init__.py
+++ b/pypdf/_text_extraction/__init__.py
@@ -211,6 +211,7 @@ def handle_tj(
                     or xx in CUSTOM_RTL_SPECIAL_CHARS   # customized....
                 ):
                     text = x + text if rtl_dir else text + x
+                    add_text = x if rtl_dir else add_text + x
                 elif (  # right-to-left characters set
                     0x0590 <= xx <= 0x08FF
                     or 0xFB1D <= xx <= 0xFDFF

From a32fbc9aaffca39056d2630e2540912d59b1036b Mon Sep 17 00:00:00 2001
From: ryo kamei <r-kamei@sixsquare.co.jp>
Date: Sun, 29 Sep 2024 06:36:05 +0900
Subject: [PATCH 16/59] Style: Correcting code style issues

---
 pypdf/_cmap.py                     | 24 ++++++++++++++----------
 pypdf/_page.py                     |  9 ++++++++-
 pypdf/_text_extraction/__init__.py |  4 ++--
 3 files changed, 24 insertions(+), 13 deletions(-)

diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py
index 6cb3680ef..8b6aa544c 100644
--- a/pypdf/_cmap.py
+++ b/pypdf/_cmap.py
@@ -1,6 +1,6 @@
 from binascii import unhexlify
 from math import ceil
-from typing import Any, Dict, List, Tuple, Union, cast
+from typing import Any, Dict, List, Optional, Tuple, Union, cast
 
 from ._codecs import adobe_glyphs, charset_encoding
 from ._utils import logger_error, logger_warning
@@ -475,10 +475,14 @@ def compute_space_width(
 
 
 def compute_font_width(
-    cmap: Tuple, char_code: int, font_width: float
+    cmap: Tuple[
+        Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
+    ],
+    char_code: int,
+    font_width: float
 ) -> float:
-    ft: DictionaryObject = cmap[3]
-    char_code_map: dict = cmap[1]
+    ft = cmap[3]
+    char_code_map = cmap[1]
     char_width: float = font_width * 2.0  # default value
     w = []
     char_code_width = {}
@@ -503,17 +507,17 @@ def compute_font_width(
                 # C_first - C_last same W
                 for x in range(st, second + 1):
                     try:
-                        char_code_width[ord(char_code_map[chr(x)])] = w[2]
+                        char_code_width[str(ord(char_code_map[chr(x)]))] = w[2]
                     except Exception:
-                        char_code_width[x] = w[2]
+                        char_code_width[str(x)] = w[2]
                 w = w[3:]
             elif isinstance(second, list):
                 # Starting C [W1 W2 ... Wn]
                 for y in second:
                     try:
-                        char_code_width[ord(char_code_map[chr(st)])] = y
+                        char_code_width[str(ord(char_code_map[chr(st)]))] = y
                     except Exception:
-                        char_code_width[st] = y
+                        char_code_width[str(st)] = y
                     st += 1
                 w = w[2:]
             else:
@@ -523,7 +527,7 @@ def compute_font_width(
                 )
                 break
         try:
-            char_width = char_code_width[char_code]
+            char_width = char_code_width[str(char_code)]
         except Exception:
             char_width = (
                 char_code_width["default"]
@@ -546,7 +550,7 @@ def compute_font_width(
             else:
                 return font_width
     if is_null_or_none(char_width):
-        char_width = None
+        char_width = char_code_width["default"]
     return char_width
 
 
diff --git a/pypdf/_page.py b/pypdf/_page.py
index 19d17716c..53196a14f 100644
--- a/pypdf/_page.py
+++ b/pypdf/_page.py
@@ -1716,7 +1716,14 @@ def _debug_for_extract(self) -> str:  # pragma: no cover
             out += "No Font\n"
         return out
 
-    def _get_font_widths(self, add_text: str, cmap: Tuple, default_width: float) -> float:
+    def _get_font_widths(
+        self,
+        add_text: str,
+        cmap: Tuple[
+            Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
+        ],
+        default_width: float
+    ) -> float:
         font_widths: float = 0
         if add_text:
             for char in add_text:
diff --git a/pypdf/_text_extraction/__init__.py b/pypdf/_text_extraction/__init__.py
index a4d0c3c70..c29aa4c1d 100644
--- a/pypdf/_text_extraction/__init__.py
+++ b/pypdf/_text_extraction/__init__.py
@@ -123,8 +123,8 @@ def crlf_space_check(
         moved_height: float = delta_y
         moved_width: float = delta_x
     elif orientation in (90, 270):
-        moved_height: float = delta_x
-        moved_width: float = delta_y
+        moved_height = delta_x
+        moved_width = delta_y
     try:
         if abs(moved_height) > 0.8 * f:
             if (output + text)[-1] != "\n":

From a237f2dd81ae06d41be182733a0678c66dd298ab Mon Sep 17 00:00:00 2001
From: ryo kamei <r-kamei@sixsquare.co.jp>
Date: Sun, 29 Sep 2024 16:17:16 +0900
Subject: [PATCH 17/59] Integrate font width calculation and space width
 calculation

---
 pypdf/_cmap.py | 198 +++++++++++++++++++------------------------------
 pypdf/_page.py |  20 +++--
 2 files changed, 86 insertions(+), 132 deletions(-)

diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py
index 8b6aa544c..12bccd4b9 100644
--- a/pypdf/_cmap.py
+++ b/pypdf/_cmap.py
@@ -1,6 +1,6 @@
 from binascii import unhexlify
 from math import ceil
-from typing import Any, Dict, List, Optional, Tuple, Union, cast
+from typing import Any, Dict, List, Tuple, Union, cast
 
 from ._codecs import adobe_glyphs, charset_encoding
 from ._utils import logger_error, logger_warning
@@ -75,11 +75,6 @@ def build_char_map_from_dict(
         for x in int_entry:
             if x <= 255:
                 encoding[x] = chr(x)
-    try:
-        # override space_width with new params
-        space_width = _default_fonts_space_width[cast(str, ft["/BaseFont"])]
-    except Exception:
-        pass
     # I consider the space_code is available on one byte
     if isinstance(space_code, str):
         try:  # one byte
@@ -87,16 +82,17 @@ def build_char_map_from_dict(
         except Exception:
             sp = space_code.encode("utf-16-be")
             sp = sp[0] + 256 * sp[1]
+        sp = ord(map_dict[chr(sp)])
     else:
         sp = space_code
-    sp_width = compute_space_width(ft, sp, space_width)
+    sp_width, font_width_map = compute_space_width(ft, sp, map_dict)
 
     return (
         font_type,
-        float(sp_width / 2),
+        sp_width,
         encoding,
         # https://github.com/python/mypy/issues/4374
-        map_dict,
+        map_dict
     )
 
 
@@ -403,18 +399,25 @@ def parse_bfchar(line: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) ->
 
 
 def compute_space_width(
-    ft: DictionaryObject, space_code: int, space_width: float
-) -> float:
-    sp_width: float = space_width * 2.0  # default value
-    w = []
-    w1 = {}
+    ft: DictionaryObject, sp: int, map_dict: Dict[Any, Any]
+) -> Tuple[float, Dict[Any, float]]:
+    char_code_width = {}
     st: int = 0
+    en: int = 0
+    try:
+        default_font_width = _default_fonts_space_width[cast(str, ft["/BaseFont"])] * 2
+    except Exception:
+        default_font_width = 2000.0  # Default font width is 0.2
+    sp_width: float = default_font_width  # default value
     if "/DescendantFonts" in ft:  # ft["/Subtype"].startswith("/CIDFontType"):
+        # p271 PDF32000_2008 9.7.4.3 Glyph Metrics in CIDFonts
+        # Widths for a CIDFont are defined using the DW and W entries.
+        # DW2 and W2 are for vertical use. Vertical type is not implemented.
         ft1 = ft["/DescendantFonts"][0].get_object()  # type: ignore
         try:
-            w1[-1] = cast(float, ft1["/DW"])
+            char_code_width["default"] = cast(float, ft1["/DW"])
         except Exception:
-            w1[-1] = 1000.0
+            char_code_width["default"] = default_font_width
         if "/W" in ft1:
             w = list(ft1["/W"])
         else:
@@ -423,13 +426,25 @@ def compute_space_width(
             st = w[0] if isinstance(w[0], int) else w[0].get_object()
             second = w[1].get_object()
             if isinstance(second, int):
-                for x in range(st, second):
-                    w1[x] = w[2]
+                # C_first C_last same_W
+                en = second
+                for c_code in range(st, en + 1):
+                    try:
+                        conversion_char = map_dict[chr(c_code)]
+                        char_code_width[ord(conversion_char)] = w[2]
+                    except Exception:
+                        pass
                 w = w[3:]
             elif isinstance(second, list):
-                for y in second:
-                    w1[st] = y
-                    st += 1
+                # Starting_C [W1 W2 ... Wn]
+                c_code = st
+                for width in second:
+                    try:
+                        conversion_char = map_dict[chr(c_code)]
+                        char_code_width[ord(conversion_char)] = width
+                    except Exception:
+                        pass
+                    c_code += 1
                 w = w[2:]
             else:
                 logger_warning(
@@ -438,119 +453,60 @@ def compute_space_width(
                 )
                 break
         try:
-            sp_width = w1[space_code]
+            sp_width = char_code_width[sp]
         except Exception:
             sp_width = (
-                w1[-1] / 2.0
+                char_code_width["default"] / 2.0
             )  # if using default we consider space will be only half size
     elif "/Widths" in ft:
-        w = list(ft["/Widths"])  # type: ignore
+        w = list(ft["/Widths"])
+        if "/FontDescriptor" in ft and "/MissingWidth" in cast(
+            DictionaryObject, ft["/FontDescriptor"]
+        ):
+            char_code_width["default"] = ft["/FontDescriptor"]["/MissingWidth"].get_object()  # type: ignore
+        else:
+            # will consider width of char as avg(width)
+            m = 0
+            cpt = 0
+            for xx in w:
+                xx = xx.get_object()
+                if xx > 0:
+                    m += xx
+                    cpt += 1
+            char_code_width["default"] = m / max(1, cpt)
         try:
             st = cast(int, ft["/FirstChar"])
-            en: int = cast(int, ft["/LastChar"])
-            if st > space_code or en < space_code:
-                raise Exception("Not in range")
-            if w[space_code - st].get_object() == 0:
-                raise Exception("null width")
-            sp_width = w[space_code - st].get_object()
+            en = cast(int, ft["/LastChar"])
+            if st > sp or en < sp:
+                raise Exception("There is no space character code in the font range")
+            for c_code in range(st, en + 1):
+                width = w[c_code - st].get_object()
+                if width == 0:
+                    raise Exception("The PDF structure is invalid. The array is too "
+                                    "small for the specified font width.")
+                char_code_width[c_code] = width
         except Exception:
-            if "/FontDescriptor" in ft and "/MissingWidth" in cast(
-                DictionaryObject, ft["/FontDescriptor"]
-            ):
-                sp_width = ft["/FontDescriptor"]["/MissingWidth"].get_object()  # type: ignore
-            else:
-                # will consider width of char as avg(width)/2
-                m = 0
-                cpt = 0
-                for xx in w:
-                    xx = xx.get_object()
-                    if xx > 0:
-                        m += xx
-                        cpt += 1
-                sp_width = m / max(1, cpt) / 2
-
+            if "default" in char_code_width:
+                sp_width = char_code_width["default"]
+        if not sp_width:
+            sp_width = char_code_width[sp].get_object()
     if is_null_or_none(sp_width):
         sp_width = 0.0
-    return sp_width
+    return sp_width, char_code_width
 
 
 def compute_font_width(
-    cmap: Tuple[
-        Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
-    ],
-    char_code: int,
-    font_width: float
+    font_width_map: Dict[Any, float],
+    char_code: int
 ) -> float:
-    ft = cmap[3]
-    char_code_map = cmap[1]
-    char_width: float = font_width * 2.0  # default value
-    w = []
-    char_code_width = {}
-    st: int = 0
-    # p271 PDF32000_2008 9.7.4.3 Glyph Metrics in CIDFonts
-    # Widths for a CIDFont are defined using the DW and W entries.
-    # DW2 and W2 are for vertical use. Vertical type is not implemented.
-    if "/DescendantFonts" in ft:  # ft["/Subtype"].startswith("/CIDFontType"):
-        ft1 = ft["/DescendantFonts"][0].get_object()  # type: ignore
-        try:
-            char_code_width["default"] = cast(float, ft1["/DW"])
-        except Exception:
-            char_code_width["default"] = 1000.0  # Default font width is 0.1
-        if "/W" in ft1:
-            w = list(ft1["/W"])
-        else:
-            w = []
-        while len(w) > 0:
-            st = w[0] if isinstance(w[0], int) else w[0].get_object()
-            second = w[1].get_object()
-            if isinstance(second, int):
-                # C_first - C_last same W
-                for x in range(st, second + 1):
-                    try:
-                        char_code_width[str(ord(char_code_map[chr(x)]))] = w[2]
-                    except Exception:
-                        char_code_width[str(x)] = w[2]
-                w = w[3:]
-            elif isinstance(second, list):
-                # Starting C [W1 W2 ... Wn]
-                for y in second:
-                    try:
-                        char_code_width[str(ord(char_code_map[chr(st)]))] = y
-                    except Exception:
-                        char_code_width[str(st)] = y
-                    st += 1
-                w = w[2:]
-            else:
-                logger_warning(
-                    "unknown widths : \n" + (ft1["/W"]).__repr__(),
-                    __name__,
-                )
-                break
-        try:
-            char_width = char_code_width[str(char_code)]
-        except Exception:
-            char_width = (
-                char_code_width["default"]
-            )
-    elif "/Widths" in ft:
-        w = list(ft["/Widths"])
-        try:
-            st = cast(int, ft["/FirstChar"])
-            en: int = cast(int, ft["/LastChar"])
-            if st > char_code or en < char_code:
-                raise Exception("Not in range")
-            if w[char_code - st].get_object() == 0:
-                raise Exception("null width")
-            char_width = w[char_code - st].get_object()
-        except Exception:
-            if "/FontDescriptor" in ft and "/MissingWidth" in cast(
-                DictionaryObject, ft["/FontDescriptor"]
-            ):
-                char_width = ft["/FontDescriptor"]["/MissingWidth"].get_object()  # type: ignore
-            else:
-                return font_width
-    if is_null_or_none(char_width):
-        char_width = char_code_width["default"]
+    char_width: float = 0.0
+    try:
+        char_width = font_width_map[char_code]
+    except Exception:
+        char_width = (
+            font_width_map["default"]
+        )
+
     return char_width
 
 
diff --git a/pypdf/_page.py b/pypdf/_page.py
index 53196a14f..6835e5673 100644
--- a/pypdf/_page.py
+++ b/pypdf/_page.py
@@ -49,7 +49,7 @@
     overload,
 )
 
-from ._cmap import build_char_map, compute_font_width, unknown_char_map
+from ._cmap import build_char_map, compute_font_width, compute_space_width, unknown_char_map
 from ._protocols import PdfCommonDocProtocol
 from ._text_extraction import (
     OrientationNotFoundError,
@@ -1719,21 +1719,16 @@ def _debug_for_extract(self) -> str:  # pragma: no cover
     def _get_font_widths(
         self,
         add_text: str,
-        cmap: Tuple[
-            Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
-        ],
+        font_width_map: Dict[Any, float],
         default_width: float
     ) -> float:
         font_widths: float = 0
         if add_text:
             for char in add_text:
-                font_code = ord(char)
-                if cmap[3]:
-                    font_width = compute_font_width(cmap, font_code, default_width)
-                    if font_width:
-                        font_widths = font_widths + font_width
+                if font_width_map:
+                    font_widths += compute_font_width(font_width_map, ord(char))
                 else:
-                    font_widths = default_width
+                    font_widths += default_width
         return font_widths
 
     def _extract_text(
@@ -1974,7 +1969,10 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
                 )
                 if "San" in add_text:
                     pass
-                _font_widths = self._get_font_widths(add_text, cmap, _space_width)
+                if add_text == "l":
+                    pass
+                _, font_width_map = compute_space_width(cmap[3], 32, cmap[1])
+                _font_widths = self._get_font_widths(add_text, font_width_map, _space_width)
             else:
                 return None
             if check_crlf_space:

From e159e4dd7c29a90db4b85b6e3e1b443e355f31ec Mon Sep 17 00:00:00 2001
From: ryo kamei <r-kamei@sixsquare.co.jp>
Date: Sun, 29 Sep 2024 16:47:18 +0900
Subject: [PATCH 18/59] Separate font width map construction from space
 width computation

---
 pypdf/_cmap.py | 74 +++++++++++++++++++++++++-------------------------
 pypdf/_page.py |  8 ++----
 2 files changed, 39 insertions(+), 43 deletions(-)

diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py
index 12bccd4b9..703eb6524 100644
--- a/pypdf/_cmap.py
+++ b/pypdf/_cmap.py
@@ -85,7 +85,8 @@ def build_char_map_from_dict(
         sp = ord(map_dict[chr(sp)])
     else:
         sp = space_code
-    sp_width, font_width_map = compute_space_width(ft, sp, map_dict)
+    font_width_map = build_font_width_map(ft, map_dict)
+    sp_width = compute_space_width(font_width_map, sp)
 
     return (
         font_type,
@@ -398,26 +399,25 @@ def parse_bfchar(line: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) ->
         lst = lst[2:]
 
 
-def compute_space_width(
-    ft: DictionaryObject, sp: int, map_dict: Dict[Any, Any]
-) -> Tuple[float, Dict[Any, float]]:
-    char_code_width = {}
+def build_font_width_map(
+    ft: DictionaryObject, map_dict: Dict[Any, Any]
+) -> Dict[Any, float]:
+    font_width_map = {}
     st: int = 0
     en: int = 0
     try:
         default_font_width = _default_fonts_space_width[cast(str, ft["/BaseFont"])] * 2
     except Exception:
         default_font_width = 2000.0  # Default font width is 0.2
-    sp_width: float = default_font_width  # default value
     if "/DescendantFonts" in ft:  # ft["/Subtype"].startswith("/CIDFontType"):
         # p271 PDF32000_2008 9.7.4.3 Glyph Metrics in CIDFonts
         # Widths for a CIDFont are defined using the DW and W entries.
         # DW2 and W2 are for vertical use. Vertical type is not implemented.
         ft1 = ft["/DescendantFonts"][0].get_object()  # type: ignore
         try:
-            char_code_width["default"] = cast(float, ft1["/DW"])
+            font_width_map["default"] = cast(float, ft1["/DW"])
         except Exception:
-            char_code_width["default"] = default_font_width
+            font_width_map["default"] = default_font_width
         if "/W" in ft1:
             w = list(ft1["/W"])
         else:
@@ -431,7 +431,7 @@ def compute_space_width(
                 for c_code in range(st, en + 1):
                     try:
                         conversion_char = map_dict[chr(c_code)]
-                        char_code_width[ord(conversion_char)] = w[2]
+                        font_width_map[ord(conversion_char)] = w[2]
                     except Exception:
                         pass
                 w = w[3:]
@@ -441,7 +441,7 @@ def compute_space_width(
                 for width in second:
                     try:
                         conversion_char = map_dict[chr(c_code)]
-                        char_code_width[ord(conversion_char)] = width
+                        font_width_map[ord(conversion_char)] = width
                     except Exception:
                         pass
                     c_code += 1
@@ -452,18 +452,12 @@ def compute_space_width(
                     __name__,
                 )
                 break
-        try:
-            sp_width = char_code_width[sp]
-        except Exception:
-            sp_width = (
-                char_code_width["default"] / 2.0
-            )  # if using default we consider space will be only half size
     elif "/Widths" in ft:
         w = list(ft["/Widths"])
         if "/FontDescriptor" in ft and "/MissingWidth" in cast(
             DictionaryObject, ft["/FontDescriptor"]
         ):
-            char_code_width["default"] = ft["/FontDescriptor"]["/MissingWidth"].get_object()  # type: ignore
+            font_width_map["default"] = ft["/FontDescriptor"]["/MissingWidth"].get_object()  # type: ignore
         else:
             # will consider width of char as avg(width)
             m = 0
@@ -473,26 +467,32 @@ def compute_space_width(
                 if xx > 0:
                     m += xx
                     cpt += 1
-            char_code_width["default"] = m / max(1, cpt)
-        try:
-            st = cast(int, ft["/FirstChar"])
-            en = cast(int, ft["/LastChar"])
-            if st > sp or en < sp:
-                raise Exception("There is no space character code in the font range")
-            for c_code in range(st, en + 1):
-                width = w[c_code - st].get_object()
-                if width == 0:
-                    raise Exception("The PDF structure is invalid. The array is too "
-                                    "small for the specified font width.")
-                char_code_width[c_code] = width
-        except Exception:
-            if "default" in char_code_width:
-                sp_width = char_code_width["default"]
-        if not sp_width:
-            sp_width = char_code_width[sp].get_object()
-    if is_null_or_none(sp_width):
-        sp_width = 0.0
-    return sp_width, char_code_width
+            font_width_map["default"] = m / max(1, cpt)
+        st = cast(int, ft["/FirstChar"])
+        en = cast(int, ft["/LastChar"])
+        for c_code in range(st, en + 1):
+            width = w[c_code - st].get_object()
+            if is_null_or_none(width):
+                # The PDF structure is invalid. The array is too small
+                # for the specified font width.
+                pass
+            font_width_map[c_code] = width
+    if "defalut" not in font_width_map:
+        font_width_map["default"] = default_font_width
+    return font_width_map
+
+
+def compute_space_width(
+    font_width_map: Dict[Any, float], sp: int
+) -> float:
+    try:
+        sp_width = font_width_map[sp]
+    except Exception:
+        sp_width = (
+            font_width_map["default"] / 2.0
+        )  # if using default we consider space will be only half size
+
+    return sp_width
 
 
 def compute_font_width(
diff --git a/pypdf/_page.py b/pypdf/_page.py
index 6835e5673..9a1fbe96e 100644
--- a/pypdf/_page.py
+++ b/pypdf/_page.py
@@ -49,7 +49,7 @@
     overload,
 )
 
-from ._cmap import build_char_map, compute_font_width, compute_space_width, unknown_char_map
+from ._cmap import build_char_map, build_font_width_map, compute_font_width, unknown_char_map
 from ._protocols import PdfCommonDocProtocol
 from ._text_extraction import (
     OrientationNotFoundError,
@@ -1967,11 +1967,7 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
                     rtl_dir,
                     visitor_text,
                 )
-                if "San" in add_text:
-                    pass
-                if add_text == "l":
-                    pass
-                _, font_width_map = compute_space_width(cmap[3], 32, cmap[1])
+                font_width_map = build_font_width_map(cmap[3], cmap[1])
                 _font_widths = self._get_font_widths(add_text, font_width_map, _space_width)
             else:
                 return None

From a19a8f4dba7c8455848fb8c1ca6e80ce7c3c592f Mon Sep 17 00:00:00 2001
From: ryo kamei <r-kamei@sixsquare.co.jp>
Date: Sun, 29 Sep 2024 19:03:47 +0900
Subject: [PATCH 19/59] Revert to original adjustment space width

---
 pypdf/_cmap.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py
index 703eb6524..51eff91ed 100644
--- a/pypdf/_cmap.py
+++ b/pypdf/_cmap.py
@@ -86,11 +86,11 @@ def build_char_map_from_dict(
     else:
         sp = space_code
     font_width_map = build_font_width_map(ft, map_dict)
-    sp_width = compute_space_width(font_width_map, sp)
+    half_space_width = compute_space_width(font_width_map, sp) / 2.0
 
     return (
         font_type,
-        sp_width,
+        half_space_width,
         encoding,
         # https://github.com/python/mypy/issues/4374
         map_dict
@@ -408,7 +408,7 @@ def build_font_width_map(
     try:
         default_font_width = _default_fonts_space_width[cast(str, ft["/BaseFont"])] * 2
     except Exception:
-        default_font_width = 2000.0  # Default font width is 0.2
+        default_font_width = 1000.0  # Default font width is 0.1
     if "/DescendantFonts" in ft:  # ft["/Subtype"].startswith("/CIDFontType"):
         # p271 PDF32000_2008 9.7.4.3 Glyph Metrics in CIDFonts
         # Widths for a CIDFont are defined using the DW and W entries.

From 6dbda504671dabab218e2288d3688796f1425e41 Mon Sep 17 00:00:00 2001
From: ryo kamei <r-kamei@sixsquare.co.jp>
Date: Sun, 29 Sep 2024 19:06:18 +0900
Subject: [PATCH 20/59] Supports diagonal travel distance

---
 pypdf/_text_extraction/__init__.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pypdf/_text_extraction/__init__.py b/pypdf/_text_extraction/__init__.py
index c29aa4c1d..accfcc691 100644
--- a/pypdf/_text_extraction/__init__.py
+++ b/pypdf/_text_extraction/__init__.py
@@ -139,8 +139,8 @@ def crlf_space_check(
                     )
                     text = ""
         elif (
-            abs(moved_height) < f * 0.3
-            and moved_width > (spacewidth + font_width) * f
+            (math.sqrt(moved_width * moved_width + moved_height * moved_height)
+                > (spacewidth + font_width) * f)
             and (output + text)[-1] != " "
         ):
             text += " "

From 34efe522fba284ab9fa23add13f0f1035435164c Mon Sep 17 00:00:00 2001
From: ryo kamei <r-kamei@sixsquare.co.jp>
Date: Sun, 29 Sep 2024 19:24:34 +0900
Subject: [PATCH 21/59] Default font width falls back to twice the space width

---
 pypdf/_page.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pypdf/_page.py b/pypdf/_page.py
index 9a1fbe96e..cf7c60379 100644
--- a/pypdf/_page.py
+++ b/pypdf/_page.py
@@ -1968,7 +1968,7 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
                     visitor_text,
                 )
                 font_width_map = build_font_width_map(cmap[3], cmap[1])
-                _font_widths = self._get_font_widths(add_text, font_width_map, _space_width)
+                _font_widths = self._get_font_widths(add_text, font_width_map, _space_width * 2.0)
             else:
                 return None
             if check_crlf_space:

From 52aa7ac2d5738ea7a5d9ff9318437ec5e7ff36b2 Mon Sep 17 00:00:00 2001
From: ryo kamei <r-kamei@sixsquare.co.jp>
Date: Sun, 29 Sep 2024 19:31:25 +0900
Subject: [PATCH 22/59] Get the default space width from the argument

---
 pypdf/_cmap.py | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py
index 51eff91ed..3bda18e2e 100644
--- a/pypdf/_cmap.py
+++ b/pypdf/_cmap.py
@@ -86,7 +86,8 @@ def build_char_map_from_dict(
     else:
         sp = space_code
     font_width_map = build_font_width_map(ft, map_dict)
-    half_space_width = compute_space_width(font_width_map, sp) / 2.0
+    half_space_width = compute_space_width(
+        font_width_map, sp, space_width) / 2.0
 
     return (
         font_type,
@@ -402,11 +403,11 @@ def parse_bfchar(line: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) ->
 def build_font_width_map(
     ft: DictionaryObject, map_dict: Dict[Any, Any]
 ) -> Dict[Any, float]:
-    font_width_map = {}
+    font_width_map: Dict[Any, float] = {}
     st: int = 0
     en: int = 0
     try:
-        default_font_width = _default_fonts_space_width[cast(str, ft["/BaseFont"])] * 2
+        default_font_width: float = _default_fonts_space_width[cast(str, ft["/BaseFont"])] * 2.0
     except Exception:
         default_font_width = 1000.0  # Default font width is 0.1
     if "/DescendantFonts" in ft:  # ft["/Subtype"].startswith("/CIDFontType"):
@@ -483,14 +484,17 @@ def build_font_width_map(
 
 
 def compute_space_width(
-    font_width_map: Dict[Any, float], sp: int
+    font_width_map: Dict[Any, float], sp: int, default_space_width: float
 ) -> float:
     try:
         sp_width = font_width_map[sp]
     except Exception:
-        sp_width = (
-            font_width_map["default"] / 2.0
-        )  # if using default we consider space will be only half size
+        if default_space_width:
+            sp_width = default_space_width
+        else:
+            sp_width = (
+                font_width_map["default"] / 2.0
+            )  # if using default we consider space will be only half size
 
     return sp_width
 

From 7a028bbab8e4c77153e80707e37d701657ed6717 Mon Sep 17 00:00:00 2001
From: ryo kamei <r-kamei@sixsquare.co.jp>
Date: Sun, 29 Sep 2024 20:58:42 +0900
Subject: [PATCH 23/59] Fix regressions introduced earlier in this series

---
 pypdf/_cmap.py                     | 9 +++++++--
 pypdf/_text_extraction/__init__.py | 3 ++-
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py
index 3bda18e2e..ce5b1aea2 100644
--- a/pypdf/_cmap.py
+++ b/pypdf/_cmap.py
@@ -82,7 +82,10 @@ def build_char_map_from_dict(
         except Exception:
             sp = space_code.encode("utf-16-be")
             sp = sp[0] + 256 * sp[1]
-        sp = ord(map_dict[chr(sp)])
+        try:
+            sp = ord(map_dict[chr(sp)])
+        except Exception:
+            pass
     else:
         sp = space_code
     font_width_map = build_font_width_map(ft, map_dict)
@@ -401,7 +404,7 @@ def parse_bfchar(line: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) ->
 
 
 def build_font_width_map(
-    ft: DictionaryObject, map_dict: Dict[Any, Any]
+    ft: Union[DictionaryObject, None], map_dict: Dict[Any, Any]
 ) -> Dict[Any, float]:
     font_width_map: Dict[Any, float] = {}
     st: int = 0
@@ -488,6 +491,8 @@ def compute_space_width(
 ) -> float:
     try:
         sp_width = font_width_map[sp]
+        if sp_width == 0:
+            raise Exception("Zero width")
     except Exception:
         if default_space_width:
             sp_width = default_space_width
diff --git a/pypdf/_text_extraction/__init__.py b/pypdf/_text_extraction/__init__.py
index accfcc691..c87631579 100644
--- a/pypdf/_text_extraction/__init__.py
+++ b/pypdf/_text_extraction/__init__.py
@@ -137,10 +137,11 @@ def crlf_space_check(
                         cmap[3],
                         font_size,
                     )
-                    text = ""
+                text = ""
         elif (
             (math.sqrt(moved_width * moved_width + moved_height * moved_height)
                 > (spacewidth + font_width) * f)
+            and (moved_width >= 0)  # The string are not back.
             and (output + text)[-1] != " "
         ):
             text += " "

From f02fa23c4409c17b25bec1df3c99e044966cbff7 Mon Sep 17 00:00:00 2001
From: ryo kamei <r-kamei@sixsquare.co.jp>
Date: Sun, 29 Sep 2024 21:03:18 +0900
Subject: [PATCH 24/59] Style: Correcting code style issues

---
 pypdf/_cmap.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py
index ce5b1aea2..d5b44e13d 100644
--- a/pypdf/_cmap.py
+++ b/pypdf/_cmap.py
@@ -413,6 +413,9 @@ def build_font_width_map(
         default_font_width: float = _default_fonts_space_width[cast(str, ft["/BaseFont"])] * 2.0
     except Exception:
         default_font_width = 1000.0  # Default font width is 0.1
+    if ft is None:
+        font_width_map["default"] = default_font_width
+        return font_width_map
     if "/DescendantFonts" in ft:  # ft["/Subtype"].startswith("/CIDFontType"):
         # p271 PDF32000_2008 9.7.4.3 Glyph Metrics in CIDFonts
         # Widths for a CIDFont are defined using the DW and W entries.

From 980d8316075f4160bdad2d0ced82e4f90c105ce2 Mon Sep 17 00:00:00 2001
From: ryo kamei <r-kamei@sixsquare.co.jp>
Date: Sun, 29 Sep 2024 21:05:45 +0900
Subject: [PATCH 25/59] Style: Correcting code style issues

---
 pypdf/_cmap.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py
index d5b44e13d..d229841b1 100644
--- a/pypdf/_cmap.py
+++ b/pypdf/_cmap.py
@@ -409,13 +409,14 @@ def build_font_width_map(
     font_width_map: Dict[Any, float] = {}
     st: int = 0
     en: int = 0
-    try:
-        default_font_width: float = _default_fonts_space_width[cast(str, ft["/BaseFont"])] * 2.0
-    except Exception:
-        default_font_width = 1000.0  # Default font width is 0.1
+    default_font_width = 1000.0  # Default font width is 0.1
     if ft is None:
         font_width_map["default"] = default_font_width
         return font_width_map
+    try:
+        default_font_width: float = _default_fonts_space_width[cast(str, ft["/BaseFont"])] * 2.0
+    except Exception:
+        pass
     if "/DescendantFonts" in ft:  # ft["/Subtype"].startswith("/CIDFontType"):
         # p271 PDF32000_2008 9.7.4.3 Glyph Metrics in CIDFonts
         # Widths for a CIDFont are defined using the DW and W entries.

From 5e6a0dd52853ef424bc54699a1ba4444b0dd9e95 Mon Sep 17 00:00:00 2001
From: ryo kamei <r-kamei@sixsquare.co.jp>
Date: Sun, 29 Sep 2024 21:25:57 +0900
Subject: [PATCH 26/59] BUG: Skip /Widths entries missing from a too-short array

---
 pypdf/_cmap.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py
index d229841b1..15f49419c 100644
--- a/pypdf/_cmap.py
+++ b/pypdf/_cmap.py
@@ -479,12 +479,13 @@ def build_font_width_map(
         st = cast(int, ft["/FirstChar"])
         en = cast(int, ft["/LastChar"])
         for c_code in range(st, en + 1):
-            width = w[c_code - st].get_object()
-            if is_null_or_none(width):
+            try:
+                width = w[c_code - st].get_object()
+                font_width_map[c_code] = width
+            except Exception:
                 # The PDF structure is invalid. The array is too small
                 # for the specified font width.
                 pass
-            font_width_map[c_code] = width
     if "defalut" not in font_width_map:
         font_width_map["default"] = default_font_width
     return font_width_map

From 8078ac14b85c56768d05edebb95af91ea4c831eb Mon Sep 17 00:00:00 2001
From: ryo kamei <r-kamei@sixsquare.co.jp>
Date: Sun, 29 Sep 2024 21:33:18 +0900
Subject: [PATCH 27/59] Style: Correcting code style issues

---
 pypdf/_cmap.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py
index 15f49419c..9f66c9c2c 100644
--- a/pypdf/_cmap.py
+++ b/pypdf/_cmap.py
@@ -409,12 +409,12 @@ def build_font_width_map(
     font_width_map: Dict[Any, float] = {}
     st: int = 0
     en: int = 0
-    default_font_width = 1000.0  # Default font width is 0.1
+    default_font_width: float = 1000.0  # Default font width is 0.1
     if ft is None:
         font_width_map["default"] = default_font_width
         return font_width_map
     try:
-        default_font_width: float = _default_fonts_space_width[cast(str, ft["/BaseFont"])] * 2.0
+        default_font_width = _default_fonts_space_width[cast(str, ft["/BaseFont"])] * 2.0
     except Exception:
         pass
     if "/DescendantFonts" in ft:  # ft["/Subtype"].startswith("/CIDFontType"):

From b842ceed2f1a7b2a825c72fd3d4168414fc993c2 Mon Sep 17 00:00:00 2001
From: Ryo Kamei <r-kamei@sixsquare.co.jp>
Date: Mon, 30 Sep 2024 17:44:02 +0900
Subject: [PATCH 28/59] Compliant with PDF1.7 specifications

---
 pypdf/_text_extraction/__init__.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/pypdf/_text_extraction/__init__.py b/pypdf/_text_extraction/__init__.py
index c87631579..e2adc15b6 100644
--- a/pypdf/_text_extraction/__init__.py
+++ b/pypdf/_text_extraction/__init__.py
@@ -113,8 +113,9 @@ def crlf_space_check(
     orientation = orient(m)
     delta_x = m[4] - m_prev[4]
     delta_y = m[5] - m_prev[5]
-    k = math.sqrt(abs(m[0] * m[3]) + abs(m[1] * m[2]))
-    f = font_size * k
+    # PDF 32000-1:2008 p249 Table 108 Text positioning operators
+    scale_x = math.sqrt(cm_matrix[0]**2 + cm_matrix[1]**2)
+    scale_y = math.sqrt(cm_matrix[2]**2 + cm_matrix[3]**2)
     cm_prev = m
 
     if orientation not in orientations:
@@ -126,7 +127,7 @@ def crlf_space_check(
         moved_height = delta_x
         moved_width = delta_y
     try:
-        if abs(moved_height) > 0.8 * f:
+        if abs(moved_height) > 0.8 * font_size * scale_y:
             if (output + text)[-1] != "\n":
                 output += text + "\n"
                 if visitor_text is not None:
@@ -139,9 +140,7 @@ def crlf_space_check(
                     )
                 text = ""
         elif (
-            (math.sqrt(moved_width * moved_width + moved_height * moved_height)
-                > (spacewidth + font_width) * f)
-            and (moved_width >= 0)  # The string are not back.
+            (moved_width >= (spacewidth + font_width) * font_size * scale_x)
             and (output + text)[-1] != " "
         ):
             text += " "

From d1c54dbed6f566d58258fabafb74d7a78c2bcf9b Mon Sep 17 00:00:00 2001
From: Ryo Kamei <r-kamei@sixsquare.co.jp>
Date: Mon, 30 Sep 2024 20:49:05 +0900
Subject: [PATCH 29/59] BUG: Compute scale_x/scale_y from tm_matrix, not cm_matrix

---
 pypdf/_text_extraction/__init__.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pypdf/_text_extraction/__init__.py b/pypdf/_text_extraction/__init__.py
index e2adc15b6..d2ed22fae 100644
--- a/pypdf/_text_extraction/__init__.py
+++ b/pypdf/_text_extraction/__init__.py
@@ -114,8 +114,8 @@ def crlf_space_check(
     delta_x = m[4] - m_prev[4]
     delta_y = m[5] - m_prev[5]
     # PDF 32000-1:2008 p249 Table 108 Text positioning operators
-    scale_x = math.sqrt(cm_matrix[0]**2 + cm_matrix[1]**2)
-    scale_y = math.sqrt(cm_matrix[2]**2 + cm_matrix[3]**2)
+    scale_x = math.sqrt(tm_matrix[0]**2 + tm_matrix[1]**2)
+    scale_y = math.sqrt(tm_matrix[2]**2 + tm_matrix[3]**2)
     cm_prev = m
 
     if orientation not in orientations:

From 328d22be9d2c10ff0888f64e777464ffcc617aff Mon Sep 17 00:00:00 2001
From: Ryo Kamei <r-kamei@sixsquare.co.jp>
Date: Mon, 30 Sep 2024 20:50:10 +0900
Subject: [PATCH 30/59] Cache font width maps by font name instead of rebuilding per Tj

---
 pypdf/_page.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/pypdf/_page.py b/pypdf/_page.py
index cf7c60379..502c4478c 100644
--- a/pypdf/_page.py
+++ b/pypdf/_page.py
@@ -1759,6 +1759,7 @@ def _extract_text(
                 str, float, Union[str, Dict[int, str]], Dict[str, str], DictionaryObject
             ],
         ] = {}
+        font_width_maps: Dict[str, Dict[str, float]] = {}
         try:
             objr = obj
             while NameObject(PG.RESOURCES) not in objr:
@@ -1822,10 +1823,12 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
             nonlocal cm_matrix, cm_stack, tm_matrix, cm_prev, tm_prev, memo_cm, memo_tm
             nonlocal char_scale, space_scale, _space_width, TL, font_size, cmap
             nonlocal orientations, rtl_dir, visitor_text, output, text, _font_widths
+            nonlocal font_width_maps
             global CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS
 
             add_text: str = ""
             check_crlf_space: bool = False
+            font_widths: float = 0.0
             # Table 5.4 page 405
             if operator == b"BT":
                 tm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
@@ -1939,6 +1942,8 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
                 ty = float(operands[1])
                 tm_matrix[4] += tx * tm_matrix[0] + ty * tm_matrix[2]
                 tm_matrix[5] += tx * tm_matrix[1] + ty * tm_matrix[3]
+                font_widths = current_fontwidths()
+                _font_widths = 0.0
             elif operator == b"Tm":
                 check_crlf_space = True
                 tm_matrix = [
@@ -1967,8 +1972,10 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
                     rtl_dir,
                     visitor_text,
                 )
-                font_width_map = build_font_width_map(cmap[3], cmap[1])
-                _font_widths = self._get_font_widths(add_text, font_width_map, _space_width * 2.0)
+                if cmap[2] not in font_width_maps:
+                    font_width_maps[cmap[2]] = build_font_width_map(cmap[3], cmap[1])
+                font_width_map = font_width_maps[cmap[2]]
+                _font_widths += self._get_font_widths(add_text, font_width_map, _space_width * 2.0)
             else:
                 return None
             if check_crlf_space:
@@ -1984,7 +1991,7 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
                         font_size,
                         visitor_text,
                         current_spacewidth(),
-                        current_fontwidths()
+                        font_widths
                     )
                     if text == "":
                         memo_cm = cm_matrix.copy()

From e3924167820927cec09ad1ccaa0aa9737f0fa9d1 Mon Sep 17 00:00:00 2001
From: Ryo Kamei <r-kamei@sixsquare.co.jp>
Date: Mon, 30 Sep 2024 21:24:46 +0900
Subject: [PATCH 31/59] BUG: Fix "defalut" key typo; reset accumulated font widths on Tm

---
 pypdf/_cmap.py | 2 +-
 pypdf/_page.py | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py
index 9f66c9c2c..b67f1495d 100644
--- a/pypdf/_cmap.py
+++ b/pypdf/_cmap.py
@@ -486,7 +486,7 @@ def build_font_width_map(
                 # The PDF structure is invalid. The array is too small
                 # for the specified font width.
                 pass
-    if "defalut" not in font_width_map:
+    if "default" not in font_width_map:
         font_width_map["default"] = default_font_width
     return font_width_map
 
diff --git a/pypdf/_page.py b/pypdf/_page.py
index 502c4478c..e5432e5b3 100644
--- a/pypdf/_page.py
+++ b/pypdf/_page.py
@@ -1954,6 +1954,8 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
                     float(operands[4]),
                     float(operands[5]),
                 ]
+                font_widths = current_fontwidths()
+                _font_widths = 0.0
             elif operator == b"T*":
                 check_crlf_space = True
                 tm_matrix[5] -= TL

From 9e6d2cea7b592fe37c601270ae3b2d80357a71cf Mon Sep 17 00:00:00 2001
From: Ryo Kamei <r-kamei@sixsquare.co.jp>
Date: Mon, 30 Sep 2024 21:37:35 +0900
Subject: [PATCH 32/59] BUG: Treat a null "default" font width entry as missing

---
 pypdf/_cmap.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py
index b67f1495d..9a75e0c53 100644
--- a/pypdf/_cmap.py
+++ b/pypdf/_cmap.py
@@ -486,7 +486,7 @@ def build_font_width_map(
                 # The PDF structure is invalid. The array is too small
                 # for the specified font width.
                 pass
-    if "default" not in font_width_map:
+    if is_null_or_none(font_width_map.get("default")):
         font_width_map["default"] = default_font_width
     return font_width_map
 

From 1fe5285d22ba4de0b659f8cd734451d2c94eaac7 Mon Sep 17 00:00:00 2001
From: Ryo Kamei <r-kamei@sixsquare.co.jp>
Date: Mon, 30 Sep 2024 23:14:36 +0900
Subject: [PATCH 33/59] style: Correcting code style issues

---
 pypdf/_page.py | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/pypdf/_page.py b/pypdf/_page.py
index e5432e5b3..9458bd3e6 100644
--- a/pypdf/_page.py
+++ b/pypdf/_page.py
@@ -496,6 +496,7 @@ def __init__(
         if not is_null_or_none(indirect_reference):
             assert indirect_reference is not None, "mypy"
             self.update(cast(DictionaryObject, indirect_reference.get_object()))
+        self._font_width_maps: Dict[str, Dict[str, float]] = {}
 
     def hash_bin(self) -> int:
         """
@@ -1718,11 +1719,17 @@ def _debug_for_extract(self) -> str:  # pragma: no cover
 
     def _get_font_widths(
         self,
+        cmap: Tuple[
+            Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
+        ],
         add_text: str,
-        font_width_map: Dict[Any, float],
         default_width: float
     ) -> float:
         font_widths: float = 0
+        font_name: str = cmap[2]
+        if font_name not in self._font_width_maps:
+            self._font_width_maps[font_name] = build_font_width_map(cmap[3], cmap[1])
+        font_width_map: Dict[Any, float] = self._font_width_maps[font_name]
         if add_text:
             for char in add_text:
                 if font_width_map:
@@ -1759,7 +1766,6 @@ def _extract_text(
                 str, float, Union[str, Dict[int, str]], Dict[str, str], DictionaryObject
             ],
         ] = {}
-        font_width_maps: Dict[str, Dict[str, float]] = {}
         try:
             objr = obj
             while NameObject(PG.RESOURCES) not in objr:
@@ -1823,10 +1829,8 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
             nonlocal cm_matrix, cm_stack, tm_matrix, cm_prev, tm_prev, memo_cm, memo_tm
             nonlocal char_scale, space_scale, _space_width, TL, font_size, cmap
             nonlocal orientations, rtl_dir, visitor_text, output, text, _font_widths
-            nonlocal font_width_maps
             global CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS
 
-            add_text: str = ""
             check_crlf_space: bool = False
             font_widths: float = 0.0
             # Table 5.4 page 405
@@ -1959,7 +1963,6 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
             elif operator == b"T*":
                 check_crlf_space = True
                 tm_matrix[5] -= TL
-
             elif operator == b"Tj":
                 check_crlf_space = True
                 text, rtl_dir, add_text = handle_tj(
@@ -1974,10 +1977,7 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
                     rtl_dir,
                     visitor_text,
                 )
-                if cmap[2] not in font_width_maps:
-                    font_width_maps[cmap[2]] = build_font_width_map(cmap[3], cmap[1])
-                font_width_map = font_width_maps[cmap[2]]
-                _font_widths += self._get_font_widths(add_text, font_width_map, _space_width * 2.0)
+                _font_widths += self._get_font_widths(cmap, add_text, _space_width * 2.0)
             else:
                 return None
             if check_crlf_space:
@@ -2074,7 +2074,6 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
                     text = ""
                     memo_cm = cm_matrix.copy()
                     memo_tm = tm_matrix.copy()
-
             else:
                 process_operation(operator, operands)
             if visitor_operand_after is not None:

From 0292b13f32895054e1d4dfab5a898da21613ce26 Mon Sep 17 00:00:00 2001
From: ryo kamei <r-kamei@sixsquare.co.jp>
Date: Tue, 1 Oct 2024 10:06:41 +0900
Subject: [PATCH 34/59] Style: Correcting code style issues

---
 pypdf/_cmap.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py
index 9a75e0c53..80e9b1bec 100644
--- a/pypdf/_cmap.py
+++ b/pypdf/_cmap.py
@@ -5,6 +5,7 @@
 from ._codecs import adobe_glyphs, charset_encoding
 from ._utils import logger_error, logger_warning
 from .generic import (
+    ArrayObject,
     DecodedStreamObject,
     DictionaryObject,
     StreamObject,
@@ -461,7 +462,7 @@ def build_font_width_map(
                 )
                 break
     elif "/Widths" in ft:
-        w = list(ft["/Widths"])
+        w = list(ft["/Widths"]) if isinstance(ft["/Widths"], ArrayObject) else []
         if "/FontDescriptor" in ft and "/MissingWidth" in cast(
             DictionaryObject, ft["/FontDescriptor"]
         ):

From 444bef8e4c87f407d3c732e60add868784107992 Mon Sep 17 00:00:00 2001
From: Ryo Kamei <r-kamei@sixsquare.co.jp>
Date: Tue, 1 Oct 2024 18:02:50 +0900
Subject: [PATCH 35/59] BUG: Changed timing of font size calculation to before
 font switching

---
 pypdf/_page.py                     | 37 +++++++++++++++++-------------
 pypdf/_text_extraction/__init__.py | 11 +++++----
 2 files changed, 27 insertions(+), 21 deletions(-)

diff --git a/pypdf/_page.py b/pypdf/_page.py
index 9458bd3e6..25842cb76 100644
--- a/pypdf/_page.py
+++ b/pypdf/_page.py
@@ -1717,13 +1717,14 @@ def _debug_for_extract(self) -> str:  # pragma: no cover
             out += "No Font\n"
         return out
 
-    def _get_font_widths(
+    def _get_acutual_font_widths(
         self,
         cmap: Tuple[
             Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
         ],
         add_text: str,
-        default_width: float
+        font_size: float,
+        default_space_width: float
     ) -> float:
         font_widths: float = 0
         font_name: str = cmap[2]
@@ -1735,8 +1736,8 @@ def _get_font_widths(
                 if font_width_map:
                     font_widths += compute_font_width(font_width_map, ord(char))
                 else:
-                    font_widths += default_width
-        return font_widths
+                    font_widths += default_space_width * 2
+        return (font_widths * font_size, default_space_width * font_size, font_size)
 
     def _extract_text(
         self,
@@ -1815,24 +1816,25 @@ def _extract_text(
         char_scale = 1.0
         space_scale = 1.0
         _space_width: float = 500.0  # will be set correctly at first Tf
-        _font_widths: float = 0.0
+        _actual_str_size: Dict[str, float] = {
+            "str_widths": 0.0, "space_width": 0.0, "str_height": 0.0}  # will be set string length calculation result
         TL = 0.0
         font_size = 12.0  # init just in case of
 
         def current_spacewidth() -> float:
             return _space_width / 1000.0
 
-        def current_fontwidths() -> float:
-            return _font_widths / 1000.0
+        def current_strwidths() -> float:
+            return _actual_str_size["str_widths"] / 1000.0
 
         def process_operation(operator: bytes, operands: List[Any]) -> None:
             nonlocal cm_matrix, cm_stack, tm_matrix, cm_prev, tm_prev, memo_cm, memo_tm
             nonlocal char_scale, space_scale, _space_width, TL, font_size, cmap
-            nonlocal orientations, rtl_dir, visitor_text, output, text, _font_widths
+            nonlocal orientations, rtl_dir, visitor_text, output, text, _actual_str_size
             global CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS
 
             check_crlf_space: bool = False
-            font_widths: float = 0.0
+            str_widths: float = 0.0
             # Table 5.4 page 405
             if operator == b"BT":
                 tm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
@@ -1946,8 +1948,8 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
                 ty = float(operands[1])
                 tm_matrix[4] += tx * tm_matrix[0] + ty * tm_matrix[2]
                 tm_matrix[5] += tx * tm_matrix[1] + ty * tm_matrix[3]
-                font_widths = current_fontwidths()
-                _font_widths = 0.0
+                str_widths = current_strwidths()
+                _actual_str_size["str_widths"] = 0.0
             elif operator == b"Tm":
                 check_crlf_space = True
                 tm_matrix = [
@@ -1958,8 +1960,8 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
                     float(operands[4]),
                     float(operands[5]),
                 ]
-                font_widths = current_fontwidths()
-                _font_widths = 0.0
+                str_widths = current_strwidths()
+                _actual_str_size["str_widths"] = 0.0
             elif operator == b"T*":
                 check_crlf_space = True
                 tm_matrix[5] -= TL
@@ -1977,7 +1979,9 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
                     rtl_dir,
                     visitor_text,
                 )
-                _font_widths += self._get_font_widths(cmap, add_text, _space_width * 2.0)
+                current_font_widths, _actual_str_size["space_width"], _actual_str_size["str_height"] = (
+                    self._get_acutual_font_widths(cmap, add_text, font_size, current_spacewidth()))
+                _actual_str_size["str_widths"] += current_font_widths
             else:
                 return None
             if check_crlf_space:
@@ -1992,8 +1996,9 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
                         output,
                         font_size,
                         visitor_text,
-                        current_spacewidth(),
-                        font_widths
+                        str_widths,
+                        _actual_str_size["space_width"],
+                        _actual_str_size["str_height"]
                     )
                     if text == "":
                         memo_cm = cm_matrix.copy()
diff --git a/pypdf/_text_extraction/__init__.py b/pypdf/_text_extraction/__init__.py
index d2ed22fae..6fbec903f 100644
--- a/pypdf/_text_extraction/__init__.py
+++ b/pypdf/_text_extraction/__init__.py
@@ -98,8 +98,9 @@ def crlf_space_check(
     output: str,
     font_size: float,
     visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
+    str_widhts: float,
     spacewidth: float,
-    font_width: float
+    str_height: float,
 ) -> Tuple[str, str, List[float], List[float]]:
     cm_prev = cmtm_prev[0]
     tm_prev = cmtm_prev[1]
@@ -114,8 +115,8 @@ def crlf_space_check(
     delta_x = m[4] - m_prev[4]
     delta_y = m[5] - m_prev[5]
     # PDF 32000-1:2008 p249 Table 108 Text positioning operators
-    scale_x = math.sqrt(tm_matrix[0]**2 + tm_matrix[1]**2)
-    scale_y = math.sqrt(tm_matrix[2]**2 + tm_matrix[3]**2)
+    scale_x = math.sqrt(tm_prev[0]**2 + tm_prev[1]**2)
+    scale_y = math.sqrt(tm_prev[2]**2 + tm_prev[3]**2)
     cm_prev = m
 
     if orientation not in orientations:
@@ -127,7 +128,7 @@ def crlf_space_check(
         moved_height = delta_x
         moved_width = delta_y
     try:
-        if abs(moved_height) > 0.8 * font_size * scale_y:
+        if abs(moved_height) > 0.8 * str_height * scale_y:
             if (output + text)[-1] != "\n":
                 output += text + "\n"
                 if visitor_text is not None:
@@ -140,7 +141,7 @@ def crlf_space_check(
                     )
                 text = ""
         elif (
-            (moved_width >= (spacewidth + font_width) * font_size * scale_x)
+            (moved_width >= spacewidth + str_widhts * scale_x)
             and (output + text)[-1] != " "
         ):
             text += " "

From ce36f4886d45eef0ab7c471afc4952d7d1745ddd Mon Sep 17 00:00:00 2001
From: Ryo Kamei <r-kamei@sixsquare.co.jp>
Date: Tue, 1 Oct 2024 18:07:57 +0900
Subject: [PATCH 36/59] STY: Correcting code style issues

---
 pypdf/_page.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pypdf/_page.py b/pypdf/_page.py
index 25842cb76..13ac88512 100644
--- a/pypdf/_page.py
+++ b/pypdf/_page.py
@@ -1725,7 +1725,7 @@ def _get_acutual_font_widths(
         add_text: str,
         font_size: float,
         default_space_width: float
-    ) -> float:
+    ) -> Tuple[float, float, float]:
         font_widths: float = 0
         font_name: str = cmap[2]
         if font_name not in self._font_width_maps:

From 68862dc9ba44478df061e0a0a5f763b5e5cf50bf Mon Sep 17 00:00:00 2001
From: Ryo Kamei <r-kamei@sixsquare.co.jp>
Date: Tue, 1 Oct 2024 19:30:00 +0900
Subject: [PATCH 37/59] BUG: Modify space calculation results to match original
 code

---
 pypdf/_cmap.py | 21 ++++++++-------------
 pypdf/_page.py |  2 +-
 2 files changed, 9 insertions(+), 14 deletions(-)

diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py
index 80e9b1bec..8bb75a4ca 100644
--- a/pypdf/_cmap.py
+++ b/pypdf/_cmap.py
@@ -89,9 +89,8 @@ def build_char_map_from_dict(
             pass
     else:
         sp = space_code
-    font_width_map = build_font_width_map(ft, map_dict)
-    half_space_width = compute_space_width(
-        font_width_map, sp, space_width) / 2.0
+    font_width_map = build_font_width_map(ft, map_dict, space_width * 2.0)
+    half_space_width = compute_space_width(font_width_map, sp) / 2.0
 
     return (
         font_type,
@@ -405,12 +404,11 @@ def parse_bfchar(line: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) ->
 
 
 def build_font_width_map(
-    ft: Union[DictionaryObject, None], map_dict: Dict[Any, Any]
+    ft: Union[DictionaryObject, None], map_dict: Dict[Any, Any], default_font_width: float
 ) -> Dict[Any, float]:
     font_width_map: Dict[Any, float] = {}
     st: int = 0
     en: int = 0
-    default_font_width: float = 1000.0  # Default font width is 0.1
     if ft is None:
         font_width_map["default"] = default_font_width
         return font_width_map
@@ -488,24 +486,21 @@ def build_font_width_map(
                 # for the specified font width.
                 pass
     if is_null_or_none(font_width_map.get("default")):
-        font_width_map["default"] = default_font_width
+        font_width_map["default"] = 0.0
     return font_width_map
 
 
 def compute_space_width(
-    font_width_map: Dict[Any, float], sp: int, default_space_width: float
+    font_width_map: Dict[Any, float], sp: int
 ) -> float:
     try:
         sp_width = font_width_map[sp]
         if sp_width == 0:
             raise Exception("Zero width")
     except Exception:
-        if default_space_width:
-            sp_width = default_space_width
-        else:
-            sp_width = (
-                font_width_map["default"] / 2.0
-            )  # if using default we consider space will be only half size
+        sp_width = (
+            font_width_map["default"] / 2.0
+        )  # if using default we consider space will be only half size
 
     return sp_width
 
diff --git a/pypdf/_page.py b/pypdf/_page.py
index 13ac88512..322721b56 100644
--- a/pypdf/_page.py
+++ b/pypdf/_page.py
@@ -1729,7 +1729,7 @@ def _get_acutual_font_widths(
         font_widths: float = 0
         font_name: str = cmap[2]
         if font_name not in self._font_width_maps:
-            self._font_width_maps[font_name] = build_font_width_map(cmap[3], cmap[1])
+            self._font_width_maps[font_name] = build_font_width_map(cmap[3], cmap[1], default_space_width * 2)
         font_width_map: Dict[Any, float] = self._font_width_maps[font_name]
         if add_text:
             for char in add_text:

From 4bcfac34b741c3bd17a05191193566fc33f6cab0 Mon Sep 17 00:00:00 2001
From: Ryo Kamei <r-kamei@sixsquare.co.jp>
Date: Tue, 1 Oct 2024 20:19:36 +0900
Subject: [PATCH 38/59] BUG: Fall back to the default_font_width argument when
 the font has no default width

---
 pypdf/_cmap.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py
index 8bb75a4ca..41512b747 100644
--- a/pypdf/_cmap.py
+++ b/pypdf/_cmap.py
@@ -486,7 +486,7 @@ def build_font_width_map(
                 # for the specified font width.
                 pass
     if is_null_or_none(font_width_map.get("default")):
-        font_width_map["default"] = 0.0
+        font_width_map["default"] = default_font_width if default_font_width else 0.0
     return font_width_map
 
 

From 6d7f75e823ca9f457182d763e70b63b038ef4555 Mon Sep 17 00:00:00 2001
From: Ryo Kamei <r-kamei@sixsquare.co.jp>
Date: Tue, 1 Oct 2024 20:20:33 +0900
Subject: [PATCH 39/59] BUG: Apply scale_x to the sum of space and string widths

---
 pypdf/_text_extraction/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pypdf/_text_extraction/__init__.py b/pypdf/_text_extraction/__init__.py
index 6fbec903f..9781a07e8 100644
--- a/pypdf/_text_extraction/__init__.py
+++ b/pypdf/_text_extraction/__init__.py
@@ -141,7 +141,7 @@ def crlf_space_check(
                     )
                 text = ""
         elif (
-            (moved_width >= spacewidth + str_widhts * scale_x)
+            (moved_width >= (spacewidth + str_widhts) * scale_x)
             and (output + text)[-1] != " "
         ):
             text += " "

From 3e79f20ebf9790adee0ba0fd50cd40524f120913 Mon Sep 17 00:00:00 2001
From: Ryo Kamei <44630192+ssjkamei@users.noreply.github.com>
Date: Tue, 1 Oct 2024 20:53:53 +0900
Subject: [PATCH 40/59] Update pypdf/_page.py

Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com>
---
 pypdf/_page.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pypdf/_page.py b/pypdf/_page.py
index 322721b56..35bbb7529 100644
--- a/pypdf/_page.py
+++ b/pypdf/_page.py
@@ -1817,7 +1817,7 @@ def _extract_text(
         space_scale = 1.0
         _space_width: float = 500.0  # will be set correctly at first Tf
         _actual_str_size: Dict[str, float] = {
-            "str_widths": 0.0, "space_width": 0.0, "str_height": 0.0}  # will be set string length calculation result
+            "str_widths": 0.0, "space_width": 0.0, "str_height": 0.0}  # will be set to string length calculation result
         TL = 0.0
         font_size = 12.0  # init just in case of
 

From e33b65fc7455f74c06484466ef5f256ccc35f92b Mon Sep 17 00:00:00 2001
From: Ryo Kamei <44630192+ssjkamei@users.noreply.github.com>
Date: Tue, 1 Oct 2024 21:08:44 +0900
Subject: [PATCH 41/59] Update pypdf/_text_extraction/__init__.py

Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com>
---
 pypdf/_text_extraction/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pypdf/_text_extraction/__init__.py b/pypdf/_text_extraction/__init__.py
index 9781a07e8..36e4914be 100644
--- a/pypdf/_text_extraction/__init__.py
+++ b/pypdf/_text_extraction/__init__.py
@@ -98,7 +98,7 @@ def crlf_space_check(
     output: str,
     font_size: float,
     visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
-    str_widhts: float,
+    str_widths: float,
     spacewidth: float,
     str_height: float,
 ) -> Tuple[str, str, List[float], List[float]]:

From bef78627d714003bc39bf42ddf9aa5feeeddea09 Mon Sep 17 00:00:00 2001
From: Ryo Kamei <r-kamei@sixsquare.co.jp>
Date: Tue, 1 Oct 2024 21:23:02 +0900
Subject: [PATCH 42/59] Typo

---
 pypdf/_text_extraction/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pypdf/_text_extraction/__init__.py b/pypdf/_text_extraction/__init__.py
index 36e4914be..7f86d7807 100644
--- a/pypdf/_text_extraction/__init__.py
+++ b/pypdf/_text_extraction/__init__.py
@@ -141,7 +141,7 @@ def crlf_space_check(
                     )
                 text = ""
         elif (
-            (moved_width >= (spacewidth + str_widhts) * scale_x)
+            (moved_width >= (spacewidth + str_widths) * scale_x)
             and (output + text)[-1] != " "
         ):
             text += " "

From 2b0e5305a317bf637a4e17dc5949364876a61efe Mon Sep 17 00:00:00 2001
From: Ryo Kamei <r-kamei@sixsquare.co.jp>
Date: Tue, 1 Oct 2024 21:25:36 +0900
Subject: [PATCH 43/59] Modifying a comment

---
 pypdf/_cmap.py                     | 2 +-
 pypdf/_text_extraction/__init__.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py
index 41512b747..c6202c208 100644
--- a/pypdf/_cmap.py
+++ b/pypdf/_cmap.py
@@ -417,7 +417,7 @@ def build_font_width_map(
     except Exception:
         pass
     if "/DescendantFonts" in ft:  # ft["/Subtype"].startswith("/CIDFontType"):
-        # p271 PDF32000_2008 9.7.4.3 Glyph Metrics in CIDFonts
+        # PDF ref 1.7 9.7.4.3 Glyph Metrics in CIDFonts
         # Widths for a CIDFont are defined using the DW and W entries.
         # DW2 and W2 are for vertical use. Vertical type is not implemented.
         ft1 = ft["/DescendantFonts"][0].get_object()  # type: ignore
diff --git a/pypdf/_text_extraction/__init__.py b/pypdf/_text_extraction/__init__.py
index 7f86d7807..9eca070e0 100644
--- a/pypdf/_text_extraction/__init__.py
+++ b/pypdf/_text_extraction/__init__.py
@@ -114,7 +114,7 @@ def crlf_space_check(
     orientation = orient(m)
     delta_x = m[4] - m_prev[4]
     delta_y = m[5] - m_prev[5]
-    # PDF 32000-1:2008 p249 Table 108 Text positioning operators
+    # PDF ref 1.7 Table 108 Text positioning operators
     scale_x = math.sqrt(tm_prev[0]**2 + tm_prev[1]**2)
     scale_y = math.sqrt(tm_prev[2]**2 + tm_prev[3]**2)
     cm_prev = m

From cb5bf4a265a9dfae6659a7c3d22b98751f7fae3a Mon Sep 17 00:00:00 2001
From: Ryo Kamei <r-kamei@sixsquare.co.jp>
Date: Tue, 1 Oct 2024 21:36:20 +0900
Subject: [PATCH 44/59] Catch and raise more specific exception types

---
 pypdf/_cmap.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py
index c6202c208..38fff47c7 100644
--- a/pypdf/_cmap.py
+++ b/pypdf/_cmap.py
@@ -85,7 +85,7 @@ def build_char_map_from_dict(
             sp = sp[0] + 256 * sp[1]
         try:
             sp = ord(map_dict[chr(sp)])
-        except Exception:
+        except KeyError:
             pass
     else:
         sp = space_code
@@ -439,7 +439,7 @@ def build_font_width_map(
                     try:
                         conversion_char = map_dict[chr(c_code)]
                         font_width_map[ord(conversion_char)] = w[2]
-                    except Exception:
+                    except KeyError:
                         pass
                 w = w[3:]
             elif isinstance(second, list):
@@ -449,7 +449,7 @@ def build_font_width_map(
                     try:
                         conversion_char = map_dict[chr(c_code)]
                         font_width_map[ord(conversion_char)] = width
-                    except Exception:
+                    except KeyError:
                         pass
                     c_code += 1
                 w = w[2:]
@@ -481,7 +481,7 @@ def build_font_width_map(
             try:
                 width = w[c_code - st].get_object()
                 font_width_map[c_code] = width
-            except Exception:
+            except KeyError:
                 # The PDF structure is invalid. The array is too small
                 # for the specified font width.
                 pass
@@ -496,8 +496,8 @@ def compute_space_width(
     try:
         sp_width = font_width_map[sp]
         if sp_width == 0:
-            raise Exception("Zero width")
-    except Exception:
+            raise ValueError("Zero width")
+    except ValueError:
         sp_width = (
             font_width_map["default"] / 2.0
         )  # if using default we consider space will be only half size
@@ -512,7 +512,7 @@ def compute_font_width(
     char_width: float = 0.0
     try:
         char_width = font_width_map[char_code]
-    except Exception:
+    except KeyError:
         char_width = (
             font_width_map["default"]
         )

From c8ce234be8eb62b199dabc1956e2125d7b90b33c Mon Sep 17 00:00:00 2001
From: Ryo Kamei <r-kamei@sixsquare.co.jp>
Date: Tue, 1 Oct 2024 21:49:11 +0900
Subject: [PATCH 45/59] Allow list conversion of /Widths other than ArrayObject

---
 pypdf/_cmap.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py
index 38fff47c7..699f6c5de 100644
--- a/pypdf/_cmap.py
+++ b/pypdf/_cmap.py
@@ -5,7 +5,6 @@
 from ._codecs import adobe_glyphs, charset_encoding
 from ._utils import logger_error, logger_warning
 from .generic import (
-    ArrayObject,
     DecodedStreamObject,
     DictionaryObject,
     StreamObject,
@@ -460,7 +459,10 @@ def build_font_width_map(
                 )
                 break
     elif "/Widths" in ft:
-        w = list(ft["/Widths"]) if isinstance(ft["/Widths"], ArrayObject) else []
+        try:
+            w = cast(list, ft["/Widths"].get_object())
+        except Exception:
+            w = []
         if "/FontDescriptor" in ft and "/MissingWidth" in cast(
             DictionaryObject, ft["/FontDescriptor"]
         ):

From fd82bde465fd001e02ad392435e7508131ecdedb Mon Sep 17 00:00:00 2001
From: Ryo Kamei <r-kamei@sixsquare.co.jp>
Date: Tue, 1 Oct 2024 21:57:39 +0900
Subject: [PATCH 46/59] Also catch KeyError in compute_space_width

---
 pypdf/_cmap.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py
index 699f6c5de..69b3d15f2 100644
--- a/pypdf/_cmap.py
+++ b/pypdf/_cmap.py
@@ -499,7 +499,7 @@ def compute_space_width(
         sp_width = font_width_map[sp]
         if sp_width == 0:
             raise ValueError("Zero width")
-    except ValueError:
+    except (KeyError, ValueError):
         sp_width = (
             font_width_map["default"] / 2.0
         )  # if using default we consider space will be only half size

From d79da5b815e58ad32ca93135818336c69d9f7d82 Mon Sep 17 00:00:00 2001
From: Ryo Kamei <r-kamei@sixsquare.co.jp>
Date: Tue, 1 Oct 2024 22:00:16 +0900
Subject: [PATCH 47/59] Explicit description of type

---
 pypdf/_cmap.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py
index 69b3d15f2..7cd8b7dee 100644
--- a/pypdf/_cmap.py
+++ b/pypdf/_cmap.py
@@ -7,6 +7,7 @@
 from .generic import (
     DecodedStreamObject,
     DictionaryObject,
+    PdfObject,
     StreamObject,
     is_null_or_none,
 )
@@ -460,7 +461,7 @@ def build_font_width_map(
                 break
     elif "/Widths" in ft:
         try:
-            w = cast(list, ft["/Widths"].get_object())
+            w: List[PdfObject] = cast(list, ft["/Widths"].get_object())
         except Exception:
             w = []
         if "/FontDescriptor" in ft and "/MissingWidth" in cast(

From 98ccb3a8521bb8e267caab8e9071bfbcbc130bbf Mon Sep 17 00:00:00 2001
From: Ryo Kamei <r-kamei@sixsquare.co.jp>
Date: Tue, 1 Oct 2024 22:06:21 +0900
Subject: [PATCH 48/59] style: Correcting code style issues

---
 pypdf/_cmap.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py
index 7cd8b7dee..110fd6786 100644
--- a/pypdf/_cmap.py
+++ b/pypdf/_cmap.py
@@ -421,6 +421,7 @@ def build_font_width_map(
         # Widths for a CIDFont are defined using the DW and W entries.
         # DW2 and W2 are for vertical use. Vertical type is not implemented.
         ft1 = ft["/DescendantFonts"][0].get_object()  # type: ignore
+        w: List[PdfObject] = []
         try:
             font_width_map["default"] = cast(float, ft1["/DW"])
         except Exception:
@@ -461,7 +462,7 @@ def build_font_width_map(
                 break
     elif "/Widths" in ft:
         try:
-            w: List[PdfObject] = cast(list, ft["/Widths"].get_object())
+            w = cast(list, ft["/Widths"].get_object())
         except Exception:
             w = []
         if "/FontDescriptor" in ft and "/MissingWidth" in cast(

From b13b97fc125e839d628863e9cf0f46fc9158a76f Mon Sep 17 00:00:00 2001
From: Ryo Kamei <r-kamei@sixsquare.co.jp>
Date: Tue, 1 Oct 2024 22:35:05 +0900
Subject: [PATCH 49/59] Convert character map keys from int(ord) to str

---
 pypdf/_cmap.py | 12 ++++++------
 pypdf/_page.py |  2 +-
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py
index 110fd6786..74b2227f8 100644
--- a/pypdf/_cmap.py
+++ b/pypdf/_cmap.py
@@ -90,7 +90,7 @@ def build_char_map_from_dict(
     else:
         sp = space_code
     font_width_map = build_font_width_map(ft, map_dict, space_width * 2.0)
-    half_space_width = compute_space_width(font_width_map, sp) / 2.0
+    half_space_width = compute_space_width(font_width_map, chr(sp)) / 2.0
 
     return (
         font_type,
@@ -449,7 +449,7 @@ def build_font_width_map(
                 for width in second:
                     try:
                         conversion_char = map_dict[chr(c_code)]
-                        font_width_map[ord(conversion_char)] = width
+                        font_width_map[conversion_char] = width
                     except KeyError:
                         pass
                     c_code += 1
@@ -484,8 +484,8 @@ def build_font_width_map(
         for c_code in range(st, en + 1):
             try:
                 width = w[c_code - st].get_object()
-                font_width_map[c_code] = width
-            except KeyError:
+                font_width_map[chr(c_code)] = width
+            except IndexError:
                 # The PDF structure is invalid. The array is too small
                 # for the specified font width.
                 pass
@@ -511,11 +511,11 @@ def compute_space_width(
 
 def compute_font_width(
     font_width_map: Dict[Any, float],
-    char_code: int
+    char: str
 ) -> float:
     char_width: float = 0.0
     try:
-        char_width = font_width_map[char_code]
+        char_width = font_width_map[char]
     except KeyError:
         char_width = (
             font_width_map["default"]
diff --git a/pypdf/_page.py b/pypdf/_page.py
index 35bbb7529..6349cc62f 100644
--- a/pypdf/_page.py
+++ b/pypdf/_page.py
@@ -1734,7 +1734,7 @@ def _get_acutual_font_widths(
         if add_text:
             for char in add_text:
                 if font_width_map:
-                    font_widths += compute_font_width(font_width_map, ord(char))
+                    font_widths += compute_font_width(font_width_map, char)
                 else:
                     font_widths += default_space_width * 2
         return (font_widths * font_size, default_space_width * font_size, font_size)

From ef7331522f9a8f17e6641f2358b85f1f2054a6e6 Mon Sep 17 00:00:00 2001
From: Ryo Kamei <r-kamei@sixsquare.co.jp>
Date: Tue, 1 Oct 2024 23:01:49 +0900
Subject: [PATCH 50/59] Style: Correcting code style issues

---
 pypdf/_cmap.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py
index 74b2227f8..68623419d 100644
--- a/pypdf/_cmap.py
+++ b/pypdf/_cmap.py
@@ -421,7 +421,7 @@ def build_font_width_map(
         # Widths for a CIDFont are defined using the DW and W entries.
         # DW2 and W2 are for vertical use. Vertical type is not implemented.
         ft1 = ft["/DescendantFonts"][0].get_object()  # type: ignore
-        w: List[PdfObject] = []
+        w: List[Union[int, PdfObject]] = []
         try:
             font_width_map["default"] = cast(float, ft1["/DW"])
         except Exception:
@@ -483,7 +483,11 @@ def build_font_width_map(
         en = cast(int, ft["/LastChar"])
         for c_code in range(st, en + 1):
             try:
-                width = w[c_code - st].get_object()
+                width_obj = w[c_code - st].get_object()
+                if is_null_or_none(width_obj):
+                    width = 0.0
+                else:
+                    width = float(width_obj)
                 font_width_map[chr(c_code)] = width
             except IndexError:
                 # The PDF structure is invalid. The array is too small
@@ -495,10 +499,10 @@ def build_font_width_map(
 
 
 def compute_space_width(
-    font_width_map: Dict[Any, float], sp: int
+    font_width_map: Dict[Any, float], space_char: str
 ) -> float:
     try:
-        sp_width = font_width_map[sp]
+        sp_width = font_width_map[space_char]
         if sp_width == 0:
             raise ValueError("Zero width")
     except (KeyError, ValueError):

From f8841602ad4631e57cd8d2c3c23a3b16dbdf58ad Mon Sep 17 00:00:00 2001
From: Ryo Kamei <44630192+ssjkamei@users.noreply.github.com>
Date: Tue, 1 Oct 2024 23:07:16 +0900
Subject: [PATCH 51/59] Update pypdf/_cmap.py

Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com>
---
 pypdf/_cmap.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py
index 68623419d..0aaa27dc0 100644
--- a/pypdf/_cmap.py
+++ b/pypdf/_cmap.py
@@ -417,7 +417,7 @@ def build_font_width_map(
     except Exception:
         pass
     if "/DescendantFonts" in ft:  # ft["/Subtype"].startswith("/CIDFontType"):
-        # PDF ref 1.7 9.7.4.3 Glyph Metrics in CIDFonts
+        # §9.7.4.3 of the 1.7 reference ("Glyph Metrics in CIDFonts")
         # Widths for a CIDFont are defined using the DW and W entries.
         # DW2 and W2 are for vertical use. Vertical type is not implemented.
         ft1 = ft["/DescendantFonts"][0].get_object()  # type: ignore

From 20a6883394f6807b0c6d22063c707d6e45e6ed50 Mon Sep 17 00:00:00 2001
From: Ryo Kamei <44630192+ssjkamei@users.noreply.github.com>
Date: Tue, 1 Oct 2024 23:07:38 +0900
Subject: [PATCH 52/59] Update pypdf/_text_extraction/__init__.py

Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com>
---
 pypdf/_text_extraction/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pypdf/_text_extraction/__init__.py b/pypdf/_text_extraction/__init__.py
index 9eca070e0..89cdb0f2a 100644
--- a/pypdf/_text_extraction/__init__.py
+++ b/pypdf/_text_extraction/__init__.py
@@ -114,7 +114,7 @@ def crlf_space_check(
     orientation = orient(m)
     delta_x = m[4] - m_prev[4]
     delta_y = m[5] - m_prev[5]
-    # PDF ref 1.7 Table 108 Text positioning operators
+    # Table 108 of the 1.7 reference ("Text positioning operators")
     scale_x = math.sqrt(tm_prev[0]**2 + tm_prev[1]**2)
     scale_y = math.sqrt(tm_prev[2]**2 + tm_prev[3]**2)
     cm_prev = m

From e6132faf1ae93f9156e1a63ec9f7fd94832dd796 Mon Sep 17 00:00:00 2001
From: Ryo Kamei <r-kamei@sixsquare.co.jp>
Date: Wed, 2 Oct 2024 13:25:15 +0900
Subject: [PATCH 53/59] Also catch KeyError in build_font_width_map

---
 pypdf/_cmap.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py
index 0aaa27dc0..ed8ebb462 100644
--- a/pypdf/_cmap.py
+++ b/pypdf/_cmap.py
@@ -489,7 +489,7 @@ def build_font_width_map(
                 else:
                     width = float(width_obj)
                 font_width_map[chr(c_code)] = width
-            except IndexError:
+            except (IndexError, KeyError):
                 # The PDF structure is invalid. The array is too small
                 # for the specified font width.
                 pass

From 9a82eb8f7517dd63ff116e8a1280ae5b46b8f78b Mon Sep 17 00:00:00 2001
From: ryo kamei <r-kamei@sixsquare.co.jp>
Date: Wed, 2 Oct 2024 20:42:34 +0900
Subject: [PATCH 54/59] Style: Correcting code style issues

---
 pypdf/_cmap.py | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py
index ed8ebb462..7d24ab82d 100644
--- a/pypdf/_cmap.py
+++ b/pypdf/_cmap.py
@@ -421,15 +421,12 @@ def build_font_width_map(
         # Widths for a CIDFont are defined using the DW and W entries.
         # DW2 and W2 are for vertical use. Vertical type is not implemented.
         ft1 = ft["/DescendantFonts"][0].get_object()  # type: ignore
-        w: List[Union[int, PdfObject]] = []
         try:
             font_width_map["default"] = cast(float, ft1["/DW"])
         except Exception:
             font_width_map["default"] = default_font_width
         if "/W" in ft1:
-            w = list(ft1["/W"])
-        else:
-            w = []
+            w = ft1["/W"].get_object()
         while len(w) > 0:
             st = w[0] if isinstance(w[0], int) else w[0].get_object()
             second = w[1].get_object()
@@ -461,10 +458,7 @@ def build_font_width_map(
                 )
                 break
     elif "/Widths" in ft:
-        try:
-            w = cast(list, ft["/Widths"].get_object())
-        except Exception:
-            w = []
+        w = ft["/Widths"].get_object()
         if "/FontDescriptor" in ft and "/MissingWidth" in cast(
             DictionaryObject, ft["/FontDescriptor"]
         ):

From d4f183534f39867806a4a0059e16432117bbf558 Mon Sep 17 00:00:00 2001
From: ryo kamei <r-kamei@sixsquare.co.jp>
Date: Wed, 2 Oct 2024 20:45:13 +0900
Subject: [PATCH 55/59] Style: Correcting code style issues

---
 pypdf/_cmap.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py
index 7d24ab82d..bf119b268 100644
--- a/pypdf/_cmap.py
+++ b/pypdf/_cmap.py
@@ -7,7 +7,6 @@
 from .generic import (
     DecodedStreamObject,
     DictionaryObject,
-    PdfObject,
     StreamObject,
     is_null_or_none,
 )

From 96fcf7c995c3508dc47eee81650e0e2e1c4b7423 Mon Sep 17 00:00:00 2001
From: ryo kamei <r-kamei@sixsquare.co.jp>
Date: Wed, 2 Oct 2024 21:22:39 +0900
Subject: [PATCH 56/59] Fix regression: restore empty fallback when /W is absent

---
 pypdf/_cmap.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py
index bf119b268..5731e8eb1 100644
--- a/pypdf/_cmap.py
+++ b/pypdf/_cmap.py
@@ -426,6 +426,8 @@ def build_font_width_map(
             font_width_map["default"] = default_font_width
         if "/W" in ft1:
             w = ft1["/W"].get_object()
+        else:
+            w = []
         while len(w) > 0:
             st = w[0] if isinstance(w[0], int) else w[0].get_object()
             second = w[1].get_object()

From 780a6321c2250da69737ae16a95d23a5c74fb3a4 Mon Sep 17 00:00:00 2001
From: ryo kamei <r-kamei@sixsquare.co.jp>
Date: Wed, 2 Oct 2024 22:47:32 +0900
Subject: [PATCH 57/59] Fix regression: key /W range widths by character, not ord()

---
 pypdf/_cmap.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py
index 5731e8eb1..bf990a344 100644
--- a/pypdf/_cmap.py
+++ b/pypdf/_cmap.py
@@ -437,7 +437,7 @@ def build_font_width_map(
                 for c_code in range(st, en + 1):
                     try:
                         conversion_char = map_dict[chr(c_code)]
-                        font_width_map[ord(conversion_char)] = w[2]
+                        font_width_map[conversion_char] = w[2]
                     except KeyError:
                         pass
                 w = w[3:]

From ce11d0de01cbdd19e89e4d0ede3c19d36c83573b Mon Sep 17 00:00:00 2001
From: ryo kamei <r-kamei@sixsquare.co.jp>
Date: Wed, 2 Oct 2024 23:07:29 +0900
Subject: [PATCH 58/59] Consider both previous and current font heights when
 detecting line breaks

---
 pypdf/_text_extraction/__init__.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/pypdf/_text_extraction/__init__.py b/pypdf/_text_extraction/__init__.py
index 89cdb0f2a..a1c0d1d91 100644
--- a/pypdf/_text_extraction/__init__.py
+++ b/pypdf/_text_extraction/__init__.py
@@ -115,8 +115,9 @@ def crlf_space_check(
     delta_x = m[4] - m_prev[4]
     delta_y = m[5] - m_prev[5]
     # Table 108 of the 1.7 reference ("Text positioning operators")
-    scale_x = math.sqrt(tm_prev[0]**2 + tm_prev[1]**2)
-    scale_y = math.sqrt(tm_prev[2]**2 + tm_prev[3]**2)
+    scale_prev_x = math.sqrt(tm_prev[0]**2 + tm_prev[1]**2)
+    scale_prev_y = math.sqrt(tm_prev[2]**2 + tm_prev[3]**2)
+    scale_y = math.sqrt(tm_matrix[2]**2 + tm_matrix[3]**2)
     cm_prev = m
 
     if orientation not in orientations:
@@ -128,7 +129,7 @@ def crlf_space_check(
         moved_height = delta_x
         moved_width = delta_y
     try:
-        if abs(moved_height) > 0.8 * str_height * scale_y:
+        if abs(moved_height) > 0.8 * min(str_height * scale_prev_y, font_size * scale_y):
             if (output + text)[-1] != "\n":
                 output += text + "\n"
                 if visitor_text is not None:
@@ -141,7 +142,7 @@ def crlf_space_check(
                     )
                 text = ""
         elif (
-            (moved_width >= (spacewidth + str_widths) * scale_x)
+            (moved_width >= (spacewidth + str_widths) * scale_prev_x)
             and (output + text)[-1] != " "
         ):
             text += " "

From 03eb1cb444097fc532c9fc4fd243fc221dcc3cc1 Mon Sep 17 00:00:00 2001
From: ryo kamei <r-kamei@sixsquare.co.jp>
Date: Wed, 2 Oct 2024 23:30:17 +0900
Subject: [PATCH 59/59] style: Correcting code style issues

---
 pypdf/_cmap.py | 6 +-----
 pypdf/_page.py | 5 +----
 2 files changed, 2 insertions(+), 9 deletions(-)

diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py
index bf990a344..fde795b01 100644
--- a/pypdf/_cmap.py
+++ b/pypdf/_cmap.py
@@ -478,11 +478,7 @@ def build_font_width_map(
         en = cast(int, ft["/LastChar"])
         for c_code in range(st, en + 1):
             try:
-                width_obj = w[c_code - st].get_object()
-                if is_null_or_none(width_obj):
-                    width = 0.0
-                else:
-                    width = float(width_obj)
+                width = w[c_code - st].get_object()
                 font_width_map[chr(c_code)] = width
             except (IndexError, KeyError):
                 # The PDF structure is invalid. The array is too small
diff --git a/pypdf/_page.py b/pypdf/_page.py
index 6349cc62f..c49a68c33 100644
--- a/pypdf/_page.py
+++ b/pypdf/_page.py
@@ -1733,10 +1733,7 @@ def _get_acutual_font_widths(
         font_width_map: Dict[Any, float] = self._font_width_maps[font_name]
         if add_text:
             for char in add_text:
-                if font_width_map:
-                    font_widths += compute_font_width(font_width_map, char)
-                else:
-                    font_widths += default_space_width * 2
+                font_widths += compute_font_width(font_width_map, char)
         return (font_widths * font_size, default_space_width * font_size, font_size)
 
     def _extract_text(