py-pdf · MartinThoma · Dec 24, 2023
diff --git a/pypdf/_page.py b/pypdf/_page.py
@@ -1873,6 +1873,7 @@ def _extract_text(
         visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
         visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
         visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
+        group_TJ: bool = True,
     ) -> str:
         """
         See extract_text for most arguments.
@@ -1957,16 +1958,12 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
             if operator == b"BT":
                 tm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
                 output += text
-                if visitor_text is not None:
-                    visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
                 text = ""
                 memo_cm = cm_matrix.copy()
                 memo_tm = tm_matrix.copy()
                 return None
             elif operator == b"ET":
                 output += text
-                if visitor_text is not None:
-                    visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
                 text = ""
                 memo_cm = cm_matrix.copy()
                 memo_tm = tm_matrix.copy()
@@ -1999,8 +1996,6 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
                     cm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
             elif operator == b"cm":
                 output += text
-                if visitor_text is not None:
-                    visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
                 text = ""
                 cm_matrix = mult(
                     [
@@ -2025,8 +2020,6 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
             elif operator == b"Tf":
                 if text != "":
                     output += text  # .translate(cmap)
-                    if visitor_text is not None:
-                        visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
                 text = ""
                 memo_cm = cm_matrix.copy()
                 memo_tm = tm_matrix.copy()
@@ -2132,6 +2125,34 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
                 process_operation(b"TL", [-operands[1]])
                 process_operation(b"Td", operands)
             elif operator == b"TJ":
+                if visitor_text is not None and group_TJ:
+                    # To prevent sending letters instead of words we
+                    # override the visitor temporarily.
+                    visitor_text_before = visitor_text
+                    tm_matrix_before = [
+                        tm_matrix[0],
+                        tm_matrix[1],
+                        tm_matrix[2],
+                        tm_matrix[3],
+                        tm_matrix[4],
+                        tm_matrix[5],
+                    ]
+                    text_TJ: List[str] = []
+
+                    def visitor_text(
+                        text: str,
+                        cm_matrix: Any,
+                        tm_matrix: Any,
+                        font_dict: Any,
+                        font_size: Any,
+                    ) -> None:
+                        # TODO: cases where the current insertion order is kept
+                        if rtl_dir:
+                            # right-to-left
+                            text_TJ.insert(0, text)  # noqa
+                        else:
+                            text_TJ.append(text)  # noqa
+
                 for op in operands[0]:
                     if isinstance(op, (str, bytes)):
                         process_operation(b"Tj", [op])
@@ -2141,10 +2162,17 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
                         and (text[-1] != " ")
                     ):
                         process_operation(b"Tj", [" "])
+                if visitor_text is not None and group_TJ:
+                    visitor_text = visitor_text_before
+                    visitor_text(
+                        "".join(text_TJ),
+                        cm_matrix,
+                        tm_matrix_before,
+                        cmap[3],
+                        font_size,
+                    )
             elif operator == b"Do":
                 output += text
-                if visitor_text is not None:
-                    visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
                 try:
                     if output[-1] != "\n":
                         output += "\n"
@@ -2168,16 +2196,9 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
                             visitor_operand_before,
                             visitor_operand_after,
                             visitor_text,
+                            group_TJ,
                         )
                         output += text
-                        if visitor_text is not None:
-                            visitor_text(
-                                text,
-                                memo_cm,
-                                memo_tm,
-                                cmap[3],
-                                font_size,
-                            )
                 except Exception:
                     logger_warning(
                         f" impossible to decode XFormObject {operands[0]}",
@@ -2193,8 +2214,6 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
             if visitor_operand_after is not None:
                 visitor_operand_after(operator, operands, cm_matrix, tm_matrix)
         output += text  # just in case of
-        if text != "" and visitor_text is not None:
-            visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
         return output
 
     def extract_text(
@@ -2207,6 +2226,7 @@ def extract_text(
         visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
         visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
         visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
+        group_TJ: bool = True,
     ) -> str:
         """
         Locate all text drawing commands, in the order they are provided in the
@@ -2246,6 +2266,8 @@ def extract_text(
                 text matrix, font-dictionary and font-size.
                 The font-dictionary may be None in case of unknown fonts.
                 If not None it may e.g. contain key "/BaseFont" with value "/Arial,Bold".
+            group_TJ: If True, visitor_text is called once per TJ operator;
+                if False, it is called for each text fragment of the TJ operator.
 
         Returns:
             The extracted text
@@ -2295,6 +2317,7 @@ def extract_text(
             visitor_operand_before,
             visitor_operand_after,
             visitor_text,
+            group_TJ,
         )
 
     def extract_xform_text(
@@ -2305,6 +2328,7 @@ def extract_xform_text(
         visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
         visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
         visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
+        group_TJ: bool = True,
     ) -> str:
         """
         Extract text from an XObject.
@@ -2316,6 +2340,8 @@ def extract_xform_text(
             visitor_operand_before:
             visitor_operand_after:
             visitor_text:
+            group_TJ: If True, visitor_text is called once per TJ operator;
+                if False, it is called for each text fragment of the TJ operator.
 
         Returns:
             The extracted text

diff --git a/pypdf/_text_extraction/__init__.py b/pypdf/_text_extraction/__init__.py
@@ -123,7 +123,7 @@ def crlf_space_check(
                     output += text + "\n"
                     if visitor_text is not None:
                         visitor_text(
-                            text + "\n",
+                            "\n",
                             memo_cm,
                             memo_tm,
                             cmap[3],
@@ -136,13 +136,21 @@ def crlf_space_check(
                 and (output + text)[-1] != " "
             ):
                 text += " "
+                if visitor_text is not None:
+                    visitor_text(
+                        " ",
+                        cm_matrix,
+                        tm_matrix,
+                        cmap[3],
+                        font_size,
+                    )
         elif orientation == 180:
             if delta_y > 0.8 * f:
                 if (output + text)[-1] != "\n":
                     output += text + "\n"
                     if visitor_text is not None:
                         visitor_text(
-                            text + "\n",
+                            "\n",
                             memo_cm,
                             memo_tm,
                             cmap[3],
@@ -155,13 +163,21 @@ def crlf_space_check(
                 and (output + text)[-1] != " "
             ):
                 text += " "
+                if visitor_text is not None:
+                    visitor_text(
+                        " ",
+                        cm_matrix,
+                        tm_matrix,
+                        cmap[3],
+                        font_size,
+                    )
         elif orientation == 90:
             if delta_x > 0.8 * f:
                 if (output + text)[-1] != "\n":
                     output += text + "\n"
                     if visitor_text is not None:
                         visitor_text(
-                            text + "\n",
+                            "\n",
                             memo_cm,
                             memo_tm,
                             cmap[3],
@@ -180,7 +196,7 @@ def crlf_space_check(
                     output += text + "\n"
                     if visitor_text is not None:
                         visitor_text(
-                            text + "\n",
+                            "\n",
                             memo_cm,
                             memo_tm,
                             cmap[3],
@@ -193,6 +209,14 @@ def crlf_space_check(
                 and (output + text)[-1] != " "
             ):
                 text += " "
+                if visitor_text is not None:
+                    visitor_text(
+                        " ",
+                        cm_matrix,
+                        tm_matrix,
+                        cmap[3],
+                        font_size,
+                    )
     except Exception:
         pass
     tm_prev = tm_matrix.copy()
@@ -214,12 +238,13 @@ def handle_tj(
     rtl_dir: bool,
     visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
 ) -> Tuple[str, bool]:
-
     m = mult(tm_matrix, cm_matrix)
     orientation = orient(m)
     if orientation in orientations and len(operands) > 0:
         if isinstance(operands[0], str):
             text += operands[0]
+            if visitor_text is not None:
+                visitor_text(operands[0], cm_matrix, tm_matrix, cmap[3], font_size)
         else:
             t: str = ""
             tt: bytes = (
@@ -243,6 +268,7 @@ def handle_tj(
                     [cmap[0][x] if x in cmap[0] else bytes((x,)).decode() for x in tt]
                 )
             # "\u0590 - \u08FF \uFB50 - \uFDFF"
+            tj_text = ""
             for x in [cmap[1][x] if x in cmap[1] else x for x in t]:
                 # x can be a sequence of bytes ; ex: habibi.pdf
                 if len(x) == 1:
@@ -258,7 +284,7 @@ def handle_tj(
                     or 0x20A0 <= xx <= 0x21FF           # but (numbers) indices/exponents
                     or xx in CUSTOM_RTL_SPECIAL_CHARS   # customized....
                 ):
-                    text = x + text if rtl_dir else text + x
+                    tj_text = x + tj_text if rtl_dir else tj_text + x
                 elif (  # right-to-left characters set
                     0x0590 <= xx <= 0x08FF
                     or 0xFB1D <= xx <= 0xFDFF
@@ -280,6 +306,9 @@ def handle_tj(
                         if visitor_text is not None:
                             visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
                         text = ""
-                    text = text + x
+                    tj_text = tj_text + x
                 # fmt: on
+            text = tj_text + text if rtl_dir else text + tj_text
+            if visitor_text is not None:
+                visitor_text(tj_text, cm_matrix, tm_matrix, cmap[3], font_size)
     return text, rtl_dir
diff --git a/tests/test_page.py b/tests/test_page.py
@@ -1,6 +1,7 @@
 """Test the pypdf._page module."""
 import json
 import math
+import re
 from copy import deepcopy
 from io import BytesIO
 from pathlib import Path
@@ -545,7 +546,7 @@ def print_op_b(op, args, cm_matrix, tm_matrix) -> None:
                     rectangles.append(r)
 
         def print_visi(text, cm_matrix, tm_matrix, font_dict, font_size) -> None:
-            if text.strip() != "":
+            if text != "":
                 if logger.isEnabledFor(logging.DEBUG):
                     logger.debug(f"at {cm_matrix}, {tm_matrix}, font size={font_size}")
                 texts.append(
@@ -571,7 +572,7 @@ def extract_table(
 
         It is expected that each cell is marked by a rectangle-object.
         It is expected that the page contains one table only.
-        It is expected that the table contains at least 3 columns and 2 rows.
+        It is expected that the table contains at least 2 columns and 2 rows.
 
         A list of rows is returned.
         Each row contains a list of cells.
@@ -623,8 +624,8 @@ def extract_table(
         curr_y = None
         curr_row = None
         for r in rectangles_filtered:
-            if col2count[r.x] < 3 or row2count[r.y] < 2:
-                # We expect at least 3 columns and 2 rows.
+            if col2count[r.x] < 2 or row2count[r.y] < 2:
+                # We expect at least 2 columns and 2 rows.
                 continue
             if curr_y is None or r.y != curr_y:
                 # next row
@@ -646,7 +647,8 @@ def extract_table(
 
     def extract_cell_text(cell_texts: List[PositionedText]) -> str:
         """Joins the text-objects of a cell."""
-        return ("".join(t.text for t in cell_texts)).strip()
+        text_raw = "".join(t.text for t in cell_texts)
+        return re.sub(r" +\n", "\n", text_raw.strip())
 
     # Test 1: We test the analysis of page 7 "2.1 LRS model".
     reader = PdfReader(RESOURCE_ROOT / "GeoBase_NHNC1_Data_Model_UML_EN.pdf")
@@ -667,12 +669,16 @@ def ignore_large_rectangles(r) -> bool:
     for t in texts:
         for r in rectangles:
             if r.contains(t.x, t.y):
-                texts = rectangle2texts.setdefault(r, [])
-                texts.append(t.text.strip())
+                rtexts = rectangle2texts.setdefault(r, [])
+                if t.text != "":
+                    rtexts.append(t.text)
                 break
     # Five boxes and the figure-description below.
-    assert len(rectangle2texts) == 6
-    box_texts = [" ".join(texts) for texts in rectangle2texts.values()]
+    assert len(rectangle2texts) == 11
+    box_texts = [
+        re.sub(" *\n", " ", "".join(texts).strip())
+        for texts in rectangle2texts.values()
+    ]
     assert "Hydro Network" in box_texts
     assert "Hydro Events" in box_texts
     assert "Metadata" in box_texts
@@ -697,10 +703,10 @@ def filter_first_table(r) -> bool:
     assert extract_cell_text(rows[0][2]) == "Description"
     assert extract_cell_text(rows[1][0]) == "September 2002"
     # The line break between "English review;"
-    # and "Remove" is not detected.
+    # and "Remove" is detected.
     assert (
         extract_cell_text(rows[6][2])
-        == "English review;Remove the UML model for the Segmented view."
+        == "English review;\nRemove the UML model for the Segmented view."
     )
     assert extract_cell_text(rows[7][2]) == "Update from the March Workshop comments."
 
@@ -738,6 +744,16 @@ def visitor_td(op, args, cm, tm) -> None:
     assert list_td[2] == (210.0, 210.0)
     assert list_td[3] == (410.0, 210.0)
 
+    # Test 3b: check extract_visitor in Sample_Td-matrix.pdf
+    #
+    (texts, rectangles) = extract_text_and_rectangles(page_td_model)
+    rows = extract_table(texts, rectangles)
+    assert len(rows) == 2
+    assert extract_cell_text(rows[0][0]) == "Hello PDF!"
+    assert extract_cell_text(rows[0][1]) == "Hello PDF 200 0 Td!"
+    assert extract_cell_text(rows[1][0]) == "Hello PDF 2 1!"
+    assert extract_cell_text(rows[1][1]) == "Hello PDF 10 7!"
+
 
 @pytest.mark.parametrize(
     ("pdf_path", "password", "embedded", "unembedded"),