Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MAINT: Change the positions of the calls of the visitor-function #2364

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
66 changes: 46 additions & 20 deletions pypdf/_page.py
Expand Up @@ -1873,6 +1873,7 @@ def _extract_text(
visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
group_TJ: bool = True,
) -> str:
"""
See extract_text for most arguments.
Expand Down Expand Up @@ -1957,16 +1958,12 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
if operator == b"BT":
tm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
output += text
if visitor_text is not None:
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
text = ""
memo_cm = cm_matrix.copy()
memo_tm = tm_matrix.copy()
return None
elif operator == b"ET":
output += text
if visitor_text is not None:
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
text = ""
memo_cm = cm_matrix.copy()
memo_tm = tm_matrix.copy()
Expand Down Expand Up @@ -1999,8 +1996,6 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
cm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
elif operator == b"cm":
output += text
if visitor_text is not None:
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
text = ""
cm_matrix = mult(
[
Expand All @@ -2025,8 +2020,6 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
elif operator == b"Tf":
if text != "":
output += text # .translate(cmap)
if visitor_text is not None:
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
text = ""
memo_cm = cm_matrix.copy()
memo_tm = tm_matrix.copy()
Expand Down Expand Up @@ -2132,6 +2125,34 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
process_operation(b"TL", [-operands[1]])
process_operation(b"Td", operands)
elif operator == b"TJ":
if visitor_text is not None and group_TJ:
# To prevent sending letters instead of words we
# override the visitor temporarily.
visitor_text_before = visitor_text
tm_matrix_before = [
tm_matrix[0],
tm_matrix[1],
tm_matrix[2],
tm_matrix[3],
tm_matrix[4],
tm_matrix[5],
]
text_TJ: List[str] = []

def visitor_text(
    text: str,
    cm_matrix: Any,
    tm_matrix: Any,
    font_dict: Any,
    font_size: Any,
) -> None:
    """
    Collect one text fragment of the TJ operation being processed.

    This function temporarily takes the place of the visitor while a TJ
    array is handled: each fragment is stored in ``text_TJ`` instead of
    being reported on its own. Only ``text`` is used; the remaining
    arguments exist so the signature matches the visitor callback.
    """
    # TODO cases where the current inserting order is kept
    # Right-to-left fragments are placed at the front of the list,
    # all others at the end.
    insert_at = 0 if rtl_dir else len(text_TJ)
    text_TJ.insert(insert_at, text)  # noqa

for op in operands[0]:
if isinstance(op, (str, bytes)):
process_operation(b"Tj", [op])
Expand All @@ -2141,10 +2162,17 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
and (text[-1] != " ")
):
process_operation(b"Tj", [" "])
if visitor_text is not None and group_TJ:
visitor_text = visitor_text_before
visitor_text(
"".join(text_TJ),
cm_matrix,
tm_matrix_before,
cmap[3],
font_size,
)
elif operator == b"Do":
output += text
if visitor_text is not None:
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
try:
if output[-1] != "\n":
output += "\n"
Expand All @@ -2168,16 +2196,9 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
visitor_operand_before,
visitor_operand_after,
visitor_text,
group_TJ,
)
output += text
if visitor_text is not None:
visitor_text(
text,
memo_cm,
memo_tm,
cmap[3],
font_size,
)
except Exception:
logger_warning(
f" impossible to decode XFormObject {operands[0]}",
Expand All @@ -2193,8 +2214,6 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
if visitor_operand_after is not None:
visitor_operand_after(operator, operands, cm_matrix, tm_matrix)
output += text # just in case of
if text != "" and visitor_text is not None:
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
return output

def extract_text(
Expand All @@ -2207,6 +2226,7 @@ def extract_text(
visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
group_TJ: bool = True,
) -> str:
"""
Locate all text drawing commands, in the order they are provided in the
Expand Down Expand Up @@ -2246,6 +2266,8 @@ def extract_text(
text matrix, font-dictionary and font-size.
The font-dictionary may be None in case of unknown fonts.
If not None it may e.g. contain key "/BaseFont" with value "/Arial,Bold".
group_TJ: True for one call of visitor_text at each TJ,
False for calls of visitor_text at each text-fragment of TJ.

Returns:
The extracted text
Expand Down Expand Up @@ -2295,6 +2317,7 @@ def extract_text(
visitor_operand_before,
visitor_operand_after,
visitor_text,
group_TJ,
)

def extract_xform_text(
Expand All @@ -2305,6 +2328,7 @@ def extract_xform_text(
visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
group_TJ: bool = True,
) -> str:
"""
Extract text from an XObject.
Expand All @@ -2316,6 +2340,8 @@ def extract_xform_text(
visitor_operand_before:
visitor_operand_after:
visitor_text:
group_TJ: True for one call of visitor_text at each TJ,
False for calls of visitor_text at each text-fragment of TJ.

Returns:
The extracted text
Expand Down
43 changes: 36 additions & 7 deletions pypdf/_text_extraction/__init__.py
Expand Up @@ -123,7 +123,7 @@ def crlf_space_check(
output += text + "\n"
if visitor_text is not None:
visitor_text(
text + "\n",
"\n",
memo_cm,
memo_tm,
cmap[3],
Expand All @@ -136,13 +136,21 @@ def crlf_space_check(
and (output + text)[-1] != " "
):
text += " "
if visitor_text is not None:
visitor_text(
" ",
cm_matrix,
tm_matrix,
cmap[3],
font_size,
)
elif orientation == 180:
if delta_y > 0.8 * f:
if (output + text)[-1] != "\n":
output += text + "\n"
if visitor_text is not None:
visitor_text(
text + "\n",
"\n",
memo_cm,
memo_tm,
cmap[3],
Expand All @@ -155,13 +163,21 @@ def crlf_space_check(
and (output + text)[-1] != " "
):
text += " "
if visitor_text is not None:
visitor_text(
" ",
cm_matrix,
tm_matrix,
cmap[3],
font_size,
)
elif orientation == 90:
if delta_x > 0.8 * f:
if (output + text)[-1] != "\n":
output += text + "\n"
if visitor_text is not None:
visitor_text(
text + "\n",
"\n",
memo_cm,
memo_tm,
cmap[3],
Expand All @@ -180,7 +196,7 @@ def crlf_space_check(
output += text + "\n"
if visitor_text is not None:
visitor_text(
text + "\n",
"\n",
memo_cm,
memo_tm,
cmap[3],
Expand All @@ -193,6 +209,14 @@ def crlf_space_check(
and (output + text)[-1] != " "
):
text += " "
if visitor_text is not None:
visitor_text(
" ",
cm_matrix,
tm_matrix,
cmap[3],
font_size,
)
except Exception:
pass
tm_prev = tm_matrix.copy()
Expand All @@ -214,12 +238,13 @@ def handle_tj(
rtl_dir: bool,
visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
) -> Tuple[str, bool]:

m = mult(tm_matrix, cm_matrix)
orientation = orient(m)
if orientation in orientations and len(operands) > 0:
if isinstance(operands[0], str):
text += operands[0]
if visitor_text is not None:
visitor_text(operands[0], cm_matrix, tm_matrix, cmap[3], font_size)
else:
t: str = ""
tt: bytes = (
Expand All @@ -243,6 +268,7 @@ def handle_tj(
[cmap[0][x] if x in cmap[0] else bytes((x,)).decode() for x in tt]
)
# "\u0590 - \u08FF \uFB50 - \uFDFF"
tj_text = ""
for x in [cmap[1][x] if x in cmap[1] else x for x in t]:
# x can be a sequence of bytes ; ex: habibi.pdf
if len(x) == 1:
Expand All @@ -258,7 +284,7 @@ def handle_tj(
or 0x20A0 <= xx <= 0x21FF # but (numbers) indices/exponents
or xx in CUSTOM_RTL_SPECIAL_CHARS # customized....
):
text = x + text if rtl_dir else text + x
tj_text = x + tj_text if rtl_dir else tj_text + x
elif ( # right-to-left characters set
0x0590 <= xx <= 0x08FF
or 0xFB1D <= xx <= 0xFDFF
Expand All @@ -280,6 +306,9 @@ def handle_tj(
if visitor_text is not None:
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
text = ""
text = text + x
tj_text = tj_text + x
# fmt: on
text = tj_text + text if rtl_dir else text + tj_text
if visitor_text is not None:
visitor_text(tj_text, cm_matrix, tm_matrix, cmap[3], font_size)
return text, rtl_dir
38 changes: 27 additions & 11 deletions tests/test_page.py
@@ -1,6 +1,7 @@
"""Test the pypdf._page module."""
import json
import math
import re
from copy import deepcopy
from io import BytesIO
from pathlib import Path
Expand Down Expand Up @@ -545,7 +546,7 @@ def print_op_b(op, args, cm_matrix, tm_matrix) -> None:
rectangles.append(r)

def print_visi(text, cm_matrix, tm_matrix, font_dict, font_size) -> None:
if text.strip() != "":
if text != "":
if logger.isEnabledFor(logging.DEBUG):
logger.debug(f"at {cm_matrix}, {tm_matrix}, font size={font_size}")
texts.append(
Expand All @@ -571,7 +572,7 @@ def extract_table(

It is expected that each cell is marked by a rectangle-object.
It is expected that the page contains one table only.
It is expected that the table contains at least 3 columns and 2 rows.
It is expected that the table contains at least 2 columns and 2 rows.

A list of rows is returned.
Each row contains a list of cells.
Expand Down Expand Up @@ -623,8 +624,8 @@ def extract_table(
curr_y = None
curr_row = None
for r in rectangles_filtered:
if col2count[r.x] < 3 or row2count[r.y] < 2:
# We expect at least 3 columns and 2 rows.
if col2count[r.x] < 2 or row2count[r.y] < 2:
# We expect at least 2 columns and 2 rows.
continue
if curr_y is None or r.y != curr_y:
# next row
Expand All @@ -646,7 +647,8 @@ def extract_table(

def extract_cell_text(cell_texts: List[PositionedText]) -> str:
    """
    Join the text-objects of a cell.

    The fragments are concatenated in the given order, leading and
    trailing whitespace of the result is stripped, and any run of spaces
    directly in front of a line break is removed.

    Args:
        cell_texts: The text-objects of one cell; only their ``text``
            attribute is read.

    Returns:
        The normalized text of the cell.
    """
    text_raw = "".join(t.text for t in cell_texts)
    # Remove padding spaces before a line break so that the cell text
    # compares equal regardless of trailing blanks on a line.
    return re.sub(r" +\n", "\n", text_raw.strip())

# Test 1: We test the analysis of page 7 "2.1 LRS model".
reader = PdfReader(RESOURCE_ROOT / "GeoBase_NHNC1_Data_Model_UML_EN.pdf")
Expand All @@ -667,12 +669,16 @@ def ignore_large_rectangles(r) -> bool:
for t in texts:
for r in rectangles:
if r.contains(t.x, t.y):
texts = rectangle2texts.setdefault(r, [])
texts.append(t.text.strip())
rtexts = rectangle2texts.setdefault(r, [])
if t.text != "":
rtexts.append(t.text)
break
# Five boxes and the figure-description below.
assert len(rectangle2texts) == 6
box_texts = [" ".join(texts) for texts in rectangle2texts.values()]
assert len(rectangle2texts) == 11
box_texts = [
re.sub(" *\n", " ", "".join(texts).strip())
for texts in rectangle2texts.values()
]
assert "Hydro Network" in box_texts
assert "Hydro Events" in box_texts
assert "Metadata" in box_texts
Expand All @@ -697,10 +703,10 @@ def filter_first_table(r) -> bool:
assert extract_cell_text(rows[0][2]) == "Description"
assert extract_cell_text(rows[1][0]) == "September 2002"
# The line break between "English review;"
# and "Remove" is not detected.
# and "Remove" is detected.
assert (
extract_cell_text(rows[6][2])
== "English review;Remove the UML model for the Segmented view."
== "English review;\nRemove the UML model for the Segmented view."
)
assert extract_cell_text(rows[7][2]) == "Update from the March Workshop comments."

Expand Down Expand Up @@ -738,6 +744,16 @@ def visitor_td(op, args, cm, tm) -> None:
assert list_td[2] == (210.0, 210.0)
assert list_td[3] == (410.0, 210.0)

# Test 3b: check extract_visitor in Sample_Td-matrix.pdf
#
(texts, rectangles) = extract_text_and_rectangles(page_td_model)
rows = extract_table(texts, rectangles)
assert len(rows) == 2
assert extract_cell_text(rows[0][0]) == "Hello PDF!"
assert extract_cell_text(rows[0][1]) == "Hello PDF 200 0 Td!"
assert extract_cell_text(rows[1][0]) == "Hello PDF 2 1!"
assert extract_cell_text(rows[1][1]) == "Hello PDF 10 7!"


@pytest.mark.parametrize(
("pdf_path", "password", "embedded", "unembedded"),
Expand Down