Skip to content

Commit

Permalink
fitz/: Fix pymupdf#2238 - use 'overlap' rather than 'contains' when e…
Browse files Browse the repository at this point in the history
…xtracting text.

We now include chars that overlap with the clipbox, instead of only those that
are entirely contained within the clipbox.

Note that the new function JM_rects_overlap() can still return true when one of
the rects is empty (zero width or height), as long as it lies inside the extent
of the other rect. This allows things to work with ligatures, where component
glyphs can have zero width.
  • Loading branch information
julian-smith-artifex-com committed Mar 13, 2023
1 parent bbc8572 commit 59aaf49
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 9 deletions.
6 changes: 3 additions & 3 deletions fitz/fitz.i
Original file line number Diff line number Diff line change
Expand Up @@ -11528,7 +11528,7 @@ struct TextPage {
fz_rect linerect = fz_empty_rect;
for (ch = line->first_char; ch; ch = ch->next) {
fz_rect cbbox = JM_char_bbox(gctx, line, ch);
if (!fz_contains_rect(tp_rect, cbbox) &&
if (!JM_rects_overlap(tp_rect, cbbox) &&
!fz_is_infinite_rect(tp_rect)) {
continue;
}
Expand All @@ -11542,7 +11542,7 @@ struct TextPage {
blockrect = fz_union_rect(blockrect, linerect);
}
text = JM_EscapeStrFromBuffer(gctx, res);
} else if (fz_contains_rect(tp_rect, block->bbox) || fz_is_infinite_rect(tp_rect)) {
} else if (JM_rects_overlap(tp_rect, block->bbox) || fz_is_infinite_rect(tp_rect)) {
fz_image *img = block->u.i.image;
fz_colorspace *cs = img->colorspace;
text = PyUnicode_FromFormat("<image: %s, width: %d, height: %d, bpc: %d>", fz_colorspace_name(gctx, cs), img->w, img->h, img->bpc);
Expand Down Expand Up @@ -11610,7 +11610,7 @@ struct TextPage {
buflen = 0; // reset char counter
for (ch = line->first_char; ch; ch = ch->next) {
fz_rect cbbox = JM_char_bbox(gctx, line, ch);
if (!fz_contains_rect(tp_rect, cbbox) &&
if (!JM_rects_overlap(tp_rect, cbbox) &&
!fz_is_infinite_rect(tp_rect)) {
continue;
}
Expand Down
12 changes: 11 additions & 1 deletion fitz/helper-other.i
Original file line number Diff line number Diff line change
Expand Up @@ -1295,7 +1295,17 @@ fz_archive *JM_archive_from_py(fz_context *ctx, fz_archive *arch, PyObject *path
}



// Report whether rectangles a and b overlap.
//
// Returns 0 when a lies entirely on one side of b along either axis; merely
// touching edges counts as "on one side". Returns 1 otherwise. Only
// separation is tested, so a zero-width or zero-height rectangle located
// strictly between the other rectangle's edges is still reported as
// overlapping.
int JM_rects_overlap(const fz_rect a, const fz_rect b)
{
    // a begins at or past b's x1 / y1 edge.
    if (a.x0 >= b.x1 || a.y0 >= b.y1) {
        return 0;
    }
    // a finishes at or before b's x0 / y0 edge.
    if (a.x1 <= b.x0 || a.y1 <= b.y0) {
        return 0;
    }
    return 1;
}

//-----------------------------------------------------------------------------
// dummy structure for various tools and utilities
Expand Down
10 changes: 5 additions & 5 deletions fitz/helper-stext.i
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,7 @@ JM_new_buffer_from_stext_page(fz_context *ctx, fz_stext_page *page)
if (block->type == FZ_STEXT_BLOCK_TEXT) {
for (line = block->u.t.first_line; line; line = line->next) {
for (ch = line->first_char; ch; ch = ch->next) {
if (!fz_contains_rect(rect, JM_char_bbox(ctx, line, ch)) &&
if (!JM_rects_overlap(rect, JM_char_bbox(ctx, line, ch)) &&
!fz_is_infinite_rect(rect)) {
continue;
}
Expand Down Expand Up @@ -375,7 +375,7 @@ JM_search_stext_page(fz_context *ctx, fz_stext_page *page, const char *needle)
for (line = block->u.t.first_line; line; line = line->next) {
for (ch = line->first_char; ch; ch = ch->next) {
if (!fz_is_infinite_rect(rect) &&
!fz_contains_rect(rect, JM_char_bbox(ctx, line, ch))) {
!JM_rects_overlap(rect, JM_char_bbox(ctx, line, ch))) {
goto next_char;
}
try_new_match:
Expand Down Expand Up @@ -436,7 +436,7 @@ JM_print_stext_page_as_text(fz_context *ctx, fz_output *out, fz_stext_page *page
for (ch = line->first_char; ch; ch = ch->next) {
chbbox = JM_char_bbox(ctx, line, ch);
if (fz_is_infinite_rect(rect) ||
fz_contains_rect(rect, chbbox)) {
JM_rects_overlap(rect, chbbox)) {
last_char = ch->c;
n = fz_runetochar(utf, ch->c);
for (i = 0; i < n; i++) {
Expand Down Expand Up @@ -525,7 +525,7 @@ JM_make_spanlist(fz_context *ctx, PyObject *line_dict,

for (ch = line->first_char; ch; ch = ch->next) {
fz_rect r = JM_char_bbox(ctx, line, ch);
if (!fz_contains_rect(tp_rect, r) &&
if (!JM_rects_overlap(tp_rect, r) &&
!fz_is_infinite_rect(tp_rect)) {
continue;
}
Expand Down Expand Up @@ -771,7 +771,7 @@ JM_copy_rectangle(fz_context *ctx, fz_stext_page *page, fz_rect area)
int line_had_text = 0;
for (ch = line->first_char; ch; ch = ch->next) {
fz_rect r = JM_char_bbox(ctx, line, ch);
if (fz_contains_rect(area, r)) {
if (JM_rects_overlap(area, r)) {
line_had_text = 1;
if (need_new_line) {
fz_append_string(ctx, buffer, "\n");
Expand Down

0 comments on commit 59aaf49

Please sign in to comment.