Skip to content

Commit

Permalink
fitz/: Fix pymupdf#2238 - use 'overlap' rather than 'contains' when extracting text.
Browse files Browse the repository at this point in the history

We now include chars that overlap with the clipbox, instead of only those that
are entirely contained within the clipbox.
  • Loading branch information
julian-smith-artifex-com committed Mar 8, 2023
1 parent 0c4e3cb commit 63aceb2
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 10 deletions.
6 changes: 3 additions & 3 deletions fitz/fitz.i
Expand Up @@ -11528,7 +11528,7 @@ struct TextPage {
fz_rect linerect = fz_empty_rect;
for (ch = line->first_char; ch; ch = ch->next) {
fz_rect cbbox = JM_char_bbox(gctx, line, ch);
if (!fz_contains_rect(tp_rect, cbbox) &&
if (!JM_rects_overlap(tp_rect, cbbox) &&
!fz_is_infinite_rect(tp_rect)) {
continue;
}
Expand All @@ -11542,7 +11542,7 @@ struct TextPage {
blockrect = fz_union_rect(blockrect, linerect);
}
text = JM_EscapeStrFromBuffer(gctx, res);
} else if (fz_contains_rect(tp_rect, block->bbox) || fz_is_infinite_rect(tp_rect)) {
} else if (JM_rects_overlap(tp_rect, block->bbox) || fz_is_infinite_rect(tp_rect)) {
fz_image *img = block->u.i.image;
fz_colorspace *cs = img->colorspace;
text = PyUnicode_FromFormat("<image: %s, width: %d, height: %d, bpc: %d>", fz_colorspace_name(gctx, cs), img->w, img->h, img->bpc);
Expand Down Expand Up @@ -11610,7 +11610,7 @@ struct TextPage {
buflen = 0; // reset char counter
for (ch = line->first_char; ch; ch = ch->next) {
fz_rect cbbox = JM_char_bbox(gctx, line, ch);
if (!fz_contains_rect(tp_rect, cbbox) &&
if (!JM_rects_overlap(tp_rect, cbbox) &&
!fz_is_infinite_rect(tp_rect)) {
continue;
}
Expand Down
6 changes: 5 additions & 1 deletion fitz/helper-other.i
Expand Up @@ -1295,7 +1295,11 @@ fz_archive *JM_archive_from_py(fz_context *ctx, fz_archive *arch, PyObject *path
}



//-----------------------------------------------------------------------------
// Test whether two rectangles overlap.
// Returns non-zero when the intersection of a and b (fz_intersect_rect) is
// not reported as empty by fz_is_empty_rect, and 0 otherwise.
// NOTE(review): how touching edges, zero-area or infinite rectangles are
// classified depends entirely on those two MuPDF helpers - confirm there.
//-----------------------------------------------------------------------------
int JM_rects_overlap(const fz_rect a, const fz_rect b)
{
    return !fz_is_empty_rect(fz_intersect_rect(a, b));
}

//-----------------------------------------------------------------------------
// dummy structure for various tools and utilities
Expand Down
12 changes: 6 additions & 6 deletions fitz/helper-stext.i
Expand Up @@ -228,7 +228,7 @@ JM_new_buffer_from_stext_page(fz_context *ctx, fz_stext_page *page)
if (block->type == FZ_STEXT_BLOCK_TEXT) {
for (line = block->u.t.first_line; line; line = line->next) {
for (ch = line->first_char; ch; ch = ch->next) {
if (!fz_contains_rect(rect, JM_char_bbox(ctx, line, ch)) &&
if (!JM_rects_overlap(rect, JM_char_bbox(ctx, line, ch)) &&
!fz_is_infinite_rect(rect)) {
continue;
}
Expand Down Expand Up @@ -420,7 +420,7 @@ JM_search_stext_page(fz_context *ctx, fz_stext_page *page, const char *needle)
for (line = block->u.t.first_line; line; line = line->next) {
for (ch = line->first_char; ch; ch = ch->next) {
if (!fz_is_infinite_rect(rect) &&
!fz_contains_rect(rect, JM_char_bbox(ctx, line, ch))) {
!JM_rects_overlap(rect, JM_char_bbox(ctx, line, ch))) {
goto next_char;
}
try_new_match:
Expand Down Expand Up @@ -482,7 +482,7 @@ JM_print_stext_page_as_text(fz_context *ctx, fz_output *out, fz_stext_page *page
for (ch = line->first_char; ch; ch = ch->next) {
chbbox = JM_char_bbox(ctx, line, ch);
if (fz_is_infinite_rect(rect) ||
fz_contains_rect(rect, chbbox)) {
JM_rects_overlap(rect, chbbox)) {
last_char = ch->c;
n = fz_runetochar(utf, ch->c);
for (i = 0; i < n; i++) {
Expand Down Expand Up @@ -571,7 +571,7 @@ JM_make_spanlist(fz_context *ctx, PyObject *line_dict,

for (ch = line->first_char; ch; ch = ch->next) {
fz_rect r = JM_char_bbox(ctx, line, ch);
if (!fz_contains_rect(tp_rect, r) &&
if (!JM_rects_overlap(tp_rect, r) &&
!fz_is_infinite_rect(tp_rect)) {
continue;
}
Expand Down Expand Up @@ -769,7 +769,7 @@ void JM_make_textpage_dict(fz_context *ctx, fz_stext_page *tp, PyObject *page_di
int block_n = -1;
for (block = tp->first_block; block; block = block->next) {
block_n++;
if (!fz_contains_rect(tp_rect, block->bbox) &&
if (!JM_rects_overlap(tp_rect, block->bbox) &&
!fz_is_infinite_rect(tp_rect) &&
block->type == FZ_STEXT_BLOCK_IMAGE) {
continue;
Expand Down Expand Up @@ -817,7 +817,7 @@ JM_copy_rectangle(fz_context *ctx, fz_stext_page *page, fz_rect area)
int line_had_text = 0;
for (ch = line->first_char; ch; ch = ch->next) {
fz_rect r = JM_char_bbox(ctx, line, ch);
if (fz_contains_rect(area, r)) {
if (JM_rects_overlap(area, r)) {
line_had_text = 1;
if (need_new_line) {
fz_append_string(ctx, buffer, "\n");
Expand Down

0 comments on commit 63aceb2

Please sign in to comment.