From 03f82eaab57d3c7c852c6e61bfd805c8cf42e8f2 Mon Sep 17 00:00:00 2001 From: Balearica Date: Sun, 31 Mar 2024 17:05:21 -0700 Subject: [PATCH] Added line size metrics to output per #906 (#909) --- package-lock.json | 14 +++++++------- package.json | 2 +- src/index.d.ts | 6 ++++++ src/worker-script/utils/dump.js | 11 +++++++++++ 4 files changed, 25 insertions(+), 8 deletions(-) diff --git a/package-lock.json b/package-lock.json index ca3934506..9cbf45049 100644 --- a/package-lock.json +++ b/package-lock.json @@ -17,7 +17,7 @@ "node-fetch": "^2.6.9", "opencollective-postinstall": "^2.0.3", "regenerator-runtime": "^0.13.3", - "tesseract.js-core": "^5.0.0", + "tesseract.js-core": "^5.1.0", "wasm-feature-detect": "^1.2.11", "zlibjs": "^0.3.1" }, @@ -8664,9 +8664,9 @@ } }, "node_modules/tesseract.js-core": { - "version": "5.0.0", - "resolved": "https://registry.npmjs.org/tesseract.js-core/-/tesseract.js-core-5.0.0.tgz", - "integrity": "sha512-lJur5LzjinW5VYMKlVNnBU2JPLpO+A9VqAYBeuV+ZgH0hKvsnm+536Yyp+/zRTBdLe7D6Kok0FN9g+TE4J8qGA==" + "version": "5.1.0", + "resolved": "https://registry.npmjs.org/tesseract.js-core/-/tesseract.js-core-5.1.0.tgz", + "integrity": "sha512-D4gc5ET1DF/sDayF/eVmHgVGo7nqVC2e3d7uVgVOSAk4NOcmUqvJRTj8etqEmI/2390ZkXCRiDMxTD1RFYyp1g==" }, "node_modules/test-exclude": { "version": "6.0.0", @@ -16062,9 +16062,9 @@ } }, "tesseract.js-core": { - "version": "5.0.0", - "resolved": "https://registry.npmjs.org/tesseract.js-core/-/tesseract.js-core-5.0.0.tgz", - "integrity": "sha512-lJur5LzjinW5VYMKlVNnBU2JPLpO+A9VqAYBeuV+ZgH0hKvsnm+536Yyp+/zRTBdLe7D6Kok0FN9g+TE4J8qGA==" + "version": "5.1.0", + "resolved": "https://registry.npmjs.org/tesseract.js-core/-/tesseract.js-core-5.1.0.tgz", + "integrity": "sha512-D4gc5ET1DF/sDayF/eVmHgVGo7nqVC2e3d7uVgVOSAk4NOcmUqvJRTj8etqEmI/2390ZkXCRiDMxTD1RFYyp1g==" }, "test-exclude": { "version": "6.0.0", diff --git a/package.json b/package.json index abb8e8a0d..cacd0ef74 100644 --- a/package.json +++ b/package.json @@ -69,7 +69,7 @@ "node-fetch": "^2.6.9", "opencollective-postinstall": "^2.0.3", "regenerator-runtime": "^0.13.3", - "tesseract.js-core": "^5.0.0", + "tesseract.js-core": "^5.1.0", "wasm-feature-detect": "^1.2.11", "zlibjs": "^0.3.1" }, diff --git a/src/index.d.ts b/src/index.d.ts index bad0b89a9..4b2358206 100644 --- a/src/index.d.ts +++ b/src/index.d.ts @@ -178,6 +178,11 @@ declare namespace Tesseract { y1: number; has_baseline: boolean; } + interface RowAttributes { + ascenders: number; + descenders: number; + row_height: number; + } interface Bbox { x0: number; y0: number; @@ -189,6 +194,7 @@ declare namespace Tesseract { text: string; confidence: number; baseline: Baseline; + rowAttributes: RowAttributes bbox: Bbox; paragraph: Paragraph; block: Block; diff --git a/src/worker-script/utils/dump.js b/src/worker-script/utils/dump.js index 9c4d3a180..4bdeaf586 100644 --- a/src/worker-script/utils/dump.js +++ b/src/worker-script/utils/dump.js @@ -126,11 +126,22 @@ module.exports = (TessModule, api, output, options) => { block.paragraphs.push(para); } if (ri.IsAtBeginningOf(RIL_TEXTLINE)) { + // getRowAttributes was added in a recent minor version of Tesseract.js-core, + // so we need to check if it exists before calling it. + // This can be removed in the next major version (v6). + let rowAttributes; + if (ri.getRowAttributes) { + rowAttributes = ri.getRowAttributes(); + // Descenders is reported as a negative within Tesseract internally so we need to flip it. + // The positive version is intuitive, and matches what is reported in the hOCR output. + rowAttributes.descenders *= -1; + } textline = { words: [], text: !options.skipRecognition ? ri.GetUTF8Text(RIL_TEXTLINE) : null, confidence: !options.skipRecognition ? ri.Confidence(RIL_TEXTLINE) : null, baseline: ri.getBaseline(RIL_TEXTLINE), + rowAttributes, bbox: ri.getBoundingBox(RIL_TEXTLINE), }; para.lines.push(textline);