Merge pull request #511 from badGarnet/yao/allow-multiple-output-form…

…ats-in-one-run allow multiple output
madmaze · Sep 7, 2023 · 07da369 · 07da369
2 parents b35f061 + e43ffd6
commit 07da369
Show file tree

Hide file tree

Showing 4 changed files with 108 additions and 7 deletions.
diff --git a/README.rst b/README.rst
@@ -97,6 +97,10 @@ Library usage:
     # Get ALTO XML output
     xml = pytesseract.image_to_alto_xml('test.png')
 
+    # Get multiple types of output with a single call to save compute time
+    # currently supports any mix of the following: txt, pdf, hocr, box, tsv
+    text, boxes = pytesseract.run_and_get_multiple_output('test.png', extensions=['txt', 'box'])
+
 Support for OpenCV image/NumPy array objects
 
 .. code-block:: python
@@ -153,6 +157,8 @@ Add the following config, if you have tessdata error like: "Error opening data f
 
 * **run_and_get_output** Returns the raw output from Tesseract OCR. Gives a bit more control over the parameters that are sent to tesseract.
 
+* **run_and_get_multiple_output** Works like `run_and_get_output` but can handle multiple extensions. This function replaces the `extension: str` kwarg with an `extensions: List[str]` kwarg, where a list of extensions can be specified and the corresponding data is returned, as a list in the same order, after only one `tesseract` call. This reduces the number of calls to `tesseract` when multiple output formats, like both text and bounding boxes, are needed.
+
 **Parameters**
 
 ``image_to_data(image, lang=None, config='', nice=0, output_type=Output.STRING, timeout=0, pandas_config=None)``

diff --git a/pytesseract/__init__.py b/pytesseract/__init__.py
@@ -9,6 +9,7 @@
 from .pytesseract import image_to_pdf_or_hocr
 from .pytesseract import image_to_string
 from .pytesseract import Output
+from .pytesseract import run_and_get_multiple_output
 from .pytesseract import run_and_get_output
 from .pytesseract import TesseractError
 from .pytesseract import TesseractNotFoundError

diff --git a/pytesseract/pytesseract.py b/pytesseract/pytesseract.py
@@ -21,6 +21,8 @@
 from pkgutil import find_loader
 from tempfile import NamedTemporaryFile
 from time import sleep
+from typing import List
+from typing import Optional
 
 from packaging.version import InvalidVersion
 from packaging.version import parse
@@ -65,6 +67,13 @@
     'Script confidence': ('script_conf', float),
 }
 
+EXTENTION_TO_CONFIG = {
+    'box': 'tessedit_create_boxfile=1 batch.nochop makebox',
+    'xml': 'tessedit_create_alto=1',
+    'hocr': 'tessedit_create_hocr=1',
+    'tsv': 'tessedit_create_tsv=1',
+}
+
 TESSERACT_MIN_VERSION = Version('3.05')
 TESSERACT_ALTO_VERSION = Version('4.1.0')
 
@@ -252,8 +261,9 @@ def run_tesseract(
     if config:
         cmd_args += shlex.split(config, posix=not_windows)
 
-    if extension and extension not in {'box', 'osd', 'tsv', 'xml'}:
-        cmd_args.append(extension)
+    for _extension in extension.split():
+        if _extension not in {'box', 'osd', 'tsv', 'xml'}:
+            cmd_args.append(_extension)
     LOGGER.debug('%r', cmd_args)
 
     try:
@@ -269,6 +279,51 @@ def run_tesseract(
             raise TesseractError(proc.returncode, get_errors(error_string))
 
 
+def _read_output(filename: str, return_bytes: bool = False):
+    with open(filename, 'rb') as output_file:
+        if return_bytes:
+            return output_file.read()
+        return output_file.read().decode(DEFAULT_ENCODING)
+
+
+def run_and_get_multiple_output(
+    image,
+    extensions: List[str],
+    lang: Optional[str] = None,
+    nice: int = 0,
+    timeout: int = 0,
+    return_bytes: bool = False,
+):
+    config = ' '.join(
+        EXTENTION_TO_CONFIG.get(extension, '') for extension in extensions
+    ).strip()
+    if config:
+        config = f'-c {config}'
+    else:
+        config = ''
+
+    with save(image) as (temp_name, input_filename):
+        kwargs = {
+            'input_filename': input_filename,
+            'output_filename_base': temp_name,
+            'extension': ' '.join(extensions),
+            'lang': lang,
+            'config': config,
+            'nice': nice,
+            'timeout': timeout,
+        }
+
+        run_tesseract(**kwargs)
+
+        return [
+            _read_output(
+                f"{kwargs['output_filename_base']}{extsep}{extension}",
+                True if extension in {'pdf', 'hocr'} else return_bytes,
+            )
+            for extension in extensions
+        ]
+
+
 def run_and_get_output(
     image,
     extension='',
@@ -290,11 +345,10 @@ def run_and_get_output(
         }
 
         run_tesseract(**kwargs)
-        filename = f"{kwargs['output_filename_base']}{extsep}{extension}"
-        with open(filename, 'rb') as output_file:
-            if return_bytes:
-                return output_file.read()
-            return output_file.read().decode(DEFAULT_ENCODING)
+        return _read_output(
+            f"{kwargs['output_filename_base']}{extsep}{extension}",
+            return_bytes,
+        )
 
 
 def file_to_dict(tsv, cell_delimiter, str_col_idx):

diff --git a/tests/pytesseract_test.py b/tests/pytesseract_test.py
@@ -1,3 +1,4 @@
+from functools import partial
 from glob import iglob
 from multiprocessing import Pool
 from os import getcwd
@@ -20,6 +21,7 @@
 from pytesseract import image_to_pdf_or_hocr
 from pytesseract import image_to_string
 from pytesseract import Output
+from pytesseract import run_and_get_multiple_output
 from pytesseract import TesseractNotFoundError
 from pytesseract import TSVNotSupported
 from pytesseract.pytesseract import file_to_dict
@@ -73,6 +75,17 @@ def test_file_small():
     return path.join(DATA_DIR, 'test-small.jpg')
 
 
+@pytest.fixture(scope='session')
+def function_mapping():
+    return {
+        'pdf': partial(image_to_pdf_or_hocr, extension='pdf'),
+        'txt': image_to_string,
+        'box': image_to_boxes,
+        'hocr': partial(image_to_pdf_or_hocr, extension='hocr'),
+        'tsv': image_to_data,
+    }
+
+
 @pytest.mark.parametrize(
     'test_file',
     [
@@ -227,6 +240,33 @@ def test_image_to_pdf_or_hocr(test_file, extension):
         assert result.endswith('</html>')
 
 
+@pytest.mark.parametrize(
+    'extensions',
+    [
+        ['tsv', 'pdf', 'txt', 'box', 'hocr'],
+        # This tests a case where the extensions do not add any config params
+        # This case is not merged with the one above because we might get
+        # into a race condition where test results from different parameters
+        # are mixed in the test below
+        ['pdf', 'txt'],
+    ],
+)
+def test_run_and_get_multiple_output(test_file, function_mapping, extensions):
+    compound_results = run_and_get_multiple_output(
+        test_file,
+        extensions=extensions,
+    )
+    for result, extension in zip(compound_results, extensions):
+        if extension == 'pdf':
+            # pdf creation time could be different between the two so do not
+            # check the whole string
+            assert (
+                result[:1000] == function_mapping[extension](test_file)[:1000]
+            )
+        else:
+            assert result == function_mapping[extension](test_file)
+
+
 @pytest.mark.skipif(
     TESSERACT_VERSION[:2] < (4, 1),
     reason='requires tesseract >= 4.1',