Skip to content

Commit

Permalink
Added tesseract version and libcurl check for URL input
Browse files: browse the repository at this point in the history
  • Loading branch information
marosstruk committed Dec 5, 2023
1 parent 30a0b28 commit 4c53f33
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 4 deletions.
2 changes: 2 additions & 0 deletions pytesseract/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from .pytesseract import ALTONotSupported
from .pytesseract import get_languages
from .pytesseract import get_tesseract_version
from .pytesseract import has_libcurl
from .pytesseract import image_to_alto_xml
from .pytesseract import image_to_boxes
from .pytesseract import image_to_data
Expand All @@ -14,6 +15,7 @@
from .pytesseract import TesseractError
from .pytesseract import TesseractNotFoundError
from .pytesseract import TSVNotSupported
from .pytesseract import URLNotSupported


__version__ = '0.3.13'
31 changes: 30 additions & 1 deletion pytesseract/pytesseract.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@

TESSERACT_MIN_VERSION = Version('3.05')
TESSERACT_ALTO_VERSION = Version('4.1.0')
TESSERACT_URL_VERSION = Version('4.1.1')


class Output:
Expand Down Expand Up @@ -123,6 +124,13 @@ def __init__(self):
'ALTO output not supported. Tesseract >= 4.1.0 required',
)

class URLNotSupported(EnvironmentError):
    """Error signalling that the installed tesseract cannot read image URLs.

    The exception takes no arguments; it always carries the same fixed
    message naming the requirement (tesseract >= 4.1.1 built with libcurl).
    """

    def __init__(self):
        # Adjacent literals are joined at compile time into a single message.
        message = (
            'URL input not supported. '
            'Tesseract >= 4.1.1 built with libcurl required'
        )
        super().__init__(message)


def kill(process, code):
process.terminate()
Expand Down Expand Up @@ -209,7 +217,10 @@ def save(image):
try:
with NamedTemporaryFile(prefix='tess_', delete=False) as f:
if isinstance(image, str):
if image.startswith('http:') or image.startswith('https:'):
if image.startswith(('http:', 'https:')):
if get_tesseract_version(cached=True) < TESSERACT_URL_VERSION\
or not has_libcurl(cached=True):
raise URLNotSupported()
yield f.name, image
else:
yield f.name, realpath(normpath(normcase(image)))
Expand Down Expand Up @@ -473,6 +484,24 @@ def get_tesseract_version():
return version


@run_once
def has_libcurl():
    """
    Tell whether the tesseract ``--version`` output mentions libcurl.

    Runs ``tesseract_cmd --version`` with stderr merged into stdout and stdin
    detached, decodes the captured bytes with DEFAULT_ENCODING and returns
    True when the text contains the substring 'libcurl', False otherwise.

    Raises TesseractNotFoundError when starting the command raises OSError.
    """
    version_cmd = [tesseract_cmd, '--version']
    try:
        raw_banner = subprocess.check_output(
            version_cmd,
            stdin=subprocess.DEVNULL,
            stderr=subprocess.STDOUT,
            env=environ,
        )
    except OSError:
        raise TesseractNotFoundError()

    banner = raw_banner.decode(DEFAULT_ENCODING)
    return 'libcurl' in banner


def image_to_string(
image,
lang=None,
Expand Down
20 changes: 17 additions & 3 deletions tests/pytesseract_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from pytesseract import ALTONotSupported
from pytesseract import get_languages
from pytesseract import get_tesseract_version
from pytesseract import has_libcurl
from pytesseract import image_to_alto_xml
from pytesseract import image_to_boxes
from pytesseract import image_to_data
Expand All @@ -24,6 +25,7 @@
from pytesseract import run_and_get_multiple_output
from pytesseract import TesseractNotFoundError
from pytesseract import TSVNotSupported
from pytesseract import URLNotSupported
from pytesseract.pytesseract import file_to_dict
from pytesseract.pytesseract import numpy_installed
from pytesseract.pytesseract import pandas_installed
Expand All @@ -45,12 +47,14 @@
IS_PYTHON_3 = not IS_PYTHON_2

TESSERACT_VERSION = tuple(get_tesseract_version().release) # to skip tests
HAS_LIBCURL = has_libcurl() # to skip tests

TESTS_DIR = path.dirname(path.abspath(__file__))
DATA_DIR = path.join(TESTS_DIR, 'data')
TESSDATA_DIR = path.join(TESTS_DIR, 'tessdata')
TEST_JPEG = path.join(DATA_DIR, 'test.jpg')
TEST_JPEG_URL = 'https://i.imgur.com/hWO45US.jpg'
TEST_JPEG_URL = ('https://github.com/madmaze/pytesseract'
'/blob/master/tests/data/test.jpg?raw=true')

pytestmark = pytest.mark.pytesseract # used marker for the module
string_type = unicode if IS_PYTHON_2 else str # noqa: 821
Expand Down Expand Up @@ -128,8 +132,9 @@ def test_image_to_string_with_image_type(test_file):
ids=['jpeg_url'],
)
def test_image_to_string_with_url(test_file):
# Tesseract-ocr supports image URLs from version 4.1.1
if TESSERACT_VERSION[0] < 4:
# Tesseract-ocr supports image URLs from version 4.1.1
# and must be built with libcurl.
if TESSERACT_VERSION < (4, 1, 1) or not HAS_LIBCURL:
pytest.skip('skip url test')
assert 'The quick brown dog' in image_to_string(test_file)

Expand Down Expand Up @@ -311,6 +316,15 @@ def test_image_to_data__pandas_support(test_file_small):
image_to_data(test_file_small, output_type=Output.DATAFRAME)


@pytest.mark.skipif(
    # Skip exactly when URL input is available: version >= 4.1.1 with libcurl.
    not (TESSERACT_VERSION < (4, 1, 1) or not HAS_LIBCURL),
    reason='requires tesseract < 4.1.1 or tesseract built without libcurl',
)
def test_image_to_string_url_support():
    """Passing a URL must raise URLNotSupported when URL input is unavailable."""
    expect_unsupported = pytest.raises(URLNotSupported)
    with expect_unsupported:
        image_to_string(TEST_JPEG_URL)


@pytest.mark.skipif(
TESSERACT_VERSION[:2] < (3, 5),
reason='requires tesseract >= 3.05',
Expand Down

0 comments on commit 4c53f33

Please sign in to comment.