madmaze · marosstruk · Nov 22, 2023 · Nov 22, 2023 · Nov 22, 2023 · Dec 5, 2023
diff --git a/pytesseract/__init__.py b/pytesseract/__init__.py
@@ -2,6 +2,7 @@
 from .pytesseract import ALTONotSupported
 from .pytesseract import get_languages
 from .pytesseract import get_tesseract_version
+from .pytesseract import has_libcurl
 from .pytesseract import image_to_alto_xml
 from .pytesseract import image_to_boxes
 from .pytesseract import image_to_data
@@ -14,6 +15,7 @@
 from .pytesseract import TesseractError
 from .pytesseract import TesseractNotFoundError
 from .pytesseract import TSVNotSupported
+from .pytesseract import URLNotSupported
 
 
 __version__ = '0.3.13'
diff --git a/pytesseract/pytesseract.py b/pytesseract/pytesseract.py
@@ -81,6 +81,7 @@
 
 TESSERACT_MIN_VERSION = Version('3.05')
 TESSERACT_ALTO_VERSION = Version('4.1.0')
+TESSERACT_URL_VERSION = Version('4.1.1')  # minimum version for URL input
 
 
 class Output:
@@ -124,6 +125,14 @@ def __init__(self):
         )
 
 
+class URLNotSupported(EnvironmentError):
+    """Raised for URL input when tesseract cannot read images from URLs."""
+    def __init__(self):
+        message = 'URL input not supported. '
+        message += 'Tesseract >= 4.1.1 built with libcurl required'
+        super().__init__(message)
+
+
 def kill(process, code):
     process.terminate()
     try:
@@ -209,7 +218,14 @@ def save(image):
     try:
         with NamedTemporaryFile(prefix='tess_', delete=False) as f:
             if isinstance(image, str):
-                yield f.name, realpath(normpath(normcase(image)))
+                if image.startswith(('http:', 'https:')):
+                    if get_tesseract_version(
+                        cached=True,
+                    ) < TESSERACT_URL_VERSION or not has_libcurl(cached=True):
+                        raise URLNotSupported()
+                    yield f.name, image
+                else:
+                    yield f.name, realpath(normpath(normcase(image)))
                 return
             image, extension = prepare(image)
             input_file_name = f'{f.name}_input{extsep}{extension}'
@@ -470,6 +486,24 @@ def get_tesseract_version():
     return version
 
 
+@run_once
+def has_libcurl():
+    """
+    Returns True if the `tesseract --version` output mentions libcurl
+    """
+    version_cmd = [tesseract_cmd, '--version']
+    try:
+        banner = subprocess.check_output(
+            version_cmd,
+            stderr=subprocess.STDOUT,
+            env=environ,
+            stdin=subprocess.DEVNULL,
+        )
+    except OSError:  # launching the tesseract command failed
+        raise TesseractNotFoundError()
+    return 'libcurl' in banner.decode(DEFAULT_ENCODING)
+
+
 def image_to_string(
     image,
     lang=None,

diff --git a/tests/pytesseract_test.py b/tests/pytesseract_test.py
@@ -14,6 +14,7 @@
 from pytesseract import ALTONotSupported
 from pytesseract import get_languages
 from pytesseract import get_tesseract_version
+from pytesseract import has_libcurl
 from pytesseract import image_to_alto_xml
 from pytesseract import image_to_boxes
 from pytesseract import image_to_data
@@ -24,6 +25,7 @@
 from pytesseract import run_and_get_multiple_output
 from pytesseract import TesseractNotFoundError
 from pytesseract import TSVNotSupported
+from pytesseract import URLNotSupported
 from pytesseract.pytesseract import file_to_dict
 from pytesseract.pytesseract import numpy_installed
 from pytesseract.pytesseract import pandas_installed
@@ -45,11 +47,16 @@
 IS_PYTHON_3 = not IS_PYTHON_2
 
 TESSERACT_VERSION = tuple(get_tesseract_version().release)  # to skip tests
+HAS_LIBCURL = has_libcurl()  # to skip tests
 
 TESTS_DIR = path.dirname(path.abspath(__file__))
 DATA_DIR = path.join(TESTS_DIR, 'data')
 TESSDATA_DIR = path.join(TESTS_DIR, 'tessdata')
 TEST_JPEG = path.join(DATA_DIR, 'test.jpg')
+TEST_JPEG_URL = (
+    'https://github.com/madmaze/pytesseract'
+    '/blob/master/tests/data/test.jpg?raw=true'
+)
 
 pytestmark = pytest.mark.pytesseract  # used marker for the module
 string_type = unicode if IS_PYTHON_2 else str  # noqa: 821
@@ -121,6 +128,19 @@ def test_image_to_string_with_image_type(test_file):
     assert 'The quick brown dog' in image_to_string(test_file_path, 'eng')
 
 
+@pytest.mark.parametrize(
+    'test_file',
+    [TEST_JPEG_URL],
+    ids=['jpeg_url'],
+)
+def test_image_to_string_with_url(test_file):
+    # URL input needs tesseract-ocr >= 4.1.1 that was built with libcurl.
+    if not (TESSERACT_VERSION >= (4, 1, 1) and HAS_LIBCURL):
+        pytest.skip('skip url test')
+    recognized_text = image_to_string(test_file)
+    assert 'The quick brown dog' in recognized_text
+
+
 @pytest.mark.parametrize(
     'test_file',
     [TEST_JPEG, Image.open(TEST_JPEG)],
@@ -298,6 +318,15 @@ def test_image_to_data__pandas_support(test_file_small):
         image_to_data(test_file_small, output_type=Output.DATAFRAME)
 
 
+@pytest.mark.skipif(
+    TESSERACT_VERSION >= (4, 1, 1) and HAS_LIBCURL,
+    reason='requires tesseract < 4.1.1 or tesseract built without libcurl',
+)
+def test_image_to_string_url_support():
+    with pytest.raises(URLNotSupported):  # URL input must be rejected here
+        image_to_string(TEST_JPEG_URL)
+
+
 @pytest.mark.skipif(
     TESSERACT_VERSION[:2] < (3, 5),
     reason='requires tesseract >= 3.05',