Skip to content

Commit

Permalink
Added tesseract version and libcurl check for URL input
Browse files Browse the repository at this point in the history
  • Loading branch information
marosstruk committed Nov 23, 2023
1 parent 30a0b28 commit 6f20b6f
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 1 deletion.
2 changes: 1 addition & 1 deletion .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ jobs:
run: echo "PY=$(python -VV | sha256sum | cut -d' ' -f1)" >> $GITHUB_ENV

- name: Install tesseract
run: sudo apt-get -y update && sudo apt-get install -y tesseract-ocr tesseract-ocr-fra
run: sudo apt-get -y update && sudo apt-get install -y libcurl4-openssl-dev tesseract-ocr tesseract-ocr-fra

- name: Print tesseract version
run: echo $(tesseract --version)
Expand Down
28 changes: 28 additions & 0 deletions pytesseract/pytesseract.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@

# Presumably the oldest Tesseract release this module accepts
# (inferred from the name only -- TODO confirm against its usage).
TESSERACT_MIN_VERSION = Version('3.05')
# ALTO output requires at least this Tesseract release.
TESSERACT_ALTO_VERSION = Version('4.1.0')
# URL input is rejected for Tesseract releases older than this one.
TESSERACT_URL_VERSION = Version('4.1.1')


class Output:
Expand Down Expand Up @@ -123,6 +124,12 @@ def __init__(self):
'ALTO output not supported. Tesseract >= 4.1.0 required',
)

class URLNotSupported(EnvironmentError):
    """
    Raised when an http(s) URL is passed as input but cannot be processed.

    The fixed message states the requirement: Tesseract >= 4.1.1 and
    libcurl. The exception takes no arguments.
    """

    def __init__(self):
        super().__init__(
            'URL input not supported. Tesseract >= 4.1.1 and libcurl required',
        )


def kill(process, code):
process.terminate()
Expand Down Expand Up @@ -210,6 +217,9 @@ def save(image):
with NamedTemporaryFile(prefix='tess_', delete=False) as f:
if isinstance(image, str):
if image.startswith('http:') or image.startswith('https:'):
if get_tesseract_version(cached=True) < TESSERACT_URL_VERSION\
or not has_libcurl(cached=True):
raise URLNotSupported()
yield f.name, image
else:
yield f.name, realpath(normpath(normcase(image)))
Expand Down Expand Up @@ -473,6 +483,24 @@ def get_tesseract_version():
return version


@run_once
def has_libcurl():
    """
    Tell whether the configured tesseract binary reports libcurl.

    Runs ``tesseract_cmd --version`` (stderr merged into stdout, stdin
    detached), decodes the captured bytes with DEFAULT_ENCODING and looks
    for the substring 'libcurl' in the resulting text.

    Returns True when the substring is present, False otherwise.
    Raises TesseractNotFoundError when launching the command fails with
    an OSError.
    """
    version_cmd = [tesseract_cmd, '--version']
    try:
        raw_banner = subprocess.check_output(
            version_cmd,
            stdin=subprocess.DEVNULL,
            stderr=subprocess.STDOUT,
            env=environ,
        )
    except OSError:
        raise TesseractNotFoundError()

    banner = raw_banner.decode(DEFAULT_ENCODING)
    return 'libcurl' in banner


def image_to_string(
image,
lang=None,
Expand Down

0 comments on commit 6f20b6f

Please sign in to comment.