From 1e37dc0a486903fdc5a2a0946c7360de93fc3051 Mon Sep 17 00:00:00 2001
From: Michael Lissner
Date: Wed, 23 Mar 2022 07:47:33 -0700
Subject: [PATCH] fix(deps): Yank dependency on pypdf2

In the following issues and PRs, PyPDF2 users have identified and offered
to fix a weird bug where the library overrides normal logging in a way
that breaks when you log certain things:

https://github.com/mstamy2/PyPDF2/issues/67
https://github.com/mstamy2/PyPDF2/pull/641
https://github.com/mstamy2/PyPDF2/pull/452
https://github.com/mstamy2/PyPDF2/pull/243

Not great, but there was probably a reason.
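For anyone who doesn't want to click through, the gist as those threads
describe it (sketched here from memory rather than quoted from PyPDF2, and
assuming the override in question is the process-wide warnings.formatwarning
swap the first issue discusses) is that merely importing the package replaces
Python's global warning formatter:

    import warnings

    stock_formatter = warnings.formatwarning

    import PyPDF2  # noqa: F401  # importing alone is enough to trigger it

    # With PyPDF2 1.26.0 installed, the global formatter has now been
    # swapped, so every later warnings.warn() call -- including warnings
    # routed through logging.captureWarnings() -- is formatted by PyPDF2's
    # helper instead of the stdlib's, which is where the breakage reported
    # in the issues above shows up.
    print(warnings.formatwarning is stock_formatter)  # False under 1.26.0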
Unfortunately, the maintainers aren't merging any of the fixes people have
provided over the years, and when I upgraded to Python 3.10, one of our
tests changed in a way that triggered this bug.

So, since the maintainers don't seem inclined to fix their own package, this
commit yanks it from CourtListener. It's good timing, really, since we now
have the microservice available, but it was disappointing to see bugs and
PRs related to this going back to 2014. Most of the fixes are one- or
two-line PRs, too. Bummer.
---
 cl/corpus_importer/tasks.py | 13 ++++------
 cl/recap/tasks.py           | 24 +++++++------------
 cl/scrapers/tasks.py        | 47 ++++++-------------------------------
 poetry.lock                 | 13 +---------
 pyproject.toml              |  1 -
 5 files changed, 22 insertions(+), 76 deletions(-)

diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py
index 692f2b2a60..4f31359384 100644
--- a/cl/corpus_importer/tasks.py
+++ b/cl/corpus_importer/tasks.py
@@ -53,6 +53,7 @@
 from cl.custom_filters.templatetags.text_filters import best_case_name
 from cl.lib.celery_utils import throttle_task
 from cl.lib.crypto import sha1
+from cl.lib.microservice_utils import microservice
 from cl.lib.pacer import (
     get_blocked_status,
     get_first_missing_de_date,
@@ -89,7 +90,7 @@
     ProcessingQueue,
 )
 from cl.scrapers.models import PACERFreeDocumentLog, PACERFreeDocumentRow
-from cl.scrapers.tasks import extract_recap_pdf, get_page_count
+from cl.scrapers.tasks import extract_recap_pdf
 from cl.search.models import (
     ClaimHistory,
     Court,
@@ -1590,13 +1591,9 @@ def update_rd_metadata(
     # request.content is sometimes a str, sometimes unicode, so
     # force it all to be bytes, pleasing hashlib.
     rd.sha1 = sha1(force_bytes(response.content))
-    with NamedTemporaryFile(
-        prefix="rd_for_page_size_",
-        suffix=".pdf",
-        buffering=0,
-    ) as tmp:
-        tmp.write(rd.filepath_local.read())
-        rd.page_count = get_page_count(tmp.name, "pdf")
+    rd.page_count = microservice(
+        service="page-count", file_type="pdf", file=rd.filepath_local.read()
+    ).content

     # Save and extract, skipping OCR.
     rd.save()
diff --git a/cl/recap/tasks.py b/cl/recap/tasks.py
index a90d82e32e..c30a61dd75 100644
--- a/cl/recap/tasks.py
+++ b/cl/recap/tasks.py
@@ -1,6 +1,5 @@
 import logging
 from datetime import datetime
-from tempfile import NamedTemporaryFile
 from typing import Dict, List, Optional, Tuple, Union
 from zipfile import ZipFile

@@ -12,7 +11,6 @@
 from django.core.exceptions import ValidationError
 from django.core.files.base import ContentFile
 from django.core.files.uploadedfile import SimpleUploadedFile
-from django.core.serializers.json import DjangoJSONEncoder
 from django.db import IntegrityError, transaction
 from django.utils.timezone import now
 from juriscraper.lib.exceptions import PacerLoginException, ParsingException
@@ -42,6 +40,7 @@
 from cl.custom_filters.templatetags.text_filters import oxford_join
 from cl.lib.crypto import sha1
 from cl.lib.filesizes import convert_size_to_bytes
+from cl.lib.microservice_utils import microservice
 from cl.lib.pacer import map_cl_to_pacer_id
 from cl.lib.pacer_session import (
     get_or_cache_pacer_cookies,
@@ -74,7 +73,7 @@
     PacerHtmlFiles,
     ProcessingQueue,
 )
-from cl.scrapers.tasks import extract_recap_pdf, get_page_count
+from cl.scrapers.tasks import extract_recap_pdf
 from cl.search.models import Docket, DocketEntry, RECAPDocument
 from cl.search.tasks import add_items_to_solr, add_or_update_recap_docket

@@ -286,7 +285,7 @@ def process_recap_pdf(self, pk):

     # Do the file, finally.
     try:
-        content = pq.filepath_local.read()
+        file_contents = pq.filepath_local.read()
     except IOError as exc:
         msg = f"Internal processing error ({exc.errno}: {exc.strerror})."
         if (self.request.retries == self.max_retries) or pq.debug:
@@ -296,7 +295,7 @@
             mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
             raise self.retry(exc=exc)

-    new_sha1 = sha1(content)
+    new_sha1 = sha1(file_contents)
     existing_document = all(
         [
             rd.sha1 == new_sha1,
@@ -307,7 +306,7 @@
     if not existing_document:
         # Different sha1, it wasn't available, or it's missing from disk. Move
         # the new file over from the processing queue storage.
-        cf = ContentFile(content)
+        cf = ContentFile(file_contents)
         file_name = get_document_filename(
             rd.docket_entry.docket.court_id,
             rd.docket_entry.docket.pacer_case_id,
@@ -318,15 +317,10 @@
         rd.filepath_local.save(file_name, cf, save=False)

         # Do page count and extraction
-        extension = rd.filepath_local.name.split(".")[-1]
-        with NamedTemporaryFile(
-            prefix="rd_page_count_",
-            suffix=f".{extension}",
-            buffering=0,
-        ) as tmp:
-            tmp.write(content)
-            rd.page_count = get_page_count(tmp.name, extension)
-            rd.file_size = rd.filepath_local.size
+        rd.page_count = microservice(
+            service="page-count", file_type="pdf", file=file_contents
+        ).content
+        rd.file_size = rd.filepath_local.size
         rd.ocr_status = None
         rd.is_available = True

diff --git a/cl/scrapers/tasks.py b/cl/scrapers/tasks.py
index 32a07f606f..0a73e4a75d 100644
--- a/cl/scrapers/tasks.py
+++ b/cl/scrapers/tasks.py
@@ -1,4 +1,3 @@
-import json
 import logging
 import random
 import re
@@ -19,8 +18,6 @@
 from juriscraper.pacer import CaseQuery, PacerSession
 from lxml.etree import XMLSyntaxError
 from lxml.html.clean import Cleaner
-from PyPDF2 import PdfFileReader
-from PyPDF2.utils import PdfReadError

 from cl.audio.models import Audio
 from cl.celery_init import app
@@ -259,41 +256,6 @@ def convert_file_to_txt(path: str) -> str:
     return p.communicate()[0].decode()


-def get_page_count(path: str, extension: str) -> Optional[int]:
-    """Get the number of pages, if appropriate mimetype.
-
-    :param path: A path to a binary (pdf, wpd, doc, txt, html, etc.)
-    :param extension: The extension of the binary.
-    :return: The number of pages if possible, else return None
-    """
-    if extension == "pdf":
-        try:
-            reader = PdfFileReader(path)
-            return int(reader.getNumPages())
-        except (
-            IOError,
-            ValueError,
-            TypeError,
-            KeyError,
-            AssertionError,
-            PdfReadError,
-        ):
-            # IOError: File doesn't exist. My bad.
-            # ValueError: Didn't get an int for the page count. Their bad.
-            # TypeError: NumberObject has no attribute '__getitem__'. Ugh.
-            # KeyError, AssertionError: assert xrefstream["/Type"] == "/XRef". WTF?
-            # PdfReadError: Something else. I have no words.
-            pass
-    elif extension == "wpd":
-        # Best solution appears to be to dig into the binary format
-        pass
-    elif extension == "doc":
-        # Best solution appears to be to dig into the XML of the file
-        # itself: http://stackoverflow.com/a/12972502/64911
-        pass
-    return None
-
-
 def update_document_from_text(opinion: Opinion) -> None:
     """Extract additional metadata from document text

@@ -358,7 +320,8 @@ def extract_doc_content(
         buffering=0,  # Make sure it's on disk when we try to use it
     ) as tmp:
         # Get file contents from S3 and put them in a temp file.
-        tmp.write(opinion.local_path.read())
+        file_contents = opinion.local_path.read()
+        tmp.write(file_contents)

         if extension == "doc":
             content, err = extract_from_doc(tmp.name)
@@ -380,7 +343,11 @@
             return

         # Do page count, if possible
-        opinion.page_count = get_page_count(tmp.name, extension)
+        opinion.page_count = microservice(
+            service="page-count",
+            file_type=extension,
+            file=file_contents,
+        ).content

     assert isinstance(
         content, str
diff --git a/poetry.lock b/poetry.lock
index e5cbe709e7..0105e884a2 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1418,14 +1418,6 @@ category = "main"
 optional = false
 python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*"

-[[package]]
-name = "pypdf2"
-version = "1.26.0"
-description = "PDF toolkit"
-category = "main"
-optional = false
-python-versions = "*"
-
 [[package]]
 name = "pystemmer"
 version = "2.0.1"
@@ -1988,7 +1980,7 @@ flp = ["reporters-db", "juriscraper"]
 [metadata]
 lock-version = "1.1"
 python-versions = ">=3.10, <3.11"
-content-hash = "7ec249acd058a5511848443d26c2580e8715154338b8788f49f96d5f2f200aa7"
+content-hash = "3de89e8e06d8268e494a38e38cdb3f55f68537d7f761697e2c96b3d7c3eb92f2"

 [metadata.files]
 amqp = [
@@ -2799,9 +2791,6 @@ pyparsing = [
     {file = "pyparsing-2.4.7-py2.py3-none-any.whl", hash = "sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b"},
     {file = "pyparsing-2.4.7.tar.gz", hash = "sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1"},
 ]
-pypdf2 = [
-    {file = "PyPDF2-1.26.0.tar.gz", hash = "sha256:e28f902f2f0a1603ea95ebe21dff311ef09be3d0f0ef29a3e44a932729564385"},
-]
 pystemmer = [
     {file = "PyStemmer-2.0.1.tar.gz", hash = "sha256:9b81c35302f1d2a5ad9465b85986db246990db93d97d3e8f129269ed7102788e"},
 ]
diff --git a/pyproject.toml b/pyproject.toml
index 96f71fded0..293cf077fd 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -65,7 +65,6 @@ psycopg2 = "^2.8.6"
 pycparser = "^2.14"
 pyopenssl = "*"
 pyparsing = "^2.4.2"
-pypdf2 = "^1.26.0"
 python = ">=3.10, <3.11"
 python-dateutil = "^2.8.1"
 python-igraph = "0.9.1"