fix(deps): Yank dependency on pypdf2
In the following issues and PRs, PyPDF2 users have identified, and
offered fixes for, a weird bug where the library overrides normal
logging in a way that breaks when you log certain things (a sketch of
the failure mode follows the list):

py-pdf/pypdf#67
py-pdf/pypdf#641
py-pdf/pypdf#452
py-pdf/pypdf#243
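
A minimal sketch of the failure mode, as I understand it from those
reports (reconstructed from the issues above, not code from this repo):
PyPDF2 1.x appears to replace warnings.formatwarning at import time with
a formatter that assumes every warning's filename contains a path
separator.

    # Hypothetical reproduction, assuming the global override the
    # linked issues describe. Importing PyPDF2 alone installs it:
    import warnings

    import PyPDF2  # noqa: F401

    # PyPDF2's replacement formatter does roughly:
    #     file = filename.replace("/", "\\").rsplit("\\", 1)[1]
    # so any warning whose filename lacks a path separator crashes,
    # e.g. a warning attributed to a bare module name:
    warnings.warn_explicit("oops", UserWarning, "sys", 1)  # IndexError

The unmerged fixes appear to be about as small as guarding that lookup,
e.g. with os.path.basename(filename) instead of the manual rsplit.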

Not great, but there was probably a reason. Unfortunately, the
maintainers haven't merged any of the fixes people have provided over
the years, and when I upgraded to Python 3.10, one of our tests changed
in a way that triggered this bug.

So, since the maintainers don't seem inclined to fix their own
package, this commit yanks it from CourtListener. The timing is good,
really, since we now have the microservice available, but it was
disappointing to see bugs and PRs related to this going back to 2014.
Most of the fixes are one- or two-line PRs, too. Bummer.
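
The diff below swaps get_page_count() for a call to the new
microservice helper from cl.lib.microservice_utils. For orientation, a
minimal sketch of what a helper with that signature could look like; it
is a hypothetical stand-in that assumes an HTTP service returning the
count in the response body, and the URL and internals are assumptions,
not the real implementation:

    import requests

    # Hypothetical endpoint map; the real service URL is an assumption.
    MICROSERVICE_URLS = {
        "page-count": "http://doctor:5050/utils/page-count/{file_type}/",
    }

    def microservice(
        service: str, file_type: str, file: bytes
    ) -> requests.Response:
        """POST a document to the named service; callers read .content."""
        url = MICROSERVICE_URLS[service].format(file_type=file_type)
        return requests.post(
            url,
            files={"file": (f"document.{file_type}", file)},
            timeout=60,
        )

Callers then read the count off the response, as the diff does, e.g.
rd.page_count = microservice(service="page-count", file_type="pdf",
file=pdf_bytes).content.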
mlissner committed Mar 23, 2022
1 parent c605292 commit 1e37dc0
Showing 5 changed files with 22 additions and 76 deletions.
13 changes: 5 additions & 8 deletions cl/corpus_importer/tasks.py
@@ -53,6 +53,7 @@
 from cl.custom_filters.templatetags.text_filters import best_case_name
 from cl.lib.celery_utils import throttle_task
 from cl.lib.crypto import sha1
+from cl.lib.microservice_utils import microservice
 from cl.lib.pacer import (
     get_blocked_status,
     get_first_missing_de_date,
@@ -89,7 +90,7 @@
     ProcessingQueue,
 )
 from cl.scrapers.models import PACERFreeDocumentLog, PACERFreeDocumentRow
-from cl.scrapers.tasks import extract_recap_pdf, get_page_count
+from cl.scrapers.tasks import extract_recap_pdf
 from cl.search.models import (
     ClaimHistory,
     Court,
@@ -1590,13 +1591,9 @@ def update_rd_metadata(
     # request.content is sometimes a str, sometimes unicode, so
     # force it all to be bytes, pleasing hashlib.
     rd.sha1 = sha1(force_bytes(response.content))
-    with NamedTemporaryFile(
-        prefix="rd_for_page_size_",
-        suffix=".pdf",
-        buffering=0,
-    ) as tmp:
-        tmp.write(rd.filepath_local.read())
-        rd.page_count = get_page_count(tmp.name, "pdf")
+    rd.page_count = microservice(
+        service="page-count", file_type="pdf", file=rd.filepath_local.read()
+    ).content

     # Save and extract, skipping OCR.
     rd.save()
24 changes: 9 additions & 15 deletions cl/recap/tasks.py
@@ -1,6 +1,5 @@
 import logging
 from datetime import datetime
-from tempfile import NamedTemporaryFile
 from typing import Dict, List, Optional, Tuple, Union
 from zipfile import ZipFile

@@ -12,7 +11,6 @@
 from django.core.exceptions import ValidationError
 from django.core.files.base import ContentFile
 from django.core.files.uploadedfile import SimpleUploadedFile
-from django.core.serializers.json import DjangoJSONEncoder
 from django.db import IntegrityError, transaction
 from django.utils.timezone import now
 from juriscraper.lib.exceptions import PacerLoginException, ParsingException
@@ -42,6 +40,7 @@
 from cl.custom_filters.templatetags.text_filters import oxford_join
 from cl.lib.crypto import sha1
 from cl.lib.filesizes import convert_size_to_bytes
+from cl.lib.microservice_utils import microservice
 from cl.lib.pacer import map_cl_to_pacer_id
 from cl.lib.pacer_session import (
     get_or_cache_pacer_cookies,
@@ -74,7 +73,7 @@
     PacerHtmlFiles,
     ProcessingQueue,
 )
-from cl.scrapers.tasks import extract_recap_pdf, get_page_count
+from cl.scrapers.tasks import extract_recap_pdf
 from cl.search.models import Docket, DocketEntry, RECAPDocument
 from cl.search.tasks import add_items_to_solr, add_or_update_recap_docket

@@ -286,7 +285,7 @@ def process_recap_pdf(self, pk):

     # Do the file, finally.
     try:
-        content = pq.filepath_local.read()
+        file_contents = pq.filepath_local.read()
     except IOError as exc:
         msg = f"Internal processing error ({exc.errno}: {exc.strerror})."
         if (self.request.retries == self.max_retries) or pq.debug:
@@ -296,7 +295,7 @@ def process_recap_pdf(self, pk):
         mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
         raise self.retry(exc=exc)

-    new_sha1 = sha1(content)
+    new_sha1 = sha1(file_contents)
     existing_document = all(
         [
             rd.sha1 == new_sha1,
@@ -307,7 +306,7 @@ def process_recap_pdf(self, pk):
     if not existing_document:
         # Different sha1, it wasn't available, or it's missing from disk. Move
         # the new file over from the processing queue storage.
-        cf = ContentFile(content)
+        cf = ContentFile(file_contents)
         file_name = get_document_filename(
             rd.docket_entry.docket.court_id,
             rd.docket_entry.docket.pacer_case_id,
@@ -318,15 +317,10 @@
         rd.filepath_local.save(file_name, cf, save=False)

         # Do page count and extraction
-        extension = rd.filepath_local.name.split(".")[-1]
-        with NamedTemporaryFile(
-            prefix="rd_page_count_",
-            suffix=f".{extension}",
-            buffering=0,
-        ) as tmp:
-            tmp.write(content)
-            rd.page_count = get_page_count(tmp.name, extension)
-            rd.file_size = rd.filepath_local.size
+        rd.page_count = microservice(
+            service="page-count", file_type="pdf", file=file_contents
+        ).content
+        rd.file_size = rd.filepath_local.size

         rd.ocr_status = None
         rd.is_available = True
47 changes: 7 additions & 40 deletions cl/scrapers/tasks.py
@@ -1,4 +1,3 @@
-import json
 import logging
 import random
 import re
@@ -19,8 +18,6 @@
 from juriscraper.pacer import CaseQuery, PacerSession
 from lxml.etree import XMLSyntaxError
 from lxml.html.clean import Cleaner
-from PyPDF2 import PdfFileReader
-from PyPDF2.utils import PdfReadError

 from cl.audio.models import Audio
 from cl.celery_init import app
@@ -259,41 +256,6 @@ def convert_file_to_txt(path: str) -> str:
     return p.communicate()[0].decode()


-def get_page_count(path: str, extension: str) -> Optional[int]:
-    """Get the number of pages, if appropriate mimetype.
-    :param path: A path to a binary (pdf, wpd, doc, txt, html, etc.)
-    :param extension: The extension of the binary.
-    :return: The number of pages if possible, else return None
-    """
-    if extension == "pdf":
-        try:
-            reader = PdfFileReader(path)
-            return int(reader.getNumPages())
-        except (
-            IOError,
-            ValueError,
-            TypeError,
-            KeyError,
-            AssertionError,
-            PdfReadError,
-        ):
-            # IOError: File doesn't exist. My bad.
-            # ValueError: Didn't get an int for the page count. Their bad.
-            # TypeError: NumberObject has no attribute '__getitem__'. Ugh.
-            # KeyError, AssertionError: assert xrefstream["/Type"] == "/XRef". WTF?
-            # PdfReadError: Something else. I have no words.
-            pass
-    elif extension == "wpd":
-        # Best solution appears to be to dig into the binary format
-        pass
-    elif extension == "doc":
-        # Best solution appears to be to dig into the XML of the file
-        # itself: http://stackoverflow.com/a/12972502/64911
-        pass
-    return None
-
-
 def update_document_from_text(opinion: Opinion) -> None:
     """Extract additional metadata from document text
@@ -358,7 +320,8 @@ def extract_doc_content(
         buffering=0,  # Make sure it's on disk when we try to use it
     ) as tmp:
         # Get file contents from S3 and put them in a temp file.
-        tmp.write(opinion.local_path.read())
+        file_contents = opinion.local_path.read()
+        tmp.write(file_contents)

         if extension == "doc":
             content, err = extract_from_doc(tmp.name)
@@ -380,7 +343,11 @@
         return

     # Do page count, if possible
-    opinion.page_count = get_page_count(tmp.name, extension)
+    opinion.page_count = microservice(
+        service="page-count",
+        file_type=extension,
+        file=file_contents,
+    ).content

     assert isinstance(
         content, str
13 changes: 1 addition & 12 deletions poetry.lock

Some generated files are not rendered by default.

1 change: 0 additions & 1 deletion pyproject.toml
@@ -65,7 +65,6 @@ psycopg2 = "^2.8.6"
 pycparser = "^2.14"
 pyopenssl = "*"
 pyparsing = "^2.4.2"
-pypdf2 = "^1.26.0"
 python = ">=3.10, <3.11"
 python-dateutil = "^2.8.1"
 python-igraph = "0.9.1"
