From 1e37dc0a486903fdc5a2a0946c7360de93fc3051 Mon Sep 17 00:00:00 2001
From: Michael Lissner
Date: Wed, 23 Mar 2022 07:47:33 -0700
Subject: [PATCH] fix(deps): Yank dependency on pypdf2

In the following issues and PRs, PyPDF2 users have identified and offered
to fix a weird bug where the library overrides normal logging in a way
that breaks when you log certain things:

https://github.com/mstamy2/PyPDF2/issues/67
https://github.com/mstamy2/PyPDF2/pull/641
https://github.com/mstamy2/PyPDF2/pull/452
https://github.com/mstamy2/PyPDF2/pull/243

Not great, but there was probably a reason.
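For anyone who doesn't want to click through, the gist as those threads
describe it (sketched here from memory rather than quoted from PyPDF2, and
assuming the override in question is the process-wide warnings.formatwarning
swap the first issue discusses) is that merely importing the package replaces
Python's global warning formatter:

    import warnings

    stock_formatter = warnings.formatwarning

    import PyPDF2  # noqa: F401  # importing alone is enough to trigger it

    # With PyPDF2 1.26.0 installed, the global formatter has now been
    # swapped, so every later warnings.warn() call -- including warnings
    # routed through logging.captureWarnings() -- is formatted by PyPDF2's
    # helper instead of the stdlib's, which is where the breakage reported
    # in the issues above shows up.
    print(warnings.formatwarning is stock_formatter)  # False under 1.26.0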
Unfortunately, the maintainers aren't merging any of the fixes people have
provided over the years, and when I upgraded to Python 3.10, one of our
tests changed in a way that triggered this bug.

So, since the maintainers don't seem inclined to fix their own package, this
commit yanks it from CourtListener. It's good timing, really, since we now
have the microservice available, but it was disappointing to see bugs and
PRs related to this going back to 2014. Most of the fixes are one- or
two-line PRs, too. Bummer.
---
 cl/corpus_importer/tasks.py | 13 ++++------
 cl/recap/tasks.py           | 24 +++++++------------
 cl/scrapers/tasks.py        | 47 ++++++-------------------------------
 poetry.lock                 | 13 +---------
 pyproject.toml              |  1 -
 5 files changed, 22 insertions(+), 76 deletions(-)

diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py
index 692f2b2a60..4f31359384 100644
--- a/cl/corpus_importer/tasks.py
+++ b/cl/corpus_importer/tasks.py
@@ -53,6 +53,7 @@
 from cl.custom_filters.templatetags.text_filters import best_case_name
 from cl.lib.celery_utils import throttle_task
 from cl.lib.crypto import sha1
+from cl.lib.microservice_utils import microservice
 from cl.lib.pacer import (
     get_blocked_status,
     get_first_missing_de_date,
@@ -89,7 +90,7 @@
     ProcessingQueue,
 )
 from cl.scrapers.models import PACERFreeDocumentLog, PACERFreeDocumentRow
-from cl.scrapers.tasks import extract_recap_pdf, get_page_count
+from cl.scrapers.tasks import extract_recap_pdf
 from cl.search.models import (
     ClaimHistory,
     Court,
@@ -1590,13 +1591,9 @@ def update_rd_metadata(
     # request.content is sometimes a str, sometimes unicode, so
     # force it all to be bytes, pleasing hashlib.
     rd.sha1 = sha1(force_bytes(response.content))
-    with NamedTemporaryFile(
-        prefix="rd_for_page_size_",
-        suffix=".pdf",
-        buffering=0,
-    ) as tmp:
-        tmp.write(rd.filepath_local.read())
-        rd.page_count = get_page_count(tmp.name, "pdf")
+    rd.page_count = microservice(
+        service="page-count", file_type="pdf", file=rd.filepath_local.read()
+    ).content

     # Save and extract, skipping OCR.
     rd.save()
diff --git a/cl/recap/tasks.py b/cl/recap/tasks.py
index a90d82e32e..c30a61dd75 100644
--- a/cl/recap/tasks.py
+++ b/cl/recap/tasks.py
@@ -1,6 +1,5 @@
 import logging
 from datetime import datetime
-from tempfile import NamedTemporaryFile
 from typing import Dict, List, Optional, Tuple, Union
 from zipfile import ZipFile

@@ -12,7 +11,6 @@
 from django.core.exceptions import ValidationError
 from django.core.files.base import ContentFile
 from django.core.files.uploadedfile import SimpleUploadedFile
-from django.core.serializers.json import DjangoJSONEncoder
 from django.db import IntegrityError, transaction
 from django.utils.timezone import now
 from juriscraper.lib.exceptions import PacerLoginException, ParsingException
@@ -42,6 +40,7 @@
 from cl.custom_filters.templatetags.text_filters import oxford_join
 from cl.lib.crypto import sha1
 from cl.lib.filesizes import convert_size_to_bytes
+from cl.lib.microservice_utils import microservice
 from cl.lib.pacer import map_cl_to_pacer_id
 from cl.lib.pacer_session import (
     get_or_cache_pacer_cookies,
@@ -74,7 +73,7 @@
     PacerHtmlFiles,
     ProcessingQueue,
 )
-from cl.scrapers.tasks import extract_recap_pdf, get_page_count
+from cl.scrapers.tasks import extract_recap_pdf
 from cl.search.models import Docket, DocketEntry, RECAPDocument
 from cl.search.tasks import add_items_to_solr, add_or_update_recap_docket

@@ -286,7 +285,7 @@ def process_recap_pdf(self, pk):

     # Do the file, finally.
     try:
-        content = pq.filepath_local.read()
+        file_contents = pq.filepath_local.read()
     except IOError as exc:
         msg = f"Internal processing error ({exc.errno}: {exc.strerror})."
         if (self.request.retries == self.max_retries) or pq.debug:
@@ -296,7 +295,7 @@
             mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
             raise self.retry(exc=exc)

-    new_sha1 = sha1(content)
+    new_sha1 = sha1(file_contents)
     existing_document = all(
         [
             rd.sha1 == new_sha1,
@@ -307,7 +306,7 @@
     if not existing_document:
         # Different sha1, it wasn't available, or it's missing from disk. Move
         # the new file over from the processing queue storage.
-        cf = ContentFile(content)
+        cf = ContentFile(file_contents)
         file_name = get_document_filename(
             rd.docket_entry.docket.court_id,
             rd.docket_entry.docket.pacer_case_id,
@@ -318,15 +317,10 @@
         rd.filepath_local.save(file_name, cf, save=False)

         # Do page count and extraction
-        extension = rd.filepath_local.name.split(".")[-1]
-        with NamedTemporaryFile(
-            prefix="rd_page_count_",
-            suffix=f".{extension}",
-            buffering=0,
-        ) as tmp:
-            tmp.write(content)
-            rd.page_count = get_page_count(tmp.name, extension)
-            rd.file_size = rd.filepath_local.size
+        rd.page_count = microservice(
+            service="page-count", file_type="pdf", file=file_contents
+        ).content
+        rd.file_size = rd.filepath_local.size
         rd.ocr_status = None
         rd.is_available = True

diff --git a/cl/scrapers/tasks.py b/cl/scrapers/tasks.py
index 32a07f606f..0a73e4a75d 100644
--- a/cl/scrapers/tasks.py
+++ b/cl/scrapers/tasks.py
@@ -1,4 +1,3 @@
-import json
 import logging
 import random
 import re
@@ -19,8 +18,6 @@
 from juriscraper.pacer import CaseQuery, PacerSession
 from lxml.etree import XMLSyntaxError
 from lxml.html.clean import Cleaner
-from PyPDF2 import PdfFileReader
-from PyPDF2.utils import PdfReadError

 from cl.audio.models import Audio
 from cl.celery_init import app
@@ -259,41 +256,6 @@ def convert_file_to_txt(path: str) -> str:
     return p.communicate()[0].decode()


-def get_page_count(path: str, extension: str) -> Optional[int]:
-    """Get the number of pages, if appropriate mimetype.
-
-    :param path: A path to a binary (pdf, wpd, doc, txt, html, etc.)
-    :param extension: The extension of the binary.
-    :return: The number of pages if possible, else return None
-    """
-    if extension == "pdf":
-        try:
-            reader = PdfFileReader(path)
-            return int(reader.getNumPages())
-        except (
-            IOError,
-            ValueError,
-            TypeError,
-            KeyError,
-            AssertionError,
-            PdfReadError,
-        ):
-            # IOError: File doesn't exist. My bad.
-            # ValueError: Didn't get an int for the page count. Their bad.
-            # TypeError: NumberObject has no attribute '__getitem__'. Ugh.
-            # KeyError, AssertionError: assert xrefstream["/Type"] == "/XRef". WTF?
-            # PdfReadError: Something else. I have no words.
-            pass
-    elif extension == "wpd":
-        # Best solution appears to be to dig into the binary format
-        pass
-    elif extension == "doc":
-        # Best solution appears to be to dig into the XML of the file
-        # itself: http://stackoverflow.com/a/12972502/64911
-        pass
-    return None
-
-
 def update_document_from_text(opinion: Opinion) -> None:
     """Extract additional metadata from document text

@@ -358,7 +320,8 @@ def extract_doc_content(
         buffering=0,  # Make sure it's on disk when we try to use it
     ) as tmp:
         # Get file contents from S3 and put them in a temp file.
-        tmp.write(opinion.local_path.read())
+        file_contents = opinion.local_path.read()
+        tmp.write(file_contents)

         if extension == "doc":
             content, err = extract_from_doc(tmp.name)
@@ -380,7 +343,11 @@
             return

         # Do page count, if possible
-        opinion.page_count = get_page_count(tmp.name, extension)
+        opinion.page_count = microservice(
+            service="page-count",
+            file_type=extension,
+            file=file_contents,
+        ).content

     assert isinstance(
         content, str
diff --git a/poetry.lock b/poetry.lock
index e5cbe709e7..0105e884a2 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1418,14 +1418,6 @@ category = "main"
 optional = false
 python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*"

-[[package]]
-name = "pypdf2"
-version = "1.26.0"
-description = "PDF toolkit"
-category = "main"
-optional = false
-python-versions = "*"
-
 [[package]]
 name = "pystemmer"
 version = "2.0.1"
@@ -1988,7 +1980,7 @@ flp = ["reporters-db", "juriscraper"]
 [metadata]
 lock-version = "1.1"
 python-versions = ">=3.10, <3.11"
-content-hash = "7ec249acd058a5511848443d26c2580e8715154338b8788f49f96d5f2f200aa7"
+content-hash = "3de89e8e06d8268e494a38e38cdb3f55f68537d7f761697e2c96b3d7c3eb92f2"

 [metadata.files]
 amqp = [
@@ -2799,9 +2791,6 @@ pyparsing = [
     {file = "pyparsing-2.4.7-py2.py3-none-any.whl", hash = "sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b"},
     {file = "pyparsing-2.4.7.tar.gz", hash = "sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1"},
 ]
-pypdf2 = [
-    {file = "PyPDF2-1.26.0.tar.gz", hash = "sha256:e28f902f2f0a1603ea95ebe21dff311ef09be3d0f0ef29a3e44a932729564385"},
-]
 pystemmer = [
     {file = "PyStemmer-2.0.1.tar.gz", hash = "sha256:9b81c35302f1d2a5ad9465b85986db246990db93d97d3e8f129269ed7102788e"},
 ]
diff --git a/pyproject.toml b/pyproject.toml
index 96f71fded0..293cf077fd 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -65,7 +65,6 @@ psycopg2 = "^2.8.6"
 pycparser = "^2.14"
 pyopenssl = "*"
 pyparsing = "^2.4.2"
-pypdf2 = "^1.26.0"
 python = ">=3.10, <3.11"
 python-dateutil = "^2.8.1"
 python-igraph = "0.9.1"