fix(deps): Yank dependency on pypdf2
In the following issues and PRs, PyPDF2 users have identified, and
offered fixes for, a weird bug where the library overrides normal
logging in a way that breaks when you log certain things (a sketch of
the failure mode follows the list):

py-pdf/pypdf#67
py-pdf/pypdf#641
py-pdf/pypdf#452
py-pdf/pypdf#243
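
A minimal sketch of the failure mode, as I understand it from those
reports (reconstructed from the issues above, not code from this repo):
PyPDF2 1.x appears to replace warnings.formatwarning at import time with
a formatter that assumes every warning's filename contains a path
separator.

    # Hypothetical reproduction, assuming the global override the
    # linked issues describe. Importing PyPDF2 alone installs it:
    import warnings

    import PyPDF2  # noqa: F401

    # PyPDF2's replacement formatter does roughly:
    #     file = filename.replace("/", "\\").rsplit("\\", 1)[1]
    # so any warning whose filename lacks a path separator crashes,
    # e.g. a warning attributed to a bare module name:
    warnings.warn_explicit("oops", UserWarning, "sys", 1)  # IndexError

The unmerged fixes appear to be about as small as guarding that lookup,
e.g. with os.path.basename(filename) instead of the manual rsplit.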

Not great, but there was probably a reason. Unfortunately, the
maintainers haven't merged any of the fixes people have provided over
the years, and when I upgraded to Python 3.10, one of our tests changed
in a way that triggered this bug.

So, since the maintainers don't seem inclined to fix their own
package, this commit yanks it from CourtListener. The timing is good,
really, since we now have the microservice available, but it was
disappointing to see bugs and PRs related to this going back to 2014.
Most of the fixes are one- or two-line PRs, too. Bummer.
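
The diff below swaps get_page_count() for a call to the new
microservice helper from cl.lib.microservice_utils. For orientation, a
minimal sketch of what a helper with that signature could look like; it
is a hypothetical stand-in that assumes an HTTP service returning the
count in the response body, and the URL and internals are assumptions,
not the real implementation:

    import requests

    # Hypothetical endpoint map; the real service URL is an assumption.
    MICROSERVICE_URLS = {
        "page-count": "http://doctor:5050/utils/page-count/{file_type}/",
    }

    def microservice(
        service: str, file_type: str, file: bytes
    ) -> requests.Response:
        """POST a document to the named service; callers read .content."""
        url = MICROSERVICE_URLS[service].format(file_type=file_type)
        return requests.post(
            url,
            files={"file": (f"document.{file_type}", file)},
            timeout=60,
        )

Callers then read the count off the response, as the diff does, e.g.
rd.page_count = microservice(service="page-count", file_type="pdf",
file=pdf_bytes).content.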
mlissner committed Mar 23, 2022
1 parent c605292 commit 1e37dc0
Showing 5 changed files with 22 additions and 76 deletions.
13 changes: 5 additions & 8 deletions cl/corpus_importer/tasks.py
@@ -53,6 +53,7 @@
 from cl.custom_filters.templatetags.text_filters import best_case_name
 from cl.lib.celery_utils import throttle_task
 from cl.lib.crypto import sha1
+from cl.lib.microservice_utils import microservice
 from cl.lib.pacer import (
     get_blocked_status,
     get_first_missing_de_date,
@@ -89,7 +90,7 @@
     ProcessingQueue,
 )
 from cl.scrapers.models import PACERFreeDocumentLog, PACERFreeDocumentRow
-from cl.scrapers.tasks import extract_recap_pdf, get_page_count
+from cl.scrapers.tasks import extract_recap_pdf
 from cl.search.models import (
     ClaimHistory,
     Court,
@@ -1590,13 +1591,9 @@ def update_rd_metadata(
     # request.content is sometimes a str, sometimes unicode, so
     # force it all to be bytes, pleasing hashlib.
     rd.sha1 = sha1(force_bytes(response.content))
-    with NamedTemporaryFile(
-        prefix="rd_for_page_size_",
-        suffix=".pdf",
-        buffering=0,
-    ) as tmp:
-        tmp.write(rd.filepath_local.read())
-        rd.page_count = get_page_count(tmp.name, "pdf")
+    rd.page_count = microservice(
+        service="page-count", file_type="pdf", file=rd.filepath_local.read()
+    ).content

     # Save and extract, skipping OCR.
     rd.save()
24 changes: 9 additions & 15 deletions cl/recap/tasks.py
@@ -1,6 +1,5 @@
 import logging
 from datetime import datetime
-from tempfile import NamedTemporaryFile
 from typing import Dict, List, Optional, Tuple, Union
 from zipfile import ZipFile

@@ -12,7 +11,6 @@
 from django.core.exceptions import ValidationError
 from django.core.files.base import ContentFile
 from django.core.files.uploadedfile import SimpleUploadedFile
-from django.core.serializers.json import DjangoJSONEncoder
 from django.db import IntegrityError, transaction
 from django.utils.timezone import now
 from juriscraper.lib.exceptions import PacerLoginException, ParsingException
@@ -42,6 +40,7 @@
 from cl.custom_filters.templatetags.text_filters import oxford_join
 from cl.lib.crypto import sha1
 from cl.lib.filesizes import convert_size_to_bytes
+from cl.lib.microservice_utils import microservice
 from cl.lib.pacer import map_cl_to_pacer_id
 from cl.lib.pacer_session import (
     get_or_cache_pacer_cookies,
@@ -74,7 +73,7 @@
     PacerHtmlFiles,
     ProcessingQueue,
 )
-from cl.scrapers.tasks import extract_recap_pdf, get_page_count
+from cl.scrapers.tasks import extract_recap_pdf
 from cl.search.models import Docket, DocketEntry, RECAPDocument
 from cl.search.tasks import add_items_to_solr, add_or_update_recap_docket

@@ -286,7 +285,7 @@ def process_recap_pdf(self, pk):

     # Do the file, finally.
     try:
-        content = pq.filepath_local.read()
+        file_contents = pq.filepath_local.read()
     except IOError as exc:
         msg = f"Internal processing error ({exc.errno}: {exc.strerror})."
         if (self.request.retries == self.max_retries) or pq.debug:
@@ -296,7 +295,7 @@ def process_recap_pdf(self, pk):
         mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
         raise self.retry(exc=exc)

-    new_sha1 = sha1(content)
+    new_sha1 = sha1(file_contents)
     existing_document = all(
         [
             rd.sha1 == new_sha1,
@@ -307,7 +306,7 @@ def process_recap_pdf(self, pk):
     if not existing_document:
         # Different sha1, it wasn't available, or it's missing from disk. Move
         # the new file over from the processing queue storage.
-        cf = ContentFile(content)
+        cf = ContentFile(file_contents)
         file_name = get_document_filename(
             rd.docket_entry.docket.court_id,
             rd.docket_entry.docket.pacer_case_id,
@@ -318,15 +317,10 @@
         rd.filepath_local.save(file_name, cf, save=False)

         # Do page count and extraction
-        extension = rd.filepath_local.name.split(".")[-1]
-        with NamedTemporaryFile(
-            prefix="rd_page_count_",
-            suffix=f".{extension}",
-            buffering=0,
-        ) as tmp:
-            tmp.write(content)
-            rd.page_count = get_page_count(tmp.name, extension)
-            rd.file_size = rd.filepath_local.size
+        rd.page_count = microservice(
+            service="page-count", file_type="pdf", file=file_contents
+        ).content
+        rd.file_size = rd.filepath_local.size

         rd.ocr_status = None
         rd.is_available = True
47 changes: 7 additions & 40 deletions cl/scrapers/tasks.py
@@ -1,4 +1,3 @@
-import json
 import logging
 import random
 import re
@@ -19,8 +18,6 @@
 from juriscraper.pacer import CaseQuery, PacerSession
 from lxml.etree import XMLSyntaxError
 from lxml.html.clean import Cleaner
-from PyPDF2 import PdfFileReader
-from PyPDF2.utils import PdfReadError

 from cl.audio.models import Audio
 from cl.celery_init import app
@@ -259,41 +256,6 @@ def convert_file_to_txt(path: str) -> str:
     return p.communicate()[0].decode()


-def get_page_count(path: str, extension: str) -> Optional[int]:
-    """Get the number of pages, if appropriate mimetype.
-    :param path: A path to a binary (pdf, wpd, doc, txt, html, etc.)
-    :param extension: The extension of the binary.
-    :return: The number of pages if possible, else return None
-    """
-    if extension == "pdf":
-        try:
-            reader = PdfFileReader(path)
-            return int(reader.getNumPages())
-        except (
-            IOError,
-            ValueError,
-            TypeError,
-            KeyError,
-            AssertionError,
-            PdfReadError,
-        ):
-            # IOError: File doesn't exist. My bad.
-            # ValueError: Didn't get an int for the page count. Their bad.
-            # TypeError: NumberObject has no attribute '__getitem__'. Ugh.
-            # KeyError, AssertionError: assert xrefstream["/Type"] == "/XRef". WTF?
-            # PdfReadError: Something else. I have no words.
-            pass
-    elif extension == "wpd":
-        # Best solution appears to be to dig into the binary format
-        pass
-    elif extension == "doc":
-        # Best solution appears to be to dig into the XML of the file
-        # itself: http://stackoverflow.com/a/12972502/64911
-        pass
-    return None
-
-
 def update_document_from_text(opinion: Opinion) -> None:
     """Extract additional metadata from document text
@@ -358,7 +320,8 @@ def extract_doc_content(
         buffering=0,  # Make sure it's on disk when we try to use it
     ) as tmp:
         # Get file contents from S3 and put them in a temp file.
-        tmp.write(opinion.local_path.read())
+        file_contents = opinion.local_path.read()
+        tmp.write(file_contents)

         if extension == "doc":
             content, err = extract_from_doc(tmp.name)
@@ -380,7 +343,11 @@
         return

     # Do page count, if possible
-    opinion.page_count = get_page_count(tmp.name, extension)
+    opinion.page_count = microservice(
+        service="page-count",
+        file_type=extension,
+        file=file_contents,
+    ).content

     assert isinstance(
         content, str
13 changes: 1 addition & 12 deletions poetry.lock

Some generated files are not rendered by default.

1 change: 0 additions & 1 deletion pyproject.toml
@@ -65,7 +65,6 @@ psycopg2 = "^2.8.6"
 pycparser = "^2.14"
 pyopenssl = "*"
 pyparsing = "^2.4.2"
-pypdf2 = "^1.26.0"
 python = ">=3.10, <3.11"
 python-dateutil = "^2.8.1"
 python-igraph = "0.9.1"
