fix(pacer): Use r.content instead of r.text

It turns out that r.text invokes chardet each time it is accessed. That's not great because chardet can be slow and use a lot of memory, particularly when checking PDFs. Rather than accepting that cost, or checking whether each response is a PDF before reading its text, simply match against the binary content (r.content) instead of the decoded text. Fixes: #564. Relates to: psf/requests#6250.
freelawproject · Sep 29, 2022 · b2a4781 · b2a4781
1 parent 1f925e7
commit b2a4781
Show file tree

Hide file tree

Showing 3 changed files with 38 additions and 36 deletions.
diff --git a/juriscraper/pacer/http.py b/juriscraper/pacer/http.py
@@ -14,34 +14,33 @@
 requests.packages.urllib3.disable_warnings(exceptions.InsecureRequestWarning)
 
 
-def check_if_logged_in_page(text):
+def check_if_logged_in_page(content: bytes) -> bool:
     """Is this a valid HTML page from PACER?
 
-    Check if the html in 'text' is from a valid PACER page or valid PACER XML
-    document, or if it's from a page telling you to log in or informing you
+    Check if the data in 'content' is from a valid PACER page or valid PACER
+    XML document, or if it's from a page telling you to log in or informing you
     that you're not logged in.
-    :param text: The HTML or XML of the page to test
+    :param content: The data to test, of type bytes. This uses bytes to avoid
+    converting data to text using an unknown encoding. (see #564)
     :return boolean: True if logged in, False if not.
     """
-    if isinstance(text, bytes):
-        text = text.decode("utf-8")
 
     valid_case_number_query = (
-        "<case number=" in text
-        or "<request number=" in text
-        or 'id="caseid"' in text
-        or "Cost: " in text
+        b"<case number=" in content
+        or b"<request number=" in content
+        or b'id="caseid"' in content
+        or b"Cost: " in content
     )
-    no_results_case_number_query = re.search("<message.*Cannot find", text)
-    sealed_case_query = re.search("<message.*Case Under Seal", text)
+    no_results_case_number_query = re.search(b"<message.*Cannot find", content)
+    sealed_case_query = re.search(b"<message.*Case Under Seal", content)
     if any(
         [
             valid_case_number_query,
             no_results_case_number_query,
             sealed_case_query,
         ]
     ):
-        not_logged_in = re.search("text.*Not logged in", text)
+        not_logged_in = re.search(b"text.*Not logged in", content)
         if not_logged_in:
             # An unauthenticated PossibleCaseNumberApi XML result. Simply
             # continue onwards. The complete result looks like:
@@ -55,21 +54,23 @@ def check_if_logged_in_page(text):
 
     # Detect if we are logged in. If so, no need to do so. If not, we login
     # again below.
-    found_district_logout_link = "/cgi-bin/login.pl?logout" in text
-    found_appellate_logout_link = "InvalidUserLogin.jsp" in text
+    found_district_logout_link = b"/cgi-bin/login.pl?logout" in content
+    found_appellate_logout_link = b"InvalidUserLogin.jsp" in content
 
     # A download confirmation page doesn't contain a logout link but we're
     # logged into.
-    is_a_download_confirmation_page = "Download Confirmation" in text
+    is_a_download_confirmation_page = b"Download Confirmation" in content
     # When looking for a download confirmation page sometimes an appellate
     # attachment page is returned instead, see:
     # https://ecf.ca8.uscourts.gov/n/beam/servlet/TransportRoom?servlet=ShowDoc&pacer=i&dls_id=00802251695
-    appellate_attachment_page = "Documents are attached to this filing" in text
+    appellate_attachment_page = (
+        b"Documents are attached to this filing" in content
+    )
     # Sometimes the document is completely unavailable and an error message is
     # shown, see:
     # https://ecf.ca11.uscourts.gov/n/beam/servlet/TransportRoom?servlet=ShowDoc/009033568259
     appellate_document_error = (
-        "The requested document cannot be displayed" in text
+        b"The requested document cannot be displayed" in content
     )
     if any(
         [
@@ -134,7 +135,7 @@ def get(self, url, auto_login=True, **kwargs):
 
         r = super().get(url, **kwargs)
 
-        if "This user has no access privileges defined." in r.text:
+        if b"This user has no access privileges defined." in r.content:
             # This is a strange error that we began seeing in CM/ECF 6.3.1 at
             # ILND. You can currently reproduce it by logging in on the central
             # login page, selecting "Court Links" as your destination, and then
@@ -370,7 +371,7 @@ def _login_again(self, r):
         if is_text(r):
             return False
 
-        logged_in = check_if_logged_in_page(r.text)
+        logged_in = check_if_logged_in_page(r.content)
         if logged_in:
             return False
 

diff --git a/juriscraper/pacer/reports.py b/juriscraper/pacer/reports.py
@@ -35,8 +35,6 @@ def re_xpath(self, path):
 class BaseReport:
     """A base report for working with pages on PACER."""
 
-    REDIRECT_REGEX = re.compile(r'window\.\s*?location\s*=\s*"(.*)"\s*;')
-
     # Subclasses should override PATH
     PATH = ""
 
@@ -229,7 +227,8 @@ def download_pdf(
                 pacer_case_id, pacer_doc_id, pacer_magic_num, got_receipt="1"
             )
 
-            if "Cannot locate the case with caseid" in r.text:
+            # Use r.content instead of r.text for performance. See #564
+            if b"Cannot locate the case with caseid" in r.content:
                 # This document is from a different docket, but is included in
                 # this docket. Probably a criminal case with the doppelganger
                 # bug. Try again, but do so without the pacer_case_id.
@@ -239,12 +238,12 @@ def download_pdf(
                 )
 
             error = None
-            if "could not retrieve dktentry for dlsid" in r.text:
+            if b"could not retrieve dktentry for dlsid" in r.content:
                 error = (
                     f"Failed to get docket entry in case: "
                     f"{pacer_case_id=} at {url}"
                 )
-            if "document is not available" in r.text:
+            if b"document is not available" in r.content:
                 # See: https://ecf.akb.uscourts.gov/doc1/02211536343
                 # See: https://ecf.ksd.uscourts.gov/doc1/07912639735
                 # Matches against:
@@ -255,38 +254,39 @@ def download_pdf(
                     f"{pacer_case_id=} at {url}"
                 )
             if re.search(
-                r"You do not have permission to view\s+this document.", r.text
+                rb"You do not have permission to view\s+this document.",
+                r.content,
             ):
                 error = (
                     f"Permission denied getting document. It's probably "
                     f"sealed. {pacer_case_id=}, {url=}"
                 )
-            if "You do not have access to this transcript." in r.text:
+            if b"You do not have access to this transcript." in r.content:
                 error = f"Unable to get transcript. {pacer_case_id=}, {url=}"
-            if "Sealed Document" in r.text or "Under Seal" in r.text:
+            if b"Sealed Document" in r.content or b"Under Seal" in r.content:
                 # See: https://ecf.almd.uscourts.gov/doc1/01712589088
                 # See: https://ecf.cand.uscourts.gov/doc1/035122021132
                 # Matches against:
                 # "Sealed Document" and
                 # "This document is currently Under Seal and not available..."
                 error = f"Document is sealed: {pacer_case_id=} {url=}"
             if (
-                "This image is not available for viewing by non-court users"
-                in r.text
+                b"This image is not available for viewing by non-court users"
+                in r.content
             ):
                 # See: https://ecf.wvsd.uscourts.gov/doc1/20115419289
                 error = (
                     f"Image not available for viewing by non-court users. "
                     f"{pacer_case_id=}, {url=}"
                 )
-            if "A Client Code is required for PACER search" in r.text:
+            if b"A Client Code is required for PACER search" in r.content:
                 error = (
                     f"Unable to get document. Client code required: "
                     f"{pacer_case_id=}, {url=}"
                 )
             if (
-                "Permission to view this document is denied based on Nature of Suit"
-                in r.text
+                b"Permission to view this document is denied based on Nature of Suit"
+                in r.content
             ):
                 # See: https://ecf.cacd.uscourts.gov/doc1/031134206600
                 error = (
@@ -301,7 +301,8 @@ def download_pdf(
             # Some pacer sites use window.location in their JS, so we have to
             # look for that. See: oknd, 13-cv-00357-JED-FHM, doc #24. But, be
             # warned, you can only catch the redirection with JS off.
-            m = self.REDIRECT_REGEX.search(r.text)
+            redirect_re = re.compile(rb'window\.\s*?location\s*=\s*"(.*)"\s*;')
+            m = redirect_re.search(r.content)
             if m is not None:
                 r = self.session.get(urljoin(url, m.group(1)))
                 r.raise_for_status()

diff --git a/tests/local/test_PacerNeedLoginTest.py b/tests/local/test_PacerNeedLoginTest.py
@@ -31,9 +31,9 @@ def parse_files(self, path_root, file_ext):
             json_path = os.path.join(dirname, f"{filename_sans_ext}.json")
 
             with open(path, "rb") as f:
-                text = f.read()
+                content = f.read()
 
-            result = check_if_logged_in_page(text)
+            result = check_if_logged_in_page(content)
 
             if not os.path.exists(json_path):
                 with open(json_path, "w") as f: