Skip to content

Commit

Permalink
fix(pacer): Use r.content instead of r.text
Browse files Browse the repository at this point in the history
It turns out that r.text makes calls to chardet each time it is called. That's
not great because chardet can be slow and use a lot of memory, particularly
when checking PDFs.

Instead of doing that or checking if things are PDFs all the time, simply use
the binary content instead of the text.

Fixes: #564
Relates to: psf/requests#6250
  • Loading branch information
mlissner committed Sep 29, 2022
1 parent 1f925e7 commit b2a4781
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 36 deletions.
41 changes: 21 additions & 20 deletions juriscraper/pacer/http.py
Expand Up @@ -14,34 +14,33 @@
requests.packages.urllib3.disable_warnings(exceptions.InsecureRequestWarning)


def check_if_logged_in_page(text):
def check_if_logged_in_page(content: bytes) -> bool:
"""Is this a valid HTML page from PACER?
Check if the html in 'text' is from a valid PACER page or valid PACER XML
document, or if it's from a page telling you to log in or informing you
Check if the data in 'content' is from a valid PACER page or valid PACER
XML document, or if it's from a page telling you to log in or informing you
that you're not logged in.
:param text: The HTML or XML of the page to test
:param content: The data to test, of type bytes. This uses bytes to avoid
converting data to text using an unknown encoding. (see #564)
:return boolean: True if logged in, False if not.
"""
if isinstance(text, bytes):
text = text.decode("utf-8")

valid_case_number_query = (
"<case number=" in text
or "<request number=" in text
or 'id="caseid"' in text
or "Cost: " in text
b"<case number=" in content
or b"<request number=" in content
or b'id="caseid"' in content
or b"Cost: " in content
)
no_results_case_number_query = re.search("<message.*Cannot find", text)
sealed_case_query = re.search("<message.*Case Under Seal", text)
no_results_case_number_query = re.search(b"<message.*Cannot find", content)
sealed_case_query = re.search(b"<message.*Case Under Seal", content)
if any(
[
valid_case_number_query,
no_results_case_number_query,
sealed_case_query,
]
):
not_logged_in = re.search("text.*Not logged in", text)
not_logged_in = re.search(b"text.*Not logged in", content)
if not_logged_in:
# An unauthenticated PossibleCaseNumberApi XML result. Simply
# continue onwards. The complete result looks like:
Expand All @@ -55,21 +54,23 @@ def check_if_logged_in_page(text):

# Detect if we are logged in. If so, there is no need to log in again. If
# not, we log in again below.
found_district_logout_link = "/cgi-bin/login.pl?logout" in text
found_appellate_logout_link = "InvalidUserLogin.jsp" in text
found_district_logout_link = b"/cgi-bin/login.pl?logout" in content
found_appellate_logout_link = b"InvalidUserLogin.jsp" in content

# A download confirmation page doesn't contain a logout link, but it still
# means we're logged in.
is_a_download_confirmation_page = "Download Confirmation" in text
is_a_download_confirmation_page = b"Download Confirmation" in content
# When looking for a download confirmation page sometimes an appellate
# attachment page is returned instead, see:
# https://ecf.ca8.uscourts.gov/n/beam/servlet/TransportRoom?servlet=ShowDoc&pacer=i&dls_id=00802251695
appellate_attachment_page = "Documents are attached to this filing" in text
appellate_attachment_page = (
b"Documents are attached to this filing" in content
)
# Sometimes the document is completely unavailable and an error message is
# shown, see:
# https://ecf.ca11.uscourts.gov/n/beam/servlet/TransportRoom?servlet=ShowDoc/009033568259
appellate_document_error = (
"The requested document cannot be displayed" in text
b"The requested document cannot be displayed" in content
)
if any(
[
Expand Down Expand Up @@ -134,7 +135,7 @@ def get(self, url, auto_login=True, **kwargs):

r = super().get(url, **kwargs)

if "This user has no access privileges defined." in r.text:
if b"This user has no access privileges defined." in r.content:
# This is a strange error that we began seeing in CM/ECF 6.3.1 at
# ILND. You can currently reproduce it by logging in on the central
# login page, selecting "Court Links" as your destination, and then
Expand Down Expand Up @@ -370,7 +371,7 @@ def _login_again(self, r):
if is_text(r):
return False

logged_in = check_if_logged_in_page(r.text)
logged_in = check_if_logged_in_page(r.content)
if logged_in:
return False

Expand Down
29 changes: 15 additions & 14 deletions juriscraper/pacer/reports.py
Expand Up @@ -35,8 +35,6 @@ def re_xpath(self, path):
class BaseReport:
"""A base report for working with pages on PACER."""

REDIRECT_REGEX = re.compile(r'window\.\s*?location\s*=\s*"(.*)"\s*;')

# Subclasses should override PATH
PATH = ""

Expand Down Expand Up @@ -229,7 +227,8 @@ def download_pdf(
pacer_case_id, pacer_doc_id, pacer_magic_num, got_receipt="1"
)

if "Cannot locate the case with caseid" in r.text:
# Use r.content instead of r.text for performance. See #564
if b"Cannot locate the case with caseid" in r.content:
# This document is from a different docket, but is included in
# this docket. Probably a criminal case with the doppelganger
# bug. Try again, but do so without the pacer_case_id.
Expand All @@ -239,12 +238,12 @@ def download_pdf(
)

error = None
if "could not retrieve dktentry for dlsid" in r.text:
if b"could not retrieve dktentry for dlsid" in r.content:
error = (
f"Failed to get docket entry in case: "
f"{pacer_case_id=} at {url}"
)
if "document is not available" in r.text:
if b"document is not available" in r.content:
# See: https://ecf.akb.uscourts.gov/doc1/02211536343
# See: https://ecf.ksd.uscourts.gov/doc1/07912639735
# Matches against:
Expand All @@ -255,38 +254,39 @@ def download_pdf(
f"{pacer_case_id=} at {url}"
)
if re.search(
r"You do not have permission to view\s+this document.", r.text
rb"You do not have permission to view\s+this document.",
r.content,
):
error = (
f"Permission denied getting document. It's probably "
f"sealed. {pacer_case_id=}, {url=}"
)
if "You do not have access to this transcript." in r.text:
if b"You do not have access to this transcript." in r.content:
error = f"Unable to get transcript. {pacer_case_id=}, {url=}"
if "Sealed Document" in r.text or "Under Seal" in r.text:
if b"Sealed Document" in r.content or b"Under Seal" in r.content:
# See: https://ecf.almd.uscourts.gov/doc1/01712589088
# See: https://ecf.cand.uscourts.gov/doc1/035122021132
# Matches against:
# "Sealed Document" and
# "This document is currently Under Seal and not available..."
error = f"Document is sealed: {pacer_case_id=} {url=}"
if (
"This image is not available for viewing by non-court users"
in r.text
b"This image is not available for viewing by non-court users"
in r.content
):
# See: https://ecf.wvsd.uscourts.gov/doc1/20115419289
error = (
f"Image not available for viewing by non-court users. "
f"{pacer_case_id=}, {url=}"
)
if "A Client Code is required for PACER search" in r.text:
if b"A Client Code is required for PACER search" in r.content:
error = (
f"Unable to get document. Client code required: "
f"{pacer_case_id=}, {url=}"
)
if (
"Permission to view this document is denied based on Nature of Suit"
in r.text
b"Permission to view this document is denied based on Nature of Suit"
in r.content
):
# See: https://ecf.cacd.uscourts.gov/doc1/031134206600
error = (
Expand All @@ -301,7 +301,8 @@ def download_pdf(
# Some pacer sites use window.location in their JS, so we have to
# look for that. See: oknd, 13-cv-00357-JED-FHM, doc #24. But, be
# warned, you can only catch the redirection with JS off.
m = self.REDIRECT_REGEX.search(r.text)
redirect_re = re.compile(rb'window\.\s*?location\s*=\s*"(.*)"\s*;')
m = redirect_re.search(r.content)
if m is not None:
r = self.session.get(urljoin(url, m.group(1)))
r.raise_for_status()
Expand Down
4 changes: 2 additions & 2 deletions tests/local/test_PacerNeedLoginTest.py
Expand Up @@ -31,9 +31,9 @@ def parse_files(self, path_root, file_ext):
json_path = os.path.join(dirname, f"{filename_sans_ext}.json")

with open(path, "rb") as f:
text = f.read()
content = f.read()

result = check_if_logged_in_page(text)
result = check_if_logged_in_page(content)

if not os.path.exists(json_path):
with open(json_path, "w") as f:
Expand Down

0 comments on commit b2a4781

Please sign in to comment.