From 4d164d5053a65a4debe6df6d934899b4439a65d1 Mon Sep 17 00:00:00 2001
From: Jeremy Singer-Vine
Date: Mon, 16 May 2022 19:46:57 -0400
Subject: [PATCH 1/2] Parse data from NY's historical PDFs

Responding to the call-out here:
https://github.com/biglocalnews/warn-scraper/issues/476

This being my first commit to the project, and not knowing how the
maintainers would like to handle the overlap between the data sources,
I tried to take the least destructive approach.
---
 .pre-commit-config.yaml |  4 +--
 warn/scrapers/ny.py     | 65 ++++++++++++++++++++++++++++++++++++++---
 2 files changed, 63 insertions(+), 6 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index bc0a7578..46aa0cf3 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -15,12 +15,12 @@ repos:
       - id: mixed-line-ending
 
   - repo: https://github.com/psf/black
-    rev: 21.12b0
+    rev: 22.3.0
     hooks:
       - id: black
 
   - repo: https://github.com/asottile/blacken-docs
-    rev: v1.12.0
+    rev: v1.12.1
     hooks:
       - id: blacken-docs
         additional_dependencies: [black]
diff --git a/warn/scrapers/ny.py b/warn/scrapers/ny.py
index 4d149817..2e5b7dac 100644
--- a/warn/scrapers/ny.py
+++ b/warn/scrapers/ny.py
@@ -1,14 +1,16 @@
 import logging
+import re
 from pathlib import Path
 
+import pdfplumber
 from bs4 import BeautifulSoup
 from openpyxl import load_workbook
 
 from .. import utils
 from ..cache import Cache
 
-__authors__ = ["zstumgoren", "Dilcia19", "ydoc5212", "palewire"]
-__tags__ = ["historical", "excel"]
+__authors__ = ["zstumgoren", "Dilcia19", "ydoc5212", "palewire", "jsvine"]
+__tags__ = ["historical", "excel", "pdf"]
 __source__ = {
     "name": "New York Department of Labor",
     "url": "https://dol.ny.gov/warn-notices",
@@ -46,12 +48,19 @@ def scrape(
     # Get the historical static data file
     excel_row_list = _get_historical_data(cache)
 
+    # Get data from the historical PDFs
+    pdf_row_list = _get_historical_pdf_data(cache)
+
     # Set the export path
     data_path = data_dir / "ny.csv"
 
     # Combine and write out the file
-    fieldnames = list(html_row_list[0].keys()) + list(excel_row_list[0].keys())
-    row_list = html_row_list + excel_row_list
+    fieldnames = (
+        list(html_row_list[0].keys())
+        + list(excel_row_list[0].keys())
+        + list(pdf_row_list[0].keys())
+    )
+    row_list = html_row_list + excel_row_list + pdf_row_list
     utils.write_dict_rows_to_csv(
         data_path,
         fieldnames,
@@ -125,5 +134,53 @@ def _get_historical_data(cache):
     return dict_list
 
 
+def _get_historical_pdf_data(cache):
+    # See https://github.com/biglocalnews/warn-scraper/issues/476
+    urls = (
+        "https://github.com/biglocalnews/warn-scraper/files/8400324/FL-22-0165.Records.for.Release_Part1.pdf",
+        "https://github.com/biglocalnews/warn-scraper/files/8400325/FL-22-0165.Records.for.Release_Part2.pdf",
+        "https://github.com/biglocalnews/warn-scraper/files/8400326/FL-22-0165.Records.for.Release_Part3.pdf",
+    )
+
+    # Fetch the given file from its URL or the cache, return the local path
+    def download(url):
+        filename = url.split("/")[-1]
+        return cache.download(f"ny/{filename}", url)
+
+    # Normalize the whitespace (esp. newlines) in a list of strings
+    def clean_row(strings):
+        return [re.sub(r"\s+", " ", s) for s in strings]
+
+    # For each row of the main table on each page, yield a header:value dict
+    def gen_rows_from_pdf(pdf):
+        logger.debug(f"Parsing {pdf.stream.name.split('/')[-1]} …")
+        for page in pdf.pages:
+            logger.debug(f"Page {page.page_number}")
+
+            # In a few instances, the *literal* whitespace characters in the
+            # PDF cause unwanted effects.. Removing them, and instead relying
+            # only on character positions, produces slightly better output.
+            # (E.g., "St. Lawrence" instead of "S t. Lawrence".) Not entirely
+            # necessary, though.
+            prepared = page.filter(lambda obj: obj.get("text") != " ")
+
+            table = prepared.extract_table({"text_x_tolerance": 1})
+            table_clean = list(map(clean_row, table))
+
+            # Let's make sure we have the table we expect
+            assert table_clean[0][0] == "Company"
+
+            for row in table_clean[1:]:
+                yield dict(zip(table_clean[0], row))
+
+    def parse(path):
+        with pdfplumber.open(path) as pdf:
+            return list(gen_rows_from_pdf(pdf))
+
+    paths = list(map(download, urls))
+    parsed = [y for x in map(parse, paths) for y in x]
+    return parsed
+
+
 if __name__ == "__main__":
     scrape()

From b594d3fb45d010d534e1fb03ca34702e9035029c Mon Sep 17 00:00:00 2001
From: Jeremy Singer-Vine
Date: Wed, 25 May 2022 18:11:27 -0400
Subject: [PATCH 2/2] Check cache for NY PDF files before downloading

---
 warn/scrapers/ny.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/warn/scrapers/ny.py b/warn/scrapers/ny.py
index 2e5b7dac..fca1681e 100644
--- a/warn/scrapers/ny.py
+++ b/warn/scrapers/ny.py
@@ -143,9 +143,15 @@ def _get_historical_pdf_data(cache):
     )
 
     # Fetch the given file from its URL or the cache, return the local path
-    def download(url):
+    def get_file(url):
         filename = url.split("/")[-1]
-        return cache.download(f"ny/{filename}", url)
+        cache_key = f"ny/{filename}"
+        if cache.exists(cache_key):
+            logger.debug(f"Fetching {filename} from cache")
+            return cache.path / cache_key
+        else:
+            logger.debug(f"Downloading {filename}")
+            return cache.download(cache_key, url)
 
     # Normalize the whitespace (esp. newlines) in a list of strings
     def clean_row(strings):
@@ -177,9 +183,8 @@ def parse(path):
         with pdfplumber.open(path) as pdf:
             return list(gen_rows_from_pdf(pdf))
 
-    paths = list(map(download, urls))
-    parsed = [y for x in map(parse, paths) for y in x]
-    return parsed
+    paths = list(map(get_file, urls))
+    return [y for x in map(parse, paths) for y in x]
 
 
 if __name__ == "__main__":