Parse data from NY's historical PDFs #477

Open · wants to merge 2 commits into base: main
Changes from 1 commit
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
@@ -15,12 +15,12 @@ repos:
       - id: mixed-line-ending

   - repo: https://github.com/psf/black
-    rev: 21.12b0
+    rev: 22.3.0
     hooks:
       - id: black

   - repo: https://github.com/asottile/blacken-docs
-    rev: v1.12.0
+    rev: v1.12.1
     hooks:
       - id: blacken-docs
         additional_dependencies: [black]
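(Context, not part of the diff: the bump from black 21.12b0 to 22.3.0 is most likely the standard fix for click 8.1 removing the private `_unicodefun` module, which made earlier black releases crash on import; 22.3.0 was the first release compatible with both click versions.)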
65 changes: 61 additions & 4 deletions warn/scrapers/ny.py
@@ -1,14 +1,16 @@
 import logging
+import re
 from pathlib import Path

+import pdfplumber
 from bs4 import BeautifulSoup
 from openpyxl import load_workbook

 from .. import utils
 from ..cache import Cache

-__authors__ = ["zstumgoren", "Dilcia19", "ydoc5212", "palewire"]
-__tags__ = ["historical", "excel"]
+__authors__ = ["zstumgoren", "Dilcia19", "ydoc5212", "palewire", "jsvine"]
+__tags__ = ["historical", "excel", "pdf"]
 __source__ = {
     "name": "New York Department of Labor",
     "url": "https://dol.ny.gov/warn-notices",
@@ -46,12 +48,19 @@ def scrape(
     # Get the historical static data file
     excel_row_list = _get_historical_data(cache)

+    # Get data from the historical PDFs
+    pdf_row_list = _get_historical_pdf_data(cache)
+
     # Set the export path
     data_path = data_dir / "ny.csv"

     # Combine and write out the file
-    fieldnames = list(html_row_list[0].keys()) + list(excel_row_list[0].keys())
-    row_list = html_row_list + excel_row_list
+    fieldnames = (
+        list(html_row_list[0].keys())
+        + list(excel_row_list[0].keys())
+        + list(pdf_row_list[0].keys())
+    )
+    row_list = html_row_list + excel_row_list + pdf_row_list
     utils.write_dict_rows_to_csv(
         data_path,
         fieldnames,
@@ -125,5 +134,53 @@ def _get_historical_data(cache):
     return dict_list


+def _get_historical_pdf_data(cache):
+    # See https://github.com/biglocalnews/warn-scraper/issues/476
+    urls = (
+        "https://github.com/biglocalnews/warn-scraper/files/8400324/FL-22-0165.Records.for.Release_Part1.pdf",
+        "https://github.com/biglocalnews/warn-scraper/files/8400325/FL-22-0165.Records.for.Release_Part2.pdf",
+        "https://github.com/biglocalnews/warn-scraper/files/8400326/FL-22-0165.Records.for.Release_Part3.pdf",
+    )
+
+    # Fetch the given file from its URL or the cache, return the local path
+    def download(url):
+        filename = url.split("/")[-1]
+        return cache.download(f"ny/{filename}", url)
+
+    # Normalize the whitespace (esp. newlines) in a list of strings
+    def clean_row(strings):
+        return [re.sub(r"\s+", " ", s) for s in strings]
+
+    # For each row of the main table on each page, yield a header:value dict
+    def gen_rows_from_pdf(pdf):
+        logger.debug(f"Parsing {pdf.stream.name.split('/')[-1]} …")
+        for page in pdf.pages:
+            logger.debug(f"Page {page.page_number}")
+
+            # In a few instances, the *literal* whitespace characters in the
+            # PDF cause unwanted effects. Removing them, and instead relying
+            # only on character positions, produces slightly better output
+            # (e.g., "St. Lawrence" instead of "S t. Lawrence"). Not entirely
+            # necessary, though.
+            prepared = page.filter(lambda obj: obj.get("text") != " ")
+
+            table = prepared.extract_table({"text_x_tolerance": 1})
+            table_clean = list(map(clean_row, table))
+
+            # Let's make sure we have the table we expect
+            assert table_clean[0][0] == "Company"
+
+            for row in table_clean[1:]:
+                yield dict(zip(table_clean[0], row))
+
+    def parse(path):
+        with pdfplumber.open(path) as pdf:
+            return list(gen_rows_from_pdf(pdf))
+
+    paths = list(map(download, urls))
+    parsed = [y for x in map(parse, paths) for y in x]
+    return parsed
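Aside: the pdfplumber calls in gen_rows_from_pdf do the heavy lifting, so here is the filtering trick in isolation. A minimal sketch, assuming one of the PDFs above has been saved locally as part1.pdf (a hypothetical filename):

import pdfplumber

with pdfplumber.open("part1.pdf") as pdf:  # hypothetical local copy
    page = pdf.pages[0]
    # Drop the PDF's literal space characters so extract_table() groups
    # text by glyph position alone; per the comment above, this yields
    # "St. Lawrence" rather than "S t. Lawrence" in some cells.
    filtered = page.filter(lambda obj: obj.get("text") != " ")
    # text_x_tolerance sets how far apart (in points) two characters can
    # be and still be joined into the same word during extraction.
    table = filtered.extract_table({"text_x_tolerance": 1})
    print(table[0])  # header row; the scraper asserts it starts with "Company"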


 if __name__ == "__main__":
     scrape()
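A note on the combine step in scrape() above: the HTML, Excel, and PDF rows come from different sources, so a given dict may be missing some of the merged fieldnames. For illustration only, a stdlib sketch of what a dict-rows-to-CSV helper has to tolerate; the restval and extrasaction choices here are assumptions, not the actual utils.write_dict_rows_to_csv implementation:

import csv

def write_dict_rows_to_csv(path, fieldnames, rows):
    with open(path, "w", newline="", encoding="utf-8") as f:
        # restval fills columns absent from a row; extrasaction="ignore"
        # drops keys that are not listed in fieldnames.
        writer = csv.DictWriter(
            f, fieldnames=fieldnames, restval="", extrasaction="ignore"
        )
        writer.writeheader()
        writer.writerows(rows)

Note also that fieldnames built by simple list concatenation can contain duplicates if the sources share column headers; list(dict.fromkeys(fieldnames)) would deduplicate while preserving order.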