From 4d164d5053a65a4debe6df6d934899b4439a65d1 Mon Sep 17 00:00:00 2001
From: Jeremy Singer-Vine
Date: Mon, 16 May 2022 19:46:57 -0400
Subject: [PATCH 1/2] Parse data from NY's historical PDFs

Responding to the call-out here:
https://github.com/biglocalnews/warn-scraper/issues/476

This being my first commit to the project, and not knowing how the
maintainers would like to handle the overlap between the data sources,
I tried to take the least destructive approach.
---
 .pre-commit-config.yaml |  4 +--
 warn/scrapers/ny.py     | 65 ++++++++++++++++++++++++++++++++++++++---
 2 files changed, 63 insertions(+), 6 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index bc0a7578..46aa0cf3 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -15,12 +15,12 @@ repos:
       - id: mixed-line-ending
 
   - repo: https://github.com/psf/black
-    rev: 21.12b0
+    rev: 22.3.0
     hooks:
       - id: black
 
   - repo: https://github.com/asottile/blacken-docs
-    rev: v1.12.0
+    rev: v1.12.1
     hooks:
       - id: blacken-docs
         additional_dependencies: [black]
diff --git a/warn/scrapers/ny.py b/warn/scrapers/ny.py
index 4d149817..2e5b7dac 100644
--- a/warn/scrapers/ny.py
+++ b/warn/scrapers/ny.py
@@ -1,14 +1,16 @@
 import logging
+import re
 from pathlib import Path
 
+import pdfplumber
 from bs4 import BeautifulSoup
 from openpyxl import load_workbook
 
 from .. import utils
 from ..cache import Cache
 
-__authors__ = ["zstumgoren", "Dilcia19", "ydoc5212", "palewire"]
-__tags__ = ["historical", "excel"]
+__authors__ = ["zstumgoren", "Dilcia19", "ydoc5212", "palewire", "jsvine"]
+__tags__ = ["historical", "excel", "pdf"]
 __source__ = {
     "name": "New York Department of Labor",
     "url": "https://dol.ny.gov/warn-notices",
@@ -46,12 +48,19 @@ def scrape(
     # Get the historical static data file
     excel_row_list = _get_historical_data(cache)
 
+    # Get data from the historical PDFs
+    pdf_row_list = _get_historical_pdf_data(cache)
+
     # Set the export path
     data_path = data_dir / "ny.csv"
 
     # Combine and write out the file
-    fieldnames = list(html_row_list[0].keys()) + list(excel_row_list[0].keys())
-    row_list = html_row_list + excel_row_list
+    fieldnames = (
+        list(html_row_list[0].keys())
+        + list(excel_row_list[0].keys())
+        + list(pdf_row_list[0].keys())
+    )
+    row_list = html_row_list + excel_row_list + pdf_row_list
     utils.write_dict_rows_to_csv(
         data_path,
         fieldnames,
@@ -125,5 +134,53 @@ def _get_historical_data(cache):
     return dict_list
 
 
+def _get_historical_pdf_data(cache):
+    # See https://github.com/biglocalnews/warn-scraper/issues/476
+    urls = (
+        "https://github.com/biglocalnews/warn-scraper/files/8400324/FL-22-0165.Records.for.Release_Part1.pdf",
+        "https://github.com/biglocalnews/warn-scraper/files/8400325/FL-22-0165.Records.for.Release_Part2.pdf",
+        "https://github.com/biglocalnews/warn-scraper/files/8400326/FL-22-0165.Records.for.Release_Part3.pdf",
+    )
+
+    # Fetch the given file from its URL or the cache, return the local path
+    def download(url):
+        filename = url.split("/")[-1]
+        return cache.download(f"ny/{filename}", url)
+
+    # Normalize the whitespace (esp. newlines) in a list of strings
+    def clean_row(strings):
+        return [re.sub(r"\s+", " ", s) for s in strings]
+
+    # For each row of the main table on each page, yield a header:value dict
+    def gen_rows_from_pdf(pdf):
+        logger.debug(f"Parsing {pdf.stream.name.split('/')[-1]} …")
+        for page in pdf.pages:
+            logger.debug(f"Page {page.page_number}")
+
+            # In a few instances, the *literal* whitespace characters in the
+            # PDF cause unwanted effects.. Removing them, and instead relying
+            # only on character positions, produces slightly better output.
+            # (E.g., "St. Lawrence" instead of "S t. Lawrence".) Not entirely
+            # necessary, though.
+            prepared = page.filter(lambda obj: obj.get("text") != " ")
+
+            table = prepared.extract_table({"text_x_tolerance": 1})
+            table_clean = list(map(clean_row, table))
+
+            # Let's make sure we have the table we expect
+            assert table_clean[0][0] == "Company"
+
+            for row in table_clean[1:]:
+                yield dict(zip(table_clean[0], row))
+
+    def parse(path):
+        with pdfplumber.open(path) as pdf:
+            return list(gen_rows_from_pdf(pdf))
+
+    paths = list(map(download, urls))
+    parsed = [y for x in map(parse, paths) for y in x]
+    return parsed
+
+
 if __name__ == "__main__":
     scrape()

From b594d3fb45d010d534e1fb03ca34702e9035029c Mon Sep 17 00:00:00 2001
From: Jeremy Singer-Vine
Date: Wed, 25 May 2022 18:11:27 -0400
Subject: [PATCH 2/2] Check cache for NY PDF files before downloading

---
 warn/scrapers/ny.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/warn/scrapers/ny.py b/warn/scrapers/ny.py
index 2e5b7dac..fca1681e 100644
--- a/warn/scrapers/ny.py
+++ b/warn/scrapers/ny.py
@@ -143,9 +143,15 @@ def _get_historical_pdf_data(cache):
     )
 
     # Fetch the given file from its URL or the cache, return the local path
-    def download(url):
+    def get_file(url):
         filename = url.split("/")[-1]
-        return cache.download(f"ny/{filename}", url)
+        cache_key = f"ny/{filename}"
+        if cache.exists(cache_key):
+            logger.debug(f"Fetching {filename} from cache")
+            return cache.path / cache_key
+        else:
+            logger.debug(f"Downloading {filename}")
+            return cache.download(cache_key, url)
 
     # Normalize the whitespace (esp. newlines) in a list of strings
     def clean_row(strings):
@@ -177,9 +183,8 @@ def parse(path):
         with pdfplumber.open(path) as pdf:
             return list(gen_rows_from_pdf(pdf))
 
-    paths = list(map(download, urls))
-    parsed = [y for x in map(parse, paths) for y in x]
-    return parsed
+    paths = list(map(get_file, urls))
+    return [y for x in map(parse, paths) for y in x]
 
 
 if __name__ == "__main__":