Commit

crs 2023 december (#201)
* cleanup

* checkpoint

* checkpoint

* checkpoint

* type tweaks

* fix: date parsing

* scrapy typing

* remove toolz typings

* update vcr typings

* checkpoint

* checkpoint

* checkpoint

* checkpoint

* checkpoint

* refactor
dogweather committed Dec 19, 2023
1 parent 8c3d728 commit 40f339e
Showing 136 changed files with 237 additions and 7,421 deletions.
15 changes: 11 additions & 4 deletions public_law/dates.py
@@ -1,3 +1,5 @@
 """Provide date-related functions."""

+from typing import Protocol
+
 from datetime import datetime, date
@@ -17,17 +19,22 @@ def todays_date() -> str:
     return iso_8601(today())


+def current_year() -> int:
+    """Provide the current year."""
+
+    return today().year
+
+
 def today() -> date:
     """Provide today's date in the given timezone."""

     # TODO: Refactor the timezone to a config setting.
     #       But the Scrapy settings don't seem to be
     #       available in this context.
     #       See https://doc.scrapy.org/en/latest/topics/settings.html.
-    mountain = pytz.timezone(
-        "US/Mountain"
-    )
-    return mountain.localize(datetime.now()).date()
+    tz = pytz.timezone("US/Mountain")
+
+    return tz.localize(datetime.now()).date()


 def iso_8601(a_date: date) -> str:
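A note on the today() rewrite: it keeps pytz's two-step idiom, which matters because datetime.now() returns a naive timestamp. A minimal sketch, not part of the commit, assuming only that pytz is installed:

from datetime import datetime

import pytz

tz = pytz.timezone("US/Mountain")

naive = datetime.now()      # naive wall-clock time, no tzinfo
aware = tz.localize(naive)  # pytz attaches US/Mountain with the correct DST offset
print(aware.date())         # today's date in Mountain time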
2 changes: 1 addition & 1 deletion public_law/parsers/aus/ip_glossary.py
@@ -60,6 +60,6 @@ def _parse_mod_date(html: HtmlResponse) -> date:
         <span class="date-display-single" property="dc:date" datatype="xsd:dateTime" content="2021-03-26T00:00:00+11:00">26 March 2021</span>
     """
     mod_date_str: str = cast(str, (
-        html.selector.css("span.date-display-single").xpath("@content").get() # type: ignore
+        html.selector.css("span.date-display-single").xpath("@content").get()
     ))
     return datetime.fromisoformat(mod_date_str).date()
6 changes: 3 additions & 3 deletions public_law/parsers/can/doj_glossaries.py
@@ -102,9 +102,9 @@
 }


-def configured_urls() -> tuple[str, ...]:
+def configured_urls() -> list[str]:
     """All the URLs that have been properly set up with subjects."""
-    return tuple(SUBJECTS.keys())
+    return list(SUBJECTS.keys())


@@ -114,7 +114,7 @@ def parse_glossary(html: HtmlResponse) -> GlossaryParseResult:
     entries: list[GlossaryEntry] = []

     match html.css("main dl"):
-        case [first, *_] if isinstance(first, Selector):
+        case [first, *_]:
             first_dl_list = first
         case _:
             raise ParseException("Expected a <dl>")
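The dropped isinstance() guard was redundant: html.css() returns a SelectorList, which subclasses list and only ever holds Selector objects, so the sequence pattern alone is enough. A standalone sketch of the same pattern, with made-up markup:

from scrapy.selector import Selector

page = Selector(text="<main><dl><dt>term</dt><dd>definition</dd></dl></main>")

match page.css("main dl"):
    case [first, *_]:
        print(first.get())   # markup of the first <dl>
    case _:
        raise ValueError("Expected a <dl>")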
45 changes: 24 additions & 21 deletions public_law/parsers/usa/colorado/crs.py
@@ -1,12 +1,7 @@
-# pyright: reportUnknownMemberType=false
-# pyright: reportOptionalMemberAccess=false
-# pyright: reportUnknownVariableType=false
-# pyright: reportUnknownArgumentType=false
-# pyright: reportUnknownLambdaType=false


 from scrapy.selector.unified import Selector
-from scrapy.http.response import Response
+from scrapy.http.response.xml import XmlResponse

 from typing import Any
@@ -16,23 +11,29 @@
 from public_law.parsers.usa.colorado.crs_divisions import parse_divisions


-def parse_title_bang(dom: Response, logger: Any) -> Title:
+def parse_title_bang(dom: XmlResponse, logger: Any) -> Title:
     match parse_title(dom, logger):
         case None:
             raise Exception("Could not parse title")
-        case result:
-            return result
+        case title:
+            return title


-def parse_title(dom: Response, logger: Any) -> Title | None:
-    raw_name = dom.xpath("//TITLE-TEXT/text()").get()
-
-    if raw_name is None:
-        logger.warn(f"Could not parse title name in {dom.url}")
-        return None
+def parse_title(dom: XmlResponse, logger: Any) -> Title | None:
+    match(dom.xpath("//TITLE-TEXT/text()").get()):
+        case str(raw_name):
+            name = NonemptyString(titleize(raw_name))
+        case None:
+            logger.warn(f"Could not parse the title name in {dom.url}")
+            return None
+
+    match(dom.xpath("//TITLE-NUM/text()").get()):
+        case str(raw_number):
+            number = NonemptyString(raw_number.split(" ")[1])
+        case None:
+            logger.warn(f"Could not parse the title number in {dom.url}")
+            return None

-    name = NonemptyString(titleize(raw_name))
-    number = NonemptyString(dom.xpath("//TITLE-NUM/text()").get().split(" ")[1])
     url_number = number.rjust(2, "0")
     source_url = URL(f"https://leg.colorado.gov/sites/default/files/images/olls/crs2022-title-{url_number}.pdf")

@@ -44,13 +45,15 @@ def parse_title(dom: Response, logger: Any) -> Title | None:
     )


-def _parse_divisions_or_articles(title_number: NonemptyString, dom: Selector | Response, logger: Any) -> list[Division] | list[Article]:
+def _parse_divisions_or_articles(title_number: NonemptyString, dom: Selector | XmlResponse, logger: Any) -> list[Division] | list[Article]:
     division_nodes = dom.xpath("//T-DIV")
     article_nodes = dom.xpath("//TA-LIST")

     if len(division_nodes) > 0:
-        return parse_divisions(title_number, dom, logger)
+        func = parse_divisions
     elif len(article_nodes) > 0:
-        return parse_articles(title_number, dom, logger)
+        func = parse_articles
     else:
-        raise Exception(f"Could not parse divisions or articles in Title {title_number}")
+        raise Exception(f"Could not parse divisions or articles in Title {title_number}. Neither T-DIV nor TA-LIST nodes were found.")
+
+    return func(title_number, dom, logger)
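The rewritten parse_title() leans on the fact that XPath .get() returns str | None: case str(...) tests and captures in one step, so each branch is fully typed. A minimal sketch of the same pattern with illustrative XML (not real CRS data):

from scrapy.selector import Selector

dom = Selector(text="<TITLE-NUM>TITLE 42</TITLE-NUM>", type="xml")

match dom.xpath("//TITLE-NUM/text()").get():
    case str(raw_number):
        print(raw_number.split(" ")[1])   # prints "42"
    case None:
        print("no TITLE-NUM found")       # parse_title() logs a warning and returns None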
4 changes: 0 additions & 4 deletions public_law/parsers/usa/colorado/crs_articles.py
@@ -1,8 +1,4 @@
 # pyright: reportUnknownMemberType=false
-# pyright: reportOptionalMemberAccess=false
-# pyright: reportUnknownVariableType=false
-# pyright: reportUnknownArgumentType=false
-# pyright: reportUnknownLambdaType=false

 from itertools import takewhile, dropwhile
 from typing import Any
6 changes: 1 addition & 5 deletions public_law/parsers/usa/colorado/crs_divisions.py
@@ -1,8 +1,4 @@
 # pyright: reportUnknownMemberType=false
-# pyright: reportOptionalMemberAccess=false
-# pyright: reportUnknownVariableType=false
-# pyright: reportUnknownArgumentType=false
-# pyright: reportUnknownLambdaType=false


 from scrapy.selector.unified import Selector
@@ -22,7 +18,7 @@
 def parse_divisions(title_number: NonemptyString, dom: Selector | Response, logger: Any) -> list[Division]:
     division_nodes = dom.xpath("//T-DIV")

-    divs = []
+    divs: list[Division] = []
     for div_node in division_nodes:
         raw_div_name = div_name_text(div_node)

10 changes: 3 additions & 7 deletions public_law/parsers/usa/colorado/crs_sections.py
@@ -1,13 +1,9 @@
 # pyright: reportUnknownMemberType=false
-# pyright: reportOptionalMemberAccess=false
-# pyright: reportUnknownVariableType=false
-# pyright: reportUnknownArgumentType=false
-# pyright: reportUnknownLambdaType=false

 from typing import Any

 from bs4 import BeautifulSoup
-from scrapy.http.response import Response
+from scrapy.http.response.xml import XmlResponse
 from scrapy.selector.unified import Selector

 from public_law.selector_util import just_text
@@ -16,10 +12,10 @@



-def parse_sections(dom: Response, logger: Any) -> list[Section]:
+def parse_sections(dom: XmlResponse, logger: Any) -> list[Section]:
     section_nodes = dom.xpath("//SECTION-TEXT")

-    sections = []
+    sections: list[Section] = []
     for node in section_nodes:
         if _is_repealed(node):
             continue
4 changes: 2 additions & 2 deletions public_law/parsers/usa/georgia_ag_opinions.py
@@ -48,12 +48,12 @@ def parse_ag_opinion(html: Response) -> OpinionParseResult:
             join("\n"),
         ),
     )
-    citation_set = pipe(
+    citation_set = cast(CitationSet, pipe(
         re.findall(r"\d+-\d+-\d+(?:\([-().A-Za-z0-9]*[-A-Za-z0-9]\))?", full_text),
         set,
         sorted,
         CitationSet,
-    )
+    ))

     return OpinionParseResult(
         summary=summary,
3 changes: 2 additions & 1 deletion public_law/selector_util.py
@@ -3,8 +3,9 @@
 from scrapy.selector.unified import Selector, SelectorList


-def node_name(node: Selector):
+def node_name(node: Selector) -> str | None:
     return node.xpath("name()").get()


 def just_text(node: Selector | SelectorList | Any) -> str | None:
     return node.xpath("text()").get()
22 changes: 4 additions & 18 deletions public_law/spiders/usa/colorado_crs.py
@@ -1,21 +1,17 @@
 # pyright: reportUnknownMemberType=false
 # pyright: reportUnknownArgumentType=false
-# pyright: reportUnknownVariableType=false
-# pyright: reportUnknownParameterType=false
-# pyright: reportGeneralTypeIssues=false
-# pyright: reportUnusedCallResult=false

 import os
-import re

 from pathlib import Path
+from typing import Any

 from progressbar import ProgressBar
 from scrapy import Spider
 from scrapy.http.request import Request
 from scrapy.http.response.html import HtmlResponse
-from typing import Any

+from public_law import dates
 from public_law.parsers.usa.colorado.crs import parse_title
 from public_law.parsers.usa.colorado.crs_sections import parse_sections
@@ -25,7 +21,7 @@ class ColoradoCRS(Spider):
     Reads the sources from a local directory instead of the web.
     """
-    name = "usa_colorado_crs"
+    name = "usa_colorado_crs"


     def start_requests(self):
@@ -53,21 +49,11 @@ def start_requests(self):

     def parse(self, response: HtmlResponse, **_: dict[str, Any]): # type: ignore[override]
         if "README.txt" in response.url:
-            yield from self.parse_readme(response)
+            yield { "kind": "CRS", "edition": dates.current_year() }
         else:
             yield from self.parse_title_xml(response)


-    def parse_readme(self, response: HtmlResponse, **_: dict[str, Any]):
-        result = re.findall(r'COLORADO REVISED STATUTES (\d\d\d\d) DATASET', str(response.body))
-        if len(result) != 1:
-            raise Exception(f"Could not parse year from README: {response.body}")
-
-        year: str = result[0]
-
-        yield { "kind": "CRS", "edition": int(year) }
-
-
     def parse_title_xml(self, response: HtmlResponse, **_: dict[str, Any]):
         """Framework callback which parses one XML file."""
         self.logger.debug(f"Parsing {response.url}...")
30 changes: 19 additions & 11 deletions public_law/spiders/usa/georgia_ag_opinions.py
@@ -6,11 +6,13 @@
 # pyright: reportGeneralTypeIssues=false


-from ...parsers.usa.georgia_ag_opinions import parse_ag_opinion
 from scrapy import Spider
 from scrapy.http.request import Request
 from scrapy.http.response import Response
-from typing import Any, Dict, cast
+from scrapy.http.response.html import HtmlResponse
+from typing import Any, Dict
+
+from ...parsers.usa.georgia_ag_opinions import parse_ag_opinion

 JD_VERBOSE_NAME = "USA / Georgia"
 PUBLICATION_NAME = "Attorney General Opinions"
@@ -29,23 +31,28 @@ class GeorgiaAgOpinions(Spider):
         "https://law.georgia.gov/opinions/unofficial",
     ]

+
     def parse(self, response: Response, **kwargs: Dict[str, Any]):
         """Framework callback which begins the parsing."""
-        return self.parse_index_page(response)
+        match(response):
+            case HtmlResponse():
+                return self.parse_index_page(response)

-    def parse_index_page(self, response: Response):
+            case _:
+                raise Exception(f"Unexpected response type: {type(response)}")
+
+
+    def parse_index_page(self, response: HtmlResponse):
         #
         # 1. Find all the individual opinions on this index page
         #    and request a parse for each.
         #
-        opinion_paths = cast(
-            list[str],
-            response.xpath(
-                "//td[contains(@class, 'views-field-title')]/a/@href"
-            ).getall(),
-        )
+        opinion_paths = response.xpath(
+            "//td[contains(@class, 'views-field-title')]/a/@href"
+        ).getall()

-        for url in [cast(str, response.urljoin(p)) for p in opinion_paths]:
+        for url in [response.urljoin(p) for p in opinion_paths]:
             yield Request(url, callback=self.parse_opinion_page)

         #
@@ -60,5 +67,6 @@ def parse_index_page(self, response: Response):
             response.urljoin(next_page_path), callback=self.parse_index_page
         )

-    def parse_opinion_page(self, response: Response):
+
+    def parse_opinion_page(self, response: HtmlResponse):
         yield parse_ag_opinion(response)._asdict()
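parse_opinion_page() yields a plain dict via ._asdict(), which suggests OpinionParseResult is a NamedTuple. A hedged sketch with illustrative fields (the real ones may differ):

from typing import NamedTuple

class OpinionParseResult(NamedTuple):   # stand-in; real fields may differ
    summary: str
    citations: list[str]

result = OpinionParseResult(summary="...", citations=["1-2-3"])
print(result._asdict())   # {'summary': '...', 'citations': ['1-2-3']}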
2 changes: 1 addition & 1 deletion public_law/spiders/usa/oregon_regs.py
@@ -106,7 +106,7 @@ def from_crawler(cls, crawler: Crawler, *args: List[str], **kwargs: Dict[str, Any]):
         """Override to register to receive the idle event"""
         spider = cast(OregonRegs, super(OregonRegs, cls).from_crawler(crawler, *args, **kwargs))

-        crawler.signals.connect(spider.spider_idle, signal=scrapy.signals.spider_idle) # type: ignore
+        crawler.signals.connect(spider.spider_idle, signal=scrapy.signals.spider_idle)
         return spider

     def spider_idle(self, spider: Spider):
1 change: 1 addition & 0 deletions pyproject.toml
@@ -41,6 +41,7 @@ typeCheckingMode = "strict"
 reportCallInDefaultInitializer = "error"
 reportImplicitStringConcatenation = "error"
 reportMissingSuperCall = "error"
+reportMissingTypeStubs = false
 reportPropertyTypeMismatch = "error"
 reportUninitializedInstanceVariable = "error"
 reportUnnecessaryTypeIgnoreComment = "error"
3 changes: 3 additions & 0 deletions script/create-crs-json
@@ -0,0 +1,3 @@
+#!/usr/bin/env fish
+
+scrapy crawl -a crsdata_dir=tmp/sources --overwrite-output tmp/crs.json:jsonl usa_colorado_crs
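Reading the one-liner, as an interpretation rather than anything documented in the commit: -a crsdata_dir=tmp/sources passes a spider argument, --overwrite-output tmp/crs.json:jsonl writes JSON Lines and replaces any previous output, and usa_colorado_crs matches the spider's name attribute. A usage sketch, assuming the CRS XML has already been unpacked into tmp/sources:

./script/create-crs-json    # run the export
head -n 1 tmp/crs.json      # peek at the first JSON Lines record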
3 changes: 0 additions & 3 deletions script/create-crs-xml-files-notes
@@ -1,8 +1,5 @@
 #!/usr/bin/env fish

-#
-# So far, just a history of commands I ran.
-#

 cd tmp/sources/TITLES

22 changes: 0 additions & 22 deletions typings/scrapy/__init__.pyi

This file was deleted.

6 changes: 0 additions & 6 deletions typings/scrapy/__main__.pyi

This file was deleted.
