Skip to content

Commit

Permalink
More fixes (#204)
Browse files Browse the repository at this point in the history
* refactor

* refactor

* refactor

* refactor

* checkpoint

* type cleanup

* checkpoint

* checkpoint

* checkpoint

* checkpoint

* scrapy from master

* fix: lib errors

* checkpoint

* checkpoint

* checkpoint

* checkpoint

* checkpoint

* checkpoint

* checkpoint

* checkpoint
  • Loading branch information
dogweather committed Dec 19, 2023
1 parent 40f4ab0 commit 2b06762
Show file tree
Hide file tree
Showing 25 changed files with 279 additions and 216 deletions.
60 changes: 31 additions & 29 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 3 additions & 4 deletions public_law/middlewares.py
Expand Up @@ -3,9 +3,8 @@
# pyright: reportUnknownVariableType=false
# pyright: reportUnknownParameterType=false
# pyright: reportMissingParameterType=false
# pyright: reportUnknownMemberType=false

# pyright: reportUnknownArgumentType=false
# pyright: reportGeneralTypeIssues=false


# -*- coding: utf-8 -*-
Expand Down Expand Up @@ -66,7 +65,7 @@ def process_start_requests(self, start_requests, _spider: Spider):
yield req

def spider_opened(self, spider: Spider):
spider.logger.info(f"Spider opened: {spider.name}")
spider.logger.info(f"Spider opened: {spider.name}") # type: ignore


class OarDownloaderMiddleware:
Expand Down Expand Up @@ -113,4 +112,4 @@ def process_exception(self, request: Request, exception, spider: Spider):
pass

def spider_opened(self, spider: Spider):
spider.logger.info(f"Spider opened: {spider.name}")
spider.logger.info(f"Spider opened: {spider.name}") # type: ignore
8 changes: 4 additions & 4 deletions public_law/parsers/can/doj_glossaries.py
@@ -1,9 +1,9 @@
# pyright: reportUnknownMemberType=false

# pyright: reportUnknownVariableType=false

import re
from datetime import date
from typing import Any, TypeAlias, cast
from typing import Any, TypeAlias

from scrapy.http.response.html import HtmlResponse
from scrapy.selector.unified import Selector, SelectorList
Expand Down Expand Up @@ -113,7 +113,7 @@ def parse_glossary(html: HtmlResponse) -> GlossaryParseResult:

entries: list[GlossaryEntry] = []

match html.css("main dl"):
match html.selector.css("main dl"):
case [first, *_]:
first_dl_list = first
case _:
Expand Down Expand Up @@ -148,7 +148,7 @@ def parse_glossary(html: HtmlResponse) -> GlossaryParseResult:
)

parsed_entries = tuple(entries)
url = cast(str, html.url)
url = html.url

match SUBJECTS.get(url):
case tuple(subjects):
Expand Down
2 changes: 1 addition & 1 deletion public_law/parsers/can/parliamentary_glossary.py
Expand Up @@ -15,7 +15,7 @@ def parse_glossary(html: HtmlResponse) -> GlossaryParseResult:
dcterms_language="en",
dcterms_coverage="CAN",
# Info about original source
dcterms_source=html.url,
dcterms_source=String(html.url),
publiclaw_sourceModified="unknown",
publiclaw_sourceCreator=String("Parliament of Canada"),
dcterms_subject=(
Expand Down
92 changes: 54 additions & 38 deletions public_law/parsers/usa/colorado/crs.py
@@ -1,64 +1,80 @@
# pyright: reportUnknownMemberType=false

from scrapy.selector.unified import Selector
from scrapy.http.response.xml import XmlResponse

from typing import Any, Optional
from typing import Any, Optional, cast, Protocol
from toolz.functoolz import curry, flip, pipe # type: ignore

from public_law.exceptions import ParseException
from public_law.selector_util import xpath_get
from public_law.text import NonemptyString, URL, titleize
import public_law.text as text
from public_law.items.crs import Article, Division, Title
from public_law.parsers.usa.colorado.crs_articles import parse_articles
from public_law.parsers.usa.colorado.crs_divisions import parse_divisions

split = curry(flip(str.split))
xpath_get = curry(xpath_get)

def second(x: list[Any]) -> Any:
    """Return the second element (index 1) of ``x``.

    Args:
        x: A list holding at least two elements.

    Returns:
        The element at index 1.

    Raises:
        IndexError: If ``x`` has fewer than two elements.
    """
    if len(x) < 2:
        # Same exception type a bare ``x[1]`` would raise, but the message
        # states what was expected instead of "list index out of range".
        raise IndexError(
            f"second() expects at least two elements, got {len(x)}"
        )
    return x[1]

class Logger(Protocol):
    """Structural type for a logger argument.

    Any object that provides a compatible ``warn`` method satisfies this
    protocol; no inheritance is required.
    """

    def warn(self, message: str) -> None:
        """Accept a warning message; only the signature is fixed here."""
        ...


def parse_title_bang(dom: XmlResponse, logger: Any) -> Title:

def parse_title_bang(dom: XmlResponse, logger: Logger) -> Title:
match parse_title(dom, logger):
case None:
raise Exception("Could not parse title")
case title:
return title


def parse_title(dom: XmlResponse, logger: Any) -> Optional[Title]:
match(dom.xpath("//TITLE-TEXT/text()").get()):
case str(raw_name):
name = NonemptyString(titleize(raw_name))
case None:
logger.warn(f"Could not the parse title name in {dom.url}")
return None
def parse_title(dom: XmlResponse, logger: Logger) -> Optional[Title]:
try:
name = string_pipe(
"//TITLE-TEXT/text()",
xpath_get(dom),
titleize
)
number = string_pipe(
"//TITLE-NUM/text()",
xpath_get(dom),
text.split_on_space,
second
)
children = _parse_divisions_or_articles(number, dom, logger)
url = source_url(number)
return Title(name, number, children, url)

match(dom.xpath("//TITLE-NUM/text()").get()):
case str(raw_number):
number = NonemptyString(raw_number.split(" ")[1])
case None:
logger.warn(f"Could not the parse title number in {dom.url}")
return None
except ParseException as e:
logger.warn(f"Could not parse the title: {e}")
return None

match _parse_divisions_or_articles(number, dom, logger):
case None:
return None
case children:
url_number = number.rjust(2, "0")
source_url = URL(f"https://leg.colorado.gov/sites/default/files/images/olls/crs2022-title-{url_number}.pdf")
return Title(
name = name,
number = number,
source_url = URL(source_url),
children = children
)


def _parse_divisions_or_articles(title_number: NonemptyString, dom: Selector | XmlResponse, logger: Any) -> Optional[list[Division] | list[Article]]:

def string_pipe(*args: Any) -> NonemptyString:
    """Run ``pipe()`` over ``args`` with ``NonemptyString`` as the last step.

    ``NonemptyString`` is appended to the given arguments before they are
    handed to ``pipe()``, so the pipeline's final value is passed through
    it. The ``cast`` only tells the type checker about that result type;
    it performs no conversion at runtime.
    """
    # Typed as Any so the type checker does not try to infer pipe()'s result.
    piped: Any = pipe(*args, NonemptyString)
    return cast(NonemptyString, piped)


def _parse_divisions_or_articles(title_number: NonemptyString, dom: Selector | XmlResponse, logger: Logger) -> list[Division] | list[Article]:
division_nodes = dom.xpath("//T-DIV")
article_nodes = dom.xpath("//TA-LIST")

if len(division_nodes) > 0:
func = parse_divisions
parse_fun = parse_divisions
elif len(article_nodes) > 0:
func = parse_articles
parse_fun = parse_articles
else:
msg = f"Could not parse divisions or articles in Title {title_number}. Neither T-DIV nor TA-LIST nodes were found."
logger.warn(msg)
return None
msg = f"Neither T-DIV nor TA-LIST nodes were found in Title {title_number}."
raise ParseException(msg)

return parse_fun(title_number, dom, logger)


return func(title_number, dom, logger)
def source_url(title_number: NonemptyString) -> URL:
    """Build the leg.colorado.gov ``crs2022`` PDF URL for a title number.

    The title number is right-justified to two characters with "0" as the
    fill character before being placed in the file name.
    """
    padded = title_number.rjust(2, "0")
    base = "https://leg.colorado.gov/sites/default/files/images/olls"
    return URL(f"{base}/crs2022-title-{padded}.pdf")
27 changes: 21 additions & 6 deletions public_law/parsers/usa/colorado/crs_articles.py
@@ -1,12 +1,12 @@
# pyright: reportUnknownMemberType=false


from itertools import takewhile, dropwhile
from typing import Any

from bs4 import BeautifulSoup

from scrapy.selector.unified import Selector
from scrapy.http.response import Response
from scrapy.http.response.xml import XmlResponse

from public_law.selector_util import node_name
from public_law.items.crs import *
Expand All @@ -15,7 +15,7 @@

def parse_articles_from_division(
title_number: NonemptyString,
dom: Selector | Response,
dom: Selector | XmlResponse,
raw_div_name: str,
subdiv_name: NonemptyString|None = None) -> list[Article]:

Expand All @@ -25,9 +25,14 @@ def parse_articles_from_division(
return _parse_articles_from_subdivision(title_number, dom, raw_div_name, subdiv_name)


def _parse_articles_from_division(title_number: NonemptyString, dom: Selector | Response, raw_div_name: str) -> list[Article]:
def _parse_articles_from_division(title_number: NonemptyString, dom_or_sel: Selector | XmlResponse, raw_div_name: str) -> list[Article]:
"""Return the articles within the given Division."""

if isinstance(dom_or_sel, XmlResponse):
dom = dom_or_sel.selector
else:
dom = dom_or_sel

#
# Algorithm:
#
Expand Down Expand Up @@ -61,9 +66,14 @@ def _parse_articles_from_division(title_number: NonemptyString, dom: Selector |
]


def _parse_articles_from_subdivision(title_number: NonemptyString, dom: Selector | Response, raw_div_name: str, subdiv_name: NonemptyString) -> list[Article]:
def _parse_articles_from_subdivision(title_number: NonemptyString, dom_or_sel: Selector | XmlResponse, raw_div_name: str, subdiv_name: NonemptyString) -> list[Article]:
"""Return the articles within the given Subdivision."""

if isinstance(dom_or_sel, XmlResponse):
dom = dom_or_sel.selector
else:
dom = dom_or_sel

#
# Algorithm:
#
Expand Down Expand Up @@ -104,7 +114,12 @@ def _parse_articles_from_subdivision(title_number: NonemptyString, dom: Selector



def parse_articles(title_number: NonemptyString, dom: Selector | Response, logger: Any) -> list[Article]:
def parse_articles(title_number: NonemptyString, dom_or_sel: Selector | XmlResponse, logger: Any) -> list[Article]:
if isinstance(dom_or_sel, XmlResponse):
dom = dom_or_sel.selector
else:
dom = dom_or_sel

#
# Algorithm:
#
Expand Down

0 comments on commit 2b06762

Please sign in to comment.