refactor (#206)
* po

* checkpoint (×9)
dogweather committed Dec 31, 2023
1 parent 2b06762 commit 68289e1
Showing 16 changed files with 378 additions and 361 deletions.
435 changes: 216 additions & 219 deletions poetry.lock

Large diffs are not rendered by default.

15 changes: 0 additions & 15 deletions public_law/flipped.py

This file was deleted.

38 changes: 38 additions & 0 deletions public_law/html.py
@@ -0,0 +1,38 @@
from typing import Any
from toolz.functoolz import curry

from scrapy.selector.unified import Selector, SelectorList
from scrapy.http.response.xml import XmlResponse

from .exceptions import ParseException


def node_name(node: Selector) -> str | None:
    return node.xpath("name()").get()

def just_text(node: Selector | SelectorList | Any) -> str | None:
    return node.xpath("text()").get()

def xpath(selector: str, dom: XmlResponse) -> str:
    """
    Extracts the text content from the XML response using the given XPath selector.
    It does this by appending "/text()" to the selector and returning the first
    match. If no match is found, it raises a ParseException.

    Args:
        selector (str): The XPath selector to match the desired elements.
        dom (XmlResponse): The XML response object.

    Returns:
        str: The extracted text content.

    Raises:
        ParseException: If the specified XPath selector cannot be found in the XML response.
    """
    match dom.xpath(selector + "/text()").get():
        case str(value):
            return value
        case None:
            raise ParseException(f"Could not find {selector} in {dom.url}")

xpath = curry(xpath) # type: ignore
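
Since xpath is curried, html.xpath(selector) is partially applied and waits for the
response; this is the shape the new text.pipe call sites in crs.py depend on. A usage
sketch (illustrative only, not part of the commit):

    get_title = xpath("//TITLE-TEXT")   # partially applied: awaiting the response
    name = get_title(dom)               # first text match, or ParseException

    name = xpath("//TITLE-TEXT", dom)   # equivalent direct call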
5 changes: 3 additions & 2 deletions public_law/models/glossary.py
@@ -1,7 +1,8 @@
-from dataclasses import dataclass
+import dataclasses
+from dataclasses import dataclass
 from functools import cache
-from typing import Any, Iterable, Callable, TypeAlias
+from typing import Any, Callable, Iterable, TypeAlias
 
 from scrapy.http.response.html import HtmlResponse
 
 from ..metadata import Metadata
17 changes: 12 additions & 5 deletions public_law/parsers/aus/dv_glossary.py
@@ -1,12 +1,16 @@
 from typing import Any, Iterable, cast
-from toolz.functoolz import pipe # type: ignore
-from public_law.flipped import rstrip
 
 from scrapy.http.response.html import HtmlResponse
+from toolz.functoolz import pipe # type: ignore
+
+from public_law import text
 
 from ...metadata import Metadata, Subject
 from ...models.glossary import GlossaryEntry, GlossaryParseResult
-from ...text import URL, LoCSubject, NonemptyString as String
-from ...text import Sentence, ensure_ends_with_period, make_soup, normalize_nonempty
+from ...text import URL, LoCSubject
+from ...text import NonemptyString as String
+from ...text import (Sentence, ensure_ends_with_period, make_soup,
+                     normalize_nonempty)
 
 
 def parse_glossary(html: HtmlResponse) -> GlossaryParseResult:
@@ -44,7 +48,10 @@ def __parse_entries(html: HtmlResponse) -> Iterable[GlossaryEntry]:
     """TODO: Refactor into a parent class"""
 
     for phrase, defn in __raw_entries(html):
-        fixed_phrase: String = cast(Sentence, pipe(phrase, rstrip(": "), String)) # type: ignore
+        fixed_phrase = text.pipe(
+            phrase
+            , text.rstrip(": ") # type: ignore
+        )
 
         fixed_definition: Sentence = cast(Sentence, pipe(defn, ensure_ends_with_period, normalize_nonempty, Sentence))
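
The new text.pipe appears to fold the old pipe(...) + cast(...) incantation into a
single call that ends with a NonemptyString cast (an assumption, inferred from the
string_pipe helper deleted from crs.py below). Illustrative before/after with a
hypothetical value:

    # before
    fixed_phrase: String = cast(Sentence, pipe("Affidavit: ", rstrip(": "), String))
    # after
    fixed_phrase = text.pipe("Affidavit: ", text.rstrip(": "))   # => "Affidavit"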

38 changes: 17 additions & 21 deletions public_law/parsers/irl/courts_glossary.py
@@ -4,16 +4,13 @@
 from scrapy.http.response.html import HtmlResponse
 from toolz.functoolz import pipe # type: ignore
 
-from ...flipped import lstrip, rstrip
+from public_law import text
 
 from ...metadata import Metadata, Subject
 from ...models.glossary import GlossaryEntry, GlossaryParseResult
-from ...text import URL, LoCSubject, NonemptyString as String, WikidataTopic
-from ...text import (
-    Sentence,
-    capitalize_first_char,
-    ensure_ends_with_period,
-    normalize_nonempty,
-)
+from ...text import URL, LoCSubject
+from ...text import NonemptyString as String
+from ...text import (Sentence, WikidataTopic)
 
 
 def parse_glossary(html: HtmlResponse) -> GlossaryParseResult:
@@ -52,23 +49,22 @@ def _parse_entries(html: HtmlResponse) -> Iterable[GlossaryEntry]:
     functions for cleaning up the definitions and phrases.
     """
 
-    def cleanup_definition(defn: str) -> Sentence:
+    def cleanup_definition(definition: str) -> Sentence:
         return pipe(
-            defn,
-            normalize_nonempty,
-            lstrip(":"), # type: ignore
-            ensure_ends_with_period,
-            normalize_nonempty,
-            capitalize_first_char,
-            Sentence,
+            definition
+            , text.normalize_nonempty
+            , text.lstrip(":") # type: ignore
+            , text.ensure_ends_with_period
+            , text.normalize_nonempty
+            , text.capitalize_first_char
+            , Sentence
         )
 
     def cleanup_phrase(phrase: str) -> String:
-        return pipe(
-            phrase,
-            rstrip(":"), # type: ignore
-            normalize_nonempty,
-            String,
+        return text.pipe(
+            phrase
+            , text.rstrip(":") # type: ignore
+            , text.normalize_nonempty
         )
 
     for phrase, defn in _raw_entries(html):
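
Traced on a typical raw entry, the two cleanup helpers behave roughly like this
(illustrative values; assumes the text helpers do what their names suggest):

    cleanup_phrase("Affidavit:")                 # => String("Affidavit")
    cleanup_definition(": a written statement  sworn on oath")
                                                 # => Sentence("A written statement sworn on oath.")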
55 changes: 21 additions & 34 deletions public_law/parsers/usa/colorado/crs.py
@@ -1,66 +1,53 @@
-from scrapy.selector.unified import Selector
-from scrapy.http.response.xml import XmlResponse
+from typing import Optional, Protocol
 
-from typing import Any, Optional, cast, Protocol
-from toolz.functoolz import curry, flip, pipe # type: ignore
+from scrapy.http.response.xml import XmlResponse
+from scrapy.selector.unified import Selector
 
+from public_law import html, seq, text
 from public_law.exceptions import ParseException
-from public_law.selector_util import xpath_get
-from public_law.text import NonemptyString, URL, titleize
-import public_law.text as text
 from public_law.items.crs import Article, Division, Title
-from public_law.parsers.usa.colorado.crs_articles import parse_articles
 from public_law.parsers.usa.colorado.crs_articles import parse_articles
 from public_law.parsers.usa.colorado.crs_divisions import parse_divisions
 
-split = curry(flip(str.split))
-xpath_get = curry(xpath_get)
 
-def second(x: list[Any]) -> Any:
-    return x[1]
 
 class Logger(Protocol):
     """Defines a simple shape-based logger interface."""
     def warn(self, message: str) -> None: ...
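
Because Logger is a Protocol, any object with a matching warn method satisfies it
structurally, whether a Scrapy spider's logger or a hand-rolled test double; no
inheritance is required. A minimal sketch (hypothetical, not part of the commit):

    class ListLogger:
        """Collects warnings so tests can assert on them."""
        def __init__(self) -> None:
            self.messages: list[str] = []

        def warn(self, message: str) -> None:
            self.messages.append(message)

    parse_title(dom, ListLogger())   # type-checks against the Logger Protocol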



 def parse_title_bang(dom: XmlResponse, logger: Logger) -> Title:
     match parse_title(dom, logger):
         case None:
-            raise Exception("Could not parse title")
+            raise ParseException("Could not parse title")
         case title:
             return title
 
 
 def parse_title(dom: XmlResponse, logger: Logger) -> Optional[Title]:
     try:
-        name = string_pipe(
-            "//TITLE-TEXT/text()",
-            xpath_get(dom),
-            titleize
+        name = text.pipe(
+            dom
+            , html.xpath("//TITLE-TEXT") # type: ignore
+            , text.titleize
         )
-        number = string_pipe(
-            "//TITLE-NUM/text()",
-            xpath_get(dom),
-            text.split_on_space,
-            second
+        number = text.pipe(
+            dom
+            , html.xpath("//TITLE-NUM") # type: ignore
+            , text.split(" ") # type: ignore
+            , seq.get(1) # type: ignore
         )
         children = _parse_divisions_or_articles(number, dom, logger)
-        url = source_url(number)
+        url = _source_url(number)
 
         return Title(name, number, children, url)
 
     except ParseException as e:
         logger.warn(f"Could not parse the title: {e}")
         return None
 
 
-def string_pipe(*args: Any) -> NonemptyString:
-    """A wrapper around pipe() that casts the result to a NonemptyString."""
-    args_with_string: Any = args + (NonemptyString,)
-
-    return cast(NonemptyString, pipe(*args_with_string))
-
-
-def _parse_divisions_or_articles(title_number: NonemptyString, dom: Selector | XmlResponse, logger: Logger) -> list[Division] | list[Article]:
+def _parse_divisions_or_articles(title_number: text.NonemptyString, dom: Selector | XmlResponse, logger: Logger) -> list[Division] | list[Article]:
     division_nodes = dom.xpath("//T-DIV")
     article_nodes = dom.xpath("//TA-LIST")
 
@@ -75,6 +62,6 @@ def _parse_divisions_or_articles(title_number: NonemptyString, dom: Selector | X
return parse_fun(title_number, dom, logger)


-def source_url(title_number: NonemptyString) -> URL:
+def _source_url(title_number: text.NonemptyString) -> text.URL:
     url_number = title_number.rjust(2, "0")
-    return URL(f"https://leg.colorado.gov/sites/default/files/images/olls/crs2022-title-{url_number}.pdf")
+    return text.URL(f"https://leg.colorado.gov/sites/default/files/images/olls/crs2022-title-{url_number}.pdf")
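
The title-number pipeline above, unrolled into plain toolz for comparison (a sketch;
assumes text.pipe mirrors toolz.functoolz.pipe plus the NonemptyString cast that the
deleted string_pipe used to perform):

    from toolz.functoolz import pipe

    number = pipe(
        dom,
        html.xpath("//TITLE-NUM"),    # e.g. "TITLE 16", or raises ParseException
        lambda s: s.split(" "),       # ["TITLE", "16"]
        lambda parts: parts[1],       # "16"
    )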
2 changes: 1 addition & 1 deletion public_law/parsers/usa/colorado/crs_articles.py
@@ -8,7 +8,7 @@
from scrapy.selector.unified import Selector
from scrapy.http.response.xml import XmlResponse

-from public_law.selector_util import node_name
+from public_law.html import node_name
from public_law.items.crs import *
from public_law.text import remove_trailing_period, normalize_whitespace, NonemptyString

2 changes: 1 addition & 1 deletion public_law/parsers/usa/colorado/crs_divisions.py
@@ -5,7 +5,7 @@
from itertools import takewhile, dropwhile


-from public_law.selector_util import just_text
+from public_law.html import just_text
from public_law.text import NonemptyString
from public_law.items.crs import Division, Subdivision
from public_law.parsers.usa.colorado.crs_articles import div_name_text, parse_articles_from_division
2 changes: 1 addition & 1 deletion public_law/parsers/usa/colorado/crs_sections.py
@@ -6,7 +6,7 @@
from scrapy.http.response.xml import XmlResponse
from scrapy.selector.unified import Selector

-from public_law.selector_util import just_text
+from public_law.html import just_text
from public_law.items.crs import Section
from public_law.text import remove_trailing_period, normalize_whitespace, NonemptyString

12 changes: 6 additions & 6 deletions public_law/parsers/usa/uscis_glossary.py
@@ -11,6 +11,7 @@
     capitalize_first_char,
     normalize_nonempty,
 )
+from public_law import text
 
 
 def parse_glossary(html: HtmlResponse) -> GlossaryParseResult:
@@ -64,12 +65,11 @@ def cleanup_definition(defn: str) -> Sentence:
     def cleanup_phrase(phrase: str) -> String:
         assert isinstance(phrase, str)
 
-        return pipe(
-            phrase,
-            normalize_nonempty,
-            String,
-        ) # type: ignore
-
+        return text.pipe(
+            phrase
+            , normalize_nonempty
+        )
 
     for phrase, defn in _raw_entries(html):
         assert isinstance(phrase, str)
         assert isinstance(defn, str)
22 changes: 0 additions & 22 deletions public_law/selector_util.py

This file was deleted.

12 changes: 12 additions & 0 deletions public_law/seq.py
@@ -0,0 +1,12 @@
"""Sequence (Iterables, Lists, etc.) functions."""


from typing import Any

from toolz.functoolz import curry


def get(index: int, x: list[Any]) -> Any:
return x[index]

get = curry(get)
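
Because get is curried, seq.get(1) is a ready-made "second element" function,
replacing the second() helper deleted from crs.py. A quick sketch (illustrative):

    second = get(1)            # partial application via currying
    second(["TITLE", "16"])    # => "16"
    get(0, ["a", "b"])         # the uncurried call still works => "a"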
