Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

✨ NEW: Add inventory reader and CLI #656

Merged
merged 5 commits into from Jan 4, 2023
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
Expand Up @@ -99,7 +99,7 @@ jobs:
else:
raise AssertionError()"
- name: Run pytest for docutils-only tests
run: pytest tests/test_docutils.py tests/test_renderers/test_fixtures_docutils.py tests/test_renderers/test_include_directive.py
run: pytest tests/test_docutils.py tests/test_renderers/test_fixtures_docutils.py tests/test_renderers/test_include_directive.py tests/test_renderers/test_myst_config.py
chrisjsewell marked this conversation as resolved.
Show resolved Hide resolved
- name: Run docutils CLI
run: echo "test" | myst-docutils-html

Expand Down
10 changes: 8 additions & 2 deletions myst_parser/_compat.py
Expand Up @@ -5,9 +5,15 @@
from docutils.nodes import Element

if sys.version_info >= (3, 8):
from typing import Literal, Protocol, get_args, get_origin # noqa: F401
from typing import Literal, Protocol, TypedDict, get_args, get_origin # noqa: F401
else:
from typing_extensions import Literal, Protocol, get_args, get_origin # noqa: F401
from typing_extensions import ( # noqa: F401
Literal,
Protocol,
TypedDict,
get_args,
get_origin,
)


def findall(node: Element) -> Callable[..., Iterable[Element]]:
Expand Down
366 changes: 366 additions & 0 deletions myst_parser/inventory.py
@@ -0,0 +1,366 @@
"""Logic for dealing with sphinx style inventories (e.g. `objects.inv`).

These contain mappings of reference names to ids, scoped by domain and object type.

This is adapted from the Sphinx inventory.py module.
We replicate it here, so that it can be used without Sphinx.
"""
from __future__ import annotations

import argparse
import json
import re
import zlib
from dataclasses import dataclass
from fnmatch import fnmatchcase
from typing import IO, TYPE_CHECKING, Iterator
from urllib.request import urlopen

import yaml

from ._compat import TypedDict

if TYPE_CHECKING:
from sphinx.util.typing import Inventory


class InventoryItemType(TypedDict):
"""A single inventory item."""

loc: str
"""The relative location of the item."""
text: str | None
"""Implicit text to show for the item."""


class InventoryType(TypedDict):
"""Inventory data."""

name: str
"""The name of the project."""
version: str
"""The version of the project."""
objects: dict[str, dict[str, dict[str, InventoryItemType]]]
"""Mapping of domain -> object type -> name -> item."""


def format_inventory(inv: Inventory) -> InventoryType:
"""Convert a Sphinx inventory to one that is JSON compliant."""
project = ""
version = ""
objs: dict[str, dict[str, dict[str, InventoryItemType]]] = {}
for domain_obj_name, data in inv.items():
if ":" not in domain_obj_name:
continue

domain_name, obj_type = domain_obj_name.split(":", 1)
objs.setdefault(domain_name, {}).setdefault(obj_type, {})
for refname, refdata in data.items():
project, version, uri, text = refdata
objs[domain_name][obj_type][refname] = {
"loc": uri,
"text": None if (not text or text == "-") else text,
}

return {
"name": project,
"version": version,
"objects": objs,
}


def load(stream: IO) -> InventoryType:
"""Load inventory data from a stream."""
reader = InventoryFileReader(stream)
line = reader.readline().rstrip()
if line == "# Sphinx inventory version 1":
return _load_v1(reader)
elif line == "# Sphinx inventory version 2":
return _load_v2(reader)
else:
raise ValueError("invalid inventory header: %s" % line)


def _load_v1(stream: InventoryFileReader) -> InventoryType:
"""Load inventory data (format v1) from a stream."""
projname = stream.readline().rstrip()[11:]
version = stream.readline().rstrip()[11:]
invdata: InventoryType = {
"name": projname,
"version": version,
"objects": {},
}
for line in stream.readlines():
name, objtype, location = line.rstrip().split(None, 2)
# version 1 did not add anchors to the location
domain = "py"
if objtype == "mod":
objtype = "module"
location += "#module-" + name
else:
location += "#" + name
invdata["objects"].setdefault(domain, {}).setdefault(objtype, {})
invdata["objects"][domain][objtype][name] = {"loc": location, "text": None}

return invdata


def _load_v2(stream: InventoryFileReader) -> InventoryType:
"""Load inventory data (format v2) from a stream."""
projname = stream.readline().rstrip()[11:]
version = stream.readline().rstrip()[11:]
invdata: InventoryType = {
"name": projname,
"version": version,
"objects": {},
}
line = stream.readline()
if "zlib" not in line:
raise ValueError("invalid inventory header (not compressed): %s" % line)

for line in stream.read_compressed_lines():
# be careful to handle names with embedded spaces correctly
m = re.match(r"(?x)(.+?)\s+(\S+)\s+(-?\d+)\s+?(\S*)\s+(.*)", line.rstrip())
if not m:
continue
name: str
type: str
name, type, _, location, text = m.groups()
if ":" not in type:
# wrong type value. type should be in the form of "{domain}:{objtype}"
#
# Note: To avoid the regex DoS, this is implemented in python (refs: #8175)
continue
if (
type == "py:module"
and type in invdata["objects"]
and name in invdata["objects"][type]
):
# due to a bug in 1.1 and below,
# two inventory entries are created
# for Python modules, and the first
# one is correct
continue
if location.endswith("$"):
location = location[:-1] + name
domain, objtype = type.split(":", 1)
invdata["objects"].setdefault(domain, {}).setdefault(objtype, {})
if not text or text == "-":
text = None
invdata["objects"][domain][objtype][name] = {"loc": location, "text": text}
return invdata


_BUFSIZE = 16 * 1024


class InventoryFileReader:
"""A file reader for an inventory file.

This reader supports mixture of texts and compressed texts.
"""

def __init__(self, stream: IO) -> None:
self.stream = stream
self.buffer = b""
self.eof = False

def read_buffer(self) -> None:
chunk = self.stream.read(_BUFSIZE)
if chunk == b"":
self.eof = True
self.buffer += chunk

def readline(self) -> str:
pos = self.buffer.find(b"\n")
if pos != -1:
line = self.buffer[:pos].decode()
self.buffer = self.buffer[pos + 1 :]
elif self.eof:
line = self.buffer.decode()
self.buffer = b""
else:
self.read_buffer()
line = self.readline()

return line

def readlines(self) -> Iterator[str]:
while not self.eof:
line = self.readline()
if line:
yield line

def read_compressed_chunks(self) -> Iterator[bytes]:
decompressor = zlib.decompressobj()
while not self.eof:
self.read_buffer()
yield decompressor.decompress(self.buffer)
self.buffer = b""
yield decompressor.flush()

def read_compressed_lines(self) -> Iterator[str]:
buf = b""
for chunk in self.read_compressed_chunks():
buf += chunk
pos = buf.find(b"\n")
while pos != -1:
yield buf[:pos].decode()
buf = buf[pos + 1 :]
pos = buf.find(b"\n")


@dataclass
class InvMatch:
"""A match from an inventory."""

inv: str
domain: str
otype: str
name: str
proj: str
version: str
uri: str
text: str


def resolve_inventory(
inventories: dict[str, Inventory],
ref_target: str,
*,
ref_inv: None | str = None,
ref_domain: None | str = None,
ref_otype: None | str = None,
match_end=False,
) -> list[InvMatch]:
"""Resolve a cross-reference in the loaded sphinx inventories.

:param inventories: Mapping of inventory name to inventory data
:param ref_target: The target to search for
:param ref_inv: The name of the sphinx inventory to search, if None then
all inventories will be searched
:param ref_domain: The name of the domain to search, if None then all domains
will be searched
:param ref_otype: The type of object to search for, if None then all types will be searched
:param match_end: Whether to match against targets that end with the ref_target

:returns: matching results
"""
results: list[InvMatch] = []
for inv_name, inv_data in inventories.items():

if ref_inv is not None and ref_inv != inv_name:
continue

for domain_obj_name, data in inv_data.items():

if ":" not in domain_obj_name:
continue

domain_name, obj_type = domain_obj_name.split(":", 1)

if ref_domain is not None and ref_domain != domain_name:
continue

if ref_otype is not None and ref_otype != obj_type:
continue

if not match_end and ref_target in data:
results.append(
InvMatch(
inv_name, domain_name, obj_type, ref_target, *data[ref_target]
)
)
elif match_end:
for target in data:
if target.endswith(ref_target):
results.append(
InvMatch(
inv_name, domain_name, obj_type, target, *data[target]
)
)

return results


def inventory_cli(inputs: None | list[str] = None):
"""Command line interface for fetching and parsing an inventory."""
parser = argparse.ArgumentParser(description="Parse an inventory file.")
parser.add_argument("uri", metavar="[URL|PATH]", help="URI of the inventory file")
parser.add_argument(
"-d",
"--domain",
metavar="DOMAIN",
help="Filter the inventory by domain pattern",
)
parser.add_argument(
"-o",
"--object-type",
metavar="TYPE",
help="Filter the inventory by object type pattern",
)
parser.add_argument(
"-n",
"--name",
metavar="NAME",
help="Filter the inventory by reference name pattern",
)
parser.add_argument(
"-f",
"--format",
choices=["yaml", "json"],
default="yaml",
help="Output format (default: yaml)",
)
args = parser.parse_args(inputs)

if args.uri.startswith("http"):
try:
with urlopen(args.uri) as stream:
invdata = load(stream)
except Exception:
with urlopen(args.uri + "/objects.inv") as stream:
invdata = load(stream)
else:
with open(args.uri, "rb") as stream:
invdata = load(stream)

# filter the inventory
if args.domain:
invdata["objects"] = {
d: invdata["objects"][d]
for d in invdata["objects"]
if fnmatchcase(d, args.domain)
}
if args.object_type:
for domain in list(invdata["objects"]):
invdata["objects"][domain] = {
t: invdata["objects"][domain][t]
for t in invdata["objects"][domain]
if fnmatchcase(t, args.object_type)
}
if args.name:
for domain in invdata["objects"]:
for otype in list(invdata["objects"][domain]):
invdata["objects"][domain][otype] = {
n: invdata["objects"][domain][otype][n]
for n in invdata["objects"][domain][otype]
if fnmatchcase(n, args.name)
}

# clean up empty items
for domain in list(invdata["objects"]):
for otype in list(invdata["objects"][domain]):
if not invdata["objects"][domain][otype]:
del invdata["objects"][domain][otype]
if not invdata["objects"][domain]:
del invdata["objects"][domain]

if args.format == "json":
print(json.dumps(invdata, indent=2, sort_keys=False))
else:
print(yaml.dump(invdata, sort_keys=False))


if __name__ == "__main__":
inventory_cli()