micropkg packaging improvements #2614

Merged
merged 15 commits on Jun 13, 2023
2 changes: 2 additions & 0 deletions RELEASE.md
@@ -13,6 +13,8 @@
## Major features and improvements

## Bug fixes and other changes
* Reworked micropackaging workflow to use standard Python packaging practices.
* Make `kedro micropkg package` accept `--verbose`.

## Breaking changes to the API

4 changes: 2 additions & 2 deletions docs/source/nodes_and_pipelines/micro_packaging.md
@@ -6,7 +6,7 @@ Micro-packaging allows users to share Kedro micro-packages across codebases, org

You can package a micro-package by executing: `kedro micropkg package <micropkg_name>`

* This will generate a new [tar](https://docs.python.org/3/distutils/sourcedist.html) file for this micro-package.
* This will generate a new [source distribution](https://docs.python.org/3/distutils/sourcedist.html) for this micro-package.
* By default, the tar file will be saved into the `dist/` directory inside your project.
* You can customise the target with the `--destination` (`-d`) option.
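
For example, a minimal sketch of a packaging call, assuming a modular pipeline named `data_processing` under `pipelines` (both names, and the `packages/` folder, are placeholders):

```bash
# Package the micro-package as a source distribution into dist/
kedro micropkg package pipelines.data_processing

# Or write the source distribution to a custom folder instead of dist/
kedro micropkg package pipelines.data_processing --destination packages/
```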

@@ -64,7 +64,7 @@ The examples above apply to any generic Python package, modular pipelines fall u

You can pull a micro-package from a tar file by executing `kedro micropkg pull <package_name>`.

* The `<package_name>` must either be a package name on PyPI or a path to the tar file.
* The `<package_name>` must either be a package name on PyPI or a path to the source distribution file.
* Kedro will unpack the tar file and install the files in the following locations in your Kedro project:
* All the micro-package code in `src/<package_name>/<micropkg_name>/`
* Configuration files in `conf/<env>/parameters/<micropkg_name>.yml`, where `<env>` defaults to `base`.
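
For example, assuming a previously packaged micro-package (the file name and package name below are placeholders):

```bash
# Pull from a local source distribution file
kedro micropkg pull dist/data_processing-0.1.tar.gz

# Or pull by distribution name from PyPI
kedro micropkg pull my-micropkg-name
```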
206 changes: 161 additions & 45 deletions kedro/framework/cli/micropkg.py
@@ -8,14 +8,17 @@
import tempfile
from importlib import import_module
from pathlib import Path
from typing import Iterable, List, Tuple, Union
from typing import Any, Iterable, Iterator, List, Tuple, Union

import click
import pkg_resources
from build.util import project_wheel_metadata
from packaging.requirements import InvalidRequirement, Requirement
from packaging.utils import canonicalize_name
from rope.base.project import Project
from rope.contrib import generate
from rope.refactor.move import MoveModule
from rope.refactor.rename import Rename
from setuptools.discovery import FlatLayoutPackageFinder

from kedro.framework.cli.pipeline import (
_assert_pkg_name_ok,
@@ -47,6 +50,55 @@
"""


class _EquivalentRequirement(Requirement):
"""Parse a requirement according to PEP 508.

This class overrides __eq__ to be backwards compatible with pkg_resources.Requirement
while making __str__ and __hash__ use the non-canonicalized name
as agreed in https://github.com/pypa/packaging/issues/644.

Implementation taken from https://github.com/pypa/packaging/pull/696/
"""

def _iter_parts(self, name: str) -> Iterator[str]:
yield name

if self.extras:
formatted_extras = ",".join(sorted(self.extras))
yield f"[{formatted_extras}]"

if self.specifier:
yield str(self.specifier)

if self.url:
yield f"@ {self.url}"
if self.marker:
yield " "

if self.marker:
yield f"; {self.marker}"

def __str__(self) -> str:
return "".join(self._iter_parts(self.name))

def __hash__(self) -> int:
return hash(
(
self.__class__.__name__,
*self._iter_parts(canonicalize_name(self.name)),
)
)

def __eq__(self, other: Any) -> bool:
return (
canonicalize_name(self.name) == canonicalize_name(other.name)
and self.extras == other.extras
and self.specifier == other.specifier
and self.url == other.url
and self.marker == other.marker
)
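
# Illustrative usage sketch (the requirement strings below are hypothetical):
# differently spelled but equivalent names compare equal, while str() keeps
# the original, non-canonicalized spelling.
#
#     req_a = _EquivalentRequirement("My_Package[extra]>=1.0")
#     req_b = _EquivalentRequirement("my-package[extra]>=1.0")
#     assert req_a == req_b                          # names compared canonicalized
#     assert str(req_a) == "My_Package[extra]>=1.0"  # original spelling preserved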


def _check_module_path(ctx, param, value): # pylint: disable=unused-argument
if value and not re.match(r"^[\w.]+$", value):
message = (
@@ -131,7 +183,7 @@ def pull_package( # pylint:disable=unused-argument, too-many-arguments
click.secho(message, fg="green")


# pylint: disable=too-many-arguments, too-many-locals
# pylint: disable=too-many-arguments
def _pull_package(
package_path: str,
metadata: ProjectMetadata,
@@ -144,25 +196,46 @@ def _pull_package(
temp_dir_path = Path(temp_dir).resolve()
_unpack_sdist(package_path, temp_dir_path, fs_args)

egg_info_files = list((temp_dir_path).rglob("*.egg-info"))
if len(egg_info_files) != 1:
# temp_dir_path is the parent directory of the project root dir
contents = [member for member in temp_dir_path.iterdir() if member.is_dir()]
if len(contents) != 1:
raise KedroCliError(
f"More than 1 or no egg-info files found from {package_path}. "
f"There has to be exactly one egg-info directory."
"Invalid sdist was extracted: exactly one directory was expected, "
f"got {contents}"
)
egg_info_file = egg_info_files[0]
package_name = egg_info_file.stem
package_requirements = egg_info_file.parent / "setup.py"

# Finds a string representation of 'install_requires' list from setup.py
reqs_list_pattern = r"install_requires\=(.*?)\,\n"
list_reqs = re.findall(
reqs_list_pattern, package_requirements.read_text(encoding="utf-8")
)
project_root_dir = contents[0]

# This is much slower than parsing the requirements
# directly from the metadata files
# because it installs the package in an isolated environment,
# but it's the only reliable way of doing it
# without making assumptions on the project metadata.
library_meta = project_wheel_metadata(project_root_dir)

# Project name will be `my-pipeline` even if `setup.py` says `my_pipeline`
# because standards mandate normalization of names for comparison,
# see https://packaging.python.org/en/latest/specifications/core-metadata/#name
# The proper way to get it would be
# project_name = library_meta.get("Name")
# However, the rest of the code expects the non-normalized package name,
# so we have to find it.
packages = [
package
for package in FlatLayoutPackageFinder().find(project_root_dir)
if "." not in package
]
if len(packages) != 1:
# Should not happen if user is calling `micropkg pull`
# with the result of a `micropkg package`,
# and in general if the distribution only contains one package (most likely),
# but helps give a sensible error message otherwise
raise KedroCliError(
"Invalid package contents: exactly one package was expected, "
f"got {packages}"
)
package_name = packages[0]

# Finds all elements from the above string representation of a list
reqs_element_pattern = r"\'(.*?)\'"
package_reqs = re.findall(reqs_element_pattern, list_reqs[0])
package_reqs = _get_all_library_reqs(library_meta)

if package_reqs:
requirements_txt = metadata.source_dir / "requirements.txt"
Expand All @@ -172,7 +245,7 @@ def _pull_package(
_install_files(
metadata,
package_name,
egg_info_file.parent,
project_root_dir,
env,
alias,
destination,
@@ -227,7 +300,7 @@ def _package_micropkgs_from_manifest(metadata: ProjectMetadata) -> None:
click.secho("Micro-packages packaged!", fg="green")


@micropkg.command("package")
@command_with_verbosity(micropkg, "package")
@env_option(
help="Environment where the micro-package configuration lives. Defaults to `base`."
)
@@ -254,8 +327,14 @@ def _package_micropkgs_from_manifest(metadata: ProjectMetadata) -> None:
@click.argument("module_path", nargs=1, required=False, callback=_check_module_path)
@click.pass_obj # this will pass the metadata as first argument
def package_micropkg(
metadata: ProjectMetadata, module_path, env, alias, destination, all_flag
):
metadata: ProjectMetadata,
module_path,
env,
alias,
destination,
all_flag,
**kwargs,
): # pylint: disable=unused-argument
"""Package up a modular pipeline or micro-package as a Python source distribution."""
if not module_path and not all_flag:
click.secho(
@@ -324,11 +403,20 @@ def _unpack_sdist(location: str, destination: Path, fs_args: str | None) -> None
safe_extract(tar_file, destination)
else:
python_call(
"pip", ["download", "--no-deps", "--dest", str(destination), location]
"pip",
[
"download",
"--no-deps",
"--no-binary", # the micropackaging expects an sdist,
":all:", # wheels are not supported
"--dest",
str(destination),
location,
],
)
sdist_file = list(destination.glob("*.tar.gz"))
# `--no-deps` should fetch only one source distribution file, and CLI should fail if that's
# not the case.
# `--no-deps --no-binary :all:` should fetch only one source distribution file,
# and CLI should fail if that's not the case.
if len(sdist_file) != 1:
file_names = [sf.name for sf in sdist_file]
raise KedroCliError(
@@ -568,13 +656,23 @@ def _sync_path_list(source: list[tuple[Path, str]], target: Path) -> None:
_sync_dirs(source_path, target_with_suffix)


def _drop_comment(line):
# https://github.com/pypa/setuptools/blob/b545fc7/\
# pkg_resources/_vendor/jaraco/text/__init__.py#L554-L566
return line.partition(" #")[0]


def _make_install_requires(requirements_txt: Path) -> list[str]:
"""Parses each line of requirements.txt into a version specifier valid to put in
install_requires."""
install_requires.
Matches pkg_resources.parse_requirements"""
if not requirements_txt.exists():
return []
requirements = pkg_resources.parse_requirements(requirements_txt.read_text())
return [str(requirement) for requirement in requirements]
return [
str(_EquivalentRequirement(_drop_comment(requirement_line)))
for requirement_line in requirements_txt.read_text().splitlines()
if requirement_line and not requirement_line.startswith("#")
]


def _create_nested_package(project: Project, package_path: Path) -> Path:
@@ -749,9 +847,7 @@ def _generate_sdist_file(
raise KedroCliError(f"{cls.__module__}.{cls.__qualname__}: {exc}") from exc

_generate_manifest_file(temp_dir_path)
setup_file = _generate_setup_file(
package_name, version, install_requires, temp_dir_path
)
_generate_setup_file(package_name, version, install_requires, temp_dir_path)

package_file = destination / _get_sdist_name(name=package_name, version=version)

@@ -760,14 +856,14 @@
f"Package file {package_file} will be overwritten!", fg="yellow"
)

# python setup.py sdist --formats=gztar --dist-dir <destination>
# python -m build --outdir <destination>
call(
[
sys.executable,
str(setup_file.resolve()),
"sdist",
"--formats=gztar",
"--dist-dir",
"-m",
"build",
"--sdist",
"--outdir",
str(destination),
],
cwd=temp_dir,
@@ -854,19 +950,39 @@ def _append_package_reqs(
)


def _get_all_library_reqs(metadata):
"""Get all library requirements from metadata, leaving markers intact."""
# See https://discuss.python.org/t/\
# programmatically-getting-non-optional-requirements-of-current-directory/26963/2
return [
str(_EquivalentRequirement(dep_str))
for dep_str in metadata.get_all("Requires-Dist", [])
]


def _safe_parse_requirements(
requirements: str | Iterable[str],
) -> set[pkg_resources.Requirement]:
"""Safely parse a requirement or set of requirements. This effectively replaces
pkg_resources.parse_requirements, which blows up with a ValueError as soon as it
) -> set[_EquivalentRequirement]:
"""Safely parse a requirement or set of requirements. This avoids blowing up when it
encounters a requirement it cannot parse (e.g. `-r requirements.txt`). This way
we can still extract all the parseable requirements out of a set containing some
unparseable requirements.
"""
parseable_requirements = set()
for requirement in pkg_resources.yield_lines(requirements):
try:
parseable_requirements.add(pkg_resources.Requirement.parse(requirement))
except ValueError:
continue
if isinstance(requirements, str):
requirements = requirements.splitlines()
# TODO: Properly handle continuation lines,
# see https://github.com/pypa/setuptools/blob/v67.8.0/setuptools/_reqs.py
for requirement_line in requirements:
if (
requirement_line
and not requirement_line.startswith("#")
and not requirement_line.startswith("-e")
):
try:
parseable_requirements.add(
_EquivalentRequirement(_drop_comment(requirement_line))
)
except InvalidRequirement:
continue
return parseable_requirements