Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Introduce {Work,Flow}.lightningignore #15818

Merged
merged 18 commits into from Dec 13, 2022
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
13 changes: 12 additions & 1 deletion docs/source-app/workflows/run_app_on_cloud/cloud_files.rst
Expand Up @@ -30,7 +30,6 @@ For example, the source code directory below with the ``.lightningignore`` file
├── requirements.txt
└── model.pt


.. code:: bash

~/project/home ❯ cat .lightningignore
Expand All @@ -39,6 +38,18 @@ For example, the source code directory below with the ``.lightningignore`` file

A sample ``.lightningignore`` file can be found `here <https://github.com/Lightning-AI/lightning.beta/blob/master/.lightningignore>`_.

If you are a component author and your component creates local files that you want to ignore, you can set the ``lightningignore`` attribute:

.. code-block:: python

class MyComponent(L.LightningWork): # or L.LightningFlow
def __init__(self):
super().__init__()
self.lightningignore = ["model.pt", "data_dir"]


This has the benefit that the files will be ignored automatically for all users of the component, making it easier
to transition between running locally and running in the cloud.

----

Expand Down
2 changes: 2 additions & 0 deletions src/lightning_app/CHANGELOG.md
Expand Up @@ -13,6 +13,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Show a message when `BuildConfig(requirements=[...])` is passed but a `requirements.txt` file is already present in the Work ([#15799](https://github.com/Lightning-AI/lightning/pull/15799))
- Show a message when `BuildConfig(dockerfile="...")` is passed but a `Dockerfile` file is already present in the Work ([#15799](https://github.com/Lightning-AI/lightning/pull/15799))

- Added `Lightning{Flow,Work}.lightningignore` attributes to programmatically ignore files before uploading to the cloud ([#15818](https://github.com/Lightning-AI/lightning/pull/15818))

- Added a CloudMultiProcessBackend which enables running a child App from within the Flow in the cloud ([#15800](https://github.com/Lightning-AI/lightning/pull/15800))


Expand Down
3 changes: 3 additions & 0 deletions src/lightning_app/components/multi_node/trainer.py
Expand Up @@ -92,3 +92,6 @@ def __init__(
cloud_compute=cloud_compute,
**work_kwargs,
)

# the Trainer enables TensorBoard by default, so this is often an undesired directory to upload to the cloud
self.lightningignore.append("lightning_logs")
10 changes: 10 additions & 0 deletions src/lightning_app/core/flow.py
Expand Up @@ -104,6 +104,7 @@ def __init__(self):
self._layout: Union[List[Dict], Dict] = {}
self._paths = {}
self._backend: Optional[Backend] = None
self._lightningignore: List[str] = []

@property
def name(self):
Expand Down Expand Up @@ -286,6 +287,15 @@ def flows(self) -> Dict[str, "LightningFlow"]:
flows.update(getattr(self, struct_name).flows)
return flows

@property
def lightningignore(self) -> List[str]:
carmocca marked this conversation as resolved.
Show resolved Hide resolved
"""Programmatic equivalent of the ``.lightningignore`` file."""
return self._lightningignore

@lightningignore.setter
def lightningignore(self, lightningignore: List[str]) -> None:
self._lightningignore = lightningignore

def works(self, recurse: bool = True) -> List[LightningWork]:
"""Return its :class:`~lightning_app.core.work.LightningWork`."""
works = [getattr(self, el) for el in sorted(self._works)]
Expand Down
10 changes: 10 additions & 0 deletions src/lightning_app/core/work.py
Expand Up @@ -148,6 +148,7 @@ def __init__(
self._local_build_config = local_build_config or BuildConfig()
self._cloud_build_config = cloud_build_config or BuildConfig()
self._cloud_compute = cloud_compute or CloudCompute()
self._lightningignore: List[str] = []
self._backend: Optional[Backend] = None
self._check_run_is_implemented()
self._on_init_end()
Expand Down Expand Up @@ -247,6 +248,15 @@ def cloud_compute(self, cloud_compute: CloudCompute) -> None:
compute_store.remove(self.name)
self._cloud_compute = cloud_compute

@property
def lightningignore(self) -> List[str]:
"""Programmatic equivalent of the ``.lightningignore`` file."""
return self._lightningignore

@lightningignore.setter
def lightningignore(self, lightningignore: List[str]) -> None:
self._lightningignore = lightningignore

@property
def status(self) -> WorkStatus:
"""Return the current status of the work.
Expand Down
52 changes: 38 additions & 14 deletions src/lightning_app/runners/cloud.py
Expand Up @@ -5,6 +5,7 @@
import sys
import time
from dataclasses import dataclass
from functools import partial
from pathlib import Path
from typing import Any, Callable, List, Optional, Union

Expand Down Expand Up @@ -59,6 +60,7 @@
from lightning_app.runners.backends.cloud import CloudBackend
from lightning_app.runners.runtime import Runtime
from lightning_app.source_code import LocalSourceCodeDir
from lightning_app.source_code.copytree import _filter_ignored, _parse_lightningignore
from lightning_app.storage import Drive, Mount
from lightning_app.utilities.app_helpers import Logger
from lightning_app.utilities.cloud import _get_project
Expand Down Expand Up @@ -214,7 +216,19 @@ def dispatch(
root = Path(self.entrypoint_file).absolute().parent
cleanup_handle = _prepare_lightning_wheels_and_requirements(root)
self.app._update_index_file()
repo = LocalSourceCodeDir(path=root)

# gather and merge all lightningignores
children = self.app.flows + self.app.works
lightningignores = [c.lightningignore for c in children]
if lightningignores:
merged = sum(lightningignores, [])
logger.debug(f"Found the following lightningignores: {merged}")
patterns = _parse_lightningignore(merged)
Borda marked this conversation as resolved.
Show resolved Hide resolved
ignore_functions = [partial(_filter_ignored, root, patterns)]
else:
ignore_functions = None

repo = LocalSourceCodeDir(path=root, ignore_functions=ignore_functions)
self._check_uploaded_folder(root, repo)
requirements_file = root / "requirements.txt"
# The entry point file needs to be relative to the root of the uploaded source file directory,
Expand Down Expand Up @@ -480,24 +494,34 @@ def _ensure_cluster_project_binding(self, project_id: str, cluster_id: str):
@staticmethod
def _check_uploaded_folder(root: Path, repo: LocalSourceCodeDir) -> None:
"""This method is used to inform the users if their folder files are large and how to filter them."""
lightning_tar = set(fnmatch.filter(repo.files, "*lightning-*.tar.gz"))
app_folder_size = sum(Path(p).stat().st_size for p in repo.files if p not in lightning_tar)
app_folder_size_in_mb = round(app_folder_size / (1000 * 1000), 5)
excludes = set(fnmatch.filter(repo.files, "*lightning-*.tar.gz"))
excludes.update(fnmatch.filter(repo.files, ".lightningignore"))
files = [Path(f) for f in repo.files if f not in excludes]
file_sizes = {f: f.stat().st_size for f in files}
mb = 1000 * 1000
Borda marked this conversation as resolved.
Show resolved Hide resolved
app_folder_size_in_mb = sum(file_sizes.values()) / mb
if app_folder_size_in_mb > CLOUD_UPLOAD_WARNING:
path_sizes = [(p, Path(p).stat().st_size / (1000 * 1000)) for p in repo.files]
largest_paths = sorted((x for x in path_sizes if x[-1] > 0.01), key=lambda x: x[1], reverse=True)[:25]
largest_paths_msg = "\n".join(f"{round(s, 5)} MB: {p}" for p, s in largest_paths)
# filter out files under 0.01mb
relevant_files = {f: s for f, s in file_sizes.items() if s > 0.01 * mb}
Borda marked this conversation as resolved.
Show resolved Hide resolved
if relevant_files:
by_largest = dict(sorted(relevant_files.items(), key=lambda x: x[1], reverse=True))
by_largest = dict(list(by_largest.items())[:25]) # trim
largest_paths_msg = "\n".join(
f"{round(s / mb, 5)} MB: {p.relative_to(root)}" for p, s in by_largest.items()
Borda marked this conversation as resolved.
Show resolved Hide resolved
)
largest_paths_msg = f"Here are the largest files:\n{largest_paths_msg}\n"
else:
largest_paths_msg = ""
warning_msg = (
f"Your application folder '{root.absolute()}' is more than {CLOUD_UPLOAD_WARNING} MB. "
f"The total size is {app_folder_size_in_mb} MB\n"
f"Here are the largest files: \n{largest_paths_msg}\n"
"Perhaps you should try running the app in an empty directory."
f"The total size is {round(app_folder_size_in_mb, 2)} MB. {len(files)} files were uploaded.\n"
+ largest_paths_msg
+ "Perhaps you should try running the app in an empty directory."
Borda marked this conversation as resolved.
Show resolved Hide resolved
)
if not (root / DOT_IGNORE_FILENAME).is_file():
warning_msg = (
warning_msg
+ "\nIn order to ignore some files or folder, "
+ "create a `.lightningignore` file and add the paths to ignore."
warning_msg += (
"\nIn order to ignore some files or folder, create a `.lightningignore` file and add the paths to"
" ignore. You can also set the `lightningingore` attribute in a Flow or Work."
)
else:
warning_msg += "\nYou can ignore some files or folders by adding them to `.lightningignore`."
Expand Down
6 changes: 4 additions & 2 deletions src/lightning_app/source_code/copytree.py
Expand Up @@ -3,18 +3,20 @@
from functools import partial
from pathlib import Path
from shutil import copy2, copystat, Error
from typing import Callable, List, Set, Union
from typing import Callable, List, Optional, Set, Union

from lightning_app.core.constants import DOT_IGNORE_FILENAME
from lightning_app.utilities.app_helpers import Logger

logger = Logger(__name__)

_IGNORE_FUNCTION = Callable[[Path, List[Path]], List[Path]]


def _copytree(
src: Union[Path, str],
dst: Union[Path, str],
ignore_functions: List[Callable] = None,
ignore_functions: Optional[List[_IGNORE_FUNCTION]] = None,
dirs_exist_ok=False,
dry_run=False,
) -> List[str]:
Expand Down
9 changes: 5 additions & 4 deletions src/lightning_app/source_code/local.py
Expand Up @@ -4,7 +4,7 @@
from shutil import rmtree
from typing import List, Optional

from lightning_app.source_code.copytree import _copytree
from lightning_app.source_code.copytree import _copytree, _IGNORE_FUNCTION
from lightning_app.source_code.hashing import _get_hash
from lightning_app.source_code.tar import _tar_path
from lightning_app.source_code.uploader import FileUploader
Expand All @@ -15,8 +15,9 @@ class LocalSourceCodeDir:

cache_location: Path = Path.home() / ".lightning" / "cache" / "repositories"

def __init__(self, path: Path):
def __init__(self, path: Path, ignore_functions: Optional[List[_IGNORE_FUNCTION]] = None) -> None:
self.path = path
self.ignore_functions = ignore_functions

# cache checksum version
self._version: Optional[str] = None
Expand All @@ -33,7 +34,7 @@ def __init__(self, path: Path):
def files(self) -> List[str]:
"""Returns a set of files that are not ignored by .lightningignore."""
if self._non_ignored_files is None:
self._non_ignored_files = _copytree(self.path, "", dry_run=True)
self._non_ignored_files = _copytree(self.path, "", ignore_functions=self.ignore_functions, dry_run=True)
return self._non_ignored_files

@property
Expand All @@ -59,7 +60,7 @@ def packaging_session(self) -> Path:
session_path = self.cache_location / "packaging_sessions" / self.version
try:
rmtree(session_path, ignore_errors=True)
_copytree(self.path, session_path)
_copytree(self.path, session_path, ignore_functions=self.ignore_functions)
yield session_path
finally:
rmtree(session_path, ignore_errors=True)
Expand Down
107 changes: 91 additions & 16 deletions tests/tests_app/runners/test_cloud.py
Expand Up @@ -41,13 +41,15 @@
V1Work,
)

from lightning_app import BuildConfig, LightningApp, LightningWork
from lightning_app import BuildConfig, LightningApp, LightningFlow, LightningWork
from lightning_app.runners import backends, cloud, CloudRuntime
from lightning_app.runners.cloud import (
_generate_works_json_gallery,
_generate_works_json_web,
_validate_build_spec_and_compute,
)
from lightning_app.source_code.copytree import _copytree, _parse_lightningignore
from lightning_app.source_code.local import LocalSourceCodeDir
from lightning_app.storage import Drive, Mount
from lightning_app.testing.helpers import EmptyWork
from lightning_app.utilities.cloud import _get_project
Expand Down Expand Up @@ -1184,31 +1186,38 @@ def test_get_project(monkeypatch):
assert ret.project_id == "test-project-id1"


def write_file_of_size(path, size):
    """Create a file at ``path`` that is exactly ``size`` bytes long.

    Missing parent directories are created and an existing file is overwritten.

    Args:
        path: Destination of the file (``str`` or ``os.PathLike``).
        size: Desired file size in bytes.
    """
    parent = os.path.dirname(path)
    # ``os.makedirs("")`` raises, so only create the parent when ``path`` has a directory component
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, "wb") as f:
        # ``seek(size)`` followed by a one-byte write produced ``size + 1`` bytes;
        # extending the freshly created (empty) file with ``truncate`` yields exactly ``size``
        f.truncate(size)


@mock.patch("lightning_app.core.queues.QueuingSystem", MagicMock())
@mock.patch("lightning_app.runners.backends.cloud.LightningClient", MagicMock())
def test_check_uploaded_folder(monkeypatch, tmpdir, caplog):

monkeypatch.setattr(cloud, "logger", logging.getLogger())

app = MagicMock()
repo = MagicMock()
root = Path(tmpdir)
repo = LocalSourceCodeDir(root)
backend = cloud.CloudRuntime(app)
with caplog.at_level(logging.WARN):
backend._check_uploaded_folder(Path(tmpdir), repo)
backend._check_uploaded_folder(root, repo)
assert caplog.messages == []

mock = MagicMock()
mock.st_mode = 33188
mock.st_size = 5 * 1000 * 1000
repo.files = [str(Path("./a.png"))]
monkeypatch.setattr(Path, "stat", MagicMock(return_value=mock))
# write some files to assert the message below.
write_file_of_size(root / "a.png", 4 * 1000 * 1000)
write_file_of_size(root / "b.txt", 5 * 1000 * 1000)
write_file_of_size(root / "c.jpg", 6 * 1000 * 1000)

path = Path(".")
repo._non_ignored_files = None # force reset
with caplog.at_level(logging.WARN):
backend._check_uploaded_folder(path, repo)
assert caplog.messages[0].startswith(
f"Your application folder '{path.absolute()}' is more than 2 MB. The total size is 5.0 MB"
)
backend._check_uploaded_folder(root, repo)
assert f"Your application folder '{root.absolute()}' is more than 2 MB" in caplog.text
assert "The total size is 15.0 MB" in caplog.text
assert "3 files were uploaded" in caplog.text
assert "files:\n6.0 MB: c.jpg\n5.0 MB: b.txt\n4.0 MB: a.png\nPerhaps" in caplog.text # tests the order
assert "create a `.lightningignore` file" in caplog.text
assert "lightningingore` attribute in a Flow or Work" in caplog.text


@mock.patch("lightning_app.core.queues.QueuingSystem", MagicMock())
Expand Down Expand Up @@ -1368,3 +1377,69 @@ def run(self):

with pytest.raises(ValueError, match="You requested a custom base image for the Work with name"):
_validate_build_spec_and_compute(Work())


def test_programmatic_lightningignore(monkeypatch, caplog, tmpdir):
    """Check that ``lightningignore`` attributes set in ``__init__`` are merged and applied on dispatch.

    A Flow and a Work each declare ignore patterns in ``__init__`` and append more inside ``run``.
    The test asserts that only the ``__init__`` patterns reach ``_parse_lightningignore``, that the
    resulting pattern set is forwarded to ``_copytree`` through ``ignore_functions``, and that the
    upload-size warning only reports the files that were not ignored.
    """
    mock_client = mock.MagicMock()
    mock_client.projects_service_list_memberships.return_value = V1ListMembershipsResponse(
        memberships=[V1Membership(name="test-project", project_id="test-project-id")]
    )
    mock_client.lightningapp_instance_service_list_lightningapp_instances.return_value = (
        V1ListLightningappInstancesResponse(lightningapps=[])
    )
    mock_client.lightningapp_v2_service_create_lightningapp_release.return_value = V1LightningappRelease(
        cluster_id="test"
    )
    cloud_backend = mock.MagicMock(client=mock_client)
    monkeypatch.setattr(backends, "CloudBackend", mock.MagicMock(return_value=cloud_backend))

    class MyWork(LightningWork):
        def __init__(self):
            super().__init__()
            self.lightningignore = ["foo", "lightning_logs"]

        def run(self):
            # this is ignored
            self.lightningignore.append("foobar")

    class MyFlow(LightningFlow):
        def __init__(self):
            super().__init__()
            self.lightningignore = ["foo"]
            self.w = MyWork()

        def run(self):
            # this is ignored
            self.lightningignore.append("baz")
            self.w.run()

    app = LightningApp(MyFlow())
    monkeypatch.setattr(app, "_update_index_file", mock.MagicMock())

    path = Path(tmpdir)
    cloud_runtime = cloud.CloudRuntime(app=app, entrypoint_file=path / "entrypoint.py")
    monkeypatch.setattr(LocalSourceCodeDir, "upload", mock.MagicMock())

    # write some files
    write_file_of_size(path / "a.txt", 5 * 1000 * 1000)
    write_file_of_size(path / "foo.png", 4 * 1000 * 1000)
    write_file_of_size(path / "lightning_logs" / "foo.ckpt", 6 * 1000 * 1000)
    # also an actual .lightningignore file
    (path / ".lightningignore").write_text("foo.png")

    with mock.patch(
        "lightning_app.runners.cloud._parse_lightningignore", wraps=_parse_lightningignore
    ) as parse_mock, mock.patch(
        "lightning_app.source_code.local._copytree", wraps=_copytree
    ) as copy_mock, caplog.at_level(
        logging.WARN
    ):
        cloud_runtime.dispatch()

    # the Flow's patterns come first, followed by the Work's; duplicates are only removed by the parser
    parse_mock.assert_called_once_with(["foo", "foo", "lightning_logs"])
    assert copy_mock.mock_calls[0].kwargs["ignore_functions"][0].args[1] == {"lightning_logs", "foo"}

    assert f"Your application folder '{path.absolute()}' is more than 2 MB" in caplog.text
    assert "The total size is 5.0 MB" in caplog.text
    # a bare string literal is always truthy: the message has to be looked up in the captured log
    assert "2 files were uploaded" in caplog.text  # a.txt and .lightningignore
    assert "files:\n5.0 MB: a.txt\nPerhaps" in caplog.text  # only this file appears