Skip to content

Commit

Permalink
Introduce {Work,Flow}.lightningignore (#15818)
Browse files Browse the repository at this point in the history
(cherry picked from commit edd2b42)
  • Loading branch information
carmocca authored and Borda committed Dec 15, 2022
1 parent dc96640 commit e2a7668
Show file tree
Hide file tree
Showing 10 changed files with 214 additions and 41 deletions.
13 changes: 12 additions & 1 deletion docs/source-app/workflows/run_app_on_cloud/cloud_files.rst
Expand Up @@ -30,7 +30,6 @@ For example, the source code directory below with the ``.lightningignore`` file
├── requirements.txt
└── model.pt
.. code:: bash
~/project/home ❯ cat .lightningignore
Expand All @@ -39,6 +38,18 @@ For example, the source code directory below with the ``.lightningignore`` file
A sample ``.lightningignore`` file can be found `here <https://github.com/Lightning-AI/lightning.beta/blob/master/.lightningignore>`_.

If you are a component author and your component creates local files that you want to ignore, you can do:

.. code-block:: python

    class MyComponent(L.LightningWork):  # or L.LightningFlow
        def __init__(self):
            super().__init__()
            self.lightningignore = ("model.pt", "data_dir")

This has the benefit that the files will be ignored automatically for all the component users, making an easier
transition between running locally vs in the cloud.

----

Expand Down
2 changes: 2 additions & 0 deletions src/lightning_app/CHANGELOG.md
Expand Up @@ -9,6 +9,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

### Added

- Added `Lightning{Flow,Work}.lightningignore` attributes to programmatically ignore files before uploading to the cloud ([#15818](https://github.com/Lightning-AI/lightning/pull/15818))


### Changed

Expand Down
3 changes: 3 additions & 0 deletions src/lightning_app/components/multi_node/trainer.py
Expand Up @@ -114,3 +114,6 @@ def __init__(
cloud_compute=cloud_compute,
**work_kwargs,
)

# the Trainer enables TensorBoard by default, so this is often an undesired directory to upload to the cloud
self.lightningignore += ("lightning_logs",)
24 changes: 23 additions & 1 deletion src/lightning_app/core/flow.py
Expand Up @@ -10,7 +10,13 @@
from lightning_app.frontend import Frontend
from lightning_app.storage import Path
from lightning_app.storage.drive import _maybe_create_drive, Drive
from lightning_app.utilities.app_helpers import _is_json_serializable, _LightningAppRef, _set_child_name, is_overridden
from lightning_app.utilities.app_helpers import (
_is_json_serializable,
_lightning_dispatched,
_LightningAppRef,
_set_child_name,
is_overridden,
)
from lightning_app.utilities.component import _sanitize_state
from lightning_app.utilities.exceptions import ExitAppException
from lightning_app.utilities.introspection import _is_init_context, _is_run_context
Expand Down Expand Up @@ -104,6 +110,8 @@ def __init__(self):
self._layout: Union[List[Dict], Dict] = {}
self._paths = {}
self._backend: Optional[Backend] = None
# tuple instead of a list so that it cannot be modified without using the setter
self._lightningignore: Tuple[str, ...] = tuple()

@property
def name(self):
Expand Down Expand Up @@ -310,6 +318,20 @@ def flows(self) -> Dict[str, "LightningFlow"]:
flows.update(getattr(self, struct_name).flows)
return flows

@property
def lightningignore(self) -> Tuple[str, ...]:
"""Programmatic equivalent of the ``.lightningignore`` file.

Returns the tuple of ignore patterns stored in ``self._lightningignore``. The value is a tuple, so it cannot
be changed in place; assign a new tuple to this property to go through the setter.
"""
return self._lightningignore

@lightningignore.setter
def lightningignore(self, lightningignore: Tuple[str, ...]) -> None:
    """Replace the programmatic ignore patterns of this component.

    Args:
        lightningignore: The patterns to ignore. Any iterable of strings is accepted and stored as a tuple;
            a bare string is treated as one pattern.

    Raises:
        RuntimeError: If ``_lightning_dispatched()`` reports that the app has already been dispatched, because
            changing the patterns at that point has no effect.
    """
    if _lightning_dispatched():
        raise RuntimeError(
            f"Your app has been already dispatched, so modifying the `{self.name}.lightningignore` does not have an"
            " effect"
        )
    # a bare string is a single pattern, not an iterable of one-character patterns
    if isinstance(lightningignore, str):
        lightningignore = (lightningignore,)
    # always store a tuple: it keeps the value immutable outside of this setter and keeps
    # tuple concatenation (e.g. `self.lightningignore += (...)`) working when a list was passed
    self._lightningignore = tuple(lightningignore)

def works(self, recurse: bool = True) -> List[LightningWork]:
"""Return its :class:`~lightning_app.core.work.LightningWork`."""
works = [getattr(self, el) for el in sorted(self._works)]
Expand Down
25 changes: 23 additions & 2 deletions src/lightning_app/core/work.py
Expand Up @@ -3,15 +3,20 @@
import warnings
from copy import deepcopy
from functools import partial, wraps
from typing import Any, Callable, Dict, List, Optional, Type, TYPE_CHECKING, Union
from typing import Any, Callable, Dict, List, Optional, Tuple, Type, TYPE_CHECKING, Union

from deepdiff import DeepHash, Delta

from lightning_app.core.queues import BaseQueue
from lightning_app.storage import Path
from lightning_app.storage.drive import _maybe_create_drive, Drive
from lightning_app.storage.payload import Payload
from lightning_app.utilities.app_helpers import _is_json_serializable, _LightningAppRef, is_overridden
from lightning_app.utilities.app_helpers import (
_is_json_serializable,
_lightning_dispatched,
_LightningAppRef,
is_overridden,
)
from lightning_app.utilities.component import _is_flow_context, _sanitize_state
from lightning_app.utilities.enum import (
CacheCallsKeys,
Expand Down Expand Up @@ -154,6 +159,8 @@ def __init__(
self._local_build_config = local_build_config or BuildConfig()
self._cloud_build_config = cloud_build_config or BuildConfig()
self._cloud_compute = cloud_compute or CloudCompute()
# tuple instead of a list so that it cannot be modified without using the setter
self._lightningignore: Tuple[str, ...] = tuple()
self._backend: Optional[Backend] = None
self._check_run_is_implemented()
self._on_init_end()
Expand Down Expand Up @@ -253,6 +260,20 @@ def cloud_compute(self, cloud_compute: CloudCompute) -> None:
compute_store.remove(self.name)
self._cloud_compute = cloud_compute

@property
def lightningignore(self) -> Tuple[str, ...]:
"""Programmatic equivalent of the ``.lightningignore`` file.

Returns the tuple of ignore patterns stored in ``self._lightningignore``. The value is a tuple, so it cannot
be changed in place; assign a new tuple to this property to go through the setter.
"""
return self._lightningignore

@lightningignore.setter
def lightningignore(self, lightningignore: Tuple[str, ...]) -> None:
    """Replace the programmatic ignore patterns of this component.

    Args:
        lightningignore: The patterns to ignore. Any iterable of strings is accepted and stored as a tuple;
            a bare string is treated as one pattern.

    Raises:
        RuntimeError: If ``_lightning_dispatched()`` reports that the app has already been dispatched, because
            changing the patterns at that point has no effect.
    """
    if _lightning_dispatched():
        raise RuntimeError(
            f"Your app has been already dispatched, so modifying the `{self.name}.lightningignore` does not have an"
            " effect"
        )
    # a bare string is a single pattern, not an iterable of one-character patterns
    if isinstance(lightningignore, str):
        lightningignore = (lightningignore,)
    # always store a tuple: it keeps the value immutable outside of this setter and keeps
    # tuple concatenation (e.g. `self.lightningignore += (...)`) working when a list was passed
    self._lightningignore = tuple(lightningignore)

@property
def status(self) -> WorkStatus:
"""Return the current status of the work.
Expand Down
52 changes: 38 additions & 14 deletions src/lightning_app/runners/cloud.py
Expand Up @@ -5,6 +5,7 @@
import sys
import time
from dataclasses import dataclass
from functools import partial
from pathlib import Path
from textwrap import dedent
from typing import Any, List, Optional, Union
Expand Down Expand Up @@ -62,6 +63,7 @@
from lightning_app.runners.backends.cloud import CloudBackend
from lightning_app.runners.runtime import Runtime
from lightning_app.source_code import LocalSourceCodeDir
from lightning_app.source_code.copytree import _filter_ignored, _parse_lightningignore
from lightning_app.storage import Drive, Mount
from lightning_app.utilities.app_helpers import _is_headless, Logger
from lightning_app.utilities.cloud import _get_project
Expand Down Expand Up @@ -217,7 +219,19 @@ def dispatch(
root = Path(self.entrypoint_file).absolute().parent
cleanup_handle = _prepare_lightning_wheels_and_requirements(root)
self.app._update_index_file()
repo = LocalSourceCodeDir(path=root)

# gather and merge all lightningignores
children = self.app.flows + self.app.works
lightningignores = [c.lightningignore for c in children]
if lightningignores:
merged = sum(lightningignores, tuple())
logger.debug(f"Found the following lightningignores: {merged}")
patterns = _parse_lightningignore(merged)
ignore_functions = [partial(_filter_ignored, root, patterns)]
else:
ignore_functions = None

repo = LocalSourceCodeDir(path=root, ignore_functions=ignore_functions)
self._check_uploaded_folder(root, repo)
requirements_file = root / "requirements.txt"
# The entry point file needs to be relative to the root of the uploaded source file directory,
Expand Down Expand Up @@ -493,24 +507,34 @@ def _ensure_cluster_project_binding(self, project_id: str, cluster_id: str):
@staticmethod
def _check_uploaded_folder(root: Path, repo: LocalSourceCodeDir) -> None:
"""This method is used to inform the users if their folder files are large and how to filter them."""
lightning_tar = set(fnmatch.filter(repo.files, "*lightning-*.tar.gz"))
app_folder_size = sum(Path(p).stat().st_size for p in repo.files if p not in lightning_tar)
app_folder_size_in_mb = round(app_folder_size / (1000 * 1000), 5)
excludes = set(fnmatch.filter(repo.files, "*lightning-*.tar.gz"))
excludes.update(fnmatch.filter(repo.files, ".lightningignore"))
files = [Path(f) for f in repo.files if f not in excludes]
file_sizes = {f: f.stat().st_size for f in files}
mb = 1000_000
app_folder_size_in_mb = sum(file_sizes.values()) / mb
if app_folder_size_in_mb > CLOUD_UPLOAD_WARNING:
path_sizes = [(p, Path(p).stat().st_size / (1000 * 1000)) for p in repo.files]
largest_paths = sorted((x for x in path_sizes if x[-1] > 0.01), key=lambda x: x[1], reverse=True)[:25]
largest_paths_msg = "\n".join(f"{round(s, 5)} MB: {p}" for p, s in largest_paths)
# filter out files under 0.01mb
relevant_files = {f: sz for f, sz in file_sizes.items() if sz > 0.01 * mb}
if relevant_files:
by_largest = dict(sorted(relevant_files.items(), key=lambda x: x[1], reverse=True))
by_largest = dict(list(by_largest.items())[:25]) # trim
largest_paths_msg = "\n".join(
f"{round(sz / mb, 5)} MB: {p.relative_to(root)}" for p, sz in by_largest.items()
)
largest_paths_msg = f"Here are the largest files:\n{largest_paths_msg}\n"
else:
largest_paths_msg = ""
warning_msg = (
f"Your application folder '{root.absolute()}' is more than {CLOUD_UPLOAD_WARNING} MB. "
f"The total size is {app_folder_size_in_mb} MB\n"
f"Here are the largest files: \n{largest_paths_msg}\n"
"Perhaps you should try running the app in an empty directory."
f"The total size is {round(app_folder_size_in_mb, 2)} MB. {len(files)} files were uploaded.\n"
+ largest_paths_msg
+ "Perhaps you should try running the app in an empty directory."
)
if not (root / DOT_IGNORE_FILENAME).is_file():
warning_msg = (
warning_msg
+ "\nIn order to ignore some files or folder, "
+ "create a `.lightningignore` file and add the paths to ignore."
warning_msg += (
"\nIn order to ignore some files or folder, create a `.lightningignore` file and add the paths to"
" ignore. You can also set the `lightningignore` attribute in a Flow or Work."
)
else:
warning_msg += "\nYou can ignore some files or folders by adding them to `.lightningignore`."
Expand Down
6 changes: 4 additions & 2 deletions src/lightning_app/source_code/copytree.py
Expand Up @@ -3,18 +3,20 @@
from functools import partial
from pathlib import Path
from shutil import copy2, copystat, Error
from typing import Callable, List, Set, Union
from typing import Callable, List, Optional, Set, Union

from lightning_app.core.constants import DOT_IGNORE_FILENAME
from lightning_app.utilities.app_helpers import Logger

logger = Logger(__name__)

_IGNORE_FUNCTION = Callable[[Path, List[Path]], List[Path]]


def _copytree(
src: Union[Path, str],
dst: Union[Path, str],
ignore_functions: List[Callable] = None,
ignore_functions: Optional[List[_IGNORE_FUNCTION]] = None,
dirs_exist_ok=False,
dry_run=False,
) -> List[str]:
Expand Down
9 changes: 5 additions & 4 deletions src/lightning_app/source_code/local.py
Expand Up @@ -4,7 +4,7 @@
from shutil import rmtree
from typing import List, Optional

from lightning_app.source_code.copytree import _copytree
from lightning_app.source_code.copytree import _copytree, _IGNORE_FUNCTION
from lightning_app.source_code.hashing import _get_hash
from lightning_app.source_code.tar import _tar_path
from lightning_app.source_code.uploader import FileUploader
Expand All @@ -15,8 +15,9 @@ class LocalSourceCodeDir:

cache_location: Path = Path.home() / ".lightning" / "cache" / "repositories"

def __init__(self, path: Path):
def __init__(self, path: Path, ignore_functions: Optional[List[_IGNORE_FUNCTION]] = None) -> None:
self.path = path
self.ignore_functions = ignore_functions

# cache checksum version
self._version: Optional[str] = None
Expand All @@ -33,7 +34,7 @@ def __init__(self, path: Path):
def files(self) -> List[str]:
    """Return the list of files that are not ignored, computing it at most once.

    The listing is produced by a dry-run ``_copytree`` over ``self.path`` that receives
    ``self.ignore_functions``, and is cached in ``self._non_ignored_files`` for later calls.
    """
    if self._non_ignored_files is None:
        # single dry-run traversal; the extra ignore functions must be passed here so the listing
        # matches what `_copytree` is given when the directory is actually packaged
        self._non_ignored_files = _copytree(self.path, "", ignore_functions=self.ignore_functions, dry_run=True)
    return self._non_ignored_files

@property
Expand All @@ -59,7 +60,7 @@ def packaging_session(self) -> Path:
session_path = self.cache_location / "packaging_sessions" / self.version
try:
rmtree(session_path, ignore_errors=True)
_copytree(self.path, session_path)
_copytree(self.path, session_path, ignore_functions=self.ignore_functions)
yield session_path
finally:
rmtree(session_path, ignore_errors=True)
Expand Down
6 changes: 5 additions & 1 deletion src/lightning_app/utilities/app_helpers.py
Expand Up @@ -511,11 +511,15 @@ def is_static_method(klass_or_instance, attr) -> bool:
return isinstance(inspect.getattr_static(klass_or_instance, attr), staticmethod)


def _lightning_dispatched() -> bool:
    """Return whether the ``LIGHTNING_DISPATCHED`` environment variable holds a non-zero integer.

    An unset or empty variable counts as "not dispatched".

    Raises:
        ValueError: If the variable is set to text that ``int`` cannot parse.
    """
    # `or "0"` covers both the unset case (``None``) and an empty value, which ``int`` would reject
    return bool(int(os.getenv("LIGHTNING_DISPATCHED") or "0"))


def _should_dispatch_app() -> bool:
    """Return whether the app should be dispatched from the current process.

    All of the following must hold: the interpreter runs with ``__debug__`` enabled, ``_pytest.doctest`` has
    not been imported, ``_lightning_dispatched()`` is false, and ``LIGHTNING_APP_STATE_URL`` is not present in
    the environment.
    """
    return (
        __debug__
        and "_pytest.doctest" not in sys.modules
        # single source of truth for the LIGHTNING_DISPATCHED check, shared with the lightningignore setters
        and not _lightning_dispatched()
        and "LIGHTNING_APP_STATE_URL" not in os.environ
    )

Expand Down

0 comments on commit e2a7668

Please sign in to comment.