Skip to content

Commit

Permalink
Hydra + DDP improvements
Browse files Browse the repository at this point in the history
* Create different hydra output subdirectories for processes started by DDP
* Support experimental-rerun
* If rerun is not enabled but multi-run used, raise explicit error
Reverts parts of Lightning-AI#15737
  • Loading branch information
nisheethlahoti committed Jul 27, 2023
1 parent 324d90a commit f2d57e0
Show file tree
Hide file tree
Showing 2 changed files with 114 additions and 4 deletions.
22 changes: 19 additions & 3 deletions src/lightning/fabric/strategies/launchers/subprocess_script.py
Expand Up @@ -14,6 +14,7 @@
import os
import subprocess
import sys
from pathlib import Path
from typing import Any, Callable, Optional, Sequence, Tuple

from lightning_utilities.core.imports import RequirementCache
Expand Down Expand Up @@ -143,6 +144,8 @@ def _basic_subprocess_cmd() -> Sequence[str]:

def _hydra_subprocess_cmd(local_rank: int) -> Tuple[Sequence[str], str]:
import __main__ # local import to avoid https://github.com/Lightning-AI/lightning/issues/15218
from hydra.core.hydra_config import HydraConfig
from hydra.types import RunMode
from hydra.utils import get_original_cwd, to_absolute_path

# when user is using hydra find the absolute path
Expand All @@ -151,9 +154,22 @@ def _hydra_subprocess_cmd(local_rank: int) -> Tuple[Sequence[str], str]:
else:
command = [sys.executable, "-m", __main__.__spec__.name]

command += sys.argv[1:]
# extract the hydra configuration
hydra_cfg = HydraConfig.get()
hydra_output = Path(hydra_cfg.runtime.output_dir)

if hydra_cfg.output_subdir is None: # config isn't saved, so re-run original command
if hydra_cfg.mode == RunMode.MULTIRUN:
raise RuntimeError(f"DDP with multirun requires either re-run callback or saved config file")
command += sys.argv[1:] + [f"hydra.run.dir={hydra_output}"] # Keep output directory same
else:
hydra_subdir = hydra_output / hydra_cfg.output_subdir
pickled_config_path = hydra_subdir / "config.pickle"
if pickled_config_path.exists():
command += ["--experimental-rerun", str(pickled_config_path)]
else:
command += ["-cp", str(hydra_subdir), "-cn", "config.yaml"] # Used saved config for new run
command += [f"hydra.output_subdir=.pl_ddp_hydra_{local_rank}", f"hydra.run.dir={hydra_output}"]

cwd = get_original_cwd()
os_cwd = f'"{os.getcwd()}"'
command += [f"hydra.run.dir={os_cwd}", f"hydra.job.name=train_ddp_process_{local_rank}"]
return command, cwd
@@ -1,5 +1,6 @@
import subprocess
import sys
from pathlib import Path
from unittest.mock import Mock

import pytest
Expand All @@ -13,6 +14,7 @@

if _HYDRA_WITH_RUN_PROCESS:
from hydra.test_utils.test_utils import run_process
from omegaconf import OmegaConf


# Script to run from command line
Expand Down Expand Up @@ -48,7 +50,7 @@ def task_fn(cfg):

@RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True)
@pytest.mark.skipif(not _HYDRA_WITH_RUN_PROCESS, reason=str(_HYDRA_WITH_RUN_PROCESS))
@pytest.mark.parametrize("subdir", [None, "dksa", ".hello"])
@pytest.mark.parametrize("subdir", [None, "null", "dksa", ".hello"])
def test_ddp_with_hydra_runjob(subdir, tmpdir, monkeypatch):
monkeypatch.chdir(tmpdir)

Expand All @@ -63,6 +65,98 @@ def test_ddp_with_hydra_runjob(subdir, tmpdir, monkeypatch):
cmd += [f"hydra.output_subdir={subdir}"]
run_process(cmd)

if subdir == "null": # There should be no subdirectory created
# Make sure there's no config.yaml
logs = list(Path.cwd().glob("**/config.yaml"))
assert len(logs) == 0
else:
# Make sure config.yaml was created for additional processes.
logs = list(Path.cwd().glob("**/config.yaml"))
assert len(logs) == devices

# Make sure the parameter was set and used
cfg = OmegaConf.load(logs[0])
assert cfg.devices == devices

# Make sure PL spawned a job that is logged by Hydra
logs = list(Path.cwd().glob("**/*.log"))
assert len(logs) == 1


@RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True)
@pytest.mark.skipif(not _HYDRA_WITH_RUN_PROCESS, reason=str(_HYDRA_WITH_RUN_PROCESS))
@pytest.mark.parametrize("num_jobs", [1, 2])
def test_ddp_with_hydra_multirunjob(tmpdir, num_jobs, monkeypatch):
    """Launch a Hydra ``--multirun`` DDP sweep and verify its per-job outputs.

    Checks that each multirun job saved a ``config.yaml`` into the per-rank
    ``.pl_ddp_hydra_*`` subdirectory, that the swept ``foo`` parameter was
    propagated to the spawned processes, and that Hydra produced one log file
    per job.
    """
    monkeypatch.chdir(tmpdir)

    # Save the training script locally so the CLI subprocess can run it.
    with open("temp.py", "w") as fn:
        fn.write(script)

    # Create fake multirun sweep params based on `num_jobs`, e.g. "+foo=0,1".
    fake_param = "+foo=" + ",".join(str(i) for i in range(num_jobs))

    # Run the script through the Hydra CLI in multirun mode.
    run_process([sys.executable, "temp.py", "+devices=2", '+strategy="ddp"', fake_param, "--multirun"])

    # Make sure config.yaml was created for each job.
    configs = sorted(Path.cwd().glob("**/.pl_ddp_hydra_*/config.yaml"))
    assert len(configs) == num_jobs

    # Make sure the parameter was set and used for each job.
    # (Loop index from the original enumerate() was unused; iterate directly.)
    for config in configs:
        cfg = OmegaConf.load(config)
        # The directory two levels above config.yaml is the multirun job
        # directory (e.g. "0", "1"), whose name is the job index — which is
        # also the value `foo` was swept over.
        job_num = int(config.parent.parent.parts[-1])
        assert cfg.devices == 2
        assert cfg.foo == job_num

    # Make sure Hydra logged each spawned job.
    logs = list(Path.cwd().glob("**/*.log"))
    assert len(logs) == num_jobs


yaml_file = """
hydra:
callbacks:
save_job_info:
_target_: hydra.experimental.callbacks.PickleJobInfoCallback
"""


@RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True)
@pytest.mark.skipif(not _HYDRA_WITH_RERUN, reason=str(_HYDRA_WITH_RERUN))
@pytest.mark.parametrize("num_jobs", [1, 2])
def test_ddp_with_hydra_multirunjob_rerun(tmpdir, num_jobs, monkeypatch):
    """Run a Hydra multirun with the pickle-job-info callback enabled and check
    that every job saved a ``config.pickle`` under its ``.hydra`` directory."""
    monkeypatch.chdir(tmpdir)

    # Write the training script and the Hydra config (with the pickling
    # callback) into the working directory for the CLI subprocess.
    Path("temp.py").write_text(script)
    Path("config.yaml").write_text(yaml_file)

    # One fake sweep value per job, e.g. "0,1" for two jobs.
    sweep_values = ",".join(str(job) for job in range(num_jobs))

    # Build and run the multirun CLI invocation.
    cmd = [sys.executable, "temp.py", "-cp", ".", "-cn", "config.yaml"]
    cmd += ["+devices=2", '+strategy="ddp"', "+foo=" + sweep_values, "--multirun"]
    run_process(cmd)

    # Each job should have pickled its config for later re-running.
    pickles = sorted(Path.cwd().glob("**/.hydra/config.pickle"))
    assert len(pickles) == num_jobs


def test_kill():
launcher = _SubprocessScriptLauncher(Mock(), 1, 1)
Expand Down

0 comments on commit f2d57e0

Please sign in to comment.