Skip to content

Commit

Permalink
Use hydra.run.dir (not os.getcwd) for DDP subprocesses' run dir (#18145)
Browse files Browse the repository at this point in the history
Co-authored-by: Jirka Borovec <6035284+Borda@users.noreply.github.com>

(cherry picked from commit ebbd538)
  • Loading branch information
nisheethlahoti authored and Borda committed Aug 14, 2023
1 parent 2e828e2 commit 29486ab
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 8 deletions.
2 changes: 1 addition & 1 deletion src/lightning/fabric/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

### Fixed

-
- Fixed issue where DDP subprocesses that used Hydra would set hydra's working directory to current directory ([#18145](https://github.com/Lightning-AI/lightning/pull/18145))


## [2.0.6] - 2023-07-20
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,7 @@ def _basic_subprocess_cmd() -> Sequence[str]:

def _hydra_subprocess_cmd(local_rank: int) -> Tuple[Sequence[str], str]:
import __main__ # local import to avoid https://github.com/Lightning-AI/lightning/issues/15218
from hydra.core.hydra_config import HydraConfig
from hydra.utils import get_original_cwd, to_absolute_path

# when user is using hydra find the absolute path
Expand All @@ -154,6 +155,7 @@ def _hydra_subprocess_cmd(local_rank: int) -> Tuple[Sequence[str], str]:
command += sys.argv[1:]

cwd = get_original_cwd()
os_cwd = f'"{os.getcwd()}"'
command += [f"hydra.run.dir={os_cwd}", f"hydra.job.name=train_ddp_process_{local_rank}"]
rundir = f'"{HydraConfig.get().run.dir}"'
# Set output_subdir null since we don't want different subprocesses trying to write to config.yaml
command += [f"hydra.run.dir={rundir}", f"hydra.job.name=train_ddp_process_{local_rank}", "hydra.output_subdir=null"]
return command, cwd
23 changes: 18 additions & 5 deletions tests/tests_pytorch/strategies/launchers/test_subprocess_script.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,11 @@
from lightning.pytorch.strategies.launchers.subprocess_script import _SubprocessScriptLauncher
from tests_pytorch.helpers.runif import RunIf

_HYDRA_WITH_RERUN = RequirementCache("hydra-core>=1.2")
_HYDRA_WITH_RUN_PROCESS = RequirementCache("hydra-core>=1.0.7")

if _HYDRA_WITH_RUN_PROCESS:
from hydra.test_utils.test_utils import run_process
from omegaconf import OmegaConf


# Script to run from command line
Expand Down Expand Up @@ -48,21 +48,34 @@ def task_fn(cfg):

@RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True)
@pytest.mark.skipif(not _HYDRA_WITH_RUN_PROCESS, reason=str(_HYDRA_WITH_RUN_PROCESS))
@pytest.mark.parametrize("subdir", [None, "dksa", ".hello"])
def test_ddp_with_hydra_runjob(subdir, tmpdir, monkeypatch):
monkeypatch.chdir(tmpdir)
@pytest.mark.parametrize("subdir", [None, "null", "dksa", ".hello"])
def test_ddp_with_hydra_runjob(subdir, tmp_path, monkeypatch):
monkeypatch.chdir(tmp_path)

# Save script locally
with open("temp.py", "w") as fn:
fn.write(script)

# Run CLI
devices = 2
cmd = [sys.executable, "temp.py", f"+devices={devices}", '+strategy="ddp"']
run_dir = tmp_path / "hydra_output"
cmd = [sys.executable, "temp.py", f"+devices={devices}", '+strategy="ddp"', f"hydra.run.dir={run_dir}"]
if subdir is not None:
cmd += [f"hydra.output_subdir={subdir}"]
run_process(cmd)

# Make sure no config.yaml was created for additional processes
saved_confs = list(run_dir.glob("**/config.yaml"))
assert len(saved_confs) == (0 if subdir == "null" else 1) # Main process has config.yaml iff subdir!="null"

if saved_confs: # Make sure the parameter was set and used
cfg = OmegaConf.load(saved_confs[0])
assert cfg.devices == devices

# Make sure PL spawned jobs that are logged by Hydra
logs = list(run_dir.glob("**/*.log"))
assert len(logs) == devices


def test_kill():
launcher = _SubprocessScriptLauncher(Mock(), 1, 1)
Expand Down

0 comments on commit 29486ab

Please sign in to comment.