Use hydra.run.dir (not os.getcwd) for DDP subprocesses' run dir (#18145)

Co-authored-by: Jirka Borovec <6035284+Borda@users.noreply.github.com> (cherry picked from commit ebbd538)
Lightning-AI · Aug 14, 2023 · 29486ab · 29486ab
1 parent 2e828e2
commit 29486ab
Show file tree

Hide file tree

Showing 3 changed files with 23 additions and 8 deletions.
diff --git a/src/lightning/fabric/CHANGELOG.md b/src/lightning/fabric/CHANGELOG.md
@@ -9,7 +9,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Fixed
 
--
+- Fixed issue where DDP subprocesses that used Hydra would set hydra's working directory to current directory ([#18145](https://github.com/Lightning-AI/lightning/pull/18145))
 
 
 ## [2.0.6] - 2023-07-20

diff --git a/src/lightning/fabric/strategies/launchers/subprocess_script.py b/src/lightning/fabric/strategies/launchers/subprocess_script.py
@@ -143,6 +143,7 @@ def _basic_subprocess_cmd() -> Sequence[str]:
 
 def _hydra_subprocess_cmd(local_rank: int) -> Tuple[Sequence[str], str]:
     import __main__  # local import to avoid https://github.com/Lightning-AI/lightning/issues/15218
+    from hydra.core.hydra_config import HydraConfig
     from hydra.utils import get_original_cwd, to_absolute_path
 
     # when user is using hydra find the absolute path
@@ -154,6 +155,7 @@ def _hydra_subprocess_cmd(local_rank: int) -> Tuple[Sequence[str], str]:
     command += sys.argv[1:]
 
     cwd = get_original_cwd()
-    os_cwd = f'"{os.getcwd()}"'
-    command += [f"hydra.run.dir={os_cwd}", f"hydra.job.name=train_ddp_process_{local_rank}"]
+    rundir = f'"{HydraConfig.get().run.dir}"'
+    # Set output_subdir null since we don't want different subprocesses trying to write to config.yaml
+    command += [f"hydra.run.dir={rundir}", f"hydra.job.name=train_ddp_process_{local_rank}", "hydra.output_subdir=null"]
     return command, cwd
diff --git a/tests/tests_pytorch/strategies/launchers/test_subprocess_script.py b/tests/tests_pytorch/strategies/launchers/test_subprocess_script.py
@@ -8,11 +8,11 @@
 from lightning.pytorch.strategies.launchers.subprocess_script import _SubprocessScriptLauncher
 from tests_pytorch.helpers.runif import RunIf
 
-_HYDRA_WITH_RERUN = RequirementCache("hydra-core>=1.2")
 _HYDRA_WITH_RUN_PROCESS = RequirementCache("hydra-core>=1.0.7")
 
 if _HYDRA_WITH_RUN_PROCESS:
     from hydra.test_utils.test_utils import run_process
+    from omegaconf import OmegaConf
 
 
 # Script to run from command line
@@ -48,21 +48,34 @@ def task_fn(cfg):
 
 @RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True)
 @pytest.mark.skipif(not _HYDRA_WITH_RUN_PROCESS, reason=str(_HYDRA_WITH_RUN_PROCESS))
-@pytest.mark.parametrize("subdir", [None, "dksa", ".hello"])
-def test_ddp_with_hydra_runjob(subdir, tmpdir, monkeypatch):
-    monkeypatch.chdir(tmpdir)
+@pytest.mark.parametrize("subdir", [None, "null", "dksa", ".hello"])
+def test_ddp_with_hydra_runjob(subdir, tmp_path, monkeypatch):
+    monkeypatch.chdir(tmp_path)
 
     # Save script locally
     with open("temp.py", "w") as fn:
         fn.write(script)
 
     # Run CLI
     devices = 2
-    cmd = [sys.executable, "temp.py", f"+devices={devices}", '+strategy="ddp"']
+    run_dir = tmp_path / "hydra_output"
+    cmd = [sys.executable, "temp.py", f"+devices={devices}", '+strategy="ddp"', f"hydra.run.dir={run_dir}"]
     if subdir is not None:
         cmd += [f"hydra.output_subdir={subdir}"]
     run_process(cmd)
 
+    # Make sure no config.yaml was created for additional processes
+    saved_confs = list(run_dir.glob("**/config.yaml"))
+    assert len(saved_confs) == (0 if subdir == "null" else 1)  # Main process has config.yaml iff subdir!="null"
+
+    if saved_confs:  # Make sure the parameter was set and used
+        cfg = OmegaConf.load(saved_confs[0])
+        assert cfg.devices == devices
+
+    # Make sure PL spawned jobs that are logged by Hydra
+    logs = list(run_dir.glob("**/*.log"))
+    assert len(logs) == devices
+
 
 def test_kill():
     launcher = _SubprocessScriptLauncher(Mock(), 1, 1)