Fix ddp_spawn -> ddp fallback logic when on LSF cluster (#15657)
Co-authored-by: awaelchli <aedu.waelchli@gmail.com>

(cherry picked from commit cdb7006)
Atharva-Phatak authored and Borda committed Nov 16, 2022
1 parent 2d2c902 commit f3bb85c
Showing 6 changed files with 80 additions and 35 deletions.
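Context for the change: spawn-based strategies launch their own worker processes, but cluster schedulers such as SLURM, TorchElastic, Kubeflow, and LSF already start one process per rank, so a "ddp_spawn" request must fall back to "ddp" when such a scheduler is detected. The LSF check was missing from that fallback list. A minimal, self-contained sketch of the decision (resolve_strategy is an illustrative name, not the library API; the environment variables mirror the test fixtures below):

import os

def resolve_strategy(requested: str) -> str:
    # Stand-ins for the ClusterEnvironment.detect() hooks used in the real code.
    on_slurm = "SLURM_NTASKS" in os.environ
    on_lsf = "LSB_JOBID" in os.environ and "LSB_DJOB_RANKFILE" in os.environ
    if requested in ("ddp_spawn", "ddp_spawn_find_unused_parameters_false") and (on_slurm or on_lsf):
        # The scheduler already launched one process per rank; do not spawn more.
        return "ddp"
    return requested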
3 changes: 2 additions & 1 deletion src/lightning_lite/CHANGELOG.md
@@ -17,7 +17,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

 ### Fixed

--
+- Fixed the automatic fallback from `LightningLite(strategy="ddp_spawn", ...)` to `LightningLite(strategy="ddp", ...)` when on an LSF cluster ([#15103](https://github.com/PyTorchLightning/pytorch-lightning/issues/15103))
+


 ## [1.8.1] - 2022-11-10
5 changes: 4 additions & 1 deletion src/lightning_lite/connector.py
@@ -395,7 +395,10 @@ def _check_strategy_and_fallback(self) -> None:
         strategy_flag = "" if isinstance(self._strategy_flag, Strategy) else self._strategy_flag

         if strategy_flag in ("ddp_spawn", "ddp_spawn_find_unused_parameters_false") and (
-            TorchElasticEnvironment.detect() or KubeflowEnvironment.detect() or SLURMEnvironment.detect()
+            TorchElasticEnvironment.detect()
+            or KubeflowEnvironment.detect()
+            or SLURMEnvironment.detect()
+            or LSFEnvironment.detect()
         ):
             strategy_flag = "ddp"
         if strategy_flag == "dp" and self._accelerator_flag == "cpu":
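The detect() hooks in this condition all follow the same pattern: return True when the scheduler's characteristic environment variables are present. A hedged sketch of that pattern (the exact variable set LSFEnvironment inspects is an assumption inferred from the test fixtures in this commit):

import os

class FakeLSFEnvironment:
    """Illustrative stand-in for a ClusterEnvironment with a detect() hook."""

    required_env_vars = ("LSB_JOBID", "LSB_DJOB_RANKFILE")

    @classmethod
    def detect(cls) -> bool:
        # Detected only if every scheduler-specific variable is set.
        return all(var in os.environ for var in cls.required_env_vars)

Each check is a cheap environment-variable lookup, and the `or` chain short-circuits, so appending LSFEnvironment.detect() costs nothing when an earlier environment already matched.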
2 changes: 2 additions & 0 deletions src/pytorch_lightning/CHANGELOG.md
@@ -34,6 +34,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed manual optimization raising `AttributeError` with Bagua Strategy ([#12534](https://github.com/PyTorchLightning/pytorch-lightning/issues/12534))
 - Fixed the import of `pytorch_lightning` causing a warning 'Redirects are currently not supported in Windows or MacOs' ([#15610](https://github.com/PyTorchLightning/pytorch-lightning/issues/15610))

+- Fixed the automatic fallback from `Trainer(strategy="ddp_spawn", ...)` to `Trainer(strategy="ddp", ...)` when on an LSF cluster ([#15103](https://github.com/PyTorchLightning/pytorch-lightning/issues/15103))
+

 ## [1.8.0] - 2022-11-01
5 changes: 4 additions & 1 deletion src/pytorch_lightning/trainer/connectors/accelerator_connector.py
@@ -614,7 +614,10 @@ def _check_strategy_and_fallback(self) -> None:
         strategy_flag = "" if isinstance(self._strategy_flag, Strategy) else self._strategy_flag

         if strategy_flag in ("ddp_spawn", "ddp_spawn_find_unused_parameters_false") and (
-            TorchElasticEnvironment.detect() or KubeflowEnvironment.detect() or SLURMEnvironment.detect()
+            TorchElasticEnvironment.detect()
+            or KubeflowEnvironment.detect()
+            or SLURMEnvironment.detect()
+            or LSFEnvironment.detect()
         ):
             strategy_flag = "ddp"
         if strategy_flag == "dp" and self._accelerator_flag == "cpu":
52 changes: 35 additions & 17 deletions tests/tests_lite/test_connector.py
@@ -32,6 +32,7 @@
 from lightning_lite.plugins.environments import (
     KubeflowEnvironment,
     LightningEnvironment,
+    LSFEnvironment,
     SLURMEnvironment,
     TorchElasticEnvironment,
 )
@@ -200,24 +201,41 @@ class Strat(DDPStrategy):
     assert connector.strategy is strategy


-@mock.patch.dict(
-    os.environ,
-    {
-        "SLURM_NTASKS": "2",
-        "SLURM_NTASKS_PER_NODE": "1",
-        "SLURM_JOB_NAME": "SOME_NAME",
-        "SLURM_NODEID": "0",
-        "LOCAL_RANK": "0",
-        "SLURM_PROCID": "0",
-        "SLURM_LOCALID": "0",
-    },
+@pytest.mark.parametrize(
+    "env_vars,expected_environment",
+    [
+        (
+            {
+                "SLURM_NTASKS": "2",
+                "SLURM_NTASKS_PER_NODE": "1",
+                "SLURM_JOB_NAME": "SOME_NAME",
+                "SLURM_NODEID": "0",
+                "LOCAL_RANK": "0",
+                "SLURM_PROCID": "0",
+                "SLURM_LOCALID": "0",
+            },
+            SLURMEnvironment,
+        ),
+        (
+            {
+                "LSB_JOBID": "1",
+                "LSB_DJOB_RANKFILE": "SOME_RANK_FILE",
+                "JSM_NAMESPACE_LOCAL_RANK": "1",
+                "JSM_NAMESPACE_SIZE": "20",
+                "JSM_NAMESPACE_RANK": "1",
+            },
+            LSFEnvironment,
+        ),
+    ],
 )
-@mock.patch("lightning_lite.accelerators.cuda.num_cuda_devices", return_value=0)
-def test_dist_backend_accelerator_mapping(*_):
-    connector = _Connector(strategy="ddp_spawn", accelerator="cpu", devices=2)
-    assert isinstance(connector.accelerator, CPUAccelerator)
-    assert isinstance(connector.strategy, DDPStrategy)
-    assert connector.strategy.local_rank == 0
+@mock.patch("lightning_lite.plugins.environments.lsf.LSFEnvironment._read_hosts", return_value=["node0", "node1"])
+@mock.patch("lightning_lite.plugins.environments.lsf.LSFEnvironment._get_node_rank", return_value=0)
+def test_fallback_from_ddp_spawn_to_ddp_on_cluster(_, __, env_vars, expected_environment):
+    with mock.patch.dict(os.environ, env_vars, clear=True):
+        trainer = _Connector(strategy="ddp_spawn", accelerator="cpu", devices=2)
+        assert isinstance(trainer.accelerator, CPUAccelerator)
+        assert isinstance(trainer.strategy, DDPStrategy)
+        assert isinstance(trainer.strategy.cluster_environment, expected_environment)


 @RunIf(mps=False)
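A note on the test design: mock.patch.dict(os.environ, env_vars, clear=True) swaps in exactly the parametrized variables for the duration of the with-block, so the SLURM case cannot leak environment state into the LSF case. _read_hosts and _get_node_rank are stubbed because LSFEnvironment presumably parses the rank file named by LSB_DJOB_RANKFILE, which does not exist on a CI machine. A tiny standalone demonstration of the clear=True semantics:

import os
from unittest import mock

os.environ["PRE_EXISTING"] = "1"
with mock.patch.dict(os.environ, {"ONLY_VAR": "x"}, clear=True):
    assert "PRE_EXISTING" not in os.environ  # prior variables are hidden
    assert os.environ["ONLY_VAR"] == "x"     # only injected ones remain
assert os.environ["PRE_EXISTING"] == "1"     # original environment restored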
48 changes: 33 additions & 15 deletions tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py
@@ -25,6 +25,7 @@
 from lightning_lite.plugins.environments import (
     KubeflowEnvironment,
     LightningEnvironment,
+    LSFEnvironment,
     SLURMEnvironment,
     TorchElasticEnvironment,
 )
@@ -193,24 +194,41 @@ class Strat(DDPStrategy):
     assert trainer._accelerator_connector.strategy is strategy


-@mock.patch.dict(
-    os.environ,
-    {
-        "SLURM_NTASKS": "2",
-        "SLURM_NTASKS_PER_NODE": "1",
-        "SLURM_JOB_NAME": "SOME_NAME",
-        "SLURM_NODEID": "0",
-        "LOCAL_RANK": "0",
-        "SLURM_PROCID": "0",
-        "SLURM_LOCALID": "0",
-    },
+@pytest.mark.parametrize(
+    "env_vars,expected_environment",
+    [
+        (
+            {
+                "SLURM_NTASKS": "2",
+                "SLURM_NTASKS_PER_NODE": "1",
+                "SLURM_JOB_NAME": "SOME_NAME",
+                "SLURM_NODEID": "0",
+                "LOCAL_RANK": "0",
+                "SLURM_PROCID": "0",
+                "SLURM_LOCALID": "0",
+            },
+            SLURMEnvironment,
+        ),
+        (
+            {
+                "LSB_JOBID": "1",
+                "LSB_DJOB_RANKFILE": "SOME_RANK_FILE",
+                "JSM_NAMESPACE_LOCAL_RANK": "1",
+                "JSM_NAMESPACE_SIZE": "20",
+                "JSM_NAMESPACE_RANK": "1",
+            },
+            LSFEnvironment,
+        ),
+    ],
 )
-@mock.patch("pytorch_lightning.strategies.DDPStrategy.setup_distributed", autospec=True)
-def test_dist_backend_accelerator_mapping(cuda_count_0):
-    trainer = Trainer(fast_dev_run=True, strategy="ddp_spawn", accelerator="cpu", devices=2)
+@mock.patch("lightning_lite.plugins.environments.lsf.LSFEnvironment._read_hosts", return_value=["node0", "node1"])
+@mock.patch("lightning_lite.plugins.environments.lsf.LSFEnvironment._get_node_rank", return_value=0)
+def test_fallback_from_ddp_spawn_to_ddp_on_cluster(_, __, env_vars, expected_environment):
+    with mock.patch.dict(os.environ, env_vars, clear=True):
+        trainer = Trainer(strategy="ddp_spawn", accelerator="cpu", devices=2)
     assert isinstance(trainer.accelerator, CPUAccelerator)
     assert isinstance(trainer.strategy, DDPStrategy)
-    assert trainer.strategy.local_rank == 0
+    assert isinstance(trainer.strategy.cluster_environment, expected_environment)


 def test_interactive_incompatible_backend_error(mps_count_2, cuda_count_2, monkeypatch):
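Why the new tests accept (_, __, env_vars, expected_environment): stacked mock.patch decorators inject their mocks bottom-up as leading positional arguments (here _get_node_rank first, then _read_hosts), while pytest.mark.parametrize fills env_vars and expected_environment by name, and mock.patch.dict injects nothing. A minimal illustration of that ordering, independent of Lightning:

import os
from unittest import mock

@mock.patch.dict(os.environ, {"DEMO": "1"})      # patch.dict passes no argument
@mock.patch("os.getcwd", return_value="/outer")  # outer patch -> second mock
@mock.patch("os.getpid", return_value=1234)      # innermost patch -> first mock
def demo(pid_mock, cwd_mock):
    # All three patches are active only while demo() runs.
    assert os.getpid() == 1234
    assert os.getcwd() == "/outer"
    assert os.environ["DEMO"] == "1"

demo()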
