Support gradient accumulation using Horovod's backward_passes_per_step #11911

Merged: 21 commits, merged on Feb 19, 2022

Commits (the diff below shows changes from 15 of the 21 commits):
f4b1fa1  Support gradient accumulation using Horovod's backward_passes_per_step (krshrimali, Feb 14, 2022)
aeb85ca  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Feb 14, 2022)
0e1b93a  Update CHANGELOG.md (krshrimali, Feb 14, 2022)
7db02ca  Update CHANGELOG.md (krshrimali, Feb 14, 2022)
0bf0fb9  Add test and minor fix (krshrimali, Feb 14, 2022)
67a0bce  Merge branch 'feature/11732_grad_accumulation_horovod' of github.com:… (krshrimali, Feb 14, 2022)
5a17c2b  Update pytorch_lightning/strategies/horovod.py (krshrimali, Feb 15, 2022)
53fd24a  Changes per review, pass accumulate_grad_batches as arg (krshrimali, Feb 15, 2022)
82ebd9e  Merge branch 'feature/11732_grad_accumulation_horovod' of github.com:… (krshrimali, Feb 15, 2022)
c914e3a  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Feb 15, 2022)
2a4c6fa  Add tests to ensure error raised, replace gpus= with accelerator=gpu … (krshrimali, Feb 16, 2022)
2a792df  Merge branch 'feature/11732_grad_accumulation_horovod' of github.com:… (krshrimali, Feb 16, 2022)
0850b0a  Don't add tests for now (krshrimali, Feb 16, 2022)
9fd23ce  Use basic model, and trainer.fit instead of _run_horovod (krshrimali, Feb 16, 2022)
04ee625  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Feb 16, 2022)
7c73b0c  Apply suggestions from code review (krshrimali, Feb 16, 2022)
8c1e3f1  Self review (carmocca, Feb 17, 2022)
60ba198  Remove duplicated test (carmocca, Feb 17, 2022)
5786ce8  Missed horovod in RunIf (carmocca, Feb 17, 2022)
670fbac  Further simplification (carmocca, Feb 17, 2022)
372d322  Resolve merge conflict (krshrimali, Feb 18, 2022)
CHANGELOG.md (5 changes: 4 additions & 1 deletion)
@@ -9,7 +9,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Added
 
-- Add new `DETAIL` log level to provide useful logs for improving monitoring and debugging of batch jobs
+- Enable gradient accumulation using Horovod's `backward_passes_per_step` ([#11911](https://github.com/PyTorchLightning/pytorch-lightning/pull/11911))
+
+
+- Add new `DETAIL` log level to provide useful logs for improving monitoring and debugging of batch jobs ([#11008](https://github.com/PyTorchLightning/pytorch-lightning/pull/11008))
 
 
 - Added a flag `SLURMEnvironment(auto_requeue=True|False)` to control whether Lightning handles the requeuing ([#10601](https://github.com/PyTorchLightning/pytorch-lightning/pull/10601))
pytorch_lightning/strategies/horovod.py (24 changes: 21 additions & 3 deletions)
@@ -27,6 +27,7 @@
 from pytorch_lightning.utilities.distributed import group as dist_group
 from pytorch_lightning.utilities.distributed import ReduceOp
 from pytorch_lightning.utilities.enums import _StrategyType
+from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from pytorch_lightning.utilities.imports import _HOROVOD_AVAILABLE
 from pytorch_lightning.utilities.rank_zero import rank_zero_only
 
@@ -77,6 +78,11 @@ def distributed_sampler_kwargs(self):
         distributed_sampler_kwargs = dict(num_replicas=self.world_size, rank=self.global_rank)
         return distributed_sampler_kwargs
 
+    @property
+    def handles_gradient_accumulation(self) -> bool:
+        """Whether the plugin handles gradient accumulation internally."""
+        return True
+
     def setup(self, trainer: "pl.Trainer") -> None:
         self.model_to_device()
 
@@ -112,7 +118,13 @@ def _unpack_lightning_optimizer(opt):
         for optimizer in optimizers:
             hvd.broadcast_optimizer_state(optimizer, root_rank=0)
 
-        self.optimizers = self._wrap_optimizers(optimizers)
+        accumulation_scheduler = trainer.accumulation_scheduler
+        if accumulation_scheduler.epochs != [0]:
+            raise MisconfigurationException(
+                "Horovod currently does not support different `accumulate_grad_batches` at different epochs."
+            )
+
+        self.optimizers = self._wrap_optimizers(optimizers, trainer.accumulate_grad_batches)
         for optimizer in self.optimizers:
             # Synchronization will be performed explicitly following backward()
             self._exit_stack.enter_context(optimizer.skip_synchronize())
@@ -182,10 +194,16 @@ def post_backward(self, closure_loss: torch.Tensor) -> None:
         for optimizer in self.optimizers:
             optimizer.synchronize()
 
-    def _wrap_optimizers(self, optimizers: List[Optimizer]) -> List["hvd.DistributedOptimizer"]:
+    def _wrap_optimizers(
+        self, optimizers: List[Optimizer], accumulate_grad_batches: int
+    ) -> List["hvd.DistributedOptimizer"]:
         """Wraps optimizers to perform gradient aggregation via allreduce."""
         return [
-            hvd.DistributedOptimizer(opt, named_parameters=self._filter_named_parameters(self.lightning_module, opt))
+            hvd.DistributedOptimizer(
+                opt,
+                backward_passes_per_step=accumulate_grad_batches,
+                named_parameters=self._filter_named_parameters(self.lightning_module, opt),
+            )
             if "horovod" not in str(opt.__class__)
             else opt
             for opt in optimizers
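For readers unfamiliar with the Horovod API this strategy now leans on: `hvd.DistributedOptimizer(..., backward_passes_per_step=N)` aggregates gradients locally over N `backward()` calls and only issues the allreduce on the Nth pass, which is why the strategy can return `True` from `handles_gradient_accumulation` and pass `accumulate_grad_batches` straight through. Below is a minimal standalone sketch of that mechanism in plain Horovod (no Lightning); the model, data, and the factor of 4 are illustrative placeholders, not taken from this PR.

```python
import horovod.torch as hvd
import torch
import torch.nn.functional as F

hvd.init()

model = torch.nn.Linear(32, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# Wrap the optimizer the same way the strategy does: gradients are aggregated
# locally for 4 backward passes before a single allreduce is issued.
optimizer = hvd.DistributedOptimizer(
    optimizer,
    named_parameters=model.named_parameters(),
    backward_passes_per_step=4,  # mirrors accumulate_grad_batches=4
)

# Placeholder data: 8 batches of 8 samples with 32 features each.
loader = [(torch.randn(8, 32), torch.randint(0, 2, (8,))) for _ in range(8)]

for step, (x, y) in enumerate(loader):
    loss = F.cross_entropy(model(x), y)
    loss.backward()
    if (step + 1) % 4 == 0:
        optimizer.step()  # applies the gradient accumulated over 4 passes
        optimizer.zero_grad()
```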
tests/models/test_horovod.py (98 changes: 92 additions & 6 deletions)
@@ -30,9 +30,11 @@
 from pytorch_lightning import Trainer
 from pytorch_lightning.accelerators import CPUAccelerator
 from pytorch_lightning.utilities import _HOROVOD_AVAILABLE
+from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from tests.helpers import BoringModel
 from tests.helpers.advanced_models import BasicGAN
 from tests.helpers.runif import RunIf
+from tests.helpers.simple_models import ClassificationModel
 
 if _HOROVOD_AVAILABLE:
     import horovod
@@ -82,6 +84,43 @@ def test_horovod_cpu(tmpdir):
     _run_horovod(trainer_options)
 
 
+@RunIf(skip_windows=True, horovod=True, skip_49370=True)
+def test_horovod_cpu_accumulate_grad_batches(tmpdir):
+    """Test Horovod running multi-process on CPU with `accumulate_grad_batches`."""
+    trainer_options = dict(
+        default_root_dir=str(tmpdir),
+        weights_save_path=str(tmpdir),
+        gradient_clip_val=1.0,
+        enable_progress_bar=False,
+        max_epochs=1,
+        limit_train_batches=0.4,
+        limit_val_batches=0.2,
+        accumulate_grad_batches=4,
+        strategy="horovod",
+    )
+    _run_horovod(trainer_options)
+
+
+@RunIf(skip_windows=True, horovod=True, skip_49370=True)
+def test_horovod_cpu_accumulate_grad_batches_different(tmpdir):
+    """Ensure `MisconfigurationException` is raised for different `accumulate_grad_batches` at different
+    epochs with the Horovod strategy on multiple CPUs."""
+    model = ClassificationModel()
+    trainer = Trainer(
+        default_root_dir=str(tmpdir),
+        weights_save_path=str(tmpdir),
+        gradient_clip_val=1.0,
+        enable_progress_bar=False,
+        max_epochs=4,
+        limit_train_batches=0.4,
+        limit_val_batches=0.2,
+        accumulate_grad_batches={0: 4, 2: 2},
+        strategy="horovod",
+    )
+    with pytest.raises(MisconfigurationException):
+        trainer.fit(model)
+
+
 @RunIf(skip_windows=True, horovod=True, skip_49370=True)
 def test_horovod_cpu_clip_grad_by_value(tmpdir):
     """Test Horovod running multi-process on CPU."""
@@ -125,12 +164,54 @@ def test_horovod_multi_gpu(tmpdir):
         max_epochs=1,
         limit_train_batches=0.4,
         limit_val_batches=0.2,
-        gpus=2,
+        accelerator="gpu",
+        devices=2,
         strategy="horovod",
     )
     _run_horovod(trainer_options, on_gpu=True)
 
 
+@RunIf(min_gpus=2, skip_windows=True, horovod_nccl=True)
+def test_horovod_multi_gpu_accumulate_grad_batches(tmpdir):
+    """Test Horovod multi-GPU support with `accumulate_grad_batches`."""
+    trainer_options = dict(
+        default_root_dir=str(tmpdir),
+        weights_save_path=str(tmpdir),
+        gradient_clip_val=1.0,
+        enable_progress_bar=False,
+        max_epochs=1,
+        limit_train_batches=0.4,
+        limit_val_batches=0.2,
+        accumulate_grad_batches=4,
+        accelerator="gpu",
+        devices=2,
+        strategy="horovod",
+    )
+    _run_horovod(trainer_options, on_gpu=True)
+
+
+@RunIf(min_gpus=2, skip_windows=True, horovod_nccl=True)
+def test_horovod_multi_gpu_accumulate_grad_batches_different(tmpdir):
+    """Ensure `MisconfigurationException` is raised for different `accumulate_grad_batches` at different
+    epochs with the Horovod strategy on multiple GPUs."""
+    model = ClassificationModel()
+    trainer = Trainer(
+        default_root_dir=str(tmpdir),
+        weights_save_path=str(tmpdir),
+        gradient_clip_val=1.0,
+        enable_progress_bar=False,
+        max_epochs=1,
+        limit_train_batches=0.4,
+        limit_val_batches=0.2,
+        accumulate_grad_batches={0: 4, 2: 2},
+        accelerator="gpu",
+        devices=2,
+        strategy="horovod",
+    )
+    with pytest.raises(MisconfigurationException):
+        trainer.fit(model)
+
+
 @RunIf(min_gpus=2, skip_windows=True, horovod_nccl=True)
 def test_horovod_multi_gpu_grad_by_value(tmpdir):
     """Test Horovod with multi-GPU support."""
@@ -143,7 +224,8 @@ def test_horovod_multi_gpu_grad_by_value(tmpdir):
         max_epochs=1,
         limit_train_batches=0.4,
         limit_val_batches=0.2,
-        gpus=2,
+        accelerator="gpu",
+        devices=2,
         strategy="horovod",
     )
     _run_horovod(trainer_options, on_gpu=True)
Expand All @@ -164,7 +246,8 @@ def test_horovod_apex(tmpdir):
max_epochs=1,
limit_train_batches=0.4,
limit_val_batches=0.2,
gpus=2,
accelerator="gpu",
devices=2,
strategy="horovod",
amp_backend="apex",
precision=16,
@@ -183,7 +266,8 @@ def test_horovod_amp(tmpdir):
         max_epochs=1,
         limit_train_batches=0.4,
         limit_val_batches=0.2,
-        gpus=2,
+        accelerator="gpu",
+        devices=2,
         strategy="horovod",
         amp_backend="native",
         precision=16,
@@ -202,7 +286,8 @@ def test_horovod_gather(tmpdir):
         max_epochs=1,
         limit_train_batches=0.4,
         limit_val_batches=0.2,
-        gpus=2,
+        accelerator="gpu",
+        devices=2,
         strategy="horovod",
     )
     _run_horovod(trainer_options, on_gpu=True)
@@ -227,7 +312,8 @@ def validation_step(self, batch, *args, **kwargs):
         max_epochs=1,
         limit_train_batches=0.4,
         limit_val_batches=0.2,
-        gpus=1,
+        accelerator="gpu",
+        devices=1,
         strategy="horovod",
     )
     tpipes.run_model_test_without_loggers(trainer_options, model)
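Taken together, the change means a constant `accumulate_grad_batches` is forwarded to Horovod's `backward_passes_per_step` when the optimizers are wrapped, while per-epoch schedules are rejected up front. A hedged usage sketch of the resulting behavior (the `BoringModel` helper and the values shown are illustrative; running with `strategy="horovod"` assumes Horovod is installed):

```python
from pytorch_lightning import Trainer
from tests.helpers import BoringModel

model = BoringModel()

# Supported after this PR: a constant accumulation factor, forwarded to
# Horovod's backward_passes_per_step when the optimizers are wrapped.
trainer = Trainer(strategy="horovod", accumulate_grad_batches=4, max_epochs=1)
trainer.fit(model)

# Not supported: a per-epoch schedule. backward_passes_per_step is fixed when
# the optimizer is wrapped in setup(), so this raises MisconfigurationException
# during fit():
# Trainer(strategy="horovod", accumulate_grad_batches={0: 4, 2: 2})
```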