Support checkpoint save and load with Stochastic Weight Averaging #9938

Merged (94 commits) on Aug 9, 2022
The diff below shows changes from 38 of the 94 commits.
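
This PR makes the StochasticWeightAveraging callback save its state (the n_averaged counter, the SWA scheduler step count, and the averaged model parameters) into checkpoints and restore it when training resumes, so a run that uses SWA can be interrupted and continued without losing the running average. Below is a minimal usage sketch, not taken from the PR itself: TinyModel and the checkpoint path are illustrative placeholders, and the callback arguments mirror the tests added further down.

import torch
from torch.utils.data import DataLoader, TensorDataset

import pytorch_lightning as pl
from pytorch_lightning.callbacks import StochasticWeightAveraging


class TinyModel(pl.LightningModule):
    """Illustrative stand-in for the BoringModel used in the tests below."""

    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 2)

    def training_step(self, batch, batch_idx):
        (x,) = batch
        return self.layer(x).sum()

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.1)


train_data = DataLoader(TensorDataset(torch.randn(64, 32)), batch_size=8)

# First run: SWA starts at epoch 3 and Lightning writes checkpoints as usual.
trainer = pl.Trainer(
    max_epochs=5,
    callbacks=[StochasticWeightAveraging(swa_epoch_start=3, swa_lrs=0.1)],
)
trainer.fit(TinyModel(), train_data)

# Resuming from a saved checkpoint: with this change the SWA callback state
# (n_averaged, scheduler step count, averaged parameters) is restored as well.
trainer = pl.Trainer(
    max_epochs=5,
    callbacks=[StochasticWeightAveraging(swa_epoch_start=3, swa_lrs=0.1)],
)
trainer.fit(TinyModel(), train_data, ckpt_path="path/to/saved.ckpt")
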
Commits
72d0433
Save StochasticWeightAveraging callback data in checkpoints
adamreeve Oct 14, 2021
3d2bf65
Add option to use SWA parameters during validation
adamreeve Oct 14, 2021
1696273
Allow restoring SWA parameters to a model from a checkpoint
adamreeve Oct 14, 2021
c8db9d8
Refactor SWA batch norm moment update to work with validation
adamreeve Oct 18, 2021
004959b
Add test for loading a model from a checkpoint with SWA parameters
adamreeve Oct 19, 2021
d76528b
Recompute batch norm moments when updating parameters from a checkpoint
adamreeve Oct 19, 2021
0ea22e0
Handle when data batch is a list or tuple
adamreeve Oct 20, 2021
01ca2a7
Save SWA scheduler step count in checkpoints
adamreeve Oct 27, 2021
08d655b
Update SWA documentation and changelog
adamreeve Oct 27, 2021
91ab357
Fix DeepSource code style issues
adamreeve Oct 27, 2021
22e5d51
Revert SWA validation changes
adamreeve Nov 9, 2021
ed0a7f8
Merge remote-tracking branch 'upstream/master' into swa_checkpoint
adamreeve Nov 9, 2021
11963f6
Fix resuming from epoch before SWA start and add extra test
adamreeve Nov 9, 2021
226d8aa
Don't save state derived from constructor parameters into checkpoints
adamreeve Nov 9, 2021
9ecc417
Merge branch 'master' into swa_checkpoint
tchaton Nov 15, 2021
5d03d96
Tidy ups from code review
adamreeve Nov 15, 2021
02a04da
Fix handling of n_averaged checkpoint data with multiple processes
adamreeve Nov 15, 2021
8af5b56
Merge remote-tracking branch 'upstream/master' into swa_checkpoint
adamreeve Nov 16, 2021
db9590c
Merge branch 'master' into swa_checkpoint
tchaton Nov 29, 2021
5763e05
Fix deprecation warning in test
adamreeve Nov 29, 2021
d46be83
Remove check for non-empty callback state in checkpoint
adamreeve Nov 29, 2021
e0fd0cb
Raise MisconfigurationException when using SWA with sharded models
adamreeve Nov 29, 2021
2a83f05
Fix test failure with torch 1.7
adamreeve Nov 29, 2021
4a8d81c
Fix crash when fairscale isn't installed
adamreeve Nov 29, 2021
dab0ef4
Skip segfaulting test under pytorch < 1.8
adamreeve Nov 30, 2021
a0d52c8
Changelog merge fix
adamreeve Nov 30, 2021
cdf4734
Remove unnecessary intermediate variable
adamreeve Nov 30, 2021
ba5b8ab
Fix checking for sharded plugins
adamreeve Nov 30, 2021
d2bb0ad
Don't raise an error for DDPSharded and DDPSpawnSharded with SWA
adamreeve Nov 30, 2021
2c35328
Merge remote-tracking branch 'upstream/master' into swa_checkpoint
adamreeve Nov 30, 2021
ffcf011
Fix incorrect multiple context manager syntax for Python < 3.9
adamreeve Dec 5, 2021
c278034
Merge remote-tracking branch 'upstream/master' into swa_checkpoint
adamreeve Dec 6, 2021
d2fbe04
Merge branch 'master' into swa_checkpoint
adamreeve Dec 14, 2021
f13abf9
Merge branch 'master' into swa_checkpoint
adamreeve Dec 14, 2021
8e848dc
Code review tidy up and fix CHANGELOG merge error
adamreeve Dec 15, 2021
11757d5
Add a warning with initializing SWA after start but without checkpoin…
adamreeve Dec 16, 2021
50d525f
Merge branch 'master' into swa_checkpoint
adamreeve Dec 16, 2021
119f9b9
Merge branch 'master' into swa_checkpoint
adamreeve Dec 16, 2021
e332a42
Merge branch 'master' into swa_checkpoint
adamreeve Dec 21, 2021
fd59c41
Fixes to account for changes merged from master
adamreeve Dec 21, 2021
fe62b55
Merge branch 'master' into swa_checkpoint
adamreeve Dec 22, 2021
440c4b6
Merge branch 'master' into swa_checkpoint
adamreeve Jan 12, 2022
b10261e
Fix SWA scheduler not being stepped
adamreeve Jan 12, 2022
5bc9bee
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 12, 2022
8b9c624
Merge branch 'master' into swa_checkpoint
adamreeve Jan 23, 2022
72c0242
Merge branch 'master' into swa_checkpoint
adamreeve Jan 31, 2022
4dfb0df
Merge branch 'master' into swa_checkpoint
awaelchli Feb 5, 2022
9b5fbfc
mark test helper protected
awaelchli Feb 5, 2022
8e0c255
avoid warning for find_unused_parameters
awaelchli Feb 5, 2022
6bb52ba
Merge branch 'master' into swa_checkpoint
adamreeve Feb 9, 2022
c44279f
Use _LRScheduler.state_dict/load_state_dict instead of accessing priv…
adamreeve Feb 10, 2022
b3eee59
Add test to reproduce crash when resuming with SWA and a custom sched…
adamreeve Feb 9, 2022
0107ff1
Prevent trying to restore scheduler state into the wrong type of sche…
adamreeve Feb 10, 2022
8067144
Merge branch 'master' into swa_checkpoint
adamreeve Feb 11, 2022
81ac195
Add test case where trainer.strategy.restore_checkpoint_after_setup i…
adamreeve Feb 14, 2022
20393b1
Minor test refactoring
carmocca Feb 15, 2022
14f9f20
Fix test_swa_resume_training_from_checkpoint[2]
carmocca Feb 15, 2022
c677141
Did not mean to remove this
carmocca Feb 15, 2022
5cf5e1b
Test tidy up from PR review comments
adamreeve Feb 15, 2022
fe79d6c
Store most recent update epoch in the SWA checkpoint data
adamreeve Feb 15, 2022
d799a62
Merge branch 'master' into swa_checkpoint
adamreeve Feb 27, 2022
c7c2818
Fix for master change that broke resuming without validation dataloaders
adamreeve Feb 27, 2022
d2ed468
Adjust SWA tests to account for current checkpoint resume behaviour
adamreeve Feb 27, 2022
a2143a8
Merge branch 'master' into swa_checkpoint
adamreeve Mar 14, 2022
00328e8
Merge branch 'master' into swa_checkpoint
adamreeve Mar 24, 2022
5dbfc2d
Merge branch 'master' into swa_checkpoint
adamreeve Mar 28, 2022
b71b690
Revert workarounds for first epoch after resume having no batches
adamreeve Mar 28, 2022
15e6334
Use state_dict/load_state_dict instead of on_save/load_checkpoint in SWA
adamreeve Mar 28, 2022
e3104bc
Remove unnecessary workaround for handling restore_checkpoint_after_s…
adamreeve Apr 20, 2022
6e9fbba
Merge branch 'master' into swa_checkpoint
adamreeve Apr 20, 2022
08eecbb
Merge branch 'master' into swa_checkpoint
krshrimali Apr 25, 2022
1e9dc33
Merge branch 'master' into swa_checkpoint
adamreeve May 17, 2022
f509178
Fix deprecation warning in tests
adamreeve May 17, 2022
0388aea
Merge branch 'master' into swa_checkpoint
adamreeve May 30, 2022
f7594d6
Merge branch 'master' into swa_checkpoint
Borda Jun 21, 2022
cb6ce90
Merge branch 'master' into swa_checkpoint
Borda Jun 27, 2022
ddcb607
Merge branch 'master' into swa_checkpoint
awaelchli Jul 25, 2022
77f137c
update runif
awaelchli Jul 25, 2022
324499e
Remove no-longer required minimum torch version from test
adamreeve Aug 2, 2022
ab8aca0
Remove redundant None check that could hide a bug
adamreeve Aug 2, 2022
7d6e7a8
Don't save scheduler configs as they will only be overridden
adamreeve Aug 2, 2022
9bf237e
Use state_dict/load_state_dict to save and load average model state
adamreeve Aug 2, 2022
a9b6334
Parametrize misconfiguration error tests
adamreeve Aug 2, 2022
c24522b
Remove DummyError and match exception message
adamreeve Aug 2, 2022
b6b7db9
Merge remote-tracking branch 'upstream/master' into swa_checkpoint
adamreeve Aug 2, 2022
ba7cb5e
Fix state dict key
adamreeve Aug 2, 2022
8bde4f4
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 2, 2022
3ed8ea4
Type checking fixes
adamreeve Aug 2, 2022
085bb4a
Merge remote-tracking branch 'upstream/master' into swa_checkpoint
adamreeve Aug 2, 2022
afba59d
Merge branch 'master' into swa_checkpoint
carmocca Aug 3, 2022
807fadf
Merge branch 'master' into swa_checkpoint
awaelchli Aug 3, 2022
15fe88e
fix changelog conflicts
awaelchli Aug 3, 2022
dcf5fea
Merge branch 'master' into swa_checkpoint
rohitgr7 Aug 9, 2022
ce9bcea
Merge branch 'master' into swa_checkpoint
awaelchli Aug 9, 2022
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -280,6 +280,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Fixed support for logging within callbacks returned from `LightningModule` ([#10991](https://github.com/PyTorchLightning/pytorch-lightning/pull/10991))


- Fixed resuming from a checkpoint when using Stochastic Weight Averaging (SWA) ([#9938](https://github.com/PyTorchLightning/pytorch-lightning/pull/9938))


- The TQDM progress bar now correctly shows the `on_epoch` logged values on train epoch end ([#11069](https://github.com/PyTorchLightning/pytorch-lightning/pull/11069))


66 changes: 60 additions & 6 deletions pytorch_lightning/callbacks/stochastic_weight_avg.py
@@ -16,14 +16,15 @@
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
"""
from copy import deepcopy
from typing import Callable, List, Optional, Union
from typing import Any, Callable, Dict, List, Optional, Union

import torch
from torch import nn
from torch.optim.swa_utils import SWALR

import pytorch_lightning as pl
from pytorch_lightning.callbacks.base import Callback
from pytorch_lightning.plugins.training_type import DDPFullyShardedPlugin, DeepSpeedPlugin
from pytorch_lightning.trainer.optimizers import _get_default_scheduler_config
from pytorch_lightning.utilities import rank_zero_info, rank_zero_warn
from pytorch_lightning.utilities.exceptions import MisconfigurationException
@@ -115,14 +116,20 @@ def __init__(
if device is not None and not isinstance(device, (torch.device, str)):
raise MisconfigurationException(f"device is expected to be a torch.device or a str. Found {device}")

self.n_averaged: Optional[torch.Tensor] = None
self._swa_epoch_start = swa_epoch_start
self._swa_lrs = swa_lrs
self._annealing_epochs = annealing_epochs
self._annealing_strategy = annealing_strategy
self._avg_fn = avg_fn or self.avg_fn
self._device = device
self._model_contains_batch_norm = None
self._average_model = None
self._model_contains_batch_norm: Optional[bool] = None
self._average_model: Optional[pl.LightningModule] = None
self._initialized = False
self._swa_scheduler: Optional[SWALR] = None
self._scheduler_step_count: Optional[int] = None
self._init_n_averaged = 0
self.momenta: Optional[Dict[nn.modules.batchnorm._BatchNorm, float]] = None

@property
def swa_start(self) -> int:
@@ -145,6 +152,9 @@ def on_fit_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule"):
optimizers = trainer.optimizers
lr_schedulers = trainer.lr_schedulers

if isinstance(trainer.training_type_plugin, (DDPFullyShardedPlugin, DeepSpeedPlugin)):
raise MisconfigurationException("SWA does not currently support sharded models.")

if len(optimizers) != 1:
raise MisconfigurationException("SWA currently works with 1 `optimizer`.")

@@ -162,7 +172,9 @@ def on_fit_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule"):
trainer.fit_loop.max_epochs += 1

def on_train_epoch_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule"):
if trainer.current_epoch == self.swa_start:
if (not self._initialized) and (self.swa_start <= trainer.current_epoch <= self.swa_end):
self._initialized = True

# move average model to requested device.
self._average_model = self._average_model.to(self._device or pl_module.device)

@@ -182,6 +194,17 @@ def on_train_epoch_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningMo
anneal_strategy=self._annealing_strategy,
last_epoch=trainer.max_epochs if self._annealing_strategy == "cos" else -1,
)
if self._scheduler_step_count is not None:
# Restore scheduler step count from checkpoint
self._swa_scheduler._step_count = self._scheduler_step_count
elif trainer.current_epoch != self.swa_start:
# Log a warning if we're initializing after start without any checkpoint data,
# as behaviour will be different compared to having checkpoint data.
rank_zero_warn(
"SWA is initializing after swa_start without any checkpoint data. "
"This may be caused by loading a checkpoint from an older version of PyTorch Lightning."
)

default_scheduler_cfg = _get_default_scheduler_config()
assert default_scheduler_cfg["interval"] == "epoch" and default_scheduler_cfg["frequency"] == 1
default_scheduler_cfg["scheduler"] = self._swa_scheduler
@@ -198,14 +221,14 @@ def on_train_epoch_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningMo
else:
trainer.lr_schedulers.append(default_scheduler_cfg)

self.n_averaged = torch.tensor(0, dtype=torch.long, device=pl_module.device)
if self.n_averaged is None:
self.n_averaged = torch.tensor(self._init_n_averaged, dtype=torch.long, device=pl_module.device)

if self.swa_start <= trainer.current_epoch <= self.swa_end:
self.update_parameters(self._average_model, pl_module, self.n_averaged, self.avg_fn)

# Note: No > here in case the callback is saved with the model and training continues
if trainer.current_epoch == self.swa_end + 1:

# Transfer weights from average model to pl_module
self.transfer_weights(self._average_model, pl_module)

@@ -280,3 +303,34 @@ def avg_fn(
) -> torch.FloatTensor:
"""Adapted from https://github.com/pytorch/pytorch/blob/v1.7.1/torch/optim/swa_utils.py#L95-L97."""
return averaged_model_parameter + (model_parameter - averaged_model_parameter) / (num_averaged + 1)

def on_save_checkpoint(
self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", checkpoint: Dict[str, Any]
) -> dict:
return {
"n_averaged": 0 if self.n_averaged is None else self.n_averaged.item(),
"scheduler_step_count": None if self._swa_scheduler is None else self._swa_scheduler._step_count,
"average_model_parameters": self._get_average_model_parameters(trainer),
}

def on_load_checkpoint(
self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", callback_state: Dict[str, Any]
) -> None:
self._init_n_averaged = callback_state["n_averaged"]
self._scheduler_step_count = callback_state["scheduler_step_count"]
self._load_average_model_parameters(callback_state["average_model_parameters"])

def _get_average_model_parameters(self, trainer: "pl.Trainer") -> Optional[List[nn.Parameter]]:
if self._average_model is None or not (self.swa_start <= trainer.current_epoch <= self.swa_end):
# If we're not within the SWA epochs then when loading checkpoint data we would want
# to use parameters from the underlying model rather than the SWA parameters.
return
return list(self._average_model.parameters())

def _load_average_model_parameters(self, parameter_state: Any) -> None:
if self._average_model is None or parameter_state is None:
return
for p_swa, p_checkpoint in zip(self._average_model.parameters(), parameter_state):
device = p_swa.device
p_swa_ = p_swa.detach()
p_swa_.copy_(p_checkpoint.to(device))
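
To make the new hooks concrete, here is a small sketch (not part of the diff) that inspects the SWA entry written by on_save_checkpoint into a saved checkpoint. The path is a placeholder, and the exact key used under checkpoint["callbacks"] varies between Lightning versions, so the string match is an assumption.

import torch

# Placeholder path: any checkpoint written after the SWA phase has started will do.
checkpoint = torch.load("path/to/epoch=3-step=20.ckpt", map_location="cpu")

# Lightning keeps callback state under the "callbacks" key; the SWA entry holds the
# three fields returned by on_save_checkpoint above.
for key, state in checkpoint.get("callbacks", {}).items():
    if "StochasticWeightAveraging" in str(key):
        print("n_averaged:", state["n_averaged"])
        print("scheduler_step_count:", state["scheduler_step_count"])
        print("has averaged parameters:", state["average_model_parameters"] is not None)
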
120 changes: 116 additions & 4 deletions tests/callbacks/test_stochastic_weight_avg.py
@@ -12,6 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import os
from pathlib import Path
from typing import Optional
from unittest import mock

import pytest
@@ -30,7 +33,9 @@


class SwaTestModel(BoringModel):
def __init__(self, batchnorm: bool = True, interval: str = "epoch", iterable_dataset: bool = False):
def __init__(
self, batchnorm: bool = True, interval: str = "epoch", iterable_dataset: bool = False, crash_after_epoch=None
):
super().__init__()
layers = [nn.Linear(32, 32)]
if batchnorm:
@@ -39,6 +44,9 @@ def __init__(self, batchnorm: bool = True, interval: str = "epoch", iterable_dat
self.layer = nn.Sequential(*layers)
self.interval = interval
self.iterable_dataset = iterable_dataset
self.crash_after_epoch = crash_after_epoch
self._epoch_count = 0
self.save_hyperparameters()

def training_step(self, batch, batch_idx):
output = self.forward(batch)
@@ -62,10 +70,19 @@ def configure_optimizers(self):
},
}

def training_epoch_end(self, _):
if not self.crash_after_epoch:
return
self._epoch_count += 1
if self._epoch_count >= self.crash_after_epoch:
raise RuntimeError("Crash test")


class SwaTestCallback(StochasticWeightAveraging):
update_parameters_calls: int = 0
transfer_weights_calls: int = 0
# Record the first epoch, as if we are resuming from a checkpoint this may not be equal to 0
first_epoch: Optional[int] = None

def update_parameters(self, *args, **kwargs):
self.update_parameters_calls += 1
@@ -77,6 +94,8 @@ def transfer_weights(self, *args, **kwargs):

def on_train_epoch_start(self, trainer, *args):
super().on_train_epoch_start(trainer, *args)
if self.first_epoch is None:
self.first_epoch = trainer.current_epoch
assert trainer.fit_loop._skip_backward == (trainer.current_epoch > self.swa_end)
if self.swa_start <= trainer.current_epoch:
assert isinstance(trainer.lr_schedulers[0]["scheduler"], SWALR)
@@ -88,6 +107,9 @@ def on_train_epoch_end(self, trainer, *args):
if self.swa_start <= trainer.current_epoch <= self.swa_end:
swa_epoch = trainer.current_epoch - self.swa_start
assert self.n_averaged == swa_epoch + 1
assert self._swa_scheduler is not None
# Scheduler is stepped once on initialization and then at the end of each epoch
assert self._swa_scheduler._step_count == swa_epoch + 2
elif trainer.current_epoch > self.swa_end:
assert self.n_averaged == self._max_epochs - self.swa_start

@@ -101,10 +123,13 @@ def on_train_end(self, trainer, pl_module):

if not isinstance(trainer.training_type_plugin, DDPSpawnPlugin):
# check backward call count. the batchnorm update epoch should not backward
assert trainer.training_type_plugin.backward.call_count == trainer.max_epochs * trainer.limit_train_batches
assert trainer.training_type_plugin.backward.call_count == (
(trainer.max_epochs - self.first_epoch) * trainer.limit_train_batches
)

# check call counts
assert self.update_parameters_calls == trainer.max_epochs - (self._swa_epoch_start - 1)
first_swa_epoch = max(self.first_epoch, self.swa_start)
assert self.update_parameters_calls == trainer.max_epochs - first_swa_epoch
assert self.transfer_weights_calls == 1


@@ -247,9 +272,10 @@ def test_swa_multiple_lrs(tmpdir):

class TestModel(BoringModel):
def __init__(self):
super(BoringModel, self).__init__()
super().__init__()
self.layer1 = torch.nn.Linear(32, 32)
self.layer2 = torch.nn.Linear(32, 2)
self.on_train_epoch_start_called = False

def forward(self, x):
x = self.layer1(x)
@@ -276,3 +302,89 @@ def on_train_epoch_start(self):
)
trainer.fit(model)
assert model.on_train_epoch_start_called


def swa_resume_training_from_checkpoint(tmpdir, crash_after_epoch=4, ddp=False):
model = SwaTestModel(crash_after_epoch=crash_after_epoch)
swa_start = 3
max_epochs = 5
swa_callback = SwaTestCallback(swa_epoch_start=swa_start, swa_lrs=0.1)

num_processes = 2 if ddp else 1
strategy = "ddp_spawn" if ddp else None

trainer = Trainer(
default_root_dir=tmpdir,
enable_progress_bar=False,
max_epochs=max_epochs,
limit_train_batches=5,
limit_val_batches=0,
callbacks=[swa_callback],
accumulate_grad_batches=2,
num_processes=num_processes,
strategy=strategy,
)

exception_type = Exception if ddp else RuntimeError
backward_patch = mock.patch.object(TrainingTypePlugin, "backward", wraps=trainer.training_type_plugin.backward)
with backward_patch, pytest.raises(exception_type):
trainer.fit(model)

checkpoint_dir = Path(tmpdir) / "lightning_logs" / "version_0" / "checkpoints"
checkpoint_files = os.listdir(checkpoint_dir)
assert len(checkpoint_files) == 1
checkpoint_path = checkpoint_dir / checkpoint_files[0]

model = SwaTestModel()
swa_callback = SwaTestCallback(swa_epoch_start=swa_start, swa_lrs=0.1)
trainer = Trainer(
default_root_dir=tmpdir,
enable_progress_bar=False,
max_epochs=max_epochs,
limit_train_batches=5,
limit_val_batches=0,
callbacks=[swa_callback],
accumulate_grad_batches=2,
num_processes=num_processes,
strategy=strategy,
)

with mock.patch.object(TrainingTypePlugin, "backward", wraps=trainer.training_type_plugin.backward):
trainer.fit(model, ckpt_path=checkpoint_path.as_posix())


@pytest.mark.parametrize("crash_after_epoch", [2, 4])
def test_swa_resume_training_from_checkpoint(tmpdir, crash_after_epoch):
swa_resume_training_from_checkpoint(tmpdir, crash_after_epoch=crash_after_epoch)


@RunIf(skip_windows=True, min_torch="1.8")
def test_swa_resume_training_from_checkpoint_ddp(tmpdir):
# Requires PyTorch >= 1.8 to include this segfault fix:
# https://github.com/pytorch/pytorch/pull/50998
swa_resume_training_from_checkpoint(tmpdir, ddp=True)


def _test_misconfiguration_error_with_sharded_model(tmpdir, strategy, gpus=None):
model = SwaTestModel()
swa_callback = SwaTestCallback(swa_epoch_start=2, swa_lrs=0.1)
trainer = Trainer(
default_root_dir=tmpdir,
enable_progress_bar=False,
max_epochs=5,
callbacks=[swa_callback],
strategy=strategy,
gpus=gpus,
)
with pytest.raises(MisconfigurationException, match="SWA does not currently support sharded models"):
trainer.fit(model)


@RunIf(fairscale_fully_sharded=True, min_gpus=1)
def test_misconfiguration_error_with_ddp_fully_sharded(tmpdir):
_test_misconfiguration_error_with_sharded_model(tmpdir, "fsdp", 1)


@RunIf(deepspeed=True)
def test_misconfiguration_error_with_deep_speed(tmpdir):
_test_misconfiguration_error_with_sharded_model(tmpdir, "deepspeed")
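
Finally, a short sketch (again not from the PR) of the end state: because transfer_weights copies the averaged parameters back into the LightningModule once the SWA phase ends, a checkpoint written after that point already stores the SWA weights in the ordinary model state_dict, so no special handling is needed at inference time. The path below is a placeholder.

import torch

# Placeholder path to a checkpoint saved after training completed.
final_checkpoint = torch.load("path/to/final.ckpt", map_location="cpu")

# Standard Lightning checkpoint layout: the model's own state_dict, which at this
# point already holds the SWA-averaged parameters.
swa_state_dict = final_checkpoint["state_dict"]
print(list(swa_state_dict)[:3])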