diff --git a/CHANGELOG.md b/CHANGELOG.md
index aa7c4f9b056bc..f370985803a7f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -69,6 +69,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Added `opt_idx` to scheduler config if not assigned by user ([#11247](https://github.com/PyTorchLightning/pytorch-lightning/pull/11247))
 
+- Teardown the active loop and strategy on exception ([#11620](https://github.com/PyTorchLightning/pytorch-lightning/pull/11620))
+
+
 - Added a `MisconfigurationException` if user provided `opt_idx` in scheduler config doesn't match with actual optimizer index of its respective optimizer ([#11247](https://github.com/PyTorchLightning/pytorch-lightning/pull/11247))
 
diff --git a/pytorch_lightning/strategies/ipu.py b/pytorch_lightning/strategies/ipu.py
index 22b575a590cfd..6b6433841d5ae 100644
--- a/pytorch_lightning/strategies/ipu.py
+++ b/pytorch_lightning/strategies/ipu.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 import json
 import os
-from typing import Any, List, Optional, Union
+from typing import Any, Callable, List, Optional, Union
 
 import torch
 from torch.utils.data import DataLoader
@@ -116,6 +116,8 @@ def __init__(
                 options["autoReport.directory"] = self.autoreport_dir
             os.environ["POPLAR_ENGINE_OPTIONS"] = json.dumps(options)
 
+        self._update_dataloader_original: Optional[Callable] = None
+
     def setup(self, trainer: "pl.Trainer") -> None:
         # set the `accumulate_grad_batches` property as early as possible
         self._handle_gradient_accumulation_steps()
@@ -279,8 +281,9 @@ def predict_step(self, *args, **kwargs) -> STEP_OUTPUT:
 
     def teardown(self) -> None:
         super().teardown()
-        # undo dataloader patching
-        pl.trainer.connectors.data_connector._update_dataloader = self._update_dataloader_original
+        if self._update_dataloader_original is not None:
+            # undo dataloader patching
+            pl.trainer.connectors.data_connector._update_dataloader = self._update_dataloader_original
         for model in self.poptorch_models.values():
             model.destroy()
diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index ac01227fd00ac..462fbd11867e7 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -685,14 +685,13 @@ def _call_and_handle_interrupt(self, trainer_fn: Callable, *args: Any, **kwargs:
         except BaseException as exception:
             self.state.status = TrainerStatus.INTERRUPTED
             if distributed_available() and self.world_size > 1:
-                # try syncing remaing processes, kill otherwise
+                # try syncing remaining processes, kill otherwise
                 self.strategy.reconciliate_processes(traceback.format_exc())
             self._on_exception()
-            # reset bookkeeping
-            self.state.stage = None
             self._call_callback_hooks("on_exception", exception)
-            # shutdown workers
-            self._data_connector.teardown()
+            self._teardown()
+            # teardown might access the stage so we reset it after
+            self.state.stage = None
             raise
 
     def fit(
@@ -1174,6 +1173,7 @@ def _run(
 
         self.checkpoint_connector.resume_end()
 
         results = self._run_stage()
 
+        log.detail(f"{self.__class__.__name__}: trainer tearing down")
         self._teardown()
@@ -1188,8 +1188,7 @@ def _run(
 
         log.detail(f"{self.__class__.__name__}: calling teardown hooks")
         self._call_teardown_hook()
 
-        if self.state.status != TrainerStatus.INTERRUPTED:
-            self.state.status = TrainerStatus.FINISHED
+        self.state.status = TrainerStatus.FINISHED
         self.state.stage = None
 
         if isinstance(self.strategy, DDPSpawnStrategy):
@@ -1240,7 +1239,10 @@ def _teardown(self):
         self.strategy.post_dispatch(self)
         self.strategy.teardown()
         self._data_connector.teardown()
-        self._active_loop.teardown()
+        loop = self._active_loop
+        # loop should never be `None` here but it can because we don't know the trainer stage with `ddp_spawn`
+        if loop is not None:
+            loop.teardown()
         self.logger_connector.teardown()
         self._signal_connector.teardown()