From f72daf69a545cbd741cf6567413d8a157cbe7e6d Mon Sep 17 00:00:00 2001
From: Carlos Mocholi
Date: Tue, 25 Jan 2022 19:45:23 +0100
Subject: [PATCH 1/4] Teardown all internal components on exception

---
 pytorch_lightning/trainer/trainer.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index ac01227fd00ac..22fef0bf729d4 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -685,14 +685,13 @@ def _call_and_handle_interrupt(self, trainer_fn: Callable, *args: Any, **kwargs:
         except BaseException as exception:
             self.state.status = TrainerStatus.INTERRUPTED
             if distributed_available() and self.world_size > 1:
-                # try syncing remaing processes, kill otherwise
+                # try syncing remaining processes, kill otherwise
                 self.strategy.reconciliate_processes(traceback.format_exc())
             self._on_exception()
+            self._call_callback_hooks("on_exception", exception)
+            self._teardown()
             # reset bookkeeping
             self.state.stage = None
-            self._call_callback_hooks("on_exception", exception)
-            # shutdown workers
-            self._data_connector.teardown()
             raise

     def fit(
@@ -1174,6 +1173,7 @@ def _run(
         self.checkpoint_connector.resume_end()

         results = self._run_stage()
+
         log.detail(f"{self.__class__.__name__}: trainer tearing down")
         self._teardown()

@@ -1188,8 +1188,7 @@ def _run(
         log.detail(f"{self.__class__.__name__}: calling teardown hooks")
         self._call_teardown_hook()

-        if self.state.status != TrainerStatus.INTERRUPTED:
-            self.state.status = TrainerStatus.FINISHED
+        self.state.status = TrainerStatus.FINISHED
         self.state.stage = None

         if isinstance(self.strategy, DDPSpawnStrategy):
@@ -1240,7 +1239,10 @@ def _teardown(self):
         self.strategy.post_dispatch(self)
         self.strategy.teardown()
         self._data_connector.teardown()
-        self._active_loop.teardown()
+        loop = self._active_loop
+        # loop should never be `None` here but it can because we don't know the trainer stage with `ddp_spawn`
+        if loop is not None:
+            loop.teardown()
         self.logger_connector.teardown()
         self._signal_connector.teardown()


From b88f513900b7e3944cc78a339907fc874479974d Mon Sep 17 00:00:00 2001
From: Carlos Mocholi
Date: Tue, 25 Jan 2022 19:50:12 +0100
Subject: [PATCH 2/4] Update CHANGELOG

---
 CHANGELOG.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index aa7c4f9b056bc..f370985803a7f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -69,6 +69,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Added `opt_idx` to scheduler config if not assigned by user ([#11247](https://github.com/PyTorchLightning/pytorch-lightning/pull/11247))

+- Teardown the active loop and strategy on exception ([#11620](https://github.com/PyTorchLightning/pytorch-lightning/pull/11620))
+
+
 - Added a `MisconfigurationException` if user provided `opt_idx` in scheduler config doesn't match with actual optimizer index of its respective optimizer ([#11247](https://github.com/PyTorchLightning/pytorch-lightning/pull/11247))


From 87cfbcbf52f17d554588d0b3313f81ec00d23668 Mon Sep 17 00:00:00 2001
From: Carlos Mocholí
Date: Wed, 26 Jan 2022 13:55:01 +0100
Subject: [PATCH 3/4] Update pytorch_lightning/trainer/trainer.py

---
 pytorch_lightning/trainer/trainer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 22fef0bf729d4..462fbd11867e7 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -690,7 +690,7 @@ def _call_and_handle_interrupt(self, trainer_fn: Callable, *args: Any, **kwargs:
             self._on_exception()
             self._call_callback_hooks("on_exception", exception)
             self._teardown()
-            # reset bookkeeping
+            # teardown might access the stage so we reset it after
             self.state.stage = None
             raise


From 9d74134830a85f66092ca43ede81250039044424 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi
Date: Wed, 2 Feb 2022 20:23:50 +0100
Subject: [PATCH 4/4] Fix IPU teardown

---
 pytorch_lightning/strategies/ipu.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/pytorch_lightning/strategies/ipu.py b/pytorch_lightning/strategies/ipu.py
index 22b575a590cfd..6b6433841d5ae 100644
--- a/pytorch_lightning/strategies/ipu.py
+++ b/pytorch_lightning/strategies/ipu.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 import json
 import os
-from typing import Any, List, Optional, Union
+from typing import Any, Callable, List, Optional, Union

 import torch
 from torch.utils.data import DataLoader
@@ -116,6 +116,8 @@ def __init__(
             options["autoReport.directory"] = self.autoreport_dir
             os.environ["POPLAR_ENGINE_OPTIONS"] = json.dumps(options)

+        self._update_dataloader_original: Optional[Callable] = None
+
     def setup(self, trainer: "pl.Trainer") -> None:
         # set the `accumulate_grad_batches` property as early as possible
         self._handle_gradient_accumulation_steps()
@@ -279,8 +281,9 @@ def predict_step(self, *args, **kwargs) -> STEP_OUTPUT:

     def teardown(self) -> None:
         super().teardown()
-        # undo dataloader patching
-        pl.trainer.connectors.data_connector._update_dataloader = self._update_dataloader_original
+        if self._update_dataloader_original is not None:
+            # undo dataloader patching
+            pl.trainer.connectors.data_connector._update_dataloader = self._update_dataloader_original

         for model in self.poptorch_models.values():
             model.destroy()
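Note for reviewers: below is a minimal sketch, not part of the diffs above, of the user-visible contract this series establishes: when a `Trainer` entry point raises, the `on_exception` callback hook fires and the trainer tears down its internals (strategy, data connector, active loop, logger connector, signal connector) before the exception reaches user code. The `ExceptionLogger` callback and `run_and_inspect` helper are hypothetical names used only for illustration.

```python
# Sketch of the behavior introduced by this series, assuming PyTorch Lightning
# at the state of this PR. `ExceptionLogger` and `run_and_inspect` are
# hypothetical illustration names, not part of the diffs.
import pytorch_lightning as pl


class ExceptionLogger(pl.Callback):
    def on_exception(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", exception: BaseException) -> None:
        # Called before `trainer._teardown()`, so the strategy, loops, and
        # dataloader workers are still alive at this point.
        print(f"trainer interrupted by: {exception!r}")


def run_and_inspect(model: "pl.LightningModule") -> None:
    trainer = pl.Trainer(max_epochs=1, callbacks=[ExceptionLogger()])
    try:
        trainer.fit(model)
    except Exception:
        # With these patches, the exception only propagates after all internal
        # components have been torn down and the stage has been reset.
        assert trainer.state.stage is None
        raise
```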