From f72daf69a545cbd741cf6567413d8a157cbe7e6d Mon Sep 17 00:00:00 2001
From: Carlos Mocholi
Date: Tue, 25 Jan 2022 19:45:23 +0100
Subject: [PATCH 1/4] Teardown all internal components on exception

---
 pytorch_lightning/trainer/trainer.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index ac01227fd00ac..22fef0bf729d4 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -685,14 +685,13 @@ def _call_and_handle_interrupt(self, trainer_fn: Callable, *args: Any, **kwargs:
         except BaseException as exception:
             self.state.status = TrainerStatus.INTERRUPTED
             if distributed_available() and self.world_size > 1:
-                # try syncing remaing processes, kill otherwise
+                # try syncing remaining processes, kill otherwise
                 self.strategy.reconciliate_processes(traceback.format_exc())
             self._on_exception()
+            self._call_callback_hooks("on_exception", exception)
+            self._teardown()
             # reset bookkeeping
             self.state.stage = None
-            self._call_callback_hooks("on_exception", exception)
-            # shutdown workers
-            self._data_connector.teardown()
             raise

     def fit(
@@ -1174,6 +1173,7 @@ def _run(
         self.checkpoint_connector.resume_end()

         results = self._run_stage()
+
         log.detail(f"{self.__class__.__name__}: trainer tearing down")
         self._teardown()

@@ -1188,8 +1188,7 @@ def _run(
         log.detail(f"{self.__class__.__name__}: calling teardown hooks")
         self._call_teardown_hook()

-        if self.state.status != TrainerStatus.INTERRUPTED:
-            self.state.status = TrainerStatus.FINISHED
+        self.state.status = TrainerStatus.FINISHED
         self.state.stage = None

         if isinstance(self.strategy, DDPSpawnStrategy):
@@ -1240,7 +1239,10 @@ def _teardown(self):
         self.strategy.post_dispatch(self)
         self.strategy.teardown()
         self._data_connector.teardown()
-        self._active_loop.teardown()
+        loop = self._active_loop
+        # loop should never be `None` here but it can because we don't know the trainer stage with `ddp_spawn`
+        if loop is not None:
+            loop.teardown()
         self.logger_connector.teardown()
         self._signal_connector.teardown()


From b88f513900b7e3944cc78a339907fc874479974d Mon Sep 17 00:00:00 2001
From: Carlos Mocholi
Date: Tue, 25 Jan 2022 19:50:12 +0100
Subject: [PATCH 2/4] Update CHANGELOG

---
 CHANGELOG.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index aa7c4f9b056bc..f370985803a7f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -69,6 +69,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Added `opt_idx` to scheduler config if not assigned by user ([#11247](https://github.com/PyTorchLightning/pytorch-lightning/pull/11247))

+- Teardown the active loop and strategy on exception ([#11620](https://github.com/PyTorchLightning/pytorch-lightning/pull/11620))
+
+
 - Added a `MisconfigurationException` if user provided `opt_idx` in scheduler config doesn't match with actual optimizer index of its respective optimizer ([#11247](https://github.com/PyTorchLightning/pytorch-lightning/pull/11247))


From 87cfbcbf52f17d554588d0b3313f81ec00d23668 Mon Sep 17 00:00:00 2001
From: Carlos Mocholí
Date: Wed, 26 Jan 2022 13:55:01 +0100
Subject: [PATCH 3/4] Update pytorch_lightning/trainer/trainer.py

---
 pytorch_lightning/trainer/trainer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 22fef0bf729d4..462fbd11867e7 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -690,7 +690,7 @@ def _call_and_handle_interrupt(self, trainer_fn: Callable, *args: Any, **kwargs:
             self._on_exception()
             self._call_callback_hooks("on_exception", exception)
             self._teardown()
-            # reset bookkeeping
+            # teardown might access the stage so we reset it after
             self.state.stage = None
             raise


From 9d74134830a85f66092ca43ede81250039044424 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi
Date: Wed, 2 Feb 2022 20:23:50 +0100
Subject: [PATCH 4/4] Fix IPU teardown

---
 pytorch_lightning/strategies/ipu.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/pytorch_lightning/strategies/ipu.py b/pytorch_lightning/strategies/ipu.py
index 22b575a590cfd..6b6433841d5ae 100644
--- a/pytorch_lightning/strategies/ipu.py
+++ b/pytorch_lightning/strategies/ipu.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 import json
 import os
-from typing import Any, List, Optional, Union
+from typing import Any, Callable, List, Optional, Union

 import torch
 from torch.utils.data import DataLoader
@@ -116,6 +116,8 @@ def __init__(
             options["autoReport.directory"] = self.autoreport_dir
             os.environ["POPLAR_ENGINE_OPTIONS"] = json.dumps(options)

+        self._update_dataloader_original: Optional[Callable] = None
+
     def setup(self, trainer: "pl.Trainer") -> None:
         # set the `accumulate_grad_batches` property as early as possible
         self._handle_gradient_accumulation_steps()
@@ -279,8 +281,9 @@ def predict_step(self, *args, **kwargs) -> STEP_OUTPUT:

     def teardown(self) -> None:
         super().teardown()
-        # undo dataloader patching
-        pl.trainer.connectors.data_connector._update_dataloader = self._update_dataloader_original
+        if self._update_dataloader_original is not None:
+            # undo dataloader patching
+            pl.trainer.connectors.data_connector._update_dataloader = self._update_dataloader_original

         for model in self.poptorch_models.values():
             model.destroy()
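Note for reviewers: below is a minimal sketch, not part of the diffs above, of the user-visible contract this series establishes: when a `Trainer` entry point raises, the `on_exception` callback hook fires and the trainer tears down its internals (strategy, data connector, active loop, logger connector, signal connector) before the exception reaches user code. The `ExceptionLogger` callback and `run_and_inspect` helper are hypothetical names used only for illustration.

```python
# Sketch of the behavior introduced by this series, assuming PyTorch Lightning
# at the state of this PR. `ExceptionLogger` and `run_and_inspect` are
# hypothetical illustration names, not part of the diffs.
import pytorch_lightning as pl


class ExceptionLogger(pl.Callback):
    def on_exception(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", exception: BaseException) -> None:
        # Called before `trainer._teardown()`, so the strategy, loops, and
        # dataloader workers are still alive at this point.
        print(f"trainer interrupted by: {exception!r}")


def run_and_inspect(model: "pl.LightningModule") -> None:
    trainer = pl.Trainer(max_epochs=1, callbacks=[ExceptionLogger()])
    try:
        trainer.fit(model)
    except Exception:
        # With these patches, the exception only propagates after all internal
        # components have been torn down and the stage has been reset.
        assert trainer.state.stage is None
        raise
```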