From eb9cb3c8017b1f207ac1d2c7168327fbe0b63cb5 Mon Sep 17 00:00:00 2001 From: Shachar Mirkin Date: Mon, 14 Dec 2020 13:39:29 +0100 Subject: [PATCH 001/136] Add Google Colab badges (#5111) * Add colab badges to notebook Add colab badges to notebook to notebooks 4 & 5 * Add colab badges Co-authored-by: chaton --- notebooks/04-transformers-text-classification.ipynb | 7 +++++++ notebooks/05-trainer-flags-overview.ipynb | 7 +++++++ 2 files changed, 14 insertions(+) diff --git a/notebooks/04-transformers-text-classification.ipynb b/notebooks/04-transformers-text-classification.ipynb index 037b24e4ddd9d..d52af84a76d97 100644 --- a/notebooks/04-transformers-text-classification.ipynb +++ b/notebooks/04-transformers-text-classification.ipynb @@ -1,5 +1,12 @@ { "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\"Open" + ] + }, { "cell_type": "markdown", "metadata": { diff --git a/notebooks/05-trainer-flags-overview.ipynb b/notebooks/05-trainer-flags-overview.ipynb index 6413e8239bb2e..da044a9c9b5c6 100644 --- a/notebooks/05-trainer-flags-overview.ipynb +++ b/notebooks/05-trainer-flags-overview.ipynb @@ -1,5 +1,12 @@ { "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\"Open" + ] + }, { "cell_type": "markdown", "metadata": { From 69123af3ea651a5e1cc25014da6f1c0dee433916 Mon Sep 17 00:00:00 2001 From: Tadej Svetina Date: Mon, 14 Dec 2020 20:13:58 +0100 Subject: [PATCH 002/136] Fix hanging metrics tests (#5134) --- tests/metrics/regression/test_ssim.py | 4 +--- tests/metrics/utils.py | 10 ++++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/metrics/regression/test_ssim.py b/tests/metrics/regression/test_ssim.py index f581188e89fce..8bb304850e3f2 100644 --- a/tests/metrics/regression/test_ssim.py +++ b/tests/metrics/regression/test_ssim.py @@ -53,9 +53,7 @@ def _sk_metric(preds, target, data_range, multichannel): class TestSSIM(MetricTester): atol = 6e-5 - # TODO: for some reason this test hangs with ddp=True - # @pytest.mark.parametrize("ddp", [True, False]) - @pytest.mark.parametrize("ddp", [False]) + @pytest.mark.parametrize("ddp", [True, False]) @pytest.mark.parametrize("dist_sync_on_step", [True, False]) def test_ssim(self, preds, target, multichannel, ddp, dist_sync_on_step): self.run_class_metric_test( diff --git a/tests/metrics/utils.py b/tests/metrics/utils.py index c607a466b2068..4bd6608ce3fcf 100644 --- a/tests/metrics/utils.py +++ b/tests/metrics/utils.py @@ -11,6 +11,11 @@ from pytorch_lightning.metrics import Metric +try: + set_start_method("spawn") +except RuntimeError: + pass + NUM_PROCESSES = 2 NUM_BATCHES = 10 BATCH_SIZE = 32 @@ -165,10 +170,7 @@ def setup_class(self): """Setup the metric class. This will spawn the pool of workers that are used for metric testing and setup_ddp """ - try: - set_start_method("spawn") - except RuntimeError: - pass + self.poolSize = NUM_PROCESSES self.pool = Pool(processes=self.poolSize) self.pool.starmap(setup_ddp, [(rank, self.poolSize) for rank in range(self.poolSize)]) From 84bb9dbac67d2ab1a9707643170e8751f4a33dd4 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Mon, 14 Dec 2020 22:46:14 +0100 Subject: [PATCH 003/136] simplify changelog (#5135) --- CHANGELOG.md | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 87d29ff6df643..16d0f465beefc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,24 +5,6 @@ All notable changes to this project will be documented in this file. 
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). -## [unreleased.Features] - YYYY-MM-DD - -### Added - - -### Changed - - -### Deprecated - - -### Removed - - -### Fixed - - - ## [unreleased.BugFix] - YYYY-MM-DD ### Added From fde972ffc2cdca3fccb904d29c2f1c32963fcd72 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Tue, 15 Dec 2020 18:59:13 +0100 Subject: [PATCH 004/136] add copyright to tests (#5143) --- tests/__init__.py | 13 +++++++++++++ tests/collect_env_details.py | 13 +++++++++++++ tests/conftest.py | 14 ++++++++++++++ tests/test_profiler.py | 14 ++++++++++++++ 4 files changed, 54 insertions(+) diff --git a/tests/__init__.py b/tests/__init__.py index 981d685430da9..1bb81c466e6eb 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,3 +1,16 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import os import numpy as np diff --git a/tests/collect_env_details.py b/tests/collect_env_details.py index 1d443795d2876..2b8c4b3fafeed 100644 --- a/tests/collect_env_details.py +++ b/tests/collect_env_details.py @@ -1,3 +1,16 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """Diagnose your system and show basic information This server mainly to get detail info for better bug reporting. diff --git a/tests/conftest.py b/tests/conftest.py index ad4b7169456a8..07188fed4dbed 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,3 +1,17 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import sys import threading from functools import partial, wraps diff --git a/tests/test_profiler.py b/tests/test_profiler.py index 3bce379c1115c..4728b11582dfc 100644 --- a/tests/test_profiler.py +++ b/tests/test_profiler.py @@ -1,3 +1,17 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os import time from pathlib import Path From fe75c735eac562deaab12d53b3580ac1ccf3b1f6 Mon Sep 17 00:00:00 2001 From: Sean Naren Date: Tue, 15 Dec 2020 19:59:35 +0000 Subject: [PATCH 005/136] Update changelog, increment version (#5148) --- CHANGELOG.md | 33 +++++++++++++++++++++++++++++++-- pytorch_lightning/__init__.py | 2 +- 2 files changed, 32 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 16d0f465beefc..7751069862500 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,13 +21,42 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Fixed -- Fixed trainer by default `None` in `DDPAccelerator` ([#4915](https://github.com/PyTorchLightning/pytorch-lightning/pull/4915)) +## [1.1.1] - 2020-12-15 + +### Added + +- Add a notebook example to reach a quick baseline of ~94% accuracy on CIFAR10 using Resnet in Lightning ([#4818](https://github.com/PyTorchLightning/pytorch-lightning/pull/4818) +- Add Google Colab badges ([#5111](https://github.com/PyTorchLightning/pytorch-lightning/pull/5111) -- Fixed `LightningOptimizer` exposes optimizer attributes ([#5095](https://github.com/PyTorchLightning/pytorch-lightning/pull/5095)) +### Changed +- Update usage of deprecated profiler ([#5010](https://github.com/PyTorchLightning/pytorch-lightning/pull/5010) +- Update usage of deprecated automatic_optimization ([#5011](https://github.com/PyTorchLightning/pytorch-lightning/pull/5011) +- Simplify accelerator steps ([#5015](https://github.com/PyTorchLightning/pytorch-lightning/pull/5015) +- Split tests for deprecated api ([#5071](https://github.com/PyTorchLightning/pytorch-lightning/pull/5071) +- Improve some tests ([#5049](https://github.com/PyTorchLightning/pytorch-lightning/pull/5049) +- Refactor load in checkpoint connector ([#4593](https://github.com/PyTorchLightning/pytorch-lightning/pull/4593) +### Removed + +- Drop duplicate metrics (#5014) ([#5014](https://github.com/PyTorchLightning/pytorch-lightning/pull/5014) +- Remove beta arg from F1 class and functional ([#5076](https://github.com/PyTorchLightning/pytorch-lightning/pull/5076) +- Drop unused test with results API ([#5058](https://github.com/PyTorchLightning/pytorch-lightning/pull/5058) + +### Fixed + +- Fixed trainer by default `None` in `DDPAccelerator` ([#4915](https://github.com/PyTorchLightning/pytorch-lightning/pull/4915)) +- Fixed `LightningOptimizer` to expose optimizer attributes ([#5095](https://github.com/PyTorchLightning/pytorch-lightning/pull/5095)) - Do not warn when the `name` key is used in the `lr_scheduler` dict ([#5057](https://github.com/PyTorchLightning/pytorch-lightning/pull/5057)) +- Check if optimizer supports closure ([#4981](https://github.com/PyTorchLightning/pytorch-lightning/pull/4981) +- Extend LightningOptimizer to exposure underlying Optimizer attributes + update doc ([#5095](https://github.com/PyTorchLightning/pytorch-lightning/pull/5095) +- Add deprecated metric utility functions back to functional ( + [#5067](https://github.com/PyTorchLightning/pytorch-lightning/pull/5067), + 
[#5068](https://github.com/PyTorchLightning/pytorch-lightning/pull/5068)) +- Allow any input in to_onnx and to_torchscript ([#4378](https://github.com/PyTorchLightning/pytorch-lightning/pull/4378) +- Do not warn when the name key is used in the lr_scheduler dict ([#5057](https://github.com/PyTorchLightning/pytorch-lightning/pull/5057) +- Fix hanging metrics tests ([#5134](https://github.com/PyTorchLightning/pytorch-lightning/pull/5134) ## [1.1.0] - 2020-12-09 diff --git a/pytorch_lightning/__init__.py b/pytorch_lightning/__init__.py index 408d95a72dc47..222263ea2d385 100644 --- a/pytorch_lightning/__init__.py +++ b/pytorch_lightning/__init__.py @@ -1,6 +1,6 @@ """Root package info.""" -__version__ = '1.1.1rc0' +__version__ = '1.1.1' __author__ = 'William Falcon et al.' __author_email__ = 'waf2107@columbia.edu' __license__ = 'Apache-2.0' From 748a74e261585bb571bfc6fa1cc85d33fb44f163 Mon Sep 17 00:00:00 2001 From: Sean Naren Date: Tue, 15 Dec 2020 22:58:28 +0000 Subject: [PATCH 006/136] Prune CHANGELOG.md (#5151) * Prune CHANGELOG.md * Update CHANGELOG.md Co-authored-by: Jirka Borovec * Update CHANGELOG.md Co-authored-by: Jirka Borovec * Update CHANGELOG.md Co-authored-by: Jirka Borovec * Update CHANGELOG.md Co-authored-by: Jirka Borovec * Update CHANGELOG.md Co-authored-by: Jirka Borovec Co-authored-by: Jirka Borovec --- CHANGELOG.md | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7751069862500..6b66c60fe5c7e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,22 +27,16 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Added - Add a notebook example to reach a quick baseline of ~94% accuracy on CIFAR10 using Resnet in Lightning ([#4818](https://github.com/PyTorchLightning/pytorch-lightning/pull/4818) -- Add Google Colab badges ([#5111](https://github.com/PyTorchLightning/pytorch-lightning/pull/5111) ### Changed -- Update usage of deprecated profiler ([#5010](https://github.com/PyTorchLightning/pytorch-lightning/pull/5010) -- Update usage of deprecated automatic_optimization ([#5011](https://github.com/PyTorchLightning/pytorch-lightning/pull/5011) - Simplify accelerator steps ([#5015](https://github.com/PyTorchLightning/pytorch-lightning/pull/5015) -- Split tests for deprecated api ([#5071](https://github.com/PyTorchLightning/pytorch-lightning/pull/5071) -- Improve some tests ([#5049](https://github.com/PyTorchLightning/pytorch-lightning/pull/5049) - Refactor load in checkpoint connector ([#4593](https://github.com/PyTorchLightning/pytorch-lightning/pull/4593) ### Removed -- Drop duplicate metrics (#5014) ([#5014](https://github.com/PyTorchLightning/pytorch-lightning/pull/5014) +- Drop duplicate metrics ([#5014](https://github.com/PyTorchLightning/pytorch-lightning/pull/5014) - Remove beta arg from F1 class and functional ([#5076](https://github.com/PyTorchLightning/pytorch-lightning/pull/5076) -- Drop unused test with results API ([#5058](https://github.com/PyTorchLightning/pytorch-lightning/pull/5058) ### Fixed @@ -54,9 +48,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Add deprecated metric utility functions back to functional ( [#5067](https://github.com/PyTorchLightning/pytorch-lightning/pull/5067), [#5068](https://github.com/PyTorchLightning/pytorch-lightning/pull/5068)) -- Allow any input in to_onnx and to_torchscript ([#4378](https://github.com/PyTorchLightning/pytorch-lightning/pull/4378) -- Do not warn when the name key is used in the lr_scheduler dict ([#5057](https://github.com/PyTorchLightning/pytorch-lightning/pull/5057) -- Fix hanging metrics tests ([#5134](https://github.com/PyTorchLightning/pytorch-lightning/pull/5134) +- Allow any input in `to_onnx` and `to_torchscript` ([#4378](https://github.com/PyTorchLightning/pytorch-lightning/pull/4378) +- Do not warn when the name key is used in the `lr_scheduler` dict ([#5057](https://github.com/PyTorchLightning/pytorch-lightning/pull/5057) ## [1.1.0] - 2020-12-09 From 79565be206590f794d256ea6fbc530bddde1ce1c Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Wed, 16 Dec 2020 11:10:33 +0530 Subject: [PATCH 007/136] Fix saved filename in ModelCheckpoint if it already exists (#4861) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * disable version if not required * disable version if not required * pep * chlog * improve test * improve test * parametrize test and update del_list * Update pytorch_lightning/callbacks/model_checkpoint.py Co-authored-by: Carlos Mocholí * try appending version to already saved ckpt_file * Revert "try appending version to already saved ckpt_file" This reverts commit 710e05e01f738d982aabf1f36c09fa59293e5c0c. * add more assertions * use BoringModel Co-authored-by: Carlos Mocholí Co-authored-by: chaton Co-authored-by: Roger Shieh --- CHANGELOG.md | 4 ++ .../callbacks/model_checkpoint.py | 49 +++++++++++-------- tests/checkpointing/test_model_checkpoint.py | 39 +++++++++++++++ 3 files changed, 71 insertions(+), 21 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 16d0f465beefc..894d43d976f08 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,9 +27,13 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed `LightningOptimizer` exposes optimizer attributes ([#5095](https://github.com/PyTorchLightning/pytorch-lightning/pull/5095)) +- Fixed the saved filename in `ModelCheckpoint` when it already exists ([#4861](https://github.com/PyTorchLightning/pytorch-lightning/pull/4861)) + + - Do not warn when the `name` key is used in the `lr_scheduler` dict ([#5057](https://github.com/PyTorchLightning/pytorch-lightning/pull/5057)) + ## [1.1.0] - 2020-12-09 ### Added diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py index 1354f7f5056b3..4ac800f456c06 100644 --- a/pytorch_lightning/callbacks/model_checkpoint.py +++ b/pytorch_lightning/callbacks/model_checkpoint.py @@ -240,17 +240,14 @@ def save_checkpoint(self, trainer, pl_module): # what can be monitored monitor_candidates = self._monitor_candidates(trainer) - # ie: path/val_loss=0.5.ckpt - filepath = self._get_metric_interpolated_filepath_name(monitor_candidates, epoch, global_step) - # callback supports multiple simultaneous modes # here we call each mode sequentially # Mode 1: save all checkpoints OR only the top k if self.save_top_k: - self._save_top_k_checkpoints(monitor_candidates, trainer, pl_module, filepath) + self._save_top_k_checkpoints(trainer, pl_module, monitor_candidates) # Mode 2: save the last checkpoint - self._save_last_checkpoint(trainer, pl_module, monitor_candidates, filepath) + self._save_last_checkpoint(trainer, pl_module, monitor_candidates) def __validate_init_configuration(self): if self.save_top_k is not None and self.save_top_k < -1: @@ -444,6 +441,7 @@ def format_checkpoint_name( ) if ver is not None: filename = self.CHECKPOINT_JOIN_CHAR.join((filename, f"v{ver}")) + ckpt_name = f"{filename}{self.FILE_EXTENSION}" return os.path.join(self.dirpath, ckpt_name) if self.dirpath else ckpt_name @@ -515,13 +513,20 @@ def _validate_monitor_key(self, trainer): ) raise MisconfigurationException(m) - def _get_metric_interpolated_filepath_name(self, ckpt_name_metrics: Dict[str, Any], epoch: int, step: int): + def _get_metric_interpolated_filepath_name( + self, + ckpt_name_metrics: Dict[str, Any], + epoch: int, + step: int, + del_filepath: Optional[str] = None + ) -> str: filepath = self.format_checkpoint_name(epoch, step, ckpt_name_metrics) + version_cnt = 0 - while self._fs.exists(filepath): + while self._fs.exists(filepath) and filepath != del_filepath: filepath = self.format_checkpoint_name(epoch, step, ckpt_name_metrics, ver=version_cnt) - # this epoch called before version_cnt += 1 + return filepath def _monitor_candidates(self, trainer): @@ -531,13 +536,11 @@ def _monitor_candidates(self, trainer): ckpt_name_metrics.update({"step": trainer.global_step, "epoch": trainer.current_epoch}) return ckpt_name_metrics - def _save_last_checkpoint(self, trainer, pl_module, ckpt_name_metrics, filepath): + def _save_last_checkpoint(self, trainer, pl_module, ckpt_name_metrics): should_save_last = self.monitor is None or self.save_last if not should_save_last: return - last_filepath = filepath - # when user ALSO asked for the 'last.ckpt' change the name if self.save_last: last_filepath = self._format_checkpoint_name( @@ -548,6 +551,10 @@ def _save_last_checkpoint(self, trainer, pl_module, ckpt_name_metrics, filepath) prefix=self.prefix ) last_filepath = os.path.join(self.dirpath, f"{last_filepath}{self.FILE_EXTENSION}") + else: + last_filepath = self._get_metric_interpolated_filepath_name( + ckpt_name_metrics, trainer.current_epoch, trainer.global_step + ) accelerator_backend = 
trainer.accelerator_backend @@ -568,7 +575,7 @@ def _save_last_checkpoint(self, trainer, pl_module, ckpt_name_metrics, filepath) if self.monitor is None: self.best_model_path = self.last_model_path - def _save_top_k_checkpoints(self, metrics, trainer, pl_module, filepath): + def _save_top_k_checkpoints(self, trainer, pl_module, metrics): current = metrics.get(self.monitor) epoch = metrics.get("epoch") step = metrics.get("step") @@ -577,7 +584,7 @@ def _save_top_k_checkpoints(self, metrics, trainer, pl_module, filepath): current = torch.tensor(current, device=pl_module.device) if self.check_monitor_top_k(current): - self._update_best_and_save(filepath, current, epoch, step, trainer, pl_module) + self._update_best_and_save(current, epoch, step, trainer, pl_module, metrics) elif self.verbose: rank_zero_info( f"Epoch {epoch:d}, step {step:d}: {self.monitor} was not in top {self.save_top_k}" @@ -588,25 +595,26 @@ def _is_valid_monitor_key(self, metrics): def _update_best_and_save( self, - filepath: str, current: torch.Tensor, epoch: int, step: int, trainer, pl_module, + ckpt_name_metrics ): k = len(self.best_k_models) + 1 if self.save_top_k == -1 else self.save_top_k - del_list = [] + del_filepath = None if len(self.best_k_models) == k and k > 0: - delpath = self.kth_best_model_path - self.best_k_models.pop(self.kth_best_model_path) - del_list.append(delpath) + del_filepath = self.kth_best_model_path + self.best_k_models.pop(del_filepath) # do not save nan, replace with +/- inf if torch.isnan(current): current = torch.tensor(float('inf' if self.mode == "min" else '-inf')) + filepath = self._get_metric_interpolated_filepath_name(ckpt_name_metrics, epoch, step, del_filepath) + # save the current score self.current_score = current self.best_k_models[filepath] = current @@ -630,9 +638,8 @@ def _update_best_and_save( ) self._save_model(filepath, trainer, pl_module) - for cur_path in del_list: - if cur_path != filepath: - self._del_model(cur_path) + if del_filepath is not None and filepath != del_filepath: + self._del_model(del_filepath) def to_yaml(self, filepath: Optional[Union[str, Path]] = None): """ diff --git a/tests/checkpointing/test_model_checkpoint.py b/tests/checkpointing/test_model_checkpoint.py index 9817dfa4526c6..106c34030051e 100644 --- a/tests/checkpointing/test_model_checkpoint.py +++ b/tests/checkpointing/test_model_checkpoint.py @@ -938,3 +938,42 @@ def __init__(self, hparams): else: # make sure it's not AttributeDict assert type(ckpt[model.CHECKPOINT_HYPER_PARAMS_KEY]) == hparams_type + + +@pytest.mark.parametrize('max_epochs', [3, 4]) +@pytest.mark.parametrize( + 'save_top_k, expected', + [ + (1, ['curr_epoch.ckpt']), + (2, ['curr_epoch.ckpt', 'curr_epoch-v0.ckpt']), + ] +) +def test_model_checkpoint_file_already_exists(tmpdir, max_epochs, save_top_k, expected): + """ + Test that version is added to filename if required and it already exists in dirpath. 
+ """ + model_checkpoint = ModelCheckpoint( + dirpath=tmpdir, + filename='curr_epoch', + save_top_k=save_top_k, + monitor='epoch', + mode='max', + ) + trainer = Trainer( + default_root_dir=tmpdir, + callbacks=[model_checkpoint], + max_epochs=max_epochs, + limit_train_batches=2, + limit_val_batches=2, + logger=None, + weights_summary=None, + progress_bar_refresh_rate=0, + ) + + model = BoringModel() + trainer.fit(model) + ckpt_files = os.listdir(tmpdir) + assert set(ckpt_files) == set(expected) + + epochs_in_ckpt_files = [pl_load(os.path.join(tmpdir, f))['epoch'] - 1 for f in ckpt_files] + assert sorted(epochs_in_ckpt_files) == list(range(max_epochs - save_top_k, max_epochs)) From afe5da7ade1bb359a9dd378d3c41a42326b57fc5 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Wed, 16 Dec 2020 15:09:26 +0900 Subject: [PATCH 008/136] Update isort config (#5142) * Update isort config * Apply isort with new config * Fix typo in isort config * fix rebase Co-authored-by: Rohit Gupta --- benchmarks/test_parity.py | 2 +- benchmarks/test_sharded_parity.py | 2 +- pyproject.toml | 4 +++- pytorch_lightning/setup_tools.py | 4 ++-- tests/conftest.py | 4 ++-- tests/test_profiler.py | 2 +- 6 files changed, 10 insertions(+), 8 deletions(-) diff --git a/benchmarks/test_parity.py b/benchmarks/test_parity.py index 41bba9533e10d..3508d5a3c28ac 100644 --- a/benchmarks/test_parity.py +++ b/benchmarks/test_parity.py @@ -4,8 +4,8 @@ import pytest import torch +from pytorch_lightning import seed_everything, Trainer import tests.base.develop_utils as tutils -from pytorch_lightning import Trainer, seed_everything from tests.base.models import ParityModuleMNIST, ParityModuleRNN diff --git a/benchmarks/test_sharded_parity.py b/benchmarks/test_sharded_parity.py index 9fe4976442178..2e52613462621 100644 --- a/benchmarks/test_sharded_parity.py +++ b/benchmarks/test_sharded_parity.py @@ -6,7 +6,7 @@ import pytest import torch -from pytorch_lightning import Trainer, seed_everything +from pytorch_lightning import seed_everything, Trainer from pytorch_lightning.plugins.ddp_plugin import DDPPlugin from pytorch_lightning.plugins.sharded_plugin import DDPShardedPlugin from pytorch_lightning.utilities import FAIRSCALE_AVAILABLE, NATIVE_AMP_AVAILABLE diff --git a/pyproject.toml b/pyproject.toml index 760421a56ece8..01e416aa51d8b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,7 +16,7 @@ exclude = "(.eggs|.git|.hg|.mypy_cache|.nox|.tox|.venv|.svn|_build|buck-out|buil [tool.isort] known_first_party = [ - "bencharmks", + "benchmarks", "docs", "pl_examples", "pytorch_lightning", @@ -52,3 +52,5 @@ skip_glob = [ ] profile = "black" line_length = 120 +force_sort_within_sections = "True" +order_by_type = "False" diff --git a/pytorch_lightning/setup_tools.py b/pytorch_lightning/setup_tools.py index 3842bbe50cfc5..29ac3b814b3c2 100644 --- a/pytorch_lightning/setup_tools.py +++ b/pytorch_lightning/setup_tools.py @@ -14,12 +14,12 @@ # limitations under the License. 
import os import re -import warnings from typing import Iterable, List from urllib.error import HTTPError, URLError from urllib.request import Request, urlopen +import warnings -from pytorch_lightning import PROJECT_ROOT, __homepage__, __version__ +from pytorch_lightning import __homepage__, __version__, PROJECT_ROOT _PATH_BADGES = os.path.join('.', 'docs', 'source', '_images', 'badges') # badge to download diff --git a/tests/conftest.py b/tests/conftest.py index 07188fed4dbed..c6a14a99b2478 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys -import threading from functools import partial, wraps from http.server import SimpleHTTPRequestHandler +import sys +import threading import pytest import torch.multiprocessing as mp diff --git a/tests/test_profiler.py b/tests/test_profiler.py index 4728b11582dfc..91a8631a73287 100644 --- a/tests/test_profiler.py +++ b/tests/test_profiler.py @@ -13,8 +13,8 @@ # limitations under the License. import os -import time from pathlib import Path +import time import numpy as np import pytest From b4d926baf25d1c02e3fdedafee85526b8ee1ea97 Mon Sep 17 00:00:00 2001 From: Loi Ly Date: Wed, 16 Dec 2020 13:44:30 +0700 Subject: [PATCH 009/136] Fix reset TensorRunningAccum (#5106) * Fix reset TensorRunningAccum * add test for TensorRunningAccum's reset method * fix CI failed due to PEP8 Co-authored-by: Rohit Gupta --- pytorch_lightning/trainer/supporters.py | 2 +- tests/trainer/test_supporters.py | 38 +++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 1 deletion(-) create mode 100644 tests/trainer/test_supporters.py diff --git a/pytorch_lightning/trainer/supporters.py b/pytorch_lightning/trainer/supporters.py index 57747be0d51fb..b2ba92846b241 100644 --- a/pytorch_lightning/trainer/supporters.py +++ b/pytorch_lightning/trainer/supporters.py @@ -50,7 +50,7 @@ def __init__(self, window_length: int): def reset(self) -> None: """Empty the accumulator.""" - self = TensorRunningAccum(self.window_length) + self.__init__(self.window_length) def last(self): """Get the last added element.""" diff --git a/tests/trainer/test_supporters.py b/tests/trainer/test_supporters.py new file mode 100644 index 0000000000000..b8a0e066cdef8 --- /dev/null +++ b/tests/trainer/test_supporters.py @@ -0,0 +1,38 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import pytest +import torch + +from pytorch_lightning.trainer.supporters import TensorRunningAccum + + +def test_tensor_running_accum_reset(): + """ Test that reset would set all attributes to the initialization state """ + + window_length = 10 + + accum = TensorRunningAccum(window_length=window_length) + assert accum.last() is None + assert accum.mean() is None + + accum.append(torch.tensor(1.5)) + assert accum.last() == torch.tensor(1.5) + assert accum.mean() == torch.tensor(1.5) + + accum.reset() + assert accum.window_length == window_length + assert accum.memory is None + assert accum.current_idx == 0 + assert accum.last_idx is None + assert not accum.rotated From 94838d3be2b007746a432972c7a96085731df1f9 Mon Sep 17 00:00:00 2001 From: ananthsub Date: Wed, 16 Dec 2020 12:07:11 -0800 Subject: [PATCH 010/136] Fix hang in DDP HPC accelerators (#5157) * Fix hang in DDP HPC accelerators init_device was never called * Update CHANGELOG.md --- CHANGELOG.md | 1 + pytorch_lightning/accelerators/ddp_cpu_hpc_accelerator.py | 3 +++ pytorch_lightning/accelerators/ddp_hpc_accelerator.py | 1 + 3 files changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 894d43d976f08..7ce51cdf014c4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -32,6 +32,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Do not warn when the `name` key is used in the `lr_scheduler` dict ([#5057](https://github.com/PyTorchLightning/pytorch-lightning/pull/5057)) +- Fixed `DDPHPCAccelerator` hangs in DDP construction by calling `init_device` ([#5157](https://github.com/PyTorchLightning/pytorch-lightning/pull/5157)) ## [1.1.0] - 2020-12-09 diff --git a/pytorch_lightning/accelerators/ddp_cpu_hpc_accelerator.py b/pytorch_lightning/accelerators/ddp_cpu_hpc_accelerator.py index a0545a4604aec..b9a71ed271744 100644 --- a/pytorch_lightning/accelerators/ddp_cpu_hpc_accelerator.py +++ b/pytorch_lightning/accelerators/ddp_cpu_hpc_accelerator.py @@ -48,3 +48,6 @@ def model_to_device(self, model, process_idx): def get_device_ids(self): device_ids = None return device_ids + + def init_device(self, process_idx): + pass diff --git a/pytorch_lightning/accelerators/ddp_hpc_accelerator.py b/pytorch_lightning/accelerators/ddp_hpc_accelerator.py index ec4c087998614..b257884e34aef 100644 --- a/pytorch_lightning/accelerators/ddp_hpc_accelerator.py +++ b/pytorch_lightning/accelerators/ddp_hpc_accelerator.py @@ -126,6 +126,7 @@ def ddp_train(self, process_idx, model): """ # determine which process we are and world size self.set_world_ranks(process_idx) + self.init_device(process_idx) # toggle prog bar if (self.trainer.node_rank != 0 or process_idx != 0) and self.trainer.progress_bar_callback is not None: From 140c37a56a5c51f62855332b42509aa5e3aa8ab1 Mon Sep 17 00:00:00 2001 From: chaton Date: Wed, 16 Dec 2020 22:06:54 +0100 Subject: [PATCH 011/136] support number for logging with sync_dist=True (#5080) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * support number * add two tests * wip * add ddp in special test * remove a test * move device to bottom * simplify test * update test * Update pytorch_lightning/core/step_result.py Co-authored-by: Carlos Mocholí * resolve sync_ddp Co-authored-by: Carlos Mocholí --- pytorch_lightning/core/lightning.py | 1 + pytorch_lightning/core/step_result.py | 12 ++++-- pytorch_lightning/utilities/distributed.py | 15 ++++--- tests/special_tests.sh | 2 +- .../test_train_loop_logging_1_0.py | 39 +++++++++++++++++++ 5 files changed, 56 
insertions(+), 13 deletions(-) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 8019d865c0ca0..f0c2201141d95 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -279,6 +279,7 @@ def log( sync_dist_group, accelerator.sync_tensor, self._current_dataloader_idx, + self.device, ) def log_dict( diff --git a/pytorch_lightning/core/step_result.py b/pytorch_lightning/core/step_result.py index 142fe9048cb0e..a18eef2d406d7 100644 --- a/pytorch_lightning/core/step_result.py +++ b/pytorch_lightning/core/step_result.py @@ -15,15 +15,15 @@ """[Train, Eval]Result for easier logging, checkpointing, early stopping, epoch-wise reduction.""" import numbers +import os from copy import copy -from typing import Optional, Dict, Union, Sequence, Callable, MutableMapping, Any, List, Tuple, Iterable +from typing import Any, Callable, Dict, Iterable, List, MutableMapping, Optional, Sequence, Tuple, Union import torch from torch import Tensor -import os -from pytorch_lightning.utilities.distributed import sync_ddp_if_available from pytorch_lightning.metrics import Metric +from pytorch_lightning.utilities.distributed import sync_ddp_if_available class Result(Dict): @@ -128,6 +128,7 @@ def log( sync_dist_group: Optional[Any] = None, sync_fn: Callable = None, dataloader_idx: Optional[int] = None, + device: torch.device = None, ): # no metrics should be logged with graphs if not enable_graph and isinstance(value, torch.Tensor): @@ -138,7 +139,10 @@ def log( if sync_dist and isinstance(value, (torch.Tensor, numbers.Number)): is_dist_initialized = torch.distributed.is_available() and torch.distributed.is_initialized() # TODO: Find a way to make the reduction only once, so we don't need to clone. - value = value.clone() if is_dist_initialized else value + if is_dist_initialized and isinstance(value, torch.Tensor): + value = value.clone() + else: + value = torch.tensor(value, device=device, dtype=torch.float) value = sync_fn(value, group=sync_dist_group, reduce_op=sync_dist_op) if 'meta' not in self: diff --git a/pytorch_lightning/utilities/distributed.py b/pytorch_lightning/utilities/distributed.py index 9724f05247c00..c315c6633b6fb 100644 --- a/pytorch_lightning/utilities/distributed.py +++ b/pytorch_lightning/utilities/distributed.py @@ -15,14 +15,14 @@ import os import warnings from functools import wraps +from typing import Any, Optional, Union import torch + from pytorch_lightning import _logger as log -from typing import Union, Optional, Any if torch.distributed.is_available(): - from torch.distributed import ReduceOp - from torch.distributed import group + from torch.distributed import ReduceOp, group else: class ReduceOp: SUM = None @@ -145,15 +145,14 @@ def sync_ddp( if group is None: group = torch.distributed.group.WORLD - if reduce_op is None: - reduce_op = torch.distributed.ReduceOp.SUM - elif isinstance(reduce_op, str) and reduce_op in ("avg", "mean"): - reduce_op = torch.distributed.ReduceOp.SUM + op = reduce_op if isinstance(reduce_op, ReduceOp) else ReduceOp.SUM + + if isinstance(reduce_op, str) and reduce_op.lower() in ("avg", "mean"): divide_by_world_size = True # sync all processes before reduction torch.distributed.barrier(group=group) - torch.distributed.all_reduce(result, op=reduce_op, group=group, async_op=False) + torch.distributed.all_reduce(result, op=op, group=group, async_op=False) if divide_by_world_size: result = result / torch.distributed.get_world_size(group) diff --git a/tests/special_tests.sh 
b/tests/special_tests.sh index f7cb581951783..950e3776bbc7f 100644 --- a/tests/special_tests.sh +++ b/tests/special_tests.sh @@ -19,4 +19,4 @@ python ${DEFAULTS} tests/plugins/test_rpc_plugin.py::test_rpc_function_calls_ddp python ${DEFAULTS} tests/plugins/test_ddp_sequential_plugin.py::test_ddp_sequential_plugin_ddp_rpc_manual python ${DEFAULTS} tests/plugins/test_ddp_sequential_plugin.py::test_ddp_sequential_plugin_ddp_rpc_manual_amp python ${DEFAULTS} tests/plugins/test_ddp_sequential_plugin.py::test_ddp_sequential_plugin_ddp_rpc_automatic -# python ${DEFAULTS} tests/plugins/test_ddp_sequential_plugin.py::test_ddp_sequential_plugin_ddp_rpc_with_wrong_balance +python ${DEFAULTS} tests/trainer/logging_tests/test_train_loop_logging_1_0.py::test_logging_sync_dist_true_ddp diff --git a/tests/trainer/logging_tests/test_train_loop_logging_1_0.py b/tests/trainer/logging_tests/test_train_loop_logging_1_0.py index 0c27d8909d760..92b8e5a916474 100644 --- a/tests/trainer/logging_tests/test_train_loop_logging_1_0.py +++ b/tests/trainer/logging_tests/test_train_loop_logging_1_0.py @@ -18,6 +18,7 @@ import collections import itertools import os +import platform from unittest import mock import numpy as np @@ -685,6 +686,7 @@ class TestModel(BoringModel): def training_step(self, batch, batch_idx): acc = self.step(batch[0]) self.log('foo', torch.tensor(fake_result), on_step=False, on_epoch=True, sync_dist=True, sync_dist_op='sum') + self.log('foo_2', 2, on_step=False, on_epoch=True, sync_dist=True, sync_dist_op='sum') return acc def validation_step(self, batch, batch_idx): @@ -704,9 +706,46 @@ def validation_step(self, batch, batch_idx): trainer.fit(model) assert trainer.logged_metrics['foo'] == fake_result + assert trainer.logged_metrics['foo_2'] == 2 assert trainer.logged_metrics['bar'] == fake_result +@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") +@pytest.mark.skipif(not os.getenv("PL_RUNNING_SPECIAL_TESTS", '0') == '1', + reason="test should be run outside of pytest") +def test_logging_sync_dist_true_ddp(tmpdir): + """ + Tests to ensure that the sync_dist flag works with ddp + """ + class TestLoggingSyncDistModel(BoringModel): + def training_step(self, batch, batch_idx): + acc = self.step(batch[0]) + self.log('foo', 1, on_step=False, on_epoch=True, sync_dist=True, sync_dist_op='SUM') + return acc + + def validation_step(self, batch, batch_idx): + self.training_step_called = True + output = self.layer(batch) + loss = self.loss(batch, output) + self.log('bar', 2, on_step=False, on_epoch=True, sync_dist=True, sync_dist_op='AVG') + return {"x": loss} + + model = TestLoggingSyncDistModel() + trainer = Trainer( + default_root_dir=tmpdir, + limit_train_batches=1, + limit_val_batches=1, + max_epochs=2, + weights_summary=None, + accelerator="ddp", + gpus=2, + ) + trainer.fit(model) + + assert trainer.logged_metrics['foo'] == 2 + assert trainer.logged_metrics['bar'] == 2 + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") def test_logging_sync_dist_true_gpu(tmpdir): """ From 61e3981a6ea63e1a2997b6a7b5d262669f2e8c6b Mon Sep 17 00:00:00 2001 From: chaton Date: Wed, 16 Dec 2020 22:07:17 +0100 Subject: [PATCH 012/136] Un-balanced logging properly supported (#5119) * resolve bug * clean code * resolve comments * Update tests/trainer/optimization/test_multiple_optimizers.py Co-authored-by: Rohit Gupta * resolve another bug * add comments * use abs to find diff * update * resolve flake8 Co-authored-by: Rohit Gupta --- 
.../logger_connector/epoch_result_store.py | 26 ++++---- .../optimization/test_multiple_optimizers.py | 63 +++++++++++++++++++ 2 files changed, 78 insertions(+), 11 deletions(-) create mode 100644 tests/trainer/optimization/test_multiple_optimizers.py diff --git a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py index 28025859814cc..6d206f3dd929e 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py @@ -91,11 +91,13 @@ def check_dataloader_idx(self, result: Result) -> bool: random_key = list(result.keys())[-1] return result["meta"][random_key]["dataloader_idx"] is not None - def get_latest_from_func_name(self, latest_result, func_name: str, *args, **kwargs) -> Dict: + def get_latest_from_func_name(self, latest_result_opt, func_name: str, *args, **kwargs) -> Dict: results = {} - add_dataloader_idx = self.check_dataloader_idx(latest_result) - func = getattr(latest_result, func_name) - results.update(func(*args, add_dataloader_idx=add_dataloader_idx, **kwargs)) + for opt_idx in latest_result_opt: + latest_result = latest_result_opt[opt_idx] + add_dataloader_idx = self.check_dataloader_idx(latest_result) + func = getattr(latest_result, func_name) + results.update(func(*args, add_dataloader_idx=add_dataloader_idx, **kwargs)) return results def run_latest_batch_metrics_with_func_name(self, func_name, *args, **kwargs) -> List[Dict]: @@ -156,6 +158,7 @@ def append(self, result, dataloader_idx: Optional[int] = None, extra_info: Optio assert isinstance(result, Result) if dataloader_idx is None: dataloader_idx = 0 + if extra_info is None: extra_info = {} @@ -166,6 +169,7 @@ def append(self, result, dataloader_idx: Optional[int] = None, extra_info: Optio if dataloader_idx not in self._internals: self._internals[dataloader_idx] = {} self._internals_reduced[dataloader_idx] = defaultdict(dict) + self._latest_ref[dataloader_idx] = {} # extract infos opt_idx = extra_info["opt_idx"] @@ -173,7 +177,7 @@ def append(self, result, dataloader_idx: Optional[int] = None, extra_info: Optio self._append_to_structure(self._internals[dataloader_idx], opt_idx, batch_idx, result) - self._latest_ref[dataloader_idx] = result + self._latest_ref[dataloader_idx][opt_idx] = result # [dataloader_idx] is a list else: @@ -181,7 +185,11 @@ def append(self, result, dataloader_idx: Optional[int] = None, extra_info: Optio self._internals.setdefault(dataloader_idx, []) self._internals[dataloader_idx].append(result) - self._latest_ref[dataloader_idx] = result + if dataloader_idx not in self._latest_ref: + self._latest_ref[dataloader_idx] = {} + self._latest_ref[dataloader_idx][0] = {} + + self._latest_ref[dataloader_idx][0] = result def auto_reduce_results_on_epoch_end(self) -> None: """ @@ -206,13 +214,9 @@ def auto_reduce_results_on_epoch_end(self) -> None: # TODO: How to start training in middle of epoch opt_outputs = epoch_metrics[opt_idx] - num_batch_idx = len(self._internals[dl_idx][num_opt_idx]) - 1 - assert num_batch_idx >= 0 - batch_indexes = self._internals[dl_idx][num_opt_idx].keys() - # reduce across time first time_reduced_outputs = [] - for batch_idx in batch_indexes: + for batch_idx in opt_outputs.keys(): tbptt_outs = opt_outputs[batch_idx] tbptt_outs = tbptt_outs[0].__class__.reduce_across_time(tbptt_outs) if len(tbptt_outs) > 1: diff --git a/tests/trainer/optimization/test_multiple_optimizers.py 
b/tests/trainer/optimization/test_multiple_optimizers.py new file mode 100644 index 0000000000000..78b6f8f7ff84a --- /dev/null +++ b/tests/trainer/optimization/test_multiple_optimizers.py @@ -0,0 +1,63 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Tests to ensure that the behaviours related to multiple optimizers works +""" +import torch + +import pytorch_lightning as pl +from tests.base.boring_model import BoringModel + + +def test_unbalanced_logging_with_multiple_optimizers(tmpdir): + """ + This tests ensures reduction works in un-balanced logging settings + """ + class TestModel(BoringModel): + + loss_1 = [] + loss_2 = [] + + def training_step(self, batch, batch_idx, optimizer_idx): + output = self.layer(batch) + loss = self.loss(batch, output) + if optimizer_idx == 0 and self.trainer.global_step > 10: + self.log("loss_1", loss, on_epoch=True, prog_bar=True) + self.loss_1.append(loss.detach().clone()) + elif optimizer_idx == 1: + self.log("loss_2", loss, on_epoch=True, prog_bar=True) + self.loss_2.append(loss.detach().clone()) + return {"loss": loss} + + def configure_optimizers(self): + optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.001) + optimizer2 = torch.optim.SGD(self.layer.parameters(), lr=0.001) + return [optimizer, optimizer2] + + model = TestModel() + model.training_epoch_end = None + + # Initialize a trainer + trainer = pl.Trainer( + default_root_dir=tmpdir, + max_epochs=1, + ) + + trainer.fit(model) + + assert torch.equal(trainer.callback_metrics["loss_2_step"], model.loss_2[-1]) + assert torch.equal(trainer.callback_metrics["loss_1_step"], model.loss_1[-1]) + # test loss are properly reduced + assert torch.abs(trainer.callback_metrics["loss_2_epoch"] - torch.FloatTensor(model.loss_2).mean()) < 1e-6 + assert torch.abs(trainer.callback_metrics["loss_1_epoch"] - torch.FloatTensor(model.loss_1).mean()) < 1e-6 From 8d1ca4ca13c1f694e56a6658a9d4d5ac170320de Mon Sep 17 00:00:00 2001 From: chaton Date: Wed, 16 Dec 2020 22:07:35 +0100 Subject: [PATCH 013/136] [bugfix] remove nan loss in manual optimization (#5121) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * remove nan loss whe missing * Update pytorch_lightning/core/lightning.py Co-authored-by: Carlos Mocholí * Apply suggestions from code review Co-authored-by: Carlos Mocholí Co-authored-by: Rohit Gupta --- pytorch_lightning/core/lightning.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index f0c2201141d95..ab66435a2935d 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -1393,12 +1393,15 @@ def get_progress_bar_dict(self): """ # call .item() only once but store elements without graphs running_train_loss = self.trainer.train_loop.running_loss.mean() - avg_training_loss = ( - running_train_loss.cpu().item() - if running_train_loss is not None - else float("NaN") - ) - tqdm_dict 
= {"loss": "{:.3g}".format(avg_training_loss)} + avg_training_loss = None + if running_train_loss is not None: + avg_training_loss = running_train_loss.cpu().item() + elif self.trainer.train_loop.automatic_optimization: + avg_training_loss = float('NaN') + + tqdm_dict = {} + if avg_training_loss is not None: + tqdm_dict["loss"] = f"{avg_training_loss:.3g}" if self.trainer.truncated_bptt_steps is not None: tqdm_dict["split_idx"] = self.trainer.split_idx From 81fd33b431578129de6ea19b99af7da87b1081bf Mon Sep 17 00:00:00 2001 From: chaton Date: Wed, 16 Dec 2020 22:08:06 +0100 Subject: [PATCH 014/136] [bug-fix] Metric reduction with Logging (#5150) * add test * resolve bug * udpate test * wrongly copy / paste * update test * resolve a second bug Co-authored-by: Ubuntu --- pytorch_lightning/callbacks/early_stopping.py | 11 +++-- .../callbacks/model_checkpoint.py | 11 +++-- pytorch_lightning/core/step_result.py | 5 +- .../test_train_loop_logging_1_0.py | 49 ++++++++++++++++++- 4 files changed, 67 insertions(+), 9 deletions(-) diff --git a/pytorch_lightning/callbacks/early_stopping.py b/pytorch_lightning/callbacks/early_stopping.py index 88f1881643c9a..4125a924cb2c5 100644 --- a/pytorch_lightning/callbacks/early_stopping.py +++ b/pytorch_lightning/callbacks/early_stopping.py @@ -19,6 +19,7 @@ Monitor a metric and stop training when it stops improving. """ +import numbers import os import numpy as np @@ -26,7 +27,8 @@ from pytorch_lightning import _logger as log from pytorch_lightning.callbacks.base import Callback -from pytorch_lightning.utilities import rank_zero_info, rank_zero_warn, TPU_AVAILABLE +from pytorch_lightning.metrics.metric import Metric +from pytorch_lightning.utilities import TPU_AVAILABLE, rank_zero_info, rank_zero_warn class EarlyStopping(Callback): @@ -201,8 +203,11 @@ def _run_early_stopping_check(self, trainer, pl_module): # when in dev debugging trainer.dev_debugger.track_early_stopping_history(self, current) - if not isinstance(current, torch.Tensor): - current = torch.tensor(current, device=pl_module.device) + if current is not None: + if isinstance(current, Metric): + current = current.compute() + elif isinstance(current, numbers.Number): + current = torch.tensor(current, device=pl_module.device, dtype=torch.float) if trainer.use_tpu and TPU_AVAILABLE: current = current.cpu() diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py index 4ac800f456c06..82df32ce3996c 100644 --- a/pytorch_lightning/callbacks/model_checkpoint.py +++ b/pytorch_lightning/callbacks/model_checkpoint.py @@ -20,6 +20,7 @@ """ +import numbers import os import re from copy import deepcopy @@ -32,8 +33,9 @@ from pytorch_lightning import _logger as log from pytorch_lightning.callbacks.base import Callback -from pytorch_lightning.utilities import rank_zero_info, rank_zero_only, rank_zero_warn +from pytorch_lightning.metrics.metric import Metric from pytorch_lightning.plugins.rpc_plugin import RPCPlugin +from pytorch_lightning.utilities import rank_zero_info, rank_zero_only, rank_zero_warn from pytorch_lightning.utilities.cloud_io import get_filesystem from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -580,8 +582,11 @@ def _save_top_k_checkpoints(self, trainer, pl_module, metrics): epoch = metrics.get("epoch") step = metrics.get("step") - if not isinstance(current, torch.Tensor) and current is not None: - current = torch.tensor(current, device=pl_module.device) + if current is not None: + if isinstance(current, Metric): + 
current = current.compute() + elif isinstance(current, numbers.Number): + current = torch.tensor(current, device=pl_module.device, dtype=torch.float) if self.check_monitor_top_k(current): self._update_best_and_save(current, epoch, step, trainer, pl_module, metrics) diff --git a/pytorch_lightning/core/step_result.py b/pytorch_lightning/core/step_result.py index a18eef2d406d7..b6112a68b4e9b 100644 --- a/pytorch_lightning/core/step_result.py +++ b/pytorch_lightning/core/step_result.py @@ -371,7 +371,10 @@ def get_forked_metrics(self, add_dataloader_idx=False): dl_key = self._add_dataloader_idx(k, options["dataloader_idx"], add_dataloader_idx) if options['forked']: - result[dl_key] = self[k] + if isinstance(self[k], Metric): + result[dl_key] = self[k].compute().detach() + else: + result[dl_key] = self[k] return result diff --git a/tests/trainer/logging_tests/test_train_loop_logging_1_0.py b/tests/trainer/logging_tests/test_train_loop_logging_1_0.py index 92b8e5a916474..51b9c2ac69496 100644 --- a/tests/trainer/logging_tests/test_train_loop_logging_1_0.py +++ b/tests/trainer/logging_tests/test_train_loop_logging_1_0.py @@ -27,8 +27,8 @@ from torch.utils.data import Dataset import pytorch_lightning as pl -from pytorch_lightning import Trainer, callbacks -from pytorch_lightning.callbacks import ModelCheckpoint +from pytorch_lightning import callbacks, Trainer +from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint from pytorch_lightning.core.lightning import LightningModule from tests.base.boring_model import BoringModel, RandomDictDataset, RandomDictStringDataset from tests.base.deterministic_model import DeterministicModel @@ -810,3 +810,48 @@ def on_train_epoch_end(self, *_): trainer.fit(model) assert model.epoch_end_called assert model.on_train_epoch_end_called + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="requires GPU machine") +def test_metric_are_properly_reduced(tmpdir): + class TestingModel(BoringModel): + def __init__(self, *args, **kwargs): + super().__init__() + self.train_acc = pl.metrics.Accuracy() + self.val_acc = pl.metrics.Accuracy() + + def training_step(self, batch, batch_idx): + self.train_acc(torch.rand(1, 3, device=self.device), torch.randint(0, 2, (1,), device=self.device)) + self.log('train_acc', self.train_acc, on_step=True, on_epoch=True) + return super().training_step(batch, batch_idx) + + def validation_step(self, batch, batch_idx): + preds = torch.tensor(0, device=self.device) + targets = torch.tensor(1, device=self.device) + if batch_idx < 8: + targets = preds + self.val_acc(preds, targets) + self.log('val_acc', self.val_acc, on_step=True, on_epoch=True) + return super().validation_step(batch, batch_idx) + + early_stop = EarlyStopping(monitor='val_acc', mode='max') + + checkpoint = ModelCheckpoint( + monitor='val_acc', + save_last=True, + save_top_k=2, + mode='max', + ) + + model = TestingModel() + trainer = Trainer( + default_root_dir=tmpdir, + gpus=1, + max_epochs=2, + limit_train_batches=5, + limit_val_batches=32, + callbacks=[early_stop, checkpoint]) + trainer.fit(model) + + assert trainer.callback_metrics["val_acc"] == 8 / 32. 
+ assert "train_acc" in trainer.callback_metrics From 3910b283399f1b7888eecc63ab88ac375c284752 Mon Sep 17 00:00:00 2001 From: Sean Naren Date: Thu, 17 Dec 2020 01:08:12 +0000 Subject: [PATCH 015/136] Disable pl optimizer temporarily to fix AMP issues (#5163) * Disable pl optimizer temporarily to fix AMP issues * Add todo and enable pl optimizer in the test --- pytorch_lightning/trainer/trainer.py | 2 +- tests/callbacks/test_callbacks.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 35da90625adef..5a837956bc4ce 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -133,7 +133,7 @@ def __init__( distributed_backend: Optional[str] = None, automatic_optimization: Optional[bool] = None, move_metrics_to_cpu: bool = False, - enable_pl_optimizer: bool = True, + enable_pl_optimizer: bool = False, ): r""" Customize every aspect of training via flags diff --git a/tests/callbacks/test_callbacks.py b/tests/callbacks/test_callbacks.py index c00c712bb3b13..070bb4e9f6989 100644 --- a/tests/callbacks/test_callbacks.py +++ b/tests/callbacks/test_callbacks.py @@ -33,6 +33,8 @@ def test_trainer_callback_system(torch_save): limit_train_batches=3, limit_test_batches=2, progress_bar_refresh_rate=0, + # todo: enabled since internally we wrap the model for optimizer step, this should be fixed + enable_pl_optimizer=True ) # no call yet From e5569a9f6c7bf80a7e23d7e1a10605279681be0a Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Thu, 17 Dec 2020 08:27:05 +0100 Subject: [PATCH 016/136] drop install FairScale for TPU (#5113) * drop install FairScale for TPU * typo Co-authored-by: Roger Shieh --- dockers/base-xla/Dockerfile | 2 ++ dockers/tpu-tests/Dockerfile | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/dockers/base-xla/Dockerfile b/dockers/base-xla/Dockerfile index 8eb093295c37b..5dfeac8c9e86e 100644 --- a/dockers/base-xla/Dockerfile +++ b/dockers/base-xla/Dockerfile @@ -97,6 +97,8 @@ RUN \ python -c "fname = 'requirements.txt' ; lines = [line for line in open(fname).readlines() if not line.startswith('torch')] ; open(fname, 'w').writelines(lines)" && \ # drop Horovod as it is not needed python -c "fname = 'requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if not line.startswith('horovod')] ; open(fname, 'w').writelines(lines)" && \ + # drop fairscale as it is not needed + python -c "fname = 'requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if 'fairscale' not in line] ; open(fname, 'w').writelines(lines)" && \ # drop TorchVision as it was installed with XLA python -c "fname = 'requirements/examples.txt' ; lines = [line for line in open(fname).readlines() if not line.startswith('torchvision')] ; open(fname, 'w').writelines(lines)" && \ pip install --requirement ./requirements/devel.txt --upgrade-strategy only-if-needed && \ diff --git a/dockers/tpu-tests/Dockerfile b/dockers/tpu-tests/Dockerfile index a514b1c3d35fe..464f7fd8f309e 100644 --- a/dockers/tpu-tests/Dockerfile +++ b/dockers/tpu-tests/Dockerfile @@ -27,8 +27,10 @@ COPY ./ ./pytorch-lightning/ RUN \ # Install pytorch-lightning at the current PR, plus dependencies. 
#pip install -r pytorch-lightning/requirements.txt --no-cache-dir && \ - # drop Horovod + # drop Horovod as it is not needed python -c "fname = 'pytorch-lightning/requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if not line.startswith('horovod')] ; open(fname, 'w').writelines(lines)" && \ + # drop fairscale as it is not needed + python -c "fname = 'pytorch-lightning/requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if 'fairscale' not in line] ; open(fname, 'w').writelines(lines)" && \ pip install -r pytorch-lightning/requirements/devel.txt --no-cache-dir --upgrade-strategy only-if-needed #RUN python -c "import pytorch_lightning as pl; print(pl.__version__)" From 405a8402b8a82b2bf4a0ebf982939c548e07a27f Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Thu, 17 Dec 2020 09:20:59 +0100 Subject: [PATCH 017/136] temporarily suspend all mergify rules (#5112) --- .mergify.yml | 112 +++++++++++++++++++++++++-------------------------- 1 file changed, 56 insertions(+), 56 deletions(-) diff --git a/.mergify.yml b/.mergify.yml index 44c48f2ddced5..cb5ef3ec7519a 100644 --- a/.mergify.yml +++ b/.mergify.yml @@ -12,59 +12,59 @@ # See the License for the specific language governing permissions and # limitations under the License. -pull_request_rules: - - - name: Automatic merge on approval - conditions: - - base=master - # number of review approvals - - "#approved-reviews-by>=3" - # no waiting or assigned review - - "#review-requested=0" - # no requested chnages from any reviewer - - "#changes-requested-reviews-by=0" - # this serves as ALL check has to pass as we have actually around 40 tests in total - - "#status-success>=54" - # this is just in case since we rely on GPU tests (note: redundand to the above) - - status-success=continuous-integration/drone/pr - - "status-success=ci/circleci: TPU-tests" - # this is patter-like, unofrunatly serves as `any(...)` (note: redundand to the above) - #- "status-success~=^ci/circleci:" - # no conflict with master branch - - -conflict - # was not closed yet - - -closed - # filter-out GH draft PRs - - -draft - actions: - delete_head_branch: {} - merge: - # https://doc.mergify.io/merge-action.html#strict-merge - # (on head branch) $ git merge --no-ff base - # (on head branch) # Wait for CI to go green - # (on head branch) # Squash all commits - # (on base branch) $ git merge --ff head - strict: true - method: squash - comment: - message: Great job! =) - - - name: warn on conflicts - conditions: - - conflict - # filter-out GH draft PRs - - -draft - actions: - comment: - message: This pull request is now in conflict... 
:( - - - name: add core reviewer - conditions: - # filter-out GH draft PRs - - -draft - # number of review approvals - - "#approved-reviews-by<3" - actions: - request_reviews: - teams: - - core-contributors +#pull_request_rules: +# +# - name: Automatic merge on approval +# conditions: +# - base=master +# # number of review approvals +# - "#approved-reviews-by>=3" +# # no waiting or assigned review +# - "#review-requested=0" +# # no requested chnages from any reviewer +# - "#changes-requested-reviews-by=0" +# # this serves as ALL check has to pass as we have actually around 40 tests in total +# - "#status-success>=54" +# # this is just in case since we rely on GPU tests (note: redundand to the above) +# - status-success=continuous-integration/drone/pr +# - "status-success=ci/circleci: TPU-tests" +# # this is patter-like, unofrunatly serves as `any(...)` (note: redundand to the above) +# #- "status-success~=^ci/circleci:" +# # no conflict with master branch +# - -conflict +# # was not closed yet +# - -closed +# # filter-out GH draft PRs +# - -draft +# actions: +# delete_head_branch: {} +# merge: +# # https://doc.mergify.io/merge-action.html#strict-merge +# # (on head branch) $ git merge --no-ff base +# # (on head branch) # Wait for CI to go green +# # (on head branch) # Squash all commits +# # (on base branch) $ git merge --ff head +# strict: true +# method: squash +# comment: +# message: Great job! =) +# +# - name: warn on conflicts +# conditions: +# - conflict +# # filter-out GH draft PRs +# - -draft +# actions: +# comment: +# message: This pull request is now in conflict... :( +# +# - name: add core reviewer +# conditions: +# # filter-out GH draft PRs +# - -draft +# # number of review approvals +# - "#approved-reviews-by<3" +# actions: +# request_reviews: +# teams: +# - core-contributors From a94f6622f50a66259cff2367154e4550ce5f1edc Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Thu, 17 Dec 2020 10:21:00 +0100 Subject: [PATCH 018/136] prune ecosystem example (#5085) * draft * wip * CI * drop pl geometry * copy * logo --- pl_examples/__init__.py | 37 ++ pl_examples/basic_examples/autoencoder.py | 3 +- .../backbone_image_classifier.py | 3 +- .../basic_examples/conv_sequential_example.py | 2 + .../basic_examples/dali_image_classifier.py | 3 +- .../basic_examples/simple_image_classifier.py | 2 + pl_examples/bug_report_model.py | 3 + .../computer_vision_fine_tuning.py | 15 + .../generative_adversarial_net.py | 15 + pl_examples/domain_templates/imagenet.py | 15 + .../domain_templates/reinforce_learn_Qnet.py | 15 + .../domain_templates/semantic_segmentation.py | 16 + pl_examples/domain_templates/unet.py | 14 + pl_examples/pytorch_ecosystem/__init__.py | 13 + .../pytorch_geometric/README.md | 38 -- .../pytorch_geometric/__init__.py | 0 .../pytorch_geometric/cora_dna.py | 375 ------------------ .../pytorch_geometric/lightning.py | 31 -- .../pytorch_geometric/pyproject.toml | 25 -- pl_examples/test_examples.py | 14 + requirements/examples.txt | 2 +- 21 files changed, 168 insertions(+), 473 deletions(-) create mode 100644 pl_examples/pytorch_ecosystem/__init__.py delete mode 100644 pl_examples/pytorch_ecosystem/pytorch_geometric/README.md delete mode 100644 pl_examples/pytorch_ecosystem/pytorch_geometric/__init__.py delete mode 100644 pl_examples/pytorch_ecosystem/pytorch_geometric/cora_dna.py delete mode 100644 pl_examples/pytorch_ecosystem/pytorch_geometric/lightning.py delete mode 100644 pl_examples/pytorch_ecosystem/pytorch_geometric/pyproject.toml diff --git a/pl_examples/__init__.py 
b/pl_examples/__init__.py index d7cec9fc1bc3a..147fc330ecd59 100644 --- a/pl_examples/__init__.py +++ b/pl_examples/__init__.py @@ -8,3 +8,40 @@ TORCHVISION_AVAILABLE = _module_available("torchvision") DALI_AVAILABLE = _module_available("nvidia.dali") + + +LIGHTNING_LOGO = """ + #### + ########### + #################### + ############################ + ##################################### +############################################## +######################### ################### +####################### ################### +#################### #################### +################## ##################### +################ ###################### +##################### ################# +###################### ################### +##################### ##################### +#################### ####################### +################### ######################### +############################################## + ##################################### + ############################ + #################### + ########## + #### +""" + + +def nice_print(msg, last=False): + print() + print("\033[0;35m" + msg + "\033[0m") + if last: + print() + + +def cli_lightning_logo(): + nice_print(LIGHTNING_LOGO) diff --git a/pl_examples/basic_examples/autoencoder.py b/pl_examples/basic_examples/autoencoder.py index 58a117a648458..72bfcb17c0872 100644 --- a/pl_examples/basic_examples/autoencoder.py +++ b/pl_examples/basic_examples/autoencoder.py @@ -21,7 +21,7 @@ from torch.utils.data import random_split import pytorch_lightning as pl -from pl_examples import TORCHVISION_AVAILABLE +from pl_examples import TORCHVISION_AVAILABLE, cli_lightning_logo if TORCHVISION_AVAILABLE: from torchvision.datasets.mnist import MNIST @@ -105,4 +105,5 @@ def cli_main(): if __name__ == '__main__': + cli_lightning_logo() cli_main() diff --git a/pl_examples/basic_examples/backbone_image_classifier.py b/pl_examples/basic_examples/backbone_image_classifier.py index 91a8481de7fd9..b0ca2efd5d76b 100644 --- a/pl_examples/basic_examples/backbone_image_classifier.py +++ b/pl_examples/basic_examples/backbone_image_classifier.py @@ -19,7 +19,7 @@ from torch.utils.data import DataLoader, random_split import pytorch_lightning as pl -from pl_examples import DATASETS_PATH, TORCHVISION_AVAILABLE +from pl_examples import DATASETS_PATH, TORCHVISION_AVAILABLE, cli_lightning_logo if TORCHVISION_AVAILABLE: from torchvision.datasets.mnist import MNIST @@ -125,4 +125,5 @@ def cli_main(): if __name__ == '__main__': + cli_lightning_logo() cli_main() diff --git a/pl_examples/basic_examples/conv_sequential_example.py b/pl_examples/basic_examples/conv_sequential_example.py index 36c8c2c1f69b3..06fddd689260f 100644 --- a/pl_examples/basic_examples/conv_sequential_example.py +++ b/pl_examples/basic_examples/conv_sequential_example.py @@ -29,6 +29,7 @@ import torchvision import pytorch_lightning as pl +from pl_examples import cli_lightning_logo from pytorch_lightning import Trainer from pytorch_lightning.metrics.functional import accuracy from pytorch_lightning.plugins.ddp_sequential_plugin import DDPSequentialPlugin @@ -190,6 +191,7 @@ def instantiate_datamodule(args): if __name__ == "__main__": + cli_lightning_logo() parser = ArgumentParser(description="Pipe Example") parser.add_argument("--use_ddp_sequential", action="store_true") parser = Trainer.add_argparse_args(parser) diff --git a/pl_examples/basic_examples/dali_image_classifier.py b/pl_examples/basic_examples/dali_image_classifier.py index 0a39f1cb9a9ae..9f3ba5e08b37e 100644 --- 
a/pl_examples/basic_examples/dali_image_classifier.py +++ b/pl_examples/basic_examples/dali_image_classifier.py @@ -22,7 +22,7 @@ from torch.utils.data import random_split import pytorch_lightning as pl -from pl_examples import TORCHVISION_AVAILABLE, DALI_AVAILABLE +from pl_examples import TORCHVISION_AVAILABLE, DALI_AVAILABLE, cli_lightning_logo if TORCHVISION_AVAILABLE: from torchvision.datasets.mnist import MNIST @@ -204,4 +204,5 @@ def cli_main(): if __name__ == "__main__": + cli_lightning_logo() cli_main() diff --git a/pl_examples/basic_examples/simple_image_classifier.py b/pl_examples/basic_examples/simple_image_classifier.py index a341728554d31..6b8457e0e4897 100644 --- a/pl_examples/basic_examples/simple_image_classifier.py +++ b/pl_examples/basic_examples/simple_image_classifier.py @@ -19,6 +19,7 @@ from torch.nn import functional as F import pytorch_lightning as pl +from pl_examples import cli_lightning_logo from pl_examples.basic_examples.mnist_datamodule import MNISTDataModule @@ -103,4 +104,5 @@ def cli_main(): if __name__ == '__main__': + cli_lightning_logo() cli_main() diff --git a/pl_examples/bug_report_model.py b/pl_examples/bug_report_model.py index dbea2013d1110..e2201db12f894 100644 --- a/pl_examples/bug_report_model.py +++ b/pl_examples/bug_report_model.py @@ -22,6 +22,8 @@ import os import torch from torch.utils.data import Dataset + +from pl_examples import cli_lightning_logo from pytorch_lightning import Trainer, LightningModule @@ -137,4 +139,5 @@ def on_train_epoch_start(self) -> None: if __name__ == '__main__': + cli_lightning_logo() run_test() diff --git a/pl_examples/domain_templates/computer_vision_fine_tuning.py b/pl_examples/domain_templates/computer_vision_fine_tuning.py index 21f6644b09a5b..1c60e3aa6d23f 100644 --- a/pl_examples/domain_templates/computer_vision_fine_tuning.py +++ b/pl_examples/domain_templates/computer_vision_fine_tuning.py @@ -1,3 +1,16 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """Computer vision example on Transfer Learning. This computer vision example illustrates how one could fine-tune a pre-trained @@ -40,6 +53,7 @@ from torchvision.datasets.utils import download_and_extract_archive import pytorch_lightning as pl +from pl_examples import cli_lightning_logo from pytorch_lightning import _logger as log BN_TYPES = (torch.nn.BatchNorm1d, torch.nn.BatchNorm2d, torch.nn.BatchNorm3d) @@ -451,4 +465,5 @@ def get_args() -> argparse.Namespace: if __name__ == '__main__': + cli_lightning_logo() main(get_args()) diff --git a/pl_examples/domain_templates/generative_adversarial_net.py b/pl_examples/domain_templates/generative_adversarial_net.py index 088b625e31d01..210a80721d9a9 100644 --- a/pl_examples/domain_templates/generative_adversarial_net.py +++ b/pl_examples/domain_templates/generative_adversarial_net.py @@ -1,3 +1,16 @@ +# Copyright The PyTorch Lightning team. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ To run this template just do: python generative_adversarial_net.py @@ -18,6 +31,7 @@ from torch.utils.data import DataLoader from torchvision.datasets import MNIST +from pl_examples import cli_lightning_logo from pytorch_lightning.core import LightningModule, LightningDataModule from pytorch_lightning.trainer import Trainer @@ -211,6 +225,7 @@ def main(args: Namespace) -> None: if __name__ == '__main__': + cli_lightning_logo() parser = ArgumentParser() # Add program level args, if any. diff --git a/pl_examples/domain_templates/imagenet.py b/pl_examples/domain_templates/imagenet.py index b7116547d389b..b1eea307478f9 100644 --- a/pl_examples/domain_templates/imagenet.py +++ b/pl_examples/domain_templates/imagenet.py @@ -1,3 +1,16 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ This example is largely adapted from https://github.com/pytorch/examples/blob/master/imagenet/main.py @@ -32,6 +45,7 @@ import torchvision.transforms as transforms import pytorch_lightning as pl +from pl_examples import cli_lightning_logo from pytorch_lightning.core import LightningModule @@ -246,4 +260,5 @@ def run_cli(): if __name__ == '__main__': + cli_lightning_logo() run_cli() diff --git a/pl_examples/domain_templates/reinforce_learn_Qnet.py b/pl_examples/domain_templates/reinforce_learn_Qnet.py index 4b01f83e36639..a8b9db095f377 100644 --- a/pl_examples/domain_templates/reinforce_learn_Qnet.py +++ b/pl_examples/domain_templates/reinforce_learn_Qnet.py @@ -1,3 +1,16 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
""" Deep Reinforcement Learning: Deep Q-network (DQN) @@ -33,6 +46,7 @@ from torch.utils.data.dataset import IterableDataset import pytorch_lightning as pl +from pl_examples import cli_lightning_logo class DQN(nn.Module): @@ -349,6 +363,7 @@ def main(args) -> None: if __name__ == '__main__': + cli_lightning_logo() torch.manual_seed(0) np.random.seed(0) diff --git a/pl_examples/domain_templates/semantic_segmentation.py b/pl_examples/domain_templates/semantic_segmentation.py index 4ca1ebc2aec76..08bdc1140916a 100644 --- a/pl_examples/domain_templates/semantic_segmentation.py +++ b/pl_examples/domain_templates/semantic_segmentation.py @@ -1,3 +1,17 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os import random from argparse import ArgumentParser, Namespace @@ -10,6 +24,7 @@ from torch.utils.data import DataLoader, Dataset import pytorch_lightning as pl +from pl_examples import cli_lightning_logo from pl_examples.domain_templates.unet import UNet from pytorch_lightning.loggers import WandbLogger @@ -225,6 +240,7 @@ def main(hparams: Namespace): if __name__ == '__main__': + cli_lightning_logo() parser = ArgumentParser() parser.add_argument("--data_path", type=str, help="path where dataset is stored") parser.add_argument("--gpus", type=int, default=-1, help="number of available GPUs") diff --git a/pl_examples/domain_templates/unet.py b/pl_examples/domain_templates/unet.py index 6117447e5ed33..20b4bdb2a4bf9 100644 --- a/pl_examples/domain_templates/unet.py +++ b/pl_examples/domain_templates/unet.py @@ -1,3 +1,17 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import torch import torch.nn as nn import torch.nn.functional as F diff --git a/pl_examples/pytorch_ecosystem/__init__.py b/pl_examples/pytorch_ecosystem/__init__.py new file mode 100644 index 0000000000000..d7aa17d7f8468 --- /dev/null +++ b/pl_examples/pytorch_ecosystem/__init__.py @@ -0,0 +1,13 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/pl_examples/pytorch_ecosystem/pytorch_geometric/README.md b/pl_examples/pytorch_ecosystem/pytorch_geometric/README.md deleted file mode 100644 index 5c9a42d5a8942..0000000000000 --- a/pl_examples/pytorch_ecosystem/pytorch_geometric/README.md +++ /dev/null @@ -1,38 +0,0 @@ -# [Pytorch Geometric](https://github.com/rusty1s/pytorch_geometric) examples with Lighting - -### Introduction - -PyTorch Geometric (PyG) is a geometric deep learning extension library for PyTorch. It relies on lower level libraries such as - -* PyTorch Cluster: A package consists of a small extension library of highly optimized graph cluster algorithms in Pytorch -* PyTorch Sparse: A package consists of a small extension library of optimized sparse matrix operations with autograd support in Pytorch -* PyTorch Scatter: A package consists of a small extension library of highly optimized sparse update (scatter and segment) operations for the use in PyTorch - -## Setup - -``` -pyenv install 3.7.8 -pyenv local 3.7.8 -python -m venv -source .venv/bin/activate -poetry install -``` - -Run example - -``` -python cora_dna.py -``` - -## Current example lists - -| `DATASET` | `MODEL` | `TASK` | DATASET DESCRIPTION | MODEL DESCRIPTION | | -| :---: | :---: | :---: | :---: | :---: | :---: | -| Cora | DNA | Node Classification | The citation network datasets "Cora", "CiteSeer" and "PubMed" from the "Revisiting Semi-Supervised Learning with Graph Embeddings" | The dynamic neighborhood aggregation operator from the "Just Jump: Towards Dynamic Neighborhood Aggregation in Graph Neural Networks" - - -## DATASET SIZES - -``` - 16M ./cora -``` diff --git a/pl_examples/pytorch_ecosystem/pytorch_geometric/__init__.py b/pl_examples/pytorch_ecosystem/pytorch_geometric/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/pl_examples/pytorch_ecosystem/pytorch_geometric/cora_dna.py b/pl_examples/pytorch_ecosystem/pytorch_geometric/cora_dna.py deleted file mode 100644 index e4e040ff7072e..0000000000000 --- a/pl_examples/pytorch_ecosystem/pytorch_geometric/cora_dna.py +++ /dev/null @@ -1,375 +0,0 @@ -"""Graph Convolution Example using Pytorch Geometric - -This example illustrates how one could train a graph convolution model with DNA Conv -on Cora Dataset using pytorch-lightning. This example will also demonstrate how this -model can be easily torch-scripted, thanks to Pytorch Geometric. 
-""" -# python imports -import os -import os.path as osp -import sys -from functools import partial -from collections import namedtuple -from argparse import ArgumentParser -from typing import List, Optional, NamedTuple - -# thrid parties libraries -import numpy as np -from torch import nn -import torch -from torch import Tensor -from torch.optim import Adam -import torch.nn.functional as F - -# Lightning imports -from pytorch_lightning import ( - Trainer, - LightningDataModule, - LightningModule -) -from pytorch_lightning.metrics import Accuracy - -try: - # Pytorch Geometric imports - from torch_geometric.nn import DNAConv, MessagePassing - from torch_geometric.data import DataLoader - from torch_geometric.datasets import Planetoid - import torch_geometric.transforms as T - from torch_geometric.data import NeighborSampler - from lightning import lightning_logo, nice_print -except Exception: - HAS_PYTORCH_GEOMETRIC = False -else: - HAS_PYTORCH_GEOMETRIC = True - - -# use to make model jittable -OptTensor = Optional[Tensor] -ListTensor = List[Tensor] - - -class TensorBatch(NamedTuple): - x: Tensor - edge_index: ListTensor - edge_attr: OptTensor - batch: OptTensor - -################################### -# LightningDataModule # -################################### - - -class CoraDataset(LightningDataModule): - - r"""The citation network datasets "Cora", "CiteSeer" and "PubMed" from the - `"Revisiting Semi-Supervised Learning with Graph Embeddings" - `_ paper. - Nodes represent documents and edges represent citation links. - Training, validation and test splits are given by binary masks. - c.f https://github.com/rusty1s/pytorch_geometric/blob/master/torch_geometric/datasets/planetoid.py - """ - - NAME = "cora" - - def __init__(self, - num_workers: int = 1, - batch_size: int = 8, - drop_last: bool = True, - pin_memory: bool = True, - num_layers: int = None): - super().__init__() - - assert num_layers is not None - - self._num_workers = num_workers - self._batch_size = batch_size - self._drop_last = drop_last - self._pin_memory = pin_memory - self._num_layers = num_layers - - self._transform = T.NormalizeFeatures() - - @property - def num_features(self): - return 1433 - - @property - def num_classes(self): - return 7 - - @property - def hyper_parameters(self): - # used to inform the model the dataset specifications - return {"num_features": self.num_features, "num_classes": self.num_classes} - - def prepare_data(self): - path = osp.join( - osp.dirname(osp.realpath(__file__)), "..", "..", "data", self.NAME - ) - self.dataset = Planetoid(path, self.NAME, transform=self._transform) - self.data = self.dataset[0] - - def create_neighbor_sampler(self, batch_size=2, stage=None): - # https://github.com/rusty1s/pytorch_geometric/tree/master/torch_geometric/data/sampler.py#L18 - return NeighborSampler( - self.data.edge_index, - # the nodes that should be considered for sampling. 
- node_idx=getattr(self.data, f"{stage}_mask"), - # -1 indicates all neighbors will be selected - sizes=[self._num_layers, -1], - num_workers=self._num_workers, - drop_last=self._drop_last, - pin_memory=self._pin_memory, - ) - - def train_dataloader(self): - return self.create_neighbor_sampler(stage="train") - - def validation_dataloader(self): - return self.create_neighbor_sampler(stage="val") - - def test_dataloader(self): - return self.create_neighbor_sampler(stage="test") - - def gather_data_and_convert_to_namedtuple(self, batch, batch_nb): - """ - This function will select features using node_idx - and create a NamedTuple Object. - """ - - usual_keys = ["x", "edge_index", "edge_attr", "batch"] - Batch: TensorBatch = namedtuple("Batch", usual_keys) - return ( - Batch( - self.data.x[batch[1]], - [e.edge_index for e in batch[2]], - None, - None, - ), - self.data.y[batch[1]], - ) - - @staticmethod - def add_argparse_args(parser): - parser.add_argument("--num_workers", type=int, default=1) - parser.add_argument("--batch_size", type=int, default=2) - parser.add_argument("--drop_last", default=True) - parser.add_argument("--pin_memory", default=True) - return parser - - -############################### -# LightningModule # -############################### - - -class DNAConvNet(LightningModule): - - r"""The dynamic neighborhood aggregation operator from the `"Just Jump: - Towards Dynamic Neighborhood Aggregation in Graph Neural Networks" - `_ paper - c.f https://github.com/rusty1s/pytorch_geometric/blob/master/torch_geometric/nn/conv/dna_conv.py#L172 - """ - - def __init__(self, - num_layers: int = 2, - hidden_channels: int = 128, - heads: int = 8, - groups: int = 16, - dropout: float = 0.8, - cached: bool = False, - num_features: int = None, - num_classes: int = None, - ): - super().__init__() - - assert num_features is not None - assert num_classes is not None - - # utils from Lightning to save __init__ arguments - self.save_hyperparameters() - hparams = self.hparams - - # Instantiate metrics - self.val_acc = Accuracy(hparams["num_classes"]) - self.test_acc = Accuracy(hparams["num_classes"]) - - # Define DNA graph convolution model - self.hidden_channels = hparams["hidden_channels"] - self.lin1 = nn.Linear(hparams["num_features"], hparams["hidden_channels"]) - - # Create ModuleList to hold all convolutions - self.convs = nn.ModuleList() - - # Iterate through the number of layers - for _ in range(hparams["num_layers"]): - - # Create a DNA Convolution - This graph convolution relies on MultiHead Attention mechanism - # to route information similar to Transformers. - # https://github.com/rusty1s/pytorch_geometric/blob/master/torch_geometric/nn/conv/dna_conv.py#L172 - self.convs.append( - DNAConv( - hparams["hidden_channels"], - hparams["heads"], - hparams["groups"], - dropout=hparams["dropout"], - cached=False, - ) - ) - # classification MLP - self.lin2 = nn.Linear(hparams["hidden_channels"], hparams["num_classes"], bias=False) - - def forward(self, batch: TensorBatch): - # batch needs to be typed for making this model jittable. 
- x = batch.x - x = F.relu(self.lin1(x)) - x = F.dropout(x, p=0.5, training=self.training) - x_all = x.view(-1, 1, self.hidden_channels) - - # iterate over all convolutions - for idx, conv in enumerate(self.convs): - # perform convolution using previously concatenated embedding - # through edge_index - x = F.relu(conv(x_all, batch.edge_index[idx])) - x = x.view(-1, 1, self.hidden_channels) - - # concatenate with previously computed embedding - x_all = torch.cat([x_all, x], dim=1) - - # extra latest layer embedding - x = x_all[:, -1] - - x = F.dropout(x, p=0.5, training=self.training) - - # return logits per nodes - return F.log_softmax(self.lin2(x), -1) - - def step(self, batch, batch_nb): - typed_batch, targets = self.gather_data_and_convert_to_namedtuple(batch, batch_nb) - logits = self(typed_batch) - return logits, targets - - def training_step(self, batch, batch_nb): - logits, targets = self.step(batch, batch_nb) - train_loss = F.nll_loss(logits, targets) - self.log("train_loss", train_loss, on_step=True, on_epoch=True, prog_bar=True) - return train_loss - - def validation_step(self, batch, batch_nb): - logits, targets = self.step(batch, batch_nb) - val_loss = F.nll_loss(logits, targets) - self.log("val_loss", val_loss, on_step=False, on_epoch=True, prog_bar=True) - self.log("val_acc", self.val_acc(logits, targets), on_step=False, on_epoch=True, prog_bar=True) - - def test_step(self, batch, batch_nb): - logits, targets = self.step(batch, batch_nb) - test_loss = F.nll_loss(logits, targets) - self.log("test_loss", test_loss, on_step=False, on_epoch=True, prog_bar=True) - self.log("test_acc", self.test_acc(logits, targets), on_step=False, on_epoch=True, prog_bar=True) - - # Use for jittable demonstration. - - def _convert_to_jittable(self, module): - for key, m in module._modules.items(): - if isinstance(m, MessagePassing) and m.jittable is not None: - # Pytorch Geometric MessagePassing implements a `.jittable` function - # which converts the current module into its jittable version. 
- module._modules[key] = m.jittable() - else: - self._convert_to_jittable(m) - return module - - def jittable(self): - for key, m in self._modules.items(): - self._modules[key] = self._convert_to_jittable(m) - - def configure_optimizers(self): - return Adam(self.parameters(), lr=1e-3) - - @staticmethod - def add_argparse_args(parser): - parser.add_argument("--num_layers", type=int, default=2) - parser.add_argument("--hidden_channels", type=int, default=128) - parser.add_argument("--heads", type=int, default=8) - parser.add_argument("--groups", type=int, default=16) - parser.add_argument("--dropout", type=float, default=0.8) - parser.add_argument("--cached", type=int, default=0) - parser.add_argument("--jit", default=True) - return parser - -################################# -# Instantiate Functions # -################################# - - -def instantiate_datamodule(args): - datamodule = CoraDataset( - num_workers=args.num_workers, - batch_size=args.batch_size, - drop_last=args.drop_last, - pin_memory=args.pin_memory, - num_layers=args.num_layers, - ) - return datamodule - - -def instantiate_model(args, datamodule): - model = DNAConvNet( - num_layers=args.num_layers, - hidden_channels=args.hidden_channels, - heads=args.heads, - groups=args.groups, - dropout=args.dropout, - # provide dataset specific arguments - **datamodule.hyper_parameters, - ) - if args.jit: - model.jittable() - - # Attached datamodule function to model - model.gather_data_and_convert_to_namedtuple = datamodule.gather_data_and_convert_to_namedtuple - return model - - -def get_single_batch(datamodule): - for batch in datamodule.test_dataloader(): - return datamodule.gather_data_and_convert_to_namedtuple(batch, 0) - -####################### -# Trainer Run # -####################### - - -def run(args): - - nice_print("You are about to train a TorchScripted Pytorch Geometric Lightning model !") - nice_print(lightning_logo) - - datamodule: LightningDataModule = instantiate_datamodule(args) - model: LightningModule = instantiate_model(args, datamodule) - trainer = Trainer.from_argparse_args(args) - trainer.fit(model, datamodule) - trainer.test() - - batch = get_single_batch(datamodule) - model.to_torchscript(file_path="model_trace.pt", - method='script', - example_inputs=batch) - - nice_print("Congratulations !") - nice_print("You trained your first TorchScripted Pytorch Geometric Lightning model !", last=True) - - -if __name__ == "__main__": - if not HAS_PYTORCH_GEOMETRIC: - print("Skip training. Pytorch Geometric isn't installed. 
Please, check README.md !") - - else: - parser = ArgumentParser(description="Pytorch Geometric Example") - parser = Trainer.add_argparse_args(parser) - parser = CoraDataset.add_argparse_args(parser) - parser = DNAConvNet.add_argparse_args(parser) - - cmd_line = '--max_epochs 1'.split(' ') - - run(parser.parse_args(cmd_line)) diff --git a/pl_examples/pytorch_ecosystem/pytorch_geometric/lightning.py b/pl_examples/pytorch_ecosystem/pytorch_geometric/lightning.py deleted file mode 100644 index 2c765d1449c57..0000000000000 --- a/pl_examples/pytorch_ecosystem/pytorch_geometric/lightning.py +++ /dev/null @@ -1,31 +0,0 @@ -def nice_print(msg, last=False): - print() - print("\033[0;35m" + msg + "\033[0m") - if last: - print() - - -lightning_logo = """ - #### - ########### - #################### - ############################ - ##################################### -############################################## -######################### ################### -####################### ################### -#################### #################### -################## ##################### -################ ###################### -##################### ################# -###################### ################### -##################### ##################### -#################### ####################### -################### ######################### -############################################## - ##################################### - ############################ - #################### - ########## - #### -""" diff --git a/pl_examples/pytorch_ecosystem/pytorch_geometric/pyproject.toml b/pl_examples/pytorch_ecosystem/pytorch_geometric/pyproject.toml deleted file mode 100644 index 99f516323e976..0000000000000 --- a/pl_examples/pytorch_ecosystem/pytorch_geometric/pyproject.toml +++ /dev/null @@ -1,25 +0,0 @@ -[tool.poetry] -name = "lightning-geometric" -version = "0.1.0" -description = "TorchScripted Pytorch Geometric Examples with Pytorch Lightning" -authors = ["Thomas Chaton "] - -[tool.poetry.dependencies] -python = "3.7.8" -torch = "^1.6.0" -torch-cluster = "^1.5.7" -torch-sparse = "^0.6.7" -torch-scatter = "^2.0.5" -torch-geometric = "^1.6.1" -pytorch-lightning = "^ 1.0.5" -openmesh = "^1.1.4" -torch-spline-conv = "^1.2.0" -tqdm = "^4.50.0" -pytest = "^6.1.0" - -[tool.poetry.dev-dependencies] -black = {version = "^20.8b1", allow-prereleases = true} - -[build-system] -requires = ["poetry>=0.12"] -build-backend = "poetry.masonry.api" diff --git a/pl_examples/test_examples.py b/pl_examples/test_examples.py index da21384190163..91145c5bd0d0b 100644 --- a/pl_examples/test_examples.py +++ b/pl_examples/test_examples.py @@ -1,3 +1,17 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import importlib import platform from unittest import mock diff --git a/requirements/examples.txt b/requirements/examples.txt index 6e48778cb222a..c87d10a39346f 100644 --- a/requirements/examples.txt +++ b/requirements/examples.txt @@ -1,2 +1,2 @@ torchvision>=0.4.1 -gym>=0.17.0 +gym>=0.17.0 \ No newline at end of file From 1b599ff39d0757729856ab9870f3771920e8aad1 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Thu, 17 Dec 2020 11:13:48 +0100 Subject: [PATCH 019/136] add doctests for example 1/n (#5079) * define tests * fix basic * fix gans * unet * test * drop * format * fix * revert Co-authored-by: Justus Schock <12886177+justusschock@users.noreply.github.com> Co-authored-by: Justus Schock <12886177+justusschock@users.noreply.github.com> --- pl_examples/basic_examples/autoencoder.py | 7 + .../backbone_image_classifier.py | 13 ++ .../basic_examples/conv_sequential_example.py | 6 + .../basic_examples/mnist_datamodule.py | 3 + .../basic_examples/simple_image_classifier.py | 7 + pl_examples/bug_report_model.py | 15 +- .../computer_vision_fine_tuning.py | 36 +++-- .../generative_adversarial_net.py | 63 ++++++-- pl_examples/domain_templates/imagenet.py | 20 ++- .../domain_templates/reinforce_learn_Qnet.py | 136 +++++++++++------- .../domain_templates/semantic_segmentation.py | 57 ++++---- pl_examples/domain_templates/unet.py | 52 +++++-- 12 files changed, 288 insertions(+), 127 deletions(-) diff --git a/pl_examples/basic_examples/autoencoder.py b/pl_examples/basic_examples/autoencoder.py index 72bfcb17c0872..91f7ac0a1569d 100644 --- a/pl_examples/basic_examples/autoencoder.py +++ b/pl_examples/basic_examples/autoencoder.py @@ -31,6 +31,13 @@ class LitAutoEncoder(pl.LightningModule): + """ + >>> LitAutoEncoder() # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE + LitAutoEncoder( + (encoder): ... + (decoder): ... + ) + """ def __init__(self): super().__init__() diff --git a/pl_examples/basic_examples/backbone_image_classifier.py b/pl_examples/basic_examples/backbone_image_classifier.py index b0ca2efd5d76b..bb1daad301d08 100644 --- a/pl_examples/basic_examples/backbone_image_classifier.py +++ b/pl_examples/basic_examples/backbone_image_classifier.py @@ -29,6 +29,13 @@ class Backbone(torch.nn.Module): + """ + >>> Backbone() # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE + Backbone( + (l1): Linear(...) + (l2): Linear(...) + ) + """ def __init__(self, hidden_dim=128): super().__init__() self.l1 = torch.nn.Linear(28 * 28, hidden_dim) @@ -42,6 +49,12 @@ def forward(self, x): class LitClassifier(pl.LightningModule): + """ + >>> LitClassifier(Backbone()) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE + LitClassifier( + (backbone): ... + ) + """ def __init__(self, backbone, learning_rate=1e-3): super().__init__() self.save_hyperparameters() diff --git a/pl_examples/basic_examples/conv_sequential_example.py b/pl_examples/basic_examples/conv_sequential_example.py index 06fddd689260f..39634084860c2 100644 --- a/pl_examples/basic_examples/conv_sequential_example.py +++ b/pl_examples/basic_examples/conv_sequential_example.py @@ -55,6 +55,12 @@ def forward(self, x): class LitResnet(pl.LightningModule): + """ + >>> LitResnet() # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE + LitResnet( + (sequential_module): Sequential(...) 
+ ) + """ def __init__(self, lr=0.05, batch_size=32, manual_optimization=False): super().__init__() diff --git a/pl_examples/basic_examples/mnist_datamodule.py b/pl_examples/basic_examples/mnist_datamodule.py index eb1415cf8b981..95e20d22e1fdd 100644 --- a/pl_examples/basic_examples/mnist_datamodule.py +++ b/pl_examples/basic_examples/mnist_datamodule.py @@ -29,6 +29,9 @@ class MNISTDataModule(LightningDataModule): """ Standard MNIST, train, val, test splits and transforms + + >>> MNISTDataModule() # doctest: +ELLIPSIS + <...mnist_datamodule.MNISTDataModule object at ...> """ name = "mnist" diff --git a/pl_examples/basic_examples/simple_image_classifier.py b/pl_examples/basic_examples/simple_image_classifier.py index 6b8457e0e4897..894eeea619ba9 100644 --- a/pl_examples/basic_examples/simple_image_classifier.py +++ b/pl_examples/basic_examples/simple_image_classifier.py @@ -24,6 +24,13 @@ class LitClassifier(pl.LightningModule): + """ + >>> LitClassifier() # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE + LitClassifier( + (l1): Linear(...) + (l2): Linear(...) + ) + """ def __init__(self, hidden_dim=128, learning_rate=1e-3): super().__init__() self.save_hyperparameters() diff --git a/pl_examples/bug_report_model.py b/pl_examples/bug_report_model.py index e2201db12f894..30345122e251f 100644 --- a/pl_examples/bug_report_model.py +++ b/pl_examples/bug_report_model.py @@ -28,6 +28,10 @@ class RandomDataset(Dataset): + """ + >>> RandomDataset(size=10, length=20) # doctest: +ELLIPSIS + <...bug_report_model.RandomDataset object at ...> + """ def __init__(self, size, length): self.len = length self.data = torch.randn(length, size) @@ -40,6 +44,12 @@ def __len__(self): class BoringModel(LightningModule): + """ + >>> BoringModel() # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE + BoringModel( + (layer): Linear(...) + ) + """ def __init__(self): """ @@ -113,10 +123,9 @@ def configure_optimizers(self): # parser = ArgumentParser() # args = parser.parse_args(opt) -def run_test(): +def test_run(): class TestModel(BoringModel): - def on_train_epoch_start(self) -> None: print('override any method to prove your bug') @@ -140,4 +149,4 @@ def on_train_epoch_start(self) -> None: if __name__ == '__main__': cli_lightning_logo() - run_test() + test_run() diff --git a/pl_examples/domain_templates/computer_vision_fine_tuning.py b/pl_examples/domain_templates/computer_vision_fine_tuning.py index 1c60e3aa6d23f..4392ac47e837f 100644 --- a/pl_examples/domain_templates/computer_vision_fine_tuning.py +++ b/pl_examples/domain_templates/computer_vision_fine_tuning.py @@ -159,20 +159,30 @@ def _unfreeze_and_add_param_group(module: Module, class TransferLearningModel(pl.LightningModule): """Transfer Learning with pre-trained ResNet50. - Args: - hparams: Model hyperparameters - dl_path: Path where the data will be downloaded + >>> with TemporaryDirectory(dir='.') as tmp_dir: + ... TransferLearningModel(tmp_dir) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE + TransferLearningModel( + (feature_extractor): Sequential(...) + (fc): Sequential(...) 
+ ) """ - def __init__(self, - dl_path: Union[str, Path], - backbone: str = 'resnet50', - train_bn: bool = True, - milestones: tuple = (5, 10), - batch_size: int = 8, - lr: float = 1e-2, - lr_scheduler_gamma: float = 1e-1, - num_workers: int = 6, **kwargs) -> None: - super().__init__() + def __init__( + self, + dl_path: Union[str, Path], + backbone: str = 'resnet50', + train_bn: bool = True, + milestones: tuple = (5, 10), + batch_size: int = 8, + lr: float = 1e-2, + lr_scheduler_gamma: float = 1e-1, + num_workers: int = 6, + **kwargs, + ) -> None: + """ + Args: + dl_path: Path where the data will be downloaded + """ + super().__init__(**kwargs) self.dl_path = dl_path self.backbone = backbone self.train_bn = train_bn diff --git a/pl_examples/domain_templates/generative_adversarial_net.py b/pl_examples/domain_templates/generative_adversarial_net.py index 210a80721d9a9..b0c324c193574 100644 --- a/pl_examples/domain_templates/generative_adversarial_net.py +++ b/pl_examples/domain_templates/generative_adversarial_net.py @@ -37,7 +37,13 @@ class Generator(nn.Module): - def __init__(self, latent_dim, img_shape): + """ + >>> Generator(img_shape=(1, 8, 8)) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE + Generator( + (model): Sequential(...) + ) + """ + def __init__(self, latent_dim: int = 100, img_shape: tuple = (1, 28, 28)): super().__init__() self.img_shape = img_shape @@ -64,6 +70,12 @@ def forward(self, z): class Discriminator(nn.Module): + """ + >>> Discriminator(img_shape=(1, 28, 28)) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE + Discriminator( + (model): Sequential(...) + ) + """ def __init__(self, img_shape): super().__init__() @@ -83,6 +95,37 @@ def forward(self, img): class GAN(LightningModule): + """ + >>> GAN(img_shape=(1, 8, 8)) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE + GAN( + (generator): Generator( + (model): Sequential(...) + ) + (discriminator): Discriminator( + (model): Sequential(...) 
+ ) + ) + """ + def __init__( + self, + img_shape: tuple = (1, 28, 28), + lr: float = 0.0002, + b1: float = 0.5, + b2: float = 0.999, + latent_dim: int = 100, + ): + super().__init__() + + self.save_hyperparameters() + + # networks + self.generator = Generator(latent_dim=self.hparams.latent_dim, img_shape=img_shape) + self.discriminator = Discriminator(img_shape=img_shape) + + self.validation_z = torch.randn(8, self.hparams.latent_dim) + + self.example_input_array = torch.zeros(2, self.hparams.latent_dim) + @staticmethod def add_argparse_args(parent_parser: ArgumentParser): parser = ArgumentParser(parents=[parent_parser], add_help=False) @@ -96,20 +139,6 @@ def add_argparse_args(parent_parser: ArgumentParser): return parser - def __init__(self, hparams: Namespace): - super().__init__() - - self.hparams = hparams - - # networks - mnist_shape = (1, 28, 28) - self.generator = Generator(latent_dim=self.hparams.latent_dim, img_shape=mnist_shape) - self.discriminator = Discriminator(img_shape=mnist_shape) - - self.validation_z = torch.randn(8, self.hparams.latent_dim) - - self.example_input_array = torch.zeros(2, self.hparams.latent_dim) - def forward(self, z): return self.generator(z) @@ -180,6 +209,10 @@ def on_epoch_end(self): class MNISTDataModule(LightningDataModule): + """ + >>> MNISTDataModule() # doctest: +ELLIPSIS + <...generative_adversarial_net.MNISTDataModule object at ...> + """ def __init__(self, batch_size: int = 64, data_path: str = os.getcwd(), num_workers: int = 4): super().__init__() self.batch_size = batch_size diff --git a/pl_examples/domain_templates/imagenet.py b/pl_examples/domain_templates/imagenet.py index b1eea307478f9..cc36f3542a1c8 100644 --- a/pl_examples/domain_templates/imagenet.py +++ b/pl_examples/domain_templates/imagenet.py @@ -50,6 +50,12 @@ class ImageNetLightningModel(LightningModule): + """ + >>> ImageNetLightningModel(data_path='missing') # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE + ImageNetLightningModel( + (model): ResNet(...) + ) + """ # pull out resnet names from torchvision models MODEL_NAMES = sorted( name for name in models.__dict__ @@ -58,14 +64,14 @@ class ImageNetLightningModel(LightningModule): def __init__( self, - arch: str, - pretrained: bool, - lr: float, - momentum: float, - weight_decay: int, data_path: str, - batch_size: int, - workers: int, + arch: str = 'resnet18', + pretrained: bool = False, + lr: float = 0.1, + momentum: float = 0.9, + weight_decay: float = 1e-4, + batch_size: int = 4, + workers: int = 2, **kwargs, ): super().__init__() diff --git a/pl_examples/domain_templates/reinforce_learn_Qnet.py b/pl_examples/domain_templates/reinforce_learn_Qnet.py index a8b9db095f377..6aee8bb6038c1 100644 --- a/pl_examples/domain_templates/reinforce_learn_Qnet.py +++ b/pl_examples/domain_templates/reinforce_learn_Qnet.py @@ -53,13 +53,19 @@ class DQN(nn.Module): """ Simple MLP network - Args: - obs_size: observation/state size of the environment - n_actions: number of discrete actions available in the environment - hidden_size: size of hidden layers + >>> DQN(10, 5) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE + DQN( + (net): Sequential(...) 
+ ) """ def __init__(self, obs_size: int, n_actions: int, hidden_size: int = 128): + """ + Args: + obs_size: observation/state size of the environment + n_actions: number of discrete actions available in the environment + hidden_size: size of hidden layers + """ super(DQN, self).__init__() self.net = nn.Sequential( nn.Linear(obs_size, hidden_size), @@ -81,11 +87,15 @@ class ReplayBuffer: """ Replay Buffer for storing past experiences allowing the agent to learn from them - Args: - capacity: size of the buffer + >>> ReplayBuffer(5) # doctest: +ELLIPSIS + <...reinforce_learn_Qnet.ReplayBuffer object at ...> """ def __init__(self, capacity: int) -> None: + """ + Args: + capacity: size of the buffer + """ self.buffer = deque(maxlen=capacity) def __len__(self) -> int: @@ -113,12 +123,16 @@ class RLDataset(IterableDataset): Iterable Dataset containing the ExperienceBuffer which will be updated with new experiences during training - Args: - buffer: replay buffer - sample_size: number of experiences to sample at a time + >>> RLDataset(ReplayBuffer(5)) # doctest: +ELLIPSIS + <...reinforce_learn_Qnet.RLDataset object at ...> """ def __init__(self, buffer: ReplayBuffer, sample_size: int = 200) -> None: + """ + Args: + buffer: replay buffer + sample_size: number of experiences to sample at a time + """ self.buffer = buffer self.sample_size = sample_size @@ -132,12 +146,18 @@ class Agent: """ Base Agent class handling the interaction with the environment - Args: - env: training environment - replay_buffer: replay buffer storing experiences + >>> env = gym.make("CartPole-v0") + >>> buffer = ReplayBuffer(10) + >>> Agent(env, buffer) # doctest: +ELLIPSIS + <...reinforce_learn_Qnet.Agent object at ...> """ def __init__(self, env: gym.Env, replay_buffer: ReplayBuffer) -> None: + """ + Args: + env: training environment + replay_buffer: replay buffer storing experiences + """ self.env = env self.replay_buffer = replay_buffer self.reset() @@ -204,20 +224,34 @@ def play_step(self, net: nn.Module, epsilon: float = 0.0, device: str = 'cpu') - class DQNLightning(pl.LightningModule): - """ Basic DQN Model """ - - def __init__(self, - replay_size, - warm_start_steps: int, - gamma: float, - eps_start: int, - eps_end: int, - eps_last_frame: int, - sync_rate, - lr: float, - episode_length, - batch_size, **kwargs) -> None: - super().__init__() + """ Basic DQN Model + + >>> DQNLightning(env="CartPole-v0") # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE + DQNLightning( + (net): DQN( + (net): Sequential(...) + ) + (target_net): DQN( + (net): Sequential(...) 
+ ) + ) + """ + def __init__( + self, + env: str, + replay_size: int = 200, + warm_start_steps: int = 200, + gamma: float = 0.99, + eps_start: float = 1.0, + eps_end: float = 0.01, + eps_last_frame: int = 200, + sync_rate: int = 10, + lr: float = 1e-2, + episode_length: int = 50, + batch_size: int = 4, + **kwargs, + ) -> None: + super().__init__(**kwargs) self.replay_size = replay_size self.warm_start_steps = warm_start_steps self.gamma = gamma @@ -229,7 +263,7 @@ def __init__(self, self.episode_length = episode_length self.batch_size = batch_size - self.env = gym.make(self.env) + self.env = gym.make(env) obs_size = self.env.observation_space.shape[0] n_actions = self.env.action_space.n @@ -302,8 +336,7 @@ def training_step(self, batch: Tuple[torch.Tensor, torch.Tensor], nb_batch) -> O Training loss and log metrics """ device = self.get_device(batch) - epsilon = max(self.eps_end, self.eps_start - - self.global_step + 1 / self.eps_last_frame) + epsilon = max(self.eps_end, self.eps_start - self.global_step + 1 / self.eps_last_frame) # step through environment with agent reward, done = self.agent.play_step(self.net, epsilon, device) @@ -349,6 +382,30 @@ def get_device(self, batch) -> str: """Retrieve device currently being used by minibatch""" return batch[0].device.index if self.on_gpu else 'cpu' + @staticmethod + def add_model_specific_args(parent_parser): # pragma: no-cover + parser = argparse.ArgumentParser(parents=[parent_parser]) + parser.add_argument("--batch_size", type=int, default=16, help="size of the batches") + parser.add_argument("--lr", type=float, default=1e-2, help="learning rate") + parser.add_argument("--env", type=str, default="CartPole-v0", help="gym environment tag") + parser.add_argument("--gamma", type=float, default=0.99, help="discount factor") + parser.add_argument("--sync_rate", type=int, default=10, + help="how many frames do we update the target network") + parser.add_argument("--replay_size", type=int, default=1000, + help="capacity of the replay buffer") + parser.add_argument("--warm_start_size", type=int, default=1000, + help="how many samples do we use to fill our buffer at the start of training") + parser.add_argument("--eps_last_frame", type=int, default=1000, + help="what frame should epsilon stop decaying") + parser.add_argument("--eps_start", type=float, default=1.0, help="starting value of epsilon") + parser.add_argument("--eps_end", type=float, default=0.01, help="final value of epsilon") + parser.add_argument("--episode_length", type=int, default=200, help="max length of an episode") + parser.add_argument("--max_episode_reward", type=int, default=200, + help="max episode reward in the environment") + parser.add_argument("--warm_start_steps", type=int, default=1000, + help="max episode reward in the environment") + return parser + def main(args) -> None: model = DQNLightning(**vars(args)) @@ -368,26 +425,7 @@ def main(args) -> None: np.random.seed(0) parser = argparse.ArgumentParser() - parser.add_argument("--batch_size", type=int, default=16, help="size of the batches") - parser.add_argument("--lr", type=float, default=1e-2, help="learning rate") - parser.add_argument("--env", type=str, default="CartPole-v0", help="gym environment tag") - parser.add_argument("--gamma", type=float, default=0.99, help="discount factor") - parser.add_argument("--sync_rate", type=int, default=10, - help="how many frames do we update the target network") - parser.add_argument("--replay_size", type=int, default=1000, - help="capacity of the replay buffer") - 
parser.add_argument("--warm_start_size", type=int, default=1000, - help="how many samples do we use to fill our buffer at the start of training") - parser.add_argument("--eps_last_frame", type=int, default=1000, - help="what frame should epsilon stop decaying") - parser.add_argument("--eps_start", type=float, default=1.0, help="starting value of epsilon") - parser.add_argument("--eps_end", type=float, default=0.01, help="final value of epsilon") - parser.add_argument("--episode_length", type=int, default=200, help="max length of an episode") - parser.add_argument("--max_episode_reward", type=int, default=200, - help="max episode reward in the environment") - parser.add_argument("--warm_start_steps", type=int, default=1000, - help="max episode reward in the environment") - + parser = DQNLightning.add_model_specific_args(parser) args = parser.parse_args() main(args) diff --git a/pl_examples/domain_templates/semantic_segmentation.py b/pl_examples/domain_templates/semantic_segmentation.py index 08bdc1140916a..7bcad597a9a68 100644 --- a/pl_examples/domain_templates/semantic_segmentation.py +++ b/pl_examples/domain_templates/semantic_segmentation.py @@ -142,15 +142,17 @@ class SegModel(pl.LightningModule): Adam optimizer is used along with Cosine Annealing learning rate scheduler. """ - - def __init__(self, - data_path: str, - batch_size: int, - lr: float, - num_layers: int, - features_start: int, - bilinear: bool, **kwargs): - super().__init__() + def __init__( + self, + data_path: str, + batch_size: int = 4, + lr: float = 1e-3, + num_layers: int = 3, + features_start: int = 64, + bilinear: bool = False, + **kwargs, + ): + super().__init__(**kwargs) self.data_path = data_path self.batch_size = batch_size self.lr = lr @@ -204,6 +206,18 @@ def train_dataloader(self): def val_dataloader(self): return DataLoader(self.validset, batch_size=self.batch_size, shuffle=False) + @staticmethod + def add_model_specific_args(parent_parser): # pragma: no-cover + parser = ArgumentParser(parents=[parent_parser]) + parser.add_argument("--data_path", type=str, help="path where dataset is stored") + parser.add_argument("--batch_size", type=int, default=16, help="size of the batches") + parser.add_argument("--lr", type=float, default=0.001, help="adam: learning rate") + parser.add_argument("--num_layers", type=int, default=5, help="number of layers on u-net") + parser.add_argument("--features_start", type=float, default=64, help="number of features in first layer") + parser.add_argument("--bilinear", action='store_true', default=False, + help="whether to use bilinear interpolation or transposed") + return parser + def main(hparams: Namespace): # ------------------------ @@ -224,14 +238,7 @@ def main(hparams: Namespace): # ------------------------ # 3 INIT TRAINER # ------------------------ - trainer = pl.Trainer( - gpus=hparams.gpus, - logger=logger, - max_epochs=hparams.epochs, - accumulate_grad_batches=hparams.grad_batches, - accelerator=hparams.accelerator, - precision=16 if hparams.use_amp else 32, - ) + trainer = pl.Trainer.from_argparse_args(hparams) # ------------------------ # 5 START TRAINING @@ -242,21 +249,7 @@ def main(hparams: Namespace): if __name__ == '__main__': cli_lightning_logo() parser = ArgumentParser() - parser.add_argument("--data_path", type=str, help="path where dataset is stored") - parser.add_argument("--gpus", type=int, default=-1, help="number of available GPUs") - parser.add_argument('--distributed-backend', type=str, default='dp', choices=('dp', 'ddp', 'ddp2'), - help='supports three 
options dp, ddp, ddp2') - parser.add_argument('--use_amp', action='store_true', help='if true uses 16 bit precision') - parser.add_argument("--batch_size", type=int, default=4, help="size of the batches") - parser.add_argument("--lr", type=float, default=0.001, help="adam: learning rate") - parser.add_argument("--num_layers", type=int, default=5, help="number of layers on u-net") - parser.add_argument("--features_start", type=float, default=64, help="number of features in first layer") - parser.add_argument("--bilinear", action='store_true', default=False, - help="whether to use bilinear interpolation or transposed") - parser.add_argument("--grad_batches", type=int, default=1, help="number of batches to accumulate") - parser.add_argument("--epochs", type=int, default=20, help="number of epochs to train") - parser.add_argument("--log_wandb", action='store_true', help="log training on Weights & Biases") - + parser = SegModel.add_model_specific_args(parser) hparams = parser.parse_args() main(hparams) diff --git a/pl_examples/domain_templates/unet.py b/pl_examples/domain_templates/unet.py index 20b4bdb2a4bf9..2314e19ddbfc9 100644 --- a/pl_examples/domain_templates/unet.py +++ b/pl_examples/domain_templates/unet.py @@ -22,20 +22,33 @@ class UNet(nn.Module): Architecture based on U-Net: Convolutional Networks for Biomedical Image Segmentation Link - https://arxiv.org/abs/1505.04597 - Parameters: - num_classes: Number of output classes required (default 19 for KITTI dataset) - num_layers: Number of layers in each side of U-net - features_start: Number of features in first layer - bilinear: Whether to use bilinear interpolation or transposed - convolutions for upsampling. + >>> UNet(num_classes=2, num_layers=3) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE + UNet( + (layers): ModuleList( + (0): DoubleConv(...) + (1): Down(...) + (2): Down(...) + (3): Up(...) + (4): Up(...) + (5): Conv2d(64, 2, kernel_size=(1, 1), stride=(1, 1)) + ) + ) """ def __init__( - self, num_classes: int = 19, + self, + num_classes: int = 19, num_layers: int = 5, features_start: int = 64, - bilinear: bool = False + bilinear: bool = False, ): + """ + Args: + num_classes: Number of output classes required (default 19 for KITTI dataset) + num_layers: Number of layers in each side of U-net + features_start: Number of features in first layer + bilinear: Whether to use bilinear interpolation or transposed convolutions for upsampling. + """ super().__init__() self.num_layers = num_layers @@ -69,6 +82,11 @@ class DoubleConv(nn.Module): """ Double Convolution and BN and ReLU (3x3 conv -> BN -> ReLU) ** 2 + + >>> DoubleConv(4, 4) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE + DoubleConv( + (net): Sequential(...) + ) """ def __init__(self, in_ch: int, out_ch: int): @@ -89,6 +107,16 @@ def forward(self, x): class Down(nn.Module): """ Combination of MaxPool2d and DoubleConv in series + + >>> Down(4, 8) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE + Down( + (net): Sequential( + (0): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False) + (1): DoubleConv( + (net): Sequential(...) + ) + ) + ) """ def __init__(self, in_ch: int, out_ch: int): @@ -107,6 +135,14 @@ class Up(nn.Module): Upsampling (by either bilinear interpolation or transpose convolutions) followed by concatenation of feature map from contracting path, followed by double 3x3 convolution. 
+ + >>> Up(8, 4) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE + Up( + (upsample): ConvTranspose2d(8, 4, kernel_size=(2, 2), stride=(2, 2)) + (conv): DoubleConv( + (net): Sequential(...) + ) + ) """ def __init__(self, in_ch: int, out_ch: int, bilinear: bool = False): From b16441f6a54b9e8e56f0990226d5e81b6b4313ec Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Thu, 17 Dec 2020 12:03:45 +0100 Subject: [PATCH 020/136] Document speed comparison (#2072) * docs * script * dump * desc * import * import * if * norm * t * finished * isort * typing Co-authored-by: Nicki Skafte * xlabel * pandas * time Co-authored-by: Nicki Skafte --- benchmarks/__init__.py | 17 +++++ benchmarks/generate_comparison.py | 60 +++++++++++++++++ .../{test_parity.py => test_basic_parity.py} | 62 ++++++++++++------ benchmarks/test_sharded_parity.py | 14 ++++ .../benchmarks/figure-parity-times.png | Bin 0 -> 31513 bytes docs/source/benchmarking.rst | 14 ++++ docs/source/index.rst | 1 + requirements/test.txt | 1 + tests/base/datasets.py | 9 ++- 9 files changed, 155 insertions(+), 23 deletions(-) create mode 100644 benchmarks/generate_comparison.py rename benchmarks/{test_parity.py => test_basic_parity.py} (61%) create mode 100644 docs/source/_images/benchmarks/figure-parity-times.png create mode 100644 docs/source/benchmarking.rst diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py index e69de29bb2d1d..734288b07235d 100644 --- a/benchmarks/__init__.py +++ b/benchmarks/__init__.py @@ -0,0 +1,17 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os + +BENCHMARK_ROOT = os.path.dirname(__file__) +PROJECT_ROOT = os.path.dirname(BENCHMARK_ROOT) diff --git a/benchmarks/generate_comparison.py b/benchmarks/generate_comparison.py new file mode 100644 index 0000000000000..69eb47cb7e759 --- /dev/null +++ b/benchmarks/generate_comparison.py @@ -0,0 +1,60 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import os + +import matplotlib.pylab as plt +import pandas as pd + +from benchmarks.test_basic_parity import lightning_loop, vanilla_loop +from tests.base.models import ParityModuleMNIST, ParityModuleRNN + +NUM_EPOCHS = 20 +NUM_RUNS = 50 +MODEL_CLASSES = (ParityModuleRNN, ParityModuleMNIST) +PATH_HERE = os.path.dirname(__file__) +FIGURE_EXTENSION = '.png' + + +def _main(): + fig, axarr = plt.subplots(nrows=len(MODEL_CLASSES)) + + for i, cls_model in enumerate(MODEL_CLASSES): + path_csv = os.path.join(PATH_HERE, f'dump-times_{cls_model.__name__}.csv') + if os.path.isfile(path_csv): + df_time = pd.read_csv(path_csv, index_col=0) + else: + vanilla = vanilla_loop(cls_model, num_epochs=NUM_EPOCHS, num_runs=NUM_RUNS) + lightning = lightning_loop(cls_model, num_epochs=NUM_EPOCHS, num_runs=NUM_RUNS) + + df_time = pd.DataFrame({'vanilla PT': vanilla['durations'][1:], 'PT Lightning': lightning['durations'][1:]}) + df_time /= NUM_RUNS + df_time.to_csv(os.path.join(PATH_HERE, f'dump-times_{cls_model.__name__}.csv')) + # todo: add also relative X-axis ticks to see both: relative and absolute time differences + df_time.plot.hist( + ax=axarr[i], + bins=20, + alpha=0.5, + title=cls_model.__name__, + legend=True, + grid=True, + ) + axarr[i].set(xlabel='time [seconds]') + + path_fig = os.path.join(PATH_HERE, f'figure-parity-times{FIGURE_EXTENSION}') + fig.tight_layout() + fig.savefig(path_fig) + + +if __name__ == '__main__': + _main() diff --git a/benchmarks/test_parity.py b/benchmarks/test_basic_parity.py similarity index 61% rename from benchmarks/test_parity.py rename to benchmarks/test_basic_parity.py index 3508d5a3c28ac..c85984b092b9d 100644 --- a/benchmarks/test_parity.py +++ b/benchmarks/test_basic_parity.py @@ -1,8 +1,23 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import time import numpy as np import pytest import torch +from tqdm import tqdm from pytorch_lightning import seed_everything, Trainer import tests.base.develop_utils as tutils @@ -15,34 +30,33 @@ (ParityModuleMNIST, 0.25), # todo: lower this thr ]) @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") -def test_pytorch_parity(tmpdir, cls_model, max_diff): +def test_pytorch_parity(tmpdir, cls_model, max_diff: float, num_epochs: int = 4, num_runs: int = 3): """ Verify that the same pytorch and lightning models achieve the same results """ - num_epochs = 4 - num_rums = 3 - lightning_outs, pl_times = lightning_loop(cls_model, num_rums, num_epochs) - manual_outs, pt_times = vanilla_loop(cls_model, num_rums, num_epochs) + lightning = lightning_loop(cls_model, num_runs, num_epochs) + vanilla = vanilla_loop(cls_model, num_runs, num_epochs) # make sure the losses match exactly to 5 decimal places - for pl_out, pt_out in zip(lightning_outs, manual_outs): + for pl_out, pt_out in zip(lightning['losses'], vanilla['losses']): np.testing.assert_almost_equal(pl_out, pt_out, 5) # the fist run initialize dataset (download & filter) - tutils.assert_speed_parity_absolute(pl_times[1:], pt_times[1:], - nb_epochs=num_epochs, max_diff=max_diff) + tutils.assert_speed_parity_absolute( + lightning['durations'][1:], vanilla['durations'][1:], nb_epochs=num_epochs, max_diff=max_diff + ) def vanilla_loop(cls_model, num_runs=10, num_epochs=10): """ Returns an array with the last loss from each epoch for each run """ - device = torch.device('cuda' if torch.cuda.is_available() else "cpu") - errors = [] - times = [] + hist_losses = [] + hist_durations = [] + device = torch.device('cuda' if torch.cuda.is_available() else "cpu") torch.backends.cudnn.deterministic = True - for i in range(num_runs): + for i in tqdm(range(num_runs), desc=f'Vanilla PT with {cls_model.__name__}'): time_start = time.perf_counter() # set seed @@ -74,18 +88,21 @@ def vanilla_loop(cls_model, num_runs=10, num_epochs=10): epoch_losses.append(loss.item()) time_end = time.perf_counter() - times.append(time_end - time_start) + hist_durations.append(time_end - time_start) - errors.append(epoch_losses[-1]) + hist_losses.append(epoch_losses[-1]) - return errors, times + return { + 'losses': hist_losses, + 'durations': hist_durations, + } def lightning_loop(cls_model, num_runs=10, num_epochs=10): - errors = [] - times = [] + hist_losses = [] + hist_durations = [] - for i in range(num_runs): + for i in tqdm(range(num_runs), desc=f'PT Lightning with {cls_model.__name__}'): time_start = time.perf_counter() # set seed @@ -108,9 +125,12 @@ def lightning_loop(cls_model, num_runs=10, num_epochs=10): trainer.fit(model) final_loss = trainer.train_loop.running_loss.last().item() - errors.append(final_loss) + hist_losses.append(final_loss) time_end = time.perf_counter() - times.append(time_end - time_start) + hist_durations.append(time_end - time_start) - return errors, times + return { + 'losses': hist_losses, + 'durations': hist_durations, + } diff --git a/benchmarks/test_sharded_parity.py b/benchmarks/test_sharded_parity.py index 2e52613462621..7bb29ab31b5e2 100644 --- a/benchmarks/test_sharded_parity.py +++ b/benchmarks/test_sharded_parity.py @@ -1,3 +1,17 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os import platform import time diff --git a/docs/source/_images/benchmarks/figure-parity-times.png b/docs/source/_images/benchmarks/figure-parity-times.png new file mode 100644 index 0000000000000000000000000000000000000000..2e8c5899020d999bdca6f066cd10ef2262f22f2c GIT binary patch literal 31513 zcmd432{hMh`#$YQ=$|iMW_(U7$IXBlSmn( z&?L${p6k(m-~aob^Z%W-&RS=kb@p0&?=pP8pJ%%7>%Ok*eu9o@s;y$+VxUkctJDvu zYEvlG-V_Q|Fa2_SMW(s)GyW&%vd_Tfn7yTotErO(<*=!XgRQ-bt+m<43l>h!*7oP5 zM5RO}MK+#sadB{#7ZbDl_Ya8LJ6VZ|1+y^XAuAjX9CxNrR-2N4sU9k&T2m;O{MA+W z>bS-I?7C=lQ+HNrq^Tim<<`KhY5do{W7iH3lw3(vOgU1RT5vm-F~Tz0+wuWhip9*> zE(SwU>nGtS?q4mmFigA^e6Kfh>u~VamG#r}pTehiFl}8qDNgg}VySWQ*+!3H=O=}n zHFeJRk<7~8f%vE1-!GeHBR@ZXVsI7pb$sDfeHKL!f0vXJp#+g1Sjo5tf902|_#y$C%b|(UYT+AB{+L*;GGNr6zNs@`{RFPyHP` zc8p9;)}$YQHZnD89>1_}zGSM8G2U)p>7;qhHS3ST!N>IV^*;{`eEZS=^JkdX{CAVz zCENHPGh@^b4W3ywRlD|h1Z0*jaM^X_RYh((^d{x-9WzEMk6o9>=y^THA_m6qs{}Bw zS##_OH8bVr!A(1C+r8)J+~e(R+S0FRKN5LAl~P?@y}ZX`lBZ;LSfR42>SagAdgp=q zmk*C79qb%@)}$!h@=HQNVf~?7A`t>c{E{q7AHDnXMCF_NDiW8*&R`Bs)!y9p>eZ{V zTiZ{JZ8(;pXQP*b&l1_b{R@M>fS};A$zMG+jqwV(A6N6p`-X^4w=YrpS(Xj4X8N{wq^+-FHqF1jO-{a>q<&-b)Q@LPBSXysaw=Q+ z185!GSciLa$^_n|s#j74E?%1cAZkWpu zwtlC7SlFryKR+ga&~srr<>DgUefigJig$Q;_`nehOUupj=fC-um6e&9pM7(W>eATI z{Z_qHO&*sY?JQ~fnQdoHa?i3EU;4Uf*|KG8720OTzc&|ObZ-CnF%ggCly}*>urO;M z?|EmZ{r)|B$W&8PQ)gMXY>roa{rWYw68>xWGbSbmf5bmk)gxLxUW+d_Z`iVB%Svup z=G$T>p2MDzg7-aUCzzFc%l#PTJ34Z00-l^KGFzCNP8J=)toUI*oY}5FWH2S8cUqXc zNZFHNn49JBNm<(N<65#Gwj{VyA3AiXc(i(bdx=L;qvz$x;i`9!j!B#-VrL_#SX6EJ+mi3&rodqmEJM!Pl3(n&~{kiXr zeFW75!@~BJdd`cRRnlkt8Tc&jI&8FZ{f?EMo}PgrAr&{b9m}$6+JH&U)(LneB_%~g zk+EqLxp3LV#U+N}ITMfk&B8N+{A;!Z1}^X^DjHLmi(G%P?>yH@CXC(Y{S!7UIcr{C zUSDmLF#pb-`ZfvJ-Ym?_8=qhJ`M8y7kAZL|6nE~GwYbr4kzues1so|nuzRb$DJtZe`^d+UF z_UY){q-;FbS#Uq7tEy@rwnHG!w{`W$Ji9xTjaa2*Yn!(Zf*f33Imr3XnVZG?>^weWA1 z7y7GLW!v{C;uET>tn)5kycpE|;xZ%uWA_5;1FBUF3=CJFJ(JqCd$)u`?;f2@W1c%Z z?3Cl>&!4ifN&D(L*vQjaZ#HGTv6t) zT}3;&cGx~V-dbNk$D7jBDH#RmpT)5z$nJLCEAkh5eq=M~-r(a_7ZwFnwR-J3}{-B5GshVp% z&B=#2o=~&u{*7269Af*;%=qyPhnX>JqXLJ$&Lv%Wb{!+N!kK#>>6E{Iy#kLlczSxp za92^P)V1(%twQs4J8U^cPd@jZpPi(kT=>-;hTY%au(L}h3@O7>78j#^wXSOn4cZ} zmX?`_)G&n;+dDkWBrGh9P4wN!jjFfM?|}8#P&3VwCr_ppW`8wjo2$7m&QCfxJF6Ty zawKTBc;;JLR(AHwkxvy7w{HjFVYd~oy%nEG&a6PHi=CL7nsN%8O|OVz@T2yjTfJXdnHrN9 z&~Rs`d404*75jmGwsv;SnI=l)&4)fb+j92(lascki52CK9zBW`jt_Z|l5#eWV}9rM z?T4*G-@Vhxc)yRsWS7Uw{mnvs&)#L4@G8^eGs=2;77OPS28M^PJvx@kzj^a=gaBdV zi(jHxXD@tvw*sTAF249j97#|+jF*QeV;fh{!{2>;r}%}|_$gDVN9O1~!%5G+_-DW= z?1YP`iMoDa$@IXT0_Q=d>ag_zadB~GnF}63uEUjBYU#;UF0QU>H#Q&6GOJ?9Q0c;Y%X-Y@v_%^m8z0%acIwM*lbnyGc=%v7 zH#_@sKjkaY(VPu(;ro1<3(kLUbFOh4jCVa{Wfh>MG(T485yJG{bM|L`(@(}T9AQX7) zRLGU)hy?P3zt~be=Vzv-`oks6t5=@3v`jQ9^~}o8S3?TlVfWED+q{NfQu23<`Mbxu zi~{>Z%Xap-`$$nDbdEK%y% zE7`UmB#nwLtU8vWL86ejefKV3fB#!Hoq#50jd60aj(y7jy(EzTItv_C(weeND=57& z99}~LyLNFa&5h9f{_{t|W5$K-?)_)ZY-Qq>y?6ec(SZZ>WR}Xym9bB0*6%#0>hA8I zJ$TS_yj6dwBVX*6=*i%4#TokXZ><%lPC@#*7?Dw{iV*?9I)T$;bLH-*PBcD`kyPmVDtEZLi3f75O6=urAIdQ27n%&;nBgVBwQjClL`S!Wc+SWvm)C)_Sw;!;>iew_xX zd4JdQ^I<&x?9W6fKY#wD+BNMbD=W(>$JKx2k#4gX727Da0N0D{SF)PE{Jv z^s0|vh*p??q$6b8UAzv7tnD37N~H*6Sz<*A6BCo=Ny^6$A6~wEdBy8tUEu%em)O2A 
zo%$f_CH(JCzWuz%a3ywq(v2eyZ#&W6n4Dgn(SrSV^8-f^y}=oCzQyhsMv%jOa+idG zzqSr|MncvNI075-a?6oXa_Vo6SGp7>TDO~cfqM=6WC|<~eJsXZ^2Svg$_u#^G=y(u z09wG!Lwgw|H@n{PO;O__j}c0!-__6E-97twM@{Q1{*U9gduaru#8I^L2G#dM=~^3% zA%gISahd1NtLCM1Jj61Wv$&4q`Bs7Yi-0~ zEawzQeqd&BV*iF&OAGv@3p}s>Ed~Cg0!<3P|HaV9B<=6w;&j}iC0b59enhlc@94S^ z{rqxLw3(>qw*2}Kk;2TUIk{K}4NxNZ43cS42(r_zGvQ5Y8Fr3S79F$CoVg8B+H?Jl z;rOwlsOFk7{>EBwZI14|$nz^O(1VaK?VJIMOAsx;=tzru3dvjHIQ6Q<#Tb_i`-Ozq ziARpZB~aJVk7Sn?bqKK&^Nys90pLmQ8@Hhg9RgK>cF646n}G7fJj;G>d;8v*X zm^Qn^XI!L8fsFgk{sK!e0gRkgNp!ZRxcym06Al>JO;LDEzn6vNNXT%TcFX0gLZBuk zVN^mi~AzuVw zd#^MVrFAy1ni|Rw6k0@}oc7`h8$J86tqi@vFQO<}H!h%>bBH)&t5(f$SfW=#x2XrPwDFw%rNCkF{v7pN07lt*|P}u%XQNsv-NiB%m zLX+dZt|E$B;Sq5`?PXvCji7;;mxgS+Vp-;~bGT%di%5a~u_up}D~g~Tg-$0_6s$yF zV)P&|rs?kZoUbRkKwh>%#MzEK5o$;Plht%x@CB5snvl$^Wt;ODF=R-zVvNn@?~Bx? zK>vzN0F3PhrO*g!ss#OE>;Ys7k-#4KEDnV{PlE9(;pn^8>?j}Lji%f@N7IRd#cf9( zfIM>Z#y94&dc9GmczsR4K@(V(61+5~wzk6*HIFU($-9$UMv<&MRk~N|gb~L=_6qB6 zL3WL^!`Z=X>**B}f7-WY9hP~ub0kQo8!F4d;|07UfyciK{$*w_d{4tBZru9ihG0|c zG1BU<79TmM;>6KJS7iZK!Fd)0{vdRvC~fgw(6|&dYeV2Ei`eq=O=$(coMj{PNTvOe zV*GRvvbi|sYOpU{deOw$37;1aMxL~Dq7(m$YWaqJOb$A_qcjcyU4TsioY_#cln`q7 z^ops$!3D<(NcGQTu@l7;`}phYX+QpIgcONKWfO2wLB(2CkWmvx!|`3)zq$H!Bi?P5 zn?;vWFW2bJ-ZwCED8?kYK)7bStLcjE#Uo>`!LluS__cT%|8i!UUeS=JUc|pqrp;?6t%y;pM$nEd99-&&A zOv>GYtB+^k3KNFDd(e~TY|WCU!$IXO{S@;9f!YB8!|?+#^X*{aTbWeK>XTGZ{@N}# znlpBaJ<3Uy=vfc6z!(+?sqxQ4vZVx)HAt8xRthaeVjsG^1qbLrv#ry#gChG60I)#I zqu=&*?Gi#PyWtunM&dZ9skn(oxZe)1{1Pt6#3g(zJfuj55JrkJ!30q}Nfe*Iw4)uU z_1I#frR^-Cx|N7W#$nY}SWaE2>K1_?h@c02nTY=S80=GK&ZQR>5e2jO{BSHA+uIw` zduS@6!{)Wg5iy7)M$!RLIy+;8Z{NO8rGAUXjq75%MBmVYQ6!nv^-@^{zFf@AQSOrv zn{i!(b;(w@mN$3(d34{*iiL zdJW#vO^5qqAeb(VKs6~ODiWVX!z2_4{JZ7!rM#wMY2syH#HKtB`d?H{9!4~a=i?oX zuwFRv{a4`=1W8zPB-hNe+>|Y0??PkK?G~JCY}dF7;`h32o5BIa)V%uaDyL$)n|2c0 z@%bny2g?`$oY;P1UB>pYv3dgsFF?3_HkmGM-EsXk}nx wTAo9)W9=Ede_`zz{huGi{4cP=@^{LMfg?4#Mt3>Em} + benchmarking .. toctree:: diff --git a/requirements/test.txt b/requirements/test.txt index 3cb538a98d7c8..632f40e0287b4 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -17,3 +17,4 @@ pre-commit>=1.0 cloudpickle>=1.3 nltk>=3.3 +pandas # needed in benchmarks diff --git a/tests/base/datasets.py b/tests/base/datasets.py index 223f27731fef9..08a1db65ce1e2 100644 --- a/tests/base/datasets.py +++ b/tests/base/datasets.py @@ -63,8 +63,13 @@ class MNIST(Dataset): TEST_FILE_NAME = 'test.pt' cache_folder_name = 'complete' - def __init__(self, root: str = PATH_DATASETS, train: bool = True, - normalize: tuple = (0.5, 1.0), download: bool = True): + def __init__( + self, + root: str = PATH_DATASETS, + train: bool = True, + normalize: tuple = (0.5, 1.0), + download: bool = True, + ): super().__init__() self.root = root self.train = train # training set or test set From e721b1f51f50d61b6ed77d628e8e08fbae18f2eb Mon Sep 17 00:00:00 2001 From: Sean Naren Date: Thu, 17 Dec 2020 13:45:02 +0000 Subject: [PATCH 021/136] Prelease 1.1.2rc (#5171) * update CHANGELOG.md, increment for RC * Add missing changelog update * Added a few more * Move to added * Address code review * Missing space * Remove unreleased * Remove lines * Update CHANGELOG.md Co-authored-by: Rohit Gupta --- CHANGELOG.md | 25 ++++++++++++++++++++++--- pytorch_lightning/__init__.py | 2 +- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 051fe5fae09e5..eb2ae514f2133 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,10 +5,12 @@ All notable changes to this project will be documented in this file. 
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). -## [unreleased.BugFix] - YYYY-MM-DD +## [1.1.2rc1] - 2020-12-17 ### Added +- Support number for logging with `sync_dist=True` ([#5080](https://github.com/PyTorchLightning/pytorch-lightning/pull/5080) + ### Changed @@ -18,9 +20,28 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Removed +- `enable_pl_optimizer=False` by default to temporarily fix AMP issues ([#5163](https://github.com/PyTorchLightning/pytorch-lightning/pull/5163) + ### Fixed +- Metric reduction with Logging ([#5150](https://github.com/PyTorchLightning/pytorch-lightning/pull/5150) + + +- Remove nan loss in manual optimization ([#5121](https://github.com/PyTorchLightning/pytorch-lightning/pull/5121) + + +- Un-balanced logging properly supported ([#5119](https://github.com/PyTorchLightning/pytorch-lightning/pull/5119) + + +- Fix hanging in DDP HPC accelerators ([#5157](https://github.com/PyTorchLightning/pytorch-lightning/pull/5157) + + +- Fix saved filename in `ModelCheckpoint` if it already exists ([#4861](https://github.com/PyTorchLightning/pytorch-lightning/pull/4861) + + +- Fix reset `TensorRunningAccum` ([#5106](https://github.com/PyTorchLightning/pytorch-lightning/pull/5106) + ## [1.1.1] - 2020-12-15 @@ -34,8 +55,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Refactor load in checkpoint connector ([#4593](https://github.com/PyTorchLightning/pytorch-lightning/pull/4593) - Fixed the saved filename in `ModelCheckpoint` when it already exists ([#4861](https://github.com/PyTorchLightning/pytorch-lightning/pull/4861)) - -======= ### Removed - Drop duplicate metrics ([#5014](https://github.com/PyTorchLightning/pytorch-lightning/pull/5014) diff --git a/pytorch_lightning/__init__.py b/pytorch_lightning/__init__.py index 222263ea2d385..af63190037711 100644 --- a/pytorch_lightning/__init__.py +++ b/pytorch_lightning/__init__.py @@ -1,6 +1,6 @@ """Root package info.""" -__version__ = '1.1.1' +__version__ = '1.1.2rc1' __author__ = 'William Falcon et al.' __author_email__ = 'waf2107@columbia.edu' __license__ = 'Apache-2.0' From 81070bef4e5b31d4be3cf8367ee9a92d00677092 Mon Sep 17 00:00:00 2001 From: Haswanth Aekula Date: Fri, 18 Dec 2020 19:00:32 +0530 Subject: [PATCH 022/136] Fixed docs for WandbLogger (#5128) Fixed a small bug with the `WandbLogger` docs. Co-authored-by: Jirka Borovec Co-authored-by: Roger Shieh --- pytorch_lightning/loggers/wandb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/loggers/wandb.py b/pytorch_lightning/loggers/wandb.py index 24007c3a04307..1c02dd65be072 100644 --- a/pytorch_lightning/loggers/wandb.py +++ b/pytorch_lightning/loggers/wandb.py @@ -59,7 +59,7 @@ class WandbLogger(LightningLoggerBase): Example:: - .. code:: + .. code-block:: python from pytorch_lightning.loggers import WandbLogger from pytorch_lightning import Trainer From 16e819e456d32c948be76ad9906c541e542d11d2 Mon Sep 17 00:00:00 2001 From: Ganesh Anand Date: Fri, 18 Dec 2020 20:12:36 +0530 Subject: [PATCH 023/136] update DALIClassificationLoader to not use deprecated arguments (#4925) * update DALIClassificationLoader to not use deprecated arguments * fix line length * dali version check added and changed args accordingly * versions * checking version using disutils.version.LooseVersion now * . 
* ver * import Co-authored-by: Jirka Borovec Co-authored-by: Jirka Borovec --- .drone.yml | 3 +-- .../basic_examples/dali_image_classifier.py | 23 +++++++++++++++---- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/.drone.yml b/.drone.yml index c87130844c040..1041ebdf872c8 100644 --- a/.drone.yml +++ b/.drone.yml @@ -36,8 +36,7 @@ steps: - pip install -r ./requirements/devel.txt --upgrade-strategy only-if-needed -v --no-cache-dir - pip install git+https://${AUTH_TOKEN}@github.com/PyTorchLightning/lightning-dtrun.git@v0.0.2 -v --no-cache-dir # when Image has defined CUDa version we can switch to this package spec "nvidia-dali-cuda${CUDA_VERSION%%.*}0" - # todo: temprarl fix till https://github.com/PyTorchLightning/pytorch-lightning/pull/4922 is resolved - - pip install --extra-index-url https://developer.download.nvidia.com/compute/redist "nvidia-dali-cuda100<0.27" --upgrade-strategy only-if-needed + - pip install --extra-index-url https://developer.download.nvidia.com/compute/redist nvidia-dali-cuda100 --upgrade-strategy only-if-needed - pip list - python -m coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --durations=25 # --flake8 # Running special tests diff --git a/pl_examples/basic_examples/dali_image_classifier.py b/pl_examples/basic_examples/dali_image_classifier.py index 9f3ba5e08b37e..e628f5daf8a53 100644 --- a/pl_examples/basic_examples/dali_image_classifier.py +++ b/pl_examples/basic_examples/dali_image_classifier.py @@ -15,6 +15,7 @@ from argparse import ArgumentParser from random import shuffle from warnings import warn +from distutils.version import LooseVersion import numpy as np import torch @@ -31,12 +32,17 @@ from tests.base.datasets import MNIST if DALI_AVAILABLE: - import nvidia.dali.ops as ops + from nvidia.dali import ops from nvidia.dali.pipeline import Pipeline from nvidia.dali.plugin.pytorch import DALIClassificationIterator + from nvidia.dali import __version__ as dali_version + + NEW_DALI_API = LooseVersion(dali_version) >= LooseVersion('0.28.0') + if NEW_DALI_API: + from nvidia.dali.plugin.base_iterator import LastBatchPolicy else: warn('NVIDIA DALI is not available') - ops, Pipeline, DALIClassificationIterator = ..., ABC, ABC + ops, Pipeline, DALIClassificationIterator, LastBatchPolicy = ..., ABC, ABC, ABC class ExternalMNISTInputIterator(object): @@ -97,11 +103,18 @@ def __init__( dynamic_shape=False, last_batch_padded=False, ): - super().__init__(pipelines, size, reader_name, auto_reset, fill_last_batch, dynamic_shape, last_batch_padded) + if NEW_DALI_API: + last_batch_policy = LastBatchPolicy.FILL if fill_last_batch else LastBatchPolicy.DROP + super().__init__(pipelines, size, reader_name, auto_reset, dynamic_shape, + last_batch_policy=last_batch_policy, last_batch_padded=last_batch_padded) + else: + super().__init__(pipelines, size, reader_name, auto_reset, fill_last_batch, + dynamic_shape, last_batch_padded) + self._fill_last_batch = fill_last_batch def __len__(self): batch_count = self._size // (self._num_gpus * self.batch_size) - last_batch = 1 if self._fill_last_batch else 0 + last_batch = 1 if self._fill_last_batch else 1 return batch_count + last_batch @@ -178,7 +191,7 @@ def cli_main(): eii_test = ExternalMNISTInputIterator(mnist_test, args.batch_size) pipe_train = ExternalSourcePipeline(batch_size=args.batch_size, eii=eii_train, num_threads=2, device_id=0) - train_loader = DALIClassificationLoader(pipe_train, size=len(mnist_train), auto_reset=True, fill_last_batch=False) + train_loader = 
DALIClassificationLoader(pipe_train, size=len(mnist_train), auto_reset=True, fill_last_batch=True) pipe_val = ExternalSourcePipeline(batch_size=args.batch_size, eii=eii_val, num_threads=2, device_id=0) val_loader = DALIClassificationLoader(pipe_val, size=len(mnist_val), auto_reset=True, fill_last_batch=False) From 5d2fa98521629b2c0aa3ac5c5f34d8366154248e Mon Sep 17 00:00:00 2001 From: Marijan Smetko Date: Sat, 19 Dec 2020 00:20:49 +0100 Subject: [PATCH 024/136] Github Actions deprecation (#5183) * Fix deprecation call * fix Co-authored-by: Jirka Borovec --- .github/workflows/release-docker.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/release-docker.yml b/.github/workflows/release-docker.yml index b8ca5d8723b39..3543891cf7698 100644 --- a/.github/workflows/release-docker.yml +++ b/.github/workflows/release-docker.yml @@ -26,7 +26,7 @@ jobs: - name: Get release version if: startsWith(github.ref, 'refs/tags/') || github.event_name == 'release' id: get_version - run: echo ::set-env name=RELEASE_VERSION::$(echo ${GITHUB_REF##*/}) + run: echo "::set-output name=RELEASE_VERSION::$(echo ${GITHUB_REF##*/})" - name: Publish Releases to Docker # only on releases @@ -37,6 +37,6 @@ jobs: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} dockerfile: dockers/release/Dockerfile - build_args: PYTHON_VERSION=${{ matrix.python_version }},PYTORCH_VERSION=${{ matrix.pytorch_version }},LIGHTNING_VERSION=${{ env.RELEASE_VERSION }} - tags: "${{ env.RELEASE_VERSION }}-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }},latest-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}" + build_args: PYTHON_VERSION=${{ matrix.python_version }},PYTORCH_VERSION=${{ matrix.pytorch_version }},LIGHTNING_VERSION=${{ steps.get_version.outputs.RELEASE_VERSION }} + tags: "${{ steps.get_version.outputs.RELEASE_VERSION }}-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }},latest-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}" timeout-minutes: 55 From 4c34855c57e4af535e26fca8fc8173e7f5c0d8ad Mon Sep 17 00:00:00 2001 From: Gregor Date: Sat, 19 Dec 2020 01:35:46 +0100 Subject: [PATCH 025/136] [bugfix] Correct call to torch.no_grad (#5124) Co-authored-by: Gregor Koporec Co-authored-by: Jirka Borovec Co-authored-by: Sean Naren --- pytorch_lightning/utilities/distributed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/utilities/distributed.py b/pytorch_lightning/utilities/distributed.py index c315c6633b6fb..be5d781939c04 100644 --- a/pytorch_lightning/utilities/distributed.py +++ b/pytorch_lightning/utilities/distributed.py @@ -206,6 +206,6 @@ def all_gather_ddp_if_available( if sync_grads: return AllGatherGrad.apply(tensor, group) else: - with torch.no_grad: + with torch.no_grad(): return AllGatherGrad.apply(tensor, group) return tensor From e89764e7d2ec9ffa476aa59e442f85b8390deb5c Mon Sep 17 00:00:00 2001 From: Boris Dayma Date: Sat, 19 Dec 2020 06:52:11 -0600 Subject: [PATCH 026/136] feat(wandb): offset logging step when resuming (#5050) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat(wandb): offset logging step when resuming * feat(wandb): output warnings * fix(wandb): allow step to be None * test(wandb): update tests * feat(wandb): display warning only once * style: fix PEP issues * tests(wandb): fix tests * tests(wandb): improve test * style: fix whitespace * feat: improve warning Co-authored-by: Adrian 
Wälchli * feat(wandb): use variable from class instance Co-authored-by: Jirka Borovec * tests(wandb): check warnings * feat(wandb): use WarningCache * tests(wandb): fix tests * style: fix formatting Co-authored-by: Adrian Wälchli Co-authored-by: Jirka Borovec --- pytorch_lightning/loggers/wandb.py | 11 ++++++++++- tests/loggers/test_all.py | 6 +++++- tests/loggers/test_wandb.py | 28 +++++++++++++++++++++++++++- 3 files changed, 42 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/loggers/wandb.py b/pytorch_lightning/loggers/wandb.py index 1c02dd65be072..455635690f5c9 100644 --- a/pytorch_lightning/loggers/wandb.py +++ b/pytorch_lightning/loggers/wandb.py @@ -31,6 +31,7 @@ from pytorch_lightning.loggers.base import LightningLoggerBase, rank_zero_experiment from pytorch_lightning.utilities import rank_zero_only +from pytorch_lightning.utilities.warning_utils import WarningCache class WandbLogger(LightningLoggerBase): @@ -66,6 +67,9 @@ class WandbLogger(LightningLoggerBase): wandb_logger = WandbLogger() trainer = Trainer(logger=wandb_logger) + Note: When logging manually through `wandb.log` or `trainer.logger.experiment.log`, + make sure to use `commit=False` so the logging step does not increase. + See Also: - `Tutorial `__ @@ -103,8 +107,9 @@ def __init__( self._log_model = log_model self._prefix = prefix self._kwargs = kwargs - # logging multiple Trainer on a single W&B run (k-fold, etc) + # logging multiple Trainer on a single W&B run (k-fold, resuming, etc) self._step_offset = 0 + self.warning_cache = WarningCache() def __getstate__(self): state = self.__dict__.copy() @@ -134,6 +139,8 @@ def experiment(self) -> Run: self._experiment = wandb.init( name=self._name, dir=self._save_dir, project=self._project, anonymous=self._anonymous, id=self._id, resume='allow', **self._kwargs) if wandb.run is None else wandb.run + # offset logging step when resuming a run + self._step_offset = self._experiment.step # save checkpoints in wandb dir to upload on W&B servers if self._log_model: self._save_dir = self._experiment.dir @@ -154,6 +161,8 @@ def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> assert rank_zero_only.rank == 0, 'experiment tried to log from global_rank != 0' metrics = self._add_prefix(metrics) + if step is not None and step + self._step_offset < self.experiment.step: + self.warning_cache.warn('Trying to log at a previous step. 
Use `commit=False` when logging metrics manually.') self.experiment.log(metrics, step=(step + self._step_offset) if step is not None else None) @property diff --git a/tests/loggers/test_all.py b/tests/loggers/test_all.py index ba5791c7b9f4a..89c731d432ee9 100644 --- a/tests/loggers/test_all.py +++ b/tests/loggers/test_all.py @@ -74,7 +74,9 @@ def test_loggers_fit_test_all(tmpdir, monkeypatch): with mock.patch('pytorch_lightning.loggers.test_tube.Experiment'): _test_loggers_fit_test(tmpdir, TestTubeLogger) - with mock.patch('pytorch_lightning.loggers.wandb.wandb'): + with mock.patch('pytorch_lightning.loggers.wandb.wandb') as wandb: + wandb.run = None + wandb.init().step = 0 _test_loggers_fit_test(tmpdir, WandbLogger) @@ -368,5 +370,7 @@ def test_logger_with_prefix_all(tmpdir, monkeypatch): # WandB with mock.patch('pytorch_lightning.loggers.wandb.wandb') as wandb: logger = _instantiate_logger(WandbLogger, save_idr=tmpdir, prefix=prefix) + wandb.run = None + wandb.init().step = 0 logger.log_metrics({"test": 1.0}, step=0) logger.experiment.log.assert_called_once_with({'tmp-test': 1.0}, step=0) diff --git a/tests/loggers/test_wandb.py b/tests/loggers/test_wandb.py index 33211e6492d91..a44b19ca39270 100644 --- a/tests/loggers/test_wandb.py +++ b/tests/loggers/test_wandb.py @@ -22,8 +22,14 @@ from tests.base import EvalModelTemplate, BoringModel +def get_warnings(recwarn): + warnings_text = '\n'.join(str(w.message) for w in recwarn.list) + recwarn.clear() + return warnings_text + + @mock.patch('pytorch_lightning.loggers.wandb.wandb') -def test_wandb_logger_init(wandb): +def test_wandb_logger_init(wandb, recwarn): """Verify that basic functionality of wandb logger works. Wandb doesn't work well with pytest so we have to mock it out here.""" @@ -34,6 +40,9 @@ def test_wandb_logger_init(wandb): wandb.init.assert_called_once() wandb.init().log.assert_called_once_with({'acc': 1.0}, step=None) + # mock wandb step + wandb.init().step = 0 + # test wandb.init not called if there is a W&B run wandb.init().log.reset_mock() wandb.init.reset_mock() @@ -49,15 +58,28 @@ def test_wandb_logger_init(wandb): logger.log_metrics({'acc': 1.0}, step=3) wandb.init().log.assert_called_with({'acc': 1.0}, step=6) + # log hyper parameters logger.log_hyperparams({'test': None, 'nested': {'a': 1}, 'b': [2, 3, 4]}) wandb.init().config.update.assert_called_once_with( {'test': 'None', 'nested/a': 1, 'b': [2, 3, 4]}, allow_val_change=True, ) + # watch a model logger.watch('model', 'log', 10) wandb.init().watch.assert_called_once_with('model', log='log', log_freq=10) + # verify warning for logging at a previous step + assert 'Trying to log at a previous step' not in get_warnings(recwarn) + # current step from wandb should be 6 (last logged step) + logger.experiment.step = 6 + # logging at step 2 should raise a warning (step_offset is still 3) + logger.log_metrics({'acc': 1.0}, step=2) + assert 'Trying to log at a previous step' in get_warnings(recwarn) + # logging again at step 2 should not display again the same warning + logger.log_metrics({'acc': 1.0}, step=2) + assert 'Trying to log at a previous step' not in get_warnings(recwarn) + assert logger.name == wandb.init().project_name() assert logger.version == wandb.init().id @@ -71,6 +93,7 @@ def test_wandb_pickle(wandb, tmpdir): class Experiment: """ """ id = 'the_id' + step = 0 def project_name(self): return 'the_project_name' @@ -108,8 +131,11 @@ def test_wandb_logger_dirs_creation(wandb, tmpdir): assert logger.name is None # mock return values of experiment + wandb.run = 
None + wandb.init().step = 0 logger.experiment.id = '1' logger.experiment.project_name.return_value = 'project' + logger.experiment.step = 0 for _ in range(2): _ = logger.experiment From 88b55e4f8a94cedb9af59226dbd2cf34c212509e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 20 Dec 2020 01:32:17 +0100 Subject: [PATCH 027/136] reduce verbosity level in drone ci (#5190) * reduce verbosity level in drone * verbosity --- .drone.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.drone.yml b/.drone.yml index 1041ebdf872c8..b0b6c3df1b699 100644 --- a/.drone.yml +++ b/.drone.yml @@ -33,8 +33,8 @@ steps: - python --version - pip --version - nvidia-smi - - pip install -r ./requirements/devel.txt --upgrade-strategy only-if-needed -v --no-cache-dir - - pip install git+https://${AUTH_TOKEN}@github.com/PyTorchLightning/lightning-dtrun.git@v0.0.2 -v --no-cache-dir + - pip install -r ./requirements/devel.txt --upgrade-strategy only-if-needed --no-cache-dir + - pip install git+https://${AUTH_TOKEN}@github.com/PyTorchLightning/lightning-dtrun.git@v0.0.2 --no-cache-dir # when Image has defined CUDa version we can switch to this package spec "nvidia-dali-cuda${CUDA_VERSION%%.*}0" - pip install --extra-index-url https://developer.download.nvidia.com/compute/redist nvidia-dali-cuda100 --upgrade-strategy only-if-needed - pip list From 618580b4200e90b0acbb5f847865e0e9c02adec3 Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Sun, 20 Dec 2020 13:20:45 +0530 Subject: [PATCH 028/136] Remove Sourcerer (#5172) * Remove Sourcerer * trigger Co-authored-by: Jirka Borovec --- README.md | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/README.md b/README.md index a5c6bbb244730..19182098bdb76 100644 --- a/README.md +++ b/README.md @@ -73,19 +73,6 @@ Lightning can automatically export to ONNX or TorchScript for those cases. --- -## Trending contributors - -[![](https://sourcerer.io/fame/williamFalcon/pytorchlightning/pytorch-lightning/images/0)](https://sourcerer.io/fame/williamFalcon/pytorchlightning/pytorch-lightning/links/0) -[![](https://sourcerer.io/fame/williamFalcon/pytorchlightning/pytorch-lightning/images/1)](https://sourcerer.io/fame/williamFalcon/pytorchlightning/pytorch-lightning/links/1) -[![](https://sourcerer.io/fame/williamFalcon/pytorchlightning/pytorch-lightning/images/2)](https://sourcerer.io/fame/williamFalcon/pytorchlightning/pytorch-lightning/links/2) -[![](https://sourcerer.io/fame/williamFalcon/pytorchlightning/pytorch-lightning/images/3)](https://sourcerer.io/fame/williamFalcon/pytorchlightning/pytorch-lightning/links/3) -[![](https://sourcerer.io/fame/williamFalcon/pytorchlightning/pytorch-lightning/images/4)](https://sourcerer.io/fame/williamFalcon/pytorchlightning/pytorch-lightning/links/4) -[![](https://sourcerer.io/fame/williamFalcon/pytorchlightning/pytorch-lightning/images/5)](https://sourcerer.io/fame/williamFalcon/pytorchlightning/pytorch-lightning/links/5) -[![](https://sourcerer.io/fame/williamFalcon/pytorchlightning/pytorch-lightning/images/6)](https://sourcerer.io/fame/williamFalcon/pytorchlightning/pytorch-lightning/links/6) -[![](https://sourcerer.io/fame/williamFalcon/pytorchlightning/pytorch-lightning/images/7)](https://sourcerer.io/fame/williamFalcon/pytorchlightning/pytorch-lightning/links/7) - ---- - ## Continuous Integration
From be3e8701cebfc59bec97d0c7717bb5e52afc665e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 20 Dec 2020 17:11:58 +0100 Subject: [PATCH 029/136] skip multi-gpu test when running on single-gpu machine (#5186) * skip test * Apply suggestions from code review Co-authored-by: Rohit Gupta Co-authored-by: Nicki Skafte --- tests/models/test_gpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index 2393b42d27191..e34648671e12d 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -49,7 +49,7 @@ def test_multi_gpu_none_backend(tmpdir): tpipes.run_model_test(trainer_options, model) -@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") +@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") @pytest.mark.parametrize('gpus', [1, [0], [1]]) def test_single_gpu_model(tmpdir, gpus): """Make sure single GPU works (DP mode).""" From c8eda3feceffe1230233e7937cad5a19d1517018 Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Mon, 21 Dec 2020 06:30:17 +0530 Subject: [PATCH 030/136] Update warning if ckpt directory is not empty (#5209) --- pytorch_lightning/callbacks/model_checkpoint.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py index 82df32ce3996c..d73e5104a0999 100644 --- a/pytorch_lightning/callbacks/model_checkpoint.py +++ b/pytorch_lightning/callbacks/model_checkpoint.py @@ -20,11 +20,11 @@ """ +from copy import deepcopy import numbers import os -import re -from copy import deepcopy from pathlib import Path +import re from typing import Any, Dict, Optional, Union import numpy as np @@ -302,8 +302,7 @@ def __init_ckpt_dir(self, filepath, dirpath, filename, save_top_k): and len(self._fs.ls(dirpath)) > 0 ): rank_zero_warn( - f"Checkpoint directory {dirpath} exists and is not empty. With save_top_k={save_top_k}," - " all files in this directory will be deleted when a checkpoint is saved!" + f"Checkpoint directory {dirpath} exists and is not empty." ) if dirpath and self._fs.protocol == 'file': From 8ad7214f941bd1f07172774d91322f55a1312dd3 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Mon, 21 Dec 2020 07:35:01 +0100 Subject: [PATCH 031/136] add make cmd - clean (#5204) Co-authored-by: Roger Shieh --- .update.sh | 17 ----------------- Makefile | 6 +++++- 2 files changed, 5 insertions(+), 18 deletions(-) delete mode 100644 .update.sh diff --git a/.update.sh b/.update.sh deleted file mode 100644 index 40fcc22d6b79b..0000000000000 --- a/.update.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/bash - -version=$1 - -git commit -am "release v$version" -git tag $version -m "test_tube v$version" -git push --tags origin master - -# push to pypi -rm -rf ./dist/* -python3 setup.py sdist -twine upload dist/* - -# to update docs -# cd to root dir -# mkdocs gh-deploy - diff --git a/Makefile b/Makefile index 76e8bac4e3748..55a95f0b14af2 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: test +.PHONY: test clean test: # install APEX, see https://github.com/NVIDIA/apex#linux @@ -13,3 +13,7 @@ test: # specific file # python -m coverage run --source pytorch_lightning -m py.test --flake8 --durations=0 -v -k + +clean: + # clean all temp runs + rm -rf $(shell find . 
-name "mlruns" ) From bb6dfb6567fd7e661219fadfd9b1dae53a52962e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 21 Dec 2020 09:58:06 +0100 Subject: [PATCH 032/136] remove unused rpc import in modelcheckpoint causing import error (#5198) * remove unused rpc import * isort Co-authored-by: Jirka Borovec --- pytorch_lightning/callbacks/model_checkpoint.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py index d73e5104a0999..5a1079f8063f4 100644 --- a/pytorch_lightning/callbacks/model_checkpoint.py +++ b/pytorch_lightning/callbacks/model_checkpoint.py @@ -34,7 +34,6 @@ from pytorch_lightning import _logger as log from pytorch_lightning.callbacks.base import Callback from pytorch_lightning.metrics.metric import Metric -from pytorch_lightning.plugins.rpc_plugin import RPCPlugin from pytorch_lightning.utilities import rank_zero_info, rank_zero_only, rank_zero_warn from pytorch_lightning.utilities.cloud_io import get_filesystem from pytorch_lightning.utilities.exceptions import MisconfigurationException From a401fb39a866b34a79b76f3b8a5e9ddf06b0600a Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Mon, 21 Dec 2020 12:04:39 +0100 Subject: [PATCH 033/136] add doctests for example 2/n segmentation (#5083) * draft * fix * drop folder Co-authored-by: chaton --- .../domain_templates/semantic_segmentation.py | 36 +++++++++++++++++++ pl_examples/pytorch_ecosystem/__init__.py | 13 ------- 2 files changed, 36 insertions(+), 13 deletions(-) delete mode 100644 pl_examples/pytorch_ecosystem/__init__.py diff --git a/pl_examples/domain_templates/semantic_segmentation.py b/pl_examples/domain_templates/semantic_segmentation.py index 7bcad597a9a68..2e718a37ac4b0 100644 --- a/pl_examples/domain_templates/semantic_segmentation.py +++ b/pl_examples/domain_templates/semantic_segmentation.py @@ -32,6 +32,19 @@ DEFAULT_VALID_LABELS = (7, 8, 11, 12, 13, 17, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 31, 32, 33) +def _create_synth_kitti_dataset(path_dir: str, image_dims: tuple = (1024, 512)): + """Create synthetic dataset with random images, just to simulate that the dataset have been already downloaded.""" + path_dir_images = os.path.join(path_dir, KITTI.IMAGE_PATH) + path_dir_masks = os.path.join(path_dir, KITTI.MASK_PATH) + for p_dir in (path_dir_images, path_dir_masks): + os.makedirs(p_dir, exist_ok=True) + for i in range(3): + path_img = os.path.join(path_dir_images, f'dummy_kitti_{i}.png') + Image.new('RGB', image_dims).save(path_img) + path_mask = os.path.join(path_dir_masks, f'dummy_kitti_{i}.png') + Image.new('L', image_dims).save(path_mask) + + class KITTI(Dataset): """ Class for KITTI Semantic Segmentation Benchmark dataset @@ -53,6 +66,12 @@ class KITTI(Dataset): In the `get_item` function, images and masks are resized to the given `img_size`, masks are encoded using `encode_segmap`, and given `transform` (if any) are applied to the image only (mask does not usually require transforms, but they can be implemented in a similar way). 
+ + >>> from pl_examples import DATASETS_PATH + >>> dataset_path = os.path.join(DATASETS_PATH, "Kitti") + >>> _create_synth_kitti_dataset(dataset_path, image_dims=(1024, 512)) + >>> KITTI(dataset_path, 'train') # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE + <...semantic_segmentation.KITTI object at ...> """ IMAGE_PATH = os.path.join('training', 'image_2') MASK_PATH = os.path.join('training', 'semantic') @@ -141,6 +160,23 @@ class SegModel(pl.LightningModule): It uses the FCN ResNet50 model as an example. Adam optimizer is used along with Cosine Annealing learning rate scheduler. + + >>> from pl_examples import DATASETS_PATH + >>> dataset_path = os.path.join(DATASETS_PATH, "Kitti") + >>> _create_synth_kitti_dataset(dataset_path, image_dims=(1024, 512)) + >>> SegModel(dataset_path) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE + SegModel( + (net): UNet( + (layers): ModuleList( + (0): DoubleConv(...) + (1): Down(...) + (2): Down(...) + (3): Up(...) + (4): Up(...) + (5): Conv2d(64, 19, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + ) """ def __init__( self, diff --git a/pl_examples/pytorch_ecosystem/__init__.py b/pl_examples/pytorch_ecosystem/__init__.py deleted file mode 100644 index d7aa17d7f8468..0000000000000 --- a/pl_examples/pytorch_ecosystem/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. From 3bd6206e4503366b3fba7252c7354a590d6915fd Mon Sep 17 00:00:00 2001 From: William Falcon Date: Mon, 21 Dec 2020 19:04:48 -0500 Subject: [PATCH 034/136] Update README.md --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index 19182098bdb76..649d3a86dd09c 100644 --- a/README.md +++ b/README.md @@ -42,6 +42,11 @@ Scale your models, not the boilerplate.** --- +## NEWS +[Dec 2020 - Read about how Facebook uses Lightning to standardize across teams for research and production](https://ai.facebook.com/blog/reengineering-facebook-ais-deep-learning-platforms-for-interoperability) + +--- + ## PyTorch Lightning is just organized PyTorch Lightning disentangles PyTorch code to decouple the science from the engineering. 
![PT to PL](docs/source/_images/general/pl_quick_start_full_compressed.gif) From 43f73fdfdbd0d980031a9acc867c0cc362448a63 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Mon, 21 Dec 2020 19:05:18 -0500 Subject: [PATCH 035/136] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 649d3a86dd09c..84d9571395519 100644 --- a/README.md +++ b/README.md @@ -43,7 +43,7 @@ Scale your models, not the boilerplate.** --- ## NEWS -[Dec 2020 - Read about how Facebook uses Lightning to standardize across teams for research and production](https://ai.facebook.com/blog/reengineering-facebook-ais-deep-learning-platforms-for-interoperability) +[Dec 2020 - Read about how Facebook uses Lightning to standardize deep learning across research and production teams](https://ai.facebook.com/blog/reengineering-facebook-ais-deep-learning-platforms-for-interoperability) --- From 9a3c0355d236102a3b20e431c0882cba9c6c41de Mon Sep 17 00:00:00 2001 From: Alan Du Date: Wed, 23 Dec 2020 02:05:55 -0500 Subject: [PATCH 036/136] Tighten up mypy config (#5237) --- setup.cfg | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/setup.cfg b/setup.cfg index b0c2c8640bfba..27f5df8ac6961 100644 --- a/setup.cfg +++ b/setup.cfg @@ -102,6 +102,10 @@ max-line-length = 120 files = pytorch_lightning, pl_examples, benchmarks, tests disallow_untyped_defs = True ignore_missing_imports = True +show_error_codes = True +warn_redundant_casts = True +warn_unused_configs = True +warn_unused_ignores = True # todo: add proper typing to this module... [mypy-pytorch_lightning.callbacks.*] From 5820887863509af96c682fd375296228f3556fe6 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Wed, 23 Dec 2020 10:22:11 +0100 Subject: [PATCH 037/136] update for v1.1.2 (#5240) --- CHANGELOG.md | 17 ++++------------- pytorch_lightning/__init__.py | 2 +- 2 files changed, 5 insertions(+), 14 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index eb2ae514f2133..8509dfb68bfa3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,12 +5,12 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). -## [1.1.2rc1] - 2020-12-17 +## [1.1.2] - 2020-12-23 ### Added - Support number for logging with `sync_dist=True` ([#5080](https://github.com/PyTorchLightning/pytorch-lightning/pull/5080) - +- Added offset logging step when resuming for Wandb logger ([#5050](https://github.com/PyTorchLightning/pytorch-lightning/pull/5050) ### Changed @@ -22,25 +22,16 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- `enable_pl_optimizer=False` by default to temporarily fix AMP issues ([#5163](https://github.com/PyTorchLightning/pytorch-lightning/pull/5163) - ### Fixed - Metric reduction with Logging ([#5150](https://github.com/PyTorchLightning/pytorch-lightning/pull/5150) - - - Remove nan loss in manual optimization ([#5121](https://github.com/PyTorchLightning/pytorch-lightning/pull/5121) - - - Un-balanced logging properly supported ([#5119](https://github.com/PyTorchLightning/pytorch-lightning/pull/5119) - - - Fix hanging in DDP HPC accelerators ([#5157](https://github.com/PyTorchLightning/pytorch-lightning/pull/5157) - - - Fix saved filename in `ModelCheckpoint` if it already exists ([#4861](https://github.com/PyTorchLightning/pytorch-lightning/pull/4861) - - - Fix reset `TensorRunningAccum` ([#5106](https://github.com/PyTorchLightning/pytorch-lightning/pull/5106) +- Updated `DALIClassificationLoader` to not use deprecated arguments ([#4925](https://github.com/PyTorchLightning/pytorch-lightning/pull/4925) +- Corrected call to `torch.no_grad` ([#5124](https://github.com/PyTorchLightning/pytorch-lightning/pull/5124) ## [1.1.1] - 2020-12-15 diff --git a/pytorch_lightning/__init__.py b/pytorch_lightning/__init__.py index af63190037711..d1da4da1963ac 100644 --- a/pytorch_lightning/__init__.py +++ b/pytorch_lightning/__init__.py @@ -1,6 +1,6 @@ """Root package info.""" -__version__ = '1.1.2rc1' +__version__ = '1.1.2' __author__ = 'William Falcon et al.' __author_email__ = 'waf2107@columbia.edu' __license__ = 'Apache-2.0' From ae04311863a61ed7642930040f505f23be45b87a Mon Sep 17 00:00:00 2001 From: chaton Date: Wed, 23 Dec 2020 11:53:06 +0100 Subject: [PATCH 038/136] [Bugfix] Add LightningOptimizer parity test and resolve AMP bug (#5191) * update * clean test * still in progress * udpdate test * update * update * resolve flake * add test for zero_grad * update * works without accumulated_grad * update * update * resolve amp * revert back to True * update * clean tests * cleaned out * typo * update test * git repare bug * remove print * udpate * Fix formatting/optimizer imports * Refactor the test for cleanliness * Add vanilla model to the test, better var names * Fixed var names, let's clean up these mock tests * repare test * update test * resolve flake8 * add manual_optimization * update tests * resolve flake8 * add random accumulate_grad_batches * improve test * Update tests/trainer/optimization/test_parity_automatic_optimization.py Co-authored-by: Jirka Borovec * Update tests/trainer/optimization/test_parity_automatic_optimization.py Co-authored-by: Jirka Borovec * update * clean tests * correct bug * Apply suggestions from code review * format * adress comments * update on comments Co-authored-by: SeanNaren Co-authored-by: Ubuntu Co-authored-by: Jirka Borovec Co-authored-by: Jirka Borovec --- pytorch_lightning/core/lightning.py | 13 +- pytorch_lightning/core/optimizer.py | 2 + pytorch_lightning/plugins/native_amp.py | 6 +- pytorch_lightning/trainer/trainer.py | 6 +- pytorch_lightning/trainer/training_loop.py | 13 +- tests/base/boring_model.py | 3 +- tests/core/test_lightning_module.py | 10 +- tests/core/test_lightning_optimizer.py | 4 +- .../test_parity_automatic_optimization.py | 371 ++++++++++++++++++ .../test_parity_manual_optimization.py | 211 ++++++++++ 10 files changed, 617 insertions(+), 22 deletions(-) create mode 100644 tests/trainer/optimization/test_parity_automatic_optimization.py create mode 100644 tests/trainer/optimization/test_parity_manual_optimization.py diff --git 
a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index ab66435a2935d..34072c5e43a61 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -14,15 +14,15 @@ """nn.Module with additional great features.""" +from abc import ABC +from argparse import Namespace import collections import copy import inspect import os +from pathlib import Path import re import tempfile -from abc import ABC -from argparse import Namespace -from pathlib import Path from typing import Any, Callable, Dict, List, Mapping, Optional, Sequence, Tuple, Union import torch @@ -35,9 +35,9 @@ from pytorch_lightning.core.hooks import CheckpointHooks, DataHooks, ModelHooks from pytorch_lightning.core.memory import ModelSummary from pytorch_lightning.core.optimizer import LightningOptimizer -from pytorch_lightning.core.saving import ALLOWED_CONFIG_TYPES, PRIMITIVE_TYPES, ModelIO +from pytorch_lightning.core.saving import ALLOWED_CONFIG_TYPES, ModelIO, PRIMITIVE_TYPES from pytorch_lightning.core.step_result import Result -from pytorch_lightning.utilities import TPU_AVAILABLE, rank_zero_warn +from pytorch_lightning.utilities import rank_zero_warn, TPU_AVAILABLE from pytorch_lightning.utilities.device_dtype_mixin import DeviceDtypeModuleMixin from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.parsing import AttributeDict, collect_init_args, get_init_args @@ -1252,9 +1252,6 @@ def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer.zero_grad() """ - if not isinstance(optimizer, LightningOptimizer): - # wraps into LightingOptimizer only for running step - optimizer = LightningOptimizer.to_lightning_optimizer(optimizer, self.trainer) optimizer.step(closure=optimizer_closure) def optimizer_zero_grad( diff --git a/pytorch_lightning/core/optimizer.py b/pytorch_lightning/core/optimizer.py index c8e9ff8b80a2f..f0b361de6133e 100644 --- a/pytorch_lightning/core/optimizer.py +++ b/pytorch_lightning/core/optimizer.py @@ -103,6 +103,8 @@ def _on_trainer_init(self, trainer): @classmethod def to_lightning_optimizer(cls, optimizer, trainer): + if isinstance(optimizer, LightningOptimizer): + return optimizer optimizer = cls(optimizer) optimizer._on_trainer_init(trainer) return optimizer diff --git a/pytorch_lightning/plugins/native_amp.py b/pytorch_lightning/plugins/native_amp.py index 4df5d128476a4..3d64fe91388b8 100644 --- a/pytorch_lightning/plugins/native_amp.py +++ b/pytorch_lightning/plugins/native_amp.py @@ -16,6 +16,7 @@ import torch from torch.optim import Optimizer +from pytorch_lightning.core.optimizer import LightningOptimizer from pytorch_lightning.plugins.precision_plugin import PrecisionPlugin @@ -52,7 +53,10 @@ def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): # unscale gradient to allow analyze within `on_after_backward` if not self.trainer.train_loop.should_accumulate() and automatic_optimization: - self.trainer.scaler.unscale_(optimizer) + if isinstance(optimizer, LightningOptimizer): + self.trainer.scaler.unscale_(optimizer._optimizer) + else: + self.trainer.scaler.unscale_(optimizer) return closure_loss diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 5a837956bc4ce..c66cc3a43d0b1 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -15,8 +15,8 @@ """Trainer to automate the training.""" import os -import warnings from typing import Dict, Iterable, List, Optional, Union 
+import warnings import torch from torch.utils.data import DataLoader @@ -24,7 +24,6 @@ from pytorch_lightning import _logger as log from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.accelerators.accelerator_connector import AcceleratorConnector -from pytorch_lightning.trainer.deprecated_api import DeprecatedDistDeviceAttributes from pytorch_lightning.callbacks import Callback from pytorch_lightning.core.datamodule import LightningDataModule from pytorch_lightning.core.lightning import LightningModule @@ -47,6 +46,7 @@ from pytorch_lightning.trainer.connectors.slurm_connector import SLURMConnector from pytorch_lightning.trainer.connectors.training_trick_connector import TrainingTricksConnector from pytorch_lightning.trainer.data_loading import TrainerDataLoadingMixin +from pytorch_lightning.trainer.deprecated_api import DeprecatedDistDeviceAttributes from pytorch_lightning.trainer.evaluation_loop import EvaluationLoop from pytorch_lightning.trainer.logging import TrainerLoggingMixin from pytorch_lightning.trainer.model_hooks import TrainerModelHooksMixin @@ -56,7 +56,7 @@ from pytorch_lightning.trainer.training_loop import TrainLoop from pytorch_lightning.trainer.training_tricks import TrainerTrainingTricksMixin from pytorch_lightning.tuner.tuning import Tuner -from pytorch_lightning.utilities import rank_zero_warn, DeviceType +from pytorch_lightning.utilities import DeviceType, rank_zero_warn from pytorch_lightning.utilities.cloud_io import load as pl_load from pytorch_lightning.utilities.debugging import InternalDebugger from pytorch_lightning.utilities.exceptions import MisconfigurationException diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 68a0f4781c9a9..fe4525006ebb9 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -26,7 +26,7 @@ from pytorch_lightning.core.step_result import EvalResult, Result from pytorch_lightning.trainer.states import TrainerState from pytorch_lightning.trainer.supporters import Accumulator, TensorRunningAccum -from pytorch_lightning.utilities import TPU_AVAILABLE, AMPType, parsing +from pytorch_lightning.utilities import AMPType, parsing, TPU_AVAILABLE from pytorch_lightning.utilities.distributed import rank_zero_info, rank_zero_warn from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.memory import recursive_detach @@ -489,6 +489,9 @@ def optimizer_step(self, optimizer, opt_idx, batch_idx, train_step_and_backward_ 'native PyTorch amp and lbfgs are not compatible.' 
' To request, please file a Github issue in PyTorch and tag @mcarilli') + # wraps into LightingOptimizer only for running step + optimizer = LightningOptimizer.to_lightning_optimizer(optimizer, self.trainer) + # model hook model_ref.optimizer_step( self.trainer.current_epoch, @@ -831,6 +834,8 @@ def backward(self, result, optimizer, opt_idx, *args, **kwargs): # backward can be called manually in the training loop if isinstance(result, torch.Tensor): + # scale loss under accumulate_grad_batches > 1 and manual_backward + result = self.scale_closure_loss(result) self.trainer.accelerator_backend.backward(result, optimizer, opt_idx, *args, **kwargs) else: result.closure_loss = self.trainer.accelerator_backend.backward( @@ -975,3 +980,9 @@ def update_running_loss(self): # reset for next set of accumulated grads self.accumulated_loss.reset() + + def scale_closure_loss(self, loss: torch.Tensor) -> torch.Tensor: + model_ref = self.trainer.get_model() + if model_ref._running_manual_backward: + loss /= self.trainer.accumulate_grad_batches + return loss diff --git a/tests/base/boring_model.py b/tests/base/boring_model.py index 6ceffe8562372..6fdc3794d05f6 100644 --- a/tests/base/boring_model.py +++ b/tests/base/boring_model.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. import torch -from pytorch_lightning import LightningModule from torch.utils.data import Dataset +from pytorch_lightning import LightningModule + class RandomDictDataset(Dataset): def __init__(self, size, length): diff --git a/tests/core/test_lightning_module.py b/tests/core/test_lightning_module.py index e3a597063d02e..01319365d9051 100644 --- a/tests/core/test_lightning_module.py +++ b/tests/core/test_lightning_module.py @@ -11,17 +11,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import pickle from argparse import ArgumentParser +import pickle from typing import Optional from unittest.mock import MagicMock, patch import pytest import torch -from torch.optim import SGD, Adam +from torch.optim import Adam, SGD from torch.utils.data import DataLoader, random_split -from pytorch_lightning import LightningDataModule, Trainer, seed_everything +from pytorch_lightning import LightningDataModule, seed_everything, Trainer from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.base import BoringModel @@ -75,16 +75,12 @@ def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, if batch_idx % 2 == 0: assert isinstance(optimizer, SGD) optimizer.step(closure=optimizer_closure) - if not enable_pl_optimizer: - optimizer.zero_grad() # update discriminator opt every 4 steps if optimizer_idx == 1: if batch_idx % 4 == 0: assert isinstance(optimizer, Adam) optimizer.step(closure=optimizer_closure) - if not enable_pl_optimizer: - optimizer.zero_grad() model = TestModel() model.training_epoch_end = None diff --git a/tests/core/test_lightning_optimizer.py b/tests/core/test_lightning_optimizer.py index a9fcf918cc699..530f20f86a3db 100644 --- a/tests/core/test_lightning_optimizer.py +++ b/tests/core/test_lightning_optimizer.py @@ -14,16 +14,18 @@ import os from unittest.mock import patch +import numpy as np import pytest import torch import torch.nn as nn from torch.optim import Adam, Optimizer import pytorch_lightning as pl -from pytorch_lightning import LightningModule, Trainer +from pytorch_lightning import LightningModule, seed_everything, Trainer from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.core.optimizer import LightningOptimizer from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.utilities.model_utils import is_overridden from tests.base.boring_model import BoringModel, RandomDataset, RandomDictDataset, RandomDictStringDataset diff --git a/tests/trainer/optimization/test_parity_automatic_optimization.py b/tests/trainer/optimization/test_parity_automatic_optimization.py new file mode 100644 index 0000000000000..4a1d6c384cd52 --- /dev/null +++ b/tests/trainer/optimization/test_parity_automatic_optimization.py @@ -0,0 +1,371 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from collections import Callable +from copy import deepcopy +from typing import Optional +from unittest.mock import patch + +import numpy as np +import pytest +import torch +from torch.optim import Optimizer + +from pytorch_lightning import seed_everything, Trainer +from pytorch_lightning.core.optimizer import LightningOptimizer +from tests.base.boring_model import BoringModel + +# TODO: +# For both automatic / manual optimization +# - Test dp, ddp, ddp2 +# - Apex +# - Random accumulated_grad_batches (bug) +# - Multiple optimizers + + +class BaseParityAutomaticOptimizationModel(BoringModel): + + def __init__(self, optimizer_cls, optimizer_is_mocked=False, accumulate_grad_batches=None): + super().__init__() + self.optimizer_cls = optimizer_cls + self.losses = [] + self.grads = [] + self.on_before_zero_grad_count = 0 + self.optimizer_is_mocked = optimizer_is_mocked + self.grad_checked = False + self.accumulate_grad_batches = accumulate_grad_batches + + def on_before_zero_grad(self, optimizer): + self.on_before_zero_grad_count += 1 + if self.layer.weight.grad is not None: + self.grads.append(self.layer.weight.grad.clone()) + + def configure_optimizers(self): + optimizer = self.optimizer_cls(self.layer.parameters(), lr=0.1) + assert isinstance(optimizer, Optimizer) + return optimizer + + def training_step(self, batch, batch_idx): + output = self.layer(batch) + loss = self.loss(batch, output) + self.losses.append(loss.detach().item()) + return {"loss": loss} + + +class AutomaticOptimizationPurePytorchOptimizerModel(BaseParityAutomaticOptimizationModel): + + def training_step(self, batch, batch_idx): + output = self.layer(batch) + loss = self.loss(batch, output) + self.losses.append(loss.detach().item()) + loss /= float(self.accumulate_grad_batches) + return {"loss": loss} + + def optimizer_step( + self, + epoch: int = None, + batch_idx: int = None, + optimizer: Optimizer = None, + optimizer_idx: int = None, + optimizer_closure: Optional[Callable] = None, + on_tpu: bool = None, + using_native_amp: bool = None, + using_lbfgs: bool = None, + ) -> None: + """ + Override the optimizer step to define manual optimizer steps, as we use LightningOptimizer wrapper as standard + """ + # Get the unwrapped optimizer + optimizer = optimizer._optimizer + assert not isinstance(optimizer, LightningOptimizer) + + optimizer_closure() + assert self.trainer.accumulate_grad_batches == 1 + + if should_accumulate(self.trainer, self.accumulate_grad_batches): + return + + self.grad_checked = True + assert torch.abs(self.layer.weight.grad).sum() > 0 + optimizer.step() + + self.on_before_zero_grad_count += 1 + optimizer.zero_grad() + + if not self.optimizer_is_mocked: + assert torch.abs(self.layer.weight.grad).sum() == 0 + + +class AutomaticOptimizationPurePytorchAMPOptimizerModel(BaseParityAutomaticOptimizationModel): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.scaler = torch.cuda.amp.GradScaler() + + def training_step(self, batch, batch_idx): + with torch.cuda.amp.autocast(): + output = self.layer(batch) + loss = self.loss(batch, output) + self.losses.append(loss.detach().item()) + loss /= float(self.accumulate_grad_batches) + loss = self.scaler.scale(loss) + return {"loss": loss} + + def optimizer_step( + self, + epoch: int = None, + batch_idx: int = None, + optimizer: Optimizer = None, + optimizer_idx: int = None, + optimizer_closure: Optional[Callable] = None, + on_tpu: bool = None, + using_native_amp: bool = None, + using_lbfgs: bool = None, + ) -> None: + """ + Override the 
optimizer step to define manual optimizer steps, as we use LightningOptimizer wrapper as standard + """ + # Get the unwrapped optimizer + optimizer = optimizer._optimizer + assert not isinstance(optimizer, LightningOptimizer) + + optimizer_closure() + assert self.trainer.accumulate_grad_batches == 1 + + if should_accumulate(self.trainer, self.accumulate_grad_batches): + return + + self.scaler.unscale_(optimizer) + self.grad_checked = True + assert torch.abs(self.layer.weight.grad).sum() > 0 + self.scaler.step(optimizer) + self.scaler.update() + self.on_before_zero_grad_count += 1 + optimizer.zero_grad() + if not self.optimizer_is_mocked: + assert torch.abs(self.layer.weight.grad).sum() == 0 + + +def should_accumulate(trainer, accumulate_grad_batches): + accumulation_done = (trainer.batch_idx + 1) == trainer.num_training_batches + is_final_batch = (trainer.batch_idx + 1) % accumulate_grad_batches == 0 + return not (accumulation_done or is_final_batch) + + +@pytest.mark.parametrize(["precision", "amp_backend", "gpus"], [ + pytest.param(32, "native", 0), + pytest.param(16, "native", 1, marks=pytest.mark.skipif(not torch.cuda.is_available(), reason='Requires GPU')), +]) +@pytest.mark.parametrize('accumulate_grad_batches', [1, 7]) +def test_lightning_optimizer_and_no_lightning_optimizer_equality( + tmpdir, + precision, + amp_backend, + gpus, + accumulate_grad_batches, +): + + if accumulate_grad_batches > 1: + accumulate_grad_batches = np.random.randint(1, accumulate_grad_batches) + + vanilla_model_cls = AutomaticOptimizationPurePytorchAMPOptimizerModel if precision == 16 \ + else AutomaticOptimizationPurePytorchOptimizerModel + + run_lightning_optimizer_equality( + BaseParityAutomaticOptimizationModel, + vanilla_model_cls, + precision=precision, + default_root_dir=tmpdir, + max_epochs=1, + limit_train_batches=5, + accumulate_grad_batches=accumulate_grad_batches, + amp_backend=amp_backend, + gpus=gpus + ) + + +@pytest.mark.parametrize(["precision", "amp_backend", "gpus"], [ + pytest.param(32, "native", 0), +]) +@pytest.mark.parametrize('accumulate_grad_batches', [1]) +def test_lightning_optimizer_and_no_lightning_optimizer_equality_check_optim_calls( + tmpdir, + precision, + amp_backend, + gpus, + accumulate_grad_batches, +): + + vanilla_model_cls = AutomaticOptimizationPurePytorchAMPOptimizerModel if precision == 16 \ + else AutomaticOptimizationPurePytorchOptimizerModel + + with patch("torch.optim.SGD.step") as mock_sgd_step, \ + patch("torch.optim.Adam.step") as mock_adam_step, \ + patch("torch.optim.AdamW.step") as mock_adamw_step, \ + patch("torch.optim.SGD.zero_grad") as mock_sgd_zero_grad, \ + patch("torch.optim.Adam.zero_grad") as mock_adam_zero_grad, \ + patch("torch.optim.AdamW.zero_grad") as mock_adamw_zero_grad: + + max_epochs = 2 + limit_train_batches = 10 + + # Run equality test using Lightning Optimizer + run_lightning_optimizer_equality( + BaseParityAutomaticOptimizationModel, + vanilla_model_cls, + default_root_dir=tmpdir, + optimizer_is_mocked=True, + accumulate_grad_batches=accumulate_grad_batches, + max_epochs=max_epochs, + limit_train_batches=limit_train_batches, + amp_backend=amp_backend, + precision=precision, + gpus=gpus + ) + + expected_num_batches = max_epochs * limit_train_batches + assert mock_sgd_step.call_count == (expected_num_batches // accumulate_grad_batches) + assert mock_sgd_zero_grad.call_count == (expected_num_batches // accumulate_grad_batches) + assert mock_sgd_step.call_count == mock_adam_step.call_count + assert mock_sgd_step.call_count == 
mock_adam_step.call_count + assert mock_sgd_zero_grad.call_count == mock_adam_zero_grad.call_count + assert mock_sgd_zero_grad.call_count == mock_adamw_zero_grad.call_count + + +def run_lightning_optimizer_equality( + lightning_model_cls, + vanilla_model_cls, + optimizer_is_mocked=False, + **trainer_kwargs, +): + + trainer_kwargs = { + "limit_val_batches": 0, + **trainer_kwargs + } + expected_num_batches = trainer_kwargs["max_epochs"] * trainer_kwargs["limit_train_batches"] + accumulate_grad_batches = trainer_kwargs["accumulate_grad_batches"] + + pl_optimizer_initial_model_weights, pl_optimizer_model = train_specific_optimizer_model( + lightning_model_cls, + torch.optim.SGD, + expected_num_batches=expected_num_batches, + optimizer_is_mocked=optimizer_is_mocked, + enable_pl_optimizer=True, + **trainer_kwargs, + ) + + no_pl_optimizer_initial_model_weights, no_pl_optimizer_model = train_specific_optimizer_model( + lightning_model_cls, + torch.optim.Adam if optimizer_is_mocked else torch.optim.SGD, + expected_num_batches=expected_num_batches, + optimizer_is_mocked=optimizer_is_mocked, + enable_pl_optimizer=False, # Disable pl optimizer + **trainer_kwargs, + ) + + pure_pytorch_optimizer_initial_model_weights, pure_pytorch_optimizer_model = train_specific_optimizer_model( + vanilla_model_cls, + torch.optim.AdamW if optimizer_is_mocked else torch.optim.SGD, + expected_num_batches=expected_num_batches, + optimizer_is_mocked=optimizer_is_mocked, + replace_optimizer_step_with_pure_pytorch=True, + **trainer_kwargs, + ) + + if not optimizer_is_mocked: + + assert_model_equality( + pl_optimizer_initial_model_weights=pl_optimizer_initial_model_weights, + pl_optimizer_model=pl_optimizer_model, + no_pl_optimizer_initial_model_weights=no_pl_optimizer_initial_model_weights, + no_pl_optimizer_model=no_pl_optimizer_model, + pure_pytorch_optimizer_initial_model_weights=pure_pytorch_optimizer_initial_model_weights, + pure_pytorch_optimizer_model=pure_pytorch_optimizer_model, + expected_num_batches=expected_num_batches, + precision=trainer_kwargs["precision"] + ) + + +def assert_model_equality( + pl_optimizer_initial_model_weights, + pl_optimizer_model, + no_pl_optimizer_initial_model_weights, + no_pl_optimizer_model, + pure_pytorch_optimizer_initial_model_weights, + pure_pytorch_optimizer_model, + expected_num_batches, + precision, +): + + assert torch.equal(pl_optimizer_initial_model_weights, no_pl_optimizer_initial_model_weights) + assert torch.equal(pl_optimizer_initial_model_weights, pure_pytorch_optimizer_initial_model_weights) + assert len(pl_optimizer_model.losses) == expected_num_batches + assert pure_pytorch_optimizer_model.grad_checked + assert pure_pytorch_optimizer_model.losses == no_pl_optimizer_model.losses + assert not torch.isnan(torch.FloatTensor(no_pl_optimizer_model.losses)).any() + + assert torch.equal(torch.FloatTensor(no_pl_optimizer_model.losses), torch.FloatTensor(pl_optimizer_model.losses)) + assert no_pl_optimizer_model.on_before_zero_grad_count == pl_optimizer_model.on_before_zero_grad_count + + for pytorch_grad, no_pl_optim_grad, pl_optim_grad in zip(pure_pytorch_optimizer_model.grads, + no_pl_optimizer_model.grads, + pl_optimizer_model.grads): + assert torch.equal(no_pl_optim_grad, pl_optim_grad), 'Grad parameters are different' + assert torch.equal(pytorch_grad, no_pl_optim_grad), 'Grad parameters are different' + + for pytorch_weight, no_pl_optim_weight, pl_optim_weight in zip(pure_pytorch_optimizer_model.parameters(), + no_pl_optimizer_model.parameters(), + 
pl_optimizer_model.parameters()): + assert torch.equal(no_pl_optim_weight, pl_optim_weight), 'Model parameters are different' + assert torch.equal(pytorch_weight, no_pl_optim_weight), 'Model parameters are different' + + +# train function +def train_specific_optimizer_model( + model_cls, + optimizer_cls, + expected_num_batches, + enable_pl_optimizer=False, + optimizer_is_mocked=False, + replace_optimizer_step_with_pure_pytorch=False, + **trainer_kwargs, +): + + seed_everything(42) + trainer_kwargs = deepcopy(trainer_kwargs) + + model = model_cls( + optimizer_cls=optimizer_cls, + optimizer_is_mocked=optimizer_is_mocked, + accumulate_grad_batches=trainer_kwargs["accumulate_grad_batches"], + ) + + if replace_optimizer_step_with_pure_pytorch: + # When running pure vanilla training, accumulate_grad_batches should be 1. + trainer_kwargs["accumulate_grad_batches"] = 1 + trainer_kwargs["precision"] = 32 + + expected_global_step = expected_num_batches // trainer_kwargs["accumulate_grad_batches"] + + initial_weights = model.layer.weight.clone() + model.training_epoch_end = None + + trainer = Trainer( + enable_pl_optimizer=enable_pl_optimizer, + **trainer_kwargs + ) + trainer.fit(model) + + assert np.abs(trainer.global_step - expected_global_step) <= 2 + return initial_weights, model diff --git a/tests/trainer/optimization/test_parity_manual_optimization.py b/tests/trainer/optimization/test_parity_manual_optimization.py new file mode 100644 index 0000000000000..5d110b2fbdca7 --- /dev/null +++ b/tests/trainer/optimization/test_parity_manual_optimization.py @@ -0,0 +1,211 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from collections import Callable +from copy import deepcopy +from typing import Optional +from unittest.mock import patch + +import numpy as np +import pytest +import torch +from torch.optim import Optimizer + +from pytorch_lightning import seed_everything, Trainer +from pytorch_lightning.core.optimizer import LightningOptimizer +from tests.base.boring_model import BoringModel +from tests.trainer.optimization.test_parity_automatic_optimization import ( + assert_model_equality, + run_lightning_optimizer_equality, + should_accumulate, +) + +""" +TODO: +For both Manual / manual optimization + - Test dp, ddp, ddp2 + - Apex + - Random accumulated_grad_batches (bug) + - Multiple optimizers +""" + + +class BaseParityManualOptimizationModel(BoringModel): + + def __init__(self, optimizer_cls, optimizer_is_mocked=False, accumulate_grad_batches=None): + super().__init__() + self.optimizer_cls = optimizer_cls + self.losses = [] + self.grads = [] + self.on_before_zero_grad_count = 0 + self.optimizer_is_mocked = optimizer_is_mocked + self.grad_checked = False + self.accumulate_grad_batches = accumulate_grad_batches + + def on_before_zero_grad(self, optimizer): + self.on_before_zero_grad_count += 1 + if self.layer.weight.grad is not None: + self.grads.append(self.layer.weight.grad.clone()) + + def configure_optimizers(self): + optimizer = self.optimizer_cls(self.layer.parameters(), lr=0.1) + assert isinstance(optimizer, Optimizer) + return optimizer + + def training_step(self, batch, batch_idx): + opt = self.optimizers() + if not isinstance(opt, LightningOptimizer): + opt = LightningOptimizer.to_lightning_optimizer(opt, self.trainer) + output = self.layer(batch) + loss = self.loss(batch, output) + self.losses.append(loss.detach().item()) + self.manual_backward(loss, opt) + opt.step() + + +class ManualOptimizationPurePytorchOptimizerModel(BaseParityManualOptimizationModel): + + def training_step(self, batch, batch_idx): + optimizer = self.optimizers() + output = self.layer(batch) + loss = self.loss(batch, output) + self.losses.append(loss.detach().item()) + loss /= float(self.accumulate_grad_batches) + loss.backward() + + if should_accumulate(self.trainer, self.accumulate_grad_batches): + return + + self.grad_checked = True + assert torch.abs(self.layer.weight.grad).sum() > 0 + optimizer.step() + + self.on_before_zero_grad_count += 1 + optimizer.zero_grad() + + if not self.optimizer_is_mocked: + assert torch.abs(self.layer.weight.grad).sum() == 0 + + +class ManualOptimizationPurePytorchAMPOptimizerModel(BaseParityManualOptimizationModel): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.scaler = torch.cuda.amp.GradScaler() + + def training_step(self, batch, batch_idx): + optimizer = self.optimizers() + with torch.cuda.amp.autocast(): + output = self.layer(batch) + loss = self.loss(batch, output) + self.losses.append(loss.detach().item()) + loss /= float(self.accumulate_grad_batches) + loss = self.scaler.scale(loss) + loss.backward() + + if should_accumulate(self.trainer, self.accumulate_grad_batches): + return + + self.scaler.unscale_(optimizer) + self.grad_checked = True + + assert torch.abs(self.layer.weight.grad).sum() > 0 + self.scaler.step(optimizer) + self.scaler.update() + self.on_before_zero_grad_count += 1 + optimizer.zero_grad() + + if not self.optimizer_is_mocked: + assert torch.abs(self.layer.weight.grad).sum() == 0 + + +@pytest.mark.parametrize(["precision", "amp_backend", "gpus"], [ + pytest.param(32, "native", 0), + pytest.param(16, "native", 1, 
marks=pytest.mark.skipif(not torch.cuda.is_available(), reason='Requires GPU')), +]) +@pytest.mark.parametrize('accumulate_grad_batches', [1, 7]) +def test_lightning_optimizer_and_no_lightning_optimizer_equality( + tmpdir, + precision, + amp_backend, + gpus, + accumulate_grad_batches): + + if accumulate_grad_batches > 1: + accumulate_grad_batches = np.random.randint(1, accumulate_grad_batches) + + vanilla_model_cls = ManualOptimizationPurePytorchAMPOptimizerModel if precision == 16 \ + else ManualOptimizationPurePytorchOptimizerModel + + run_lightning_optimizer_equality( + BaseParityManualOptimizationModel, + vanilla_model_cls, + precision=precision, + default_root_dir=tmpdir, + max_epochs=1, + limit_train_batches=5, + accumulate_grad_batches=accumulate_grad_batches, + amp_backend=amp_backend, + gpus=gpus, + automatic_optimization=False + ) + + +@pytest.mark.parametrize(["precision", "amp_backend", "gpus"], [ + pytest.param(32, "native", 0), +]) +@pytest.mark.parametrize('accumulate_grad_batches', [1]) +def test_lightning_optimizer_and_no_lightning_optimizer_equality_check_optim_calls( + tmpdir, + precision, + amp_backend, + gpus, + accumulate_grad_batches, +): + + vanilla_model_cls = ManualOptimizationPurePytorchAMPOptimizerModel if precision == 16 \ + else ManualOptimizationPurePytorchOptimizerModel + + with patch("torch.optim.SGD.step") as mock_sgd_step, \ + patch("torch.optim.Adam.step") as mock_adam_step, \ + patch("torch.optim.AdamW.step") as mock_adamw_step, \ + patch("torch.optim.SGD.zero_grad") as mock_sgd_zero_grad, \ + patch("torch.optim.Adam.zero_grad") as mock_adam_zero_grad, \ + patch("torch.optim.AdamW.zero_grad") as mock_adamw_zero_grad: + + max_epochs = 2 + limit_train_batches = 10 + + # Run equality test using Lightning Optimizer + + run_lightning_optimizer_equality( + BaseParityManualOptimizationModel, + vanilla_model_cls, + default_root_dir=tmpdir, + optimizer_is_mocked=True, + accumulate_grad_batches=accumulate_grad_batches, + max_epochs=max_epochs, + limit_train_batches=limit_train_batches, + amp_backend=amp_backend, + precision=precision, + gpus=gpus, + automatic_optimization=False + ) + + expected_num_batches = max_epochs * limit_train_batches + assert mock_sgd_step.call_count == (expected_num_batches // accumulate_grad_batches) + assert mock_sgd_zero_grad.call_count == (expected_num_batches // accumulate_grad_batches) + assert mock_sgd_step.call_count == mock_adam_step.call_count + assert mock_sgd_step.call_count == mock_adam_step.call_count + assert mock_sgd_zero_grad.call_count == mock_adam_zero_grad.call_count + assert mock_sgd_zero_grad.call_count == mock_adamw_zero_grad.call_count From 27f3f973d6c55aeeba7895f1b6111ce743e16725 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Wed, 23 Dec 2020 17:21:16 +0100 Subject: [PATCH 039/136] update chlog for future 1.1.3rc (#5242) * update chlog for future 1.1.3rc * prune Co-authored-by: Rohit Gupta --- CHANGELOG.md | 58 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 35 insertions(+), 23 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8509dfb68bfa3..5b9b705459510 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,12 +5,10 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
-## [1.1.2] - 2020-12-23 +## [1.1.3rc] - 2020-12-29 ### Added -- Support number for logging with `sync_dist=True` ([#5080](https://github.com/PyTorchLightning/pytorch-lightning/pull/5080) -- Added offset logging step when resuming for Wandb logger ([#5050](https://github.com/PyTorchLightning/pytorch-lightning/pull/5050) ### Changed @@ -20,49 +18,63 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Removed -- `enable_pl_optimizer=False` by default to temporarily fix AMP issues ([#5163](https://github.com/PyTorchLightning/pytorch-lightning/pull/5163) ### Fixed -- Metric reduction with Logging ([#5150](https://github.com/PyTorchLightning/pytorch-lightning/pull/5150) -- Remove nan loss in manual optimization ([#5121](https://github.com/PyTorchLightning/pytorch-lightning/pull/5121) -- Un-balanced logging properly supported ([#5119](https://github.com/PyTorchLightning/pytorch-lightning/pull/5119) -- Fix hanging in DDP HPC accelerators ([#5157](https://github.com/PyTorchLightning/pytorch-lightning/pull/5157) -- Fix saved filename in `ModelCheckpoint` if it already exists ([#4861](https://github.com/PyTorchLightning/pytorch-lightning/pull/4861) -- Fix reset `TensorRunningAccum` ([#5106](https://github.com/PyTorchLightning/pytorch-lightning/pull/5106) -- Updated `DALIClassificationLoader` to not use deprecated arguments ([#4925](https://github.com/PyTorchLightning/pytorch-lightning/pull/4925) -- Corrected call to `torch.no_grad` ([#5124](https://github.com/PyTorchLightning/pytorch-lightning/pull/5124) + + +## [1.1.2] - 2020-12-23 + +### Added + +- Support number for logging with `sync_dist=True` ([#5080](https://github.com/PyTorchLightning/pytorch-lightning/pull/5080)) +- Added offset logging step when resuming for Wandb logger ([#5050](https://github.com/PyTorchLightning/pytorch-lightning/pull/5050)) + +### Removed + +- `enable_pl_optimizer=False` by default to temporarily fix AMP issues ([#5163](https://github.com/PyTorchLightning/pytorch-lightning/pull/5163)) + +### Fixed + +- Metric reduction with Logging ([#5150](https://github.com/PyTorchLightning/pytorch-lightning/pull/5150)) +- Remove nan loss in manual optimization ([#5121](https://github.com/PyTorchLightning/pytorch-lightning/pull/5121)) +- Un-balanced logging properly supported ([#5119](https://github.com/PyTorchLightning/pytorch-lightning/pull/5119)) +- Fix hanging in DDP HPC accelerators ([#5157](https://github.com/PyTorchLightning/pytorch-lightning/pull/5157)) +- Fix saved filename in `ModelCheckpoint` if it already exists ([#4861](https://github.com/PyTorchLightning/pytorch-lightning/pull/4861)) +- Fix reset `TensorRunningAccum` ([#5106](https://github.com/PyTorchLightning/pytorch-lightning/pull/5106)) +- Updated `DALIClassificationLoader` to not use deprecated arguments ([#4925](https://github.com/PyTorchLightning/pytorch-lightning/pull/4925)) +- Corrected call to `torch.no_grad` ([#5124](https://github.com/PyTorchLightning/pytorch-lightning/pull/5124)) ## [1.1.1] - 2020-12-15 ### Added -- Add a notebook example to reach a quick baseline of ~94% accuracy on CIFAR10 using Resnet in Lightning ([#4818](https://github.com/PyTorchLightning/pytorch-lightning/pull/4818) +- Add a notebook example to reach a quick baseline of ~94% accuracy on CIFAR10 using Resnet in Lightning ([#4818](https://github.com/PyTorchLightning/pytorch-lightning/pull/4818)) ### Changed -- Simplify accelerator steps ([#5015](https://github.com/PyTorchLightning/pytorch-lightning/pull/5015) -- Refactor load in checkpoint connector 
([#4593](https://github.com/PyTorchLightning/pytorch-lightning/pull/4593) +- Simplify accelerator steps ([#5015](https://github.com/PyTorchLightning/pytorch-lightning/pull/5015)) +- Refactor load in checkpoint connector ([#4593](https://github.com/PyTorchLightning/pytorch-lightning/pull/4593)) - Fixed the saved filename in `ModelCheckpoint` when it already exists ([#4861](https://github.com/PyTorchLightning/pytorch-lightning/pull/4861)) ### Removed -- Drop duplicate metrics ([#5014](https://github.com/PyTorchLightning/pytorch-lightning/pull/5014) -- Remove beta arg from F1 class and functional ([#5076](https://github.com/PyTorchLightning/pytorch-lightning/pull/5076) +- Drop duplicate metrics ([#5014](https://github.com/PyTorchLightning/pytorch-lightning/pull/5014)) +- Remove beta arg from F1 class and functional ([#5076](https://github.com/PyTorchLightning/pytorch-lightning/pull/5076)) ### Fixed - Fixed trainer by default `None` in `DDPAccelerator` ([#4915](https://github.com/PyTorchLightning/pytorch-lightning/pull/4915)) - Fixed `LightningOptimizer` to expose optimizer attributes ([#5095](https://github.com/PyTorchLightning/pytorch-lightning/pull/5095)) - Do not warn when the `name` key is used in the `lr_scheduler` dict ([#5057](https://github.com/PyTorchLightning/pytorch-lightning/pull/5057)) -- Check if optimizer supports closure ([#4981](https://github.com/PyTorchLightning/pytorch-lightning/pull/4981) -- Extend LightningOptimizer to exposure underlying Optimizer attributes + update doc ([#5095](https://github.com/PyTorchLightning/pytorch-lightning/pull/5095) +- Check if optimizer supports closure ([#4981](https://github.com/PyTorchLightning/pytorch-lightning/pull/4981)) +- Extend LightningOptimizer to exposure underlying Optimizer attributes + update doc ([#5095](https://github.com/PyTorchLightning/pytorch-lightning/pull/5095)) - Add deprecated metric utility functions back to functional ( [#5067](https://github.com/PyTorchLightning/pytorch-lightning/pull/5067), [#5068](https://github.com/PyTorchLightning/pytorch-lightning/pull/5068)) -- Allow any input in `to_onnx` and `to_torchscript` ([#4378](https://github.com/PyTorchLightning/pytorch-lightning/pull/4378) -- Do not warn when the name key is used in the `lr_scheduler` dict ([#5057](https://github.com/PyTorchLightning/pytorch-lightning/pull/5057) +- Allow any input in `to_onnx` and `to_torchscript` ([#4378](https://github.com/PyTorchLightning/pytorch-lightning/pull/4378)) +- Do not warn when the name key is used in the `lr_scheduler` dict ([#5057](https://github.com/PyTorchLightning/pytorch-lightning/pull/5057)) - Fixed `DDPHPCAccelerator` hangs in DDP construction by calling `init_device` ([#5157](https://github.com/PyTorchLightning/pytorch-lightning/pull/5157)) @@ -81,8 +93,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Added logging using `self.log` in train and evaluation for epoch end hooks ( [#4552](https://github.com/PyTorchLightning/pytorch-lightning/pull/4552), [#4495](https://github.com/PyTorchLightning/pytorch-lightning/pull/4495), - [#4439](https://github.com/PyTorchLightning/pytorch-lightning/pull/4439)) - [#4684](https://github.com/PyTorchLightning/pytorch-lightning/pull/4684)) + [#4439](https://github.com/PyTorchLightning/pytorch-lightning/pull/4439), + [#4684](https://github.com/PyTorchLightning/pytorch-lightning/pull/4684), [#4913](https://github.com/PyTorchLightning/pytorch-lightning/pull/4913)) - Added ability for DDP plugin to modify optimizer state saving ([#4675](https://github.com/PyTorchLightning/pytorch-lightning/pull/4675)) - Added casting to python types for numpy scalars when logging hparams ([#4647](https://github.com/PyTorchLightning/pytorch-lightning/pull/4647)) From 176735097ab5be9ee21d3e7a3dedc174f3e0dd3f Mon Sep 17 00:00:00 2001 From: Gregor Date: Wed, 23 Dec 2020 19:16:45 +0100 Subject: [PATCH 040/136] [bugfix] Group defaults to WORLD if None (#5125) * [bugfix] Group defaults to WORLD if None * fix no_grad * Update pytorch_lightning/utilities/distributed.py * Update pytorch_lightning/utilities/distributed.py Co-authored-by: Gregor Koporec Co-authored-by: Jirka Borovec Co-authored-by: Rohit Gupta Co-authored-by: Sean Naren --- pytorch_lightning/utilities/distributed.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pytorch_lightning/utilities/distributed.py b/pytorch_lightning/utilities/distributed.py index be5d781939c04..2a0b989e9b9cd 100644 --- a/pytorch_lightning/utilities/distributed.py +++ b/pytorch_lightning/utilities/distributed.py @@ -202,6 +202,7 @@ def all_gather_ddp_if_available( Return: A tensor of shape (world_size, batch, ...) 
""" + group = group if group is not None else torch.distributed.group.WORLD if torch.distributed.is_available() and torch.distributed.is_initialized(): if sync_grads: return AllGatherGrad.apply(tensor, group) From 6adc1b32bdeddbc34282abe0fd9f654c0cba570b Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Wed, 23 Dec 2020 20:38:57 +0100 Subject: [PATCH 041/136] add memory parity for PL vs Vanilla (#5170) * refactor * memory * show * clean * clean * try * device * reset * fix * fix * mean * hook * format * add todo Co-authored-by: chaton Co-authored-by: chaton --- benchmarks/generate_comparison.py | 7 +- benchmarks/test_basic_parity.py | 190 ++++++++++++++++++------------ benchmarks/test_sharded_parity.py | 42 +++---- tests/base/develop_utils.py | 20 ---- 4 files changed, 135 insertions(+), 124 deletions(-) diff --git a/benchmarks/generate_comparison.py b/benchmarks/generate_comparison.py index 69eb47cb7e759..6b5a0680a6b36 100644 --- a/benchmarks/generate_comparison.py +++ b/benchmarks/generate_comparison.py @@ -16,7 +16,7 @@ import matplotlib.pylab as plt import pandas as pd -from benchmarks.test_basic_parity import lightning_loop, vanilla_loop +from benchmarks.test_basic_parity import measure_loops from tests.base.models import ParityModuleMNIST, ParityModuleRNN NUM_EPOCHS = 20 @@ -34,8 +34,9 @@ def _main(): if os.path.isfile(path_csv): df_time = pd.read_csv(path_csv, index_col=0) else: - vanilla = vanilla_loop(cls_model, num_epochs=NUM_EPOCHS, num_runs=NUM_RUNS) - lightning = lightning_loop(cls_model, num_epochs=NUM_EPOCHS, num_runs=NUM_RUNS) + # todo: kind="Vanilla PT" -> use_lightning=False + vanilla = measure_loops(cls_model, kind="Vanilla PT", num_epochs=NUM_EPOCHS, num_runs=NUM_RUNS) + lightning = measure_loops(cls_model, kind="PT Lightning", num_epochs=NUM_EPOCHS, num_runs=NUM_RUNS) df_time = pd.DataFrame({'vanilla PT': vanilla['durations'][1:], 'PT Lightning': lightning['durations'][1:]}) df_time /= NUM_RUNS diff --git a/benchmarks/test_basic_parity.py b/benchmarks/test_basic_parity.py index c85984b092b9d..ce3d831f099f5 100644 --- a/benchmarks/test_basic_parity.py +++ b/benchmarks/test_basic_parity.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- +import gc import time import numpy as np @@ -19,118 +19,156 @@ import torch from tqdm import tqdm -from pytorch_lightning import seed_everything, Trainer -import tests.base.develop_utils as tutils +from pytorch_lightning import LightningModule, seed_everything, Trainer from tests.base.models import ParityModuleMNIST, ParityModuleRNN +def assert_parity_relative(pl_values, pt_values, norm_by: float = 1, max_diff: float = 0.1): + # assert speeds + diffs = np.asarray(pl_values) - np.mean(pt_values) + # norm by vanilla time + diffs = diffs / norm_by + # relative to mean reference value + diffs = diffs / np.mean(pt_values) + assert np.mean(diffs) < max_diff, f"Lightning diff {diffs} was worse than vanilla PT (threshold {max_diff})" + + +def assert_parity_absolute(pl_values, pt_values, norm_by: float = 1, max_diff: float = 0.55): + # assert speeds + diffs = np.asarray(pl_values) - np.mean(pt_values) + # norm by event count + diffs = diffs / norm_by + assert np.mean(diffs) < max_diff, f"Lightning {diffs} was worse than vanilla PT (threshold {max_diff})" + + # ParityModuleMNIST runs with num_workers=1 -@pytest.mark.parametrize('cls_model,max_diff', [ - (ParityModuleRNN, 0.05), - (ParityModuleMNIST, 0.25), # todo: lower this thr +@pytest.mark.parametrize('cls_model,max_diff_speed,max_diff_memory', [ + (ParityModuleRNN, 0.05, 0.0), + (ParityModuleMNIST, 0.25, 0.0), # todo: lower this thr ]) @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") -def test_pytorch_parity(tmpdir, cls_model, max_diff: float, num_epochs: int = 4, num_runs: int = 3): +def test_pytorch_parity( + tmpdir, + cls_model: LightningModule, + max_diff_speed: float, + max_diff_memory: float, + num_epochs: int = 4, + num_runs: int = 3, +): """ Verify that the same pytorch and lightning models achieve the same results """ - lightning = lightning_loop(cls_model, num_runs, num_epochs) - vanilla = vanilla_loop(cls_model, num_runs, num_epochs) + lightning = measure_loops(cls_model, kind="PT Lightning", num_epochs=num_epochs, num_runs=num_runs) + vanilla = measure_loops(cls_model, kind="Vanilla PT", num_epochs=num_epochs, num_runs=num_runs) # make sure the losses match exactly to 5 decimal places + print(f"Losses are for... 
\n vanilla: {vanilla['losses']} \n lightning: {lightning['losses']}") for pl_out, pt_out in zip(lightning['losses'], vanilla['losses']): np.testing.assert_almost_equal(pl_out, pt_out, 5) - # the fist run initialize dataset (download & filter) - tutils.assert_speed_parity_absolute( - lightning['durations'][1:], vanilla['durations'][1:], nb_epochs=num_epochs, max_diff=max_diff + # drop the first run for initialize dataset (download & filter) + assert_parity_absolute( + lightning['durations'][1:], vanilla['durations'][1:], norm_by=num_epochs, max_diff=max_diff_speed ) + assert_parity_relative(lightning['memory'], vanilla['memory'], max_diff=max_diff_memory) -def vanilla_loop(cls_model, num_runs=10, num_epochs=10): + +def _hook_memory(): + if torch.cuda.is_available(): + torch.cuda.synchronize() + used_memory = torch.cuda.max_memory_allocated() + else: + used_memory = np.nan + return used_memory + + +def measure_loops(cls_model, kind, num_runs=10, num_epochs=10): """ Returns an array with the last loss from each epoch for each run """ hist_losses = [] hist_durations = [] + hist_memory = [] - device = torch.device('cuda' if torch.cuda.is_available() else "cpu") + device_type = "cuda" if torch.cuda.is_available() else "cpu" torch.backends.cudnn.deterministic = True - for i in tqdm(range(num_runs), desc=f'Vanilla PT with {cls_model.__name__}'): - time_start = time.perf_counter() + for i in tqdm(range(num_runs), desc=f'{kind} with {cls_model.__name__}'): + gc.collect() + if device_type == 'cuda': + torch.cuda.empty_cache() + torch.cuda.reset_max_memory_cached() + torch.cuda.reset_max_memory_allocated() + torch.cuda.reset_accumulated_memory_stats() + torch.cuda.reset_peak_memory_stats() + time.sleep(1) - # set seed - seed = i - seed_everything(seed) - - # init model parts - model = cls_model() - dl = model.train_dataloader() - optimizer = model.configure_optimizers() - - # model to GPU - model = model.to(device) - - epoch_losses = [] - # as the first run is skipped, no need to run it long - for epoch in range(num_epochs if i > 0 else 1): - - # run through full training set - for j, batch in enumerate(dl): - batch = [x.to(device) for x in batch] - loss_dict = model.training_step(batch, j) - loss = loss_dict['loss'] - loss.backward() - optimizer.step() - optimizer.zero_grad() + time_start = time.perf_counter() - # track last epoch loss - epoch_losses.append(loss.item()) + _loop = lightning_loop if kind == "PT Lightning" else vanilla_loop + final_loss, used_memory = _loop(cls_model, idx=i, device_type=device_type, num_epochs=num_epochs) time_end = time.perf_counter() - hist_durations.append(time_end - time_start) - hist_losses.append(epoch_losses[-1]) + hist_losses.append(final_loss) + hist_durations.append(time_end - time_start) + hist_memory.append(used_memory) return { 'losses': hist_losses, 'durations': hist_durations, + 'memory': hist_memory, } -def lightning_loop(cls_model, num_runs=10, num_epochs=10): - hist_losses = [] - hist_durations = [] +def vanilla_loop(cls_model, idx, device_type: str = 'cuda', num_epochs=10): + device = torch.device(device_type) + # set seed + seed_everything(idx) - for i in tqdm(range(num_runs), desc=f'PT Lightning with {cls_model.__name__}'): - time_start = time.perf_counter() + # init model parts + model = cls_model() + dl = model.train_dataloader() + optimizer = model.configure_optimizers() - # set seed - seed = i - seed_everything(seed) - - model = cls_model() - # init model parts - trainer = Trainer( - # as the first run is skipped, no need to run it long - 
max_epochs=num_epochs if i > 0 else 1, - progress_bar_refresh_rate=0, - weights_summary=None, - gpus=1, - checkpoint_callback=False, - deterministic=True, - logger=False, - replace_sampler_ddp=False, - ) - trainer.fit(model) - - final_loss = trainer.train_loop.running_loss.last().item() - hist_losses.append(final_loss) + # model to GPU + model = model.to(device) - time_end = time.perf_counter() - hist_durations.append(time_end - time_start) + epoch_losses = [] + # as the first run is skipped, no need to run it long + for epoch in range(num_epochs if idx > 0 else 1): - return { - 'losses': hist_losses, - 'durations': hist_durations, - } + # run through full training set + for j, batch in enumerate(dl): + batch = [x.to(device) for x in batch] + loss_dict = model.training_step(batch, j) + loss = loss_dict['loss'] + loss.backward() + optimizer.step() + optimizer.zero_grad() + + # track last epoch loss + epoch_losses.append(loss.item()) + + return epoch_losses[-1], _hook_memory() + + +def lightning_loop(cls_model, idx, device_type: str = 'cuda', num_epochs=10): + seed_everything(idx) + + model = cls_model() + # init model parts + trainer = Trainer( + # as the first run is skipped, no need to run it long + max_epochs=num_epochs if idx > 0 else 1, + progress_bar_refresh_rate=0, + weights_summary=None, + gpus=1 if device_type == 'cuda' else 0, + checkpoint_callback=False, + deterministic=True, + logger=False, + replace_sampler_ddp=False, + ) + trainer.fit(model) + + return trainer.train_loop.running_loss.last().item(), _hook_memory() diff --git a/benchmarks/test_sharded_parity.py b/benchmarks/test_sharded_parity.py index 7bb29ab31b5e2..fae343d921035 100644 --- a/benchmarks/test_sharded_parity.py +++ b/benchmarks/test_sharded_parity.py @@ -28,35 +28,32 @@ from tests.base.boring_model import BoringModel, RandomDataset -@pytest.mark.skipif(platform.system() == "Windows", - reason="Distributed training is not supported on Windows") +@pytest.mark.skipif(platform.system() == "Windows", reason="Distributed training is not supported on Windows") @pytest.mark.skipif(not FAIRSCALE_AVAILABLE, reason="Fairscale is not available") def test_ddp_sharded_plugin_correctness_one_device(): plugin_parity_test( accelerator='ddp_cpu', - max_percent_speed_diff=0.15, # slower speed due to one CPU doing additional sequential memory saving calls plugin=DDPShardedPlugin(), - model_cls=SeedTrainLoaderModel + model_cls=SeedTrainLoaderModel, + max_percent_speed_diff=0.15, # todo: slower speed due to one CPU doing additional sequential memory saving calls ) @pytest.mark.skipif(not torch.cuda.is_available(), reason="requires GPU machine") -@pytest.mark.skipif(platform.system() == "Windows", - reason="Distributed training is not supported on Windows") +@pytest.mark.skipif(platform.system() == "Windows", reason="Distributed training is not supported on Windows") @pytest.mark.skipif(not FAIRSCALE_AVAILABLE, reason="Fairscale is not available") def test_ddp_sharded_plugin_correctness_one_gpu(): plugin_parity_test( gpus=1, accelerator='ddp_spawn', plugin=DDPShardedPlugin(), - model_cls=SeedTrainLoaderModel + model_cls=SeedTrainLoaderModel, ) @pytest.mark.skipif(not NATIVE_AMP_AVAILABLE, reason="Requires native AMP") @pytest.mark.skipif(not torch.cuda.is_available(), reason="requires GPU machine") -@pytest.mark.skipif(platform.system() == "Windows", - reason="Distributed training is not supported on Windows") +@pytest.mark.skipif(platform.system() == "Windows", reason="Distributed training is not supported on Windows") 
@pytest.mark.skipif(not FAIRSCALE_AVAILABLE, reason="Fairscale is not available") def test_ddp_sharded_plugin_correctness_amp_one_gpu(): plugin_parity_test( @@ -64,14 +61,13 @@ def test_ddp_sharded_plugin_correctness_amp_one_gpu(): precision=16, accelerator='ddp_spawn', plugin=DDPShardedPlugin(), - model_cls=SeedTrainLoaderModel + model_cls=SeedTrainLoaderModel, ) @pytest.mark.skip(reason="Not a critical test, skip till drone CI performance improves.") @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") -@pytest.mark.skipif(platform.system() == "Windows", - reason="Distributed training is not supported on Windows") +@pytest.mark.skipif(platform.system() == "Windows", reason="Distributed training is not supported on Windows") @pytest.mark.skipif(not FAIRSCALE_AVAILABLE, reason="Fairscale is not available") def test_ddp_sharded_plugin_correctness_multi_gpu(): plugin_parity_test( @@ -79,13 +75,12 @@ def test_ddp_sharded_plugin_correctness_multi_gpu(): accelerator='ddp_spawn', plugin=DDPShardedPlugin(), model_cls=SeedTrainLoaderModel, - max_percent_speed_diff=0.25 + max_percent_speed_diff=0.25, # todo: Increase speed diff since only 2 GPUs sharding 2 optimizers ) @pytest.mark.skipif(not NATIVE_AMP_AVAILABLE, reason="Requires native AMP") -@pytest.mark.skipif(platform.system() == "Windows", - reason="Distributed training is not supported on Windows") +@pytest.mark.skipif(platform.system() == "Windows", reason="Distributed training is not supported on Windows") @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") @pytest.mark.skipif(not FAIRSCALE_AVAILABLE, reason="Fairscale is not available") def test_ddp_sharded_plugin_correctness_amp_multi_gpu(): @@ -95,13 +90,12 @@ def test_ddp_sharded_plugin_correctness_amp_multi_gpu(): accelerator='ddp_spawn', plugin=DDPShardedPlugin(), model_cls=SeedTrainLoaderModel, - max_percent_speed_diff=0.25 + max_percent_speed_diff=0.25, # todo: Increase speed diff since only 2 GPUs sharding 2 optimizers ) @pytest.mark.skipif(not NATIVE_AMP_AVAILABLE, reason="Requires native AMP") -@pytest.mark.skipif(platform.system() == "Windows", - reason="Distributed training is not supported on Windows") +@pytest.mark.skipif(platform.system() == "Windows", reason="Distributed training is not supported on Windows") @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") @pytest.mark.skipif(not FAIRSCALE_AVAILABLE, reason="Fairscale is not available") def test_ddp_string_sharded_plugin_correctness_amp_multi_gpu(): @@ -111,7 +105,7 @@ def test_ddp_string_sharded_plugin_correctness_amp_multi_gpu(): accelerator='ddp_spawn', plugin='ddp_sharded', model_cls=SeedTrainLoaderModel, - max_percent_speed_diff=0.25 + max_percent_speed_diff=0.25, # todo: Increase speed diff since only 2 GPUs sharding 2 optimizers ) @@ -147,8 +141,7 @@ def test_ddp_sharded_plugin_correctness_amp_multi_gpu_ddp(tmpdir, args=None): @pytest.mark.skip(reason="Current issue with multiple optimizers and FairScale.") @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") -@pytest.mark.skipif(platform.system() == "Windows", - reason="Distributed training is not supported on Windows") +@pytest.mark.skipif(platform.system() == "Windows", reason="Distributed training is not supported on Windows") @pytest.mark.skipif(not FAIRSCALE_AVAILABLE, reason="Fairscale is not available") def test_ddp_sharded_plugin_correctness_multi_gpu_multi_optim(): """ @@ -159,14 +152,13 @@ 
def test_ddp_sharded_plugin_correctness_multi_gpu_multi_optim(): gpus=2, accelerator='ddp_spawn', model_cls=SeedTrainLoaderMultipleOptimizersModel, - max_percent_speed_diff=0.25 # Increase speed diff since only 2 GPUs sharding 2 optimizers + max_percent_speed_diff=0.25, # todo: Increase speed diff since only 2 GPUs sharding 2 optimizers ) @pytest.mark.skip(reason="Current issue with multiple optimizers and FairScale.") @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") -@pytest.mark.skipif(platform.system() == "Windows", - reason="Distributed training is not supported on Windows") +@pytest.mark.skipif(platform.system() == "Windows", reason="Distributed training is not supported on Windows") @pytest.mark.skipif(not FAIRSCALE_AVAILABLE, reason="Fairscale is not available") def test_ddp_sharded_plugin_correctness_multi_gpu_multi_optim_manual(tmpdir): """ @@ -177,7 +169,7 @@ def test_ddp_sharded_plugin_correctness_multi_gpu_multi_optim_manual(tmpdir): gpus=2, accelerator='ddp_spawn', model_cls=SeedTrainLoaderManualModel, - max_percent_speed_diff=0.25 # Increase speed diff since only 2 GPUs sharding 2 optimizers + max_percent_speed_diff=0.25, # todo: Increase speed diff since only 2 GPUs sharding 2 optimizers ) diff --git a/tests/base/develop_utils.py b/tests/base/develop_utils.py index 9c88ba1b7e4d3..7b40ba4f39ead 100644 --- a/tests/base/develop_utils.py +++ b/tests/base/develop_utils.py @@ -14,8 +14,6 @@ import functools import os -import numpy as np - from pytorch_lightning import seed_everything from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.loggers import TensorBoardLogger, TestTubeLogger @@ -23,24 +21,6 @@ from tests.base.model_template import EvalModelTemplate -def assert_speed_parity_relative(pl_times, pt_times, max_diff: float = 0.1): - # assert speeds - diffs = np.asarray(pl_times) - np.asarray(pt_times) - # norm by vanila time - diffs = diffs / np.asarray(pt_times) - assert np.alltrue(diffs < max_diff), \ - f"lightning {diffs} was slower than PT (threshold {max_diff})" - - -def assert_speed_parity_absolute(pl_times, pt_times, nb_epochs, max_diff: float = 0.55): - # assert speeds - diffs = np.asarray(pl_times) - np.asarray(pt_times) - # norm by vanila time - diffs = diffs / nb_epochs - assert np.alltrue(diffs < max_diff), \ - f"lightning {diffs} was slower than PT (threshold {max_diff})" - - def get_default_logger(save_dir, version=None): # set up logger object without actually saving logs logger = TensorBoardLogger(save_dir, name='lightning_logs', version=version) From c479351a938240fbda6774a404494ee399ff361a Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Wed, 23 Dec 2020 21:29:00 +0100 Subject: [PATCH 042/136] releasing feature as nightly (#5233) Co-authored-by: Rohit Gupta --- .github/workflows/nightly.yml | 4 +++- README.md | 28 +++++++++++++++++++++------- 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index 92fd99c40279b..eb3e55268b682 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -13,7 +13,10 @@ jobs: runs-on: ubuntu-20.04 steps: + # does nightly releases from feature branch - uses: actions/checkout@v2 + with: + ref: release/1.2-dev - uses: actions/setup-python@v2 with: python-version: 3.7 @@ -29,7 +32,6 @@ jobs: ls -lh dist/ - name: Delay releasing - if: startsWith(github.event.ref, 'refs/tags') || github.event_name == 'release' uses: juliangruber/sleep-action@v1 with: time: 5m diff --git 
a/README.md b/README.md index 84d9571395519..c10f404c3d2b3 100644 --- a/README.md +++ b/README.md @@ -101,7 +101,7 @@ Lightning can automatically export to ONNX or TorchScript for those cases. ## How To Use -#### Step 0: Install +### Step 0: Install Simple installation from PyPI ```bash @@ -114,12 +114,26 @@ From Conda conda install pytorch-lightning -c conda-forge ``` -Install bleeding-edge (no guarantees) +#### Install bleeding-edge - future 1.2 + +the actual status of 1.2 [nightly] is following: + +![CI base testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20base%20testing/badge.svg?branch=release%2F1.2-dev&event=push) +![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?branch=release%2F1.2-dev&event=push) +![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?branch=release%2F1.2-dev&event=push) +![TPU tests](https://github.com/PyTorchLightning/pytorch-lightning/workflows/TPU%20tests/badge.svg?branch=release%2F1.2-dev&event=push) +![Docs check](https://github.com/PyTorchLightning/pytorch-lightning/workflows/Docs%20check/badge.svg?branch=release%2F1.2-dev&event=push) + +Install future release from the source (no guarantees) +```bash +pip install git+https://github.com/PytorchLightning/pytorch-lightning.git@release/1.2-dev --upgrade +``` +or nightly from testing PyPI ```bash -pip install git+https://github.com/PytorchLightning/pytorch-lightning.git@master --upgrade +pip install -iU https://test.pypi.org/simple/ pytorch-lightning ``` -#### Step 0: Add these imports +### Step 1: Add these imports ```python import os @@ -132,7 +146,7 @@ from torchvision import transforms import pytorch_lightning as pl ``` -#### Step 1: Define a LightningModule (nn.Module subclass) +### Step 2: Define a LightningModule (nn.Module subclass) A LightningModule defines a full *system* (ie: a GAN, autoencoder, BERT or a simple Image Classifier). ```python @@ -163,9 +177,9 @@ class LitAutoEncoder(pl.LightningModule): return optimizer ``` -###### Note: Training_step defines the training loop. Forward defines how the LightningModule behaves during inference/prediction. +**Note: Training_step defines the training loop. Forward defines how the LightningModule behaves during inference/prediction.** -#### Step 2: Train! +### Step 3: Train! 
```python dataset = MNIST(os.getcwd(), download=True, transform=transforms.ToTensor()) From b22b1c2df25156f6d93eb3d8b3c85cf7b072e57e Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Wed, 23 Dec 2020 22:31:09 +0100 Subject: [PATCH 043/136] update PR template (#5206) Co-authored-by: Rohit Gupta --- .github/ISSUE_TEMPLATE/bug_report.md | 6 +++++- .github/PULL_REQUEST_TEMPLATE.md | 16 +++++++++++----- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index ddae1ea8a951c..cef062516b0eb 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -10,11 +10,15 @@ assignees: '' -## Please reproduce using [the BoringModel and post here](https://colab.research.google.com/drive/1HvWVVTK8j2Nj52qU4Q4YCyzOm0_aLQF3?usp=sharing) +## Please reproduce using the BoringModel + ### To Reproduce + +Use following [**BoringModel**](https://colab.research.google.com/drive/1HvWVVTK8j2Nj52qU4Q4YCyzOm0_aLQF3?usp=sharing) and post here + ### Expected behavior diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 4263a76fb16ae..c2ce4a5e8bf26 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,6 +1,11 @@ ## What does this PR do? -Fixes # (issue) +Fixes # (issue) <- this [links related issue to this PR](https://docs.github.com/en/free-pro-team@latest/github/managing-your-work-on-github/linking-a-pull-request-to-an-issue#linking-a-pull-request-to-an-issue-using-a-keyword) ## Before submitting - [ ] Was this discussed/approved via a Github issue? (no need for typos and docs improvements) - [ ] Did you read the [contributor guideline](https://github.com/PyTorchLightning/pytorch-lightning/blob/master/.github/CONTRIBUTING.md), Pull Request section? -- [ ] Did you make sure your PR does only one thing, instead of bundling different changes together? Otherwise, we ask you to create a separate PR for every change. -- [ ] Did you make sure to update the documentation with your changes? -- [ ] Did you write any new necessary tests? +- [ ] Did you make sure your PR does only one thing, instead of bundling different changes together? +- [ ] Did you make sure to update the documentation with your changes [if needed]? +- [ ] Did you write any new necessary tests [no need for typos, docs]? - [ ] Did you verify new and existing tests pass locally with your changes? - [ ] If you made a notable change (that affects users), did you update the [CHANGELOG](https://github.com/PyTorchLightning/pytorch-lightning/blob/master/CHANGELOG.md)? @@ -28,7 +33,8 @@ Before you start reviewing make sure you have read [Review guidelines](https://g - [ ] Is this pull request ready for review? (if not, please submit in draft mode) - [ ] Check that all items from **Before submitting** are resolved - [ ] Make sure the title is self-explanatory and the description concisely explains the PR - - [ ] Add labels and milestones (and optionally projects) to the PR so it can be classified; _Bugfixes should be including in bug-fix release milestones (m.f.X) and features should be included in (m.X.b) releases._ + - [ ] Add labels and milestones (and optionally projects) to the PR so it can be classified + - [ ] **Check that target branch and milestone are aligned!** ## Did you have fun? 
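
The "releasing feature as nightly" patch (042/136) above rewrites the README quick-start, but its diff context stops at the opening line of the "Step 3: Train!" code block. For orientation only, a minimal sketch of how Steps 1-3 fit together might look like the following; it assumes the `LitAutoEncoder` class from the Step 2 snippet, and the 55000/5000 train/val split is an illustrative choice rather than something taken from the patch:

```python
import os

from torch.utils.data import DataLoader, random_split
from torchvision import transforms
from torchvision.datasets import MNIST

import pytorch_lightning as pl

# LitAutoEncoder is the LightningModule defined in the "Step 2" README snippet above
dataset = MNIST(os.getcwd(), download=True, transform=transforms.ToTensor())
train, val = random_split(dataset, [55000, 5000])

autoencoder = LitAutoEncoder()
trainer = pl.Trainer()  # scale out with e.g. pl.Trainer(gpus=8) or pl.Trainer(tpu_cores=8), as the README shows
trainer.fit(autoencoder, DataLoader(train), DataLoader(val))
```
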
From 9b3c6a3e843837996d890c9e02823889a10b1d77 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Wed, 23 Dec 2020 23:37:33 +0100 Subject: [PATCH 044/136] skip some description from pypi (#5234) * skip some description from pypi * flake8 --- README.md | 4 ++++ pytorch_lightning/setup_tools.py | 5 +++++ 2 files changed, 9 insertions(+) diff --git a/README.md b/README.md index c10f404c3d2b3..0379b61ab70ee 100644 --- a/README.md +++ b/README.md @@ -114,6 +114,8 @@ From Conda conda install pytorch-lightning -c conda-forge ``` + + #### Install bleeding-edge - future 1.2 the actual status of 1.2 [nightly] is following: @@ -133,6 +135,8 @@ or nightly from testing PyPI pip install -iU https://test.pypi.org/simple/ pytorch-lightning ``` + + ### Step 1: Add these imports ```python diff --git a/pytorch_lightning/setup_tools.py b/pytorch_lightning/setup_tools.py index 29ac3b814b3c2..de05acf408381 100644 --- a/pytorch_lightning/setup_tools.py +++ b/pytorch_lightning/setup_tools.py @@ -178,6 +178,11 @@ def _load_long_description(path_dir: str) -> str: # replace github badges for release ones text = text.replace('badge.svg?branch=master&event=push', f'badge.svg?tag={__version__}') + skip_begin = r'' + skip_end = r'' + # todo: wrap content as commented description + text = re.sub(rf"{skip_begin}.+?{skip_end}", '', text, flags=re.IGNORECASE + re.DOTALL) + # # https://github.com/Borda/pytorch-lightning/releases/download/1.1.0a6/codecov_badge.png # github_release_url = os.path.join(__homepage__, "releases", "download", __version__) # # download badge and replace url with local file From 5651c9cc1ab37efa48332a35b7d55c33da1bab7d Mon Sep 17 00:00:00 2001 From: BobAnkh Date: Thu, 24 Dec 2020 08:03:01 +0800 Subject: [PATCH 045/136] fix typo in Optimization (#5228) Co-authored-by: Jirka Borovec --- docs/source/optimizers.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/optimizers.rst b/docs/source/optimizers.rst index 06e6e9679d29f..446a21b5b1997 100644 --- a/docs/source/optimizers.rst +++ b/docs/source/optimizers.rst @@ -67,7 +67,7 @@ Under the hood Lightning does the following: .. code-block:: python for epoch in epochs: - for batch id data: + for batch in data: loss = model.training_step(batch, batch_idx, ...) loss.backward() optimizer.step() From 1d533074b3b84729bd76c29456b750ba8f151d81 Mon Sep 17 00:00:00 2001 From: James Malcolm Date: Thu, 24 Dec 2020 20:41:24 +1300 Subject: [PATCH 046/136] Fix typo in Trainer.test() (#5226) Co-authored-by: Jirka Borovec --- docs/source/trainer.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/trainer.rst b/docs/source/trainer.rst index 0748302f30613..392edf8ce11a2 100644 --- a/docs/source/trainer.rst +++ b/docs/source/trainer.rst @@ -155,7 +155,7 @@ Once you're done training, feel free to run the test set! .. 
code-block:: python - trainer.test(test_dataloader=test_dataloader) + trainer.test(test_dataloaders=test_dataloader) ------------ From b930b5f2220f800c9f22eb74b19b3bcf8478c735 Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Thu, 24 Dec 2020 13:45:24 +0530 Subject: [PATCH 047/136] Add TPU example (#5109) * Add TPU example * add badge * add badge * add badge * bullets * name * trigger * add dataset name Co-authored-by: Jirka Borovec Co-authored-by: chaton --- README.md | 54 +-- notebooks/06-mnist-tpu-training.ipynb | 368 ++++++++++++++++++ ...seline.ipynb => 07-cifar10-baseline.ipynb} | 2 +- notebooks/README.md | 3 +- 4 files changed, 398 insertions(+), 29 deletions(-) create mode 100644 notebooks/06-mnist-tpu-training.ipynb rename notebooks/{06-cifar10-baseline.ipynb => 07-cifar10-baseline.ipynb} (99%) diff --git a/README.md b/README.md index 0379b61ab70ee..036ebe74244ed 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ -**The lightweight PyTorch wrapper for high-performance AI research. +**The lightweight PyTorch wrapper for high-performance AI research. Scale your models, not the boilerplate.** --- @@ -56,10 +56,10 @@ Lightning disentangles PyTorch code to decouple the science from the engineering ## Lightning Philosophy Lightning is designed with these principles in mind: -Principle 1: Enable maximal flexibility. -Principle 2: Abstract away unecessary boilerplate, but make it accessible when needed. -Principle 3: Systems should be self-contained (ie: optimizers, computation code, etc). -Principle 4: Deep learning code should be organized into 4 distinct categories. +Principle 1: Enable maximal flexibility. +Principle 2: Abstract away unecessary boilerplate, but make it accessible when needed. +Principle 3: Systems should be self-contained (ie: optimizers, computation code, etc). +Principle 4: Deep learning code should be organized into 4 distinct categories. - Research code (the LightningModule). - Engineering code (you delete, and is handled by the Trainer). @@ -126,7 +126,7 @@ the actual status of 1.2 [nightly] is following: ![TPU tests](https://github.com/PyTorchLightning/pytorch-lightning/workflows/TPU%20tests/badge.svg?branch=release%2F1.2-dev&event=push) ![Docs check](https://github.com/PyTorchLightning/pytorch-lightning/workflows/Docs%20check/badge.svg?branch=release%2F1.2-dev&event=push) -Install future release from the source (no guarantees) +Install future release from the source (no guarantees) ```bash pip install git+https://github.com/PytorchLightning/pytorch-lightning.git@release/1.2-dev --upgrade ``` @@ -160,7 +160,7 @@ class LitAutoEncoder(pl.LightningModule): super().__init__() self.encoder = nn.Sequential(nn.Linear(28 * 28, 128), nn.ReLU(), nn.Linear(128, 3)) self.decoder = nn.Sequential(nn.Linear(3, 128), nn.ReLU(), nn.Linear(128, 28 * 28)) - + def forward(self, x): # in lightning, forward defines the prediction/inference actions embedding = self.encoder(x) @@ -210,7 +210,7 @@ trainer = Trainer(tpu_cores=8) ```python # torchscript autoencoder = LitAutoEncoder() -torch.jit.save(autoencoder.to_torchscript(), "model.pt") +torch.jit.save(autoencoder.to_torchscript(), "model.pt") # onnx with tempfile.NamedTemporaryFile(suffix='.onnx', delete=False) as tmpfile: @@ -226,12 +226,12 @@ with tempfile.NamedTemporaryFile(suffix='.onnx', delete=False) as tmpfile: class LitAutoEncoder(pl.LightningModule): def training_step(self, batch, batch_idx, opt_idx): (opt_a, opt_b) = self.optimizers() - + loss_a = ... 
self.manual_backward(loss_a, opt_a) opt_a.step() opt_a.zero_grad() - + loss_b = ... self.manual_backward(loss_b, opt_b, retain_graph=True) self.manual_backward(loss_b, opt_b) @@ -266,31 +266,31 @@ class LitAutoEncoder(pl.LightningModule): ## Examples ###### Hello world -[MNIST hello world](https://colab.research.google.com/github/PytorchLightning/pytorch-lightning/blob/master/notebooks/01-mnist-hello-world.ipynb) -[MNIST on TPUs](https://colab.research.google.com/drive/1-_LKx4HwAxl5M6xPJmqAAu444LTDQoa3) +- [MNIST hello world](https://colab.research.google.com/github/PytorchLightning/pytorch-lightning/blob/master/notebooks/01-mnist-hello-world.ipynb) +- [MNIST on TPUs](https://colab.research.google.com/github/PytorchLightning/pytorch-lightning/blob/master/notebooks/06-mnist-tpu-training.ipynb) ###### Contrastive Learning -[BYOL](https://pytorch-lightning-bolts.readthedocs.io/en/latest/self_supervised_models.html#byol) -[CPC v2](https://pytorch-lightning-bolts.readthedocs.io/en/latest/self_supervised_models.html#cpc-v2) -[Moco v2](https://pytorch-lightning-bolts.readthedocs.io/en/latest/self_supervised_models.html#moco-v2) -[SIMCLR](https://pytorch-lightning-bolts.readthedocs.io/en/latest/self_supervised_models.html#simclr) +- [BYOL](https://pytorch-lightning-bolts.readthedocs.io/en/latest/self_supervised_models.html#byol) +- [CPC v2](https://pytorch-lightning-bolts.readthedocs.io/en/latest/self_supervised_models.html#cpc-v2) +- [Moco v2](https://pytorch-lightning-bolts.readthedocs.io/en/latest/self_supervised_models.html#moco-v2) +- [SIMCLR](https://pytorch-lightning-bolts.readthedocs.io/en/latest/self_supervised_models.html#simclr) ###### NLP -[BERT](https://colab.research.google.com/github/PytorchLightning/pytorch-lightning/blob/master/notebooks/04-transformers-text-classification.ipynb) -[GPT-2](https://pytorch-lightning-bolts.readthedocs.io/en/latest/convolutional.html#gpt-2) +- [BERT](https://colab.research.google.com/github/PytorchLightning/pytorch-lightning/blob/master/notebooks/04-transformers-text-classification.ipynb) +- [GPT-2](https://pytorch-lightning-bolts.readthedocs.io/en/latest/convolutional.html#gpt-2) ###### Reinforcement Learning -[DQN](https://pytorch-lightning-bolts.readthedocs.io/en/latest/reinforce_learn.html?highlight=dqn#dqn-models) -[Dueling-DQN](https://pytorch-lightning-bolts.readthedocs.io/en/latest/reinforce_learn.html#dueling-dqn) -[Reinforce](https://pytorch-lightning-bolts.readthedocs.io/en/latest/reinforce_learn.html#reinforce) +- [DQN](https://pytorch-lightning-bolts.readthedocs.io/en/latest/reinforce_learn.html#dqn-models) +- [Dueling-DQN](https://pytorch-lightning-bolts.readthedocs.io/en/latest/reinforce_learn.html#dueling-dqn) +- [Reinforce](https://pytorch-lightning-bolts.readthedocs.io/en/latest/reinforce_learn.html#reinforce) ###### Vision -[GAN](https://colab.research.google.com/github/PytorchLightning/pytorch-lightning/blob/master/notebooks/03-basic-gan.ipynb) +- [GAN](https://colab.research.google.com/github/PytorchLightning/pytorch-lightning/blob/master/notebooks/03-basic-gan.ipynb) ###### Classic ML -[Logistic Regression](https://pytorch-lightning-bolts.readthedocs.io/en/latest/classic_ml.html#logistic-regression) -[Linear Regression](https://pytorch-lightning-bolts.readthedocs.io/en/latest/classic_ml.html#linear-regression) +- [Logistic Regression](https://pytorch-lightning-bolts.readthedocs.io/en/latest/classic_ml.html#logistic-regression) +- [Linear 
Regression](https://pytorch-lightning-bolts.readthedocs.io/en/latest/classic_ml.html#linear-regression) --- @@ -311,12 +311,12 @@ If you have any questions please: 4. [Ask on stackoverflow](https://stackoverflow.com/questions/ask?guided=false) with the tag pytorch-lightning. ### Funding -Building open-source software with only a few part-time people is hard! +Building open-source software with only a few part-time people is hard! [We're venture funded](https://techcrunch.com/2020/10/08/grid-ai-raises-18-6m-series-a-to-help-ai-researchers-and-engineers-bring-their-models-to-production/) and backed by some of the top VC funds in the world, [Index Ventures](https://www.indexventures.com/companies/), [Bain Capital Ventures](https://www.baincapitalventures.com/portfolio/), [First Minute Capital](https://firstminute.capital/companies). -Their funding ensures we can continue to build awesome tooling like Grid, give you around the clock support, +Their funding ensures we can continue to build awesome tooling like Grid, give you around the clock support, hire a full-time staff, attend conferences, and move faster through implementing features you request. To supercharge your research and production work, visit our [Grid.ai platform](https://www.grid.ai/) @@ -324,7 +324,7 @@ To supercharge your research and production work, visit our [Grid.ai platform](h --- ## Grid AI -Grid AI is our native platform for training models at scale on the cloud! +Grid AI is our native platform for training models at scale on the cloud! **Sign up for [early access here](https://www.grid.ai/)** diff --git a/notebooks/06-mnist-tpu-training.ipynb b/notebooks/06-mnist-tpu-training.ipynb new file mode 100644 index 0000000000000..9628c8e31879b --- /dev/null +++ b/notebooks/06-mnist-tpu-training.ipynb @@ -0,0 +1,368 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "06-mnist-tpu-training.ipynb", + "provenance": [], + "collapsed_sections": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "accelerator": "TPU" + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "WsWdLFMVKqbi" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "qXO1QLkbRXl0" + }, + "source": [ + "# TPU training with PyTorch Lightning ⚡\n", + "\n", + "In this notebook, we'll train a model on TPUs. Changing one line of code is all you need to that.\n", + "\n", + "The most up to documentation related to TPU training can be found [here](https://pytorch-lightning.readthedocs.io/en/latest/tpu.html).\n", + "\n", + "---\n", + "\n", + " - Give us a ⭐ [on Github](https://www.github.com/PytorchLightning/pytorch-lightning/)\n", + " - Check out [the documentation](https://pytorch-lightning.readthedocs.io/en/latest/)\n", + " - Join us [on Slack](https://join.slack.com/t/pytorch-lightning/shared_invite/zt-f6bl2l0l-JYMK3tbAgAmGRrlNr00f1A)\n", + " - Ask a question on our [official forum](https://forums.pytorchlightning.ai/)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "UmKX0Qa1RaLL" + }, + "source": [ + "### Setup\n", + "\n", + "Lightning is easy to install. Simply ```pip install pytorch-lightning```" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "vAWOr0FZRaIj" + }, + "source": [ + "! 
pip install pytorch-lightning -qU" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "zepCr1upT4Z3" + }, + "source": [ + "### Install Colab TPU compatible PyTorch/TPU wheels and dependencies" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "AYGWh10lRaF1" + }, + "source": [ + "! pip install cloud-tpu-client==0.10 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.7-cp36-cp36m-linux_x86_64.whl" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "SNHa7DpmRZ-C" + }, + "source": [ + "import torch\n", + "from torch import nn\n", + "import torch.nn.functional as F\n", + "from torch.utils.data import random_split, DataLoader\n", + "\n", + "# Note - you must have torchvision installed for this example\n", + "from torchvision.datasets import MNIST\n", + "from torchvision import transforms\n", + "\n", + "import pytorch_lightning as pl\n", + "from pytorch_lightning.metrics.functional import accuracy" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "rjo1dqzGUxt6" + }, + "source": [ + "### Defining The `MNISTDataModule`\n", + "\n", + "Below we define `MNISTDataModule`. You can learn more about datamodules in [docs](https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html) and [datamodule notebook](https://github.com/PyTorchLightning/pytorch-lightning/blob/master/notebooks/02-datamodules.ipynb)." + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "pkbrm3YgUxlE" + }, + "source": [ + "class MNISTDataModule(pl.LightningDataModule):\n", + "\n", + " def __init__(self, data_dir: str = './'):\n", + " super().__init__()\n", + " self.data_dir = data_dir\n", + " self.transform = transforms.Compose([\n", + " transforms.ToTensor(),\n", + " transforms.Normalize((0.1307,), (0.3081,))\n", + " ])\n", + "\n", + " # self.dims is returned when you call dm.size()\n", + " # Setting default dims here because we know them.\n", + " # Could optionally be assigned dynamically in dm.setup()\n", + " self.dims = (1, 28, 28)\n", + " self.num_classes = 10\n", + "\n", + " def prepare_data(self):\n", + " # download\n", + " MNIST(self.data_dir, train=True, download=True)\n", + " MNIST(self.data_dir, train=False, download=True)\n", + "\n", + " def setup(self, stage=None):\n", + "\n", + " # Assign train/val datasets for use in dataloaders\n", + " if stage == 'fit' or stage is None:\n", + " mnist_full = MNIST(self.data_dir, train=True, transform=self.transform)\n", + " self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000])\n", + "\n", + " # Assign test dataset for use in dataloader(s)\n", + " if stage == 'test' or stage is None:\n", + " self.mnist_test = MNIST(self.data_dir, train=False, transform=self.transform)\n", + "\n", + " def train_dataloader(self):\n", + " return DataLoader(self.mnist_train, batch_size=32)\n", + "\n", + " def val_dataloader(self):\n", + " return DataLoader(self.mnist_val, batch_size=32)\n", + "\n", + " def test_dataloader(self):\n", + " return DataLoader(self.mnist_test, batch_size=32)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nr9AqDWxUxdK" + }, + "source": [ + "### Defining the `LitModel`\n", + "\n", + "Below, we define the model `LitMNIST`." 
+ ] + }, + { + "cell_type": "code", + "metadata": { + "id": "YKt0KZkOUxVY" + }, + "source": [ + "class LitModel(pl.LightningModule):\n", + " \n", + " def __init__(self, channels, width, height, num_classes, hidden_size=64, learning_rate=2e-4):\n", + "\n", + " super().__init__()\n", + "\n", + " self.save_hyperparameters()\n", + "\n", + " self.model = nn.Sequential(\n", + " nn.Flatten(),\n", + " nn.Linear(channels * width * height, hidden_size),\n", + " nn.ReLU(),\n", + " nn.Dropout(0.1),\n", + " nn.Linear(hidden_size, hidden_size),\n", + " nn.ReLU(),\n", + " nn.Dropout(0.1),\n", + " nn.Linear(hidden_size, num_classes)\n", + " )\n", + "\n", + " def forward(self, x):\n", + " x = self.model(x)\n", + " return F.log_softmax(x, dim=1)\n", + "\n", + " def training_step(self, batch, batch_idx):\n", + " x, y = batch\n", + " logits = self(x)\n", + " loss = F.nll_loss(logits, y)\n", + " self.log('train_loss', loss, prog_bar=False)\n", + " return loss\n", + "\n", + " def validation_step(self, batch, batch_idx):\n", + " x, y = batch\n", + " logits = self(x)\n", + " loss = F.nll_loss(logits, y)\n", + " preds = torch.argmax(logits, dim=1)\n", + " acc = accuracy(preds, y)\n", + " self.log('val_loss', loss, prog_bar=True)\n", + " self.log('val_acc', acc, prog_bar=True)\n", + " return loss\n", + "\n", + " def configure_optimizers(self):\n", + " optimizer = torch.optim.Adam(self.parameters(), lr=self.hparams.learning_rate)\n", + " return optimizer" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Uxl88z06cHyV" + }, + "source": [ + "### TPU Training\n", + "\n", + "Lightning supports training on a single TPU core or 8 TPU cores.\n", + "\n", + "The Trainer parameters `tpu_cores` defines how many TPU cores to train on (1 or 8) / Single TPU core to train on [1].\n", + "\n", + "For Single TPU training, Just pass the TPU core ID [1-8] in a list. Setting `tpu_cores=[5]` will train on TPU core ID 5." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "UZ647Xg2gYng" + }, + "source": [ + "Train on TPU core ID 5 with `tpu_cores=[5]`." + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "bzhJ8g_vUxN2" + }, + "source": [ + "# Init DataModule\n", + "dm = MNISTDataModule()\n", + "# Init model from datamodule's attributes\n", + "model = LitModel(*dm.size(), dm.num_classes)\n", + "# Init trainer\n", + "trainer = pl.Trainer(max_epochs=3, progress_bar_refresh_rate=20, tpu_cores=[5])\n", + "# Train\n", + "trainer.fit(model, dm)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "slMq_0XBglzC" + }, + "source": [ + "Train on single TPU core with `tpu_cores=1`." + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "31N5Scf2RZ61" + }, + "source": [ + "# Init DataModule\n", + "dm = MNISTDataModule()\n", + "# Init model from datamodule's attributes\n", + "model = LitModel(*dm.size(), dm.num_classes)\n", + "# Init trainer\n", + "trainer = pl.Trainer(max_epochs=3, progress_bar_refresh_rate=20, tpu_cores=1)\n", + "# Train\n", + "trainer.fit(model, dm)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_v8xcU5Sf_Cv" + }, + "source": [ + "Train on 8 TPU cores with `tpu_cores=8`. You might have to restart the notebook to run it on 8 TPU cores after training on single TPU core." 
+ ] + }, + { + "cell_type": "code", + "metadata": { + "id": "EFEw7YpLf-gE" + }, + "source": [ + "# Init DataModule\n", + "dm = MNISTDataModule()\n", + "# Init model from datamodule's attributes\n", + "model = LitModel(*dm.size(), dm.num_classes)\n", + "# Init trainer\n", + "trainer = pl.Trainer(max_epochs=3, progress_bar_refresh_rate=20, tpu_cores=8)\n", + "# Train\n", + "trainer.fit(model, dm)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "m2mhgEgpRZ1g" + }, + "source": [ + "\n", + "

## Congratulations - Time to Join the Community!
\n", + "\n", + "Congratulations on completing this notebook tutorial! If you enjoyed this and would like to join the Lightning movement, you can do so in the following ways!\n", + "\n", + "### Star [Lightning](https://github.com/PyTorchLightning/pytorch-lightning) on GitHub\n", + "The easiest way to help our community is just by starring the GitHub repos! This helps raise awareness of the cool tools we're building.\n", + "\n", + "* Please, star [Lightning](https://github.com/PyTorchLightning/pytorch-lightning)\n", + "\n", + "### Join our [Slack](https://join.slack.com/t/pytorch-lightning/shared_invite/zt-f6bl2l0l-JYMK3tbAgAmGRrlNr00f1A)!\n", + "The best way to keep up to date on the latest advancements is to join our community! Make sure to introduce yourself and share your interests in `#general` channel\n", + "\n", + "### Interested by SOTA AI models ! Check out [Bolt](https://github.com/PyTorchLightning/pytorch-lightning-bolts)\n", + "Bolts has a collection of state-of-the-art models, all implemented in [Lightning](https://github.com/PyTorchLightning/pytorch-lightning) and can be easily integrated within your own projects.\n", + "\n", + "* Please, star [Bolt](https://github.com/PyTorchLightning/pytorch-lightning-bolts)\n", + "\n", + "### Contributions !\n", + "The best way to contribute to our community is to become a code contributor! At any time you can go to [Lightning](https://github.com/PyTorchLightning/pytorch-lightning) or [Bolt](https://github.com/PyTorchLightning/pytorch-lightning-bolts) GitHub Issues page and filter for \"good first issue\". \n", + "\n", + "* [Lightning good first issue](https://github.com/PyTorchLightning/pytorch-lightning/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22)\n", + "* [Bolt good first issue](https://github.com/PyTorchLightning/pytorch-lightning-bolts/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22)\n", + "* You can also contribute your own notebooks with useful examples !\n", + "\n", + "### Great thanks from the entire Pytorch Lightning Team for your interest !\n", + "\n", + "" + ] + } + ] +} diff --git a/notebooks/06-cifar10-baseline.ipynb b/notebooks/07-cifar10-baseline.ipynb similarity index 99% rename from notebooks/06-cifar10-baseline.ipynb rename to notebooks/07-cifar10-baseline.ipynb index d4b2209cc91b6..7adabf382163e 100644 --- a/notebooks/06-cifar10-baseline.ipynb +++ b/notebooks/07-cifar10-baseline.ipynb @@ -4,7 +4,7 @@ "metadata": { "accelerator": "GPU", "colab": { - "name": "06_cifar10_baseline.ipynb", + "name": "07-cifar10-baseline.ipynb", "provenance": [], "collapsed_sections": [] }, diff --git a/notebooks/README.md b/notebooks/README.md index 5d0f3564e9387..a72e154c36410 100644 --- a/notebooks/README.md +++ b/notebooks/README.md @@ -11,4 +11,5 @@ You can easily run any of the official notebooks by clicking the 'Open in Colab' | **GAN** | Train a GAN on the MNIST Dataset. Learn how to use multiple optimizers in Lightning. 
| [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/PytorchLightning/pytorch-lightning/blob/master/notebooks/03-basic-gan.ipynb) | | **BERT** | Fine-tune HuggingFace Transformers models on the GLUE Benchmark | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/PytorchLightning/pytorch-lightning/blob/master/notebooks/04-transformers-text-classification.ipynb) | | **Trainer Flags** | Overview of the available Lightning `Trainer` flags | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/PytorchLightning/pytorch-lightning/blob/master/notebooks/05-trainer-flags-overview.ipynb) | -| **94% Baseline CIFAR10** | Establish a quick baseline of ~94% accuracy on CIFAR10 using Resnet in Lightning | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/PytorchLightning/pytorch-lightning/blob/master/notebooks/06-cifar10-baseline.ipynb) | +| **TPU Training** | Train a model on MNIST using TPUs with Lightning | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/PytorchLightning/pytorch-lightning/blob/master/notebooks/06-mnist-tpu-training.ipynb) | +| **94% Baseline CIFAR10** | Establish a quick baseline of ~94% accuracy on CIFAR10 using Resnet in Lightning | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/PytorchLightning/pytorch-lightning/blob/master/notebooks/07-cifar10-baseline.ipynb) | From 90c1c0f68b4983c685e9d009482890e578800439 Mon Sep 17 00:00:00 2001 From: Varad Pimpalkhute <35189598+nightlessbaron@users.noreply.github.com> Date: Thu, 24 Dec 2020 16:45:46 +0530 Subject: [PATCH 048/136] Update README.md (#5018) Co-authored-by: Jirka Borovec Co-authored-by: Rohit Gupta Co-authored-by: Nicki Skafte --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 036ebe74244ed..cd9eb7cf02fc2 100644 --- a/README.md +++ b/README.md @@ -57,7 +57,7 @@ Lightning disentangles PyTorch code to decouple the science from the engineering Lightning is designed with these principles in mind: Principle 1: Enable maximal flexibility. -Principle 2: Abstract away unecessary boilerplate, but make it accessible when needed. +Principle 2: Abstract away unnecessary boilerplate, but make it accessible when needed. Principle 3: Systems should be self-contained (ie: optimizers, computation code, etc). Principle 4: Deep learning code should be organized into 4 distinct categories. 
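
The TPU notebook added in patch 047/136 above walks through the three `tpu_cores` settings one cell at a time. Condensed into a single sketch, reusing the `MNISTDataModule` and `LitModel` classes defined in the notebook cells and the same flag values the notebook demonstrates:

```python
import pytorch_lightning as pl

# MNISTDataModule and LitModel are the classes defined in the notebook cells above
dm = MNISTDataModule()
model = LitModel(*dm.size(), dm.num_classes)

# train on all 8 cores of a Colab TPU
trainer = pl.Trainer(max_epochs=3, progress_bar_refresh_rate=20, tpu_cores=8)

# or: train on a single (runtime-chosen) TPU core
# trainer = pl.Trainer(max_epochs=3, progress_bar_refresh_rate=20, tpu_cores=1)

# or: pin training to a specific core by passing its ID in a list, e.g. core 5
# trainer = pl.Trainer(max_epochs=3, progress_bar_refresh_rate=20, tpu_cores=[5])

trainer.fit(model, dm)
```

As the notebook itself notes, switching between a single-core run and an 8-core run in the same Colab session may require restarting the runtime.
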
From 8d8098c04e716c8b9ccf5bc9208dd4f23b2b8e14 Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Fri, 25 Dec 2020 00:07:30 +0530 Subject: [PATCH 049/136] Minor doc fixes (#5139) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * minor doc fix * minor doc fix * Apply suggestions from code review Co-authored-by: Carlos Mocholí * suggestions Co-authored-by: Jirka Borovec Co-authored-by: Carlos Mocholí --- docs/source/multi_gpu.rst | 68 +++++++++++++++-------------- docs/source/trainer.rst | 24 +++++----- pytorch_lightning/core/lightning.py | 8 ++-- 3 files changed, 51 insertions(+), 49 deletions(-) diff --git a/docs/source/multi_gpu.rst b/docs/source/multi_gpu.rst index b3e0b905f27f4..a737ad4a70fd4 100644 --- a/docs/source/multi_gpu.rst +++ b/docs/source/multi_gpu.rst @@ -58,10 +58,10 @@ This will make your code scale to any arbitrary number of GPUs or TPUs with Ligh z = torch.Tensor(2, 3) z = z.type_as(x) -The :class:`~pytorch_lightning.core.lightning.LightningModule` knows what device it is on. You can access the reference via `self.device`. +The :class:`~pytorch_lightning.core.lightning.LightningModule` knows what device it is on. You can access the reference via ``self.device``. Sometimes it is necessary to store tensors as module attributes. However, if they are not parameters they will remain on the CPU even if the module gets moved to a new device. To prevent that and remain device agnostic, -register the tensor as a buffer in your modules's `__init__` method with :meth:`~torch.nn.Module.register_buffer`. +register the tensor as a buffer in your modules's ``__init__`` method with :meth:`~torch.nn.Module.register_buffer`. .. testcode:: @@ -75,8 +75,8 @@ register the tensor as a buffer in your modules's `__init__` method with :meth:` Remove samplers ^^^^^^^^^^^^^^^ -In PyTorch, you must use `torch.nn.DistributedSampler` for multi-node or TPU training. The -sampler makes sure each GPU sees the appropriate part of your data. +In PyTorch, you must use :class:`~torch.utils.data.distributed.DistributedSampler` +for multi-node or TPU training. The sampler makes sure each GPU sees the appropriate part of your data. .. testcode:: @@ -99,7 +99,11 @@ Lightning adds the correct samplers when needed, so no need to explicitly add sa dataset = MNIST(...) return DataLoader(dataset) -.. note:: You can disable this behavior with `Trainer(replace_sampler_ddp=False)` +.. note:: + By default it will add ``shuffle=True`` for train sampler and ``shuffle=False`` for val/test sampler. + ``drop_last`` in :class:`~torch.utils.data.distributed.DistributedSampler` will be set to its default value in PyTorch. + +.. note:: You can disable this behavior with ``Trainer(replace_sampler_ddp=False)`` .. note:: For iterable datasets, we don't do this automatically. @@ -108,7 +112,7 @@ Synchronize validation and test logging ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ When running in distributed mode, we have to ensure that the validation and test step logging calls are synchronized across processes. -This is done by adding `sync_dist=True` to all `self.log` calls in the validation and test step. +This is done by adding ``sync_dist=True`` to all ``self.log`` calls in the validation and test step. This ensures that each GPU worker has the same behaviour when tracking model checkpoints, which is important for later downstream tasks such as testing the best checkpoint across all workers. 
Note if you use any built in metrics or custom metrics that use the :ref:`Metrics API `, these do not need to be updated and are automatically handled for you. @@ -229,8 +233,8 @@ Note in particular the difference between `gpus=0`, `gpus=[0]` and `gpus="0"`. .. note:: - When specifying number of gpus as an integer `gpus=k`, setting the trainer flag - `auto_select_gpus=True` will automatically help you find `k` gpus that are not + When specifying number of gpus as an integer ``gpus=k``, setting the trainer flag + ``auto_select_gpus=True`` will automatically help you find ``k`` gpus that are not occupied by other processes. This is especially useful when GPUs are configured to be in "exclusive mode", such that only one process at a time can access them. For more details see the :ref:`Trainer guide `. @@ -258,12 +262,12 @@ Distributed modes ----------------- Lightning allows multiple ways of training -- Data Parallel (`accelerator='dp'`) (multiple-gpus, 1 machine) -- DistributedDataParallel (`accelerator='ddp'`) (multiple-gpus across many machines (python script based)). -- DistributedDataParallel (`accelerator='ddp_spawn'`) (multiple-gpus across many machines (spawn based)). -- DistributedDataParallel 2 (`accelerator='ddp2'`) (DP in a machine, DDP across machines). -- Horovod (`accelerator='horovod'`) (multi-machine, multi-gpu, configured at runtime) -- TPUs (`tpu_cores=8|x`) (tpu or TPU pod) +- Data Parallel (``accelerator='dp'``) (multiple-gpus, 1 machine) +- DistributedDataParallel (``accelerator='ddp'``) (multiple-gpus across many machines (python script based)). +- DistributedDataParallel (``accelerator='ddp_spawn'``) (multiple-gpus across many machines (spawn based)). +- DistributedDataParallel 2 (``accelerator='ddp2'``) (DP in a machine, DDP across machines). +- Horovod (``accelerator='horovod'``) (multi-machine, multi-gpu, configured at runtime) +- TPUs (``tpu_cores=8|x``) (tpu or TPU pod) .. note:: If you request multiple GPUs or nodes without setting a mode, DDP will be automatically used. @@ -275,7 +279,7 @@ For a deeper understanding of what Lightning is doing, feel free to read this Data Parallel ^^^^^^^^^^^^^ -`DataParallel `_ (DP) splits a batch across k GPUs. +:class:`~torch.nn.DataParallel` (DP) splits a batch across k GPUs. That is, if you have a batch of 32 and use DP with 2 gpus, each GPU will process 16 samples, after which the root node will aggregate the results. @@ -289,7 +293,7 @@ after which the root node will aggregate the results. Distributed Data Parallel ^^^^^^^^^^^^^^^^^^^^^^^^^ -`DistributedDataParallel `_ (DDP) works as follows: +:class:`~torch.nn.parallel.DistributedDataParallel` (DDP) works as follows: 1. Each GPU across each node gets its own process. @@ -576,26 +580,26 @@ not allow 16-bit and DP training. We tried to get this to work, but it's an issu Below are the possible configurations we support. 
-+-------+---------+----+-----+---------+------------------------------------------------------------+ -| 1 GPU | 1+ GPUs | DP | DDP | 16-bit | command | -+=======+=========+====+=====+=========+============================================================+ -| Y | | | | | `Trainer(gpus=1)` | -+-------+---------+----+-----+---------+------------------------------------------------------------+ -| Y | | | | Y | `Trainer(gpus=1, precision=16)` | -+-------+---------+----+-----+---------+------------------------------------------------------------+ -| | Y | Y | | | `Trainer(gpus=k, accelerator='dp')` | -+-------+---------+----+-----+---------+------------------------------------------------------------+ -| | Y | | Y | | `Trainer(gpus=k, accelerator='ddp')` | -+-------+---------+----+-----+---------+------------------------------------------------------------+ -| | Y | | Y | Y | `Trainer(gpus=k, accelerator='ddp', precision=16)` | -+-------+---------+----+-----+---------+------------------------------------------------------------+ ++-------+---------+----+-----+--------+------------------------------------------------------------+ +| 1 GPU | 1+ GPUs | DP | DDP | 16-bit | command | ++=======+=========+====+=====+========+============================================================+ +| Y | | | | | `Trainer(gpus=1)` | ++-------+---------+----+-----+--------+------------------------------------------------------------+ +| Y | | | | Y | `Trainer(gpus=1, precision=16)` | ++-------+---------+----+-----+--------+------------------------------------------------------------+ +| | Y | Y | | | `Trainer(gpus=k, accelerator='dp')` | ++-------+---------+----+-----+--------+------------------------------------------------------------+ +| | Y | | Y | | `Trainer(gpus=k, accelerator='ddp')` | ++-------+---------+----+-----+--------+------------------------------------------------------------+ +| | Y | | Y | Y | `Trainer(gpus=k, accelerator='ddp', precision=16)` | ++-------+---------+----+-----+--------+------------------------------------------------------------+ Implement Your Own Distributed (DDP) training ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ If you need your own way to init PyTorch DDP you can override :meth:`pytorch_lightning.plugins.ddp_plugin.DDPPlugin.init_ddp_connection`. -If you also need to use your own DDP implementation, override: :meth:`pytorch_lightning.plugins.ddp_plugin.DDPPlugin.configure_ddp`. +If you also need to use your own DDP implementation, override :meth:`pytorch_lightning.plugins.ddp_plugin.DDPPlugin.configure_ddp`. ---------- @@ -694,9 +698,7 @@ Reference: https://arxiv.org/abs/1811.06965 .. note:: DDPSequentialPlugin is currently supported only for Pytorch 1.6. -To get started, install FairScale through extras using with ``pip install pytorch-lightning["extra"]`` - -or directly using +To get started, install FairScale using the command below. .. code-block:: bash diff --git a/docs/source/trainer.rst b/docs/source/trainer.rst index 392edf8ce11a2..634a0c5d3d9dc 100644 --- a/docs/source/trainer.rst +++ b/docs/source/trainer.rst @@ -141,9 +141,9 @@ So you can run it like so: .. note:: If you want to stop a training run early, you can press "Ctrl + C" on your keyboard. - The trainer will catch the `KeyboardInterrupt` and attempt a graceful shutdown, including - running callbacks such as `on_train_end`. The trainer object will also set an attribute - `interrupted` to `True` in such cases. 
If you have a callback which shuts down compute + The trainer will catch the ``KeyboardInterrupt`` and attempt a graceful shutdown, including + running callbacks such as ``on_train_end``. The trainer object will also set an attribute + ``interrupted`` to ``True`` in such cases. If you have a callback which shuts down compute resources, for example, you can conditionally run the shutdown logic for only uninterrupted runs. ------------ @@ -220,13 +220,13 @@ accelerator The accelerator backend to use (previously known as distributed_backend). -- (```dp```) is DataParallel (split batch among GPUs of same machine) -- (```ddp```) is DistributedDataParallel (each gpu on each node trains, and syncs grads) -- (```ddp_cpu```) is DistributedDataParallel on CPU (same as `ddp`, but does not use GPUs. +- (``'dp'``) is DataParallel (split batch among GPUs of same machine) +- (``'ddp'``) is DistributedDataParallel (each gpu on each node trains, and syncs grads) +- (``'ddp_cpu'``) is DistributedDataParallel on CPU (same as ``'ddp'``, but does not use GPUs. Useful for multi-node CPU training or single-node debugging. Note that this will **not** give a speedup on a single node, since Torch already makes efficient use of multiple CPUs on a single machine.) -- (```ddp2```) dp on node, ddp across nodes. Useful for things like increasing +- (``'ddp2'``) dp on node, ddp across nodes. Useful for things like increasing the number of negative samples .. testcode:: @@ -245,7 +245,7 @@ Example:: # ddp2 = DistributedDataParallel + dp trainer = Trainer(gpus=2, num_nodes=2, accelerator='ddp2') -.. note:: This option does not apply to TPU. TPUs use ```ddp``` by default (over each core) +.. note:: This option does not apply to TPU. TPUs use ``'ddp'`` by default (over each core) You can also modify hardware behavior by subclassing an existing accelerator to adjust for your needs. @@ -619,7 +619,7 @@ will need to be set up to use remote filepaths. distributed_backend ^^^^^^^^^^^^^^^^^^^ -This has been renamed "accelerator". +Deprecated: This has been renamed ``accelerator``. fast_dev_run ^^^^^^^^^^^^ @@ -818,7 +818,7 @@ Options: # log only the min and max memory on the master node trainer = Trainer(log_gpu_memory='min_max') -.. note:: Might slow performance because it uses the output of nvidia-smi. +.. note:: Might slow performance because it uses the output of ``nvidia-smi``. flush_logs_every_n_steps ^^^^^^^^^^^^^^^^^^^^^^^^ @@ -1099,7 +1099,9 @@ as you request. Your effective batch size is batch_size * total tpu cores. -.. note:: No need to add a DistributedDataSampler, Lightning automatically does it for you. +.. note:: + No need to add a :class:`~torch.utils.data.distributed.DistributedSampler`, + Lightning automatically does it for you. This parameter can be either 1 or 8. diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 34072c5e43a61..a4330b401936d 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -737,7 +737,7 @@ def validation_step(self, batch, batch_idx): out = self(x) return out - def validation_epoch_end(self, val_step_outputs): + def validation_step_end(self, val_step_outputs): for out in val_step_outputs: # do something with these @@ -745,9 +745,7 @@ def validation_epoch_end(self, val_step_outputs): See the :ref:`multi_gpu` guide for more details. 
""" - def validation_epoch_end( - self, outputs: List[Any] - ) -> None: + def validation_epoch_end(self, outputs: List[Any]) -> None: """ Called at the end of the validation epoch with the outputs of all validation steps. @@ -914,7 +912,7 @@ def test_step(self, batch, batch_idx): out = self.encoder(x) return out - def test_epoch_end(self, output_results): + def test_step_end(self, output_results): # this out is now the full size of the batch all_test_step_outs = output_results.out loss = nce_loss(all_test_step_outs) From d1e97a4f114a285349e31e330c7bf8937bc1ee04 Mon Sep 17 00:00:00 2001 From: Jonathan Chang <31893406+cccntu@users.noreply.github.com> Date: Sun, 27 Dec 2020 07:22:53 +0800 Subject: [PATCH 050/136] Fix typo in doc (#5270) --- docs/source/optimizers.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/optimizers.rst b/docs/source/optimizers.rst index 446a21b5b1997..53d98076e8529 100644 --- a/docs/source/optimizers.rst +++ b/docs/source/optimizers.rst @@ -73,7 +73,7 @@ Under the hood Lightning does the following: optimizer.step() optimizer.zero_grad() - for scheduler in scheduler: + for scheduler in schedulers: scheduler.step() In the case of multiple optimizers, Lightning does the following: @@ -87,7 +87,7 @@ In the case of multiple optimizers, Lightning does the following: train_step(opt) opt.step() - for scheduler in scheduler: + for scheduler in schedulers: scheduler.step() @@ -179,7 +179,7 @@ Lightning will call each optimizer sequentially: train_step(opt) opt.step() - for scheduler in scheduler: + for scheduler in schedulers: scheduler.step() ---------- From 9ebbfece5e2c56bb5300cfffafb129e399492469 Mon Sep 17 00:00:00 2001 From: chaton Date: Mon, 28 Dec 2020 15:34:18 +0100 Subject: [PATCH 051/136] Trainer.test should return only test metrics (#5214) * resolve bug * merge tests --- .../logger_connector/epoch_result_store.py | 2 +- .../logger_connector/logger_connector.py | 18 ++++--- .../test_eval_loop_logging_1_0.py | 48 ++++++------------- 3 files changed, 28 insertions(+), 40 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py index 6d206f3dd929e..dd12a2970727a 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py @@ -399,7 +399,7 @@ def update_logger_connector(self) -> None: callback_metrics.update(epoch_log_metrics) callback_metrics.update(forked_metrics) - if not is_train: + if not is_train and self.trainer.testing: logger_connector.evaluation_callback_metrics.update(callback_metrics) # update callback_metrics diff --git a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py index 6fdd2f0d57b63..54bf2f9a90cea 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py @@ -11,8 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import os from copy import deepcopy +import os from pprint import pprint from typing import Iterable, Union @@ -273,10 +273,13 @@ def _track_callback_metrics(self, eval_results, using_eval_result): if isinstance(eval_results, list): for eval_result in eval_results: self.trainer.logger_connector.callback_metrics.update(eval_result.callback_metrics) - self.trainer.logger_connector.evaluation_callback_metrics.update(eval_result.callback_metrics) + if self.trainer.testing: + self.trainer.logger_connector.evaluation_callback_metrics.update( + eval_result.callback_metrics) else: self.trainer.logger_connector.callback_metrics.update(eval_results.callback_metrics) - self.trainer.logger_connector.evaluation_callback_metrics.update(eval_results.callback_metrics) + if self.trainer.testing: + self.trainer.logger_connector.evaluation_callback_metrics.update(eval_results.callback_metrics) else: flat = {} if isinstance(eval_results, list): @@ -292,7 +295,8 @@ def _track_callback_metrics(self, eval_results, using_eval_result): flat['checkpoint_on'] = flat['val_loss'] flat['early_stop_on'] = flat['val_loss'] self.trainer.logger_connector.callback_metrics.update(flat) - self.trainer.logger_connector.evaluation_callback_metrics.update(flat) + if self.trainer.testing: + self.trainer.logger_connector.evaluation_callback_metrics.update(flat) else: # with a scalar return, auto set it to "val_loss" for callbacks if isinstance(eval_results, torch.Tensor): @@ -305,7 +309,8 @@ def _track_callback_metrics(self, eval_results, using_eval_result): flat['checkpoint_on'] = flat['val_loss'] flat['early_stop_on'] = flat['val_loss'] self.trainer.logger_connector.callback_metrics.update(flat) - self.trainer.logger_connector.evaluation_callback_metrics.update(flat) + if self.trainer.testing: + self.trainer.logger_connector.evaluation_callback_metrics.update(flat) def __process_eval_epoch_end_results_and_log_legacy_update(self, prog_bar_metrics, log_metrics, callback_metrics): # eval loop returns all metrics @@ -322,7 +327,8 @@ def __process_eval_epoch_end_results_and_log_legacy_update(self, prog_bar_metric callback_metrics.update(log_metrics) callback_metrics.update(prog_bar_metrics) self.trainer.logger_connector.callback_metrics.update(callback_metrics) - self.trainer.logger_connector.evaluation_callback_metrics.update(callback_metrics) + if self.trainer.testing: + self.trainer.logger_connector.evaluation_callback_metrics.update(callback_metrics) if len(dataloader_result_metrics) > 0: self.eval_loop_results.append(dataloader_result_metrics) diff --git a/tests/trainer/logging_tests/test_eval_loop_logging_1_0.py b/tests/trainer/logging_tests/test_eval_loop_logging_1_0.py index 76baa9237955f..da08ffe710e75 100644 --- a/tests/trainer/logging_tests/test_eval_loop_logging_1_0.py +++ b/tests/trainer/logging_tests/test_eval_loop_logging_1_0.py @@ -25,7 +25,7 @@ import torch from torch.utils.data import DataLoader, Dataset -from pytorch_lightning import Trainer, callbacks, seed_everything +from pytorch_lightning import callbacks, seed_everything, Trainer from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.loggers import TensorBoardLogger @@ -813,7 +813,7 @@ def validation_step(self, batch, batch_idx): def test_step(self, batch, batch_idx): output = self.layer(batch) loss = self.loss(batch, output) - self.log('fake_test_acc', loss) + self.log('test_loss', loss) return {"y": loss} model = ExtendedModel() @@ -825,7 +825,7 @@ def test_step(self, 
batch, batch_idx): logger=TensorBoardLogger(tmpdir), limit_train_batches=2, limit_val_batches=2, - limit_test_batches=0, + limit_test_batches=2, max_epochs=2, progress_bar_refresh_rate=1, ) @@ -877,33 +877,15 @@ def get_metrics_at_idx(idx): expected = torch.stack(model.val_losses[4:]).mean() assert get_metrics_at_idx(6)["valid_loss_1"] == expected - -def test_progress_bar_dict_contains_values_on_test_epoch_end(tmpdir): - class TestModel(BoringModel): - def test_step(self, *args): - self.log("foo", torch.tensor(self.current_epoch), on_step=False, on_epoch=True, prog_bar=True) - - def test_epoch_end(self, *_): - self.epoch_end_called = True - self.log('foo_2', torch.tensor(self.current_epoch), prog_bar=True, - on_epoch=True, sync_dist=True, sync_dist_op='sum') - - def on_test_epoch_end(self, *_): - self.on_test_epoch_end_called = True - assert self.trainer.progress_bar_dict["foo"] == self.current_epoch - assert self.trainer.progress_bar_dict["foo_2"] == self.current_epoch - - trainer = Trainer( - default_root_dir=tmpdir, - max_epochs=2, - limit_train_batches=1, - num_sanity_val_steps=2, - checkpoint_callback=False, - logger=False, - weights_summary=None, - progress_bar_refresh_rate=0, - ) - model = TestModel() - trainer.test(model) - assert model.epoch_end_called - assert model.on_test_epoch_end_called + results = trainer.test(model) + expected_callback_metrics = { + 'train_loss', + 'valid_loss_0_epoch', + 'valid_loss_0', + 'debug_epoch', + 'valid_loss_1', + 'test_loss', + 'val_loss' + } + assert set(trainer.callback_metrics) == expected_callback_metrics + assert set(results[0]) == {'test_loss', 'debug_epoch'} From eb1d61caaf49e6b87c0edd50472eb3474da17376 Mon Sep 17 00:00:00 2001 From: Nicki Skafte Date: Mon, 28 Dec 2020 20:32:53 +0100 Subject: [PATCH 052/136] remove docs (#5287) Co-authored-by: Rohit Gupta --- docs/source/multi_gpu.rst | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/docs/source/multi_gpu.rst b/docs/source/multi_gpu.rst index a737ad4a70fd4..9d868406e2985 100644 --- a/docs/source/multi_gpu.rst +++ b/docs/source/multi_gpu.rst @@ -239,23 +239,6 @@ Note in particular the difference between `gpus=0`, `gpus=[0]` and `gpus="0"`. to be in "exclusive mode", such that only one process at a time can access them. For more details see the :ref:`Trainer guide `. - -Remove CUDA flags -^^^^^^^^^^^^^^^^^ - -CUDA flags make certain GPUs visible to your script. -Lightning sets these for you automatically, there's NO NEED to do this yourself. - -.. testcode:: - - # lightning will set according to what you give the trainer - os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" - os.environ["CUDA_VISIBLE_DEVICES"] = "0" - -However, when using a cluster, Lightning will NOT set these flags (and you should not either). -SLURM will set these for you. -For more details see the :ref:`SLURM cluster guide `. 
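For reference, device selection is handled by the ``gpus`` Trainer argument described earlier in this file; a minimal sketch (illustrative only, assuming the standard ``Trainer(gpus=...)`` API and at least two visible GPUs):

    from pytorch_lightning import Trainer

    # Lightning (or SLURM on a cluster) manages CUDA visibility for you
    trainer = Trainer(gpus=0)      # CPU only
    trainer = Trainer(gpus=[0])    # train on GPU 0
    trainer = Trainer(gpus="0")    # same selection, given as a string of device ids
    trainer = Trainer(gpus=2)      # train on the first two GPUs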
- ---------- Distributed modes From 0c7c9e85404ce4be33cc65f95a029b6bc03d84e4 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Tue, 29 Dec 2020 17:19:02 +0900 Subject: [PATCH 053/136] Apply isort to `pl_examples/` (#5291) * Remove examples from isort ignore list * Apply isort --- pl_examples/basic_examples/autoencoder.py | 9 ++++----- pl_examples/basic_examples/backbone_image_classifier.py | 4 ++-- pl_examples/basic_examples/conv_sequential_example.py | 4 ++-- pl_examples/basic_examples/dali_image_classifier.py | 8 ++++---- pl_examples/basic_examples/simple_image_classifier.py | 2 +- pl_examples/bug_report_model.py | 3 ++- .../domain_templates/computer_vision_fine_tuning.py | 9 ++++----- .../domain_templates/generative_adversarial_net.py | 8 ++++---- pl_examples/domain_templates/imagenet.py | 4 ++-- pl_examples/domain_templates/reinforce_learn_Qnet.py | 6 +++--- pl_examples/domain_templates/semantic_segmentation.py | 8 ++++---- pyproject.toml | 1 - 12 files changed, 32 insertions(+), 34 deletions(-) diff --git a/pl_examples/basic_examples/autoencoder.py b/pl_examples/basic_examples/autoencoder.py index 91f7ac0a1569d..13564b8f7480c 100644 --- a/pl_examples/basic_examples/autoencoder.py +++ b/pl_examples/basic_examples/autoencoder.py @@ -15,17 +15,16 @@ from argparse import ArgumentParser import torch -import torch.nn.functional as F from torch import nn -from torch.utils.data import DataLoader -from torch.utils.data import random_split +import torch.nn.functional as F +from torch.utils.data import DataLoader, random_split +from pl_examples import cli_lightning_logo, TORCHVISION_AVAILABLE import pytorch_lightning as pl -from pl_examples import TORCHVISION_AVAILABLE, cli_lightning_logo if TORCHVISION_AVAILABLE: - from torchvision.datasets.mnist import MNIST from torchvision import transforms + from torchvision.datasets.mnist import MNIST else: from tests.base.datasets import MNIST diff --git a/pl_examples/basic_examples/backbone_image_classifier.py b/pl_examples/basic_examples/backbone_image_classifier.py index bb1daad301d08..a98a27c591982 100644 --- a/pl_examples/basic_examples/backbone_image_classifier.py +++ b/pl_examples/basic_examples/backbone_image_classifier.py @@ -18,12 +18,12 @@ from torch.nn import functional as F from torch.utils.data import DataLoader, random_split +from pl_examples import cli_lightning_logo, DATASETS_PATH, TORCHVISION_AVAILABLE import pytorch_lightning as pl -from pl_examples import DATASETS_PATH, TORCHVISION_AVAILABLE, cli_lightning_logo if TORCHVISION_AVAILABLE: - from torchvision.datasets.mnist import MNIST from torchvision import transforms + from torchvision.datasets.mnist import MNIST else: from tests.base.datasets import MNIST diff --git a/pl_examples/basic_examples/conv_sequential_example.py b/pl_examples/basic_examples/conv_sequential_example.py index 39634084860c2..2492b24287929 100644 --- a/pl_examples/basic_examples/conv_sequential_example.py +++ b/pl_examples/basic_examples/conv_sequential_example.py @@ -20,16 +20,16 @@ To run: python conv_model_sequential_example.py --accelerator ddp --gpus 4 --max_epochs 1 --batch_size 256 --use_ddp_sequential """ -import math from argparse import ArgumentParser +import math import torch import torch.nn as nn import torch.nn.functional as F import torchvision -import pytorch_lightning as pl from pl_examples import cli_lightning_logo +import pytorch_lightning as pl from pytorch_lightning import Trainer from pytorch_lightning.metrics.functional import accuracy from pytorch_lightning.plugins.ddp_sequential_plugin 
import DDPSequentialPlugin diff --git a/pl_examples/basic_examples/dali_image_classifier.py b/pl_examples/basic_examples/dali_image_classifier.py index e628f5daf8a53..84414b9eca560 100644 --- a/pl_examples/basic_examples/dali_image_classifier.py +++ b/pl_examples/basic_examples/dali_image_classifier.py @@ -13,29 +13,29 @@ # limitations under the License. from abc import ABC from argparse import ArgumentParser +from distutils.version import LooseVersion from random import shuffle from warnings import warn -from distutils.version import LooseVersion import numpy as np import torch from torch.nn import functional as F from torch.utils.data import random_split +from pl_examples import cli_lightning_logo, DALI_AVAILABLE, TORCHVISION_AVAILABLE import pytorch_lightning as pl -from pl_examples import TORCHVISION_AVAILABLE, DALI_AVAILABLE, cli_lightning_logo if TORCHVISION_AVAILABLE: - from torchvision.datasets.mnist import MNIST from torchvision import transforms + from torchvision.datasets.mnist import MNIST else: from tests.base.datasets import MNIST if DALI_AVAILABLE: + from nvidia.dali import __version__ as dali_version from nvidia.dali import ops from nvidia.dali.pipeline import Pipeline from nvidia.dali.plugin.pytorch import DALIClassificationIterator - from nvidia.dali import __version__ as dali_version NEW_DALI_API = LooseVersion(dali_version) >= LooseVersion('0.28.0') if NEW_DALI_API: diff --git a/pl_examples/basic_examples/simple_image_classifier.py b/pl_examples/basic_examples/simple_image_classifier.py index 894eeea619ba9..630ea73974e7f 100644 --- a/pl_examples/basic_examples/simple_image_classifier.py +++ b/pl_examples/basic_examples/simple_image_classifier.py @@ -18,9 +18,9 @@ import torch from torch.nn import functional as F -import pytorch_lightning as pl from pl_examples import cli_lightning_logo from pl_examples.basic_examples.mnist_datamodule import MNISTDataModule +import pytorch_lightning as pl class LitClassifier(pl.LightningModule): diff --git a/pl_examples/bug_report_model.py b/pl_examples/bug_report_model.py index 30345122e251f..1351048711df4 100644 --- a/pl_examples/bug_report_model.py +++ b/pl_examples/bug_report_model.py @@ -20,11 +20,12 @@ # -------------------------------------------- # -------------------------------------------- import os + import torch from torch.utils.data import Dataset from pl_examples import cli_lightning_logo -from pytorch_lightning import Trainer, LightningModule +from pytorch_lightning import LightningModule, Trainer class RandomDataset(Dataset): diff --git a/pl_examples/domain_templates/computer_vision_fine_tuning.py b/pl_examples/domain_templates/computer_vision_fine_tuning.py index 4392ac47e837f..d643562a82400 100644 --- a/pl_examples/domain_templates/computer_vision_fine_tuning.py +++ b/pl_examples/domain_templates/computer_vision_fine_tuning.py @@ -38,22 +38,21 @@ from collections import OrderedDict from pathlib import Path from tempfile import TemporaryDirectory -from typing import Optional, Generator, Union +from typing import Generator, Optional, Union import torch -import torch.nn.functional as F from torch import optim from torch.nn import Module +import torch.nn.functional as F from torch.optim.lr_scheduler import MultiStepLR from torch.optim.optimizer import Optimizer from torch.utils.data import DataLoader -from torchvision import models -from torchvision import transforms +from torchvision import models, transforms from torchvision.datasets import ImageFolder from torchvision.datasets.utils import download_and_extract_archive 
-import pytorch_lightning as pl from pl_examples import cli_lightning_logo +import pytorch_lightning as pl from pytorch_lightning import _logger as log BN_TYPES = (torch.nn.BatchNorm1d, torch.nn.BatchNorm2d, torch.nn.BatchNorm3d) diff --git a/pl_examples/domain_templates/generative_adversarial_net.py b/pl_examples/domain_templates/generative_adversarial_net.py index b0c324c193574..5227d4defcda3 100644 --- a/pl_examples/domain_templates/generative_adversarial_net.py +++ b/pl_examples/domain_templates/generative_adversarial_net.py @@ -19,20 +19,20 @@ tensorboard --logdir default """ -import os from argparse import ArgumentParser, Namespace +import os import numpy as np import torch import torch.nn as nn import torch.nn.functional as F # noqa -import torchvision -import torchvision.transforms as transforms from torch.utils.data import DataLoader +import torchvision from torchvision.datasets import MNIST +import torchvision.transforms as transforms from pl_examples import cli_lightning_logo -from pytorch_lightning.core import LightningModule, LightningDataModule +from pytorch_lightning.core import LightningDataModule, LightningModule from pytorch_lightning.trainer import Trainer diff --git a/pl_examples/domain_templates/imagenet.py b/pl_examples/domain_templates/imagenet.py index cc36f3542a1c8..d379b5d3e9a6c 100644 --- a/pl_examples/domain_templates/imagenet.py +++ b/pl_examples/domain_templates/imagenet.py @@ -30,8 +30,8 @@ python imagenet.py --help """ -import os from argparse import ArgumentParser, Namespace +import os import torch import torch.nn.functional as F @@ -44,8 +44,8 @@ import torchvision.models as models import torchvision.transforms as transforms -import pytorch_lightning as pl from pl_examples import cli_lightning_logo +import pytorch_lightning as pl from pytorch_lightning.core import LightningModule diff --git a/pl_examples/domain_templates/reinforce_learn_Qnet.py b/pl_examples/domain_templates/reinforce_learn_Qnet.py index 6aee8bb6038c1..6a006326384b6 100644 --- a/pl_examples/domain_templates/reinforce_learn_Qnet.py +++ b/pl_examples/domain_templates/reinforce_learn_Qnet.py @@ -33,8 +33,8 @@ """ import argparse -from collections import OrderedDict, deque, namedtuple -from typing import Tuple, List +from collections import deque, namedtuple, OrderedDict +from typing import List, Tuple import gym import numpy as np @@ -45,8 +45,8 @@ from torch.utils.data import DataLoader from torch.utils.data.dataset import IterableDataset -import pytorch_lightning as pl from pl_examples import cli_lightning_logo +import pytorch_lightning as pl class DQN(nn.Module): diff --git a/pl_examples/domain_templates/semantic_segmentation.py b/pl_examples/domain_templates/semantic_segmentation.py index 2e718a37ac4b0..7b5131d51d1fc 100644 --- a/pl_examples/domain_templates/semantic_segmentation.py +++ b/pl_examples/domain_templates/semantic_segmentation.py @@ -12,20 +12,20 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from argparse import ArgumentParser, Namespace import os import random -from argparse import ArgumentParser, Namespace import numpy as np +from PIL import Image import torch import torch.nn.functional as F -import torchvision.transforms as transforms -from PIL import Image from torch.utils.data import DataLoader, Dataset +import torchvision.transforms as transforms -import pytorch_lightning as pl from pl_examples import cli_lightning_logo from pl_examples.domain_templates.unet import UNet +import pytorch_lightning as pl from pytorch_lightning.loggers import WandbLogger DEFAULT_VOID_LABELS = (0, 1, 2, 3, 4, 5, 6, 9, 10, 14, 15, 16, 18, 29, 30, -1) diff --git a/pyproject.toml b/pyproject.toml index 01e416aa51d8b..58c92aff5cebf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,7 +23,6 @@ known_first_party = [ "tests", ] skip_glob = [ - "pl_examples/*", "pytorch_lightning/accelerators/*", "pytorch_lightning/callbacks/*", "pytorch_lightning/cluster_environments/*", From dabfeca92e0702e55f09ac53e9412672cd258cd3 Mon Sep 17 00:00:00 2001 From: Nicki Skafte Date: Tue, 29 Dec 2020 10:06:28 +0100 Subject: [PATCH 054/136] [Metrics] [Docs] Add section about device placement (#5280) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * update docs * Update docs/source/metrics.rst Co-authored-by: Shreeyak * Update docs/source/metrics.rst Co-authored-by: Shreeyak * Update docs/source/metrics.rst Co-authored-by: Shreeyak * Update docs/source/metrics.rst * Update docs/source/metrics.rst * Update docs/source/metrics.rst * Apply suggestions from code review Co-authored-by: Adrian Wälchli * Update docs/source/metrics.rst Co-authored-by: Jirka Borovec * Update docs/source/metrics.rst * Update docs/source/metrics.rst * try fix failing doc test Co-authored-by: Roger Shieh Co-authored-by: Shreeyak Co-authored-by: Rohit Gupta Co-authored-by: Adrian Wälchli Co-authored-by: Jirka Borovec --- docs/source/metrics.rst | 50 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/docs/source/metrics.rst b/docs/source/metrics.rst index 387cbc3bd7482..d6d9cb8fb0ae7 100644 --- a/docs/source/metrics.rst +++ b/docs/source/metrics.rst @@ -137,6 +137,56 @@ This metrics API is independent of PyTorch Lightning. Metrics can directly be us To change this, after initializing the metric, the method ``.persistent(mode)`` can be used to enable (``mode=True``) or disable (``mode=False``) this behaviour. +******************* +Metrics and devices +******************* + +Metrics are simple subclasses of :class:`~torch.nn.Module` and their metric states behave +similar to buffers and parameters of modules. This means that metrics states should +be moved to the same device as the input of the metric: + +.. code-block:: python + + import torch + from pytorch_lightning.metrics import Accuracy + + target = torch.tensor([1, 1, 0, 0], device=torch.device("cuda", 0)) + preds = torch.tensor([0, 1, 0, 0], device=torch.device("cuda", 0)) + + # Metric states are always initialized on cpu, and needs to be moved to + # the correct device + confmat = Accuracy(num_classes=2).to(torch.device("cuda", 0)) + out = confmat(preds, target) + print(out.device) # cuda:0 + +However, when **properly defined** inside a :class:`~pytorch_lightning.core.lightning.LightningModule` +, Lightning will automatically move the metrics to the same device as the data. 
Being +**properly defined** means that the metric is correctly identified as a child module of the +model (check ``.children()`` attribute of the model). Therefore, metrics cannot be placed +in native python ``list`` and ``dict``, as they will not be correctly identified +as child modules. Instead of ``list`` use :class:`~torch.nn.ModuleList` and instead of +``dict`` use :class:`~torch.nn.ModuleDict`. + +.. testcode:: + + class MyModule(LightningModule): + def __init__(self): + ... + # valid ways metrics will be identified as child modules + self.metric1 = pl.metrics.Accuracy() + self.metric2 = torch.nn.ModuleList(pl.metrics.Accuracy()) + self.metric3 = torch.nn.ModuleDict({'accuracy': Accuracy()}) + + def training_step(self, batch, batch_idx): + # all metrics will be on the same device as the input batch + data, target = batch + preds = self(data) + ... + val1 = self.metric1(preds, target) + val2 = self.metric2[0](preds, target) + val3 = self.metric3['accuracy'](preds, target) + + ********************* Implementing a Metric ********************* From 4913cbb987a0516f8b33c016134b19c0588d107a Mon Sep 17 00:00:00 2001 From: Tadej Svetina Date: Tue, 29 Dec 2020 22:09:10 +0100 Subject: [PATCH 055/136] Fix metric state reset (#5273) * Fix metric state reset * Fix test * Improve formatting Co-authored-by: Ananya Harsh Jha --- pytorch_lightning/metrics/metric.py | 5 +++-- tests/metrics/test_metric.py | 23 +++++++++++++++++++++++ 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/metrics/metric.py b/pytorch_lightning/metrics/metric.py index 0f61b94c55139..a21242c3bdc7e 100644 --- a/pytorch_lightning/metrics/metric.py +++ b/pytorch_lightning/metrics/metric.py @@ -94,7 +94,8 @@ def add_state( reset to this value when ``self.reset()`` is called. dist_reduce_fx (Optional): Function to reduce state accross mutliple processes in distributed mode. If value is ``"sum"``, ``"mean"``, or ``"cat"``, we will use ``torch.sum``, ``torch.mean``, - and ``torch.cat`` respectively, each with argument ``dim=0``. The user can also pass a custom + and ``torch.cat`` respectively, each with argument ``dim=0``. Note that the ``"cat"`` reduction + only makes sense if the state is a list, and not a tensor. The user can also pass a custom function in this parameter. persistent (Optional): whether the state will be saved as part of the modules ``state_dict``. Default is ``False``. 
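For reference, a minimal sketch of a custom metric using both kinds of state (illustrative only; the class and state names are made up, but the ``add_state`` calls follow the API documented in this hunk):

    import torch
    from pytorch_lightning.metrics import Metric

    class MeanOverUpdates(Metric):
        def __init__(self):
            super().__init__()
            # tensor states: reduced with "sum" across processes; reset() copies the
            # tensor default back onto the state's current device
            self.add_state("total", default=torch.tensor(0.0), dist_reduce_fx="sum")
            self.add_state("count", default=torch.tensor(0.0), dist_reduce_fx="sum")
            # list state: the "cat" reduction only makes sense here; reset() must
            # restore an empty list, which is what the reset() change below ensures
            # by checking the type of the default instead of the current value
            self.add_state("seen", default=[], dist_reduce_fx="cat")

        def update(self, x: torch.Tensor):
            self.total += x.sum()
            self.count += x.numel()
            self.seen.append(x.detach())  # kept only to illustrate a list state

        def compute(self):
            return self.total / self.count

With the fix below, calling ``reset()`` restores ``seen`` to an empty list even if it has already been concatenated into a tensor (e.g. by the ``"cat"`` reduction).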
@@ -244,7 +245,7 @@ def reset(self): """ for attr, default in self._defaults.items(): current_val = getattr(self, attr) - if isinstance(current_val, torch.Tensor): + if isinstance(default, torch.Tensor): setattr(self, attr, deepcopy(default).to(current_val.device)) else: setattr(self, attr, deepcopy(default)) diff --git a/tests/metrics/test_metric.py b/tests/metrics/test_metric.py index d97cd1a176cf2..67e85624379a5 100644 --- a/tests/metrics/test_metric.py +++ b/tests/metrics/test_metric.py @@ -26,6 +26,20 @@ def compute(self): pass +class DummyList(Metric): + name = "DummyList" + + def __init__(self): + super().__init__() + self.add_state("x", list(), dist_reduce_fx=None) + + def update(self): + pass + + def compute(self): + pass + + def test_inherit(): a = Dummy() @@ -77,12 +91,21 @@ def test_reset(): class A(Dummy): pass + class B(DummyList): + pass + a = A() assert a.x == 0 a.x = torch.tensor(5) a.reset() assert a.x == 0 + b = B() + assert isinstance(b.x, list) and len(b.x) == 0 + b.x = torch.tensor(5) + b.reset() + assert isinstance(b.x, list) and len(b.x) == 0 + def test_update(): class A(Dummy): From dd98a60e901ccd511136bd955e0964eaa5b4e8dd Mon Sep 17 00:00:00 2001 From: Sugato Ray Date: Wed, 30 Dec 2020 21:33:22 -0600 Subject: [PATCH 056/136] Fixed typo in docs for optimizer_idx (#5310) There were four instances where `optimizer_idx` was the argument for **`optimizer_step()`** under [Step optimizers at arbitrary intervals][#link-to-docs]. However, instead of using it what was being used inside (erroneously) was `optimizer_i`. [#link-to-docs]: https://pytorch-lightning.readthedocs.io/en/latest/optimizers.html#step-optimizers-at-arbitrary-intervals --- docs/source/optimizers.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/optimizers.rst b/docs/source/optimizers.rst index 53d98076e8529..2680c01e4c7ec 100644 --- a/docs/source/optimizers.rst +++ b/docs/source/optimizers.rst @@ -201,12 +201,12 @@ For example, here step optimizer A every 2 batches and optimizer B every 4 batch # Alternating schedule for optimizer steps (ie: GANs) def optimizer_step(self, current_epoch, batch_nb, optimizer, optimizer_idx, closure, on_tpu=False, using_native_amp=False, using_lbfgs=False): # update generator opt every 2 steps - if optimizer_i == 0: + if optimizer_idx == 0: if batch_nb % 2 == 0 : optimizer.step(closure=closure) # update discriminator opt every 4 steps - if optimizer_i == 1: + if optimizer_idx == 1: if batch_nb % 4 == 0 : optimizer.step(closure=closure) @@ -220,11 +220,11 @@ For example, here step optimizer A every 2 batches and optimizer B every 4 batch # Alternating schedule for optimizer steps (ie: GANs) def optimizer_step(self, current_epoch, batch_nb, optimizer, optimizer_idx, closure, on_tpu=False, using_native_amp=False, using_lbfgs=False): # update generator opt every 2 steps - if optimizer_i == 0: + if optimizer_idx == 0: optimizer.step(closure=closure, make_optimizer_step=(batch_nb % 2) == 0) # update discriminator opt every 4 steps - if optimizer_i == 1: + if optimizer_idx == 1: optimizer.step(closure=closure, make_optimizer_step=(batch_nb % 4) == 0) Here we add a learning-rate warm up From 64163c2662e0aae420388f84ca1525f25bd23e24 Mon Sep 17 00:00:00 2001 From: Nicki Skafte Date: Thu, 31 Dec 2020 09:25:30 +0100 Subject: [PATCH 057/136] [Docs] Mention that datamodules can also be used with `.test()` method (#5286) * docs * Apply suggestions from code review Co-authored-by: Rohit Gupta * ref Co-authored-by: Rohit Gupta --- 
docs/source/datamodules.rst | 1 + docs/source/test_set.rst | 24 +++++++++++++++++++++++- 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/docs/source/datamodules.rst b/docs/source/datamodules.rst index b7dd9ec92e71d..2589ac605ee11 100644 --- a/docs/source/datamodules.rst +++ b/docs/source/datamodules.rst @@ -268,6 +268,7 @@ Use this method to generate the val dataloader. Usually you just wrap the datas def val_dataloader(self): return DataLoader(self.mnist_val, batch_size=64) +.. _datamodule-test-dataloader-label: test_dataloader ^^^^^^^^^^^^^^^ diff --git a/docs/source/test_set.rst b/docs/source/test_set.rst index 8d8edce672e11..9fe9640aa723b 100644 --- a/docs/source/test_set.rst +++ b/docs/source/test_set.rst @@ -3,6 +3,10 @@ Test set ======== Lightning forces the user to run the test set separately to make sure it isn't evaluated by mistake. +Testing is performed using the ``trainer`` object's ``.test()`` method. + +.. automethod:: pytorch_lightning.trainer.Trainer.test + :noindex: ---------- @@ -82,4 +86,22 @@ is not available at the time your model was declared. trainer.test(test_dataloaders=test) You can either pass in a single dataloader or a list of them. This optional named -parameter can be used in conjunction with any of the above use cases. +parameter can be used in conjunction with any of the above use cases. Additionally, +you can also pass in an :ref:`datamodules` that have overridden the +:ref:`datamodule-test-dataloader-label` method. + +.. code-block:: python + + class MyDataModule(pl.LightningDataModule): + ... + def test_dataloader(self): + return DataLoader(...) + + # setup your datamodule + dm = MyDataModule(...) + + # test (pass in datamodule) + trainer.test(datamodule=dm) + + + From ab7512d7ba0b522114f97db58c8d0b9555d7b75a Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Thu, 31 Dec 2020 17:24:33 +0100 Subject: [PATCH 058/136] refactor python in GH actions (#5281) * refactor python in GH actions * . * . 
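The change below replaces inline ``python -c`` one-liners with multi-line ``shell: python`` steps, which avoids shell-quoting problems and keeps the requirement edits readable. The same pinning logic as a stand-alone sketch (illustrative only; the workflow keeps the code inline):

    # rewrite '>=' specifiers to '==' so the oldest supported versions are installed
    def pin_requirements(path: str) -> None:
        with open(path) as fp:
            req = fp.read().replace(">=", "==")
        with open(path, "w") as fp:
            fp.write(req)

    for fname in ("requirements.txt", "requirements/extra.txt", "requirements/test.txt"):
        pin_requirements(fname)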
--- .github/workflows/ci_test-full.yml | 41 ++++++++++++++++++++++-------- 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/.github/workflows/ci_test-full.yml b/.github/workflows/ci_test-full.yml index b87a1d8557843..719b374d76efb 100644 --- a/.github/workflows/ci_test-full.yml +++ b/.github/workflows/ci_test-full.yml @@ -47,26 +47,47 @@ jobs: if: runner.os == 'windows' run: | # remove Horovod from requirements - python -c "fname = 'requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if not line.startswith('horovod')] ; open(fname, 'w').writelines(lines)" + fname = 'requirements/extra.txt' + lines = [line for line in open(fname).readlines() if not line.startswith('horovod')] + open(fname, 'w').writelines(lines) + shell: python # versions <= 1.3 may have issues on mac with some BLAS ops due to missing mkl (https://github.com/pytorch/pytorch/issues/18996) - name: Adjust minimal for Python 3.8 and MacOS if: matrix.requires == 'minimal' && (runner.os == 'macOS' || matrix.python-version == 3.8) run : | - python -c "fname = 'requirements.txt' ; req = open(fname).read().replace('torch>=1.3', 'torch>=1.4') ; open(fname, 'w').write(req)" - python -c "fname = 'requirements/examples.txt' ; req = open(fname).read().replace('torchvision>=0.4.1', 'torchvision>=0.5.0') ; open(fname, 'w').write(req)" - python -c "fname = 'requirements/extra.txt' ; req = open(fname).read().replace('torchtext>=0.3.1', 'torchtext>=0.5.0') ; open(fname, 'w').write(req)" + fname = 'requirements.txt' + req = open(fname).read().replace('torch>=1.3', 'torch>=1.4') + open(fname, 'w').write(req) + + fname = 'requirements/examples.txt' + req = open(fname).read().replace('torchvision>=0.4.1', 'torchvision>=0.5.0') + open(fname, 'w').write(req) + + fname = 'requirements/extra.txt' + req = open(fname).read().replace('torchtext>=0.3.1', 'torchtext>=0.5.0') + open(fname, 'w').write(req) + shell: python - name: Set min. 
dependencies if: matrix.requires == 'minimal' run: | - python -c "fname = 'requirements.txt' ; req = open(fname).read().replace('>=', '==') ; open(fname, 'w').write(req)" - python -c "fname = 'requirements/extra.txt' ; req = open(fname).read().replace('>=', '==') ; open(fname, 'w').write(req)" - python -c "fname = 'requirements/loggers.txt' ; req = open(fname).read().replace('>=', '==') ; open(fname, 'w').write(req)" - python -c "fname = 'requirements/test.txt' ; req = open(fname).read().replace('>=', '==') ; open(fname, 'w').write(req)" - python -c "fname = 'requirements/examples.txt' ; req = open(fname).read().replace('>=', '==') ; open(fname, 'w').write(req)" + files = ( + 'requirements.txt', + 'requirements/extra.txt', + 'requirements/loggers.txt', + 'requirements/test.txt', + 'requirements/examples.txt', + ) + for fname in files: + req = open(fname).read().replace('>=', '==') + open(fname, 'w').write(req) + # remove Fairscale from requirements - python -c "fname = 'requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if 'fairscale' not in line] ; open(fname, 'w').writelines(lines)" + fname = 'requirements/extra.txt' + lines = [line for line in open(fname).readlines() if 'fairscale' not in line] + open(fname, 'w').writelines(lines) + shell: python # Note: This uses an internal pip API and may not always work # https://github.com/actions/cache/blob/master/examples.md#multiple-oss-in-a-workflow From d20fd8e5ab1a52747fee2cd53290a679d8b726d0 Mon Sep 17 00:00:00 2001 From: "Heewon Jeon(gogamza)" Date: Sat, 2 Jan 2021 22:29:02 +0900 Subject: [PATCH 059/136] supports --num-nodes on DDPSequentialPlugin() (#5327) --- pytorch_lightning/plugins/ddp_sequential_plugin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/plugins/ddp_sequential_plugin.py b/pytorch_lightning/plugins/ddp_sequential_plugin.py index 010f0ea1648a8..cb8740742db73 100644 --- a/pytorch_lightning/plugins/ddp_sequential_plugin.py +++ b/pytorch_lightning/plugins/ddp_sequential_plugin.py @@ -228,7 +228,7 @@ def _infer_check_num_gpus(self, trainer): Returns: The appropriate balance for the model """ if isinstance(self.balance, list): - if len(self.balance) != trainer.world_size: + if len(self.balance) != (trainer.world_size / trainer.num_nodes): raise MisconfigurationException( "Pipe currently only supports splitting the module onto all available GPUs" ) From 724f1051f0cef154999dbeb7c1ce468ff13a8da5 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Sun, 3 Jan 2021 23:13:22 +0100 Subject: [PATCH 060/136] update isort config (#5335) * update isort config * apply --- pl_examples/basic_examples/autoencoder.py | 4 ++-- pl_examples/basic_examples/backbone_image_classifier.py | 2 +- pl_examples/basic_examples/conv_sequential_example.py | 4 ++-- pl_examples/basic_examples/dali_image_classifier.py | 2 +- pl_examples/basic_examples/simple_image_classifier.py | 2 +- .../domain_templates/computer_vision_fine_tuning.py | 4 ++-- .../domain_templates/generative_adversarial_net.py | 6 +++--- pl_examples/domain_templates/imagenet.py | 4 ++-- pl_examples/domain_templates/reinforce_learn_Qnet.py | 2 +- pl_examples/domain_templates/semantic_segmentation.py | 8 ++++---- pyproject.toml | 2 +- pytorch_lightning/setup_tools.py | 2 +- tests/conftest.py | 4 ++-- tests/test_profiler.py | 2 +- 14 files changed, 24 insertions(+), 24 deletions(-) diff --git a/pl_examples/basic_examples/autoencoder.py b/pl_examples/basic_examples/autoencoder.py index 13564b8f7480c..e1b284856c3bc 100644 --- 
a/pl_examples/basic_examples/autoencoder.py +++ b/pl_examples/basic_examples/autoencoder.py @@ -15,12 +15,12 @@ from argparse import ArgumentParser import torch -from torch import nn import torch.nn.functional as F +from torch import nn from torch.utils.data import DataLoader, random_split -from pl_examples import cli_lightning_logo, TORCHVISION_AVAILABLE import pytorch_lightning as pl +from pl_examples import cli_lightning_logo, TORCHVISION_AVAILABLE if TORCHVISION_AVAILABLE: from torchvision import transforms diff --git a/pl_examples/basic_examples/backbone_image_classifier.py b/pl_examples/basic_examples/backbone_image_classifier.py index a98a27c591982..c4dd4fdc3a478 100644 --- a/pl_examples/basic_examples/backbone_image_classifier.py +++ b/pl_examples/basic_examples/backbone_image_classifier.py @@ -18,8 +18,8 @@ from torch.nn import functional as F from torch.utils.data import DataLoader, random_split -from pl_examples import cli_lightning_logo, DATASETS_PATH, TORCHVISION_AVAILABLE import pytorch_lightning as pl +from pl_examples import cli_lightning_logo, DATASETS_PATH, TORCHVISION_AVAILABLE if TORCHVISION_AVAILABLE: from torchvision import transforms diff --git a/pl_examples/basic_examples/conv_sequential_example.py b/pl_examples/basic_examples/conv_sequential_example.py index 2492b24287929..39634084860c2 100644 --- a/pl_examples/basic_examples/conv_sequential_example.py +++ b/pl_examples/basic_examples/conv_sequential_example.py @@ -20,16 +20,16 @@ To run: python conv_model_sequential_example.py --accelerator ddp --gpus 4 --max_epochs 1 --batch_size 256 --use_ddp_sequential """ -from argparse import ArgumentParser import math +from argparse import ArgumentParser import torch import torch.nn as nn import torch.nn.functional as F import torchvision -from pl_examples import cli_lightning_logo import pytorch_lightning as pl +from pl_examples import cli_lightning_logo from pytorch_lightning import Trainer from pytorch_lightning.metrics.functional import accuracy from pytorch_lightning.plugins.ddp_sequential_plugin import DDPSequentialPlugin diff --git a/pl_examples/basic_examples/dali_image_classifier.py b/pl_examples/basic_examples/dali_image_classifier.py index 84414b9eca560..cfa146911dd1b 100644 --- a/pl_examples/basic_examples/dali_image_classifier.py +++ b/pl_examples/basic_examples/dali_image_classifier.py @@ -22,8 +22,8 @@ from torch.nn import functional as F from torch.utils.data import random_split -from pl_examples import cli_lightning_logo, DALI_AVAILABLE, TORCHVISION_AVAILABLE import pytorch_lightning as pl +from pl_examples import cli_lightning_logo, DALI_AVAILABLE, TORCHVISION_AVAILABLE if TORCHVISION_AVAILABLE: from torchvision import transforms diff --git a/pl_examples/basic_examples/simple_image_classifier.py b/pl_examples/basic_examples/simple_image_classifier.py index 630ea73974e7f..894eeea619ba9 100644 --- a/pl_examples/basic_examples/simple_image_classifier.py +++ b/pl_examples/basic_examples/simple_image_classifier.py @@ -18,9 +18,9 @@ import torch from torch.nn import functional as F +import pytorch_lightning as pl from pl_examples import cli_lightning_logo from pl_examples.basic_examples.mnist_datamodule import MNISTDataModule -import pytorch_lightning as pl class LitClassifier(pl.LightningModule): diff --git a/pl_examples/domain_templates/computer_vision_fine_tuning.py b/pl_examples/domain_templates/computer_vision_fine_tuning.py index d643562a82400..8f7585d0f70c8 100644 --- a/pl_examples/domain_templates/computer_vision_fine_tuning.py +++ 
b/pl_examples/domain_templates/computer_vision_fine_tuning.py @@ -41,9 +41,9 @@ from typing import Generator, Optional, Union import torch +import torch.nn.functional as F from torch import optim from torch.nn import Module -import torch.nn.functional as F from torch.optim.lr_scheduler import MultiStepLR from torch.optim.optimizer import Optimizer from torch.utils.data import DataLoader @@ -51,8 +51,8 @@ from torchvision.datasets import ImageFolder from torchvision.datasets.utils import download_and_extract_archive -from pl_examples import cli_lightning_logo import pytorch_lightning as pl +from pl_examples import cli_lightning_logo from pytorch_lightning import _logger as log BN_TYPES = (torch.nn.BatchNorm1d, torch.nn.BatchNorm2d, torch.nn.BatchNorm3d) diff --git a/pl_examples/domain_templates/generative_adversarial_net.py b/pl_examples/domain_templates/generative_adversarial_net.py index 5227d4defcda3..35b5563d2c1cc 100644 --- a/pl_examples/domain_templates/generative_adversarial_net.py +++ b/pl_examples/domain_templates/generative_adversarial_net.py @@ -19,17 +19,17 @@ tensorboard --logdir default """ -from argparse import ArgumentParser, Namespace import os +from argparse import ArgumentParser, Namespace import numpy as np import torch import torch.nn as nn import torch.nn.functional as F # noqa -from torch.utils.data import DataLoader import torchvision -from torchvision.datasets import MNIST import torchvision.transforms as transforms +from torch.utils.data import DataLoader +from torchvision.datasets import MNIST from pl_examples import cli_lightning_logo from pytorch_lightning.core import LightningDataModule, LightningModule diff --git a/pl_examples/domain_templates/imagenet.py b/pl_examples/domain_templates/imagenet.py index d379b5d3e9a6c..cc36f3542a1c8 100644 --- a/pl_examples/domain_templates/imagenet.py +++ b/pl_examples/domain_templates/imagenet.py @@ -30,8 +30,8 @@ python imagenet.py --help """ -from argparse import ArgumentParser, Namespace import os +from argparse import ArgumentParser, Namespace import torch import torch.nn.functional as F @@ -44,8 +44,8 @@ import torchvision.models as models import torchvision.transforms as transforms -from pl_examples import cli_lightning_logo import pytorch_lightning as pl +from pl_examples import cli_lightning_logo from pytorch_lightning.core import LightningModule diff --git a/pl_examples/domain_templates/reinforce_learn_Qnet.py b/pl_examples/domain_templates/reinforce_learn_Qnet.py index 6a006326384b6..21583dad1f086 100644 --- a/pl_examples/domain_templates/reinforce_learn_Qnet.py +++ b/pl_examples/domain_templates/reinforce_learn_Qnet.py @@ -45,8 +45,8 @@ from torch.utils.data import DataLoader from torch.utils.data.dataset import IterableDataset -from pl_examples import cli_lightning_logo import pytorch_lightning as pl +from pl_examples import cli_lightning_logo class DQN(nn.Module): diff --git a/pl_examples/domain_templates/semantic_segmentation.py b/pl_examples/domain_templates/semantic_segmentation.py index 7b5131d51d1fc..2e718a37ac4b0 100644 --- a/pl_examples/domain_templates/semantic_segmentation.py +++ b/pl_examples/domain_templates/semantic_segmentation.py @@ -12,20 +12,20 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from argparse import ArgumentParser, Namespace import os import random +from argparse import ArgumentParser, Namespace import numpy as np -from PIL import Image import torch import torch.nn.functional as F -from torch.utils.data import DataLoader, Dataset import torchvision.transforms as transforms +from PIL import Image +from torch.utils.data import DataLoader, Dataset +import pytorch_lightning as pl from pl_examples import cli_lightning_logo from pl_examples.domain_templates.unet import UNet -import pytorch_lightning as pl from pytorch_lightning.loggers import WandbLogger DEFAULT_VOID_LABELS = (0, 1, 2, 3, 4, 5, 6, 9, 10, 14, 15, 16, 18, 29, 30, -1) diff --git a/pyproject.toml b/pyproject.toml index 58c92aff5cebf..d7d07b1526390 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,5 +51,5 @@ skip_glob = [ ] profile = "black" line_length = 120 -force_sort_within_sections = "True" +force_sort_within_sections = "False" order_by_type = "False" diff --git a/pytorch_lightning/setup_tools.py b/pytorch_lightning/setup_tools.py index de05acf408381..b49c90bd0b28c 100644 --- a/pytorch_lightning/setup_tools.py +++ b/pytorch_lightning/setup_tools.py @@ -14,10 +14,10 @@ # limitations under the License. import os import re +import warnings from typing import Iterable, List from urllib.error import HTTPError, URLError from urllib.request import Request, urlopen -import warnings from pytorch_lightning import __homepage__, __version__, PROJECT_ROOT diff --git a/tests/conftest.py b/tests/conftest.py index c6a14a99b2478..07188fed4dbed 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -from functools import partial, wraps -from http.server import SimpleHTTPRequestHandler import sys import threading +from functools import partial, wraps +from http.server import SimpleHTTPRequestHandler import pytest import torch.multiprocessing as mp diff --git a/tests/test_profiler.py b/tests/test_profiler.py index 91a8631a73287..4728b11582dfc 100644 --- a/tests/test_profiler.py +++ b/tests/test_profiler.py @@ -13,8 +13,8 @@ # limitations under the License. import os -from pathlib import Path import time +from pathlib import Path import numpy as np import pytest From 51af3957fc7f18a4ce101c6da2a8177a36217d74 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Mon, 4 Jan 2021 08:38:46 +0100 Subject: [PATCH 061/136] uniques docs artefact name (#5336) --- .github/workflows/docs-checks.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docs-checks.yml b/.github/workflows/docs-checks.yml index 2f91a4f5d43c8..3f6b35ba7b7cb 100644 --- a/.github/workflows/docs-checks.yml +++ b/.github/workflows/docs-checks.yml @@ -105,7 +105,7 @@ jobs: - name: Upload built docs uses: actions/upload-artifact@v2 with: - name: docs-results-${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.requires }} + name: docs-results-${{ github.sha }} path: docs/build/html/ # Use always() to always run this step to publish test results when there are test failures if: success() From 17a0784c5e9789ec06e720f5f4a06a80756e73fa Mon Sep 17 00:00:00 2001 From: "J. 
Sebastian Paez" Date: Mon, 4 Jan 2021 07:20:04 -0600 Subject: [PATCH 062/136] black formatting and migrated to self.log logging in finetuning example (#5229) * black formatting and migrated to self.log logging * Apply suggestions from code review Co-authored-by: Rohit Gupta * migrated to accuracy in the metrics package migrated to accuracy in the metrics package * removed trailing whitespace * Apply suggestions from code review Co-authored-by: Jirka Borovec Co-authored-by: Rohit Gupta Co-authored-by: Nicki Skafte --- .../computer_vision_fine_tuning.py | 324 ++++++++---------- 1 file changed, 149 insertions(+), 175 deletions(-) diff --git a/pl_examples/domain_templates/computer_vision_fine_tuning.py b/pl_examples/domain_templates/computer_vision_fine_tuning.py index 8f7585d0f70c8..733fd8646142e 100644 --- a/pl_examples/domain_templates/computer_vision_fine_tuning.py +++ b/pl_examples/domain_templates/computer_vision_fine_tuning.py @@ -17,18 +17,23 @@ network (by default, a ResNet50 is used) using pytorch-lightning. For the sake of this example, the 'cats and dogs dataset' (~60MB, see `DATA_URL` below) and the proposed network (denoted by `TransferLearningModel`, see below) is -trained for 15 epochs. The training consists in three stages. From epoch 0 to -4, the feature extractor (the pre-trained network) is frozen except maybe for -the BatchNorm layers (depending on whether `train_bn = True`). The BatchNorm -layers (if `train_bn = True`) and the parameters of the classifier are trained -as a single parameters group with lr = 1e-2. From epoch 5 to 9, the last two -layer groups of the pre-trained network are unfrozen and added to the -optimizer as a new parameter group with lr = 1e-4 (while lr = 1e-3 for the -first parameter group in the optimizer). Eventually, from epoch 10, all the -remaining layer groups of the pre-trained network are unfrozen and added to -the optimizer as a third parameter group. From epoch 10, the parameters of the -pre-trained network are trained with lr = 1e-5 while those of the classifier -are trained with lr = 1e-4. +trained for 15 epochs. + +The training consists of three stages. + +From epoch 0 to 4, the feature extractor (the pre-trained network) is frozen except +maybe for the BatchNorm layers (depending on whether `train_bn = True`). The BatchNorm +layers (if `train_bn = True`) and the parameters of the classifier are trained as a +single parameters group with lr = 1e-2. + +From epoch 5 to 9, the last two layer groups of the pre-trained network are unfrozen +and added to the optimizer as a new parameter group with lr = 1e-4 (while lr = 1e-3 +for the first parameter group in the optimizer). + +Eventually, from epoch 10, all the remaining layer groups of the pre-trained network +are unfrozen and added to the optimizer as a third parameter group. From epoch 10, +the parameters of the pre-trained network are trained with lr = 1e-5 while those of +the classifier is trained with lr = 1e-4. 
Note: See: https://pytorch.org/tutorials/beginner/transfer_learning_tutorial.html @@ -56,7 +61,7 @@ from pytorch_lightning import _logger as log BN_TYPES = (torch.nn.BatchNorm1d, torch.nn.BatchNorm2d, torch.nn.BatchNorm3d) -DATA_URL = 'https://storage.googleapis.com/mledu-datasets/cats_and_dogs_filtered.zip' +DATA_URL = "https://storage.googleapis.com/mledu-datasets/cats_and_dogs_filtered.zip" # --- Utility functions --- @@ -73,8 +78,7 @@ def _make_trainable(module: Module) -> None: module.train() -def _recursive_freeze(module: Module, - train_bn: bool = True) -> None: +def _recursive_freeze(module: Module, train_bn: bool = True) -> None: """Freezes the layers of a given module. Args: @@ -95,9 +99,7 @@ def _recursive_freeze(module: Module, _recursive_freeze(module=child, train_bn=train_bn) -def freeze(module: Module, - n: Optional[int] = None, - train_bn: bool = True) -> None: +def freeze(module: Module, n: Optional[int] = None, train_bn: bool = True) -> None: """Freezes the layers up to index n (if n is not None). Args: @@ -116,8 +118,7 @@ def freeze(module: Module, _make_trainable(module=child) -def filter_params(module: Module, - train_bn: bool = True) -> Generator: +def filter_params(module: Module, train_bn: bool = True) -> Generator: """Yields the trainable parameters of a given module. Args: @@ -139,17 +140,18 @@ def filter_params(module: Module, yield param -def _unfreeze_and_add_param_group(module: Module, - optimizer: Optimizer, - lr: Optional[float] = None, - train_bn: bool = True): +def _unfreeze_and_add_param_group( + module: Module, optimizer: Optimizer, lr: Optional[float] = None, train_bn: bool = True +): """Unfreezes a module and adds its parameters to an optimizer.""" _make_trainable(module) - params_lr = optimizer.param_groups[0]['lr'] if lr is None else float(lr) + params_lr = optimizer.param_groups[0]["lr"] if lr is None else float(lr) optimizer.add_param_group( - {'params': filter_params(module=module, train_bn=train_bn), - 'lr': params_lr / 10., - }) + { + "params": filter_params(module=module, train_bn=train_bn), + "lr": params_lr / 10.0, + } + ) # --- Pytorch-lightning module --- @@ -165,23 +167,24 @@ class TransferLearningModel(pl.LightningModule): (fc): Sequential(...) ) """ + def __init__( - self, - dl_path: Union[str, Path], - backbone: str = 'resnet50', - train_bn: bool = True, - milestones: tuple = (5, 10), - batch_size: int = 8, - lr: float = 1e-2, - lr_scheduler_gamma: float = 1e-1, - num_workers: int = 6, - **kwargs, + self, + dl_path: Union[str, Path], + backbone: str = "resnet50", + train_bn: bool = True, + milestones: tuple = (5, 10), + batch_size: int = 8, + lr: float = 1e-2, + lr_scheduler_gamma: float = 1e-1, + num_workers: int = 6, + **kwargs, ) -> None: """ Args: dl_path: Path where the data will be downloaded """ - super().__init__(**kwargs) + super().__init__() self.dl_path = dl_path self.backbone = backbone self.train_bn = train_bn @@ -194,6 +197,10 @@ def __init__( self.dl_path = dl_path self.__build_model() + self.train_acc = pl.metrics.Accuracy() + self.valid_acc = pl.metrics.Accuracy() + self.save_hyperparameters() + def __build_model(self): """Define model layers & loss.""" @@ -206,9 +213,7 @@ def __build_model(self): freeze(module=self.feature_extractor, train_bn=self.train_bn) # 2. Classifier: - _fc_layers = [torch.nn.Linear(2048, 256), - torch.nn.Linear(256, 32), - torch.nn.Linear(32, 1)] + _fc_layers = [torch.nn.Linear(2048, 256), torch.nn.Linear(256, 32), torch.nn.Linear(32, 1)] self.fc = torch.nn.Sequential(*_fc_layers) # 3. 
Loss: @@ -235,27 +240,24 @@ def train(self, mode=True): epoch = self.current_epoch if epoch < self.milestones[0] and mode: # feature extractor is frozen (except for BatchNorm layers) - freeze(module=self.feature_extractor, - train_bn=self.train_bn) + freeze(module=self.feature_extractor, train_bn=self.train_bn) elif self.milestones[0] <= epoch < self.milestones[1] and mode: # Unfreeze last two layers of the feature extractor - freeze(module=self.feature_extractor, - n=-2, - train_bn=self.train_bn) + freeze(module=self.feature_extractor, n=-2, train_bn=self.train_bn) def on_epoch_start(self): """Use `on_epoch_start` to unfreeze layers progressively.""" optimizer = self.trainer.optimizers[0] if self.current_epoch == self.milestones[0]: - _unfreeze_and_add_param_group(module=self.feature_extractor[-2:], - optimizer=optimizer, - train_bn=self.train_bn) + _unfreeze_and_add_param_group( + module=self.feature_extractor[-2:], optimizer=optimizer, train_bn=self.train_bn + ) elif self.current_epoch == self.milestones[1]: - _unfreeze_and_add_param_group(module=self.feature_extractor[:-2], - optimizer=optimizer, - train_bn=self.train_bn) + _unfreeze_and_add_param_group( + module=self.feature_extractor[:-2], optimizer=optimizer, train_bn=self.train_bn + ) def training_step(self, batch, batch_idx): @@ -263,32 +265,22 @@ def training_step(self, batch, batch_idx): x, y = batch y_logits = self.forward(x) y_true = y.view((-1, 1)).type_as(x) - y_bin = torch.ge(y_logits, 0) # 2. Compute loss & accuracy: train_loss = self.loss(y_logits, y_true) - num_correct = torch.eq(y_bin.view(-1), y_true.view(-1)).sum() + accuracy = self.train_acc(y_logits, y_true) # 3. Outputs: - tqdm_dict = {'train_loss': train_loss} - output = OrderedDict({'loss': train_loss, - 'num_correct': num_correct, - 'log': tqdm_dict, - 'progress_bar': tqdm_dict}) - - return output + tqdm_dict = {"train_loss": train_loss} + self.log_dict(tqdm_dict, prog_bar=True) + return {"loss": train_loss} def training_epoch_end(self, outputs): """Compute and log training loss and accuracy at the epoch level.""" - train_loss_mean = torch.stack([output['loss'] - for output in outputs]).mean() - train_acc_mean = torch.stack([output['num_correct'] - for output in outputs]).sum().float() - train_acc_mean /= (len(outputs) * self.batch_size) - return {'log': {'train_loss': train_loss_mean, - 'train_acc': train_acc_mean, - 'step': self.current_epoch}} + train_loss_mean = torch.stack([output["loss"] for output in outputs]).mean() + train_acc_mean = self.train_acc.compute() + self.log_dict({"train_loss": train_loss_mean, "train_acc": train_acc_mean, "step": self.current_epoch}) def validation_step(self, batch, batch_idx): @@ -296,142 +288,121 @@ def validation_step(self, batch, batch_idx): x, y = batch y_logits = self.forward(x) y_true = y.view((-1, 1)).type_as(x) - y_bin = torch.ge(y_logits, 0) # 2. 
Compute loss & accuracy: val_loss = self.loss(y_logits, y_true) - num_correct = torch.eq(y_bin.view(-1), y_true.view(-1)).sum() + accuracy = self.valid_acc(y_logits, y_true) - return {'val_loss': val_loss, - 'num_correct': num_correct} + return {"val_loss": val_loss} def validation_epoch_end(self, outputs): """Compute and log validation loss and accuracy at the epoch level.""" - val_loss_mean = torch.stack([output['val_loss'] - for output in outputs]).mean() - val_acc_mean = torch.stack([output['num_correct'] - for output in outputs]).sum().float() - val_acc_mean /= (len(outputs) * self.batch_size) - return {'log': {'val_loss': val_loss_mean, - 'val_acc': val_acc_mean, - 'step': self.current_epoch}} + val_loss_mean = torch.stack([output["val_loss"] for output in outputs]).mean() + train_acc_mean = self.valid_acc.compute() + log_dict = {"val_loss": val_loss_mean, "val_acc": train_acc_mean} + self.log_dict(log_dict, prog_bar=True) + self.log_dict({"step": self.current_epoch}) def configure_optimizers(self): - optimizer = optim.Adam(filter(lambda p: p.requires_grad, - self.parameters()), - lr=self.lr) + optimizer = optim.Adam(filter(lambda p: p.requires_grad, self.parameters()), lr=self.lr) - scheduler = MultiStepLR(optimizer, - milestones=self.milestones, - gamma=self.lr_scheduler_gamma) + scheduler = MultiStepLR(optimizer, milestones=self.milestones, gamma=self.lr_scheduler_gamma) return [optimizer], [scheduler] def prepare_data(self): """Download images and prepare images datasets.""" - download_and_extract_archive(url=DATA_URL, - download_root=self.dl_path, - remove_finished=True) + download_and_extract_archive(url=DATA_URL, download_root=self.dl_path, remove_finished=True) def setup(self, stage: str): - data_path = Path(self.dl_path).joinpath('cats_and_dogs_filtered') + data_path = Path(self.dl_path).joinpath("cats_and_dogs_filtered") # 2. 
Load the data + preprocessing & data augmentation - normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], - std=[0.229, 0.224, 0.225]) - - train_dataset = ImageFolder(root=data_path.joinpath('train'), - transform=transforms.Compose([ - transforms.Resize((224, 224)), - transforms.RandomHorizontalFlip(), - transforms.ToTensor(), - normalize, - ])) - - valid_dataset = ImageFolder(root=data_path.joinpath('validation'), - transform=transforms.Compose([ - transforms.Resize((224, 224)), - transforms.ToTensor(), - normalize, - ])) + normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + + train_dataset = ImageFolder( + root=data_path.joinpath("train"), + transform=transforms.Compose( + [ + transforms.Resize((224, 224)), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + normalize, + ] + ), + ) + + valid_dataset = ImageFolder( + root=data_path.joinpath("validation"), + transform=transforms.Compose( + [ + transforms.Resize((224, 224)), + transforms.ToTensor(), + normalize, + ] + ), + ) self.train_dataset = train_dataset self.valid_dataset = valid_dataset - def __dataloader(self, train): + def __dataloader(self, train: bool): """Train/validation loaders.""" _dataset = self.train_dataset if train else self.valid_dataset - loader = DataLoader(dataset=_dataset, - batch_size=self.batch_size, - num_workers=self.num_workers, - shuffle=True if train else False) + loader = DataLoader(dataset=_dataset, batch_size=self.batch_size, num_workers=self.num_workers, shuffle=train) return loader def train_dataloader(self): - log.info('Training data loaded.') + log.info("Training data loaded.") return self.__dataloader(train=True) def val_dataloader(self): - log.info('Validation data loaded.') + log.info("Validation data loaded.") return self.__dataloader(train=False) @staticmethod def add_model_specific_args(parent_parser): parser = argparse.ArgumentParser(parents=[parent_parser]) - parser.add_argument('--backbone', - default='resnet50', - type=str, - metavar='BK', - help='Name (as in ``torchvision.models``) of the feature extractor') - parser.add_argument('--epochs', - default=15, - type=int, - metavar='N', - help='total number of epochs', - dest='nb_epochs') - parser.add_argument('--batch-size', - default=8, - type=int, - metavar='B', - help='batch size', - dest='batch_size') - parser.add_argument('--gpus', - type=int, - default=1, - help='number of gpus to use') - parser.add_argument('--lr', - '--learning-rate', - default=1e-2, - type=float, - metavar='LR', - help='initial learning rate', - dest='lr') - parser.add_argument('--lr-scheduler-gamma', - default=1e-1, - type=float, - metavar='LRG', - help='Factor by which the learning rate is reduced at each milestone', - dest='lr_scheduler_gamma') - parser.add_argument('--num-workers', - default=6, - type=int, - metavar='W', - help='number of CPU workers', - dest='num_workers') - parser.add_argument('--train-bn', - default=True, - type=bool, - metavar='TB', - help='Whether the BatchNorm layers should be trainable', - dest='train_bn') - parser.add_argument('--milestones', - default=[5, 10], - type=list, - metavar='M', - help='List of two epochs milestones') + parser.add_argument( + "--backbone", + default="resnet50", + type=str, + metavar="BK", + help="Name (as in ``torchvision.models``) of the feature extractor", + ) + parser.add_argument( + "--epochs", default=15, type=int, metavar="N", help="total number of epochs", dest="nb_epochs" + ) + parser.add_argument("--batch-size", default=8, type=int, metavar="B", 
help="batch size", dest="batch_size") + parser.add_argument("--gpus", type=int, default=1, help="number of gpus to use") + parser.add_argument( + "--lr", "--learning-rate", default=1e-2, type=float, metavar="LR", help="initial learning rate", dest="lr" + ) + parser.add_argument( + "--lr-scheduler-gamma", + default=1e-1, + type=float, + metavar="LRG", + help="Factor by which the learning rate is reduced at each milestone", + dest="lr_scheduler_gamma", + ) + parser.add_argument( + "--num-workers", default=6, type=int, metavar="W", help="number of CPU workers", dest="num_workers" + ) + parser.add_argument( + "--train-bn", + default=True, + type=bool, + metavar="TB", + help="Whether the BatchNorm layers should be trainable", + dest="train_bn", + ) + parser.add_argument( + "--milestones", default=[5, 10], type=list, metavar="M", help="List of two epochs milestones" + ) return parser @@ -456,23 +427,26 @@ def main(args: argparse.Namespace) -> None: num_sanity_val_steps=0, gpus=args.gpus, min_epochs=args.nb_epochs, - max_epochs=args.nb_epochs) + max_epochs=args.nb_epochs, + ) trainer.fit(model) def get_args() -> argparse.Namespace: parent_parser = argparse.ArgumentParser(add_help=False) - parent_parser.add_argument('--root-data-path', - metavar='DIR', - type=str, - default=Path.cwd().as_posix(), - help='Root directory where to download the data', - dest='root_data_path') + parent_parser.add_argument( + "--root-data-path", + metavar="DIR", + type=str, + default=Path.cwd().as_posix(), + help="Root directory where to download the data", + dest="root_data_path", + ) parser = TransferLearningModel.add_model_specific_args(parent_parser) return parser.parse_args() -if __name__ == '__main__': +if __name__ == "__main__": cli_lightning_logo() main(get_args()) From 0e593fb6a8b6abadb61f7cb6754b79b2117d7f0f Mon Sep 17 00:00:00 2001 From: Sejin Kim <40668167+skim2257@users.noreply.github.com> Date: Mon, 4 Jan 2021 09:05:24 -0500 Subject: [PATCH 063/136] Reordered sections for intuitive browsing. (e.g. limit_train_batches was at the end of the page, far from limit_test/val_batches) (#5283) Co-authored-by: Jirka Borovec --- docs/source/trainer.rst | 285 ++++++++++++++++++++-------------------- 1 file changed, 141 insertions(+), 144 deletions(-) diff --git a/docs/source/trainer.rst b/docs/source/trainer.rst index 634a0c5d3d9dc..36e895ec904a3 100644 --- a/docs/source/trainer.rst +++ b/docs/source/trainer.rst @@ -670,6 +670,27 @@ Under the hood the pseudocode looks like this when running *fast_dev_run* with a used only for debugging purposes. ``limit_train/val/test_batches`` only limits the number of batches and won't disable anything. +flush_logs_every_n_steps +^^^^^^^^^^^^^^^^^^^^^^^^ + +.. raw:: html + + + +| + +Writes logs to disk this often. + +.. testcode:: + + # default used by the Trainer + trainer = Trainer(flush_logs_every_n_steps=100) + +See Also: + - :ref:`logging` + gpus ^^^^ @@ -736,6 +757,35 @@ Gradient clipping value # default used by the Trainer trainer = Trainer(gradient_clip_val=0.0) +limit_train_batches +^^^^^^^^^^^^^^^^^^^ + +.. raw:: html + + + +| + +How much of training dataset to check. +Useful when debugging or testing something that happens at the end of an epoch. + +.. 
testcode:: + + # default used by the Trainer + trainer = Trainer(limit_train_batches=1.0) + +Example:: + + # default used by the Trainer + trainer = Trainer(limit_train_batches=1.0) + + # run through only 25% of the training set each epoch + trainer = Trainer(limit_train_batches=0.25) + + # run through only 10 batches of the training set each epoch + trainer = Trainer(limit_train_batches=10) limit_test_batches ^^^^^^^^^^^^^^^^^^ @@ -790,6 +840,28 @@ Useful when debugging or testing something that happens at the end of an epoch. In the case of multiple validation dataloaders, the limit applies to each dataloader individually. +log_every_n_steps +^^^^^^^^^^^^^^^^^ + +.. raw:: html + + + +| + + +How often to add logging rows (does not write to disk) + +.. testcode:: + + # default used by the Trainer + trainer = Trainer(log_every_n_steps=50) + +See Also: + - :ref:`logging` + log_gpu_memory ^^^^^^^^^^^^^^ @@ -820,27 +892,6 @@ Options: .. note:: Might slow performance because it uses the output of ``nvidia-smi``. -flush_logs_every_n_steps -^^^^^^^^^^^^^^^^^^^^^^^^ - -.. raw:: html - - - -| - -Writes logs to disk this often. - -.. testcode:: - - # default used by the Trainer - trainer = Trainer(flush_logs_every_n_steps=100) - -See Also: - - :ref:`logging` - logger ^^^^^^ @@ -1019,6 +1070,32 @@ The Trainer uses 2 steps by default. Turn it off or modify it here. This option will reset the validation dataloader unless ``num_sanity_val_steps=0``. +overfit_batches +^^^^^^^^^^^^^^^ + +.. raw:: html + + + +| + +Uses this much data of the training set. If nonzero, will use the same training set for validation and testing. +If the training dataloaders have `shuffle=True`, Lightning will automatically disable it. + +Useful for quickly debugging or trying to overfit on purpose. + +.. testcode:: + + # default used by the Trainer + trainer = Trainer(overfit_batches=0.0) + + # use only 1% of the train set (and use the train set for val and test) + trainer = Trainer(overfit_batches=0.01) + + # overfit on 10 of the same batches + trainer = Trainer(overfit_batches=10) plugins ^^^^^^^ @@ -1079,91 +1156,6 @@ If False will only call from NODE_RANK=0, LOCAL_RANK=0 # use only NODE_RANK=0, LOCAL_RANK=0 Trainer(prepare_data_per_node=False) -tpu_cores -^^^^^^^^^ - -.. raw:: html - - - -| - -- How many TPU cores to train on (1 or 8). -- Which TPU core to train on [1-8] - -A single TPU v2 or v3 has 8 cores. A TPU pod has -up to 2048 cores. A slice of a POD means you get as many cores -as you request. - -Your effective batch size is batch_size * total tpu cores. - -.. note:: - No need to add a :class:`~torch.utils.data.distributed.DistributedSampler`, - Lightning automatically does it for you. - -This parameter can be either 1 or 8. - -Example:: - - # your_trainer_file.py - - # default used by the Trainer (ie: train on CPU) - trainer = Trainer(tpu_cores=None) - - # int: train on a single core - trainer = Trainer(tpu_cores=1) - - # list: train on a single selected core - trainer = Trainer(tpu_cores=[2]) - - # int: train on all cores few cores - trainer = Trainer(tpu_cores=8) - - # for 8+ cores must submit via xla script with - # a max of 8 cores specified. The XLA script - # will duplicate script onto each TPU in the POD - trainer = Trainer(tpu_cores=8) - -To train on more than 8 cores (ie: a POD), -submit this script using the xla_dist script. 
- -Example:: - - python -m torch_xla.distributed.xla_dist - --tpu=$TPU_POD_NAME - --conda-env=torch-xla-nightly - --env=XLA_USE_BF16=1 - -- python your_trainer_file.py - -overfit_batches -^^^^^^^^^^^^^^^ - -.. raw:: html - - - -| - -Uses this much data of the training set. If nonzero, will use the same training set for validation and testing. -If the training dataloaders have `shuffle=True`, Lightning will automatically disable it. - -Useful for quickly debugging or trying to overfit on purpose. - -.. testcode:: - - # default used by the Trainer - trainer = Trainer(overfit_batches=0.0) - - # use only 1% of the train set (and use the train set for val and test) - trainer = Trainer(overfit_batches=0.01) - - # overfit on 10 of the same batches - trainer = Trainer(overfit_batches=10) - precision ^^^^^^^^^ @@ -1346,29 +1338,6 @@ To resume training from a specific checkpoint pass in the path here. # resume from a specific checkpoint trainer = Trainer(resume_from_checkpoint='some/path/to/my_checkpoint.ckpt') -log_every_n_steps -^^^^^^^^^^^^^^^^^ - -.. raw:: html - - - -| - - -How often to add logging rows (does not write to disk) - -.. testcode:: - - # default used by the Trainer - trainer = Trainer(log_every_n_steps=50) - -See Also: - - :ref:`logging` - - sync_batchnorm ^^^^^^^^^^^^^^ @@ -1408,35 +1377,63 @@ track_grad_norm # track the 2-norm trainer = Trainer(track_grad_norm=2) -limit_train_batches -^^^^^^^^^^^^^^^^^^^ +tpu_cores +^^^^^^^^^ .. raw:: html + poster="https://pl-bolts-doc-images.s3.us-east-2.amazonaws.com/pl_docs/trainer_flags/thumb/tpu_cores.jpg" + src="https://pl-bolts-doc-images.s3.us-east-2.amazonaws.com/pl_docs/trainer_flags/tpu_cores.mp4"> | -How much of training dataset to check. -Useful when debugging or testing something that happens at the end of an epoch. +- How many TPU cores to train on (1 or 8). +- Which TPU core to train on [1-8] -.. testcode:: +A single TPU v2 or v3 has 8 cores. A TPU pod has +up to 2048 cores. A slice of a POD means you get as many cores +as you request. - # default used by the Trainer - trainer = Trainer(limit_train_batches=1.0) +Your effective batch size is batch_size * total tpu cores. + +.. note:: + No need to add a :class:`~torch.utils.data.distributed.DistributedSampler`, + Lightning automatically does it for you. + +This parameter can be either 1 or 8. Example:: - # default used by the Trainer - trainer = Trainer(limit_train_batches=1.0) + # your_trainer_file.py - # run through only 25% of the training set each epoch - trainer = Trainer(limit_train_batches=0.25) + # default used by the Trainer (ie: train on CPU) + trainer = Trainer(tpu_cores=None) - # run through only 10 batches of the training set each epoch - trainer = Trainer(limit_train_batches=10) + # int: train on a single core + trainer = Trainer(tpu_cores=1) + + # list: train on a single selected core + trainer = Trainer(tpu_cores=[2]) + + # int: train on all cores few cores + trainer = Trainer(tpu_cores=8) + + # for 8+ cores must submit via xla script with + # a max of 8 cores specified. The XLA script + # will duplicate script onto each TPU in the POD + trainer = Trainer(tpu_cores=8) + +To train on more than 8 cores (ie: a POD), +submit this script using the xla_dist script. 
+ +Example:: + + python -m torch_xla.distributed.xla_dist + --tpu=$TPU_POD_NAME + --conda-env=torch-xla-nightly + --env=XLA_USE_BF16=1 + -- python your_trainer_file.py truncated_bptt_steps ^^^^^^^^^^^^^^^^^^^^ From 15a400b95faf45124f28fd8ca4ef7520d47df88a Mon Sep 17 00:00:00 2001 From: Pavel Kulikov Date: Mon, 4 Jan 2021 18:40:01 +0300 Subject: [PATCH 064/136] docs: logits -> probs in Accuracy metric documentation (#5340) * fix: logits -> probs in accuracy metrics documentation * Update metrics.rst * Update metrics.rst Co-authored-by: Jirka Borovec --- docs/source/metrics.rst | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/docs/source/metrics.rst b/docs/source/metrics.rst index d6d9cb8fb0ae7..67640debb665a 100644 --- a/docs/source/metrics.rst +++ b/docs/source/metrics.rst @@ -33,10 +33,11 @@ The example below shows how to use a metric in your ``LightningModule``: self.accuracy = pl.metrics.Accuracy() def training_step(self, batch, batch_idx): - logits = self(x) + x, y = batch + preds = self(x) ... # log step metric - self.log('train_acc_step', self.accuracy(logits, y)) + self.log('train_acc_step', self.accuracy(preds, y)) ... def training_epoch_end(self, outs): @@ -67,9 +68,10 @@ If ``on_epoch`` is True, the logger automatically logs the end of epoch metric v self.valid_acc = pl.metrics.Accuracy() def training_step(self, batch, batch_idx): - logits = self(x) + x, y = batch + preds = self(x) ... - self.train_acc(logits, y) + self.train_acc(preds, y) self.log('train_acc', self.train_acc, on_step=True, on_epoch=False) def validation_step(self, batch, batch_idx): @@ -88,7 +90,7 @@ If ``on_epoch`` is True, the logger automatically logs the end of epoch metric v def training_step(self, batch, batch_idx): data, target = batch - pred = self(data) + preds = self(data) ... return {'loss' : loss, 'preds' : preds, 'target' : target} From dd442b6d335a5553961e7904cd9454b738cb72cd Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Tue, 5 Jan 2021 01:33:14 +0530 Subject: [PATCH 065/136] [Docs] update docs for resume_from_checkpoint (#5164) * update docs and add pathlib support * fix --- docs/source/trainer.rst | 3 ++- .../trainer/connectors/checkpoint_connector.py | 13 +++++++------ pytorch_lightning/trainer/trainer.py | 6 ++++-- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/docs/source/trainer.rst b/docs/source/trainer.rst index 36e895ec904a3..3dfe120cde959 100644 --- a/docs/source/trainer.rst +++ b/docs/source/trainer.rst @@ -1328,7 +1328,8 @@ resume_from_checkpoint | -To resume training from a specific checkpoint pass in the path here. +To resume training from a specific checkpoint pass in the path here. If resuming from a mid-epoch +checkpoint, training will start from the beginning of the next epoch. .. 
testcode:: diff --git a/pytorch_lightning/trainer/connectors/checkpoint_connector.py b/pytorch_lightning/trainer/connectors/checkpoint_connector.py index 429bddd88b77e..e912462d2491b 100644 --- a/pytorch_lightning/trainer/connectors/checkpoint_connector.py +++ b/pytorch_lightning/trainer/connectors/checkpoint_connector.py @@ -15,18 +15,18 @@ import os from pathlib import Path import re -from typing import Union, Optional +from typing import Optional, Union import torch import pytorch_lightning from pytorch_lightning import _logger as log from pytorch_lightning.core.lightning import LightningModule -from pytorch_lightning.utilities import APEX_AVAILABLE, AMPType, OMEGACONF_AVAILABLE, rank_zero_info, rank_zero_warn +from pytorch_lightning.utilities import AMPType, APEX_AVAILABLE, OMEGACONF_AVAILABLE, rank_zero_info, rank_zero_warn from pytorch_lightning.utilities.cloud_io import atomic_save, get_filesystem from pytorch_lightning.utilities.cloud_io import load as pl_load -from pytorch_lightning.utilities.upgrade_checkpoint import KEYS_MAPPING as DEPRECATED_CHECKPOINT_KEYS from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.utilities.upgrade_checkpoint import KEYS_MAPPING as DEPRECATED_CHECKPOINT_KEYS if APEX_AVAILABLE: from apex import amp @@ -157,9 +157,10 @@ def restore_training_state(self, checkpoint): expected_steps = self.trainer.num_training_batches / n_accum if self.trainer.num_training_batches != 0 and self.trainer.global_step % expected_steps > 1: rank_zero_warn( - "You're resuming from a checkpoint that ended mid-epoch. " - "This can cause unreliable results if further training is done, " - "consider using an end of epoch checkpoint. " + "You're resuming from a checkpoint that ended mid-epoch." + " Training will start from the beginning of the next epoch." + " This can cause unreliable results if further training is done," + " consider using an end of epoch checkpoint." ) # restore the optimizers diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index c66cc3a43d0b1..25dffa52dcdab 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -15,6 +15,7 @@ """Trainer to automate the training.""" import os +from pathlib import Path from typing import Dict, Iterable, List, Optional, Union import warnings @@ -117,7 +118,7 @@ def __init__( weights_save_path: Optional[str] = None, num_sanity_val_steps: int = 2, truncated_bptt_steps: Optional[int] = None, - resume_from_checkpoint: Optional[str] = None, + resume_from_checkpoint: Optional[Union[Path, str]] = None, profiler: Optional[Union[BaseProfiler, bool, str]] = None, benchmark: bool = False, deterministic: bool = False, @@ -251,7 +252,8 @@ def __init__( you can set ``replace_sampler_ddp=False`` and add your own distributed sampler. resume_from_checkpoint: To resume training from a specific checkpoint pass in the path here. - This can be a URL. + This can be a URL. If resuming from mid-epoch checkpoint, training will start from + the beginning of the next epoch. sync_batchnorm: Synchronize batch norm layers between process groups/whole world. 
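Patch 065 above only touches documentation and type hints, but the behaviour it describes is easy to miss when skimming the diff: ``resume_from_checkpoint`` accepts a ``pathlib.Path`` as well as a string/URL, and a checkpoint written mid-epoch resumes training at the beginning of the next epoch. A minimal sketch of that usage follows; ``TinyModel``, the random dataset and the checkpoint path are illustrative stand-ins and are not part of the patch:

    from pathlib import Path

    import torch
    from torch.utils.data import DataLoader, TensorDataset

    from pytorch_lightning import LightningModule, Trainer


    class TinyModel(LightningModule):
        """Throwaway module, only here to make the example runnable."""

        def __init__(self):
            super().__init__()
            self.layer = torch.nn.Linear(32, 2)

        def training_step(self, batch, batch_idx):
            x = batch[0]
            return self.layer(x).sum()

        def configure_optimizers(self):
            return torch.optim.SGD(self.parameters(), lr=0.1)

        def train_dataloader(self):
            return DataLoader(TensorDataset(torch.randn(64, 32)), batch_size=8)


    # assumed to exist from a previous run; a plain str or URL works as well
    ckpt = Path("lightning_logs/version_0/checkpoints/last.ckpt")

    # if `ckpt` was saved mid-epoch, training restarts at the next epoch boundary
    trainer = Trainer(max_epochs=3, resume_from_checkpoint=ckpt)
    trainer.fit(TinyModel())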
From b0051e8c036fa3312ad4d37aa7141bea64ac6148 Mon Sep 17 00:00:00 2001 From: tarepan Date: Tue, 5 Jan 2021 09:52:35 +0900 Subject: [PATCH 066/136] Add non-existing resume_from_checkpoint acceptance for auto-resubmit (#4402) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add empty resume_from_checkpoint acceptance #4366 * Fix general error catch with focused file check * Add fsspec HTTP extras Add fsspec's HTTPFileSystem support through http extras. pl has supported remote http file (e.g. #2925), so this commit do not add new functionality. * Fix potential too much logging in DDP * Add PR changelog * Add well-written argument explanation Co-authored-by: Adrian Wälchli * Fix DDP-compatible restore logging Notify from where the states are restored. This feature temporally deleted as a result of PR review. With succeeding review, added with DDP compatibility. * Fix utility import pathes * Refactor load step commentaries * Refactor hpc ckpt suffix acquisition * Refactor restore/hpc_load match * Refactor hpc load trial * Refactor checkpoint dir check * Refactor unneeded function nest * Refactor nested If * Refactor duplicated cache clear * Refactor attempt flow with if/elif * Fix pip8 * Refactor hook commentary Co-authored-by: chaton * Fix pep8 * Refactor hpc load checkpoint path acquisition * Fix pip8 * Fix typo Co-authored-by: Adrian Wälchli * Fix typo Co-authored-by: Adrian Wälchli * Fix doc Co-authored-by: Adrian Wälchli * Refactor None Union type with Optional * Fix build-doc CI failure debuged in #5329 * Fix fsspec import during build-doc #5329 * Fix test epoch Co-authored-by: Adrian Wälchli * Fix test with latest test models * . Co-authored-by: Adrian Wälchli Co-authored-by: chaton Co-authored-by: Jirka Borovec Co-authored-by: Sean Naren Co-authored-by: Roger Shieh --- CHANGELOG.md | 2 ++ docs/source/conf.py | 6 +++++- environment.yml | 2 +- .../connectors/checkpoint_connector.py | 12 +++++++++-- pytorch_lightning/trainer/trainer.py | 6 +++--- requirements.txt | 2 +- requirements/docs.txt | 2 +- tests/models/test_restore.py | 21 ++++++++++++++++++- 8 files changed, 43 insertions(+), 10 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5b9b705459510..68941743ed00e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
### Added +- Added `resume_from_checkpoint` accept non-existing file path ([#4402](https://github.com/PyTorchLightning/pytorch-lightning/pull/4402)) + ### Changed diff --git a/docs/source/conf.py b/docs/source/conf.py index 655e8dba30a36..2b861623599a6 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -294,10 +294,14 @@ def setup(app): # Ignoring Third-party packages # https://stackoverflow.com/questions/15889621/sphinx-how-to-exclude-imports-in-automodule def package_list_from_file(file): + """List up package name (not containing version and extras) from a package list file + """ mocked_packages = [] with open(file, 'r') as fp: for ln in fp.readlines(): - found = [ln.index(ch) for ch in list(',=<>#') if ch in ln] + # Example: `tqdm>=4.41.0` => `tqdm` + # `[` is for package with extras + found = [ln.index(ch) for ch in list(',=<>#[') if ch in ln] pkg = ln[:min(found)] if found else ln if pkg.rstrip(): mocked_packages.append(pkg.rstrip()) diff --git a/environment.yml b/environment.yml index 3d59c1eeed0dd..1278f15f718e9 100644 --- a/environment.yml +++ b/environment.yml @@ -30,7 +30,7 @@ dependencies: - future>=0.17.1 - PyYAML>=5.1 - tqdm>=4.41.0 - - fsspec>=0.8.0 + - fsspec[http]>=0.8.1 #- tensorboard>=2.2.0 # not needed, already included in pytorch # Optional diff --git a/pytorch_lightning/trainer/connectors/checkpoint_connector.py b/pytorch_lightning/trainer/connectors/checkpoint_connector.py index e912462d2491b..c71cbe6ce6180 100644 --- a/pytorch_lightning/trainer/connectors/checkpoint_connector.py +++ b/pytorch_lightning/trainer/connectors/checkpoint_connector.py @@ -43,7 +43,7 @@ def __init__(self, trainer): # used to validate checkpointing logic self.has_trained = False - def restore_weights(self, model: LightningModule): + def restore_weights(self, model: LightningModule) -> None: """ Attempt to restore a checkpoint (e.g. weights) in this priority: 1. from HPC weights @@ -73,11 +73,16 @@ def restore_weights(self, model: LightningModule): if self.trainer.on_gpu: torch.cuda.empty_cache() - def restore(self, checkpoint_path: str, on_gpu: bool): + def restore(self, checkpoint_path: str, on_gpu: bool) -> bool: """ Load model/training states from a 'PyTorch-Lightning checkpoint' file through file-read and state-restore. All restored states are listed in return value description of `dump_checkpoint`. """ + # Try to read the checkpoint file at `checkpoint_path`. If not exist, do not restore checkpoint. + fs = get_filesystem(checkpoint_path) + if not fs.exists(checkpoint_path): + rank_zero_warn("No checkpoint file exists at `resume_from_checkpoint`. Start from scratch") + return False # read a checkpoint dictionary object from the 'PyTorch-Lightning checkpoint' file at `checkpoint_path` checkpoint = pl_load(checkpoint_path, map_location=lambda storage, loc: storage) @@ -94,6 +99,9 @@ def restore(self, checkpoint_path: str, on_gpu: bool): # restore training state self.restore_training_state(checkpoint) + rank_zero_info(f"Restored states from the checkpoint file at {checkpoint_path}") + return True + def restore_model_state(self, model: LightningModule, checkpoint) -> None: """ Restore model states from a 'PyTorch-Lightning checkpoint' dictionary object diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 25dffa52dcdab..f2e943d2783af 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -251,9 +251,9 @@ def __init__( train sampler and ``shuffle=False`` for val/test sampler. 
If you want to customize it, you can set ``replace_sampler_ddp=False`` and add your own distributed sampler. - resume_from_checkpoint: To resume training from a specific checkpoint pass in the path here. - This can be a URL. If resuming from mid-epoch checkpoint, training will start from - the beginning of the next epoch. + resume_from_checkpoint: Path/URL of the checkpoint from which training is resumed. If there is + no checkpoint file at the path, start from scratch. If resuming from mid-epoch checkpoint, + training will start from the beginning of the next epoch. sync_batchnorm: Synchronize batch norm layers between process groups/whole world. diff --git a/requirements.txt b/requirements.txt index 4b8a3efb5c841..2dd5378649851 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,5 +6,5 @@ future>=0.17.1 # required for builtins in setup.py # pyyaml>=3.13 PyYAML>=5.1 # OmegaConf requirement >=5.1 tqdm>=4.41.0 -fsspec>=0.8.0 +fsspec[http]>=0.8.1 tensorboard>=2.2.0 diff --git a/requirements/docs.txt b/requirements/docs.txt index 0f8f2005b88b1..df596ed2bdda8 100644 --- a/requirements/docs.txt +++ b/requirements/docs.txt @@ -11,4 +11,4 @@ https://github.com/PyTorchLightning/lightning_sphinx_theme/archive/master.zip#eg sphinx-autodoc-typehints sphinx-paramlinks<0.4.0 sphinx-togglebutton -sphinx-copybutton +sphinx-copybutton \ No newline at end of file diff --git a/tests/models/test_restore.py b/tests/models/test_restore.py index 17821570bdfa7..f7773f63aa8c2 100644 --- a/tests/models/test_restore.py +++ b/tests/models/test_restore.py @@ -27,7 +27,7 @@ import tests.base.develop_utils as tutils from pytorch_lightning import Callback, LightningModule, Trainer, seed_everything from pytorch_lightning.callbacks import ModelCheckpoint -from tests.base import EvalModelTemplate, GenericEvalModelTemplate, TrialMNIST +from tests.base import BoringModel, EvalModelTemplate, GenericEvalModelTemplate, TrialMNIST class ModelTrainerPropertyParity(Callback): @@ -73,6 +73,25 @@ def test_model_properties_resume_from_checkpoint(enable_pl_optimizer, tmpdir): trainer.fit(model) +def test_try_resume_from_non_existing_checkpoint(tmpdir): + """ Test that trying to resume from non-existing `resume_from_checkpoint` fail without error.""" + model = BoringModel() + checkpoint_cb = ModelCheckpoint(dirpath=tmpdir, monitor="early_stop_on", save_last=True) + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=1, + logger=False, + callbacks=[checkpoint_cb], + limit_train_batches=0.1, + limit_val_batches=0.1, + ) + # Generate checkpoint `last.ckpt` with BoringModel + trainer.fit(model) + # `True` if resume/restore successfully else `False` + assert trainer.checkpoint_connector.restore(str(tmpdir / "last.ckpt"), trainer.on_gpu) + assert not trainer.checkpoint_connector.restore(str(tmpdir / "last_non_existing.ckpt"), trainer.on_gpu) + + class CaptureCallbacksBeforeTraining(Callback): callbacks = [] From f7402455218e5087e61d8255a4a87a8db58a7194 Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Tue, 5 Jan 2021 08:24:49 +0530 Subject: [PATCH 067/136] Disable checkpointing, earlystopping and logging with fast_dev_run (#5277) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Disable checkpointing, earlystopping and logger with fast_dev_run * docs * chlog * disable callbacks and enable DummyLogger * add log * use dummy logger method * Apply suggestions from code review Co-authored-by: Adrian Wälchli Co-authored-by: chaton Co-authored-by: Jirka Borovec --- CHANGELOG.md | 2 + 
docs/source/debugging.rst | 7 +- docs/source/trainer.rst | 6 +- pytorch_lightning/callbacks/early_stopping.py | 23 ++--- .../callbacks/gpu_stats_monitor.py | 3 +- pytorch_lightning/callbacks/lr_monitor.py | 5 +- .../callbacks/model_checkpoint.py | 11 +-- pytorch_lightning/callbacks/progress.py | 3 +- .../trainer/connectors/debugging_connector.py | 13 ++- pytorch_lightning/trainer/properties.py | 34 ++++++-- pytorch_lightning/trainer/training_loop.py | 5 +- tests/callbacks/test_early_stopping.py | 12 +-- .../test_checkpoint_callback_frequency.py | 47 +--------- tests/checkpointing/test_model_checkpoint.py | 8 +- tests/loggers/test_all.py | 6 +- tests/trainer/flags/test_fast_dev_run.py | 86 ++++++++++++++++++- 16 files changed, 168 insertions(+), 103 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 68941743ed00e..9801f56f6f1bf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,6 +23,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Fixed +- Disabled checkpointing, earlystopping and logger with `fast_dev_run` ([#5277](https://github.com/PyTorchLightning/pytorch-lightning/pull/5277)) + ## [1.1.2] - 2020-12-23 diff --git a/docs/source/debugging.rst b/docs/source/debugging.rst index 5eaf4303d3e4c..f3faa72f1e95e 100644 --- a/docs/source/debugging.rst +++ b/docs/source/debugging.rst @@ -28,13 +28,18 @@ The point is to detect any bugs in the training/validation loop without having t argument of :class:`~pytorch_lightning.trainer.trainer.Trainer`) .. testcode:: - + # runs 1 train, val, test batch and program ends trainer = Trainer(fast_dev_run=True) # runs 7 train, val, test batches and program ends trainer = Trainer(fast_dev_run=7) +.. note:: + + This argument will disable tuner, checkpoint callbacks, early stopping callbacks, + loggers and logger callbacks like ``LearningRateLogger`` and runs for only 1 epoch. + ---------------- Inspect gradient norms diff --git a/docs/source/trainer.rst b/docs/source/trainer.rst index 3dfe120cde959..8d42541a3fbb4 100644 --- a/docs/source/trainer.rst +++ b/docs/source/trainer.rst @@ -666,9 +666,9 @@ Under the hood the pseudocode looks like this when running *fast_dev_run* with a .. note:: This argument is a bit different from ``limit_train/val/test_batches``. Setting this argument will - disable tuner, logger callbacks like ``LearningRateLogger`` and runs for only 1 epoch. This must be - used only for debugging purposes. ``limit_train/val/test_batches`` only limits the number of batches and won't - disable anything. + disable tuner, checkpoint callbacks, early stopping callbacks, loggers and logger callbacks like + ``LearningRateLogger`` and runs for only 1 epoch. This must be used only for debugging purposes. + ``limit_train/val/test_batches`` only limits the number of batches and won't disable anything. 
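For context on the doc change above: after patch 067, ``fast_dev_run`` does more than shrink the batch counts; checkpointing and early stopping become no-ops for that run and the logger is swapped for a ``DummyLogger``. A short sketch, reusing the illustrative ``TinyModel`` from the resume_from_checkpoint example above (any LightningModule would do):

    from pytorch_lightning import Trainer
    from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint

    trainer = Trainer(
        fast_dev_run=True,  # an int such as fast_dev_run=7 runs that many batches
        callbacks=[ModelCheckpoint(), EarlyStopping()],
    )
    trainer.fit(TinyModel())

    # with this patch applied, the run above writes no checkpoint files,
    # never triggers early stopping, and trainer.logger is a DummyLogger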
flush_logs_every_n_steps ^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/pytorch_lightning/callbacks/early_stopping.py b/pytorch_lightning/callbacks/early_stopping.py index 4125a924cb2c5..3e15d8462350c 100644 --- a/pytorch_lightning/callbacks/early_stopping.py +++ b/pytorch_lightning/callbacks/early_stopping.py @@ -28,7 +28,7 @@ from pytorch_lightning import _logger as log from pytorch_lightning.callbacks.base import Callback from pytorch_lightning.metrics.metric import Metric -from pytorch_lightning.utilities import TPU_AVAILABLE, rank_zero_info, rank_zero_warn +from pytorch_lightning.utilities import rank_zero_info, rank_zero_warn, TPU_AVAILABLE class EarlyStopping(Callback): @@ -166,10 +166,10 @@ def on_validation_end(self, trainer, pl_module): self._run_early_stopping_check(trainer, pl_module) def on_validation_epoch_end(self, trainer, pl_module): - if trainer.running_sanity_check: + if trainer.fast_dev_run or trainer.running_sanity_check: return - if self._validate_condition_metric(trainer.logger_connector.callback_metrics): + if self._validate_condition_metric(trainer.callback_metrics): # turn off early stopping in on_train_epoch_end self.based_on_eval_results = True @@ -178,24 +178,19 @@ def on_train_epoch_end(self, trainer, pl_module, outputs): if self.based_on_eval_results: return - # early stopping can also work in the train loop when there is no val loop - should_check_early_stop = False - - # fallback to monitor key in result dict - if trainer.logger_connector.callback_metrics.get(self.monitor, None) is not None: - should_check_early_stop = True - - if should_check_early_stop: - self._run_early_stopping_check(trainer, pl_module) + self._run_early_stopping_check(trainer, pl_module) def _run_early_stopping_check(self, trainer, pl_module): """ Checks whether the early stopping condition is met and if so tells the trainer to stop the training. 
""" - logs = trainer.logger_connector.callback_metrics + logs = trainer.callback_metrics - if not self._validate_condition_metric(logs): + if ( + trainer.fast_dev_run # disable early_stopping with fast_dev_run + or not self._validate_condition_metric(logs) # short circuit if metric not present + ): return # short circuit if metric not present current = logs.get(self.monitor) diff --git a/pytorch_lightning/callbacks/gpu_stats_monitor.py b/pytorch_lightning/callbacks/gpu_stats_monitor.py index b083511392bb3..1403d0bdf2e31 100644 --- a/pytorch_lightning/callbacks/gpu_stats_monitor.py +++ b/pytorch_lightning/callbacks/gpu_stats_monitor.py @@ -24,7 +24,7 @@ import shutil import subprocess import time -from typing import List, Tuple, Dict +from typing import Dict, List, Tuple from pytorch_lightning.callbacks.base import Callback from pytorch_lightning.utilities import rank_zero_only @@ -213,5 +213,4 @@ def _should_log(trainer) -> bool: or trainer.should_stop ) - should_log = should_log and not trainer.fast_dev_run return should_log diff --git a/pytorch_lightning/callbacks/lr_monitor.py b/pytorch_lightning/callbacks/lr_monitor.py index 9799e0d3298d3..712695d69ecec 100755 --- a/pytorch_lightning/callbacks/lr_monitor.py +++ b/pytorch_lightning/callbacks/lr_monitor.py @@ -105,7 +105,7 @@ def on_train_batch_start(self, trainer, *args, **kwargs): interval = 'step' if self.logging_interval is None else 'any' latest_stat = self._extract_stats(trainer, interval) - if trainer.logger is not None and latest_stat: + if latest_stat: trainer.logger.log_metrics(latest_stat, step=trainer.global_step) def on_train_epoch_start(self, trainer, *args, **kwargs): @@ -113,7 +113,7 @@ def on_train_epoch_start(self, trainer, *args, **kwargs): interval = 'epoch' if self.logging_interval is None else 'any' latest_stat = self._extract_stats(trainer, interval) - if trainer.logger is not None and latest_stat: + if latest_stat: trainer.logger.log_metrics(latest_stat, step=trainer.global_step) def _extract_stats(self, trainer, interval: str) -> Dict[str, float]: @@ -190,5 +190,4 @@ def _should_log(trainer) -> bool: or trainer.should_stop ) - should_log = should_log and not trainer.fast_dev_run return should_log diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py index 5a1079f8063f4..a578c1d697f8e 100644 --- a/pytorch_lightning/callbacks/model_checkpoint.py +++ b/pytorch_lightning/callbacks/model_checkpoint.py @@ -20,11 +20,11 @@ """ -from copy import deepcopy import numbers import os -from pathlib import Path import re +from copy import deepcopy +from pathlib import Path from typing import Any, Dict, Optional, Union import numpy as np @@ -224,7 +224,8 @@ def save_checkpoint(self, trainer, pl_module): global_step = trainer.global_step if ( - self.save_top_k == 0 # no models are saved + trainer.fast_dev_run # disable checkpointing with fast_dev_run + or self.save_top_k == 0 # no models are saved or self.period < 1 # no models are saved or (epoch + 1) % self.period # skip epoch or trainer.running_sanity_check # don't save anything during sanity check @@ -478,14 +479,14 @@ def __resolve_ckpt_dir(self, trainer, pl_module): version, name = trainer.accelerator_backend.broadcast((version, trainer.logger.name)) ckpt_path = os.path.join( - save_dir, name, version, "checkpoints" + save_dir, str(name), version, "checkpoints" ) else: ckpt_path = os.path.join(trainer.weights_save_path, "checkpoints") self.dirpath = ckpt_path - if trainer.is_global_zero: + if not trainer.fast_dev_run 
and trainer.is_global_zero: self._fs.makedirs(self.dirpath, exist_ok=True) def _add_backward_monitor_support(self, trainer): diff --git a/pytorch_lightning/callbacks/progress.py b/pytorch_lightning/callbacks/progress.py index 6582f16fd27be..3ed5c11fd75d7 100644 --- a/pytorch_lightning/callbacks/progress.py +++ b/pytorch_lightning/callbacks/progress.py @@ -22,7 +22,6 @@ import importlib import sys - # check if ipywidgets is installed before importing tqdm.auto # to ensure it won't fail and a progress bar is displayed if importlib.util.find_spec('ipywidgets') is not None: @@ -323,7 +322,7 @@ def on_epoch_start(self, trainer, pl_module): super().on_epoch_start(trainer, pl_module) total_train_batches = self.total_train_batches total_val_batches = self.total_val_batches - if total_train_batches != float('inf') and not trainer.fast_dev_run: + if total_train_batches != float('inf'): # val can be checked multiple times per epoch val_checks_per_epoch = total_train_batches // trainer.val_check_batch total_val_batches = total_val_batches * val_checks_per_epoch diff --git a/pytorch_lightning/trainer/connectors/debugging_connector.py b/pytorch_lightning/trainer/connectors/debugging_connector.py index 61d7cbd189fde..ecba35d5dbf55 100644 --- a/pytorch_lightning/trainer/connectors/debugging_connector.py +++ b/pytorch_lightning/trainer/connectors/debugging_connector.py @@ -12,9 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -from pytorch_lightning.utilities.exceptions import MisconfigurationException from typing import Union -from pytorch_lightning.utilities import rank_zero_warn, rank_zero_info + +from pytorch_lightning.loggers.base import DummyLogger +from pytorch_lightning.utilities import rank_zero_info, rank_zero_warn +from pytorch_lightning.utilities.exceptions import MisconfigurationException class DebuggingConnector: @@ -54,11 +56,16 @@ def on_init_start( limit_train_batches = fast_dev_run limit_val_batches = fast_dev_run limit_test_batches = fast_dev_run + self.trainer.max_steps = fast_dev_run self.trainer.num_sanity_val_steps = 0 self.trainer.max_epochs = 1 + self.trainer.val_check_interval = 1.0 + self.trainer.check_val_every_n_epoch = 1 + self.trainer.logger = DummyLogger() + rank_zero_info( 'Running in fast_dev_run mode: will run a full train,' - f' val and test loop using {fast_dev_run} batch(es)' + f' val and test loop using {fast_dev_run} batch(es).' 
) self.trainer.limit_train_batches = _determine_batch_limits(limit_train_batches, 'limit_train_batches') diff --git a/pytorch_lightning/trainer/properties.py b/pytorch_lightning/trainer/properties.py index 355bbad3a037e..614c863fa7256 100644 --- a/pytorch_lightning/trainer/properties.py +++ b/pytorch_lightning/trainer/properties.py @@ -15,10 +15,10 @@ import os from abc import ABC from argparse import ArgumentParser, Namespace -from typing import List, Optional, Type, TypeVar, Union, cast +from typing import cast, List, Optional, Type, TypeVar, Union from pytorch_lightning.accelerators.accelerator import Accelerator -from pytorch_lightning.callbacks import Callback, ModelCheckpoint, ProgressBarBase +from pytorch_lightning.callbacks import Callback, EarlyStopping, ModelCheckpoint, ProgressBarBase from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.core.optimizer import is_lightning_optimizer from pytorch_lightning.loggers.base import LightningLoggerBase @@ -27,7 +27,7 @@ from pytorch_lightning.trainer.connectors.logger_connector import LoggerConnector from pytorch_lightning.trainer.connectors.model_connector import ModelConnector from pytorch_lightning.trainer.states import TrainerState -from pytorch_lightning.utilities import HOROVOD_AVAILABLE, TPU_AVAILABLE, argparse_utils, rank_zero_warn +from pytorch_lightning.utilities import argparse_utils, HOROVOD_AVAILABLE, rank_zero_warn, TPU_AVAILABLE from pytorch_lightning.utilities.cloud_io import get_filesystem from pytorch_lightning.utilities.model_utils import is_overridden @@ -196,7 +196,7 @@ def enable_validation(self) -> bool: """ Check if we should run validation during training. """ model_ref = self.model_connector.get_model() val_loop_enabled = is_overridden('validation_step', model_ref) and self.limit_val_batches > 0 - return val_loop_enabled or self.fast_dev_run + return val_loop_enabled @property def default_root_dir(self) -> str: @@ -218,18 +218,38 @@ def weights_save_path(self) -> str: return os.path.normpath(self._weights_save_path) return self._weights_save_path + @property + def early_stopping_callback(self) -> Optional[EarlyStopping]: + """ + The first :class:`~pytorch_lightning.callbacks.early_stopping.EarlyStopping` + callback in the Trainer.callbacks list, or ``None`` if it doesn't exist. + """ + callbacks = self.early_stopping_callbacks + return callbacks[0] if len(callbacks) > 0 else None + + @property + def early_stopping_callbacks(self) -> List[EarlyStopping]: + """ + A list of all instances of :class:`~pytorch_lightning.callbacks.early_stopping.EarlyStopping` + found in the Trainer.callbacks list. + """ + return [c for c in self.callbacks if isinstance(c, EarlyStopping)] + @property def checkpoint_callback(self) -> Optional[ModelCheckpoint]: """ - The first checkpoint callback in the Trainer.callbacks list, or ``None`` if - no checkpoint callbacks exist. + The first :class:`~pytorch_lightning.callbacks.model_checkpoint.ModelCheckpoint` + callback in the Trainer.callbacks list, or ``None`` if it doesn't exist. """ callbacks = self.checkpoint_callbacks return callbacks[0] if len(callbacks) > 0 else None @property def checkpoint_callbacks(self) -> List[ModelCheckpoint]: - """ A list of all instances of ModelCheckpoint found in the Trainer.callbacks list. """ + """ + A list of all instances of :class:`~pytorch_lightning.callbacks.model_checkpoint.ModelCheckpoint` + found in the Trainer.callbacks list. 
+ """ return [c for c in self.callbacks if isinstance(c, ModelCheckpoint)] def save_checkpoint(self, filepath, weights_only: bool = False): diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index fe4525006ebb9..0271afe3c2d91 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -915,9 +915,8 @@ def build_train_args(self, batch, batch_idx, opt_idx, hiddens): def save_loggers_on_train_batch_end(self): # when loggers should save to disk should_flush_logs = self.trainer.logger_connector.should_flush_logs - if should_flush_logs or self.trainer.fast_dev_run is True: - if self.trainer.is_global_zero and self.trainer.logger is not None: - self.trainer.logger.save() + if should_flush_logs and self.trainer.is_global_zero and self.trainer.logger is not None: + self.trainer.logger.save() def process_train_step_outputs(self, all_train_step_outputs, early_stopping_accumulator, checkpoint_accumulator): """ diff --git a/tests/callbacks/test_early_stopping.py b/tests/callbacks/test_early_stopping.py index 7cecefad03276..5c54f6a84805d 100644 --- a/tests/callbacks/test_early_stopping.py +++ b/tests/callbacks/test_early_stopping.py @@ -13,18 +13,17 @@ # limitations under the License. import os import pickle +from unittest import mock import cloudpickle import numpy as np import pytest import torch -from unittest import mock -from pytorch_lightning import _logger -from pytorch_lightning import Trainer, seed_everything +from pytorch_lightning import _logger, seed_everything, Trainer from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint -from tests.base import EvalModelTemplate, BoringModel from pytorch_lightning.utilities.exceptions import MisconfigurationException +from tests.base import BoringModel, EvalModelTemplate class EarlyStoppingTestRestore(EarlyStopping): @@ -87,15 +86,18 @@ def test_resume_early_stopping_from_checkpoint(tmpdir): def test_early_stopping_no_extraneous_invocations(tmpdir): """Test to ensure that callback methods aren't being invoked outside of the callback handler.""" model = EvalModelTemplate() + early_stop_callback = EarlyStopping() expected_count = 4 trainer = Trainer( default_root_dir=tmpdir, - callbacks=[EarlyStopping()], + callbacks=[early_stop_callback], val_check_interval=1.0, max_epochs=expected_count, ) trainer.fit(model) + assert trainer.early_stopping_callback == early_stop_callback + assert trainer.early_stopping_callbacks == [early_stop_callback] assert len(trainer.dev_debugger.early_stopping_history) == expected_count diff --git a/tests/checkpointing/test_checkpoint_callback_frequency.py b/tests/checkpointing/test_checkpoint_callback_frequency.py index 857877f8239ba..f9686dce159dd 100644 --- a/tests/checkpointing/test_checkpoint_callback_frequency.py +++ b/tests/checkpointing/test_checkpoint_callback_frequency.py @@ -17,55 +17,10 @@ import pytest import torch -from pytorch_lightning import Trainer, callbacks, seed_everything +from pytorch_lightning import callbacks, seed_everything, Trainer from tests.base import BoringModel -@mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"}) -def test_mc_called_on_fastdevrun(tmpdir): - seed_everything(1234) - - train_val_step_model = BoringModel() - - # fast dev run = called once - # train loop only, dict, eval result - trainer = Trainer(fast_dev_run=True) - trainer.fit(train_val_step_model) - - # checkpoint should have been called once with fast dev run - assert len(trainer.dev_debugger.checkpoint_callback_history) 
== 1 - - # ----------------------- - # also called once with no val step - # ----------------------- - class TrainingStepCalled(BoringModel): - def __init__(self): - super().__init__() - self.training_step_called = False - self.validation_step_called = False - self.test_step_called = False - - def training_step(self, batch, batch_idx): - self.training_step_called = True - return super().training_step(batch, batch_idx) - - train_step_only_model = TrainingStepCalled() - train_step_only_model.validation_step = None - - # fast dev run = called once - # train loop only, dict, eval result - trainer = Trainer(fast_dev_run=True) - trainer.fit(train_step_only_model) - - # make sure only training step was called - assert train_step_only_model.training_step_called - assert not train_step_only_model.validation_step_called - assert not train_step_only_model.test_step_called - - # checkpoint should have been called once with fast dev run - assert len(trainer.dev_debugger.checkpoint_callback_history) == 1 - - @mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"}) def test_mc_called(tmpdir): seed_everything(1234) diff --git a/tests/checkpointing/test_model_checkpoint.py b/tests/checkpointing/test_model_checkpoint.py index 106c34030051e..99ed807f111e5 100644 --- a/tests/checkpointing/test_model_checkpoint.py +++ b/tests/checkpointing/test_model_checkpoint.py @@ -28,7 +28,7 @@ import pytorch_lightning as pl import tests.base.develop_utils as tutils -from pytorch_lightning import Trainer, seed_everything +from pytorch_lightning import seed_everything, Trainer from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.loggers import TensorBoardLogger from pytorch_lightning.utilities.cloud_io import load as pl_load @@ -896,7 +896,8 @@ def training_step(self, *args): ) trainer = Trainer( default_root_dir=tmpdir, - fast_dev_run=True, + limit_train_batches=1, + limit_val_batches=1, callbacks=[model_checkpoint], logger=False, weights_summary=None, @@ -922,7 +923,8 @@ def __init__(self, hparams): ) trainer = Trainer( default_root_dir=tmpdir, - fast_dev_run=True, + limit_train_batches=1, + limit_val_batches=1, callbacks=[model_checkpoint], logger=False, weights_summary=None, diff --git a/tests/loggers/test_all.py b/tests/loggers/test_all.py index 89c731d432ee9..795b1a91e688e 100644 --- a/tests/loggers/test_all.py +++ b/tests/loggers/test_all.py @@ -114,9 +114,9 @@ def log_metrics(self, metrics, step): trainer = Trainer( max_epochs=1, logger=logger, - limit_train_batches=0.2, - limit_val_batches=0.5, - fast_dev_run=True, + limit_train_batches=1, + limit_val_batches=1, + log_every_n_steps=1, default_root_dir=tmpdir, ) trainer.fit(model) diff --git a/tests/trainer/flags/test_fast_dev_run.py b/tests/trainer/flags/test_fast_dev_run.py index 00c62cdf48fce..624b3cc6ac9c2 100644 --- a/tests/trainer/flags/test_fast_dev_run.py +++ b/tests/trainer/flags/test_fast_dev_run.py @@ -1,14 +1,20 @@ +import os +from unittest import mock + import pytest +import torch + from pytorch_lightning import Trainer -from tests.base import EvalModelTemplate +from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint +from pytorch_lightning.loggers.base import DummyLogger +from tests.base import BoringModel @pytest.mark.parametrize('tuner_alg', ['batch size scaler', 'learning rate finder']) def test_skip_on_fast_dev_run_tuner(tmpdir, tuner_alg): """ Test that tuner algorithms are skipped if fast dev run is enabled """ - hparams = EvalModelTemplate.get_default_hparams() - model = EvalModelTemplate(**hparams) + model = 
BoringModel() trainer = Trainer( default_root_dir=tmpdir, max_epochs=2, @@ -19,3 +25,77 @@ def test_skip_on_fast_dev_run_tuner(tmpdir, tuner_alg): expected_message = f'Skipping {tuner_alg} since fast_dev_run is enabled.' with pytest.warns(UserWarning, match=expected_message): trainer.tune(model) + + +@pytest.mark.parametrize('fast_dev_run', [1, 4]) +@mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"}) +def test_callbacks_and_logger_not_called_with_fastdevrun(tmpdir, fast_dev_run): + """ + Test that ModelCheckpoint, EarlyStopping and Logger are turned off with fast_dev_run + """ + class FastDevRunModel(BoringModel): + def __init__(self): + super().__init__() + self.training_step_called = False + self.validation_step_called = False + self.test_step_called = False + + def training_step(self, batch, batch_idx): + self.log('some_metric', torch.tensor(7.)) + self.logger.experiment.dummy_log('some_distribution', torch.randn(7) + batch_idx) + self.training_step_called = True + return super().training_step(batch, batch_idx) + + def validation_step(self, batch, batch_idx): + self.validation_step_called = True + return super().validation_step(batch, batch_idx) + + checkpoint_callback = ModelCheckpoint() + early_stopping_callback = EarlyStopping() + trainer_config = dict( + fast_dev_run=fast_dev_run, + logger=True, + log_every_n_steps=1, + callbacks=[checkpoint_callback, early_stopping_callback], + ) + + def _make_fast_dev_run_assertions(trainer): + # there should be no logger with fast_dev_run + assert isinstance(trainer.logger, DummyLogger) + assert len(trainer.dev_debugger.logged_metrics) == fast_dev_run + + # checkpoint callback should not have been called with fast_dev_run + assert trainer.checkpoint_callback == checkpoint_callback + assert not os.path.exists(checkpoint_callback.dirpath) + assert len(trainer.dev_debugger.checkpoint_callback_history) == 0 + + # early stopping should not have been called with fast_dev_run + assert trainer.early_stopping_callback == early_stopping_callback + assert len(trainer.dev_debugger.early_stopping_history) == 0 + + train_val_step_model = FastDevRunModel() + trainer = Trainer(**trainer_config) + results = trainer.fit(train_val_step_model) + assert results + + # make sure both training_step and validation_step were called + assert train_val_step_model.training_step_called + assert train_val_step_model.validation_step_called + + _make_fast_dev_run_assertions(trainer) + + # ----------------------- + # also called once with no val step + # ----------------------- + train_step_only_model = FastDevRunModel() + train_step_only_model.validation_step = None + + trainer = Trainer(**trainer_config) + results = trainer.fit(train_step_only_model) + assert results + + # make sure only training_step was called + assert train_step_only_model.training_step_called + assert not train_step_only_model.validation_step_called + + _make_fast_dev_run_assertions(trainer) From c7d0f4c3a29bd5524e0b66f9196f123b64d1587a Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Tue, 5 Jan 2021 11:37:28 +0530 Subject: [PATCH 068/136] Add a check for optimizer attatched to lr_scheduler (#5338) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add a check for scheduler and optimizer * pep * Apply suggestions from code review Co-authored-by: Adrian Wälchli Co-authored-by: Adrian Wälchli --- CHANGELOG.md | 3 +++ pytorch_lightning/trainer/optimizers.py | 9 +++++++++ tests/trainer/test_optimizers.py | 17 +++++++++++++++++ 3 files changed, 29 insertions(+) diff --git 
a/CHANGELOG.md b/CHANGELOG.md index 9801f56f6f1bf..8af6984ac98ca 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,9 +9,12 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Added + +- Added a check for optimizer attached to lr_scheduler ([#5338](https://github.com/PyTorchLightning/pytorch-lightning/pull/5338)) - Added `resume_from_checkpoint` accept non-existing file path ([#4402](https://github.com/PyTorchLightning/pytorch-lightning/pull/4402)) + ### Changed diff --git a/pytorch_lightning/trainer/optimizers.py b/pytorch_lightning/trainer/optimizers.py index 479d401720261..974ee898ff00b 100644 --- a/pytorch_lightning/trainer/optimizers.py +++ b/pytorch_lightning/trainer/optimizers.py @@ -75,7 +75,9 @@ def init_optimizers(self, model: LightningModule) -> Tuple[List, List, List]: ' * {"optimizer": `torch.optim.Optimizer`, (optional) "lr_scheduler": `torch.optim.lr_scheduler`}\n' ' * A list of the previously described dict format, with an optional "frequency" key (int)' ) + lr_schedulers = self.configure_schedulers(lr_schedulers, monitor=monitor) + _validate_scheduler_optimizer(optimizers, lr_schedulers) return optimizers, lr_schedulers, optimizer_frequencies @@ -183,3 +185,10 @@ def zero_grad(self): def __repr__(self): return 'No Optimizer' + + +def _validate_scheduler_optimizer(optimizers, lr_schedulers): + if any(sch['scheduler'].optimizer not in optimizers for sch in lr_schedulers): + raise MisconfigurationException( + "Some schedulers are attatched with an optimizer that wasn't returned from `configure_optimizers`." + ) diff --git a/tests/trainer/test_optimizers.py b/tests/trainer/test_optimizers.py index 52e085b2b7b8c..e9a422dfb4711 100644 --- a/tests/trainer/test_optimizers.py +++ b/tests/trainer/test_optimizers.py @@ -483,3 +483,20 @@ def test_lr_scheduler_with_no_actual_scheduler_raises(tmpdir): trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True) with pytest.raises(MisconfigurationException, match='The lr scheduler dict must have the key "scheduler"'): trainer.fit(model) + + +def test_invalid_optimizer_in_scheduler(tmpdir): + """ + Test exception when optimizer attatched to lr_schedulers wasn't returned + """ + class InvalidOptimizerModel(BoringModel): + def configure_optimizers(self): + opt1 = torch.optim.SGD(self.layer.parameters(), lr=0.1) + opt2 = torch.optim.SGD(self.layer.parameters(), lr=0.1) + lr_scheduler = torch.optim.lr_scheduler.StepLR(opt2, step_size=1) + return [opt1], [lr_scheduler] + + model = InvalidOptimizerModel() + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True) + with pytest.raises(MisconfigurationException, match="attatched with an optimizer that wasn't returned"): + trainer.fit(model) From 371daea594f1f9b6b1f3a7071688ba61fe0b335a Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Tue, 5 Jan 2021 12:21:22 +0530 Subject: [PATCH 069/136] Allow log_momentum for adaptive optimizers (#5333) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix * fix * chlog * no momentum warning * Apply suggestions from code review Co-authored-by: Adrian Wälchli * ref Co-authored-by: Adrian Wälchli --- CHANGELOG.md | 5 +- pytorch_lightning/callbacks/lr_monitor.py | 64 +++++++++++++--------- tests/callbacks/test_lr_monitor.py | 66 +++++++++++++++++++---- 3 files changed, 98 insertions(+), 37 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8af6984ac98ca..5614a07b9da01 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,10 +9,9 @@ The format is based on [Keep a 
Changelog](http://keepachangelog.com/en/1.0.0/). ### Added - - Added a check for optimizer attached to lr_scheduler ([#5338](https://github.com/PyTorchLightning/pytorch-lightning/pull/5338)) -- Added `resume_from_checkpoint` accept non-existing file path ([#4402](https://github.com/PyTorchLightning/pytorch-lightning/pull/4402)) +- Added `resume_from_checkpoint` accept non-existing file path ([#4402](https://github.com/PyTorchLightning/pytorch-lightning/pull/4402)) ### Changed @@ -26,6 +25,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Fixed +- Allowed `log_momentum` for adaptive optimizers in `LearningRateMonitor` ([#5333](https://github.com/PyTorchLightning/pytorch-lightning/pull/5333)) + - Disabled checkpointing, earlystopping and logger with `fast_dev_run` ([#5277](https://github.com/PyTorchLightning/pytorch-lightning/pull/5277)) diff --git a/pytorch_lightning/callbacks/lr_monitor.py b/pytorch_lightning/callbacks/lr_monitor.py index 712695d69ecec..b3c3f36577a67 100755 --- a/pytorch_lightning/callbacks/lr_monitor.py +++ b/pytorch_lightning/callbacks/lr_monitor.py @@ -33,11 +33,11 @@ class LearningRateMonitor(Callback): Automatically monitor and logs learning rate for learning rate schedulers during training. Args: - logging_interval: set to `epoch` or `step` to log `lr` of all optimizers - at the same interval, set to `None` to log at individual interval - according to the `interval` key of each scheduler. Defaults to ``None``. + logging_interval: set to ``'epoch'`` or ``'step'`` to log ``lr`` of all optimizers + at the same interval, set to ``None`` to log at individual interval + according to the ``interval`` key of each scheduler. Defaults to ``None``. log_momentum: option to also log the momentum values of the optimizer, if the optimizer - has the `momentum` attribute. Defaults to ``False``. + has the ``momentum`` or ``betas`` attribute. Defaults to ``False``. Example:: @@ -47,17 +47,19 @@ class LearningRateMonitor(Callback): >>> trainer = Trainer(callbacks=[lr_monitor]) Logging names are automatically determined based on optimizer class name. - In case of multiple optimizers of same type, they will be named `Adam`, - `Adam-1` etc. If a optimizer has multiple parameter groups they will - be named `Adam/pg1`, `Adam/pg2` etc. To control naming, pass in a - `name` keyword in the construction of the learning rate schdulers + In case of multiple optimizers of same type, they will be named ``Adam``, + ``Adam-1`` etc. If a optimizer has multiple parameter groups they will + be named ``Adam/pg1``, ``Adam/pg2`` etc. To control naming, pass in a + ``name`` keyword in the construction of the learning rate schdulers Example:: def configure_optimizer(self): optimizer = torch.optim.Adam(...) - lr_scheduler = {'scheduler': torch.optim.lr_scheduler.LambdaLR(optimizer, ...) - 'name': 'my_logging_name'} + lr_scheduler = { + 'scheduler': torch.optim.lr_scheduler.LambdaLR(optimizer, ...) + 'name': 'my_logging_name' + } return [optimizer], [lr_scheduler] """ @@ -80,16 +82,28 @@ def on_train_start(self, trainer, *args, **kwargs): """ if not trainer.logger: raise MisconfigurationException( - 'Cannot use LearningRateMonitor callback with Trainer that has no logger.' + 'Cannot use `LearningRateMonitor` callback with `Trainer` that has no logger.' ) if not trainer.lr_schedulers: rank_zero_warn( - 'You are using LearningRateMonitor callback with models that' + 'You are using `LearningRateMonitor` callback with models that' ' have no learning rate schedulers. 
Please see documentation' ' for `configure_optimizers` method.', RuntimeWarning ) + if self.log_momentum: + def _check_no_key(key): + return any( + key not in sch['scheduler'].optimizer.defaults for sch in trainer.lr_schedulers + ) + + if _check_no_key('momentum') and _check_no_key('betas'): + rank_zero_warn( + "You have set log_momentum=True, but some optimizers do not" + " have momentum. This will log a value 0 for the momentum.", RuntimeWarning + ) + # Find names for schedulers names = self._find_names(trainer.lr_schedulers) @@ -121,19 +135,17 @@ def _extract_stats(self, trainer, interval: str) -> Dict[str, float]: for name, scheduler in zip(self.lr_sch_names, trainer.lr_schedulers): if scheduler['interval'] == interval or interval == 'any': - param_groups = scheduler['scheduler'].optimizer.param_groups - if len(param_groups) != 1: - for i, pg in enumerate(param_groups): - lr = self._extract_lr(param_group=pg, name=f'{name}/pg{i + 1}') - latest_stat.update(lr) - momentum = self._extract_momentum(param_group=pg, name=f'{name}-momentum/pg{i + 1}') - latest_stat.update(momentum) - - else: - pg = param_groups[0] - lr = self._extract_lr(param_group=pg, name=name) + opt = scheduler['scheduler'].optimizer + param_groups = opt.param_groups + use_betas = 'betas' in opt.defaults + + for i, pg in enumerate(param_groups): + suffix = f'/pg{i + 1}' if len(param_groups) > 1 else '' + lr = self._extract_lr(param_group=pg, name=f'{name}{suffix}') latest_stat.update(lr) - momentum = self._extract_momentum(param_group=pg, name=f'{name}-momentum') + momentum = self._extract_momentum( + param_group=pg, name=f'{name}-momentum{suffix}', use_betas=use_betas + ) latest_stat.update(momentum) return latest_stat @@ -143,11 +155,11 @@ def _extract_lr(self, param_group, name: str) -> Dict[str, float]: self.lrs[name].append(lr) return {name: lr} - def _extract_momentum(self, param_group, name: str) -> Dict[str, float]: + def _extract_momentum(self, param_group, name: str, use_betas: bool) -> Dict[str, float]: if not self.log_momentum: return {} - momentum = param_group.get('momentum') + momentum = param_group.get('betas')[0] if use_betas else param_group.get('momentum', 0) self.last_momentum_values[name] = momentum return {name: momentum} diff --git a/tests/callbacks/test_lr_monitor.py b/tests/callbacks/test_lr_monitor.py index d29f254df67d0..39dd821e63dcd 100644 --- a/tests/callbacks/test_lr_monitor.py +++ b/tests/callbacks/test_lr_monitor.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import pytest +from torch import optim import tests.base.develop_utils as tutils from pytorch_lightning import Trainer @@ -47,19 +48,34 @@ def test_lr_monitor_single_lr(tmpdir): 'Names of learning rates not set correctly' -def test_lr_monitor_single_lr_with_momentum(tmpdir): - """ Test that learning rates and momentum are extracted and logged for single lr scheduler. """ - tutils.reset_seed() +@pytest.mark.parametrize('opt', ['SGD', 'Adam']) +def test_lr_monitor_single_lr_with_momentum(tmpdir, opt): + """ + Test that learning rates and momentum are extracted and logged for single lr scheduler. 
+ """ + class LogMomentumModel(BoringModel): + def __init__(self, opt): + super().__init__() + self.opt = opt - model = EvalModelTemplate() - model.configure_optimizers = model.configure_optimizers__onecycle_scheduler + def configure_optimizers(self): + if self.opt == 'SGD': + opt_kwargs = {'momentum': 0.9} + elif self.opt == 'Adam': + opt_kwargs = {'betas': (0.9, 0.999)} + optimizer = getattr(optim, self.opt)(self.parameters(), lr=1e-2, **opt_kwargs) + lr_scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr=1e-2, total_steps=10_000) + return [optimizer], [lr_scheduler] + + model = LogMomentumModel(opt=opt) lr_monitor = LearningRateMonitor(log_momentum=True) trainer = Trainer( default_root_dir=tmpdir, max_epochs=2, - limit_val_batches=0.1, - limit_train_batches=0.5, + limit_val_batches=2, + limit_train_batches=5, + log_every_n_steps=1, callbacks=[lr_monitor], ) result = trainer.fit(model) @@ -69,7 +85,39 @@ def test_lr_monitor_single_lr_with_momentum(tmpdir): 'Expected momentum to be logged' assert len(lr_monitor.last_momentum_values) == len(trainer.lr_schedulers), \ 'Number of momentum values logged does not match number of lr schedulers' - assert all([k in ['lr-SGD-momentum'] for k in lr_monitor.last_momentum_values.keys()]), \ + assert all(k == f'lr-{opt}-momentum' for k in lr_monitor.last_momentum_values.keys()), \ + 'Names of momentum values not set correctly' + + +def test_log_momentum_no_momentum_optimizer(tmpdir): + """ + Test that if optimizer doesn't have momentum then a warning is raised with log_momentum=True. + """ + class LogMomentumModel(BoringModel): + def configure_optimizers(self): + optimizer = optim.ASGD(self.parameters(), lr=1e-2) + lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1) + return [optimizer], [lr_scheduler] + + model = LogMomentumModel() + lr_monitor = LearningRateMonitor(log_momentum=True) + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=1, + limit_val_batches=2, + limit_train_batches=5, + log_every_n_steps=1, + callbacks=[lr_monitor], + ) + with pytest.warns(RuntimeWarning, match="optimizers do not have momentum."): + result = trainer.fit(model) + assert result + + assert all(v == 0 for v in lr_monitor.last_momentum_values.values()), \ + 'Expected momentum to be logged' + assert len(lr_monitor.last_momentum_values) == len(trainer.lr_schedulers), \ + 'Number of momentum values logged does not match number of lr schedulers' + assert all(k == 'lr-ASGD-momentum' for k in lr_monitor.last_momentum_values.keys()), \ 'Names of momentum values not set correctly' @@ -105,7 +153,7 @@ def test_lr_monitor_no_logger(tmpdir): logger=False ) - with pytest.raises(MisconfigurationException, match='Trainer that has no logger'): + with pytest.raises(MisconfigurationException, match='`Trainer` that has no logger'): trainer.fit(model) From 062800aa99cff6eb82838b355374305d9433507e Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Tue, 5 Jan 2021 13:13:18 +0530 Subject: [PATCH 070/136] Fix invalid value for weights_summary (#5296) * Fix weights_summary * use mode * fix * optional * what was I thinking --- pytorch_lightning/core/lightning.py | 20 ++++++++++++++------ pytorch_lightning/trainer/trainer.py | 4 ++-- pytorch_lightning/trainer/training_loop.py | 22 ++++++++++++++++------ tests/core/test_memory.py | 12 +++++++++++- 4 files changed, 43 insertions(+), 15 deletions(-) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index a4330b401936d..421fc5e5cf2ac 100644 --- a/pytorch_lightning/core/lightning.py +++ 
b/pytorch_lightning/core/lightning.py @@ -14,15 +14,15 @@ """nn.Module with additional great features.""" -from abc import ABC -from argparse import Namespace import collections import copy import inspect import os -from pathlib import Path import re import tempfile +from abc import ABC +from argparse import Namespace +from pathlib import Path from typing import Any, Callable, Dict, List, Mapping, Optional, Sequence, Tuple, Union import torch @@ -1327,9 +1327,17 @@ def tbptt_split_batch(self, batch, split_size): return splits - def summarize(self, mode: str = ModelSummary.MODE_DEFAULT) -> ModelSummary: - model_summary = ModelSummary(self, mode=mode) - log.info("\n" + str(model_summary)) + def summarize(self, mode: Optional[str] = ModelSummary.MODE_DEFAULT) -> Optional[ModelSummary]: + model_summary = None + + if mode in ModelSummary.MODES: + model_summary = ModelSummary(self, mode=mode) + log.info("\n" + str(model_summary)) + elif mode is not None: + raise MisconfigurationException( + f"`mode` can be None, {', '.join(ModelSummary.MODES)}, got {mode}" + ) + return model_summary def freeze(self) -> None: diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index f2e943d2783af..06717c6333829 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -311,7 +311,6 @@ def __init__( self.plugin_connector = PluginConnector(self) # training state - self.weights_summary = weights_summary self.model = None self.shown_warnings = set() @@ -374,7 +373,8 @@ def __init__( max_steps, min_steps, num_sanity_val_steps, - automatic_optimization + automatic_optimization, + weights_summary, ) self.evaluation_loop.on_trainer_init() diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 0271afe3c2d91..8597a3003bf34 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -49,7 +49,14 @@ def __init__(self, trainer): self._cur_grad_norm_dict = None def on_trainer_init( - self, max_epochs, min_epochs, max_steps, min_steps, num_sanity_val_steps, automatic_optimization + self, + max_epochs, + min_epochs, + max_steps, + min_steps, + num_sanity_val_steps, + automatic_optimization, + weights_summary, ): self.trainer.global_step = 0 self.trainer.current_epoch = 0 @@ -73,6 +80,12 @@ def on_trainer_init( else: self.trainer.num_sanity_val_steps = num_sanity_val_steps + self.trainer.weights_summary = weights_summary + if weights_summary is not None and weights_summary not in ModelSummary.MODES: + raise MisconfigurationException( + f"`weights_summary` can be None, {', '.join(ModelSummary.MODES)}, got {weights_summary}" + ) + @property def num_optimizers(self): num_optimizers = len(self.get_optimizers_iterable()) @@ -161,11 +174,8 @@ def setup_training(self, model: LightningModule): ref_model.on_pretrain_routine_start() # print model summary - if self.trainer.is_global_zero and self.trainer.weights_summary is not None and not self.trainer.testing: - if self.trainer.weights_summary in ModelSummary.MODES: - ref_model.summarize(mode=self.trainer.weights_summary) - else: - raise MisconfigurationException("weights_summary can be None, " + ", ".join(ModelSummary.MODES)) + if self.trainer.is_global_zero and not self.trainer.testing: + ref_model.summarize(mode=self.trainer.weights_summary) # track model now. 
# if cluster resets state, the model will update with the saved weights diff --git a/tests/core/test_memory.py b/tests/core/test_memory.py index cfeb302134d24..142159fa48fd8 100644 --- a/tests/core/test_memory.py +++ b/tests/core/test_memory.py @@ -15,8 +15,9 @@ import torch import torch.nn as nn -from pytorch_lightning import LightningModule +from pytorch_lightning import LightningModule, Trainer from pytorch_lightning.core.memory import UNKNOWN_SIZE, ModelSummary +from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.base.models import ParityModuleRNN @@ -68,6 +69,15 @@ def forward(self, x): return self.reduce(self.embed(x)) +def test_invalid_weights_summmary(): + """ Test that invalid value for weights_summary raises an error. """ + with pytest.raises(MisconfigurationException, match='`mode` can be None, .* got temp'): + UnorderedModel().summarize(mode='temp') + + with pytest.raises(MisconfigurationException, match='`weights_summary` can be None, .* got temp'): + Trainer(weights_summary='temp') + + @pytest.mark.parametrize(['mode'], [ pytest.param(ModelSummary.MODE_FULL), pytest.param(ModelSummary.MODE_TOP), From d5b367871fa3924090ec74bf903bd172bd3e2343 Mon Sep 17 00:00:00 2001 From: chaton Date: Tue, 5 Jan 2021 11:01:59 +0100 Subject: [PATCH 071/136] [bug-fix] Trainer.test points to latest best_model_path (#5161) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * resolve bug * update code * add set -e * Update pytorch_lightning/callbacks/model_checkpoint.py Co-authored-by: Adrian Wälchli * update test * Update tests/checkpointing/test_trainer_checkpoint.py Co-authored-by: Sean Naren * Update tests/checkpointing/test_trainer_checkpoint.py Co-authored-by: Carlos Mocholí * update on comments * resolve test * convert to set * update * add error triggering * update * update on comments * update * resolve import * update * update * Update pytorch_lightning/plugins/rpc_plugin.py Co-authored-by: Jirka Borovec * update Co-authored-by: Adrian Wälchli Co-authored-by: Sean Naren Co-authored-by: Carlos Mocholí Co-authored-by: Ubuntu Co-authored-by: Jirka Borovec --- .drone.yml | 1 + CHANGELOG.md | 5 +- .../callbacks/model_checkpoint.py | 1 + pytorch_lightning/plugins/rpc_plugin.py | 11 ++- .../connectors/checkpoint_connector.py | 3 +- pytorch_lightning/trainer/training_loop.py | 2 +- tests/checkpointing/test_model_checkpoint.py | 13 +-- .../checkpointing/test_trainer_checkpoint.py | 87 +++++++++++++++++++ tests/plugins/test_ddp_sequential_plugin.py | 3 +- tests/special_tests.sh | 1 + 10 files changed, 113 insertions(+), 14 deletions(-) create mode 100644 tests/checkpointing/test_trainer_checkpoint.py diff --git a/.drone.yml b/.drone.yml index b0b6c3df1b699..472861852cae7 100644 --- a/.drone.yml +++ b/.drone.yml @@ -30,6 +30,7 @@ steps: MKL_THREADING_LAYER: GNU commands: + - set -e - python --version - pip --version - nvidia-smi diff --git a/CHANGELOG.md b/CHANGELOG.md index 5614a07b9da01..80e40457ffa0f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,7 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). -## [1.1.3rc] - 2020-12-29 +## [1.1.3] - 2021-01-05 ### Added @@ -25,12 +25,13 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
### Fixed +- Skip restore from `resume_from_checkpoint` in while `testing` ([#5161](https://github.com/PyTorchLightning/pytorch-lightning/pull/5161)) + - Allowed `log_momentum` for adaptive optimizers in `LearningRateMonitor` ([#5333](https://github.com/PyTorchLightning/pytorch-lightning/pull/5333)) - Disabled checkpointing, earlystopping and logger with `fast_dev_run` ([#5277](https://github.com/PyTorchLightning/pytorch-lightning/pull/5277)) - ## [1.1.2] - 2020-12-23 ### Added diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py index a578c1d697f8e..e5c960b3c002b 100644 --- a/pytorch_lightning/callbacks/model_checkpoint.py +++ b/pytorch_lightning/callbacks/model_checkpoint.py @@ -208,6 +208,7 @@ def on_save_checkpoint(self, trainer, pl_module) -> Dict[str, Any]: "best_model_score": self.best_model_score, "best_model_path": self.best_model_path, "current_score": self.current_score, + "dirpath": self.dirpath } def on_load_checkpoint(self, checkpointed_state: Dict[str, Any]): diff --git a/pytorch_lightning/plugins/rpc_plugin.py b/pytorch_lightning/plugins/rpc_plugin.py index 492bddaff0c77..a1464f3c70e0b 100644 --- a/pytorch_lightning/plugins/rpc_plugin.py +++ b/pytorch_lightning/plugins/rpc_plugin.py @@ -12,16 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. import os -from typing import Any, Optional +from typing import Optional import torch from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.plugins.ddp_plugin import DDPPlugin -from pytorch_lightning.utilities import RPC_AVAILABLE +from pytorch_lightning.utilities import _module_available, RPC_AVAILABLE +DEFAULT_RPC_TIMEOUT_SEC = 60. if RPC_AVAILABLE: from torch.distributed import rpc + if _module_available("torch.distributed.rpc.constants") and hasattr(torch.distributed.rpc.constants, "DEFAULT_RPC_TIMEOUT_SEC"): + from torch.distributed.rpc.constants import DEFAULT_RPC_TIMEOUT_SEC class RPCPlugin(DDPPlugin): @@ -33,7 +36,8 @@ class RPCPlugin(DDPPlugin): that need to be addressed when using RPC communication when building custom RPC Plugins. 
""" - def __init__(self, **kwargs): + def __init__(self, rpc_timeout_sec: float = DEFAULT_RPC_TIMEOUT_SEC, **kwargs): + self.rpc_timeout_sec = rpc_timeout_sec self.rpc_initialized = False super().__init__(**kwargs) @@ -42,6 +46,7 @@ def init_rpc_connection(self, world_size: int) -> None: os.environ['MASTER_PORT'] = os.getenv('RPC_MASTER_PORT', '15000') rpc.init_rpc(f"worker{global_rank}", rank=global_rank, world_size=world_size) + rpc._set_rpc_timeout(self.rpc_timeout_sec) self.rpc_initialized = True def rpc_save_model(self, diff --git a/pytorch_lightning/trainer/connectors/checkpoint_connector.py b/pytorch_lightning/trainer/connectors/checkpoint_connector.py index c71cbe6ce6180..fc9c70ba46d2e 100644 --- a/pytorch_lightning/trainer/connectors/checkpoint_connector.py +++ b/pytorch_lightning/trainer/connectors/checkpoint_connector.py @@ -21,6 +21,7 @@ import pytorch_lightning from pytorch_lightning import _logger as log +from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.utilities import AMPType, APEX_AVAILABLE, OMEGACONF_AVAILABLE, rank_zero_info, rank_zero_warn from pytorch_lightning.utilities.cloud_io import atomic_save, get_filesystem @@ -63,7 +64,7 @@ def restore_weights(self, model: LightningModule) -> None: rank_zero_info(f'restored hpc model from: {checkpoint_path}') # 2. Attempt to restore states from `resume_from_checkpoint` file - elif self.trainer.resume_from_checkpoint is not None: + elif self.trainer.resume_from_checkpoint is not None and not self.trainer.testing: self.restore(self.trainer.resume_from_checkpoint, on_gpu=self.trainer.on_gpu) # wait for all to catch up diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 8597a3003bf34..0d99b071d4567 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -181,7 +181,7 @@ def setup_training(self, model: LightningModule): # if cluster resets state, the model will update with the saved weights self.trainer.model = model - # restore training and model before hpc is called + # restore training state and model weights before hpc is called self.trainer.checkpoint_connector.restore_weights(model) # on pretrain routine end diff --git a/tests/checkpointing/test_model_checkpoint.py b/tests/checkpointing/test_model_checkpoint.py index 99ed807f111e5..7dbdee3d8a915 100644 --- a/tests/checkpointing/test_model_checkpoint.py +++ b/tests/checkpointing/test_model_checkpoint.py @@ -11,20 +11,20 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+from argparse import Namespace import os +from pathlib import Path import pickle import platform import re -from argparse import Namespace -from pathlib import Path from unittest import mock from unittest.mock import Mock import cloudpickle +from omegaconf import Container, OmegaConf import pytest import torch import yaml -from omegaconf import Container, OmegaConf import pytorch_lightning as pl import tests.base.develop_utils as tutils @@ -34,6 +34,7 @@ from pytorch_lightning.utilities.cloud_io import load as pl_load from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.base import BoringModel +import tests.base.develop_utils as tutils class LogInTwoMethods(BoringModel): @@ -760,9 +761,9 @@ def assert_checkpoint_log_dir(idx): model = ExtendedBoringModel() trainer.test(model) assert not trainer.checkpoint_connector.has_trained - assert trainer.global_step == epochs * limit_train_batches - assert trainer.current_epoch == epochs - + # resume_from_checkpoint is resumed when calling `.fit` + assert trainer.global_step == 0 + assert trainer.current_epoch == 0 trainer.fit(model) assert not trainer.checkpoint_connector.has_trained assert trainer.global_step == epochs * limit_train_batches diff --git a/tests/checkpointing/test_trainer_checkpoint.py b/tests/checkpointing/test_trainer_checkpoint.py new file mode 100644 index 0000000000000..9e93a8c297481 --- /dev/null +++ b/tests/checkpointing/test_trainer_checkpoint.py @@ -0,0 +1,87 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from copy import deepcopy +import os + +import torch + +import pytorch_lightning as pl +from pytorch_lightning import seed_everything, Trainer +from pytorch_lightning.callbacks import ModelCheckpoint +from pytorch_lightning.utilities.cloud_io import load as pl_load +from tests.base import BoringModel + + +def test_finetuning_with_resume_from_checkpoint(tmpdir): + """ + This test validates that generated ModelCheckpoint is pointing to the right best_model_path during test + """ + + seed_everything(3) + + checkpoint_callback = ModelCheckpoint(monitor='val_loss', dirpath=tmpdir, filename="{epoch:02d}", save_top_k=-1) + + class ExtendedBoringModel(BoringModel): + + def configure_optimizers(self): + optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.001) + lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1) + return [optimizer], [lr_scheduler] + + def validation_step(self, batch, batch_idx): + output = self.layer(batch) + loss = self.loss(batch, output) + self.log("val_loss", loss, on_epoch=True, prog_bar=True) + + model = ExtendedBoringModel() + model.validation_epoch_end = None + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=1, + limit_train_batches=12, + limit_val_batches=6, + limit_test_batches=12, + callbacks=[checkpoint_callback], + logger=False, + ) + trainer.fit(model) + assert os.listdir(tmpdir) == ['epoch=00.ckpt'] + + best_model_paths = [checkpoint_callback.best_model_path] + results = [] + + for idx in range(3, 6): + # load from checkpoint + trainer = pl.Trainer( + default_root_dir=tmpdir, + max_epochs=idx, + limit_train_batches=12, + limit_val_batches=12, + limit_test_batches=12, + resume_from_checkpoint=best_model_paths[-1], + progress_bar_refresh_rate=0, + ) + trainer.fit(model) + trainer.test() + results.append(deepcopy(trainer.callback_metrics)) + best_model_paths.append(trainer.checkpoint_callback.best_model_path) + + for idx in range(len(results) - 1): + assert results[idx]["val_loss"] > results[idx + 1]["val_loss"] + + for idx, best_model_path in enumerate(best_model_paths): + if idx == 0: + assert best_model_path.endswith(f"epoch=0{idx}.ckpt") + else: + assert f"epoch={idx + 1}" in best_model_path diff --git a/tests/plugins/test_ddp_sequential_plugin.py b/tests/plugins/test_ddp_sequential_plugin.py index 23b0b9128b349..8b21c36e73065 100644 --- a/tests/plugins/test_ddp_sequential_plugin.py +++ b/tests/plugins/test_ddp_sequential_plugin.py @@ -47,7 +47,8 @@ def test_ddp_sequential_plugin_ddp_rpc_manual(tmpdir, args=None): limit_test_batches=2, gpus=2, distributed_backend="ddp", - plugins=[DDPSequentialPlugin(balance=[2, 1])], + plugins=[DDPSequentialPlugin(balance=[2, 1], rpc_timeout_sec=5 * 60)], + enable_pl_optimizer=True, ) trainer.fit(model) diff --git a/tests/special_tests.sh b/tests/special_tests.sh index 950e3776bbc7f..8d67cce28b39f 100644 --- a/tests/special_tests.sh +++ b/tests/special_tests.sh @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# Running special tests +set -e export PL_RUNNING_SPECIAL_TESTS=1 DEFAULTS="-m coverage run --source pytorch_lightning -a -m pytest --verbose --capture=no" python ${DEFAULTS} tests/trainer/optimization/test_manual_optimization.py::test_step_with_optimizer_closure_with_different_frequencies_ddp From a40e3a325e71640786717094a67c41805ab30593 Mon Sep 17 00:00:00 2001 From: LaserBit <31342033+LaserBit@users.noreply.github.com> Date: Tue, 5 Jan 2021 21:09:52 +0900 Subject: [PATCH 072/136] Change the classifier input from 2048 to 1000. (#5232) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Change the classifier input from 2048 to 1000. * Update docs for Imagenet example Thanks @rohitgr7 * Apply suggestions from code review Co-authored-by: Adrian Wälchli Co-authored-by: Rohit Gupta Co-authored-by: Adrian Wälchli --- docs/source/transfer_learning.rst | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/docs/source/transfer_learning.rst b/docs/source/transfer_learning.rst index ba44203721b98..3b8f5b004974e 100644 --- a/docs/source/transfer_learning.rst +++ b/docs/source/transfer_learning.rst @@ -52,16 +52,22 @@ Example: Imagenet (computer Vision) class ImagenetTransferLearning(LightningModule): def __init__(self): + super().__init__() + # init a pretrained resnet - num_target_classes = 10 - self.feature_extractor = models.resnet50(pretrained=True) - self.feature_extractor.eval() + backbone = models.resnet50(pretrained=True) + num_filters = backbone.fc.in_features + layers = list(backbone.children())[:-1] + self.feature_extractor = torch.nn.Sequential(*layers) # use the pretrained model to classify cifar-10 (10 image classes) - self.classifier = nn.Linear(2048, num_target_classes) + num_target_classes = 10 + self.classifier = nn.Linear(num_filters, num_target_classes) def forward(self, x): - representations = self.feature_extractor(x) + self.feature_extractor.eval() + with torch.no_grad(): + representations = self.feature_extractor(x).flatten(1) x = self.classifier(representations) ... From d568533b6ba0768ac7a6d028b3a25c5970a65e80 Mon Sep 17 00:00:00 2001 From: Abhik Banerjee <38981107+abhik-99@users.noreply.github.com> Date: Tue, 5 Jan 2021 18:45:06 +0530 Subject: [PATCH 073/136] Updated metrics/classification/precision_recall.py (#5348) There was a typo in Documentation of Code of the ```compute()``` function of ```Recall``` metric at line 210. It said "Computes accuracy over state." which should have been "Computes recall over state." --- pytorch_lightning/metrics/classification/precision_recall.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/metrics/classification/precision_recall.py b/pytorch_lightning/metrics/classification/precision_recall.py index 7e1f843b9c331..6c2bf64a1ecfc 100644 --- a/pytorch_lightning/metrics/classification/precision_recall.py +++ b/pytorch_lightning/metrics/classification/precision_recall.py @@ -207,7 +207,7 @@ def update(self, preds: torch.Tensor, target: torch.Tensor): def compute(self): """ - Computes accuracy over state. + Computes recall over state. 
""" if self.average == 'micro': return self.true_positives.sum().float() / (self.actual_positives.sum() + METRIC_EPS) From 410d67fbe866ee20069e88db1be729d27ae0af48 Mon Sep 17 00:00:00 2001 From: Luis Perez Date: Tue, 5 Jan 2021 07:36:06 -0800 Subject: [PATCH 074/136] Existence check for hparams now uses underlying filesystem (#5250) Co-authored-by: Rohit Gupta --- pytorch_lightning/loggers/tensorboard.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/loggers/tensorboard.py b/pytorch_lightning/loggers/tensorboard.py index afdb98cb978de..b3365de25b384 100644 --- a/pytorch_lightning/loggers/tensorboard.py +++ b/pytorch_lightning/loggers/tensorboard.py @@ -217,7 +217,7 @@ def save(self) -> None: hparams_file = os.path.join(dir_path, self.NAME_HPARAMS_FILE) # save the metatags file if it doesn't exist - if not os.path.isfile(hparams_file): + if not self._fs.isfile(hparams_file): save_hparams_to_yaml(hparams_file, self.hparams) @rank_zero_only From ec0fb7a3ec709699243c76dae04ee1e4ce2406a0 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Tue, 5 Jan 2021 20:34:47 +0100 Subject: [PATCH 075/136] refactor imports of logger dependencies (#4860) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * refactor imports of logger dependencies * fix * fix * fix * name * fix * mocks * fix tests * fix mlflow * fix test tube * fix wandb import check * whitespace * name * name * hack * hack * rev * fix * update mlflow import check * try without installing conda dep * . * . * . * . * . * . * . * . * . Co-authored-by: Adrian Wälchli Co-authored-by: Adrian Wälchli --- .github/workflows/ci_test-base.yml | 2 +- pytorch_lightning/loggers/__init__.py | 39 ++++++-------------- pytorch_lightning/loggers/comet.py | 34 ++++++++--------- pytorch_lightning/loggers/mlflow.py | 20 ++++++---- pytorch_lightning/loggers/neptune.py | 20 +++++----- pytorch_lightning/loggers/test_tube.py | 13 ++++--- pytorch_lightning/loggers/wandb.py | 16 ++++---- tests/checkpointing/test_model_checkpoint.py | 1 - tests/loggers/test_mlflow.py | 13 +++++-- 9 files changed, 78 insertions(+), 80 deletions(-) diff --git a/.github/workflows/ci_test-base.yml b/.github/workflows/ci_test-base.yml index c0b97439737ff..d1ef75db942e8 100644 --- a/.github/workflows/ci_test-base.yml +++ b/.github/workflows/ci_test-base.yml @@ -1,4 +1,4 @@ -name: CI base testing +name: CI basic testing # see: https://help.github.com/en/actions/reference/events-that-trigger-workflows on: # Trigger the workflow on push or pull request, but only for the master branch diff --git a/pytorch_lightning/loggers/__init__.py b/pytorch_lightning/loggers/__init__.py index 562e027ea36b6..41f42be3f02f3 100644 --- a/pytorch_lightning/loggers/__init__.py +++ b/pytorch_lightning/loggers/__init__.py @@ -24,40 +24,25 @@ 'CSVLogger', ] -try: - # needed to prevent ImportError and duplicated logs. 
- environ["COMET_DISABLE_AUTO_LOGGING"] = "1" +from pytorch_lightning.loggers.comet import _COMET_AVAILABLE, CometLogger +from pytorch_lightning.loggers.mlflow import _MLFLOW_AVAILABLE, MLFlowLogger +from pytorch_lightning.loggers.neptune import _NEPTUNE_AVAILABLE, NeptuneLogger +from pytorch_lightning.loggers.test_tube import _TESTTUBE_AVAILABLE, TestTubeLogger +from pytorch_lightning.loggers.wandb import _WANDB_AVAILABLE, WandbLogger - from pytorch_lightning.loggers.comet import CometLogger -except ImportError: # pragma: no-cover - del environ["COMET_DISABLE_AUTO_LOGGING"] # pragma: no-cover -else: +if _COMET_AVAILABLE: __all__.append('CometLogger') + # needed to prevent ImportError and duplicated logs. + environ["COMET_DISABLE_AUTO_LOGGING"] = "1" -try: - from pytorch_lightning.loggers.mlflow import MLFlowLogger -except ImportError: # pragma: no-cover - pass # pragma: no-cover -else: +if _MLFLOW_AVAILABLE: __all__.append('MLFlowLogger') -try: - from pytorch_lightning.loggers.neptune import NeptuneLogger -except ImportError: # pragma: no-cover - pass # pragma: no-cover -else: +if _NEPTUNE_AVAILABLE: __all__.append('NeptuneLogger') -try: - from pytorch_lightning.loggers.test_tube import TestTubeLogger -except ImportError: # pragma: no-cover - pass # pragma: no-cover -else: +if _TESTTUBE_AVAILABLE: __all__.append('TestTubeLogger') -try: - from pytorch_lightning.loggers.wandb import WandbLogger -except ImportError: # pragma: no-cover - pass # pragma: no-cover -else: +if _WANDB_AVAILABLE: __all__.append('WandbLogger') diff --git a/pytorch_lightning/loggers/comet.py b/pytorch_lightning/loggers/comet.py index 64c87888da9d2..869bce831f0c2 100644 --- a/pytorch_lightning/loggers/comet.py +++ b/pytorch_lightning/loggers/comet.py @@ -21,17 +21,18 @@ from argparse import Namespace from typing import Any, Dict, Optional, Union -try: - import comet_ml +import torch +from torch import is_tensor -except ModuleNotFoundError: # pragma: no-cover - comet_ml = None - CometExperiment = None - CometExistingExperiment = None - CometOfflineExperiment = None - API = None - generate_guid = None -else: +from pytorch_lightning import _logger as log +from pytorch_lightning.loggers.base import LightningLoggerBase, rank_zero_experiment +from pytorch_lightning.utilities import rank_zero_only, _module_available +from pytorch_lightning.utilities.exceptions import MisconfigurationException + +_COMET_AVAILABLE = _module_available("comet_ml") + +if _COMET_AVAILABLE: + import comet_ml from comet_ml import ExistingExperiment as CometExistingExperiment from comet_ml import Experiment as CometExperiment from comet_ml import OfflineExperiment as CometOfflineExperiment @@ -41,14 +42,11 @@ except ImportError: # pragma: no-cover # For more information, see: https://www.comet.ml/docs/python-sdk/releases/#release-300 from comet_ml.papi import API # pragma: no-cover - -import torch -from torch import is_tensor - -from pytorch_lightning import _logger as log -from pytorch_lightning.loggers.base import LightningLoggerBase, rank_zero_experiment -from pytorch_lightning.utilities import rank_zero_only -from pytorch_lightning.utilities.exceptions import MisconfigurationException +else: + # needed for test mocks, these tests shall be updated + comet_ml = None + CometExperiment, CometExistingExperiment, CometOfflineExperiment = None, None, None + API = None class CometLogger(LightningLoggerBase): diff --git a/pytorch_lightning/loggers/mlflow.py b/pytorch_lightning/loggers/mlflow.py index 92f1c15d589d4..4987d050c925d 100644 --- 
a/pytorch_lightning/loggers/mlflow.py +++ b/pytorch_lightning/loggers/mlflow.py @@ -21,21 +21,25 @@ from time import time from typing import Any, Dict, Optional, Union -try: - import mlflow - from mlflow.tracking import MlflowClient -except ModuleNotFoundError: # pragma: no-cover - mlflow = None - MlflowClient = None - from pytorch_lightning import _logger as log from pytorch_lightning.loggers.base import LightningLoggerBase, rank_zero_experiment -from pytorch_lightning.utilities import rank_zero_only, rank_zero_warn +from pytorch_lightning.utilities import rank_zero_only, rank_zero_warn, _module_available + LOCAL_FILE_URI_PREFIX = "file:" +_MLFLOW_AVAILABLE = _module_available("mlflow") +try: + import mlflow + from mlflow.tracking import MlflowClient +# todo: there seems to be still some remaining import error with Conda env +except ImportError: + _MLFLOW_AVAILABLE = False + mlflow, MlflowClient = None, None + + class MLFlowLogger(LightningLoggerBase): """ Log using `MLflow `_. diff --git a/pytorch_lightning/loggers/neptune.py b/pytorch_lightning/loggers/neptune.py index 410473f28614a..9f3c3787a417e 100644 --- a/pytorch_lightning/loggers/neptune.py +++ b/pytorch_lightning/loggers/neptune.py @@ -17,21 +17,23 @@ -------------- """ from argparse import Namespace -from typing import Any, Dict, Iterable, List, Optional, Union - -try: - import neptune - from neptune.experiments import Experiment -except ImportError: # pragma: no-cover - neptune = None - Experiment = None +from typing import Any, Dict, Iterable, Optional, Union import torch from torch import is_tensor from pytorch_lightning import _logger as log from pytorch_lightning.loggers.base import LightningLoggerBase, rank_zero_experiment -from pytorch_lightning.utilities import rank_zero_only +from pytorch_lightning.utilities import rank_zero_only, _module_available + +_NEPTUNE_AVAILABLE = _module_available("neptune") + +if _NEPTUNE_AVAILABLE: + import neptune + from neptune.experiments import Experiment +else: + # needed for test mocks, these tests shall be updated + neptune, Experiment = None, None class NeptuneLogger(LightningLoggerBase): diff --git a/pytorch_lightning/loggers/test_tube.py b/pytorch_lightning/loggers/test_tube.py index 3750a32eab103..65d7deb90f43c 100644 --- a/pytorch_lightning/loggers/test_tube.py +++ b/pytorch_lightning/loggers/test_tube.py @@ -19,15 +19,18 @@ from argparse import Namespace from typing import Any, Dict, Optional, Union -try: - from test_tube import Experiment -except ImportError: # pragma: no-cover - Experiment = None - from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.loggers.base import LightningLoggerBase, rank_zero_experiment +from pytorch_lightning.utilities import _module_available from pytorch_lightning.utilities.distributed import rank_zero_only, rank_zero_warn +_TESTTUBE_AVAILABLE = _module_available("test_tube") + +if _TESTTUBE_AVAILABLE: + from test_tube import Experiment +else: + Experiment = None + class TestTubeLogger(LightningLoggerBase): r""" diff --git a/pytorch_lightning/loggers/wandb.py b/pytorch_lightning/loggers/wandb.py index 455635690f5c9..f92c44ab27b7f 100644 --- a/pytorch_lightning/loggers/wandb.py +++ b/pytorch_lightning/loggers/wandb.py @@ -22,16 +22,18 @@ import torch.nn as nn +from pytorch_lightning.loggers.base import LightningLoggerBase, rank_zero_experiment +from pytorch_lightning.utilities import rank_zero_only, _module_available +from pytorch_lightning.utilities.warning_utils import WarningCache + +_WANDB_AVAILABLE = 
_module_available("wandb") + try: import wandb from wandb.wandb_run import Run -except ImportError: # pragma: no-cover - wandb = None - Run = None - -from pytorch_lightning.loggers.base import LightningLoggerBase, rank_zero_experiment -from pytorch_lightning.utilities import rank_zero_only -from pytorch_lightning.utilities.warning_utils import WarningCache +except ImportError: + # needed for test mocks, these tests shall be updated + wandb, Run = None, None class WandbLogger(LightningLoggerBase): diff --git a/tests/checkpointing/test_model_checkpoint.py b/tests/checkpointing/test_model_checkpoint.py index 7dbdee3d8a915..3adb45c0b1869 100644 --- a/tests/checkpointing/test_model_checkpoint.py +++ b/tests/checkpointing/test_model_checkpoint.py @@ -34,7 +34,6 @@ from pytorch_lightning.utilities.cloud_io import load as pl_load from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.base import BoringModel -import tests.base.develop_utils as tutils class LogInTwoMethods(BoringModel): diff --git a/tests/loggers/test_mlflow.py b/tests/loggers/test_mlflow.py index c52dd82889f01..c6072afbb69e2 100644 --- a/tests/loggers/test_mlflow.py +++ b/tests/loggers/test_mlflow.py @@ -20,7 +20,7 @@ from pytorch_lightning import Trainer -from pytorch_lightning.loggers import MLFlowLogger +from pytorch_lightning.loggers import _MLFLOW_AVAILABLE, MLFlowLogger from tests.base import EvalModelTemplate @@ -120,7 +120,7 @@ def test_mlflow_log_dir(client, mlflow, tmpdir): def test_mlflow_logger_dirs_creation(tmpdir): """ Test that the logger creates the folders and files in the right place. """ - if not importlib.util.find_spec('mlflow'): + if not _MLFLOW_AVAILABLE: pytest.xfail("test for explicit file creation requires mlflow dependency to be installed.") assert not os.listdir(tmpdir) @@ -137,8 +137,13 @@ def test_mlflow_logger_dirs_creation(tmpdir): assert set(os.listdir(tmpdir / exp_id)) == {run_id, 'meta.yaml'} model = EvalModelTemplate() - trainer = Trainer(default_root_dir=tmpdir, logger=logger, max_epochs=1, limit_val_batches=3, - log_gpu_memory=True) + trainer = Trainer( + default_root_dir=tmpdir, + logger=logger, + max_epochs=1, + limit_val_batches=3, + log_gpu_memory=True, + ) trainer.fit(model) assert set(os.listdir(tmpdir / exp_id)) == {run_id, 'meta.yaml'} assert 'epoch' in os.listdir(tmpdir / exp_id / run_id / 'metrics') From 6536ea42fa88eda63f17dae60c51d9669b409b78 Mon Sep 17 00:00:00 2001 From: Wansoo Kim Date: Wed, 6 Jan 2021 07:04:53 +0900 Subject: [PATCH 076/136] FIX-5311: Cast to string `_flatten_dict` (#5354) * fix * params * add test * add another types * chlog Co-authored-by: Jirka Borovec Co-authored-by: Jirka Borovec --- CHANGELOG.md | 2 ++ pytorch_lightning/loggers/base.py | 5 ++++- tests/loggers/test_tensorboard.py | 6 +++--- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 80e40457ffa0f..8bb0d31169b87 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -31,6 +31,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Disabled checkpointing, earlystopping and logger with `fast_dev_run` ([#5277](https://github.com/PyTorchLightning/pytorch-lightning/pull/5277)) +- Fixed casted key to string in `_flatten_dict` ([#5354](https://github.com/PyTorchLightning/pytorch-lightning/pull/5354)) + ## [1.1.2] - 2020-12-23 diff --git a/pytorch_lightning/loggers/base.py b/pytorch_lightning/loggers/base.py index a27998366b671..ac7ab3e023bdb 100644 --- a/pytorch_lightning/loggers/base.py +++ b/pytorch_lightning/loggers/base.py @@ -207,7 +207,7 @@ def _sanitize_callable(val): return {key: _sanitize_callable(val) for key, val in params.items()} @staticmethod - def _flatten_dict(params: Dict[str, Any], delimiter: str = '/') -> Dict[str, Any]: + def _flatten_dict(params: Dict[Any, Any], delimiter: str = '/') -> Dict[str, Any]: """ Flatten hierarchical dict, e.g. ``{'a': {'b': 'c'}} -> {'a/b': 'c'}``. @@ -223,12 +223,15 @@ def _flatten_dict(params: Dict[str, Any], delimiter: str = '/') -> Dict[str, Any {'a/b': 'c'} >>> LightningLoggerBase._flatten_dict({'a': {'b': 123}}) {'a/b': 123} + >>> LightningLoggerBase._flatten_dict({5: {'a': 123}}) + {'5/a': 123} """ def _dict_generator(input_dict, prefixes=None): prefixes = prefixes[:] if prefixes else [] if isinstance(input_dict, MutableMapping): for key, value in input_dict.items(): + key = str(key) if isinstance(value, (MutableMapping, Namespace)): value = vars(value) if isinstance(value, Namespace) else value for d in _dict_generator(value, prefixes + [key]): diff --git a/tests/loggers/test_tensorboard.py b/tests/loggers/test_tensorboard.py index 15a024003ebf0..fa5c711357ba3 100644 --- a/tests/loggers/test_tensorboard.py +++ b/tests/loggers/test_tensorboard.py @@ -22,7 +22,7 @@ from omegaconf import OmegaConf from tensorboard.backend.event_processing.event_accumulator import EventAccumulator -from pytorch_lightning import Trainer, seed_everything +from pytorch_lightning import seed_everything, Trainer from pytorch_lightning.loggers import TensorBoardLogger from tests.base import BoringModel, EvalModelTemplate @@ -102,7 +102,7 @@ def test_tensorboard_named_version(tmpdir): expected_version = "2020-02-05-162402" logger = TensorBoardLogger(save_dir=tmpdir, name=name, version=expected_version) - logger.log_hyperparams({"a": 1, "b": 2}) # Force data to be written + logger.log_hyperparams({"a": 1, "b": 2, 123: 3, 3.5: 4, 5j: 5}) # Force data to be written assert logger.version == expected_version assert os.listdir(tmpdir / name) == [expected_version] @@ -113,7 +113,7 @@ def test_tensorboard_named_version(tmpdir): def test_tensorboard_no_name(tmpdir, name): """Verify that None or empty name works""" logger = TensorBoardLogger(save_dir=tmpdir, name=name) - logger.log_hyperparams({"a": 1, "b": 2}) # Force data to be written + logger.log_hyperparams({"a": 1, "b": 2, 123: 3, 3.5: 4, 5j: 5}) # Force data to be written assert logger.root_dir == tmpdir assert os.listdir(tmpdir / "version_0") From 4d9db866a11f3b4b9b923bca811911ac79dad914 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Wed, 6 Jan 2021 00:43:46 +0100 Subject: [PATCH 077/136] Prepare 1.1.3 release (#5365) * Prepare 1.1.3 release * Fix flake8 error * suppress * Remove 1.1.4 section * Add missing commits to CHANGELOG * Update PR template * Add missing commit * fix * Update CHANGELOG.md * Apply suggestions from code review * Apply suggestions from code review Co-authored-by: Jirka Borovec Co-authored-by: Jirka Borovec --- .github/PULL_REQUEST_TEMPLATE.md | 16 +++++----- CHANGELOG.md | 29 +++++++++---------- 
.../basic_examples/mnist_datamodule.py | 5 +++- pytorch_lightning/__init__.py | 2 +- pytorch_lightning/plugins/rpc_plugin.py | 5 ++-- tests/checkpointing/test_model_checkpoint.py | 6 ++-- 6 files changed, 32 insertions(+), 31 deletions(-) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index c2ce4a5e8bf26..ada6c6b8c62bd 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -16,26 +16,26 @@ If we didn't discuss your PR in Github issues there's a high chance it will not Fixes # (issue) <- this [links related issue to this PR](https://docs.github.com/en/free-pro-team@latest/github/managing-your-work-on-github/linking-a-pull-request-to-an-issue#linking-a-pull-request-to-an-issue-using-a-keyword) ## Before submitting -- [ ] Was this discussed/approved via a Github issue? (no need for typos and docs improvements) -- [ ] Did you read the [contributor guideline](https://github.com/PyTorchLightning/pytorch-lightning/blob/master/.github/CONTRIBUTING.md), Pull Request section? +- [ ] Was this discussed/approved via a GitHub issue? (not for typos and docs) +- [ ] Did you read the [contributor guideline](https://github.com/PyTorchLightning/pytorch-lightning/blob/master/.github/CONTRIBUTING.md), **Pull Request** section? - [ ] Did you make sure your PR does only one thing, instead of bundling different changes together? -- [ ] Did you make sure to update the documentation with your changes [if needed]? -- [ ] Did you write any new necessary tests [no need for typos, docs]? +- [ ] Did you make sure to update the documentation with your changes? (if necessary) +- [ ] Did you write any new necessary tests? (not for typos and docs) - [ ] Did you verify new and existing tests pass locally with your changes? -- [ ] If you made a notable change (that affects users), did you update the [CHANGELOG](https://github.com/PyTorchLightning/pytorch-lightning/blob/master/CHANGELOG.md)? +- [ ] Did you update the [CHANGELOG](https://github.com/PyTorchLightning/pytorch-lightning/blob/master/CHANGELOG.md)? (not for typos, docs, test updates, or internal minor changes/refactorings) ## PR review Anyone in the community is free to review the PR once the tests have passed. -Before you start reviewing make sure you have read [Review guidelines](https://github.com/PyTorchLightning/pytorch-lightning/wiki/Review-guidelines). In short, see the following bullet-list: +Before you start reviewing make sure you have read [Review guidelines](https://github.com/PyTorchLightning/pytorch-lightning/wiki/Review-guidelines). In short, see the following bullet-list: - [ ] Is this pull request ready for review? (if not, please submit in draft mode) - [ ] Check that all items from **Before submitting** are resolved - [ ] Make sure the title is self-explanatory and the description concisely explains the PR - [ ] Add labels and milestones (and optionally projects) to the PR so it can be classified - - [ ] **Check that target branch and milestone are aligned!** - + - [ ] **Check that target branch and milestone match!** + ## Did you have fun? Make sure you had fun coding 🙃 diff --git a/CHANGELOG.md b/CHANGELOG.md index 8bb0d31169b87..869f71f5cc3c0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,28 +9,25 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
### Added -- Added a check for optimizer attached to lr_scheduler ([#5338](https://github.com/PyTorchLightning/pytorch-lightning/pull/5338)) - -- Added `resume_from_checkpoint` accept non-existing file path ([#4402](https://github.com/PyTorchLightning/pytorch-lightning/pull/4402)) - +- Added a check for optimizer attached to `lr_scheduler` ([#5338](https://github.com/PyTorchLightning/pytorch-lightning/pull/5338)) +- Added support for passing non-existing filepaths to `resume_from_checkpoint` ([#4402](https://github.com/PyTorchLightning/pytorch-lightning/pull/4402)) ### Changed - -### Deprecated - - -### Removed - - -### Fixed - -- Skip restore from `resume_from_checkpoint` in while `testing` ([#5161](https://github.com/PyTorchLightning/pytorch-lightning/pull/5161)) - +- Skip restore from `resume_from_checkpoint` while `testing` ([#5161](https://github.com/PyTorchLightning/pytorch-lightning/pull/5161)) - Allowed `log_momentum` for adaptive optimizers in `LearningRateMonitor` ([#5333](https://github.com/PyTorchLightning/pytorch-lightning/pull/5333)) +- Disabled checkpointing, earlystopping and logging with `fast_dev_run` ([#5277](https://github.com/PyTorchLightning/pytorch-lightning/pull/5277)) +- Distributed group defaults to `WORLD` if `None` ([#5125](https://github.com/PyTorchLightning/pytorch-lightning/pull/5125)) -- Disabled checkpointing, earlystopping and logger with `fast_dev_run` ([#5277](https://github.com/PyTorchLightning/pytorch-lightning/pull/5277)) +### Fixed +- Fixed `trainer.test` returning non-test metrics ([#5214](https://github.com/PyTorchLightning/pytorch-lightning/pull/5214)) +- Fixed metric state reset ([#5273](https://github.com/PyTorchLightning/pytorch-lightning/pull/5273)) +- Fixed `--num-nodes` on `DDPSequentialPlugin` ([#5327](https://github.com/PyTorchLightning/pytorch-lightning/pull/5327)) +- Fixed invalid value for `weights_summary` ([#5296](https://github.com/PyTorchLightning/pytorch-lightning/pull/5296)) +- Fixed `Trainer.test` not using the latest `best_model_path` ([#5161](https://github.com/PyTorchLightning/pytorch-lightning/pull/5161)) +- Fixed existence check for hparams not using underlying filesystem ([#5250](https://github.com/PyTorchLightning/pytorch-lightning/pull/5250)) +- Fixed `LightningOptimizer` AMP bug ([#5191](https://github.com/PyTorchLightning/pytorch-lightning/pull/5191)) - Fixed casted key to string in `_flatten_dict` ([#5354](https://github.com/PyTorchLightning/pytorch-lightning/pull/5354)) diff --git a/pl_examples/basic_examples/mnist_datamodule.py b/pl_examples/basic_examples/mnist_datamodule.py index 95e20d22e1fdd..27a7590b64ee9 100644 --- a/pl_examples/basic_examples/mnist_datamodule.py +++ b/pl_examples/basic_examples/mnist_datamodule.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- +import platform from typing import Optional from torch.utils.data import DataLoader, random_split @@ -55,6 +55,9 @@ def __init__( normalize: If true applies image normalize """ super().__init__(*args, **kwargs) + if platform.system() == "Windows": + # see: https://stackoverflow.com/a/59680818/4521646 + num_workers = 0 self.dims = (1, 28, 28) self.data_dir = data_dir diff --git a/pytorch_lightning/__init__.py b/pytorch_lightning/__init__.py index d1da4da1963ac..5f7ae6bdee9d2 100644 --- a/pytorch_lightning/__init__.py +++ b/pytorch_lightning/__init__.py @@ -1,6 +1,6 @@ """Root package info.""" -__version__ = '1.1.2' +__version__ = '1.1.3' __author__ = 'William Falcon et al.' __author_email__ = 'waf2107@columbia.edu' __license__ = 'Apache-2.0' diff --git a/pytorch_lightning/plugins/rpc_plugin.py b/pytorch_lightning/plugins/rpc_plugin.py index a1464f3c70e0b..223a1f0a13110 100644 --- a/pytorch_lightning/plugins/rpc_plugin.py +++ b/pytorch_lightning/plugins/rpc_plugin.py @@ -12,18 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. import os +from contextlib import suppress from typing import Optional import torch from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.plugins.ddp_plugin import DDPPlugin -from pytorch_lightning.utilities import _module_available, RPC_AVAILABLE +from pytorch_lightning.utilities import RPC_AVAILABLE DEFAULT_RPC_TIMEOUT_SEC = 60. if RPC_AVAILABLE: from torch.distributed import rpc - if _module_available("torch.distributed.rpc.constants") and hasattr(torch.distributed.rpc.constants, "DEFAULT_RPC_TIMEOUT_SEC"): + with suppress(ModuleNotFoundError, ImportError): from torch.distributed.rpc.constants import DEFAULT_RPC_TIMEOUT_SEC diff --git a/tests/checkpointing/test_model_checkpoint.py b/tests/checkpointing/test_model_checkpoint.py index 3adb45c0b1869..8d4a859a88784 100644 --- a/tests/checkpointing/test_model_checkpoint.py +++ b/tests/checkpointing/test_model_checkpoint.py @@ -11,20 +11,20 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from argparse import Namespace import os -from pathlib import Path import pickle import platform import re +from argparse import Namespace +from pathlib import Path from unittest import mock from unittest.mock import Mock import cloudpickle -from omegaconf import Container, OmegaConf import pytest import torch import yaml +from omegaconf import Container, OmegaConf import pytorch_lightning as pl import tests.base.develop_utils as tutils From 019e4ff8cddadfde4d3bd48b4e8b8d294950ca2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Wed, 6 Jan 2021 13:27:59 +0100 Subject: [PATCH 078/136] Add 1.1.4 section to CHANGELOG (#5378) --- CHANGELOG.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 869f71f5cc3c0..4ca2165c8f77e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,23 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
+## [1.1.4] - YYYY-MM-DD + +### Added + + +### Changed + + +### Deprecated + + +### Removed + + +### Fixed + + ## [1.1.3] - 2021-01-05 ### Added From ee8373110aa89f1049d7ac53c5d491e7eba68cf1 Mon Sep 17 00:00:00 2001 From: Sean Naren Date: Wed, 6 Jan 2021 15:02:13 +0000 Subject: [PATCH 079/136] Update sharded install to latest fairscale release, add reasoning why fork required for sequential parallelism (#5380) --- docs/source/multi_gpu.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/multi_gpu.rst b/docs/source/multi_gpu.rst index 9d868406e2985..7c8bba4621c5b 100644 --- a/docs/source/multi_gpu.rst +++ b/docs/source/multi_gpu.rst @@ -654,7 +654,7 @@ To use Sharded Training, you need to first install FairScale using the command b .. code-block:: bash - pip install https://github.com/PyTorchLightning/fairscale/archive/pl_1.1.0.zip + pip install fairscale .. code-block:: python @@ -681,7 +681,7 @@ Reference: https://arxiv.org/abs/1811.06965 .. note:: DDPSequentialPlugin is currently supported only for Pytorch 1.6. -To get started, install FairScale using the command below. +To get started, install FairScale using the command below. We install a specific branch which contains PyTorch related fixes for Sequential Parallelism. .. code-block:: bash From cc624358c8e396e966f9c51b3010f6a986047fc6 Mon Sep 17 00:00:00 2001 From: Jeff Yang Date: Wed, 6 Jan 2021 23:29:42 +0630 Subject: [PATCH 080/136] docker: run ci only docker related files are changed (#5203) * only run ci on docker related files * docker related files changed! * install pytorch along with cudatoolkit * build docker only on SUN * conda exit status has been fixed * reverts back to old conda version * add more docker related files * conda env update --name * create env and install pytorch again * create env and install pytorch again * ${PYTORCH_CHANNEL} * dont update pytorch with conda env update * Apply suggestions from code review Co-authored-by: Jirka Borovec * Update dockers/base-conda/Dockerfile * Apply suggestions from code review * remove checks in cron job * Apply suggestions from code review * readd # * readd # Co-authored-by: Jirka Borovec Co-authored-by: Roger Shieh --- .github/workflows/ci_dockers.yml | 17 +++++--- .github/workflows/nightly.yml | 75 +++++++++++++++----------------- dockers/base-conda/Dockerfile | 10 ++--- environment.yml | 4 +- 4 files changed, 53 insertions(+), 53 deletions(-) diff --git a/.github/workflows/ci_dockers.yml b/.github/workflows/ci_dockers.yml index 16f86e0759fce..43550ade8794b 100644 --- a/.github/workflows/ci_dockers.yml +++ b/.github/workflows/ci_dockers.yml @@ -2,11 +2,21 @@ name: CI build Docker # https://www.docker.com/blog/first-docker-github-action-is-here # https://github.com/docker/build-push-action # see: https://help.github.com/en/actions/reference/events-that-trigger-workflows -on: # Trigger the workflow on push or pull request, but only for the master branch +on: # Trigger the workflow on push or pull request, but only for the master branch push: - branches: [master, "release/*"] # include release branches like release/1.0.x + branches: [master, "release/*"] # include release branches like release/1.0.x pull_request: branches: [master, "release/*"] + paths: + - "dockers/**" + - "!dockers/README.md" + - "requirements/*.txt" + - "environment.yml" + - "requirements.txt" + - ".github/workflows/ci_dockers.yml" + - ".github/workflows/nightly.yml" + - ".github/workflows/release-docker.yml" + - "setup.py" jobs: build-PL: @@ -55,7 +65,6 @@ jobs: 
build-args: | PYTHON_VERSION=${{ matrix.python_version }} XLA_VERSION=${{ matrix.xla_version }} - cache-from: pytorchlightning/pytorch_lightning:base-xla-py${{ matrix.python_version }}-torch${{ matrix.xla_version }} file: dockers/base-xla/Dockerfile push: false timeout-minutes: 50 @@ -96,7 +105,6 @@ jobs: PYTHON_VERSION=${{ matrix.python_version }} PYTORCH_VERSION=${{ matrix.pytorch_version }} CUDA_VERSION=${{ steps.extend.outputs.CUDA }} - cache-from: pytorchlightning/pytorch_lightning:base-cuda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} file: dockers/base-cuda/Dockerfile push: false timeout-minutes: 50 @@ -139,7 +147,6 @@ jobs: PYTORCH_VERSION=${{ matrix.pytorch_version }} PYTORCH_CHANNEL=${{ steps.extend.outputs.CHANNEL }} CUDA_VERSION=${{ steps.extend.outputs.CUDA }} - cache-from: pytorchlightning/pytorch_lightning:base-conda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} file: dockers/base-conda/Dockerfile push: false timeout-minutes: 50 diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index eb3e55268b682..1f14dcdd276dd 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -1,49 +1,48 @@ name: Nightly events # https://jasonet.co/posts/scheduled-actions/ +# https://github.community/t/distinct-job-for-each-schedule/17811/2 on: schedule: - # At the end of every day - - cron: "0 0 * * *" + - cron: "0 0 * * *" # At the end of every day # based on https://github.com/pypa/gh-action-pypi-publish jobs: - pypi-release: runs-on: ubuntu-20.04 steps: - # does nightly releases from feature branch - - uses: actions/checkout@v2 - with: - ref: release/1.2-dev - - uses: actions/setup-python@v2 - with: - python-version: 3.7 - - - name: Install dependencies - run: >- - python -m pip install --user --upgrade setuptools wheel - - - name: Build packages - run: | - python .github/prepare-nightly_version.py - python setup.py sdist bdist_wheel - ls -lh dist/ - - - name: Delay releasing - uses: juliangruber/sleep-action@v1 - with: - time: 5m - - # We do this, since failures on test.pypi aren't that bad - - name: Publish to Test PyPI - uses: pypa/gh-action-pypi-publish@v1.4.1 - with: - user: __token__ - password: ${{ secrets.test_pypi_password }} - repository_url: https://test.pypi.org/legacy/ - verbose: true + # does nightly releases from feature branch + - uses: actions/checkout@v2 + with: + ref: release/1.2-dev + - uses: actions/setup-python@v2 + with: + python-version: 3.7 + + - name: Install dependencies + run: >- + python -m pip install --user --upgrade setuptools wheel + + - name: Build packages + run: | + python .github/prepare-nightly_version.py + python setup.py sdist bdist_wheel + ls -lh dist/ + + - name: Delay releasing + uses: juliangruber/sleep-action@v1 + with: + time: 5m + + # We do this, since failures on test.pypi aren't that bad + - name: Publish to Test PyPI + uses: pypa/gh-action-pypi-publish@v1.4.1 + with: + user: __token__ + password: ${{ secrets.test_pypi_password }} + repository_url: https://test.pypi.org/legacy/ + verbose: true docker-XLA: runs-on: ubuntu-20.04 @@ -51,7 +50,7 @@ jobs: fail-fast: false matrix: python_version: [3.6, 3.7] - xla_version: [1.6, 1.7] # todo: , "nightly" + xla_version: [1.6, 1.7] # todo: , "nightly" steps: - name: Checkout uses: actions/checkout@v2 @@ -72,8 +71,6 @@ jobs: build-args: | PYTHON_VERSION=${{ matrix.python_version }} XLA_VERSION=${{ matrix.xla_version }} - cache-from: pytorchlightning/pytorch_lightning:base-xla-py${{ matrix.python_version }}-torch${{ 
matrix.xla_version }} - cache-to: type=inline file: dockers/base-xla/Dockerfile push: true tags: pytorchlightning/pytorch_lightning:base-xla-py${{ matrix.python_version }}-torch${{ matrix.xla_version }} @@ -122,8 +119,6 @@ jobs: PYTHON_VERSION=${{ matrix.python_version }} PYTORCH_VERSION=${{ matrix.pytorch_version }} CUDA_VERSION=${{ steps.extend.outputs.CUDA }} - cache-from: pytorchlightning/pytorch_lightning:base-cuda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} - cache-to: type=inline file: dockers/base-cuda/Dockerfile push: true tags: pytorchlightning/pytorch_lightning:base-cuda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} @@ -138,8 +133,6 @@ jobs: PYTORCH_VERSION=${{ matrix.pytorch_version }} PYTORCH_CHANNEL=${{ steps.extend.outputs.CHANNEL }} CUDA_VERSION=${{ steps.extend.outputs.CUDA }} - cache-from: pytorchlightning/pytorch_lightning:base-conda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} - cache-to: type=inline file: dockers/base-conda/Dockerfile push: true tags: pytorchlightning/pytorch_lightning:base-conda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} diff --git a/dockers/base-conda/Dockerfile b/dockers/base-conda/Dockerfile index 3c58dfcde7dea..e5f2d0cf08f69 100644 --- a/dockers/base-conda/Dockerfile +++ b/dockers/base-conda/Dockerfile @@ -75,16 +75,16 @@ ENV CONDA_ENV=lightning COPY environment.yml environment.yml # conda init -RUN conda create -y --name $CONDA_ENV cudatoolkit=${CUDA_VERSION} && \ +RUN conda create -y --name $CONDA_ENV python=${PYTHON_VERSION} pytorch=${PYTORCH_VERSION} cudatoolkit=${CUDA_VERSION} -c ${PYTORCH_CHANNEL} && \ conda init bash && \ # NOTE: this requires that the channel is presented in the yaml before packages - # replace channel to nigtly if needed, fix PT version and remove Horovod as it will be installe later + # replace channel to nigtly if needed, fix PT version and remove Horovod as it will be installed later python -c "fname = 'environment.yml' ; req = open(fname).read().replace('pytorch', '${PYTORCH_CHANNEL}', 1) ; open(fname, 'w').write(req)" && \ - python -c "import re ; fname = 'environment.yml' ; req = re.sub(r'python[>=]+[\d\.]+', 'python=${PYTHON_VERSION}', open(fname).read()) ; open(fname, 'w').write(req)" && \ - python -c "import re ; fname = 'environment.yml' ; req = re.sub(r'torch[>=]+[\d\.]+', 'torch=${PYTORCH_VERSION}', open(fname).read()) ; open(fname, 'w').write(req)" && \ + python -c "import re ; fname = 'environment.yml' ; req = re.sub(r'- python[>=]+[\d\.]+', '# - python=${PYTHON_VERSION}', open(fname).read()) ; open(fname, 'w').write(req)" && \ + python -c "import re ; fname = 'environment.yml' ; req = re.sub(r'- pytorch[>=]+[\d\.]+', '# - pytorch=${PYTORCH_VERSION}', open(fname).read()) ; open(fname, 'w').write(req)" && \ python -c "fname = 'environment.yml' ; req = open(fname).readlines() ; open(fname, 'w').writelines([ln for ln in req if 'horovod' not in ln])" && \ cat environment.yml && \ - conda env update --file environment.yml && \ + conda env update --name $CONDA_ENV --file environment.yml && \ conda clean -ya && \ rm environment.yml diff --git a/environment.yml b/environment.yml index 1278f15f718e9..c019580c0b4d0 100644 --- a/environment.yml +++ b/environment.yml @@ -26,7 +26,7 @@ dependencies: - python>=3.6 - pip>20.1 - numpy>=1.16.4 - - pytorch>=1.3,<1.8 + - pytorch>=1.3 - future>=0.17.1 - PyYAML>=5.1 - tqdm>=4.41.0 @@ -41,7 +41,7 @@ dependencies: - torchtext>=0.3.1 # Examples - - torchvision>=0.4.1,<0.9.0 + - torchvision>=0.4.1 
- pip: - test-tube>=0.7.5 From 4c6f36e6e14a5e3bace1fe32505ae0fe6f8bc682 Mon Sep 17 00:00:00 2001 From: Arnaud Gelas Date: Thu, 7 Jan 2021 06:24:47 +0100 Subject: [PATCH 081/136] Fix pre-commit trailing-whitespace and end-of-file-fixer hooks. (#5387) --- .github/BECOMING_A_CORE_CONTRIBUTOR.md | 18 ++-- .github/ISSUE_TEMPLATE/documentation.md | 2 +- .github/ISSUE_TEMPLATE/how-to-question.md | 8 +- .github/workflows/docs-checks.yml | 1 - MANIFEST.in | 1 - docs/.build_docs.sh | 2 +- docs/Makefile | 2 +- docs/source/_static/main.css | 2 +- docs/source/asr_nlp_tts.rst | 104 +++++++++++----------- docs/source/cloud_training.rst | 2 +- docs/source/datamodules.rst | 2 +- docs/source/introduction_guide.rst | 6 +- docs/source/loggers.rst | 4 +- docs/source/lr_finder.rst | 28 +++--- docs/source/metrics.rst | 6 +- docs/source/new-project.rst | 18 ++-- docs/source/optimizers.rst | 2 +- docs/source/sequences.rst | 2 +- docs/source/slurm.rst | 2 +- docs/source/test_set.rst | 7 +- docs/source/training_tricks.rst | 2 +- docs/source/transfer_learning.rst | 2 +- docs/source/weights_loading.rst | 2 +- pl_examples/README.md | 6 +- pl_examples/basic_examples/README.md | 20 ++--- requirements/devel.txt | 2 +- requirements/docs.txt | 2 +- requirements/examples.txt | 2 +- requirements/loggers.txt | 2 +- tests/README.md | 4 +- 30 files changed, 129 insertions(+), 134 deletions(-) diff --git a/.github/BECOMING_A_CORE_CONTRIBUTOR.md b/.github/BECOMING_A_CORE_CONTRIBUTOR.md index 3fa357ef062ca..828f45aedbecc 100644 --- a/.github/BECOMING_A_CORE_CONTRIBUTOR.md +++ b/.github/BECOMING_A_CORE_CONTRIBUTOR.md @@ -1,14 +1,14 @@ # How to become a core contributor -Thanks for your interest in joining the Lightning team! We’re a rapidly growing project which is poised to become the go-to framework for DL researchers! -We're currently recruiting for a team of 5 core maintainers. +Thanks for your interest in joining the Lightning team! We’re a rapidly growing project which is poised to become the go-to framework for DL researchers! +We're currently recruiting for a team of 5 core maintainers. As a core maintainer you will have a strong say in the direction of the project. Big changes will require a majority of maintainers to agree. -### Code of conduct +### Code of conduct First and foremost, you'll be evaluated against [these core values](https://github.com/PyTorchLightning/pytorch-lightning/blob/master/.github/CONTRIBUTING.md). Any code we commit or feature we add needs to align with those core values. -### The bar for joining the team +### The bar for joining the team Lightning is being used to solve really hard problems at the top AI labs in the world. As such, the bar for adding team members is extremely high. Candidates must have solid engineering skills, have a good eye for user experience, and must be a power user of Lightning and PyTorch. With that said, the Lightning team will be diverse and a reflection of an inclusive AI community. You don't have to be an engineer to contribute! Scientists with great usability intuition and PyTorch ninja skills are welcomed! @@ -36,10 +36,10 @@ Pleasant/helpful tone. - Code is NOT overly engineered or hard to read - Ask yourself, could a non-engineer understand what’s happening here? - Make sure new tests are written -- Is this NECESSARY for Lightning? There are some PRs which are just purely about adding engineering complexity which have no place in Lightning. +- Is this NECESSARY for Lightning? 
There are some PRs which are just purely about adding engineering complexity which have no place in Lightning. Guidance - Some other PRs are for people who are wanting to get involved and add something unnecessary. We do want their help though! So don’t approve the PR, but direct them to a Github issue that they might be interested in helping with instead! -- To be considered for core contributor, please review 10 PRs and help the authors land it on master. Once you've finished the review, ping me +- To be considered for core contributor, please review 10 PRs and help the authors land it on master. Once you've finished the review, ping me for a sanity check. At the end of 10 PRs if your PR reviews are inline with expectations described above, then you can merge PRs on your own going forward, otherwise we'll do a few more until we're both comfortable :) @@ -47,15 +47,15 @@ otherwise we'll do a few more until we're both comfortable :) There are some big decisions which the project must make. For these I expect core contributors to have something meaningful to add if it’s their area of expertise. #### Diversity -Lightning should reflect the broader community it serves. As such we should have scientists/researchers from -different fields contributing! +Lightning should reflect the broader community it serves. As such we should have scientists/researchers from +different fields contributing! The first 5 core contributors will fit this profile. Thus if you overlap strongly with experiences and expertise as someone else on the team, you might have to wait until the next set of contributors are added. #### Summary: Requirements to apply The goal is to be inline with expectations for solving issues by the last one so you can do them on your own. If not, I might ask you to solve a few more specific ones. -- Solve 10+ Github issues. +- Solve 10+ Github issues. - Create 5+ meaningful PRs which solves some reported issue - bug, - Perform 10+ PR reviews from other contributors. diff --git a/.github/ISSUE_TEMPLATE/documentation.md b/.github/ISSUE_TEMPLATE/documentation.md index 2b249089657c8..e78df92a18bab 100644 --- a/.github/ISSUE_TEMPLATE/documentation.md +++ b/.github/ISSUE_TEMPLATE/documentation.md @@ -12,7 +12,7 @@ assignees: '' For typos and doc fixes, please go ahead and: 1. Create an issue. -2. Fix the typo. +2. Fix the typo. 3. Submit a PR. Thanks! diff --git a/.github/ISSUE_TEMPLATE/how-to-question.md b/.github/ISSUE_TEMPLATE/how-to-question.md index 2a307e18de5c7..786244d2f5e74 100644 --- a/.github/ISSUE_TEMPLATE/how-to-question.md +++ b/.github/ISSUE_TEMPLATE/how-to-question.md @@ -9,10 +9,10 @@ assignees: '' ## ❓ Questions and Help -### Before asking: +### Before asking: 1. Try to find answers to your questions in [the Lightning Forum!](https://forums.pytorchlightning.ai/) -2. Search for similar [issues](https://github.com/PyTorchLightning/pytorch-lightning/issues). -3. Search the [docs](https://pytorch-lightning.readthedocs.io/en/latest/). +2. Search for similar [issues](https://github.com/PyTorchLightning/pytorch-lightning/issues). +3. Search the [docs](https://pytorch-lightning.readthedocs.io/en/latest/). @@ -20,7 +20,7 @@ assignees: '' #### Code - + #### What have you tried? 
diff --git a/.github/workflows/docs-checks.yml b/.github/workflows/docs-checks.yml index 3f6b35ba7b7cb..247c5cf61f9c1 100644 --- a/.github/workflows/docs-checks.yml +++ b/.github/workflows/docs-checks.yml @@ -109,4 +109,3 @@ jobs: path: docs/build/html/ # Use always() to always run this step to publish test results when there are test failures if: success() - diff --git a/MANIFEST.in b/MANIFEST.in index 8db3912027d6d..450a9ec576d0b 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -69,4 +69,3 @@ prune temp* prune test* prune benchmark* prune dockers - diff --git a/docs/.build_docs.sh b/docs/.build_docs.sh index 2b57c47953675..6cf6eab2fd398 100644 --- a/docs/.build_docs.sh +++ b/docs/.build_docs.sh @@ -1,3 +1,3 @@ rm -rf source/generated make clean -make html --debug --jobs 2 SPHINXOPTS="-W" \ No newline at end of file +make html --debug --jobs 2 SPHINXOPTS="-W" diff --git a/docs/Makefile b/docs/Makefile index 69fe55ecfa9aa..ba501f6f5b1bf 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -16,4 +16,4 @@ help: # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/source/_static/main.css b/docs/source/_static/main.css index 7441b775a4be5..82aa8b338ad39 100644 --- a/docs/source/_static/main.css +++ b/docs/source/_static/main.css @@ -1,3 +1,3 @@ col { width: 50% !important; -} \ No newline at end of file +} diff --git a/docs/source/asr_nlp_tts.rst b/docs/source/asr_nlp_tts.rst index a5f1ac59bf696..49bed0a981a6e 100644 --- a/docs/source/asr_nlp_tts.rst +++ b/docs/source/asr_nlp_tts.rst @@ -10,16 +10,16 @@ These are amazing ecosystems to help with Automatic Speech Recognition (ASR), Na NeMo **** -`NVIDIA NeMo `_ is a toolkit for building new State-of-the-Art -Conversational AI models. NeMo has separate collections for Automatic Speech Recognition (ASR), -Natural Language Processing (NLP), and Text-to-Speech (TTS) models. Each collection consists of -prebuilt modules that include everything needed to train on your data. -Every module can easily be customized, extended, and composed to create new Conversational AI +`NVIDIA NeMo `_ is a toolkit for building new State-of-the-Art +Conversational AI models. NeMo has separate collections for Automatic Speech Recognition (ASR), +Natural Language Processing (NLP), and Text-to-Speech (TTS) models. Each collection consists of +prebuilt modules that include everything needed to train on your data. +Every module can easily be customized, extended, and composed to create new Conversational AI model architectures. -Conversational AI architectures are typically very large and require a lot of data and compute -for training. NeMo uses PyTorch Lightning for easy and performant multi-GPU/multi-node -mixed-precision training. +Conversational AI architectures are typically very large and require a lot of data and compute +for training. NeMo uses PyTorch Lightning for easy and performant multi-GPU/multi-node +mixed-precision training. .. note:: Every NeMo model is a LightningModule that comes equipped with all supporting infrastructure for training and reproducibility. 
@@ -31,7 +31,7 @@ NeMo Models NeMo Models contain everything needed to train and reproduce state of the art Conversational AI research and applications, including: -- neural network architectures +- neural network architectures - datasets/data loaders - data preprocessing/postprocessing - data augmentors @@ -83,7 +83,7 @@ To install from a local clone of NeMo: ./reinstall.sh # from cloned NeMo's git root -For Docker users, the NeMo container is available on +For Docker users, the NeMo container is available on `NGC `_. .. code-block:: bash @@ -97,7 +97,7 @@ For Docker users, the NeMo container is available on Experiment Manager ------------------ -NeMo's Experiment Manager leverages PyTorch Lightning for model checkpointing, +NeMo's Experiment Manager leverages PyTorch Lightning for model checkpointing, TensorBoard Logging, and Weights and Biases logging. The Experiment Manager is included by default in all NeMo example scripts. @@ -126,11 +126,11 @@ Optionally launch Tensorboard to view training results in ./nemo_experiments (by Automatic Speech Recognition (ASR) ================================== -Everything needed to train Convolutional ASR models is included with NeMo. -NeMo supports multiple Speech Recognition architectures, including Jasper and QuartzNet. -`NeMo Speech Models `_ -can be trained from scratch on custom datasets or -fine-tuned using pre-trained checkpoints trained on thousands of hours of audio +Everything needed to train Convolutional ASR models is included with NeMo. +NeMo supports multiple Speech Recognition architectures, including Jasper and QuartzNet. +`NeMo Speech Models `_ +can be trained from scratch on custom datasets or +fine-tuned using pre-trained checkpoints trained on thousands of hours of audio that can be restored for immediate use. Some typical ASR tasks are included with NeMo: @@ -141,7 +141,7 @@ Some typical ASR tasks are included with NeMo: - `Voice Activity Detection `_ - `Speaker Recognition `_ -See this `asr notebook `_ +See this `asr notebook `_ for a full tutorial on doing ASR with NeMo, PyTorch Lightning, and Hydra. Specify ASR Model Configurations with YAML File @@ -149,7 +149,7 @@ Specify ASR Model Configurations with YAML File NeMo Models and the PyTorch Lightning Trainer can be fully configured from .yaml files using Hydra. -See this `asr config `_ +See this `asr config `_ for the entire speech to text .yaml file. .. code-block:: yaml @@ -198,7 +198,7 @@ Developing ASR Model From Scratch trainer.fit(asr_model) -Hydra makes every aspect of the NeMo model, +Hydra makes every aspect of the NeMo model, including the PyTorch Lightning Trainer, customizable from the command line. .. code-block:: bash @@ -259,7 +259,7 @@ with PyTorch Lightning since every NeMo model is a Lightning Module. log_probs = self.decoder(encoder_output=encoded) greedy_predictions = log_probs.argmax(dim=-1, keepdim=False) return log_probs, encoded_len, greedy_predictions - + # PTL-specific methods def training_step(self, batch, batch_nb): audio_signal, audio_signal_len, transcript, transcript_len = batch @@ -281,7 +281,7 @@ Neural Types in NeMo ASR ------------------------ NeMo Models and Neural Modules come with Neural Type checking. -Neural type checking is extremely useful when combining many different neural +Neural type checking is extremely useful when combining many different neural network architectures for a production-grade application. .. 
code-block:: python @@ -311,12 +311,12 @@ Natural Language Processing (NLP) ================================= Everything needed to finetune BERT-like language models for NLP tasks is included with NeMo. -`NeMo NLP Models `_ -include `HuggingFace Transformers `_ -and `NVIDIA Megatron-LM `_ BERT and Bio-Megatron models. +`NeMo NLP Models `_ +include `HuggingFace Transformers `_ +and `NVIDIA Megatron-LM `_ BERT and Bio-Megatron models. NeMo can also be used for pretraining BERT-based language models from HuggingFace. -Any of the HuggingFace encoders or Megatron-LM encoders can easily be used for the NLP tasks +Any of the HuggingFace encoders or Megatron-LM encoders can easily be used for the NLP tasks that are included with NeMo: - `Glue Benchmark (All tasks) `_ @@ -339,7 +339,7 @@ for a full tutorial on doing NER with NeMo, PyTorch Lightning, and Hydra. Specify NER Model Configurations with YAML File ----------------------------------------------- -.. note:: NeMo Models and the PyTorch Lightning Trainer can be fully configured from .yaml files using Hydra. +.. note:: NeMo Models and the PyTorch Lightning Trainer can be fully configured from .yaml files using Hydra. See this `token classification config `_ for the entire NER (token classification) .yaml file. @@ -368,7 +368,7 @@ for the entire NER (token classification) .yaml file. pretrained_model_name: bert-base-uncased lm_checkpoint: null ... - # the classifier for the downstream task + # the classifier for the downstream task head: num_fc_layers: 2 fc_dropout: 0.5 @@ -435,12 +435,12 @@ Hydra makes every aspect of the NeMo model, including the PyTorch Lightning Trai Tokenizers ---------- -Tokenization is the process of converting natural language text into integer arrays +Tokenization is the process of converting natural language text into integer arrays which can be used for machine learning. -For NLP tasks, tokenization is an essential part of data preprocessing. -NeMo supports all BERT-like model tokenizers from +For NLP tasks, tokenization is an essential part of data preprocessing. +NeMo supports all BERT-like model tokenizers from `HuggingFace's AutoTokenizer `_ -and also supports `Google's SentencePieceTokenizer `_ +and also supports `Google's SentencePieceTokenizer `_ which can be trained on custom data. To see the list of supported tokenizers: @@ -451,18 +451,18 @@ To see the list of supported tokenizers: nemo_nlp.modules.get_tokenizer_list() -See this `tokenizer notebook `_ +See this `tokenizer notebook `_ for a full tutorial on using tokenizers in NeMo. Language Models --------------- -Language models are used to extract information from (tokenized) text. +Language models are used to extract information from (tokenized) text. Much of the state-of-the-art in natural language processing is achieved -by fine-tuning pretrained language models on the downstream task. +by fine-tuning pretrained language models on the downstream task. -With NeMo, you can either `pretrain `_ -a BERT model on your data or use a pretrained language model from `HuggingFace Transformers `_ +With NeMo, you can either `pretrain `_ +a BERT model on your data or use a pretrained language model from `HuggingFace Transformers `_ or `NVIDIA Megatron-LM `_. To see the list of language models available in NeMo: @@ -483,11 +483,11 @@ for a full tutorial on using pretrained language models in NeMo. 
Using a Pre-trained NER Model ----------------------------- -NeMo has pre-trained NER models that can be used +NeMo has pre-trained NER models that can be used to get started with Token Classification right away. -Models are automatically downloaded from NGC, +Models are automatically downloaded from NGC, cached locally to disk, -and loaded into GPU memory using the `.from_pretrained` method. +and loaded into GPU memory using the `.from_pretrained` method. .. code-block:: python @@ -511,7 +511,7 @@ and loaded into GPU memory using the `.from_pretrained` method. NeMo NER Model Under the Hood ----------------------------- -Any aspect of NLP training or model architecture design can easily be customized with PyTorch Lightning +Any aspect of NLP training or model architecture design can easily be customized with PyTorch Lightning since every NeMo model is a Lightning Module. .. code-block:: python @@ -546,8 +546,8 @@ since every NeMo model is a Lightning Module. Neural Types in NeMo NLP ------------------------ -NeMo Models and Neural Modules come with Neural Type checking. -Neural type checking is extremely useful when combining many different neural network architectures +NeMo Models and Neural Modules come with Neural Type checking. +Neural type checking is extremely useful when combining many different neural network architectures for a production-grade application. .. code-block:: python @@ -565,11 +565,11 @@ for a production-grade application. Text-To-Speech (TTS) ==================== -Everything needed to train TTS models and generate audio is included with NeMo. -`NeMo TTS Models `_ +Everything needed to train TTS models and generate audio is included with NeMo. +`NeMo TTS Models `_ can be trained from scratch on your own data or pretrained models can be downloaded -automatically. NeMo currently supports a two step inference procedure. -First, a model is used to generate a mel spectrogram from text. +automatically. NeMo currently supports a two step inference procedure. +First, a model is used to generate a mel spectrogram from text. Second, a model is used to generate audio from a mel spectrogram. Mel Spectrogram Generators: @@ -647,10 +647,10 @@ Hydra makes every aspect of the NeMo model, including the PyTorch Lightning Trai Using State-Of-The-Art Pre-trained TTS Model -------------------------------------------- -Generate speech using models trained on `LJSpeech `, +Generate speech using models trained on `LJSpeech `, around 24 hours of single speaker data. -See this `TTS notebook `_ +See this `TTS notebook `_ for a full tutorial on generating speech with NeMo, PyTorch Lightning, and Hydra. .. code-block:: python @@ -673,7 +673,7 @@ for a full tutorial on generating speech with NeMo, PyTorch Lightning, and Hydra if isinstance(audio, torch.Tensor): audio = audio.to('cpu').numpy() return spectrogram, audio - + text_to_generate = input("Input what you want the model to say: ") spec, audio = infer(spec_gen, vocoder, text_to_generate) @@ -763,8 +763,8 @@ be customized with PyTorch Lightning since every NeMo model is a LightningModule Neural Types in NeMo TTS ------------------------ -NeMo Models and Neural Modules come with Neural Type checking. -Neural type checking is extremely useful when combining many different neural network architectures +NeMo Models and Neural Modules come with Neural Type checking. +Neural type checking is extremely useful when combining many different neural network architectures for a production-grade application. .. 
code-block:: python @@ -793,7 +793,7 @@ Learn More - Visit the `NVIDIA NeMo Developer Website `_ - Read the `NVIDIA NeMo PyTorch Blog `_ - Download pre-trained `ASR `_, `NLP `_, and `TTS `_ models on `NVIDIA NGC `_ to quickly get started with NeMo. -- Become an expert on Building Conversational AI applications with our `tutorials `_, and `example scripts `_, +- Become an expert on Building Conversational AI applications with our `tutorials `_, and `example scripts `_, - See our `developer guide `_ for more information on core NeMo concepts, ASR/NLP/TTS collections, and the NeMo API. .. note:: NeMo tutorial notebooks can be run on `Google Colab `_. diff --git a/docs/source/cloud_training.rst b/docs/source/cloud_training.rst index 9fef417da7442..127bee6478dfd 100644 --- a/docs/source/cloud_training.rst +++ b/docs/source/cloud_training.rst @@ -26,4 +26,4 @@ using over 20+ distributions, lists, etc. Of course, you can also configure all can be dynamically assembled at runtime. -.. hint:: Grid supports the search strategy of your choice! (and much more than just sweeps) \ No newline at end of file +.. hint:: Grid supports the search strategy of your choice! (and much more than just sweeps) diff --git a/docs/source/datamodules.rst b/docs/source/datamodules.rst index 2589ac605ee11..bc79d7dc3d6ea 100644 --- a/docs/source/datamodules.rst +++ b/docs/source/datamodules.rst @@ -129,7 +129,7 @@ Here's a more realistic, complex DataModule that shows how much more reusable th # self.dims is returned when you call dm.size() # Setting default dims here because we know them. - # Could optionally be assigned dynamically in dm.setup() + # Could optionally be assigned dynamically in dm.setup() self.dims = (1, 28, 28) def prepare_data(self): diff --git a/docs/source/introduction_guide.rst b/docs/source/introduction_guide.rst index d4cf578e10bda..52f1182d1508f 100644 --- a/docs/source/introduction_guide.rst +++ b/docs/source/introduction_guide.rst @@ -1051,7 +1051,7 @@ would be the particular system and how it's trained (ie: A GAN or VAE or GPT). out = decoder(features, x) loss = perceptual_loss(x1, x2, x) + CE(out, x) - + In Lightning, this code is organized into a :ref:`lightning_module`. Engineering code @@ -1071,7 +1071,7 @@ over GPUs, 16-bit precision, etc. This is normally code that is THE SAME across download_data() dist.barrier() - + In Lightning, this code is abstracted out by the :ref:`trainer`. Non-essential code @@ -1090,7 +1090,7 @@ This is code that helps the research but isn't relevant to the research code. So z = Q.rsample() generated = decoder(z) self.experiment.log('images', generated) - + In Lightning this code is organized into :ref:`callbacks`. Data code diff --git a/docs/source/loggers.rst b/docs/source/loggers.rst index b74fe292b251b..08b3b1e997555 100644 --- a/docs/source/loggers.rst +++ b/docs/source/loggers.rst @@ -9,7 +9,7 @@ Loggers ******* -Lightning supports the most popular logging frameworks (TensorBoard, Comet, etc...). TensorBoard is used by default, +Lightning supports the most popular logging frameworks (TensorBoard, Comet, etc...). TensorBoard is used by default, but you can pass to the :class:`~pytorch_lightning.trainer.trainer.Trainer` any combination of the following loggers. .. 
note:: @@ -247,7 +247,7 @@ Lightning supports the use of multiple loggers, just pass a list to the logger1 = TensorBoardLogger('tb_logs', name='my_model') logger2 = TestTubeLogger('tb_logs', name='my_model') trainer = Trainer(logger=[logger1, logger2]) - + The loggers are available as a list anywhere except ``__init__`` in your :class:`~pytorch_lightning.core.lightning.LightningModule`. diff --git a/docs/source/lr_finder.rst b/docs/source/lr_finder.rst index fbeb1f5fd959d..a5c3b312f30fc 100755 --- a/docs/source/lr_finder.rst +++ b/docs/source/lr_finder.rst @@ -2,7 +2,7 @@ from pytorch_lightning.trainer.trainer import Trainer from pytorch_lightning.core.lightning import LightningModule - + .. _lr_finder: Learning Rate Finder @@ -22,14 +22,14 @@ for both better performance and faster convergence. Even optimizers such as choices. To reduce the amount of guesswork concerning choosing a good initial learning -rate, a `learning rate finder` can be used. As described in this `paper `_ -a learning rate finder does a small run where the learning rate is increased -after each processed batch and the corresponding loss is logged. The result of +rate, a `learning rate finder` can be used. As described in this `paper `_ +a learning rate finder does a small run where the learning rate is increased +after each processed batch and the corresponding loss is logged. The result of this is a `lr` vs. `loss` plot that can be used as guidance for choosing a optimal -initial lr. +initial lr. -.. warning:: - For the moment, this feature only works with models having a single optimizer. +.. warning:: + For the moment, this feature only works with models having a single optimizer. LR Finder support for DDP is not implemented yet, it is coming soon. ---------- @@ -52,7 +52,7 @@ which can be accessed via ``self.learning_rate`` or ``self.lr``. def configure_optimizers(self): return Adam(self.parameters(), lr=(self.lr or self.learning_rate)) - + model = LitModel() # finds learning rate automatically @@ -81,26 +81,26 @@ method of the trainer. A typical example of this would look like model = MyModelClass(hparams) trainer = Trainer() - + # Run learning rate finder lr_finder = trainer.tuner.lr_find(model) - + # Results can be found in lr_finder.results - + # Plot with fig = lr_finder.plot(suggest=True) fig.show() - + # Pick point based on plot, or get suggestion new_lr = lr_finder.suggestion() - + # update hparams of the model model.hparams.lr = new_lr # Fit model trainer.fit(model) - + The figure produced by ``lr_finder.plot()`` should look something like the figure below. It is recommended to not pick the learning rate that achieves the lowest loss, but instead something in the middle of the sharpest downward slope (red point). diff --git a/docs/source/metrics.rst b/docs/source/metrics.rst index 67640debb665a..3c853f45a70d6 100644 --- a/docs/source/metrics.rst +++ b/docs/source/metrics.rst @@ -264,7 +264,7 @@ Classification Metrics Input types ----------- -For the purposes of classification metrics, inputs (predictions and targets) are split +For the purposes of classification metrics, inputs (predictions and targets) are split into these categories (``N`` stands for the batch size and ``C`` for number of classes): .. csv-table:: \*dtype ``binary`` means integers that are either 0 or 1 @@ -279,10 +279,10 @@ into these categories (``N`` stands for the batch size and ``C`` for number of c "Multi-dimensional multi-class with probabilities", "(N, C, ...)", "``float``", "(N, ...)", "``int``" .. 
note:: - All dimensions of size 1 (except ``N``) are "squeezed out" at the beginning, so + All dimensions of size 1 (except ``N``) are "squeezed out" at the beginning, so that, for example, a tensor of shape ``(N, 1)`` is treated as ``(N, )``. -When predictions or targets are integers, it is assumed that class labels start at 0, i.e. +When predictions or targets are integers, it is assumed that class labels start at 0, i.e. the possible class labels are 0, 1, 2, 3, etc. Below are some examples of different input types .. testcode:: diff --git a/docs/source/new-project.rst b/docs/source/new-project.rst index 30e06f76ae5bd..4c9c16e9faa0d 100644 --- a/docs/source/new-project.rst +++ b/docs/source/new-project.rst @@ -132,7 +132,7 @@ Examples of systems are: - `DQN `_ - `GAN `_ - `Image classifier `_ -- Seq2seq +- Seq2seq - `SimCLR `_ - `VAE `_ @@ -195,7 +195,7 @@ First, define the data however you want. Lightning just needs a :class:`~torch.u dataset = MNIST(os.getcwd(), download=True, transform=transforms.ToTensor()) train_loader = DataLoader(dataset) - + Next, init the :ref:`lightning_module` and the PyTorch Lightning :class:`~pytorch_lightning.trainer.Trainer`, then call fit with both the data and model. @@ -392,7 +392,7 @@ It's trivial to use CPUs, GPUs or TPUs in Lightning. There's **NO NEED** to chan # train on 1 GPU trainer = pl.Trainer(gpus=1) - + .. code-block:: python # train on multiple GPUs across nodes (32 gpus here) @@ -400,7 +400,7 @@ It's trivial to use CPUs, GPUs or TPUs in Lightning. There's **NO NEED** to chan gpus=4, num_nodes=8 ) - + .. code-block:: python # train on gpu 1, 3, 5 (3 gpus total) @@ -428,7 +428,7 @@ Without changing a SINGLE line of your code, you can now do the following with t limit_train_batches=0.5, val_check_interval=0.25 ) - + ----------- Checkpoints @@ -709,7 +709,7 @@ Lightning has many tools for debugging. Here is an example of just a few of them .. code-block:: python - # Automatically overfit the sane batch of your model for a sanity test + # Automatically overfit the sane batch of your model for a sanity test trainer = pl.Trainer(overfit_batches=1) .. code-block:: python @@ -719,7 +719,7 @@ Lightning has many tools for debugging. Here is an example of just a few of them trainer = pl.Trainer(fast_dev_run=True) .. code-block:: python - + # train only 20% of an epoch trainer = pl.Trainer(limit_train_batches=0.2) @@ -729,10 +729,10 @@ Lightning has many tools for debugging. Here is an example of just a few of them trainer = pl.Trainer(val_check_interval=0.25) .. code-block:: python - + # Profile your code to find speed/memory bottlenecks Trainer(profiler=True) - + --------------- ******************** diff --git a/docs/source/optimizers.rst b/docs/source/optimizers.rst index 2680c01e4c7ec..5e96b5da0da8c 100644 --- a/docs/source/optimizers.rst +++ b/docs/source/optimizers.rst @@ -247,7 +247,7 @@ The default ``optimizer_step`` is relying on the internal ``LightningOptimizer`` .. 
testcode:: from pytorch_lightning.core.optimizer import LightningOptimizer - + # function hook in LightningModule def optimizer_step(self, current_epoch, batch_nb, optimizer, optimizer_idx, closure, on_tpu=False, using_native_amp=False, using_lbfgs=False): if not isinstance(optimizer, LightningOptimizer): diff --git a/docs/source/sequences.rst b/docs/source/sequences.rst index 93fefad0d0e35..759a671cc42ef 100644 --- a/docs/source/sequences.rst +++ b/docs/source/sequences.rst @@ -2,7 +2,7 @@ from torch.utils.data import IterableDataset from pytorch_lightning.trainer.trainer import Trainer - + .. _sequences: Sequential Data diff --git a/docs/source/slurm.rst b/docs/source/slurm.rst index be40810c3f944..da6de596db5a2 100644 --- a/docs/source/slurm.rst +++ b/docs/source/slurm.rst @@ -1,7 +1,7 @@ .. testsetup:: * from pytorch_lightning.trainer.trainer import Trainer - + .. _slurm: Computing cluster (SLURM) diff --git a/docs/source/test_set.rst b/docs/source/test_set.rst index 9fe9640aa723b..d9e989a4182f3 100644 --- a/docs/source/test_set.rst +++ b/docs/source/test_set.rst @@ -41,7 +41,7 @@ You can run the test set on multiple models using the same trainer instance. model1 = LitModel() model2 = GANModel() - + trainer = Trainer() trainer.test(model1) trainer.test(model2) @@ -87,7 +87,7 @@ is not available at the time your model was declared. You can either pass in a single dataloader or a list of them. This optional named parameter can be used in conjunction with any of the above use cases. Additionally, -you can also pass in an :ref:`datamodules` that have overridden the +you can also pass in an :ref:`datamodules` that have overridden the :ref:`datamodule-test-dataloader-label` method. .. code-block:: python @@ -102,6 +102,3 @@ you can also pass in an :ref:`datamodules` that have overridden the # test (pass in datamodule) trainer.test(datamodule=dm) - - - diff --git a/docs/source/training_tricks.rst b/docs/source/training_tricks.rst index 10ee668a97fa8..d7230a1fd687a 100644 --- a/docs/source/training_tricks.rst +++ b/docs/source/training_tricks.rst @@ -130,4 +130,4 @@ Sequential Model Parallelism with Checkpointing PyTorch Lightning integration for Sequential Model Parallelism using `FairScale `_. Sequential Model Parallelism splits a sequential module onto multiple GPUs, reducing peak GPU memory requirements substantially. -For more information, refer to :ref:`sequential-parallelism`. \ No newline at end of file +For more information, refer to :ref:`sequential-parallelism`. diff --git a/docs/source/transfer_learning.rst b/docs/source/transfer_learning.rst index 3b8f5b004974e..bf5d4fc5d6e05 100644 --- a/docs/source/transfer_learning.rst +++ b/docs/source/transfer_learning.rst @@ -1,7 +1,7 @@ .. testsetup:: * from pytorch_lightning.core.lightning import LightningModule - + Transfer Learning ----------------- diff --git a/docs/source/weights_loading.rst b/docs/source/weights_loading.rst index f22e355a09d17..1c8babd72ed18 100644 --- a/docs/source/weights_loading.rst +++ b/docs/source/weights_loading.rst @@ -92,7 +92,7 @@ You can also control more advanced options, like `save_top_k`, to save the best ) trainer = Trainer(callbacks=[checkpoint_callback]) - + You can retrieve the checkpoint after training by calling .. 
code-block:: python diff --git a/pl_examples/README.md b/pl_examples/README.md index 936f1cc3df0cf..a1cb856eb1e33 100644 --- a/pl_examples/README.md +++ b/pl_examples/README.md @@ -1,4 +1,4 @@ -# Examples +# Examples Our most robust examples showing all sorts of implementations can be found in our sister library [PyTorch-Lightning-Bolts](https://pytorch-lightning-bolts.readthedocs.io/en/latest/convolutional.html#gpt-2). @@ -14,6 +14,6 @@ In this folder we add 3 simple examples: --- ## Domain examples -This folder contains older examples. You should instead use the examples -in [PyTorch-Lightning-Bolts](https://pytorch-lightning-bolts.readthedocs.io/en/latest/convolutional.html#gpt-2) +This folder contains older examples. You should instead use the examples +in [PyTorch-Lightning-Bolts](https://pytorch-lightning-bolts.readthedocs.io/en/latest/convolutional.html#gpt-2) for advanced use cases. diff --git a/pl_examples/basic_examples/README.md b/pl_examples/basic_examples/README.md index 18ae204396290..199c453566c6f 100644 --- a/pl_examples/basic_examples/README.md +++ b/pl_examples/basic_examples/README.md @@ -1,5 +1,5 @@ -## Basic Examples -Use these examples to test how lightning works. +## Basic Examples +Use these examples to test how lightning works. #### MNIST Trains MNIST where the model is defined inside the LightningModule. @@ -36,7 +36,7 @@ python image_classifier.py --gpus 2 python image_classifier.py --gpus 2 --distributed_backend 'dp' ``` ---- +--- #### Autoencoder Showing the power of a system... arbitrarily complex training loops ```bash @@ -49,23 +49,23 @@ python autoencoder.py --gpus 2 # dataparallel python autoencoder.py --gpus 2 --distributed_backend 'dp' ``` ---- -# Multi-node example +--- +# Multi-node example This demo launches a job using 2 GPUs on 2 different nodes (4 GPUs total). To run this demo do the following: -1. Log into the jumphost node of your SLURM-managed cluster. -2. Create a conda environment with Lightning and a GPU PyTorch version. -3. Choose a script to submit +1. Log into the jumphost node of your SLURM-managed cluster. +2. Create a conda environment with Lightning and a GPU PyTorch version. +3. Choose a script to submit -#### DDP +#### DDP Submit this job to run with DistributedDataParallel (2 nodes, 2 gpus each) ```bash sbatch submit_ddp_job.sh YourEnv ``` -#### DDP2 +#### DDP2 Submit this job to run with a different implementation of DistributedDataParallel. In this version, each node acts like DataParallel but syncs across nodes like DDP. 
```bash diff --git a/requirements/devel.txt b/requirements/devel.txt index a8c5293c8c7db..dcf66495ee46f 100644 --- a/requirements/devel.txt +++ b/requirements/devel.txt @@ -8,4 +8,4 @@ -r ./test.txt # install all extra dependencies for running examples --r ./examples.txt \ No newline at end of file +-r ./examples.txt diff --git a/requirements/docs.txt b/requirements/docs.txt index df596ed2bdda8..0f8f2005b88b1 100644 --- a/requirements/docs.txt +++ b/requirements/docs.txt @@ -11,4 +11,4 @@ https://github.com/PyTorchLightning/lightning_sphinx_theme/archive/master.zip#eg sphinx-autodoc-typehints sphinx-paramlinks<0.4.0 sphinx-togglebutton -sphinx-copybutton \ No newline at end of file +sphinx-copybutton diff --git a/requirements/examples.txt b/requirements/examples.txt index c87d10a39346f..6e48778cb222a 100644 --- a/requirements/examples.txt +++ b/requirements/examples.txt @@ -1,2 +1,2 @@ torchvision>=0.4.1 -gym>=0.17.0 \ No newline at end of file +gym>=0.17.0 diff --git a/requirements/loggers.txt b/requirements/loggers.txt index 3ec7b25db4643..001210855871d 100644 --- a/requirements/loggers.txt +++ b/requirements/loggers.txt @@ -3,4 +3,4 @@ neptune-client>=0.4.109 comet-ml>=3.1.12 mlflow>=1.0.0 test_tube>=0.7.5 -wandb>=0.8.21 \ No newline at end of file +wandb>=0.8.21 diff --git a/tests/README.md b/tests/README.md index 8ef006c4d879a..7b857a1901fd7 100644 --- a/tests/README.md +++ b/tests/README.md @@ -33,8 +33,8 @@ The GPU machine must have: 3. [Horovod with NCCL](https://horovod.readthedocs.io/en/stable/gpus_include.html) support: `HOROVOD_GPU_OPERATIONS=NCCL pip install horovod` -## Running Coverage -Make sure to run coverage on a GPU machine with at least 2 GPUs and NVIDIA apex installed. +## Running Coverage +Make sure to run coverage on a GPU machine with at least 2 GPUs and NVIDIA apex installed. 
```bash cd pytorch-lightning From 72525f0a8396ae6dce5cf78ddf71e75fbba2dbfc Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Fri, 8 Jan 2021 16:36:49 +0100 Subject: [PATCH 082/136] tests for legacy checkpoints (#5223) * wip * generate * clean * tests * copy * download * download * download * download * download * download * download * download * download * download * download * flake8 * extend * aws * extension * pull * pull * pull * pull * pull * pull * pull * try * try * try * got it * Apply suggestions from code review --- .drone.yml | 8 ++ .github/workflows/ci_test-conda.yml | 15 ++- .github/workflows/ci_test-full.yml | 13 ++- .github/workflows/nightly.yml | 2 +- .github/workflows/release-pypi.yml | 50 +++++++++- .gitignore | 4 + MANIFEST.in | 1 + dockers/base-conda/Dockerfile | 2 + dockers/base-cuda/Dockerfile | 2 + dockers/tpu-tests/Dockerfile | 6 ++ legacy/checkpoints/.gitkeep | 0 legacy/generate_checkpoints.sh | 40 ++++++++ legacy/zero_training.py | 92 +++++++++++++++++++ setup.py | 2 +- tests/__init__.py | 2 + .../checkpointing/test_legacy_checkpoints.py | 54 +++++++++++ 16 files changed, 286 insertions(+), 7 deletions(-) create mode 100644 legacy/checkpoints/.gitkeep create mode 100644 legacy/generate_checkpoints.sh create mode 100644 legacy/zero_training.py create mode 100644 tests/checkpointing/test_legacy_checkpoints.py diff --git a/.drone.yml b/.drone.yml index 472861852cae7..91ccba28a1175 100644 --- a/.drone.yml +++ b/.drone.yml @@ -39,6 +39,14 @@ steps: # when Image has defined CUDa version we can switch to this package spec "nvidia-dali-cuda${CUDA_VERSION%%.*}0" - pip install --extra-index-url https://developer.download.nvidia.com/compute/redist nvidia-dali-cuda100 --upgrade-strategy only-if-needed - pip list + # todo: remove unzip install after new nigtly docker is created + - apt-get update -qq + - apt-get install -y --no-install-recommends unzip + # get legacy checkpoints + - wget https://pl-public-data.s3.amazonaws.com/legacy/checkpoints.zip -P legacy/ + - unzip -o legacy/checkpoints.zip -d legacy/ + - ls -l legacy/checkpoints/ + # testing... 
- python -m coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --durations=25 # --flake8 # Running special tests - sh tests/special_tests.sh diff --git a/.github/workflows/ci_test-conda.yml b/.github/workflows/ci_test-conda.yml index d64fedbfbe590..284a9792090e8 100644 --- a/.github/workflows/ci_test-conda.yml +++ b/.github/workflows/ci_test-conda.yml @@ -34,10 +34,21 @@ jobs: # todo this probably does not work with docker images, rather cache dockers uses: actions/cache@v2 with: - path: Datasets # This path is specific to Ubuntu - # Look to see if there is a cache hit for the corresponding requirements file + path: Datasets key: pl-dataset + - name: Pull checkpoints from S3 + # todo: consider adding coma caching, but ATM all models have less then 100KB + run: | + # todo: remove unzip install after new nigtly docker is created + apt-get update -qq + apt-get install -y --no-install-recommends unzip + # enter legacy and update checkpoints from S3 + cd legacy + curl https://pl-public-data.s3.amazonaws.com/legacy/checkpoints.zip --output checkpoints.zip + unzip -o checkpoints.zip + ls -l checkpoints/ + - name: Tests run: | # NOTE: run coverage on tests does not propagare faler status for Win, https://github.com/nedbat/coveragepy/issues/1003 diff --git a/.github/workflows/ci_test-full.yml b/.github/workflows/ci_test-full.yml index 719b374d76efb..3eb8ed8409f64 100644 --- a/.github/workflows/ci_test-full.yml +++ b/.github/workflows/ci_test-full.yml @@ -104,6 +104,16 @@ jobs: restore-keys: | ${{ runner.os }}-pip-py${{ matrix.python-version }}-${{ matrix.requires }}- + - name: Pull checkpoints from S3 + # todo: consider adding some caching, but ATM all models have less then 100KB + run: | + cd legacy + # wget is simpler but does not work on Windows + python -c "from urllib.request import urlretrieve ; urlretrieve('https://pl-public-data.s3.amazonaws.com/legacy/checkpoints.zip', 'checkpoints.zip')" + ls -l . 
+ unzip -o checkpoints.zip + ls -l checkpoints/ + - name: Install dependencies env: # MAKEFLAGS: "-j2" @@ -136,8 +146,7 @@ jobs: - name: Cache datasets uses: actions/cache@v2 with: - path: Datasets # This path is specific to Ubuntu - # Look to see if there is a cache hit for the corresponding requirements file + path: Datasets key: pl-dataset - name: Tests diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index 1f14dcdd276dd..df8c5e5411369 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -35,7 +35,7 @@ jobs: with: time: 5m - # We do this, since failures on test.pypi aren't that bad + # We do this, since failures on test.pypi aren't that bad - name: Publish to Test PyPI uses: pypa/gh-action-pypi-publish@v1.4.1 with: diff --git a/.github/workflows/release-pypi.yml b/.github/workflows/release-pypi.yml index 3cc3157ffbf89..b0310c3d36ccc 100644 --- a/.github/workflows/release-pypi.yml +++ b/.github/workflows/release-pypi.yml @@ -5,7 +5,7 @@ on: # Trigger the workflow on push or pull request, but only for the master bra push: branches: [master, "release/*"] # include release branches like release/1.0.x release: - types: [created, "release/*"] + types: [created] jobs: @@ -61,3 +61,51 @@ jobs: with: user: __token__ password: ${{ secrets.pypi_password }} + + # Note: This uses an internal pip API and may not always work + # https://github.com/actions/cache/blob/master/examples.md#multiple-oss-in-a-workflow + - name: Cache pip + uses: actions/cache@v2 + with: + path: ~/.cache/pip + key: ${{ runner.os }}-pip-${{ hashFiles('requirements.txt') }} + restore-keys: ${{ runner.os }}-pip- + + - name: Install dependencies + run: | + pip install -r requirements.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --quiet + pip install virtualenv + pip install awscli + + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v1 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_KEY_ID }} + aws-region: us-east-1 + + - name: Pull files from S3 + run: | + aws s3 cp --recursive s3://pl-public-data/legacy/checkpoints/ legacy/checkpoints/ # --acl public-read + ls -l legacy/checkpoints/ + + - name: Generate checkpoint + if: startsWith(github.event.ref, 'refs/tags') || github.event_name == 'release' + run: | + virtualenv vEnv --system-site-packages + source vEnv/bin/activate + pip install dist/* + + pl_ver=$(python -c "import pytorch_lightning as pl ; print(pl.__version__)" 2>&1) + # generate checkpoint to this version + bash legacy/generate_checkpoints.sh $pl_ver + + deactivate + rm -rf vEnv + + - name: Push files to S3 + run: | + aws s3 sync legacy/checkpoints/ s3://pl-public-data/legacy/checkpoints/ + cd legacy + zip -r checkpoints.zip checkpoints + aws s3 cp checkpoints.zip s3://pl-public-data/legacy/ --acl public-read diff --git a/.gitignore b/.gitignore index 743fdaaf33dc2..d6ae2ef48ed01 100644 --- a/.gitignore +++ b/.gitignore @@ -27,6 +27,7 @@ timit_data/ # C extensions *.so +# PyCharm .idea/ # Distribution / packaging @@ -126,11 +127,14 @@ ENV/ # mypy .mypy_cache/ +# pytest +.pytest_cache/ # data .data/ Datasets/ mnist/ +legacy/checkpoints/ # pl tests ml-runs/ diff --git a/MANIFEST.in b/MANIFEST.in index 450a9ec576d0b..95672548f724c 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -69,3 +69,4 @@ prune temp* prune test* prune benchmark* prune dockers +prune legacy diff --git a/dockers/base-conda/Dockerfile b/dockers/base-conda/Dockerfile index 
e5f2d0cf08f69..83c8fe9e7a59b 100644 --- a/dockers/base-conda/Dockerfile +++ b/dockers/base-conda/Dockerfile @@ -40,7 +40,9 @@ RUN apt-get update -qq && \ build-essential \ cmake \ git \ + wget \ curl \ + unzip \ ca-certificates \ && \ diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile index bdaf271f2b854..e6b0c5061c02c 100644 --- a/dockers/base-cuda/Dockerfile +++ b/dockers/base-cuda/Dockerfile @@ -45,6 +45,8 @@ RUN apt-get update -qq && \ cmake \ git \ wget \ + curl \ + unzip \ ca-certificates \ software-properties-common \ && \ diff --git a/dockers/tpu-tests/Dockerfile b/dockers/tpu-tests/Dockerfile index 464f7fd8f309e..9ba8f98d440a2 100644 --- a/dockers/tpu-tests/Dockerfile +++ b/dockers/tpu-tests/Dockerfile @@ -23,6 +23,12 @@ MAINTAINER PyTorchLightning COPY ./ ./pytorch-lightning/ +# Pull the legacy checkpoints +RUN cd pytorch-lightning && \ + wget https://pl-public-data.s3.amazonaws.com/legacy/checkpoints.zip -P legacy/ && \ + unzip -o legacy/checkpoints.zip -d legacy/ && \ + ls -l legacy/checkpoints/ + # If using this image for tests, intall more dependencies and don"t delete the source code where the tests live. RUN \ # Install pytorch-lightning at the current PR, plus dependencies. diff --git a/legacy/checkpoints/.gitkeep b/legacy/checkpoints/.gitkeep new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/legacy/generate_checkpoints.sh b/legacy/generate_checkpoints.sh new file mode 100644 index 0000000000000..c9f4dabff46c5 --- /dev/null +++ b/legacy/generate_checkpoints.sh @@ -0,0 +1,40 @@ +#!/bin/bash +# Sample call: +# bash generate_checkpoints.sh 1.0.2 1.0.3 1.0.4 + +LEGACY_PATH="$( cd "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" + +echo $LEGACY_PATH +# install some PT version here so it does not need to reinstalled for each env +pip install virtualenv "torch==1.5" --quiet --no-cache-dir + +ENV_PATH="$LEGACY_PATH/vEnv" + +# iterate over all arguments assuming that each argument is version +for ver in "$@" +do + echo "processing version: $ver" + # mkdir "$LEGACY_PATH/$ver" + + # create local env + echo $ENV_PATH + virtualenv $ENV_PATH --system-site-packages + # activate and install PL version + source "$ENV_PATH/bin/activate" + pip install "pytorch_lightning==$ver" --quiet -U --no-cache-dir + + python --version + pip --version + pip list | grep torch + + python "$LEGACY_PATH/zero_training.py" + cp "$LEGACY_PATH/zero_training.py" ${LEGACY_PATH}/checkpoints/${ver} + + mv ${LEGACY_PATH}/checkpoints/${ver}/lightning_logs/version_0/checkpoints/*.ckpt ${LEGACY_PATH}/checkpoints/${ver}/ + rm -rf ${LEGACY_PATH}/checkpoints/${ver}/lightning_logs + + deactivate + # clear env + rm -rf $ENV_PATH + +done diff --git a/legacy/zero_training.py b/legacy/zero_training.py new file mode 100644 index 0000000000000..4e4952a3bb1db --- /dev/null +++ b/legacy/zero_training.py @@ -0,0 +1,92 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import os + +import torch +from torch.utils.data import Dataset + +import pytorch_lightning as pl + +PATH_LEGACY = os.path.dirname(__file__) + + +class RandomDataset(Dataset): + def __init__(self, size, length: int = 100): + self.len = length + self.data = torch.randn(length, size) + + def __getitem__(self, index): + return self.data[index] + + def __len__(self): + return self.len + + +class DummyModel(pl.LightningModule): + + def __init__(self): + super().__init__() + self.layer = torch.nn.Linear(32, 2) + + def forward(self, x): + return self.layer(x) + + def _loss(self, batch, prediction): + # An arbitrary loss to have a loss that updates the model weights during `Trainer.fit` calls + return torch.nn.functional.mse_loss(prediction, torch.ones_like(prediction)) + + def _step(self, batch, batch_idx): + output = self.layer(batch) + loss = self._loss(batch, output) + return loss + + def training_step(self, batch, batch_idx): + return self._step(batch, batch_idx) + + def validation_step(self, batch, batch_idx): + self._step(batch, batch_idx) + + def test_step(self, batch, batch_idx): + self._step(batch, batch_idx) + + def configure_optimizers(self): + optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1) + lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1) + return [optimizer], [lr_scheduler] + + def train_dataloader(self): + return torch.utils.data.DataLoader(RandomDataset(32, 64)) + + def val_dataloader(self): + return torch.utils.data.DataLoader(RandomDataset(32, 64)) + + def test_dataloader(self): + return torch.utils.data.DataLoader(RandomDataset(32, 64)) + + +def main_train(dir_path, max_epochs: int = 5): + + trainer = pl.Trainer( + default_root_dir=dir_path, + checkpoint_callback=True, + max_epochs=max_epochs, + ) + + model = DummyModel() + trainer.fit(model) + + +if __name__ == '__main__': + path_dir = os.path.join(PATH_LEGACY, 'checkpoints', str(pl.__version__)) + main_train(path_dir) diff --git a/setup.py b/setup.py index c548d508ab434..dd36842d84a38 100755 --- a/setup.py +++ b/setup.py @@ -69,7 +69,7 @@ url=pytorch_lightning.__homepage__, download_url='https://github.com/PyTorchLightning/pytorch-lightning', license=pytorch_lightning.__license__, - packages=find_packages(exclude=['tests', 'tests/*', 'benchmarks']), + packages=find_packages(exclude=['tests', 'tests/*', 'benchmarks', 'legacy', 'legacy/*']), long_description=_load_long_description(PATH_ROOT), long_description_content_type='text/markdown', diff --git a/tests/__init__.py b/tests/__init__.py index 1bb81c466e6eb..b4a7291dfec66 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -18,6 +18,8 @@ TEST_ROOT = os.path.dirname(__file__) PROJECT_ROOT = os.path.dirname(TEST_ROOT) TEMP_PATH = os.path.join(PROJECT_ROOT, 'test_temp') +DATASETS_PATH = os.path.join(PROJECT_ROOT, 'Datasets') +LEGACY_PATH = os.path.join(PROJECT_ROOT, 'legacy') # todo: this setting `PYTHONPATH` may not be used by other evns like Conda for import packages if PROJECT_ROOT not in os.getenv('PYTHONPATH', ""): diff --git a/tests/checkpointing/test_legacy_checkpoints.py b/tests/checkpointing/test_legacy_checkpoints.py new file mode 100644 index 0000000000000..cb9fe443a316b --- /dev/null +++ b/tests/checkpointing/test_legacy_checkpoints.py @@ -0,0 +1,54 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import glob +import os +import sys + +import pytest + +from pytorch_lightning import Trainer +from tests import LEGACY_PATH + +LEGACY_CHECKPOINTS_PATH = os.path.join(LEGACY_PATH, 'checkpoints') +CHECKPOINT_EXTENSION = ".ckpt" + + +# todo: add more legacy checkpoints :] +@pytest.mark.parametrize("pl_version", [ + "0.10.0", "1.0.0", "1.0.1", "1.0.2", "1.0.3", "1.0.4", "1.0.5", "1.0.6", "1.0.7", "1.0.8" +]) +def test_resume_legacy_checkpoints(tmpdir, pl_version): + path_dir = os.path.join(LEGACY_CHECKPOINTS_PATH, pl_version) + + # todo: make this as mock, so it is cleaner... + orig_sys_paths = list(sys.path) + sys.path.insert(0, path_dir) + from zero_training import DummyModel + + path_ckpts = sorted(glob.glob(os.path.join(path_dir, f'*{CHECKPOINT_EXTENSION}'))) + assert path_ckpts, 'No checkpoints found in folder "%s"' % path_dir + path_ckpt = path_ckpts[-1] + + model = DummyModel.load_from_checkpoint(path_ckpt) + trainer = Trainer(default_root_dir=tmpdir, max_epochs=6) + result = trainer.fit(model) + assert result + + # todo + # model = DummyModel() + # trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, resume_from_checkpoint=path_ckpt) + # result = trainer.fit(model) + # assert result + + sys.path = orig_sys_paths From d510707bc99d43dd2cfd877428d9cc16af8b4074 Mon Sep 17 00:00:00 2001 From: chaton Date: Fri, 8 Jan 2021 18:05:22 +0100 Subject: [PATCH 083/136] [bug-fix] Call transfer_batch_to_device in DDPlugin (#5195) * hacking out * update * remove useless on_before_forward * update * remove overriden * iremove os * use on_before_forward * resolve flake8 * add test * update * add single_process_per_device * resolve flake8 * update * resolve * update * update * update * add comment * resolve bug with sharded * update * remove property * update * resolve test * resolve bug * update on comments * update doc * Update pytorch_lightning/core/hooks.py Co-authored-by: Rohit Gupta * update on comments * Update pytorch_lightning/plugins/ddp_plugin.py Co-authored-by: Rohit Gupta * Update pytorch_lightning/plugins/ddp_plugin.py Co-authored-by: Rohit Gupta * resolve pep8 * add device_ids to pipe * update on comments * update * resolve * update * update * update Co-authored-by: Ubuntu Co-authored-by: Rohit Gupta Co-authored-by: Sean Naren --- CHANGELOG.md | 3 ++ .../accelerators/ddp2_accelerator.py | 3 +- .../accelerators/ddp_accelerator.py | 3 +- .../accelerators/ddp_cpu_spawn_accelerator.py | 3 +- .../accelerators/ddp_hpc_accelerator.py | 3 +- .../accelerators/ddp_spawn_accelerator.py | 3 +- pytorch_lightning/core/hooks.py | 9 ++-- pytorch_lightning/plugins/ddp_plugin.py | 26 +++++----- .../plugins/ddp_sequential_plugin.py | 2 +- pytorch_lightning/plugins/sharded_plugin.py | 7 +-- pytorch_lightning/trainer/training_loop.py | 1 - pytorch_lightning/utilities/apply_func.py | 8 ++-- tests/models/test_hooks.py | 48 ++++++++++++++++++- tests/models/test_sync_batchnorm.py | 4 +- tests/special_tests.sh | 1 + 15 files changed, 90 insertions(+), 34 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4ca2165c8f77e..be7585ab1fc24 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,9 @@ 
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Fixed +- Fixed `transfer_batch_to_device` for DDP with `len(devices_ids) == 1` ([#5195](https://github.com/PyTorchLightning/pytorch-lightning/pull/5195)) + + ## [1.1.3] - 2021-01-05 diff --git a/pytorch_lightning/accelerators/ddp2_accelerator.py b/pytorch_lightning/accelerators/ddp2_accelerator.py index 2e864029f8767..46d944a35cb62 100644 --- a/pytorch_lightning/accelerators/ddp2_accelerator.py +++ b/pytorch_lightning/accelerators/ddp2_accelerator.py @@ -26,7 +26,7 @@ from pytorch_lightning.distributed.dist import LightningDistributed from pytorch_lightning.plugins.ddp_plugin import DDPPlugin from pytorch_lightning.plugins.rpc_plugin import RPCPlugin -from pytorch_lightning.utilities import HYDRA_AVAILABLE, AMPType +from pytorch_lightning.utilities import AMPType, HYDRA_AVAILABLE from pytorch_lightning.utilities.distributed import all_gather_ddp_if_available, rank_zero_only, sync_ddp_if_available if HYDRA_AVAILABLE: @@ -213,6 +213,7 @@ def ddp_train(self, process_idx, mp_queue, model): def configure_ddp( self, model: LightningModule, device_ids: List[int] ) -> DistributedDataParallel: + self.ddp_plugin.device_ids = device_ids model = self.ddp_plugin.configure_ddp(model, device_ids) return model diff --git a/pytorch_lightning/accelerators/ddp_accelerator.py b/pytorch_lightning/accelerators/ddp_accelerator.py index da9eb2d3ea937..1f1f1f42f52ff 100644 --- a/pytorch_lightning/accelerators/ddp_accelerator.py +++ b/pytorch_lightning/accelerators/ddp_accelerator.py @@ -30,7 +30,7 @@ from pytorch_lightning.distributed.dist import LightningDistributed from pytorch_lightning.plugins.ddp_plugin import DDPPlugin from pytorch_lightning.plugins.rpc_plugin import RPCPlugin -from pytorch_lightning.utilities import HYDRA_AVAILABLE, AMPType +from pytorch_lightning.utilities import AMPType, HYDRA_AVAILABLE from pytorch_lightning.utilities.distributed import ( all_gather_ddp_if_available, find_free_network_port, @@ -314,6 +314,7 @@ def ddp_train(self, process_idx, model): def configure_ddp( self, model: LightningModule, device_ids: List[int] ) -> DistributedDataParallel: + self.ddp_plugin.device_ids = device_ids model = self.ddp_plugin.configure_ddp(model, device_ids) return model diff --git a/pytorch_lightning/accelerators/ddp_cpu_spawn_accelerator.py b/pytorch_lightning/accelerators/ddp_cpu_spawn_accelerator.py index 91a6dee484f30..cc178dc14b49d 100644 --- a/pytorch_lightning/accelerators/ddp_cpu_spawn_accelerator.py +++ b/pytorch_lightning/accelerators/ddp_cpu_spawn_accelerator.py @@ -26,7 +26,7 @@ from pytorch_lightning.distributed.dist import LightningDistributed from pytorch_lightning.plugins.ddp_plugin import DDPPlugin from pytorch_lightning.plugins.rpc_plugin import RPCPlugin -from pytorch_lightning.utilities import HYDRA_AVAILABLE, AMPType +from pytorch_lightning.utilities import AMPType, HYDRA_AVAILABLE from pytorch_lightning.utilities.distributed import ( all_gather_ddp_if_available, find_free_network_port, @@ -241,6 +241,7 @@ def transfer_distrib_spawn_state_on_fit_end(self, model, mp_queue, results): def configure_ddp( self, model: LightningModule, device_ids: List[int] ) -> DistributedDataParallel: + self.ddp_plugin.device_ids = device_ids model = self.ddp_plugin.configure_ddp(model, device_ids) return model diff --git a/pytorch_lightning/accelerators/ddp_hpc_accelerator.py b/pytorch_lightning/accelerators/ddp_hpc_accelerator.py index b257884e34aef..c2915b9d570bb 100644 --- 
a/pytorch_lightning/accelerators/ddp_hpc_accelerator.py +++ b/pytorch_lightning/accelerators/ddp_hpc_accelerator.py @@ -26,7 +26,7 @@ from pytorch_lightning.distributed.dist import LightningDistributed from pytorch_lightning.plugins.ddp_plugin import DDPPlugin from pytorch_lightning.plugins.rpc_plugin import RPCPlugin -from pytorch_lightning.utilities import HYDRA_AVAILABLE, AMPType +from pytorch_lightning.utilities import AMPType, HYDRA_AVAILABLE from pytorch_lightning.utilities.distributed import all_gather_ddp_if_available, rank_zero_only, sync_ddp_if_available if HYDRA_AVAILABLE: @@ -205,6 +205,7 @@ def ddp_train(self, process_idx, model): def configure_ddp( self, model: LightningModule, device_ids: List[int] ) -> DistributedDataParallel: + self.ddp_plugin.device_ids = device_ids model = self.ddp_plugin.configure_ddp(model, device_ids) return model diff --git a/pytorch_lightning/accelerators/ddp_spawn_accelerator.py b/pytorch_lightning/accelerators/ddp_spawn_accelerator.py index a49e17fc0b31d..f35b42342d88a 100644 --- a/pytorch_lightning/accelerators/ddp_spawn_accelerator.py +++ b/pytorch_lightning/accelerators/ddp_spawn_accelerator.py @@ -27,7 +27,7 @@ from pytorch_lightning.distributed import LightningDistributed from pytorch_lightning.plugins.ddp_plugin import DDPPlugin from pytorch_lightning.plugins.rpc_plugin import RPCPlugin -from pytorch_lightning.utilities import HYDRA_AVAILABLE, AMPType +from pytorch_lightning.utilities import AMPType, HYDRA_AVAILABLE from pytorch_lightning.utilities.cloud_io import atomic_save from pytorch_lightning.utilities.cloud_io import load as pl_load from pytorch_lightning.utilities.distributed import ( @@ -273,6 +273,7 @@ def transfer_distrib_spawn_state_on_fit_end(self, model, mp_queue, results): def configure_ddp( self, model: LightningModule, device_ids: List[int] ) -> DistributedDataParallel: + self.ddp_plugin.device_ids = device_ids model = self.ddp_plugin.configure_ddp(model, device_ids) return model diff --git a/pytorch_lightning/core/hooks.py b/pytorch_lightning/core/hooks.py index f24a4ce8beb8a..4a1eeb4e9f608 100644 --- a/pytorch_lightning/core/hooks.py +++ b/pytorch_lightning/core/hooks.py @@ -17,10 +17,11 @@ from typing import Any, Dict, List, Optional, Union import torch -from pytorch_lightning.utilities import move_data_to_device, rank_zero_warn from torch.optim.optimizer import Optimizer from torch.utils.data import DataLoader +from pytorch_lightning.utilities import move_data_to_device, rank_zero_warn + class ModelHooks: """Hooks to be used in LightningModule.""" @@ -539,9 +540,9 @@ def transfer_batch_to_device(self, batch, device) any other device than the one passed in as argument (unless you know what you are doing). Note: - This hook only runs on single GPU training (no data-parallel). If you need multi-GPU support - for your custom batch objects, you need to define your custom - :class:`~torch.nn.parallel.DistributedDataParallel` or + This hook only runs on single GPU training and DDP. + If you need multi-GPU support for your custom batch objects in ``dp`` or ``ddp2``, + you need to define your custom :class:`~torch.nn.parallel.DistributedDataParallel` or :class:`~pytorch_lightning.overrides.data_parallel.LightningDistributedDataParallel` and override :meth:`~pytorch_lightning.core.lightning.LightningModule.configure_ddp`. 
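As a rough illustration of the hook documented in the hunk above, a custom batch object only needs a `to()` method plus a matching `transfer_batch_to_device` override. This is an editorial sketch, not part of the patch itself: `CustomBatch` and `LitModel` are made-up names that mirror the `CustomBatch` object used by the new `test_transfer_batch_hook_ddp` test added later in this patch.

```python
# Sketch only (assumed names, mirroring the test in this patch): a custom batch
# type that `transfer_batch_to_device` knows how to move between devices.
from pytorch_lightning import LightningModule


class CustomBatch:
    def __init__(self, data):
        self.samples = data[0]

    def to(self, device, **kwargs):
        # move the wrapped tensor to the target device
        self.samples = self.samples.to(device, **kwargs)
        return self


class LitModel(LightningModule):
    def transfer_batch_to_device(self, batch, device):
        if isinstance(batch, CustomBatch):
            return batch.to(device)
        # anything else falls back to the default moving logic
        return super().transfer_batch_to_device(batch, device)
```

With the `DDPPlugin` change below, DDP running a single process per device routes each batch through this hook in `on_before_forward`, so `batch.samples` is already on `model.device` by the time `training_step` runs.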
diff --git a/pytorch_lightning/plugins/ddp_plugin.py b/pytorch_lightning/plugins/ddp_plugin.py index 281074cb37813..6d5ad1e9e2119 100644 --- a/pytorch_lightning/plugins/ddp_plugin.py +++ b/pytorch_lightning/plugins/ddp_plugin.py @@ -1,7 +1,8 @@ import os from contextlib import contextmanager -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Union +import torch import torch.distributed as torch_distrib from torch.optim import Optimizer @@ -47,7 +48,7 @@ def configure_ddp( def configure_ddp(self, model, device_ids): model = LightningDistributedDataParallel( - model, device_ids=device_ids, find_unused_parameters=True + model, device_ids=device_ids, find_unused_parameters=False ) return model @@ -59,9 +60,9 @@ def configure_ddp(self, model, device_ids): the model wrapped in LightningDistributedDataParallel """ - # if unset, default `find_unused_parameters` `True` + # if unset, default `find_unused_parameters` `False` self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get( - "find_unused_parameters", True + "find_unused_parameters", False ) model = LightningDistributedDataParallel( model, @@ -91,22 +92,23 @@ def init_ddp_connection( torch_backend, rank=global_rank, world_size=world_size ) + @property + def is_running_single_process_per_device(self) -> bool: + # objects do not need to be scattered in single process per device, move objects upfront to device + # This property is used in ``self.on_before_forward`` function. + return self.device_ids is not None and len(self.device_ids) == 1 + def on_before_forward(self, model: LightningModule, *args): """ - Override to handle custom input to device logic. For DDP, no logic is required as this is handled internally - within the DDP wrapper. - - Example:: - - def on_before_forward(self, model, *args): - batch, batch_idx = args - return batch.to(model.device) + Override to handle custom edge case. Args: args: Inputs to the model. model: Model to train. Returns: args moved to correct device if needed. """ + if self.is_running_single_process_per_device: + args = model.transfer_batch_to_device(args, model.device) return args def optimizer_state(self, optimizer: Optimizer) -> dict: diff --git a/pytorch_lightning/plugins/ddp_sequential_plugin.py b/pytorch_lightning/plugins/ddp_sequential_plugin.py index cb8740742db73..4d2835c518b2d 100644 --- a/pytorch_lightning/plugins/ddp_sequential_plugin.py +++ b/pytorch_lightning/plugins/ddp_sequential_plugin.py @@ -19,8 +19,8 @@ from torch import nn from torch.nn.parallel import DistributedDataParallel -from pytorch_lightning import LightningModule from pytorch_lightning import _logger as log +from pytorch_lightning import LightningModule from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel from pytorch_lightning.plugins.rpc_plugin import RPCPlugin from pytorch_lightning.utilities import FAIRSCALE_PIPE_AVAILABLE, rank_zero_only diff --git a/pytorch_lightning/plugins/sharded_plugin.py b/pytorch_lightning/plugins/sharded_plugin.py index 937538561ccdd..b87a2c2a389ef 100644 --- a/pytorch_lightning/plugins/sharded_plugin.py +++ b/pytorch_lightning/plugins/sharded_plugin.py @@ -11,13 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import List, Optional, Union, Any +from typing import Any, List, Optional, Union from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.core.optimizer import is_lightning_optimizer from pytorch_lightning.plugins.ddp_plugin import DDPPlugin from pytorch_lightning.plugins.sharded_native_amp_plugin import ShardedNativeAMPPlugin -from pytorch_lightning.utilities import FAIRSCALE_AVAILABLE, AMPType, rank_zero_only +from pytorch_lightning.utilities import AMPType, FAIRSCALE_AVAILABLE, rank_zero_only from pytorch_lightning.utilities.exceptions import MisconfigurationException if FAIRSCALE_AVAILABLE: @@ -42,9 +42,6 @@ def optimizer_state(self, optimizer: 'OSS') -> Optional[dict]: optimizer.consolidate_state_dict() return self._optim_state_dict(optimizer) - def on_before_forward(self, model: LightningModule, *args): - return model.transfer_batch_to_device(args, model.trainer.root_gpu) - def _check_fairscale(self): if not FAIRSCALE_AVAILABLE: raise MisconfigurationException( diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 0d99b071d4567..6ae0fc9af8fbd 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - from contextlib import contextmanager from copy import copy, deepcopy diff --git a/pytorch_lightning/utilities/apply_func.py b/pytorch_lightning/utilities/apply_func.py index 775c22dbbfa0a..76ac0a6c595aa 100644 --- a/pytorch_lightning/utilities/apply_func.py +++ b/pytorch_lightning/utilities/apply_func.py @@ -49,12 +49,14 @@ def apply_to_collection(data: Any, dtype: Union[type, tuple], function: Callable return function(data, *args, **kwargs) # Recursively apply to collection items - elif isinstance(data, Mapping): + if isinstance(data, Mapping): return elem_type({k: apply_to_collection(v, dtype, function, *args, **kwargs) for k, v in data.items()}) - elif isinstance(data, tuple) and hasattr(data, '_fields'): # named tuple + + if isinstance(data, tuple) and hasattr(data, '_fields'): # named tuple return elem_type(*(apply_to_collection(d, dtype, function, *args, **kwargs) for d in data)) - elif isinstance(data, Sequence) and not isinstance(data, str): + + if isinstance(data, Sequence) and not isinstance(data, str): return elem_type([apply_to_collection(d, dtype, function, *args, **kwargs) for d in data]) # data is neither of dtype, nor a collection diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index f3af5b745a380..8a5d2f667bc32 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -12,14 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import inspect +import os +from unittest.mock import MagicMock import pytest import torch -from unittest.mock import MagicMock from pytorch_lightning import Trainer from pytorch_lightning.accelerators.gpu_accelerator import GPUAccelerator -from tests.base import EvalModelTemplate, BoringModel +from tests.base import BoringModel, EvalModelTemplate, RandomDataset @pytest.mark.parametrize('max_steps', [1, 2, 3]) @@ -124,6 +125,49 @@ def transfer_batch_to_device(self, data, device): assert batch_gpu.samples.device == batch_gpu.targets.device == expected +@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") +@pytest.mark.skipif(not os.getenv("PL_RUNNING_SPECIAL_TESTS", '0') == '1', + reason="test should be run outside of pytest") +def test_transfer_batch_hook_ddp(tmpdir): + """ + Test custom data are properly moved to the right device using ddp + """ + + class CustomBatch: + + def __init__(self, data): + self.samples = data[0] + + def to(self, device, **kwargs): + self.samples = self.samples.to(device, **kwargs) + return self + + def collate_fn(batch): + return CustomBatch(batch) + + class TestModel(BoringModel): + def training_step(self, batch, batch_idx): + assert batch.samples.device == self.device + assert isinstance(batch_idx, int) + + def train_dataloader(self): + return torch.utils.data.DataLoader(RandomDataset(32, 64), collate_fn=collate_fn) + + model = TestModel() + model.validation_step = None + model.training_epoch_end = None + trainer = Trainer( + default_root_dir=tmpdir, + limit_train_batches=2, + limit_val_batches=0, + max_epochs=1, + weights_summary=None, + accelerator="ddp", + gpus=2, + ) + trainer.fit(model) + + @pytest.mark.parametrize( 'max_epochs,batch_idx_', [(2, 5), (3, 8), (4, 12)] diff --git a/tests/models/test_sync_batchnorm.py b/tests/models/test_sync_batchnorm.py index fd771c98635ab..dd54c6b5d654e 100644 --- a/tests/models/test_sync_batchnorm.py +++ b/tests/models/test_sync_batchnorm.py @@ -16,8 +16,9 @@ import torch.nn as nn import torch.nn.functional as F -from pytorch_lightning import Trainer, seed_everything, LightningModule +from pytorch_lightning import LightningModule, seed_everything, Trainer from pytorch_lightning.core.step_result import TrainResult +from pytorch_lightning.plugins.ddp_plugin import DDPPlugin from pytorch_lightning.utilities import FLOAT16_EPSILON from tests.base.datamodules import MNISTDataModule from tests.base.develop_utils import set_random_master_port @@ -108,6 +109,7 @@ def test_sync_batchnorm_ddp(tmpdir): sync_batchnorm=True, num_sanity_val_steps=0, replace_sampler_ddp=False, + plugins=[DDPPlugin(find_unused_parameters=True)] ) result = trainer.fit(model, dm) diff --git a/tests/special_tests.sh b/tests/special_tests.sh index 8d67cce28b39f..a86243628e914 100644 --- a/tests/special_tests.sh +++ b/tests/special_tests.sh @@ -21,3 +21,4 @@ python ${DEFAULTS} tests/plugins/test_ddp_sequential_plugin.py::test_ddp_sequent python ${DEFAULTS} tests/plugins/test_ddp_sequential_plugin.py::test_ddp_sequential_plugin_ddp_rpc_manual_amp python ${DEFAULTS} tests/plugins/test_ddp_sequential_plugin.py::test_ddp_sequential_plugin_ddp_rpc_automatic python ${DEFAULTS} tests/trainer/logging_tests/test_train_loop_logging_1_0.py::test_logging_sync_dist_true_ddp +python ${DEFAULTS} tests/models/test_hooks.py::test_transfer_batch_hook_ddp From f2e99d617f05ec65fded81ccc6d0d59807c47573 Mon Sep 17 00:00:00 2001 From: chaton Date: Fri, 8 Jan 2021 22:13:12 +0100 Subject: [PATCH 084/136] deprecate enable_pl_optimizer as it is 
not restored properly (#5244) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * update * clean test * still in progress * udpdate test * update * update * resolve flake * add test for zero_grad * update * works without accumulated_grad * update * update * resolve amp * revert back to True * update * clean tests * cleaned out * typo * update test * git repare bug * remove print * udpate * Fix formatting/optimizer imports * Refactor the test for cleanliness * Add vanilla model to the test, better var names * Fixed var names, let's clean up these mock tests * repare test * update test * resolve flake8 * add manual_optimization * update tests * resolve flake8 * add random accumulate_grad_batches * improve test * Update tests/trainer/optimization/test_parity_automatic_optimization.py Co-authored-by: Jirka Borovec * Update tests/trainer/optimization/test_parity_automatic_optimization.py Co-authored-by: Jirka Borovec * update * clean tests * correct bug * Apply suggestions from code review * format * adress comments * update on comments * wip * typo * depreceate enable_pl_optimizer * resolve latest bugs * update * resolve merge * add comment * Update pytorch_lightning/core/lightning.py Co-authored-by: Jirka Borovec * Update tests/deprecated_api/test_remove_1-3.py Co-authored-by: Jirka Borovec * Update pytorch_lightning/trainer/connectors/optimizer_connector.py Co-authored-by: Jirka Borovec * Update pytorch_lightning/trainer/trainer.py Co-authored-by: Jirka Borovec * Update pytorch_lightning/trainer/trainer.py Co-authored-by: Jirka Borovec * Update tests/trainer/optimization/test_parity_automatic_optimization.py Co-authored-by: Jirka Borovec * update on comments * update restore * add a property * remove setstate as not needed anymore * update test * provide optimizer to on_before_zero_grad * update on comments * update on comments * Update pytorch_lightning/trainer/trainer.py Co-authored-by: Adrian Wälchli * Update tests/trainer/optimization/test_parity_automatic_optimization.py Co-authored-by: Adrian Wälchli * Update tests/trainer/optimization/test_parity_automatic_optimization.py Co-authored-by: Adrian Wälchli * Update tests/trainer/optimization/test_parity_automatic_optimization.py Co-authored-by: Adrian Wälchli * mofidy import * update changelog * resolve flake8 * update * update * clean doc Co-authored-by: SeanNaren Co-authored-by: Ubuntu Co-authored-by: Jirka Borovec Co-authored-by: Jirka Borovec Co-authored-by: Adrian Wälchli Co-authored-by: Sean Naren --- CHANGELOG.md | 2 + README.md | 3 +- benchmarks/test_sharded_parity.py | 3 +- docs/source/new-project.rst | 3 +- docs/source/optimizers.rst | 31 ++++- docs/source/trainer.rst | 6 +- .../accelerators/cpu_accelerator.py | 4 +- .../accelerators/ddp2_accelerator.py | 2 - .../accelerators/ddp_accelerator.py | 4 +- .../accelerators/ddp_cpu_spawn_accelerator.py | 2 - .../accelerators/ddp_hpc_accelerator.py | 4 +- .../accelerators/ddp_spawn_accelerator.py | 2 - .../accelerators/dp_accelerator.py | 2 - .../accelerators/gpu_accelerator.py | 2 - .../accelerators/horovod_accelerator.py | 6 +- .../accelerators/tpu_accelerator.py | 4 +- pytorch_lightning/core/lightning.py | 7 +- pytorch_lightning/core/optimizer.py | 20 ++- .../plugins/ddp_sequential_plugin.py | 5 +- pytorch_lightning/plugins/native_amp.py | 2 +- pytorch_lightning/plugins/sharded_plugin.py | 3 +- .../trainer/configuration_validator.py | 17 --- .../trainer/connectors/optimizer_connector.py | 6 +- .../trainer/connectors/precision_connector.py | 3 
+- pytorch_lightning/trainer/optimizers.py | 7 +- pytorch_lightning/trainer/properties.py | 20 +-- pytorch_lightning/trainer/trainer.py | 5 +- pytorch_lightning/trainer/training_loop.py | 2 +- tests/callbacks/test_callbacks.py | 4 +- tests/checkpointing/test_model_checkpoint.py | 9 +- tests/checkpointing/test_torch_saving.py | 6 +- tests/core/test_lightning_module.py | 8 +- tests/core/test_lightning_optimizer.py | 12 +- tests/deprecated_api/test_remove_1-3.py | 5 + tests/models/test_amp.py | 8 +- tests/models/test_cpu.py | 23 +--- tests/models/test_horovod.py | 24 ++-- tests/models/test_restore.py | 19 +-- .../optimization/test_manual_optimization.py | 42 +----- .../test_parity_automatic_optimization.py | 120 ++++++++++++------ .../test_parity_manual_optimization.py | 10 +- tests/trainer/test_optimizers.py | 5 +- tests/trainer/test_trainer.py | 16 +-- 43 files changed, 218 insertions(+), 270 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index be7585ab1fc24..3c32b93cc0dec 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,6 +12,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
### Changed
+- Changed deprecated `enable_pl_optimizer=True` ([#5244](https://github.com/PyTorchLightning/pytorch-lightning/pull/5244))
+
### Deprecated
diff --git a/README.md b/README.md
index cd9eb7cf02fc2..73286edc2c53b 100644
--- a/README.md
+++ b/README.md
@@ -225,7 +225,8 @@ with tempfile.NamedTemporaryFile(suffix='.onnx', delete=False) as tmpfile:
```python
class LitAutoEncoder(pl.LightningModule):
def training_step(self, batch, batch_idx, opt_idx):
- (opt_a, opt_b) = self.optimizers()
+ # access your optimizers with use_pl_optimizer=False. Default is True
+ (opt_a, opt_b) = self.optimizers(use_pl_optimizer=True)
loss_a = ...
self.manual_backward(loss_a, opt_a)
diff --git a/benchmarks/test_sharded_parity.py b/benchmarks/test_sharded_parity.py
index fae343d921035..5d688a8b374ff 100644
--- a/benchmarks/test_sharded_parity.py
+++ b/benchmarks/test_sharded_parity.py
@@ -186,7 +186,8 @@ def train_dataloader(self):
class SeedTrainLoaderManualModel(SeedTrainLoaderModel):
def training_step(self, batch, batch_idx, optimizer_idx):
# manual
- (opt_a, opt_b) = self.optimizers()
+ # access your optimizers with use_pl_optimizer=False. Default is True
+ (opt_a, opt_b) = self.optimizers(use_pl_optimizer=True)
loss_1 = self.step(batch)
self.manual_backward(loss_1, opt_a)
diff --git a/docs/source/new-project.rst b/docs/source/new-project.rst
index 4c9c16e9faa0d..def273f7a8257 100644
--- a/docs/source/new-project.rst
+++ b/docs/source/new-project.rst
@@ -268,7 +268,8 @@ Now you own the train loop!
.. code-block:: python
def training_step(self, batch, batch_idx, opt_idx):
- (opt_a, opt_b, opt_c) = self.optimizers()
+ # access your optimizers with use_pl_optimizer=False. Default is True
+ (opt_a, opt_b, opt_c) = self.optimizers(use_pl_optimizer=True)
loss_a = self.generator(batch[0])
diff --git a/docs/source/optimizers.rst b/docs/source/optimizers.rst
index 5e96b5da0da8c..588bdefb367e3 100644
--- a/docs/source/optimizers.rst
+++ b/docs/source/optimizers.rst
@@ -28,8 +28,15 @@ to manually manage the optimization process. To do so, do the following:
.. code-block:: python
def training_step(self, batch, batch_idx, optimizer_idx):
- # ignore optimizer_idx
- (opt_g, opt_d) = self.optimizers()
+
+ # 1. ignore optimizer_idx
+ # 2. `use_pl_optimizer=True` means `opt_g` and `opt_d` will be of type `LightningOptimizer`
+ # `LightningOptimizer` simply wraps your optimizer and behaves the same way!
+ # When calling `optimizer.step`, `LightningOptimizer` will just handle TPU, AMP, accumulate_grad_batches, etc ... for you.
+
+ # access your optimizers with `use_pl_optimizer=False` or `optimizer.optimizer` when using use_pl_optimizer=True
+ # use_pl_optimizer=True is the default
+ (opt_g, opt_d) = self.optimizers(use_pl_optimizer=True)
# do anything you want
loss_a = ...
@@ -242,19 +249,29 @@ Here we add a learning-rate warm up
# update params
optimizer.step(closure=closure)
-The default ``optimizer_step`` is relying on the internal ``LightningOptimizer`` to properly perform a step.
+.. note:: The default ``optimizer_step`` relies on the internal ``LightningOptimizer`` to properly perform a step.
It handles TPUs, AMP, accumulate_grad_batches, zero_grad, and much more ...
.. testcode::
- from pytorch_lightning.core.optimizer import LightningOptimizer
+ # function hook in LightningModule
+ def optimizer_step(self, current_epoch, batch_nb, optimizer, optimizer_idx, closure, on_tpu=False, using_native_amp=False, using_lbfgs=False):
+ optimizer.step(closure=closure)
+
+.. note:: To access your wrapped Optimizer from ``LightningOptimizer``, do as follows.
+
+.. testcode::
# function hook in LightningModule
def optimizer_step(self, current_epoch, batch_nb, optimizer, optimizer_idx, closure, on_tpu=False, using_native_amp=False, using_lbfgs=False):
- if not isinstance(optimizer, LightningOptimizer):
- # wraps into LightingOptimizer only for running step
- optimizer = LightningOptimizer.to_lightning_optimizer(optimizer, self.trainer)
+
+ # ``optimizer`` is a ``LightningOptimizer`` wrapping the optimizer.
+ # To access it, do as follows:
+ optimizer = optimizer.optimizer
+
+ # run step. However, it won't work on TPU, AMP, etc...
optimizer.step(closure=closure)
+
----------
Using the closure functions for optimization
diff --git a/docs/source/trainer.rst b/docs/source/trainer.rst
index 8d42541a3fbb4..d461c30a20a6f 100644
--- a/docs/source/trainer.rst
+++ b/docs/source/trainer.rst
@@ -335,7 +335,8 @@ optimizer behavior
Example::
def training_step(self, batch, batch_idx):
- opt = self.optimizers()
+ # access your optimizers with use_pl_optimizer=False. Default is True
+ opt = self.optimizers(use_pl_optimizer=True)
loss = ...
self.manual_backward(loss, opt)
@@ -350,7 +351,8 @@ In the multi-optimizer case, ignore the optimizer_idx flag and use the optimizer
Example::
def training_step(self, batch, batch_idx, optimizer_idx):
- (opt_a, opt_b) = self.optimizers()
+ # access your optimizers with use_pl_optimizer=False. Default is True
+ (opt_a, opt_b) = self.optimizers(use_pl_optimizer=True)
gen_loss = ...
self.manual_backward(gen_loss, opt_a)
diff --git a/pytorch_lightning/accelerators/cpu_accelerator.py b/pytorch_lightning/accelerators/cpu_accelerator.py
index 25302cabbc70f..e034b209bf34c 100644
--- a/pytorch_lightning/accelerators/cpu_accelerator.py
+++ b/pytorch_lightning/accelerators/cpu_accelerator.py
@@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-from typing import Any, Optional, Union, Callable +from typing import Any, Callable, Optional, Union import torch @@ -48,8 +48,6 @@ def setup(self, model): # allow for lr schedulers as well self.setup_optimizers(model) - self.trainer.convert_to_lightning_optimizers() - self.trainer.model = model def train(self): diff --git a/pytorch_lightning/accelerators/ddp2_accelerator.py b/pytorch_lightning/accelerators/ddp2_accelerator.py index 46d944a35cb62..68af3f579a6e8 100644 --- a/pytorch_lightning/accelerators/ddp2_accelerator.py +++ b/pytorch_lightning/accelerators/ddp2_accelerator.py @@ -192,8 +192,6 @@ def ddp_train(self, process_idx, mp_queue, model): # 16-bit model = self.trainer.precision_connector.connect(model) - self.trainer.convert_to_lightning_optimizers() - # device ids change depending on the DDP setup device_ids = self.get_device_ids() diff --git a/pytorch_lightning/accelerators/ddp_accelerator.py b/pytorch_lightning/accelerators/ddp_accelerator.py index 1f1f1f42f52ff..f0d9f2171bf48 100644 --- a/pytorch_lightning/accelerators/ddp_accelerator.py +++ b/pytorch_lightning/accelerators/ddp_accelerator.py @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License import os +from os.path import abspath import subprocess import sys -from os.path import abspath from time import sleep from typing import Any, List, Optional, Union @@ -291,8 +291,6 @@ def ddp_train(self, process_idx, model): # 16-bit model = self.trainer.precision_connector.connect(model) - self.trainer.convert_to_lightning_optimizers() - # device ids change depending on the DDP setup device_ids = self.get_device_ids() diff --git a/pytorch_lightning/accelerators/ddp_cpu_spawn_accelerator.py b/pytorch_lightning/accelerators/ddp_cpu_spawn_accelerator.py index cc178dc14b49d..e7ef38c8df3b4 100644 --- a/pytorch_lightning/accelerators/ddp_cpu_spawn_accelerator.py +++ b/pytorch_lightning/accelerators/ddp_cpu_spawn_accelerator.py @@ -152,8 +152,6 @@ def ddp_train(self, process_idx, mp_queue, model): # 16-bit model = self.trainer.precision_connector.connect(model) - self.trainer.convert_to_lightning_optimizers() - # DDP spawn already spawned off each process... 
no need to do anything device_ids = self.get_device_ids() diff --git a/pytorch_lightning/accelerators/ddp_hpc_accelerator.py b/pytorch_lightning/accelerators/ddp_hpc_accelerator.py index c2915b9d570bb..c25e082ee348d 100644 --- a/pytorch_lightning/accelerators/ddp_hpc_accelerator.py +++ b/pytorch_lightning/accelerators/ddp_hpc_accelerator.py @@ -15,8 +15,8 @@ from typing import Any, List, Optional, Union import torch -import torch.distributed as torch_distrib import torch.distributed as dist +import torch.distributed as torch_distrib from torch.nn.parallel import DistributedDataParallel from pytorch_lightning import _logger as log @@ -183,8 +183,6 @@ def ddp_train(self, process_idx, model): # 16-bit model = self.trainer.precision_connector.connect(model) - self.trainer.convert_to_lightning_optimizers() - # device ids change depending on the DDP setup device_ids = self.get_device_ids() diff --git a/pytorch_lightning/accelerators/ddp_spawn_accelerator.py b/pytorch_lightning/accelerators/ddp_spawn_accelerator.py index f35b42342d88a..23783fada72f1 100644 --- a/pytorch_lightning/accelerators/ddp_spawn_accelerator.py +++ b/pytorch_lightning/accelerators/ddp_spawn_accelerator.py @@ -167,8 +167,6 @@ def ddp_train(self, process_idx, mp_queue, model, is_master=False, proc_offset=0 # 16-bit model = self.trainer.precision_connector.connect(model) - self.trainer.convert_to_lightning_optimizers() - # device ids change depending on the DDP setup device_ids = self.get_device_ids() diff --git a/pytorch_lightning/accelerators/dp_accelerator.py b/pytorch_lightning/accelerators/dp_accelerator.py index 834a920b505d9..fc01c4686f04f 100644 --- a/pytorch_lightning/accelerators/dp_accelerator.py +++ b/pytorch_lightning/accelerators/dp_accelerator.py @@ -65,8 +65,6 @@ def setup(self, model): if self.trainer.amp_backend: model = self.__init_half_precision(model) - self.trainer.convert_to_lightning_optimizers() - self.trainer.model = model def __init_torch_data_parallel(self, model): diff --git a/pytorch_lightning/accelerators/gpu_accelerator.py b/pytorch_lightning/accelerators/gpu_accelerator.py index 1310777e0d890..49f21e9e34816 100644 --- a/pytorch_lightning/accelerators/gpu_accelerator.py +++ b/pytorch_lightning/accelerators/gpu_accelerator.py @@ -54,8 +54,6 @@ def setup(self, model): # 16-bit model = self.trainer.precision_connector.connect(model) - self.trainer.convert_to_lightning_optimizers() - self.trainer.model = model def train(self): diff --git a/pytorch_lightning/accelerators/horovod_accelerator.py b/pytorch_lightning/accelerators/horovod_accelerator.py index 5895025673b9a..2013d75df7b1e 100644 --- a/pytorch_lightning/accelerators/horovod_accelerator.py +++ b/pytorch_lightning/accelerators/horovod_accelerator.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
from contextlib import ExitStack -from typing import Any, Optional, Union, Callable +from typing import Any, Callable, Optional, Union import torch from torch.optim.lr_scheduler import _LRScheduler @@ -20,7 +20,7 @@ from pytorch_lightning import _logger as log from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp from pytorch_lightning.cluster_environments import ClusterEnvironment -from pytorch_lightning.utilities import HOROVOD_AVAILABLE, AMPType +from pytorch_lightning.utilities import AMPType, HOROVOD_AVAILABLE from pytorch_lightning.utilities.distributed import rank_zero_only if HOROVOD_AVAILABLE: @@ -91,8 +91,6 @@ def _filter_named_parameters(model, optimizer): # 16-bit model = self.trainer.precision_connector.connect(model) - self.trainer.convert_to_lightning_optimizers() - # Update logger rank info from Horovod to avoid race conditions from different ranks # creating directories / writing files in the same locations. self.trainer.global_rank = hvd.rank() diff --git a/pytorch_lightning/accelerators/tpu_accelerator.py b/pytorch_lightning/accelerators/tpu_accelerator.py index 9d1eec5594d82..7dcfaae401ca7 100644 --- a/pytorch_lightning/accelerators/tpu_accelerator.py +++ b/pytorch_lightning/accelerators/tpu_accelerator.py @@ -26,11 +26,11 @@ from pytorch_lightning.core import LightningModule from pytorch_lightning.core.optimizer import LightningOptimizer from pytorch_lightning.utilities import ( - TPU_AVAILABLE, move_data_to_device, rank_zero_info, rank_zero_only, rank_zero_warn, + TPU_AVAILABLE, ) from pytorch_lightning.utilities.cloud_io import atomic_save from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -230,8 +230,6 @@ def __setup_tpu_training(self, model: LightningModule, trainer): f' global rank: {trainer.tpu_global_core_rank}' f' with XLA_USE_BF16={os.environ.get("XLA_USE_BF16")}') - self.trainer.convert_to_lightning_optimizers() - def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): # do backward pass if self.trainer.train_loop.automatic_optimization: diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 421fc5e5cf2ac..bd6784cc3b4bb 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -112,8 +112,11 @@ def __init__(self, *args, **kwargs): self._current_hook_fx_name = None self._current_dataloader_idx = None - def optimizers(self): - opts = self.trainer.optimizers + def optimizers(self, use_pl_optimizer: bool = True) -> Union[Optimizer, List[Optimizer], List[LightningOptimizer]]: + if use_pl_optimizer: + opts = list(self.trainer.lightning_optimizers.values()) + else: + opts = self.trainer.optimizers # single optimizer if isinstance(opts, list) and len(opts) == 1 and isinstance(opts[0], Optimizer): diff --git a/pytorch_lightning/core/optimizer.py b/pytorch_lightning/core/optimizer.py index f0b361de6133e..ed5e9490983b0 100644 --- a/pytorch_lightning/core/optimizer.py +++ b/pytorch_lightning/core/optimizer.py @@ -17,7 +17,7 @@ from torch.optim.optimizer import Optimizer -from pytorch_lightning.utilities import TPU_AVAILABLE +from pytorch_lightning.utilities import AMPType, TPU_AVAILABLE from pytorch_lightning.utilities.exceptions import MisconfigurationException if TPU_AVAILABLE: @@ -62,6 +62,10 @@ def __init__(self, self._accumulate_grad_batches = accumulate_grad_batches self._optimizer_idx = None + @property + def optimizer(self): + return self._optimizer + @property def defaults(self): return self._optimizer.defaults @@ -102,11 
+106,13 @@ def _on_trainer_init(self, trainer): break @classmethod - def to_lightning_optimizer(cls, optimizer, trainer): - if isinstance(optimizer, LightningOptimizer): - return optimizer - optimizer = cls(optimizer) - optimizer._on_trainer_init(trainer) + def _to_lightning_optimizer(cls, optimizer, trainer, opt_idx): + # apex overrides .step function and need to be wrapped on each step + if trainer.amp_backend == AMPType.APEX: + optimizer = cls(optimizer) + optimizer._on_trainer_init(trainer) + else: + optimizer = trainer.lightning_optimizers[opt_idx] return optimizer def _accumulated_batches_reached(self): @@ -148,7 +154,7 @@ def __optimizer_step(self, *args, closure: Optional[Callable] = None, profiler_n **kwargs ) - trainer.train_loop.on_before_zero_grad(self) + trainer.train_loop.on_before_zero_grad(optimizer) model.optimizer_zero_grad( trainer.current_epoch, diff --git a/pytorch_lightning/plugins/ddp_sequential_plugin.py b/pytorch_lightning/plugins/ddp_sequential_plugin.py index 4d2835c518b2d..069b1754fbce0 100644 --- a/pytorch_lightning/plugins/ddp_sequential_plugin.py +++ b/pytorch_lightning/plugins/ddp_sequential_plugin.py @@ -15,8 +15,8 @@ from typing import Any, List, Optional import torch -import torch.distributed as torch_distrib from torch import nn +import torch.distributed as torch_distrib from torch.nn.parallel import DistributedDataParallel from pytorch_lightning import _logger as log @@ -27,8 +27,8 @@ from pytorch_lightning.utilities.exceptions import MisconfigurationException if FAIRSCALE_PIPE_AVAILABLE: - import fairscale.nn.model_parallel as mpu from fairscale.nn import PipeRPCWrapper + import fairscale.nn.model_parallel as mpu from fairscale.nn.pipe import balance as pipe_balance from fairscale.nn.pipe import rpc as rpc_pipe from fairscale.nn.pipe.pipeline import PipelineStyle @@ -380,7 +380,6 @@ def register_optimizers(ctx, model): model.trainer.optimizers = optimizers model.trainer.lr_schedulers = lr_schedulers model.trainer.optimizer_frequencies = optimizer_frequencies - model.trainer.convert_to_lightning_optimizers() def run_optimizer(ctx, model): diff --git a/pytorch_lightning/plugins/native_amp.py b/pytorch_lightning/plugins/native_amp.py index 3d64fe91388b8..9df1ba3262afa 100644 --- a/pytorch_lightning/plugins/native_amp.py +++ b/pytorch_lightning/plugins/native_amp.py @@ -54,7 +54,7 @@ def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): # unscale gradient to allow analyze within `on_after_backward` if not self.trainer.train_loop.should_accumulate() and automatic_optimization: if isinstance(optimizer, LightningOptimizer): - self.trainer.scaler.unscale_(optimizer._optimizer) + self.trainer.scaler.unscale_(optimizer.optimizer) else: self.trainer.scaler.unscale_(optimizer) diff --git a/pytorch_lightning/plugins/sharded_plugin.py b/pytorch_lightning/plugins/sharded_plugin.py index b87a2c2a389ef..d989b6237ad72 100644 --- a/pytorch_lightning/plugins/sharded_plugin.py +++ b/pytorch_lightning/plugins/sharded_plugin.py @@ -63,7 +63,7 @@ def _reinit_with_fairscale_oss(self, trainer): optimizers = trainer.optimizers for x, optimizer in enumerate(optimizers): if is_lightning_optimizer(optimizer): - optimizer = optimizer._optimizer + optimizer = optimizer.optimizer if not isinstance(optimizer, OSS): optim_class = type(optimizer) zero_optimizer = OSS( @@ -73,7 +73,6 @@ def _reinit_with_fairscale_oss(self, trainer): ) optimizers[x] = zero_optimizer del optimizer - trainer.convert_to_lightning_optimizers() def get_model_from_plugin( self, diff --git 
a/pytorch_lightning/trainer/configuration_validator.py b/pytorch_lightning/trainer/configuration_validator.py index 21d6af043df02..20992255ba29e 100644 --- a/pytorch_lightning/trainer/configuration_validator.py +++ b/pytorch_lightning/trainer/configuration_validator.py @@ -73,17 +73,7 @@ def __verify_train_loop_configuration(self, model): trainer.overriden_optimizer_step = is_overridden('optimizer_step', model) trainer.overriden_optimizer_zero_grad = is_overridden('optimizer_zero_grad', model) - - enable_pl_optimizer = trainer._enable_pl_optimizer automatic_optimization = trainer.train_loop.automatic_optimization - if trainer.overriden_optimizer_step and not enable_pl_optimizer and automatic_optimization: - rank_zero_warn( - "When overriding `LightningModule` optimizer_step with" - " `Trainer(..., enable_pl_optimizer=False, ...)`," - " we won't be calling `.zero_grad` we can't assume when you call your `optimizer.step()`." - " For Lightning to take care of it, please use `Trainer(enable_pl_optimizer=True)`." - ) - going_to_accumulate_grad_batches = trainer.accumulation_scheduler.going_to_accumulate_grad_batches() has_overriden_optimization_functions = trainer.overriden_optimizer_step or trainer.overriden_optimizer_zero_grad @@ -94,13 +84,6 @@ def __verify_train_loop_configuration(self, model): ' It ensures optimizer_step or optimizer_zero_grad are called on every batch.' ) - if (enable_pl_optimizer) and trainer.overriden_optimizer_zero_grad and not automatic_optimization: - raise MisconfigurationException( - 'When overriding `LightningModule` optimizer_zero_grad' - ' and preserving model property `automatic_optimization` as True with' - ' `Trainer(enable_pl_optimizer=True, ...) is not supported' - ) - def __verify_eval_loop_configuration(self, model, eval_loop_name): step_name = f'{eval_loop_name}_step' diff --git a/pytorch_lightning/trainer/connectors/optimizer_connector.py b/pytorch_lightning/trainer/connectors/optimizer_connector.py index 8c352c8e5ffeb..8b23203e42bc3 100644 --- a/pytorch_lightning/trainer/connectors/optimizer_connector.py +++ b/pytorch_lightning/trainer/connectors/optimizer_connector.py @@ -20,7 +20,11 @@ def __init__(self, trainer): self.trainer = trainer def on_trainer_init(self, enable_pl_optimizer): - self.trainer._enable_pl_optimizer = enable_pl_optimizer + if enable_pl_optimizer is not None: + rank_zero_warn( + "Trainer argument `enable_pl_optimizer` is deprecated in v1.1.3. 
It will be removed in v1.3.0", + DeprecationWarning + ) self.trainer.lr_schedulers = [] self.trainer.optimizers = [] self.trainer.optimizer_frequencies = [] diff --git a/pytorch_lightning/trainer/connectors/precision_connector.py b/pytorch_lightning/trainer/connectors/precision_connector.py index 37d5315e5d11b..822c3ef634fdc 100644 --- a/pytorch_lightning/trainer/connectors/precision_connector.py +++ b/pytorch_lightning/trainer/connectors/precision_connector.py @@ -15,7 +15,7 @@ from pytorch_lightning import _logger as log from pytorch_lightning.plugins.apex import ApexPlugin from pytorch_lightning.plugins.native_amp import NativeAMPPlugin -from pytorch_lightning.utilities import APEX_AVAILABLE, NATIVE_AMP_AVAILABLE, AMPType, rank_zero_warn +from pytorch_lightning.utilities import AMPType, APEX_AVAILABLE, NATIVE_AMP_AVAILABLE, rank_zero_warn class PrecisionConnector: @@ -67,7 +67,6 @@ def _setup_amp_backend(self, amp_type: str): self.trainer.amp_backend = AMPType.APEX self.backend = ApexPlugin(self.trainer) log.warn("LightningOptimizer doesn't support Apex") - self.trainer._enable_pl_optimizer = False if not self.trainer.amp_backend: raise ModuleNotFoundError( diff --git a/pytorch_lightning/trainer/optimizers.py b/pytorch_lightning/trainer/optimizers.py index 974ee898ff00b..a8cb1e279984f 100644 --- a/pytorch_lightning/trainer/optimizers.py +++ b/pytorch_lightning/trainer/optimizers.py @@ -13,6 +13,7 @@ # limitations under the License. from abc import ABC +from collections import OrderedDict from typing import List, Optional, Tuple import torch @@ -88,8 +89,10 @@ def _convert_to_lightning_optimizer(trainer, optimizer): optimizer._on_trainer_init(trainer) return optimizer - if self._enable_pl_optimizer: - self.optimizers = [_convert_to_lightning_optimizer(self, opt) for opt in self.optimizers] + self._lightning_optimizers = { + opt_idx: _convert_to_lightning_optimizer(self, opt) + for opt_idx, opt in enumerate(self.optimizers) + } def configure_schedulers(self, schedulers: list, monitor: Optional[str] = None): # Convert each scheduler into dict structure with relevant information diff --git a/pytorch_lightning/trainer/properties.py b/pytorch_lightning/trainer/properties.py index 614c863fa7256..3fa2af79e5530 100644 --- a/pytorch_lightning/trainer/properties.py +++ b/pytorch_lightning/trainer/properties.py @@ -11,10 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import inspect -import os from abc import ABC from argparse import ArgumentParser, Namespace +import inspect +import os from typing import cast, List, Optional, Type, TypeVar, Union from pytorch_lightning.accelerators.accelerator import Accelerator @@ -59,6 +59,7 @@ class TrainerProperties(ABC): model_connector: ModelConnector checkpoint_connector: CheckpointConnector callbacks: List[Callback] + _lightning_optimizers = None @property def log_dir(self): @@ -258,16 +259,17 @@ def save_checkpoint(self, filepath, weights_only: bool = False): def get_model(self): return self.model_connector.get_model() + @property + def lightning_optimizers(self): + if self._lightning_optimizers is None: + self.convert_to_lightning_optimizers() + return self._lightning_optimizers + def __getstate__(self): - # unwrap optimizer - self.optimizers = [opt._optimizer if is_lightning_optimizer(opt) else opt for opt in self.optimizers] + # remove lightning_optimizers + self._lightning_optimizers = None return self.__dict__ - def __setstate__(self, d): - self.__dict__ = d - # wrap optimizers in enable_pl_optimzer is True - self.convert_to_lightning_optimizers() - @property def require_distributed_sampler(self): if self.accelerator_backend is not None: diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 06717c6333829..2c1867a21552d 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -134,7 +134,7 @@ def __init__( distributed_backend: Optional[str] = None, automatic_optimization: Optional[bool] = None, move_metrics_to_cpu: bool = False, - enable_pl_optimizer: bool = False, + enable_pl_optimizer: bool = None, # todo: remove in v1.3 ): r""" Customize every aspect of training via flags @@ -283,7 +283,8 @@ def __init__( enable_pl_optimizer: If True, each optimizer will be wrapped by `pytorch_lightning.core.optimizer.LightningOptimizer`. It allows Lightning to - handle AMP, TPU, accumulated_gradients, etc.. + handle AMP, TPU, accumulated_gradients, etc. + .. warning:: Currently deprecated and it will be removed in v1.3 """ super().__init__() self._device_type = DeviceType.CPU diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 6ae0fc9af8fbd..3c8a8d45d0411 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -499,7 +499,7 @@ def optimizer_step(self, optimizer, opt_idx, batch_idx, train_step_and_backward_ ' To request, please file a Github issue in PyTorch and tag @mcarilli') # wraps into LightingOptimizer only for running step - optimizer = LightningOptimizer.to_lightning_optimizer(optimizer, self.trainer) + optimizer = LightningOptimizer._to_lightning_optimizer(optimizer, self.trainer, opt_idx) # model hook model_ref.optimizer_step( diff --git a/tests/callbacks/test_callbacks.py b/tests/callbacks/test_callbacks.py index 070bb4e9f6989..53debcebeb7cd 100644 --- a/tests/callbacks/test_callbacks.py +++ b/tests/callbacks/test_callbacks.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
from unittest import mock -from unittest.mock import ANY, MagicMock, call +from unittest.mock import ANY, call, MagicMock from pytorch_lightning import Trainer from tests.base import BoringModel @@ -33,8 +33,6 @@ def test_trainer_callback_system(torch_save): limit_train_batches=3, limit_test_batches=2, progress_bar_refresh_rate=0, - # todo: enabled since internally we wrap the model for optimizer step, this should be fixed - enable_pl_optimizer=True ) # no call yet diff --git a/tests/checkpointing/test_model_checkpoint.py b/tests/checkpointing/test_model_checkpoint.py index 8d4a859a88784..3de26ef1a6fb6 100644 --- a/tests/checkpointing/test_model_checkpoint.py +++ b/tests/checkpointing/test_model_checkpoint.py @@ -636,8 +636,7 @@ def validation_epoch_end(self, outputs): @mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"}) -@pytest.mark.parametrize("enable_pl_optimizer", [False, True]) -def test_checkpoint_repeated_strategy(enable_pl_optimizer, tmpdir): +def test_checkpoint_repeated_strategy(tmpdir): """ This test validates that the checkpoint can be called when provided to callbacks list """ @@ -657,7 +656,6 @@ def validation_step(self, batch, batch_idx): limit_val_batches=2, limit_test_batches=2, callbacks=[checkpoint_callback], - enable_pl_optimizer=enable_pl_optimizer, weights_summary=None, progress_bar_refresh_rate=0, ) @@ -674,7 +672,6 @@ def validation_step(self, batch, batch_idx): limit_val_batches=2, limit_test_batches=2, resume_from_checkpoint=checkpoint_callback.best_model_path, - enable_pl_optimizer=enable_pl_optimizer, weights_summary=None, progress_bar_refresh_rate=0, ) @@ -685,8 +682,7 @@ def validation_step(self, batch, batch_idx): @mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"}) -@pytest.mark.parametrize("enable_pl_optimizer", [False, True]) -def test_checkpoint_repeated_strategy_extended(enable_pl_optimizer, tmpdir): +def test_checkpoint_repeated_strategy_extended(tmpdir): """ This test validates checkpoint can be called several times without increasing internally its global step if nothing run. 
@@ -731,7 +727,6 @@ def assert_checkpoint_log_dir(idx): limit_train_batches=limit_train_batches, limit_val_batches=3, limit_test_batches=4, - enable_pl_optimizer=enable_pl_optimizer, callbacks=[checkpoint_cb], ) trainer = pl.Trainer(**trainer_config) diff --git a/tests/checkpointing/test_torch_saving.py b/tests/checkpointing/test_torch_saving.py index 493aa0dabe126..b322cfe5a7fd3 100644 --- a/tests/checkpointing/test_torch_saving.py +++ b/tests/checkpointing/test_torch_saving.py @@ -22,15 +22,13 @@ from tests.base import BoringModel -@pytest.mark.parametrize("enable_pl_optimizer", [False, True]) -def test_model_torch_save(tmpdir, enable_pl_optimizer): +def test_model_torch_save(tmpdir): """Test to ensure torch save does not fail for model and trainer.""" model = BoringModel() num_epochs = 1 trainer = Trainer( default_root_dir=tmpdir, max_epochs=num_epochs, - enable_pl_optimizer=enable_pl_optimizer, ) temp_path = os.path.join(tmpdir, 'temp.pt') trainer.fit(model) @@ -39,8 +37,6 @@ def test_model_torch_save(tmpdir, enable_pl_optimizer): torch.save(trainer.model, temp_path) torch.save(trainer, temp_path) trainer = torch.load(temp_path) - is_lightning_optimizer = isinstance(trainer.optimizers[0], LightningOptimizer) - assert is_lightning_optimizer if enable_pl_optimizer else not is_lightning_optimizer @pytest.mark.skipif(platform.system() == "Windows", diff --git a/tests/core/test_lightning_module.py b/tests/core/test_lightning_module.py index 01319365d9051..64b68245ba66e 100644 --- a/tests/core/test_lightning_module.py +++ b/tests/core/test_lightning_module.py @@ -45,8 +45,7 @@ def optimizer_step(self, *_, **__): assert "It ensures optimizer_step or optimizer_zero_grad are called on every batch" in str(e) -@pytest.mark.parametrize("enable_pl_optimizer", [False, True]) -def test_automatic_optimization_num_calls(enable_pl_optimizer, tmpdir): +def test_automatic_optimization_num_calls(tmpdir): with patch("torch.optim.SGD.step") as sgd_step, \ patch("torch.optim.SGD.zero_grad") as sgd_zero_grad, \ @@ -90,7 +89,6 @@ def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, default_root_dir=tmpdir, limit_train_batches=8, accumulate_grad_batches=1, - enable_pl_optimizer=enable_pl_optimizer ) trainer.fit(model) @@ -101,8 +99,7 @@ def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, assert adam_zero_grad.call_count == 2 -@pytest.mark.parametrize("enable_pl_optimizer", [False, True]) -def test_params_groups_and_state_are_accessible(enable_pl_optimizer, tmpdir): +def test_params_groups_and_state_are_accessible(tmpdir): with patch("torch.optim.SGD.step") as sgd_step, \ patch("torch.optim.SGD.zero_grad") as sgd_zero_grad, \ @@ -139,7 +136,6 @@ def optimizer_step(self, current_epoch, batch_nb, optimizer, optimizer_idx, clos default_root_dir=tmpdir, limit_train_batches=8, accumulate_grad_batches=1, - enable_pl_optimizer=enable_pl_optimizer ) trainer.fit(model) diff --git a/tests/core/test_lightning_optimizer.py b/tests/core/test_lightning_optimizer.py index 530f20f86a3db..171cf00ad4f66 100644 --- a/tests/core/test_lightning_optimizer.py +++ b/tests/core/test_lightning_optimizer.py @@ -47,13 +47,12 @@ def configure_optimizers(self): limit_val_batches=1, max_epochs=1, weights_summary=None, - enable_pl_optimizer=True, ) trainer.fit(model) groups = "{'dampening': 0, 'initial_lr': 0.1, 'lr': 0.01, 'momentum': 0, 'nesterov': False, 'weight_decay': 0}" expected = f"LightningSGD(groups=[{groups}])" - assert trainer.optimizers[0].__repr__() == expected + assert 
trainer._lightning_optimizers[0].__repr__() == expected def test_lightning_optimizer_from_user(tmpdir): @@ -75,13 +74,12 @@ def configure_optimizers(self): limit_val_batches=1, max_epochs=1, weights_summary=None, - enable_pl_optimizer=True, ) trainer.fit(model) groups = "{'amsgrad': False, 'betas': (0.9, 0.999), 'eps': 1e-08, 'initial_lr': 0.1, 'lr': 0.01, 'weight_decay': 0}" expected = f"LightningAdam(groups=[{groups}])" - assert trainer.optimizers[0].__repr__() == expected + assert trainer._lightning_optimizers[0].__repr__() == expected @patch("torch.optim.Adam.step", autospec=True) @@ -129,7 +127,6 @@ def automatic_optimization(self) -> bool: limit_val_batches=1, max_epochs=1, weights_summary=None, - enable_pl_optimizer=True, ) trainer.fit(model) @@ -183,7 +180,6 @@ def automatic_optimization(self) -> bool: max_epochs=1, weights_summary=None, accumulate_grad_batches=2, - enable_pl_optimizer=True, ) trainer.fit(model) @@ -263,7 +259,6 @@ def configure_optimizers(self): limit_val_batches=1, max_epochs=1, weights_summary=None, - enable_pl_optimizer=True, ) trainer.fit(model) @@ -316,7 +311,6 @@ def configure_optimizers(self): limit_val_batches=1, max_epochs=1, weights_summary=None, - enable_pl_optimizer=True, ) trainer.fit(model) @@ -376,7 +370,6 @@ def configure_optimizers(self): limit_val_batches=1, max_epochs=1, weights_summary=None, - enable_pl_optimizer=True, ) trainer.fit(model) @@ -429,7 +422,6 @@ def configure_optimizers(self): limit_val_batches=1, max_epochs=1, weights_summary=None, - enable_pl_optimizer=True, ) trainer.fit(model) diff --git a/tests/deprecated_api/test_remove_1-3.py b/tests/deprecated_api/test_remove_1-3.py index 7ec69796b1e46..4a5bed4de9b55 100644 --- a/tests/deprecated_api/test_remove_1-3.py +++ b/tests/deprecated_api/test_remove_1-3.py @@ -135,3 +135,8 @@ def test_trainer_cli_profiler_remove_in_v1_3_0(cli_args, expected_parsed_arg, ex assert getattr(args, "profiler") == expected_parsed_arg trainer = Trainer.from_argparse_args(args) assert isinstance(trainer.profiler, expected_profiler) + + +def test_trainer_enable_pl_optimizer(tmpdir): + with pytest.deprecated_call(match='will be removed in v1.3'): + Trainer(enable_pl_optimizer=True) diff --git a/tests/models/test_amp.py b/tests/models/test_amp.py index 269a2069e4266..60da3ba55eba4 100644 --- a/tests/models/test_amp.py +++ b/tests/models/test_amp.py @@ -17,14 +17,14 @@ import pytest import torch -import tests.base.develop_pipelines as tpipes -import tests.base.develop_utils as tutils from pytorch_lightning import Trainer from pytorch_lightning.loggers import WandbLogger from pytorch_lightning.trainer.states import TrainerState from pytorch_lightning.utilities import APEX_AVAILABLE from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.base import EvalModelTemplate +import tests.base.develop_pipelines as tpipes +import tests.base.develop_utils as tutils @pytest.mark.skip(reason='dp + amp not supported currently') # TODO @@ -145,8 +145,7 @@ def test_amp_gpu_ddp_slurm_managed(tmpdir): assert trainer.slurm_connector.resolve_root_node_address('abc[23-24, 45-40, 40]') == 'abc23' -@pytest.mark.parametrize("enable_pl_optimizer", [False, True]) -def test_cpu_model_with_amp(enable_pl_optimizer, tmpdir): +def test_cpu_model_with_amp(tmpdir): """Make sure model trains on CPU.""" trainer_options = dict( default_root_dir=tmpdir, @@ -155,7 +154,6 @@ def test_cpu_model_with_amp(enable_pl_optimizer, tmpdir): limit_train_batches=0.4, limit_val_batches=0.4, precision=16, - 
enable_pl_optimizer=enable_pl_optimizer, ) model = EvalModelTemplate() diff --git a/tests/models/test_cpu.py b/tests/models/test_cpu.py index 2848ab2e74f3c..8fea2ab941418 100644 --- a/tests/models/test_cpu.py +++ b/tests/models/test_cpu.py @@ -11,23 +11,22 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from distutils.version import LooseVersion import os import platform -from distutils.version import LooseVersion import pytest import torch -import tests.base.develop_pipelines as tpipes -import tests.base.develop_utils as tutils from pytorch_lightning import Trainer from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint from pytorch_lightning.core.step_result import TrainResult from tests.base import EvalModelTemplate +import tests.base.develop_pipelines as tpipes +import tests.base.develop_utils as tutils -@pytest.mark.parametrize("enable_pl_optimizer", [False, True]) -def test_cpu_slurm_save_load(enable_pl_optimizer, tmpdir): +def test_cpu_slurm_save_load(tmpdir): """Verify model save/load/checkpoint on CPU.""" hparams = EvalModelTemplate.get_default_hparams() model = EvalModelTemplate(**hparams) @@ -44,7 +43,6 @@ def test_cpu_slurm_save_load(enable_pl_optimizer, tmpdir): limit_train_batches=0.2, limit_val_batches=0.2, callbacks=[ModelCheckpoint(dirpath=tmpdir)], - enable_pl_optimizer=enable_pl_optimizer, ) result = trainer.fit(model) real_global_step = trainer.global_step @@ -81,7 +79,6 @@ def test_cpu_slurm_save_load(enable_pl_optimizer, tmpdir): max_epochs=1, logger=logger, callbacks=[ModelCheckpoint(dirpath=tmpdir)], - enable_pl_optimizer=enable_pl_optimizer, ) model = EvalModelTemplate(**hparams) @@ -101,8 +98,7 @@ def assert_pred_same(): trainer.fit(model) -@pytest.mark.parametrize("enable_pl_optimizer", [False, True]) -def test_early_stopping_cpu_model(enable_pl_optimizer, tmpdir): +def test_early_stopping_cpu_model(tmpdir): """Test each of the trainer options.""" stopping = EarlyStopping(monitor='early_stop_on', min_delta=0.1) trainer_options = dict( @@ -114,7 +110,6 @@ def test_early_stopping_cpu_model(enable_pl_optimizer, tmpdir): track_grad_norm=2, limit_train_batches=0.1, limit_val_batches=0.1, - enable_pl_optimizer=enable_pl_optimizer, ) model = EvalModelTemplate() @@ -130,8 +125,7 @@ def test_early_stopping_cpu_model(enable_pl_optimizer, tmpdir): @pytest.mark.skipif((platform.system() == "Darwin" and LooseVersion(torch.__version__) < LooseVersion("1.3.0")), reason="Distributed training is not supported on MacOS before Torch 1.3.0") -@pytest.mark.parametrize("enable_pl_optimizer", [False, True]) -def test_multi_cpu_model_ddp(enable_pl_optimizer, tmpdir): +def test_multi_cpu_model_ddp(tmpdir): """Make sure DDP works.""" tutils.set_random_master_port() @@ -144,7 +138,6 @@ def test_multi_cpu_model_ddp(enable_pl_optimizer, tmpdir): gpus=None, num_processes=2, accelerator='ddp_cpu', - enable_pl_optimizer=enable_pl_optimizer, ) model = EvalModelTemplate() @@ -284,8 +277,7 @@ def test_cpu_model(tmpdir): tpipes.run_model_test(trainer_options, model, on_gpu=False) -@pytest.mark.parametrize("enable_pl_optimizer", [False, True]) -def test_all_features_cpu_model(enable_pl_optimizer, tmpdir): +def test_all_features_cpu_model(tmpdir): """Test each of the trainer options.""" trainer_options = dict( default_root_dir=tmpdir, @@ -297,7 +289,6 @@ def test_all_features_cpu_model(enable_pl_optimizer, tmpdir): max_epochs=1, limit_train_batches=0.4, 
limit_val_batches=0.4, - enable_pl_optimizer=enable_pl_optimizer, ) model = EvalModelTemplate() diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py index f47c13021edde..a047bfde6f7a2 100644 --- a/tests/models/test_horovod.py +++ b/tests/models/test_horovod.py @@ -20,18 +20,18 @@ import numpy as np import pytest -import torch from sklearn.metrics import accuracy_score +import torch -import tests.base.develop_pipelines as tpipes -import tests.base.develop_utils as tutils from pytorch_lightning import Trainer from pytorch_lightning.accelerators.horovod_accelerator import HorovodAccelerator from pytorch_lightning.core.step_result import EvalResult, Result, TrainResult from pytorch_lightning.metrics.classification.accuracy import Accuracy -from pytorch_lightning.utilities import APEX_AVAILABLE, NATIVE_AMP_AVAILABLE, HOROVOD_AVAILABLE, _module_available +from pytorch_lightning.utilities import _module_available, APEX_AVAILABLE, HOROVOD_AVAILABLE, NATIVE_AMP_AVAILABLE from tests.base import EvalModelTemplate from tests.base.boring_model import BoringModel +import tests.base.develop_pipelines as tpipes +import tests.base.develop_utils as tutils from tests.base.models import BasicGAN if HOROVOD_AVAILABLE: @@ -69,8 +69,7 @@ def _run_horovod(trainer_options, on_gpu=False): @pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows") -@pytest.mark.parametrize("enable_pl_optimizer", [False, True]) -def test_horovod_cpu(enable_pl_optimizer, tmpdir): +def test_horovod_cpu(tmpdir): """Test Horovod running multi-process on CPU.""" trainer_options = dict( default_root_dir=str(tmpdir), @@ -82,14 +81,12 @@ def test_horovod_cpu(enable_pl_optimizer, tmpdir): limit_val_batches=0.2, accelerator='horovod', deterministic=True, - enable_pl_optimizer=enable_pl_optimizer, ) _run_horovod(trainer_options) @pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows") -@pytest.mark.parametrize("enable_pl_optimizer", [False, True]) -def test_horovod_cpu_implicit(enable_pl_optimizer, tmpdir): +def test_horovod_cpu_implicit(tmpdir): """Test Horovod without specifying a backend, inferring from env set by `horovodrun`.""" trainer_options = dict( default_root_dir=str(tmpdir), @@ -100,7 +97,6 @@ def test_horovod_cpu_implicit(enable_pl_optimizer, tmpdir): limit_train_batches=0.4, limit_val_batches=0.2, deterministic=True, - enable_pl_optimizer=enable_pl_optimizer, ) _run_horovod(trainer_options) @@ -206,8 +202,7 @@ def validation_step(self, batch, *args, **kwargs): @pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows") -@pytest.mark.parametrize("enable_pl_optimizer", [False, True]) -def test_horovod_multi_optimizer(enable_pl_optimizer, tmpdir): +def test_horovod_multi_optimizer(tmpdir): model = BasicGAN(**EvalModelTemplate.get_default_hparams()) # fit model @@ -219,7 +214,6 @@ def test_horovod_multi_optimizer(enable_pl_optimizer, tmpdir): limit_val_batches=0.2, deterministic=True, accelerator='horovod', - enable_pl_optimizer=enable_pl_optimizer, ) result = trainer.fit(model) assert result == 1, 'model failed to complete' @@ -241,8 +235,7 @@ def get_optimizer_params(optimizer): @pytest.mark.skipif(not HOROVOD_AVAILABLE, reason="Horovod is unavailable") @pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows") -@pytest.mark.parametrize("enable_pl_optimizer", [False, True]) -def test_result_reduce_horovod(enable_pl_optimizer, tmpdir): +def 
test_result_reduce_horovod(tmpdir): """Make sure result logging works with Horovod. This test mirrors tests/core/test_results.py::_ddp_test_fn @@ -282,7 +275,6 @@ def training_epoch_end(self, outputs) -> None: max_epochs=1, log_every_n_steps=1, weights_summary=None, - enable_pl_optimizer=enable_pl_optimizer, ) trainer.fit(model) diff --git a/tests/models/test_restore.py b/tests/models/test_restore.py index f7773f63aa8c2..ded9deb0d0a45 100644 --- a/tests/models/test_restore.py +++ b/tests/models/test_restore.py @@ -11,11 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from copy import deepcopy import glob import logging as log import os import pickle -from copy import deepcopy import cloudpickle import pytest @@ -23,10 +23,10 @@ from torch.nn import functional as F from torch.utils.data import DataLoader +from pytorch_lightning import Callback, LightningModule, seed_everything, Trainer +from pytorch_lightning.callbacks import ModelCheckpoint import tests.base.develop_pipelines as tpipes import tests.base.develop_utils as tutils -from pytorch_lightning import Callback, LightningModule, Trainer, seed_everything -from pytorch_lightning.callbacks import ModelCheckpoint from tests.base import BoringModel, EvalModelTemplate, GenericEvalModelTemplate, TrialMNIST @@ -52,8 +52,7 @@ def on_train_end(self, trainer, pl_module): self._check_properties(trainer, pl_module) -@pytest.mark.parametrize("enable_pl_optimizer", [False, True]) -def test_model_properties_resume_from_checkpoint(enable_pl_optimizer, tmpdir): +def test_model_properties_resume_from_checkpoint(tmpdir): """ Test that properties like `current_epoch` and `global_step` in model and trainer are always the same. """ model = EvalModelTemplate() @@ -62,7 +61,6 @@ def test_model_properties_resume_from_checkpoint(enable_pl_optimizer, tmpdir): default_root_dir=tmpdir, max_epochs=1, logger=False, - enable_pl_optimizer=enable_pl_optimizer, callbacks=[checkpoint_callback, ModelTrainerPropertyParity()], # this performs the assertions ) trainer = Trainer(**trainer_args) @@ -99,8 +97,7 @@ def on_train_start(self, trainer, pl_module): self.callbacks = deepcopy(trainer.callbacks) -@pytest.mark.parametrize("enable_pl_optimizer", [False, True]) -def test_callbacks_state_resume_from_checkpoint(enable_pl_optimizer, tmpdir): +def test_callbacks_state_resume_from_checkpoint(tmpdir): """ Test that resuming from a checkpoint restores callbacks that persist state. """ model = EvalModelTemplate() callback_capture = CaptureCallbacksBeforeTraining() @@ -111,7 +108,6 @@ def get_trainer_args(): default_root_dir=tmpdir, max_steps=1, logger=False, - enable_pl_optimizer=enable_pl_optimizer, callbacks=[ checkpoint, callback_capture, @@ -138,11 +134,10 @@ def get_trainer_args(): assert before.best_model_score == after.best_model_score -@pytest.mark.parametrize("enable_pl_optimizer", [False, True]) -def test_callbacks_references_resume_from_checkpoint(enable_pl_optimizer, tmpdir): +def test_callbacks_references_resume_from_checkpoint(tmpdir): """ Test that resuming from a checkpoint sets references as expected. 
""" model = EvalModelTemplate() - args = {'default_root_dir': tmpdir, 'max_steps': 1, 'logger': False, "enable_pl_optimizer": enable_pl_optimizer} + args = {'default_root_dir': tmpdir, 'max_steps': 1, 'logger': False} # initial training checkpoint = ModelCheckpoint(dirpath=tmpdir, monitor="early_stop_on", save_last=True) diff --git a/tests/trainer/optimization/test_manual_optimization.py b/tests/trainer/optimization/test_manual_optimization.py index 33d14e852b285..f0d7c6d96914e 100644 --- a/tests/trainer/optimization/test_manual_optimization.py +++ b/tests/trainer/optimization/test_manual_optimization.py @@ -21,7 +21,7 @@ import torch.distributed as torch_distrib import torch.nn.functional as F -from pytorch_lightning import Trainer, seed_everything +from pytorch_lightning import seed_everything, Trainer from pytorch_lightning.utilities import APEX_AVAILABLE from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.base.boring_model import BoringModel @@ -454,7 +454,6 @@ def test_manual_optimization_and_return_tensor(tmpdir): amp_backend='native', accelerator="ddp_spawn", gpus=2, - enable_pl_optimizer=True ) trainer.fit(model) @@ -573,7 +572,6 @@ def automatic_optimization(self) -> bool: amp_backend='native', accumulate_grad_batches=4, gpus=1, - enable_pl_optimizer=True, ) trainer.fit(model) @@ -648,7 +646,6 @@ def automatic_optimization(self) -> bool: precision=16, amp_backend='native', gpus=1, - enable_pl_optimizer=True, ) trainer.fit(model) @@ -730,7 +727,6 @@ def automatic_optimization(self) -> bool: limit_val_batches=2, max_epochs=1, log_every_n_steps=1, - enable_pl_optimizer=True, ) trainer.fit(model) @@ -795,7 +791,6 @@ def automatic_optimization(self) -> bool: max_epochs=1, log_every_n_steps=1, accumulate_grad_batches=2, - enable_pl_optimizer=True, ) trainer.fit(model) @@ -851,7 +846,6 @@ def automatic_optimization(self) -> bool: max_epochs=1, log_every_n_steps=1, accumulate_grad_batches=2, - enable_pl_optimizer=True, ) trainer.fit(model) @@ -929,7 +923,6 @@ def automatic_optimization(self) -> bool: max_epochs=1, log_every_n_steps=1, accumulate_grad_batches=2, - enable_pl_optimizer=True, ) trainer.fit(model) @@ -1037,7 +1030,6 @@ def automatic_optimization(self) -> bool: max_epochs=1, log_every_n_steps=1, accumulate_grad_batches=2, - enable_pl_optimizer=True, gpus=2, accelerator="ddp", ) @@ -1048,35 +1040,3 @@ def automatic_optimization(self) -> bool: expected_calls = [call(closure=ANY, optim='adam')] * 2 mock_adam_step.assert_has_calls(expected_calls) - - -def test_step_with_misconfiguraiton_error_when_overriding_optimizer_zero_grad(tmpdir): - """ - Tests that `optimizer_zero_grad` in manual_optimization triggers a MisconfigurationException - """ - try: - class TestModel(BoringModel): - - def optimizer_zero_grad(self, *_): - pass - - @property - def automatic_optimization(self) -> bool: - return False - - model = TestModel() - model.val_dataloader = None - model.training_epoch_end = None - - limit_train_batches = 8 - trainer = Trainer( - default_root_dir=tmpdir, - limit_train_batches=limit_train_batches, - limit_val_batches=2, - max_epochs=1, - log_every_n_steps=1, - accumulate_grad_batches=2, - enable_pl_optimizer=True, - ) - except MisconfigurationException as e: - assert "`Trainer(enable_pl_optimizer=True, ...) 
is not supported" in str(e) diff --git a/tests/trainer/optimization/test_parity_automatic_optimization.py b/tests/trainer/optimization/test_parity_automatic_optimization.py index 4a1d6c384cd52..4f5cc855a3164 100644 --- a/tests/trainer/optimization/test_parity_automatic_optimization.py +++ b/tests/trainer/optimization/test_parity_automatic_optimization.py @@ -21,7 +21,9 @@ import torch from torch.optim import Optimizer +import pytorch_lightning as pl from pytorch_lightning import seed_everything, Trainer +from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.core.optimizer import LightningOptimizer from tests.base.boring_model import BoringModel @@ -35,7 +37,13 @@ class BaseParityAutomaticOptimizationModel(BoringModel): - def __init__(self, optimizer_cls, optimizer_is_mocked=False, accumulate_grad_batches=None): + def __init__( + self, + optimizer_cls, + optimizer_is_mocked=False, + accumulate_grad_batches=None, + lr=0.1 + ): super().__init__() self.optimizer_cls = optimizer_cls self.losses = [] @@ -44,6 +52,7 @@ def __init__(self, optimizer_cls, optimizer_is_mocked=False, accumulate_grad_bat self.optimizer_is_mocked = optimizer_is_mocked self.grad_checked = False self.accumulate_grad_batches = accumulate_grad_batches + self.lr = lr def on_before_zero_grad(self, optimizer): self.on_before_zero_grad_count += 1 @@ -51,7 +60,7 @@ def on_before_zero_grad(self, optimizer): self.grads.append(self.layer.weight.grad.clone()) def configure_optimizers(self): - optimizer = self.optimizer_cls(self.layer.parameters(), lr=0.1) + optimizer = self.optimizer_cls(self.layer.parameters(), lr=self.lr) assert isinstance(optimizer, Optimizer) return optimizer @@ -86,7 +95,7 @@ def optimizer_step( Override the optimizer step to define manual optimizer steps, as we use LightningOptimizer wrapper as standard """ # Get the unwrapped optimizer - optimizer = optimizer._optimizer + optimizer = optimizer.optimizer assert not isinstance(optimizer, LightningOptimizer) optimizer_closure() @@ -136,7 +145,7 @@ def optimizer_step( Override the optimizer step to define manual optimizer steps, as we use LightningOptimizer wrapper as standard """ # Get the unwrapped optimizer - optimizer = optimizer._optimizer + optimizer = optimizer.optimizer assert not isinstance(optimizer, LightningOptimizer) optimizer_closure() @@ -211,10 +220,8 @@ def test_lightning_optimizer_and_no_lightning_optimizer_equality_check_optim_cal with patch("torch.optim.SGD.step") as mock_sgd_step, \ patch("torch.optim.Adam.step") as mock_adam_step, \ - patch("torch.optim.AdamW.step") as mock_adamw_step, \ patch("torch.optim.SGD.zero_grad") as mock_sgd_zero_grad, \ - patch("torch.optim.Adam.zero_grad") as mock_adam_zero_grad, \ - patch("torch.optim.AdamW.zero_grad") as mock_adamw_zero_grad: + patch("torch.optim.Adam.zero_grad") as mock_adam_zero_grad: max_epochs = 2 limit_train_batches = 10 @@ -238,8 +245,62 @@ def test_lightning_optimizer_and_no_lightning_optimizer_equality_check_optim_cal assert mock_sgd_zero_grad.call_count == (expected_num_batches // accumulate_grad_batches) assert mock_sgd_step.call_count == mock_adam_step.call_count assert mock_sgd_step.call_count == mock_adam_step.call_count - assert mock_sgd_zero_grad.call_count == mock_adam_zero_grad.call_count - assert mock_sgd_zero_grad.call_count == mock_adamw_zero_grad.call_count + + +def train_with_restore(tmpdir, model_cls, restore_from=None): + # init model + if restore_from is not None: + seed_everything(42) + model = model_cls(torch.optim.Adam, 
accumulate_grad_batches=1, lr=10e-1) + ckpt_saver = ModelCheckpoint(dirpath=f"{tmpdir}/mckpt", save_last=True, save_weights_only=False) + # Initialize a trainer + trainer = pl.Trainer( + default_root_dir=tmpdir, + max_epochs=(1 + bool(restore_from)), + limit_train_batches=8, + callbacks=([ckpt_saver] if restore_from is None else []), + checkpoint_callback=(not restore_from), + resume_from_checkpoint=restore_from, + num_sanity_val_steps=0, + ) + + # Train the model + trainer.fit(model) + return ckpt_saver.best_model_path, model + + +def test_parity_checkpointing(tmpdir): + """ + This test assert that reloading a checkpoint and finetunning gives the same result + with / without LightningOptimizer + """ + + # Initial train run of the model. + seed_everything(0) + ckpt_path, first_epoch_pl_optimizer_model = train_with_restore( + tmpdir, + model_cls=BaseParityAutomaticOptimizationModel, + restore_from=None) + + assert "last" in ckpt_path + _, second_epoch_pl_optimizer_model = train_with_restore( + tmpdir, + model_cls=BaseParityAutomaticOptimizationModel, + restore_from=ckpt_path) + + seed_everything(0) + ckpt_path, first_epoch_pure_pytorch_optimizer_model = train_with_restore( + tmpdir, + model_cls=AutomaticOptimizationPurePytorchOptimizerModel, + restore_from=None) + + _, second_epoch_pure_pytorch_optimizer_model = train_with_restore( + tmpdir, + model_cls=AutomaticOptimizationPurePytorchOptimizerModel, + restore_from=ckpt_path) + + assert first_epoch_pl_optimizer_model.losses == first_epoch_pure_pytorch_optimizer_model.losses + assert second_epoch_pl_optimizer_model.losses == second_epoch_pure_pytorch_optimizer_model.losses def run_lightning_optimizer_equality( @@ -261,22 +322,12 @@ def run_lightning_optimizer_equality( torch.optim.SGD, expected_num_batches=expected_num_batches, optimizer_is_mocked=optimizer_is_mocked, - enable_pl_optimizer=True, - **trainer_kwargs, - ) - - no_pl_optimizer_initial_model_weights, no_pl_optimizer_model = train_specific_optimizer_model( - lightning_model_cls, - torch.optim.Adam if optimizer_is_mocked else torch.optim.SGD, - expected_num_batches=expected_num_batches, - optimizer_is_mocked=optimizer_is_mocked, - enable_pl_optimizer=False, # Disable pl optimizer **trainer_kwargs, ) pure_pytorch_optimizer_initial_model_weights, pure_pytorch_optimizer_model = train_specific_optimizer_model( vanilla_model_cls, - torch.optim.AdamW if optimizer_is_mocked else torch.optim.SGD, + torch.optim.Adam if optimizer_is_mocked else torch.optim.SGD, expected_num_batches=expected_num_batches, optimizer_is_mocked=optimizer_is_mocked, replace_optimizer_step_with_pure_pytorch=True, @@ -288,8 +339,6 @@ def run_lightning_optimizer_equality( assert_model_equality( pl_optimizer_initial_model_weights=pl_optimizer_initial_model_weights, pl_optimizer_model=pl_optimizer_model, - no_pl_optimizer_initial_model_weights=no_pl_optimizer_initial_model_weights, - no_pl_optimizer_model=no_pl_optimizer_model, pure_pytorch_optimizer_initial_model_weights=pure_pytorch_optimizer_initial_model_weights, pure_pytorch_optimizer_model=pure_pytorch_optimizer_model, expected_num_batches=expected_num_batches, @@ -300,35 +349,24 @@ def run_lightning_optimizer_equality( def assert_model_equality( pl_optimizer_initial_model_weights, pl_optimizer_model, - no_pl_optimizer_initial_model_weights, - no_pl_optimizer_model, pure_pytorch_optimizer_initial_model_weights, pure_pytorch_optimizer_model, expected_num_batches, precision, ): - assert torch.equal(pl_optimizer_initial_model_weights, 
no_pl_optimizer_initial_model_weights) assert torch.equal(pl_optimizer_initial_model_weights, pure_pytorch_optimizer_initial_model_weights) assert len(pl_optimizer_model.losses) == expected_num_batches assert pure_pytorch_optimizer_model.grad_checked - assert pure_pytorch_optimizer_model.losses == no_pl_optimizer_model.losses - assert not torch.isnan(torch.FloatTensor(no_pl_optimizer_model.losses)).any() - - assert torch.equal(torch.FloatTensor(no_pl_optimizer_model.losses), torch.FloatTensor(pl_optimizer_model.losses)) - assert no_pl_optimizer_model.on_before_zero_grad_count == pl_optimizer_model.on_before_zero_grad_count + assert not torch.isnan(torch.FloatTensor(pl_optimizer_model.losses)).any() - for pytorch_grad, no_pl_optim_grad, pl_optim_grad in zip(pure_pytorch_optimizer_model.grads, - no_pl_optimizer_model.grads, - pl_optimizer_model.grads): - assert torch.equal(no_pl_optim_grad, pl_optim_grad), 'Grad parameters are different' - assert torch.equal(pytorch_grad, no_pl_optim_grad), 'Grad parameters are different' + for pytorch_grad, pl_optim_grad in zip(pure_pytorch_optimizer_model.grads, + pl_optimizer_model.grads): + assert torch.equal(pytorch_grad, pl_optim_grad), 'Grad parameters are different' - for pytorch_weight, no_pl_optim_weight, pl_optim_weight in zip(pure_pytorch_optimizer_model.parameters(), - no_pl_optimizer_model.parameters(), - pl_optimizer_model.parameters()): - assert torch.equal(no_pl_optim_weight, pl_optim_weight), 'Model parameters are different' - assert torch.equal(pytorch_weight, no_pl_optim_weight), 'Model parameters are different' + for pytorch_weight, pl_optim_weight in zip(pure_pytorch_optimizer_model.parameters(), + pl_optimizer_model.parameters()): + assert torch.equal(pytorch_weight, pl_optim_weight), 'Model parameters are different' # train function @@ -336,7 +374,6 @@ def train_specific_optimizer_model( model_cls, optimizer_cls, expected_num_batches, - enable_pl_optimizer=False, optimizer_is_mocked=False, replace_optimizer_step_with_pure_pytorch=False, **trainer_kwargs, @@ -362,7 +399,6 @@ def train_specific_optimizer_model( model.training_epoch_end = None trainer = Trainer( - enable_pl_optimizer=enable_pl_optimizer, **trainer_kwargs ) trainer.fit(model) diff --git a/tests/trainer/optimization/test_parity_manual_optimization.py b/tests/trainer/optimization/test_parity_manual_optimization.py index 5d110b2fbdca7..08e4e9908f592 100644 --- a/tests/trainer/optimization/test_parity_manual_optimization.py +++ b/tests/trainer/optimization/test_parity_manual_optimization.py @@ -76,7 +76,7 @@ def training_step(self, batch, batch_idx): class ManualOptimizationPurePytorchOptimizerModel(BaseParityManualOptimizationModel): def training_step(self, batch, batch_idx): - optimizer = self.optimizers() + optimizer = self.optimizers(use_pl_optimizer=False) output = self.layer(batch) loss = self.loss(batch, output) self.losses.append(loss.detach().item()) @@ -104,7 +104,7 @@ def __init__(self, *args, **kwargs): self.scaler = torch.cuda.amp.GradScaler() def training_step(self, batch, batch_idx): - optimizer = self.optimizers() + optimizer = self.optimizers(use_pl_optimizer=False) with torch.cuda.amp.autocast(): output = self.layer(batch) loss = self.loss(batch, output) @@ -178,10 +178,8 @@ def test_lightning_optimizer_and_no_lightning_optimizer_equality_check_optim_cal with patch("torch.optim.SGD.step") as mock_sgd_step, \ patch("torch.optim.Adam.step") as mock_adam_step, \ - patch("torch.optim.AdamW.step") as mock_adamw_step, \ patch("torch.optim.SGD.zero_grad") as 
mock_sgd_zero_grad, \ - patch("torch.optim.Adam.zero_grad") as mock_adam_zero_grad, \ - patch("torch.optim.AdamW.zero_grad") as mock_adamw_zero_grad: + patch("torch.optim.Adam.zero_grad") as mock_adam_zero_grad: max_epochs = 2 limit_train_batches = 10 @@ -206,6 +204,4 @@ def test_lightning_optimizer_and_no_lightning_optimizer_equality_check_optim_cal assert mock_sgd_step.call_count == (expected_num_batches // accumulate_grad_batches) assert mock_sgd_zero_grad.call_count == (expected_num_batches // accumulate_grad_batches) assert mock_sgd_step.call_count == mock_adam_step.call_count - assert mock_sgd_step.call_count == mock_adam_step.call_count assert mock_sgd_zero_grad.call_count == mock_adam_zero_grad.call_count - assert mock_sgd_zero_grad.call_count == mock_adamw_zero_grad.call_count diff --git a/tests/trainer/test_optimizers.py b/tests/trainer/test_optimizers.py index e9a422dfb4711..2d2ebd6a6a2dd 100644 --- a/tests/trainer/test_optimizers.py +++ b/tests/trainer/test_optimizers.py @@ -180,9 +180,8 @@ def test_reducelronplateau_scheduling(tmpdir): ), 'lr scheduler was not correctly converted to dict' -@pytest.mark.parametrize("enable_pl_optimizer", [False, True]) -def test_optimizer_return_options(enable_pl_optimizer): - trainer = Trainer(enable_pl_optimizer=enable_pl_optimizer) +def test_optimizer_return_options(): + trainer = Trainer() model = EvalModelTemplate() # single optimizer diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 9e5ceccf9b646..8b66e7141957e 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -11,21 +11,20 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
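The parity tests touched above check one thing in essence: wrapping an optimizer in `LightningOptimizer` must not change how often `step()` / `zero_grad()` are actually called, including under gradient accumulation. A self-contained sketch of that call-counting pattern, using a plain PyTorch loop instead of the `Trainer` (all names and numbers here are illustrative, not part of the patch):

```python
from unittest.mock import patch

import torch

model = torch.nn.Linear(4, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
accumulate_grad_batches = 2
num_batches = 10

with patch("torch.optim.SGD.step") as mock_step, \
        patch("torch.optim.SGD.zero_grad") as mock_zero_grad:
    for batch_idx in range(num_batches):
        loss = model(torch.randn(8, 4)).sum()
        loss.backward()
        # an optimizer call only happens once every `accumulate_grad_batches` batches
        if (batch_idx + 1) % accumulate_grad_batches == 0:
            optimizer.step()
            optimizer.zero_grad()

assert mock_step.call_count == num_batches // accumulate_grad_batches
assert mock_zero_grad.call_count == num_batches // accumulate_grad_batches
```

The tests above apply the same `patch(...)` trick to both SGD and Adam and compare the two call counts, which is why the duplicated assertions could simply be dropped.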
+from argparse import Namespace +from copy import deepcopy import math import os +from pathlib import Path import pickle import sys -from argparse import Namespace -from copy import deepcopy -from pathlib import Path from unittest.mock import ANY, call, patch import cloudpickle +from omegaconf import OmegaConf import pytest import torch -from omegaconf import OmegaConf -import tests.base.develop_utils as tutils from pytorch_lightning import Callback, LightningModule, Trainer from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint from pytorch_lightning.core.saving import load_hparams_from_tags_csv, load_hparams_from_yaml, save_hparams_to_tags_csv @@ -37,6 +36,7 @@ from pytorch_lightning.utilities.cloud_io import load as pl_load from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.base import BoringModel, EvalModelTemplate +import tests.base.develop_utils as tutils @pytest.mark.parametrize("url_ckpt", [True, False]) @@ -496,16 +496,13 @@ def test_model_checkpoint_only_weights(tmpdir): def test_model_freeze_unfreeze(): - model = EvalModelTemplate() - model.freeze() model.unfreeze() -@pytest.mark.parametrize("enable_pl_optimizer", [False, True]) @pytest.mark.parametrize("url_ckpt", [True, False]) -def test_resume_from_checkpoint_epoch_restored(monkeypatch, tmpdir, tmpdir_server, url_ckpt, enable_pl_optimizer): +def test_resume_from_checkpoint_epoch_restored(monkeypatch, tmpdir, tmpdir_server, url_ckpt): """Verify resuming from checkpoint runs the right number of epochs""" # set $TORCH_HOME, which determines torch hub's cache path, to tmpdir monkeypatch.setenv("TORCH_HOME", tmpdir) @@ -533,7 +530,6 @@ def on_load_checkpoint(self, _): callbacks=[ModelCheckpoint(dirpath=tmpdir, monitor='early_stop_on', save_top_k=-1)], default_root_dir=tmpdir, val_check_interval=1.0, - enable_pl_optimizer=enable_pl_optimizer, progress_bar_refresh_rate=0, logger=False, weights_summary=None, From a053d758d03558d2aa5a328b2f6befbc133a0ebc Mon Sep 17 00:00:00 2001 From: chaton Date: Sat, 9 Jan 2021 01:35:47 +0100 Subject: [PATCH 085/136] [bugfix] Logging only on `not should_accumulate()` during training (#5417) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * resolve bug * resolve tests * update * Update tests/loggers/test_tensorboard.py Co-authored-by: Carlos Mocholí Co-authored-by: Carlos Mocholí --- .../logger_connector/logger_connector.py | 15 +++++++-------- tests/loggers/test_all.py | 4 ++-- tests/loggers/test_tensorboard.py | 19 +++++++++++-------- .../test_train_loop_logging_1_0.py | 6 +++++- 4 files changed, 25 insertions(+), 19 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py index 54bf2f9a90cea..6cf020aa65fa1 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py @@ -11,8 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
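A standalone restatement of the guard this fix adds to `logger_connector.py` (the hunks continue below) may help: while gradients are still being accumulated and automatic optimization is on, step-level metrics are held back, so the step recorded by the logger stays aligned with `global_step`. The helper name and the modulo formulation below are illustrative approximations, not code from the patch:

```python
# Illustrative only: an approximation of `TrainLoop.should_accumulate()` combined
# with the early return added to `log_train_step_metrics`.
def skip_step_logging(batch_idx: int, accumulate_grad_batches: int,
                      automatic_optimization: bool) -> bool:
    should_accumulate = (batch_idx + 1) % accumulate_grad_batches != 0
    return should_accumulate and automatic_optimization


assert skip_step_logging(0, 2, True) is True    # still accumulating: no step-level log
assert skip_step_logging(1, 2, True) is False   # optimizer steps here: metrics are flushed
assert skip_step_logging(3, 1, True) is False   # no accumulation: every batch logs
assert skip_step_logging(0, 2, False) is False  # manual optimization is left untouched
```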
-from copy import deepcopy import os +from copy import deepcopy from pprint import pprint from typing import Iterable, Union @@ -158,7 +158,7 @@ def cache_training_step_metrics(self, opt_closure_result): self.logged_metrics.update(logged_metrics_tmp) self.cached_results.legacy_batch_log_metrics.update(logged_metrics_tmp) - def log_metrics(self, metrics, grad_norm_dic, step=None, log_train_step_metrics=False): + def log_metrics(self, metrics, grad_norm_dic, step=None): """Logs the metric dict passed in. If `step` parameter is None and `step` key is presented is metrics, uses metrics["step"] as a step @@ -186,11 +186,8 @@ def log_metrics(self, metrics, grad_norm_dic, step=None, log_train_step_metrics= elif step is None: # added metrics by Lightning for convenience - if log_train_step_metrics: - step = self.trainer.total_batch_idx - else: - scalar_metrics['epoch'] = self.trainer.current_epoch - step = self.trainer.global_step + scalar_metrics['epoch'] = self.trainer.current_epoch + step = self.trainer.global_step # log actual metrics if self.trainer.logger is not None: @@ -593,6 +590,8 @@ def __gather_result_across_time_and_optimizers(self, epoch_output): return gathered_epoch_outputs def log_train_step_metrics(self, batch_output): + if self.trainer.train_loop.should_accumulate() and self.trainer.train_loop.automatic_optimization: + return _, batch_log_metrics = self.cached_results.update_logger_connector() # when metrics should be logged if self.should_update_logs or self.trainer.fast_dev_run is True: @@ -601,5 +600,5 @@ def log_train_step_metrics(self, batch_output): if grad_norm_dic is None: grad_norm_dic = {} if len(batch_log_metrics) > 0 or len(grad_norm_dic) > 0: - self.log_metrics(batch_log_metrics, grad_norm_dic, log_train_step_metrics=True) + self.log_metrics(batch_log_metrics, grad_norm_dic) self.callback_metrics.update(batch_log_metrics) diff --git a/tests/loggers/test_all.py b/tests/loggers/test_all.py index 795b1a91e688e..4bf15ff8d99a1 100644 --- a/tests/loggers/test_all.py +++ b/tests/loggers/test_all.py @@ -126,7 +126,7 @@ def log_metrics(self, metrics, step): if logger_class == TensorBoardLogger: expected = [ (0, ['hp_metric']), - (0, ['train_some_val']), + (0, ['epoch', 'train_some_val']), (0, ['early_stop_on', 'epoch', 'val_acc']), (0, ['hp_metric']), (1, ['epoch', 'test_acc', 'test_loss']) @@ -134,7 +134,7 @@ def log_metrics(self, metrics, step): assert log_metric_names == expected else: expected = [ - (0, ['train_some_val']), + (0, ['epoch', 'train_some_val']), (0, ['early_stop_on', 'epoch', 'val_acc']), (1, ['epoch', 'test_acc', 'test_loss']) ] diff --git a/tests/loggers/test_tensorboard.py b/tests/loggers/test_tensorboard.py index fa5c711357ba3..148ad550e74c7 100644 --- a/tests/loggers/test_tensorboard.py +++ b/tests/loggers/test_tensorboard.py @@ -213,8 +213,11 @@ def test_tensorboard_with_accummulated_gradients(mock_log_metrics, expected, tmp Tests to ensure that tensorboard log properly when accumulated_gradients > 1 """ class TestModel(BoringModel): - _count = 0 - _indexes = [] + + def __init__(self): + super().__init__() + self._count = 0 + self._indexes = [] def training_step(self, batch, batch_idx): output = self.layer(batch) @@ -222,10 +225,10 @@ def training_step(self, batch, batch_idx): self.log('count', self._count, on_step=True, on_epoch=True) self.log('loss', loss, on_step=True, on_epoch=True) - if self.trainer.logger_connector.should_update_logs: - self._indexes.append(self._count) + if not self.trainer.train_loop.should_accumulate(): + if 
self.trainer.logger_connector.should_update_logs: + self._indexes.append(self.trainer.global_step) - self._count += 1 return loss def validation_step(self, batch, batch_idx): @@ -245,14 +248,13 @@ def configure_optimizers(self): logger_0 = TensorBoardLogger(tmpdir, default_hp_metric=False) - accumulate_grad_batches = 2 trainer = Trainer( default_root_dir=tmpdir, limit_train_batches=12, - limit_val_batches=12, + limit_val_batches=0, max_epochs=3, gpus=0, - accumulate_grad_batches=accumulate_grad_batches, + accumulate_grad_batches=2, logger=[logger_0], log_every_n_steps=3, ) @@ -260,5 +262,6 @@ def configure_optimizers(self): mock_count_epochs = [m[2]["step"] for m in mock_log_metrics.mock_calls if "count_epoch" in m[2]["metrics"]] assert mock_count_epochs == expected + mock_count_steps = [m[2]["step"] for m in mock_log_metrics.mock_calls if "count_step" in m[2]["metrics"]] assert model._indexes == mock_count_steps diff --git a/tests/trainer/logging_tests/test_train_loop_logging_1_0.py b/tests/trainer/logging_tests/test_train_loop_logging_1_0.py index 51b9c2ac69496..617cd6fa3cbd1 100644 --- a/tests/trainer/logging_tests/test_train_loop_logging_1_0.py +++ b/tests/trainer/logging_tests/test_train_loop_logging_1_0.py @@ -24,12 +24,16 @@ import numpy as np import pytest import torch -from torch.utils.data import Dataset +from torch.nn import functional as F +from torch.utils.data import DataLoader, Dataset, random_split +from torchvision import transforms +from torchvision.datasets.mnist import MNIST import pytorch_lightning as pl from pytorch_lightning import callbacks, Trainer from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint from pytorch_lightning.core.lightning import LightningModule +from pytorch_lightning.loggers import WandbLogger from tests.base.boring_model import BoringModel, RandomDictDataset, RandomDictStringDataset from tests.base.deterministic_model import DeterministicModel From bb5031b3bf7a9b5afac0d2918b7476f3f887ee35 Mon Sep 17 00:00:00 2001 From: chaton Date: Sat, 9 Jan 2021 14:55:55 +0100 Subject: [PATCH 086/136] bugfix: Resolve interpolation bug with Hydra (#5406) * resolve bug * Apply suggestions from code review * resolve package import * resolve import * update on comments * update on comments * hacky fix * update * exit * update * to_container * typo * resolve import * update * resolve pep8 Co-authored-by: Jirka Borovec Co-authored-by: Sean Naren --- pytorch_lightning/core/saving.py | 46 +++++++++----- pytorch_lightning/utilities/__init__.py | 26 +------- pytorch_lightning/utilities/package_utils.py | 36 +++++++++++ pytorch_lightning/utilities/parsing.py | 5 +- tests/models/conf/config.yaml | 17 +++++ tests/models/conf/training/default.yaml | 2 + tests/models/test_hparams.py | 65 +++++++++++++++++--- 7 files changed, 150 insertions(+), 47 deletions(-) create mode 100644 pytorch_lightning/utilities/package_utils.py create mode 100644 tests/models/conf/config.yaml create mode 100644 tests/models/conf/training/default.yaml diff --git a/pytorch_lightning/core/saving.py b/pytorch_lightning/core/saving.py index 6741236a7e5f5..12a29246888f7 100644 --- a/pytorch_lightning/core/saving.py +++ b/pytorch_lightning/core/saving.py @@ -17,16 +17,19 @@ import inspect import os from argparse import Namespace -from typing import Union, Dict, Any, Optional, Callable, MutableMapping, IO +from copy import deepcopy +from functools import partial +from typing import Any, Callable, Dict, IO, MutableMapping, Optional, Union from warnings import warn import torch import yaml 
from pytorch_lightning import _logger as log -from pytorch_lightning.utilities import rank_zero_warn, AttributeDict, OMEGACONF_AVAILABLE -from pytorch_lightning.utilities.cloud_io import load as pl_load +from pytorch_lightning.utilities import AttributeDict, OMEGACONF_AVAILABLE, rank_zero_warn +from pytorch_lightning.utilities.apply_func import apply_to_collection from pytorch_lightning.utilities.cloud_io import get_filesystem +from pytorch_lightning.utilities.cloud_io import load as pl_load from pytorch_lightning.utilities.parsing import parse_class_init_keys PRIMITIVE_TYPES = (bool, int, float, str) @@ -34,6 +37,9 @@ if OMEGACONF_AVAILABLE: from omegaconf import OmegaConf + from omegaconf.dictconfig import DictConfig + from omegaconf.errors import UnsupportedValueType, ValidationError + # the older shall be on the top CHECKPOINT_PAST_HPARAMS_KEYS = ( @@ -321,9 +327,14 @@ def save_hparams_to_tags_csv(tags_csv: str, hparams: Union[dict, Namespace]) -> writer.writerow({"key": k, "value": v}) -def load_hparams_from_yaml(config_yaml: str) -> Dict[str, Any]: +def load_hparams_from_yaml(config_yaml: str, use_omegaconf: bool = True) -> Dict[str, Any]: """Load hparams from a file. + Args: + config_yaml: Path to config yaml file + use_omegaconf: If both `OMEGACONF_AVAILABLE` and `use_omegaconf` are True, + the hparams will be converted to `DictConfig` if possible + >>> hparams = Namespace(batch_size=32, learning_rate=0.001, data_root='./any/path/here') >>> path_yaml = './testing-hparams.yaml' >>> save_hparams_to_yaml(path_yaml, hparams) @@ -338,9 +349,15 @@ def load_hparams_from_yaml(config_yaml: str) -> Dict[str, Any]: return {} with fs.open(config_yaml, "r") as fp: - tags = yaml.full_load(fp) + hparams = yaml.full_load(fp) - return tags + if OMEGACONF_AVAILABLE: + if use_omegaconf: + try: + return OmegaConf.create(hparams) + except (UnsupportedValueType, ValidationError): + pass + return hparams def save_hparams_to_yaml(config_yaml, hparams: Union[dict, Namespace]) -> None: @@ -361,15 +378,16 @@ def save_hparams_to_yaml(config_yaml, hparams: Union[dict, Namespace]) -> None: # saving with OmegaConf objects if OMEGACONF_AVAILABLE: - if OmegaConf.is_config(hparams): - with fs.open(config_yaml, "w", encoding="utf-8") as fp: - OmegaConf.save(hparams, fp, resolve=True) - return - for v in hparams.values(): - if OmegaConf.is_config(v): - with fs.open(config_yaml, "w", encoding="utf-8") as fp: - OmegaConf.save(OmegaConf.create(hparams), fp, resolve=True) + # deepcopy: hparams from user shouldn't be resolved + hparams = deepcopy(hparams) + to_container = partial(OmegaConf.to_container, resolve=True) + hparams = apply_to_collection(hparams, DictConfig, to_container) + with fs.open(config_yaml, "w", encoding="utf-8") as fp: + try: + OmegaConf.save(hparams, fp) return + except (UnsupportedValueType, ValidationError): + pass assert isinstance(hparams, dict) hparams_allowed = {} diff --git a/pytorch_lightning/utilities/__init__.py b/pytorch_lightning/utilities/__init__.py index e5641337cc8d2..c5dade86c348a 100644 --- a/pytorch_lightning/utilities/__init__.py +++ b/pytorch_lightning/utilities/__init__.py @@ -23,36 +23,16 @@ from pytorch_lightning.utilities.apply_func import move_data_to_device from pytorch_lightning.utilities.distributed import AllGatherGrad, rank_zero_info, rank_zero_only, rank_zero_warn +from pytorch_lightning.utilities.package_utils import _module_available from pytorch_lightning.utilities.parsing import AttributeDict, flatten_dict, is_picklable from 
pytorch_lightning.utilities.xla_device_utils import XLA_AVAILABLE, XLADeviceUtils - -def _module_available(module_path: str) -> bool: - """Testing if given module is avalaible in your env - - >>> _module_available('os') - True - >>> _module_available('bla.bla') - False - """ - # todo: find a better way than try / except - try: - mods = module_path.split('.') - assert mods, 'nothing given to test' - # it has to be tested as per partets - for i in range(len(mods)): - module_path = '.'.join(mods[:i + 1]) - if importlib.util.find_spec(module_path) is None: - return False - return True - except AttributeError: - return False - - +OMEGACONF_AVAILABLE = _module_available("omegaconf") APEX_AVAILABLE = _module_available("apex.amp") NATIVE_AMP_AVAILABLE = _module_available("torch.cuda.amp") and hasattr(torch.cuda.amp, "autocast") OMEGACONF_AVAILABLE = _module_available("omegaconf") HYDRA_AVAILABLE = _module_available("hydra") +HYDRA_EXPERIMENTAL_AVAILABLE = _module_available("hydra.experimental") HOROVOD_AVAILABLE = _module_available("horovod.torch") BOLTS_AVAILABLE = _module_available("pl_bolts") diff --git a/pytorch_lightning/utilities/package_utils.py b/pytorch_lightning/utilities/package_utils.py new file mode 100644 index 0000000000000..99fd6fcc7ebb5 --- /dev/null +++ b/pytorch_lightning/utilities/package_utils.py @@ -0,0 +1,36 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
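Alongside the `save_hparams_to_yaml` change above: interpolations are resolved only on a deep copy, so the caller's `DictConfig` keeps its `${...}` references while plain values end up in the YAML file. A small sketch of that behaviour (requires `omegaconf`; the keys mirror the test config added later in this patch):

```python
from omegaconf import OmegaConf

conf = OmegaConf.create({"training": {"log": "Something"}, "log": "${training.log}"})

# interpolations resolve lazily on access
assert conf.log == "Something"

# what gets written to hparams.yaml: a plain container with resolved values
resolved = OmegaConf.to_container(conf, resolve=True)
assert resolved["log"] == "Something"

# the in-memory config is left untouched; the reference is still there
assert OmegaConf.to_container(conf, resolve=False)["log"] == "${training.log}"
```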
+import importlib + + +def _module_available(module_path: str) -> bool: + """Testing if given module is avalaible in your env + + >>> _module_available('os') + True + >>> _module_available('bla.bla') + False + """ + # todo: find a better way than try / except + try: + mods = module_path.split('.') + assert mods, 'nothing given to test' + # it has to be tested as per partets + for i in range(len(mods)): + module_path = '.'.join(mods[:i + 1]) + if importlib.util.find_spec(module_path) is None: + return False + return True + except AttributeError: + return False diff --git a/pytorch_lightning/utilities/parsing.py b/pytorch_lightning/utilities/parsing.py index b207320c25ccc..5d90583345b4a 100644 --- a/pytorch_lightning/utilities/parsing.py +++ b/pytorch_lightning/utilities/parsing.py @@ -15,9 +15,11 @@ import inspect import pickle from argparse import Namespace -from typing import Dict, Union, Tuple +from typing import Dict, Tuple, Union from pytorch_lightning.utilities import rank_zero_warn +from pytorch_lightning.utilities.apply_func import apply_to_collection +from pytorch_lightning.utilities.package_utils import _module_available def str_to_bool_or_str(val: str) -> Union[str, bool]: @@ -115,7 +117,6 @@ def get_init_args(frame) -> dict: self_var, args_var, kwargs_var = parse_class_init_keys(cls) filtered_vars = [n for n in (self_var, args_var, kwargs_var) if n] exclude_argnames = (*filtered_vars, '__class__', 'frame', 'frame_args') - # only collect variables that appear in the signature local_args = {k: local_vars[k] for k in init_parameters.keys()} local_args.update(local_args.get(kwargs_var, {})) diff --git a/tests/models/conf/config.yaml b/tests/models/conf/config.yaml new file mode 100644 index 0000000000000..faf751c24f6cb --- /dev/null +++ b/tests/models/conf/config.yaml @@ -0,0 +1,17 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+defaults: + - training: default + +log: ${training.log} diff --git a/tests/models/conf/training/default.yaml b/tests/models/conf/training/default.yaml new file mode 100644 index 0000000000000..2c35b22365420 --- /dev/null +++ b/tests/models/conf/training/default.yaml @@ -0,0 +1,2 @@ +# @package training +log: "Something" diff --git a/tests/models/test_hparams.py b/tests/models/test_hparams.py index 7df78d9760bd9..e354c6e708d95 100644 --- a/tests/models/test_hparams.py +++ b/tests/models/test_hparams.py @@ -15,19 +15,25 @@ import os import pickle from argparse import Namespace +from copy import deepcopy import cloudpickle import pytest import torch from fsspec.implementations.local import LocalFileSystem -from omegaconf import OmegaConf, Container +from omegaconf import Container, OmegaConf +from omegaconf.dictconfig import DictConfig from torch.nn import functional as F from torch.utils.data import DataLoader -from pytorch_lightning import Trainer, LightningModule -from pytorch_lightning.core.saving import save_hparams_to_yaml, load_hparams_from_yaml -from pytorch_lightning.utilities import AttributeDict, is_picklable -from tests.base import EvalModelTemplate, TrialMNIST, BoringModel +from pytorch_lightning import LightningModule, Trainer +from pytorch_lightning.callbacks import ModelCheckpoint +from pytorch_lightning.core.saving import load_hparams_from_yaml, save_hparams_to_yaml +from pytorch_lightning.utilities import AttributeDict, HYDRA_EXPERIMENTAL_AVAILABLE, is_picklable +from tests.base import BoringModel, EvalModelTemplate, TrialMNIST + +if HYDRA_EXPERIMENTAL_AVAILABLE: + from hydra.experimental import compose, initialize class SaveHparamsModel(BoringModel): @@ -483,13 +489,13 @@ def test_hparams_save_yaml(tmpdir): path_yaml = os.path.join(tmpdir, 'testing-hparams.yaml') save_hparams_to_yaml(path_yaml, hparams) - assert load_hparams_from_yaml(path_yaml) == hparams + assert load_hparams_from_yaml(path_yaml, use_omegaconf=False) == hparams save_hparams_to_yaml(path_yaml, Namespace(**hparams)) - assert load_hparams_from_yaml(path_yaml) == hparams + assert load_hparams_from_yaml(path_yaml, use_omegaconf=False) == hparams save_hparams_to_yaml(path_yaml, AttributeDict(hparams)) - assert load_hparams_from_yaml(path_yaml) == hparams + assert load_hparams_from_yaml(path_yaml, use_omegaconf=False) == hparams save_hparams_to_yaml(path_yaml, OmegaConf.create(hparams)) assert load_hparams_from_yaml(path_yaml) == hparams @@ -642,3 +648,46 @@ def test_model_with_fsspec_as_parameter(tmpdir): ) trainer.fit(model) trainer.test() + + +@pytest.mark.skipif(not HYDRA_EXPERIMENTAL_AVAILABLE, reason="Hydra experimental is not available") +def test_model_save_hyper_parameters_interpolation_with_hydra(tmpdir): + """ + This test relies on configuration saved under tests/models/conf/config.yaml + """ + + class TestHydraModel(BoringModel): + + def __init__(self, args_0, args_1, args_2, kwarg_1=None): + self.save_hyperparameters() + self.test_hparams() + config_file = f"{tmpdir}/hparams.yaml" + save_hparams_to_yaml(config_file, self.hparams) + self.hparams = load_hparams_from_yaml(config_file) + self.test_hparams() + super().__init__() + + def test_hparams(self): + assert self.hparams.args_0.log == "Something" + assert self.hparams.args_1['cfg'].log == "Something" + assert self.hparams.args_2[0].log == "Something" + assert self.hparams.kwarg_1['cfg'][0].log == "Something" + + with initialize(config_path="conf"): + args_0 = compose(config_name="config") + args_1 = {"cfg": compose(config_name="config")} + args_2 
= [compose(config_name="config")] + kwarg_1 = {"cfg": [compose(config_name="config")]} + model = TestHydraModel(args_0, args_1, args_2, kwarg_1=kwarg_1) + epochs = 2 + checkpoint_callback = ModelCheckpoint(monitor=None, dirpath=tmpdir, save_top_k=-1) + trainer = Trainer( + default_root_dir=tmpdir, + callbacks=[checkpoint_callback], + limit_train_batches=10, + limit_val_batches=10, + max_epochs=epochs, + logger=False, + ) + trainer.fit(model) + _ = TestHydraModel.load_from_checkpoint(checkpoint_callback.best_model_path) From f1e28d1e436b852a92d6d2dc5ae7c080b006c748 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Sun, 10 Jan 2021 21:33:46 +0100 Subject: [PATCH 087/136] GH action - label conflicts (#5450) * GH action - label conflicts * . * trigger * trigger * . --- .github/workflows/ci_dockers.yml | 2 +- .../{nightly.yml => events-nightly.yml} | 0 .github/workflows/events-recurrent.yml | 21 +++++++++++++++++++ 3 files changed, 22 insertions(+), 1 deletion(-) rename .github/workflows/{nightly.yml => events-nightly.yml} (100%) create mode 100644 .github/workflows/events-recurrent.yml diff --git a/.github/workflows/ci_dockers.yml b/.github/workflows/ci_dockers.yml index 43550ade8794b..1f5e5c2315bb8 100644 --- a/.github/workflows/ci_dockers.yml +++ b/.github/workflows/ci_dockers.yml @@ -14,7 +14,7 @@ on: # Trigger the workflow on push or pull request, but only for the master bran - "environment.yml" - "requirements.txt" - ".github/workflows/ci_dockers.yml" - - ".github/workflows/nightly.yml" + - ".github/workflows/events-nightly.yml" - ".github/workflows/release-docker.yml" - "setup.py" diff --git a/.github/workflows/nightly.yml b/.github/workflows/events-nightly.yml similarity index 100% rename from .github/workflows/nightly.yml rename to .github/workflows/events-nightly.yml diff --git a/.github/workflows/events-recurrent.yml b/.github/workflows/events-recurrent.yml new file mode 100644 index 0000000000000..a618112469b61 --- /dev/null +++ b/.github/workflows/events-recurrent.yml @@ -0,0 +1,21 @@ +on: + # We recommend `pull_request_target` so that github secrets are available. + # In `pull_request` we wouldn't be able to change labels of fork PRs + pull_request_target: + types: [ synchronize ] + +jobs: + + # This label will then be managed by this action. + # It will be added to PRs with merge conflicts and removed from PRs without conflicts. 
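Stepping back to the Hydra test above (`test_model_save_hyper_parameters_interpolation_with_hydra`): the compose API it builds on can be exercised on its own roughly as follows. This is a sketch, not part of the patch; it requires `hydra-core` and assumes the relative `conf/` path resolves the same way it does from `tests/models/`:

```python
from hydra.experimental import compose, initialize

with initialize(config_path="conf"):
    cfg = compose(config_name="config")

# `log: ${training.log}` in config.yaml resolves through training/default.yaml
assert cfg.training.log == "Something"
assert cfg.log == "Something"
```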
+ # https://github.com/mschilde/auto-label-merge-conflicts + pr-label-conflicts: + name: Label PR conflits + runs-on: ubuntu-20.04 + steps: + - uses: mschilde/auto-label-merge-conflicts@v2 + with: + CONFLICT_LABEL_NAME: "has conflicts" + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + MAX_RETRIES: 3 + WAIT_MS: 5000 From 499d5031e87e8a4ba54059ced69337f53b94e66f Mon Sep 17 00:00:00 2001 From: Poons <66294486+thepooons@users.noreply.github.com> Date: Mon, 11 Jan 2021 16:32:30 +0530 Subject: [PATCH 088/136] fix typos in validation_step and test_step docs (#5438) * fixed docs in lightning.py * few more Co-authored-by: Rohit Gupta Co-authored-by: Nicki Skafte --- pytorch_lightning/core/lightning.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index bd6784cc3b4bb..bc1b5b9547771 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -621,14 +621,14 @@ def validation_step(self, *args, **kwargs): for val_batch in val_data: out = validation_step(val_batch) val_outs.append(out) - validation_epoch_end(val_outs) + validation_epoch_end(val_outs) Args: batch (:class:`~torch.Tensor` | (:class:`~torch.Tensor`, ...) | [:class:`~torch.Tensor`, ...]): The output of your :class:`~torch.utils.data.DataLoader`. A tensor, tuple or list. batch_idx (int): The index of this batch dataloader_idx (int): The index of the dataloader that produced this batch - (only if multiple val datasets used) + (only if multiple val dataloaders used) Return: Any of. @@ -677,11 +677,11 @@ def validation_step(self, batch, batch_idx): # log the outputs! self.log_dict({'val_loss': loss, 'val_acc': val_acc}) - If you pass in multiple val datasets, validation_step will have an additional argument. + If you pass in multiple val dataloaders, :meth:`validation_step` will have an additional argument. .. code-block:: python - # CASE 2: multiple validation datasets + # CASE 2: multiple validation dataloaders def validation_step(self, batch, batch_idx, dataloader_idx): # dataloader_idx tells you which dataset this is. @@ -813,7 +813,7 @@ def test_step(self, *args, **kwargs): The output of your :class:`~torch.utils.data.DataLoader`. A tensor, tuple or list. batch_idx (int): The index of this batch. dataloader_idx (int): The index of the dataloader that produced this batch - (only if multiple test datasets used). + (only if multiple test dataloaders used). Return: Any of. @@ -853,17 +853,17 @@ def test_step(self, batch, batch_idx): # log the outputs! self.log_dict({'test_loss': loss, 'test_acc': test_acc}) - If you pass in multiple validation datasets, :meth:`test_step` will have an additional + If you pass in multiple test dataloaders, :meth:`test_step` will have an additional argument. .. code-block:: python - # CASE 2: multiple test datasets + # CASE 2: multiple test dataloaders def test_step(self, batch, batch_idx, dataloader_idx): # dataloader_idx tells you which dataset this is. Note: - If you don't need to validate you don't need to implement this method. + If you don't need to test you don't need to implement this method. Note: When the :meth:`test_step` is called, the model has been put in eval mode and From 92bbf2fdd6509a273795a9bda08d95cd952c7791 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Mon, 11 Jan 2021 13:32:28 +0100 Subject: [PATCH 089/136] GH action - auto-update PRs (#5451) * GH action - auto-update PRs * . 
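As a companion to the docstring fixes in patch 088 above: when a LightningModule returns several val (or test) dataloaders, `validation_step` receives the extra `dataloader_idx` argument. A minimal sketch of that setup (model, data, and metric names are made up for illustration):

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

import pytorch_lightning as pl


class MultiValModel(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 2)

    def training_step(self, batch, batch_idx):
        (x,) = batch
        return self.layer(x).sum()

    def validation_step(self, batch, batch_idx, dataloader_idx):
        # dataloader_idx tells you which of the val dataloaders produced this batch
        (x,) = batch
        self.log(f"val_loss/dl{dataloader_idx}", self.layer(x).sum())

    def val_dataloader(self):
        # returning a list is what triggers the extra `dataloader_idx` argument
        ds_a = TensorDataset(torch.randn(64, 32))
        ds_b = TensorDataset(torch.randn(64, 32))
        return [DataLoader(ds_a, batch_size=8), DataLoader(ds_b, batch_size=8)]

    def train_dataloader(self):
        return DataLoader(TensorDataset(torch.randn(64, 32)), batch_size=8)

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.1)
```

With a single val dataloader the `dataloader_idx` argument is simply omitted, which is exactly what the corrected docstrings spell out.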
--- .github/workflows/events-recurrent.yml | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/.github/workflows/events-recurrent.yml b/.github/workflows/events-recurrent.yml index a618112469b61..30ccbcd11a9d7 100644 --- a/.github/workflows/events-recurrent.yml +++ b/.github/workflows/events-recurrent.yml @@ -1,8 +1,8 @@ +name: Recurrent events + on: - # We recommend `pull_request_target` so that github secrets are available. - # In `pull_request` we wouldn't be able to change labels of fork PRs - pull_request_target: - types: [ synchronize ] + push: + branches: [master, "release/*"] jobs: @@ -13,9 +13,25 @@ jobs: name: Label PR conflits runs-on: ubuntu-20.04 steps: - - uses: mschilde/auto-label-merge-conflicts@v2 + - uses: mschilde/auto-label-merge-conflicts@v2.0 with: CONFLICT_LABEL_NAME: "has conflicts" GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} MAX_RETRIES: 3 WAIT_MS: 5000 + + # autoupdate is a GitHub Action that auto-updates pull requests branches whenever changes land on their destination branch. + # see: https://github.com/marketplace/actions/auto-update + pr-auto-update: + name: Auto-update PR + runs-on: ubuntu-18.04 + steps: + - uses: docker://chinthakagodawita/autoupdate-action:v1 + env: + GITHUB_TOKEN: "${{ secrets.GITHUB_TOKEN }}" + DRY_RUN: "false" + PR_FILTER: "labelled" + PR_LABELS: "0:] Ready-To-Go" + MERGE_MSG: "Branch was auto-updated." + RETRY_COUNT: "3" + RETRY_SLEEP: "500" From 87482935a3d03cab3ffff336c88e0ae977a2beee Mon Sep 17 00:00:00 2001 From: ananthsub Date: Mon, 11 Jan 2021 08:21:10 -0800 Subject: [PATCH 090/136] Add automatic optimization property setter to lightning module (#5169) * add automatic optimization property setter to lightning module * Update test_manual_optimization.py Co-authored-by: chaton --- pytorch_lightning/core/lightning.py | 8 +++++++- tests/trainer/optimization/test_manual_optimization.py | 9 +++++---- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index bc1b5b9547771..b62fd57e8debf 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -111,6 +111,7 @@ def __init__(self, *args, **kwargs): self._running_manual_backward = False self._current_hook_fx_name = None self._current_dataloader_idx = None + self._automatic_optimization: bool = True def optimizers(self, use_pl_optimizer: bool = True) -> Union[Optimizer, List[Optimizer], List[LightningOptimizer]]: if use_pl_optimizer: @@ -163,7 +164,12 @@ def automatic_optimization(self) -> bool: """ If False you are responsible for calling .backward, .step, zero_grad. 
""" - return True + return self._automatic_optimization + + @automatic_optimization.setter + def automatic_optimization(self, automatic_optimization: bool) -> None: + self._automatic_optimization = automatic_optimization + def print(self, *args, **kwargs) -> None: r""" diff --git a/tests/trainer/optimization/test_manual_optimization.py b/tests/trainer/optimization/test_manual_optimization.py index f0d7c6d96914e..338769f2ee40f 100644 --- a/tests/trainer/optimization/test_manual_optimization.py +++ b/tests/trainer/optimization/test_manual_optimization.py @@ -33,6 +33,11 @@ def test_multiple_optimizers_manual(tmpdir): Tests that only training_step can be used """ class TestModel(BoringModel): + + def __init__(self): + super().__init__() + self.automatic_optimization = False + def training_step(self, batch, batch_idx, optimizer_idx): # manual (opt_a, opt_b) = self.optimizers() @@ -69,10 +74,6 @@ def configure_optimizers(self): optimizer_2 = torch.optim.SGD(self.layer.parameters(), lr=0.1) return optimizer, optimizer_2 - @property - def automatic_optimization(self) -> bool: - return False - model = TestModel() model.val_dataloader = None From f065ea65bf4a577e2fe050049bbdbcb0cad5effc Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Tue, 12 Jan 2021 02:36:12 +0100 Subject: [PATCH 091/136] populate some more legacy checkpoints (#5457) * populate some more legacy checkpoints * . * pt freeze * . * skip Co-authored-by: chaton --- legacy/README.md | 17 ++++++++++++++ legacy/generate_checkpoints.sh | 3 ++- legacy/zero_training.py | 3 ++- pytorch_lightning/core/lightning.py | 1 - .../checkpointing/test_legacy_checkpoints.py | 22 +++++++++++++++++-- 5 files changed, 41 insertions(+), 5 deletions(-) create mode 100644 legacy/README.md diff --git a/legacy/README.md b/legacy/README.md new file mode 100644 index 0000000000000..3ce6d15f65568 --- /dev/null +++ b/legacy/README.md @@ -0,0 +1,17 @@ +# Maintaining back-compatibility with come legacy versions + +The aim of this section is set some baselines and workflows/guidelines for maintaining back compatibility with some legacies version of PL + +At this moment we focus on ability running old checkpoints, so the flow here is to create a checkpoint with every release and store it in our public AWS storage and so each CI testing will pull this archive and test loading and resuming training with this model. 
+ +If you want to pull all saved version-checkpoints for local testing/development, call +```bash +wget https://pl-public-data.s3.amazonaws.com/legacy/checkpoints.zip +unzip -o checkpoints.zip +``` + +To back populate collection with past version you can use following bash: +```bash +bash generate_checkpoints.sh 1.0.2 1.0.3 1.0.4 +zip -r checkpoints.zip checkpoints/ +``` diff --git a/legacy/generate_checkpoints.sh b/legacy/generate_checkpoints.sh index c9f4dabff46c5..7726c5b097c5c 100644 --- a/legacy/generate_checkpoints.sh +++ b/legacy/generate_checkpoints.sh @@ -21,7 +21,8 @@ do virtualenv $ENV_PATH --system-site-packages # activate and install PL version source "$ENV_PATH/bin/activate" - pip install "pytorch_lightning==$ver" --quiet -U --no-cache-dir + # there are problem to load ckpt in older versions since they are saved the newer versions + pip install "pytorch_lightning==$ver" "torch==1.3" --quiet --no-cache-dir python --version pip --version diff --git a/legacy/zero_training.py b/legacy/zero_training.py index 4e4952a3bb1db..0115df4143460 100644 --- a/legacy/zero_training.py +++ b/legacy/zero_training.py @@ -49,7 +49,8 @@ def _loss(self, batch, prediction): def _step(self, batch, batch_idx): output = self.layer(batch) loss = self._loss(batch, output) - return loss + # return {'loss': loss} # used for PL<1.0 + return loss # used for PL >= 1.0 def training_step(self, batch, batch_idx): return self._step(batch, batch_idx) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index b62fd57e8debf..f750c8aff7caf 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -170,7 +170,6 @@ def automatic_optimization(self) -> bool: def automatic_optimization(self, automatic_optimization: bool) -> None: self._automatic_optimization = automatic_optimization - def print(self, *args, **kwargs) -> None: r""" Prints only from process 0. Use this in any distributed mode to log only once. 
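The point of the checkpoint archive described in the README above is that CI can load and resume every stored version with the current code. In spirit, each case boils down to something like the sketch below; the directory layout and version string are assumptions based on the README, and the real test (next diff) parametrizes over all listed versions and goes through `Trainer(resume_from_checkpoint=...)`:

```python
import glob
import os

import torch

LEGACY_CHECKPOINTS_PATH = os.path.join("legacy", "checkpoints")  # assumed layout
pl_version = "1.1.3"

path_dir = os.path.join(LEGACY_CHECKPOINTS_PATH, pl_version)
path_ckpts = sorted(glob.glob(os.path.join(path_dir, "*.ckpt")))
assert path_ckpts, f"no checkpoint found in '{path_dir}'"

# a Lightning checkpoint is a plain torch pickle; these keys have typically been
# present across the versions the test covers
ckpt = torch.load(path_ckpts[-1], map_location="cpu")
print(ckpt["epoch"], ckpt["global_step"], list(ckpt["state_dict"]))
```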
diff --git a/tests/checkpointing/test_legacy_checkpoints.py b/tests/checkpointing/test_legacy_checkpoints.py index cb9fe443a316b..42623cb4df1ec 100644 --- a/tests/checkpointing/test_legacy_checkpoints.py +++ b/tests/checkpointing/test_legacy_checkpoints.py @@ -24,9 +24,27 @@ CHECKPOINT_EXTENSION = ".ckpt" -# todo: add more legacy checkpoints :] +# todo: add more legacy checkpoints - for < v0.8 @pytest.mark.parametrize("pl_version", [ - "0.10.0", "1.0.0", "1.0.1", "1.0.2", "1.0.3", "1.0.4", "1.0.5", "1.0.6", "1.0.7", "1.0.8" + # "0.8.1", + "0.8.3", + "0.8.4", + # "0.8.5", # this version has problem with loading on PT<=1.4 as it seems to be archive + # "0.9.0", # this version has problem with loading on PT<=1.4 as it seems to be archive + "0.10.0", + "1.0.0", + "1.0.1", + "1.0.2", + "1.0.3", + "1.0.4", + "1.0.5", + "1.0.6", + "1.0.7", + "1.0.8", + "1.1.0", + "1.1.1", + "1.1.2", + "1.1.3", ]) def test_resume_legacy_checkpoints(tmpdir, pl_version): path_dir = os.path.join(LEGACY_CHECKPOINTS_PATH, pl_version) From 635df27880521c29d2d8d094e5f790a6658ad5a3 Mon Sep 17 00:00:00 2001 From: Sean Naren Date: Tue, 12 Jan 2021 04:30:27 +0000 Subject: [PATCH 092/136] [BUG] Check environ before selecting a seed to prevent warning message (#4743) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Check environment var independently to selecting a seed to prevent unnecessary warning message * Add if statement to check if PL_GLOBAL_SEED has been set * Added seed test to ensure that the seed stays the same, in case * if * Delete global seed after test has finished * Fix code, add tests * Ensure seed does not exist before tests start * Refactor test based on review, add log call * Ensure we clear the os environ in patched dict Co-authored-by: Jirka Borovec Co-authored-by: Carlos Mocholí Co-authored-by: chaton --- pytorch_lightning/utilities/seed.py | 17 ++++----- tests/utilities/test_seed.py | 55 +++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+), 10 deletions(-) create mode 100644 tests/utilities/test_seed.py diff --git a/pytorch_lightning/utilities/seed.py b/pytorch_lightning/utilities/seed.py index 1ce782f967ebb..16bc39bd7f142 100644 --- a/pytorch_lightning/utilities/seed.py +++ b/pytorch_lightning/utilities/seed.py @@ -20,8 +20,8 @@ import numpy as np import torch - from pytorch_lightning import _logger as log +from pytorch_lightning.utilities import rank_zero_warn def seed_everything(seed: Optional[int] = None) -> int: @@ -41,18 +41,17 @@ def seed_everything(seed: Optional[int] = None) -> int: try: if seed is None: - seed = os.environ.get("PL_GLOBAL_SEED", _select_seed_randomly(min_seed_value, max_seed_value)) + seed = os.environ.get("PL_GLOBAL_SEED") seed = int(seed) except (TypeError, ValueError): seed = _select_seed_randomly(min_seed_value, max_seed_value) + rank_zero_warn(f"No correct seed found, seed set to {seed}") - if (seed > max_seed_value) or (seed < min_seed_value): - log.warning( - f"{seed} is not in bounds, \ - numpy accepts from {min_seed_value} to {max_seed_value}" - ) + if not (min_seed_value <= seed <= max_seed_value): + rank_zero_warn(f"{seed} is not in bounds, numpy accepts from {min_seed_value} to {max_seed_value}") seed = _select_seed_randomly(min_seed_value, max_seed_value) + log.info(f"Global seed set to {seed}") os.environ["PL_GLOBAL_SEED"] = str(seed) random.seed(seed) np.random.seed(seed) @@ -62,6 +61,4 @@ def seed_everything(seed: Optional[int] = None) -> int: def _select_seed_randomly(min_seed_value: int = 0, max_seed_value: 
int = 255) -> int: - seed = random.randint(min_seed_value, max_seed_value) - log.warning(f"No correct seed found, seed set to {seed}") - return seed + return random.randint(min_seed_value, max_seed_value) diff --git a/tests/utilities/test_seed.py b/tests/utilities/test_seed.py new file mode 100644 index 0000000000000..7fa6df516c304 --- /dev/null +++ b/tests/utilities/test_seed.py @@ -0,0 +1,55 @@ +import os + +from unittest import mock +import pytest + +import pytorch_lightning.utilities.seed as seed_utils + + +@mock.patch.dict(os.environ, {}, clear=True) +def test_seed_stays_same_with_multiple_seed_everything_calls(): + """ + Ensure that after the initial seed everything, + the seed stays the same for the same run. + """ + with pytest.warns(UserWarning, match="No correct seed found"): + seed_utils.seed_everything() + initial_seed = os.environ.get("PL_GLOBAL_SEED") + + with pytest.warns(None) as record: + seed_utils.seed_everything() + assert not record # does not warn + seed = os.environ.get("PL_GLOBAL_SEED") + + assert initial_seed == seed + + +@mock.patch.dict(os.environ, {"PL_GLOBAL_SEED": "2020"}, clear=True) +def test_correct_seed_with_environment_variable(): + """ + Ensure that the PL_GLOBAL_SEED environment is read + """ + assert seed_utils.seed_everything() == 2020 + + +@mock.patch.dict(os.environ, {"PL_GLOBAL_SEED": "invalid"}, clear=True) +@mock.patch.object(seed_utils, attribute='_select_seed_randomly', new=lambda *_: 123) +def test_invalid_seed(): + """ + Ensure that we still fix the seed even if an invalid seed is given + """ + with pytest.warns(UserWarning, match="No correct seed found"): + seed = seed_utils.seed_everything() + assert seed == 123 + + +@mock.patch.dict(os.environ, {}, clear=True) +@mock.patch.object(seed_utils, attribute='_select_seed_randomly', new=lambda *_: 123) +@pytest.mark.parametrize("seed", (10e9, -10e9)) +def test_out_of_bounds_seed(seed): + """ + Ensure that we still fix the seed even if an out-of-bounds seed is given + """ + with pytest.warns(UserWarning, match="is not in bounds"): + actual = seed_utils.seed_everything(seed) + assert actual == 123 From d30e316a35c0246296a4f62cf964f131759e851f Mon Sep 17 00:00:00 2001 From: ananthsub Date: Tue, 12 Jan 2021 00:42:23 -0800 Subject: [PATCH 093/136] [docs] Add ananthsub to core (#5476) * Update test_manual_optimization.py * Update governance.rst * Update test_manual_optimization.py * Update test_manual_optimization.py --- docs/source/governance.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/governance.rst b/docs/source/governance.rst index 74d24e306d3f9..22fba33771c0a 100644 --- a/docs/source/governance.rst +++ b/docs/source/governance.rst @@ -25,3 +25,4 @@ Core Maintainers - Jeff Yang (`ydcjeff `_) - Roger Shieh (`s-rog `_) - Carlos Mocholí (`carmocca `_) +- Ananth Subramaniam (`ananthsub `_) From 9611a7f8976657ae36b7c2af61178f5b80f5ce81 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Tue, 12 Jan 2021 13:56:20 +0100 Subject: [PATCH 094/136] update nightly & upgrade Twine (#5458) * update used Twine * . * . * install * install * . * . * . * . * . * . 
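As a quick illustration of the seed handling introduced in patch 092 above: a minimal usage sketch, assuming only the `pytorch_lightning.utilities.seed` module as modified in that diff (the module path and behaviour come from the patch; the rest is illustrative).

    import os

    from pytorch_lightning.utilities.seed import seed_everything

    # No argument and no PL_GLOBAL_SEED in the environment: a random seed is
    # picked, a "No correct seed found" warning is emitted once, and the value
    # is exported to PL_GLOBAL_SEED.
    first = seed_everything()

    # A second call re-reads PL_GLOBAL_SEED instead of re-seeding, so the run
    # stays reproducible and no further warning is raised.
    second = seed_everything()
    assert first == second == int(os.environ["PL_GLOBAL_SEED"])

    # An explicit, in-bounds seed is used as-is and exported the same way.
    assert seed_everything(2020) == 2020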
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- .github/prepare-nightly_pkg-name.py | 12 ------------ .github/prepare-nightly_version.py | 12 ++++++------ .github/workflows/ci_pkg-install.yml | 28 ++++++++++++++++----------- .github/workflows/ci_test-base.yml | 2 +- .github/workflows/ci_test-conda.yml | 2 +- .github/workflows/ci_test-full.yml | 2 +- .github/workflows/ci_test-tpu.yml | 2 +- .github/workflows/code-formatting.yml | 2 +- .github/workflows/docs-checks.yml | 2 +- .github/workflows/release-docker.yml | 2 +- .github/workflows/release-pypi.yml | 2 +- LICENSE | 2 +- pytorch_lightning/__init__.py | 10 ++++++---- requirements/test.txt | 2 +- 14 files changed, 39 insertions(+), 43 deletions(-) delete mode 100644 .github/prepare-nightly_pkg-name.py diff --git a/.github/prepare-nightly_pkg-name.py b/.github/prepare-nightly_pkg-name.py deleted file mode 100644 index b85f6049ac140..0000000000000 --- a/.github/prepare-nightly_pkg-name.py +++ /dev/null @@ -1,12 +0,0 @@ -import os -import re - -PATH_ROOT = os.path.abspath(os.path.dirname(os.path.dirname(__file__))) - -PATH_SETUP = os.path.join(PATH_ROOT, 'setup.py') -print(f"rename package '{PATH_SETUP}'") -with open(PATH_SETUP, 'r') as fp: - setup = fp.read() -setup = re.sub(r'name=[\'"]pytorch-lightning[\'"]', 'name="pytorch-lightning-nightly"', setup) -with open(PATH_SETUP, 'w') as fp: - fp.write(setup) diff --git a/.github/prepare-nightly_version.py b/.github/prepare-nightly_version.py index 22b72c8d6803c..f830cc469905c 100644 --- a/.github/prepare-nightly_version.py +++ b/.github/prepare-nightly_version.py @@ -2,15 +2,15 @@ import os import re -PATH_ROOT = os.path.abspath(os.path.dirname(os.path.dirname(__file__))) +_PATH_ROOT = os.path.abspath(os.path.dirname(os.path.dirname(__file__))) +_PATH_INIT = os.path.join(_PATH_ROOT, 'pytorch_lightning', '__init__.py') # get today date now = datetime.datetime.now() now_date = now.strftime("%Y%m%d") -PATH_INIT = os.path.join(PATH_ROOT, 'pytorch_lightning', '__init__.py') -print(f"prepare init '{PATH_INIT}' - replace version by {now_date}") -with open(PATH_INIT, 'r') as fp: +print(f"prepare init '{_PATH_INIT}' - replace version by {now_date}") +with open(_PATH_INIT, 'r') as fp: init = fp.read() -init = re.sub(r'__version__ = [\d\.rc\'"]+', f'__version__ = "{now_date}"', init) -with open(PATH_INIT, 'w') as fp: +init = re.sub(r'__version__ = [\d\.\w\'"]+', f'__version__ = "{now_date}"', init) +with open(_PATH_INIT, 'w') as fp: fp.write(init) diff --git a/.github/workflows/ci_pkg-install.yml b/.github/workflows/ci_pkg-install.yml index 4d70beddf3f1b..54c9f5c007c82 100644 --- a/.github/workflows/ci_pkg-install.yml +++ b/.github/workflows/ci_pkg-install.yml @@ -3,7 +3,7 @@ name: Install pkg # see: https://help.github.com/en/actions/reference/events-that-trigger-workflows on: # Trigger the workflow on push or pull request, but only for the master branch push: - branches: [master, "release/*"] # include release branches like release/1.0.x + branches: [master, "release/*"] pull_request: branches: [master, "release/*"] @@ -27,13 +27,13 @@ jobs: - name: Prepare env run: | - pip install check-manifest "twine==1.13.0" + pip install check-manifest "twine==3.2" setuptools wheel - name: Create package run: | check-manifest # python setup.py check --metadata --strict - python setup.py sdist + python setup.py sdist bdist_wheel - name: Check package run: | @@ -46,12 +46,18 @@ jobs: # this is just a hotfix because of Win cannot install it directly pip install -r 
requirements.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html - - name: Install package + - name: Install | Uninstall package - archive + run: | + # install as archive + pip install dist/*.tar.gz + cd .. + python -c "import pytorch_lightning as pl ; print(pl.__version__)" + pip uninstall -y pytorch-lightning + + - name: Install | Uninstall package - wheel run: | - # pip install virtualenv - # virtualenv vEnv --system-site-packages - # source vEnv/bin/activate - pip install dist/* - cd .. & python -c "import pytorch_lightning as pl ; print(pl.__version__)" - # deactivate - # rm -rf vEnv + # install as wheel + pip install dist/*.whl + cd .. + python -c "import pytorch_lightning as pl ; print(pl.__version__)" + pip uninstall -y pytorch-lightning \ No newline at end of file diff --git a/.github/workflows/ci_test-base.yml b/.github/workflows/ci_test-base.yml index d1ef75db942e8..ed8a2e30949b7 100644 --- a/.github/workflows/ci_test-base.yml +++ b/.github/workflows/ci_test-base.yml @@ -3,7 +3,7 @@ name: CI basic testing # see: https://help.github.com/en/actions/reference/events-that-trigger-workflows on: # Trigger the workflow on push or pull request, but only for the master branch push: - branches: [master, "release/*"] # include release branches like release/1.0.x + branches: [master, "release/*"] pull_request: branches: [master, "release/*"] diff --git a/.github/workflows/ci_test-conda.yml b/.github/workflows/ci_test-conda.yml index 284a9792090e8..3faceb296eb1d 100644 --- a/.github/workflows/ci_test-conda.yml +++ b/.github/workflows/ci_test-conda.yml @@ -3,7 +3,7 @@ name: PyTorch & Conda # see: https://help.github.com/en/actions/reference/events-that-trigger-workflows on: # Trigger the workflow on push or pull request, but only for the master branch push: - branches: [master, "release/*"] # include release branches like release/1.0.x + branches: [master, "release/*"] pull_request: branches: [master, "release/*"] diff --git a/.github/workflows/ci_test-full.yml b/.github/workflows/ci_test-full.yml index 3eb8ed8409f64..bf7c9aba8f3c2 100644 --- a/.github/workflows/ci_test-full.yml +++ b/.github/workflows/ci_test-full.yml @@ -3,7 +3,7 @@ name: CI complete testing # see: https://help.github.com/en/actions/reference/events-that-trigger-workflows on: # Trigger the workflow on push or pull request, but only for the master branch push: - branches: [master, "release/*"] # include release branches like release/1.0.x + branches: [master, "release/*"] pull_request: branches: [master, "release/*"] diff --git a/.github/workflows/ci_test-tpu.yml b/.github/workflows/ci_test-tpu.yml index ec2a976ea98e5..b1abcfe123201 100644 --- a/.github/workflows/ci_test-tpu.yml +++ b/.github/workflows/ci_test-tpu.yml @@ -2,7 +2,7 @@ name: TPU tests on: push: - branches: [master, "release/*"] # include release branches like release/1.0.x + branches: [master, "release/*"] # TODO: temporal disable TPU testing until we find way how to pass credentials to forked PRs # pull_request: # branches: diff --git a/.github/workflows/code-formatting.yml b/.github/workflows/code-formatting.yml index 3e2f296226a48..0210e3ceb603a 100644 --- a/.github/workflows/code-formatting.yml +++ b/.github/workflows/code-formatting.yml @@ -2,7 +2,7 @@ name: "Check Code Format" on: # Trigger the workflow on push or pull request, but only for the master branch push: - branches: [master, "release/*"] # include release branches like release/1.0.x + branches: [master, "release/*"] pull_request: branches: [master, "release/*"] diff --git 
a/.github/workflows/docs-checks.yml b/.github/workflows/docs-checks.yml index 247c5cf61f9c1..1857ebc8dabea 100644 --- a/.github/workflows/docs-checks.yml +++ b/.github/workflows/docs-checks.yml @@ -3,7 +3,7 @@ name: "Docs check" on: # Trigger the workflow on push or pull request, but only for the master branch push: - branches: [master, "release/*"] # include release branches like release/1.0.x + branches: [master, "release/*"] pull_request: branches: [master, "release/*"] diff --git a/.github/workflows/release-docker.yml b/.github/workflows/release-docker.yml index 3543891cf7698..fee3bbebbee84 100644 --- a/.github/workflows/release-docker.yml +++ b/.github/workflows/release-docker.yml @@ -3,7 +3,7 @@ name: Publish Docker Releases # https://github.com/docker/build-push-action on: push: - branches: [master, "release/*"] # include release branches like release/1.0.x + branches: [master, "release/*"] release: types: [created] diff --git a/.github/workflows/release-pypi.yml b/.github/workflows/release-pypi.yml index b0310c3d36ccc..9b2bc0699eeb6 100644 --- a/.github/workflows/release-pypi.yml +++ b/.github/workflows/release-pypi.yml @@ -3,7 +3,7 @@ name: PyPI Release # https://help.github.com/en/actions/reference/events-that-trigger-workflows on: # Trigger the workflow on push or pull request, but only for the master branch push: - branches: [master, "release/*"] # include release branches like release/1.0.x + branches: [master, "release/*"] release: types: [created] diff --git a/LICENSE b/LICENSE index b9181e1a6e5d8..2e66bec2e791c 100644 --- a/LICENSE +++ b/LICENSE @@ -186,7 +186,7 @@ same "printed page" as the copyright notice for easier identification within third-party archives. - Copyright 2018-2020 William Falcon + Copyright 2018-2021 William Falcon Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/pytorch_lightning/__init__.py b/pytorch_lightning/__init__.py index 5f7ae6bdee9d2..2e35207d05cb1 100644 --- a/pytorch_lightning/__init__.py +++ b/pytorch_lightning/__init__.py @@ -1,10 +1,15 @@ """Root package info.""" +import logging as python_logging +import os +import time + +_this_year = time.strftime("%Y") __version__ = '1.1.3' __author__ = 'William Falcon et al.' __author_email__ = 'waf2107@columbia.edu' __license__ = 'Apache-2.0' -__copyright__ = 'Copyright (c) 2018-2020, %s.' % __author__ +__copyright__ = f'Copyright (c) 2018-{_this_year}, {__author__}.' __homepage__ = 'https://github.com/PyTorchLightning/pytorch-lightning' # this has to be simple string, see: https://github.com/pypa/twine/issues/522 __docs__ = ( @@ -33,9 +38,6 @@ - https://pytorch-lightning.readthedocs.io/en/stable """ -import logging as python_logging -import os - _logger = python_logging.getLogger("lightning") _logger.addHandler(python_logging.StreamHandler()) _logger.setLevel(python_logging.INFO) diff --git a/requirements/test.txt b/requirements/test.txt index 632f40e0287b4..e9226139d9287 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -6,7 +6,7 @@ pytest>=5.0 flake8>=3.6 flake8-black check-manifest -twine==1.13.0 +twine==3.2 # scipy>=0.13.3 scikit-learn>=0.22.2 scikit-image>=0.17.2 From c00d5709c4fb40259ad05fe827e24ad421ce2da2 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Tue, 12 Jan 2021 15:53:41 +0100 Subject: [PATCH 095/136] ci: update recurent events (#5480) * ci: update recurent events * split events * . * . 
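One detail in patch 094 above worth spelling out: `.github/prepare-nightly_version.py` widens its substitution pattern from `[\d\.rc\'"]+` to `[\d\.\w\'"]+`. A small, self-contained sketch of the difference (the input string and date below are made up for illustration):

    import re

    now_date = "20210112"  # stand-in for the datetime-derived value
    line = "__version__ = '1.1.4dev'"  # hypothetical dev-suffixed version

    old = re.sub(r'__version__ = [\d\.rc\'"]+', f'__version__ = "{now_date}"', line)
    new = re.sub(r'__version__ = [\d\.\w\'"]+', f'__version__ = "{now_date}"', line)

    # The old character class stops at the 'd' of 'dev' and leaves a dangling
    # suffix; the widened \w class consumes the whole version literal.
    print(old)  # __version__ = "20210112"dev'
    print(new)  # __version__ = "20210112"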
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- .github/workflows/events-ocasional.yml | 26 ++++++++++++++++++++++++++ .github/workflows/events-recurrent.yml | 19 +------------------ 2 files changed, 27 insertions(+), 18 deletions(-) create mode 100644 .github/workflows/events-ocasional.yml diff --git a/.github/workflows/events-ocasional.yml b/.github/workflows/events-ocasional.yml new file mode 100644 index 0000000000000..a6cd43a8371ea --- /dev/null +++ b/.github/workflows/events-ocasional.yml @@ -0,0 +1,26 @@ +name: Ocasional events + +on: + push: + branches: [master, "release/*"] + pull_request_target: {} + +jobs: + + # autoupdate is a GitHub Action that auto-updates pull requests branches whenever changes land on their destination branch. + # see: https://github.com/marketplace/actions/auto-update + pr-auto-update: + name: Auto-update PR + runs-on: ubuntu-18.04 + steps: + - uses: docker://chinthakagodawita/autoupdate-action:v1 + # todo: this shall be resolved with https://github.com/chinthakagodawita/autoupdate/issues/100 + continue-on-error: true + env: + GITHUB_TOKEN: "${{ secrets.GITHUB_TOKEN }}" + DRY_RUN: "false" + PR_FILTER: "labelled" + PR_LABELS: "0:] Ready-To-Go,has conflicts" + MERGE_MSG: "Branch was auto-updated." + RETRY_COUNT: "3" + RETRY_SLEEP: "500" diff --git a/.github/workflows/events-recurrent.yml b/.github/workflows/events-recurrent.yml index 30ccbcd11a9d7..6b9382e29901e 100644 --- a/.github/workflows/events-recurrent.yml +++ b/.github/workflows/events-recurrent.yml @@ -1,8 +1,7 @@ name: Recurrent events on: - push: - branches: [master, "release/*"] + push: {} jobs: @@ -19,19 +18,3 @@ jobs: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} MAX_RETRIES: 3 WAIT_MS: 5000 - - # autoupdate is a GitHub Action that auto-updates pull requests branches whenever changes land on their destination branch. - # see: https://github.com/marketplace/actions/auto-update - pr-auto-update: - name: Auto-update PR - runs-on: ubuntu-18.04 - steps: - - uses: docker://chinthakagodawita/autoupdate-action:v1 - env: - GITHUB_TOKEN: "${{ secrets.GITHUB_TOKEN }}" - DRY_RUN: "false" - PR_FILTER: "labelled" - PR_LABELS: "0:] Ready-To-Go" - MERGE_MSG: "Branch was auto-updated." - RETRY_COUNT: "3" - RETRY_SLEEP: "500" From 652df1886abc63820d956aef7a468ff4cf1bea7f Mon Sep 17 00:00:00 2001 From: Sean Naren Date: Tue, 12 Jan 2021 18:53:10 +0000 Subject: [PATCH 096/136] Increment version, update CHANGELOG.md (#5482) --- CHANGELOG.md | 14 +++++--------- pytorch_lightning/__init__.py | 2 +- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3c32b93cc0dec..066745b4c57cb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,22 +9,18 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
### Added +- Add automatic optimization property setter to lightning module ([#5169](https://github.com/PyTorchLightning/pytorch-lightning/pull/5169)) ### Changed -- Changed depreceated `enable_pl_optimizer=True` ([#5244](https://github.com/PyTorchLightning/pytorch-lightning/pull/5244)) - - -### Deprecated - - -### Removed - +- Changed deprecated `enable_pl_optimizer=True` ([#5244](https://github.com/PyTorchLightning/pytorch-lightning/pull/5244)) ### Fixed - Fixed `transfer_batch_to_device` for DDP with `len(devices_ids) == 1` ([#5195](https://github.com/PyTorchLightning/pytorch-lightning/pull/5195)) - +- Logging only on `not should_accumulate()` during training ([#5417](https://github.com/PyTorchLightning/pytorch-lightning/pull/5417)) +- Resolve interpolation bug with Hydra ([#5406](https://github.com/PyTorchLightning/pytorch-lightning/pull/5406)) +- Check environ before selecting a seed to prevent warning message ([#4743](https://github.com/PyTorchLightning/pytorch-lightning/pull/4743)) ## [1.1.3] - 2021-01-05 diff --git a/pytorch_lightning/__init__.py b/pytorch_lightning/__init__.py index 2e35207d05cb1..1f672dcd9aac8 100644 --- a/pytorch_lightning/__init__.py +++ b/pytorch_lightning/__init__.py @@ -5,7 +5,7 @@ import time _this_year = time.strftime("%Y") -__version__ = '1.1.3' +__version__ = '1.1.4' __author__ = 'William Falcon et al.' __author_email__ = 'waf2107@columbia.edu' __license__ = 'Apache-2.0' From 1f6236accce78303249c55de656b71501e607d1a Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Tue, 12 Jan 2021 22:35:43 +0100 Subject: [PATCH 097/136] fix generate checkpoint (#5489) --- .github/workflows/release-pypi.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/release-pypi.yml b/.github/workflows/release-pypi.yml index 9b2bc0699eeb6..aaf5ef1c2e2cf 100644 --- a/.github/workflows/release-pypi.yml +++ b/.github/workflows/release-pypi.yml @@ -94,7 +94,7 @@ jobs: run: | virtualenv vEnv --system-site-packages source vEnv/bin/activate - pip install dist/* + pip install dist/*.whl pl_ver=$(python -c "import pytorch_lightning as pl ; print(pl.__version__)" 2>&1) # generate checkpoint to this version From 1ec1d3e997e15a1f4f8dac72b856574aee20f63a Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Wed, 13 Jan 2021 04:23:43 +0530 Subject: [PATCH 098/136] update tests with new auto_opt api (#5466) * update tests with new auto_opt api * Apply suggestions from code review Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: Jirka Borovec --- tests/core/test_lightning_optimizer.py | 14 ++- tests/plugins/test_ddp_sequential_plugin.py | 12 +-- .../dynamic_args/test_multiple_optimizers.py | 11 ++- .../optimization/test_manual_optimization.py | 95 +++++++++---------- 4 files changed, 63 insertions(+), 69 deletions(-) diff --git a/tests/core/test_lightning_optimizer.py b/tests/core/test_lightning_optimizer.py index 171cf00ad4f66..1c49844b5764b 100644 --- a/tests/core/test_lightning_optimizer.py +++ b/tests/core/test_lightning_optimizer.py @@ -89,6 +89,9 @@ def test_lightning_optimizer_manual_optimization(mock_sgd_step, mock_adam_step, Test that the user can use our LightningOptimizer. Not recommended for now. 
""" class TestModel(BoringModel): + def __init__(self): + super().__init__() + self.automatic_optimization = False def training_step(self, batch, batch_idx, optimizer_idx=None): (opt_1, opt_2) = self.optimizers() @@ -114,10 +117,6 @@ def configure_optimizers(self): lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer_1, step_size=1) return [optimizer_1, optimizer_2], [lr_scheduler] - @property - def automatic_optimization(self) -> bool: - return False - model = TestModel() model.training_step_end = None model.training_epoch_end = None @@ -141,6 +140,9 @@ def test_lightning_optimizer_manual_optimization_and_accumulated_gradients(mock_ Test that the user can use our LightningOptimizer. Not recommended. """ class TestModel(BoringModel): + def __init__(self): + super().__init__() + self.automatic_optimization = False def training_step(self, batch, batch_idx, optimizer_idx=None): (opt_1, opt_2) = self.optimizers() @@ -166,10 +168,6 @@ def configure_optimizers(self): lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer_1, step_size=1) return [optimizer_1, optimizer_2], [lr_scheduler] - @property - def automatic_optimization(self) -> bool: - return False - model = TestModel() model.training_step_end = None model.training_epoch_end = None diff --git a/tests/plugins/test_ddp_sequential_plugin.py b/tests/plugins/test_ddp_sequential_plugin.py index 8b21c36e73065..0f5b78f71da50 100644 --- a/tests/plugins/test_ddp_sequential_plugin.py +++ b/tests/plugins/test_ddp_sequential_plugin.py @@ -149,6 +149,7 @@ class SequentialModelRPCManual(LightningModule): def __init__(self): super().__init__() self.sequential_module = nn.Sequential(torch.nn.Linear(32, 32), nn.ReLU(), nn.Linear(32, 2)) + self.automatic_optimization = False def forward(self, x): return self.sequential_module(x) @@ -195,19 +196,14 @@ def val_dataloader(self): def test_dataloader(self): return torch.utils.data.DataLoader(RandomDataset(32, 64)) - @property - def automatic_optimization(self) -> bool: - return False - class SequentialModelRPCAutomatic(SequentialModelRPCManual): + def __init__(self): + super().__init__() + self.automatic_optimization = True def training_step(self, batch, batch_idx): output = self.sequential_module(batch) loss = self.loss(output) self.log("train_loss", loss, on_epoch=True, prog_bar=True) return loss - - @property - def automatic_optimization(self) -> bool: - return True diff --git a/tests/trainer/dynamic_args/test_multiple_optimizers.py b/tests/trainer/dynamic_args/test_multiple_optimizers.py index 48b1bf6ab7ac9..6b8219c673009 100644 --- a/tests/trainer/dynamic_args/test_multiple_optimizers.py +++ b/tests/trainer/dynamic_args/test_multiple_optimizers.py @@ -11,9 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import torch + from pytorch_lightning import Trainer from tests.base.boring_model import BoringModel -import torch def test_multiple_optimizers(tmpdir): @@ -68,6 +69,10 @@ def test_multiple_optimizers_manual(tmpdir): Tests that only training_step can be used """ class TestModel(BoringModel): + def __init__(self): + super().__init__() + self.automatic_optimization = False + def on_train_epoch_start(self) -> None: self.opt_0_seen = False self.opt_1_seen = False @@ -97,10 +102,6 @@ def configure_optimizers(self): optimizer_2 = torch.optim.SGD(self.layer.parameters(), lr=0.1) return optimizer, optimizer_2 - @property - def automatic_optimization(self) -> bool: - return False - model = TestModel() model.val_dataloader = None diff --git a/tests/trainer/optimization/test_manual_optimization.py b/tests/trainer/optimization/test_manual_optimization.py index 338769f2ee40f..50463f5c4b5e2 100644 --- a/tests/trainer/optimization/test_manual_optimization.py +++ b/tests/trainer/optimization/test_manual_optimization.py @@ -99,6 +99,10 @@ def test_multiple_optimizers_manual_return(tmpdir): Tests that only training_step can be used """ class TestModel(BoringModel): + def __init__(self): + super().__init__() + self.automatic_optimization = False + def training_step(self, batch, batch_idx, optimizer_idx): # manual (opt_a, opt_b) = self.optimizers() @@ -137,10 +141,6 @@ def configure_optimizers(self): optimizer_2 = torch.optim.SGD(self.layer.parameters(), lr=0.1) return optimizer, optimizer_2 - @property - def automatic_optimization(self) -> bool: - return False - model = TestModel() model.val_dataloader = None @@ -166,6 +166,10 @@ def test_multiple_optimizers_manual_return_and_log(tmpdir): Tests that only training_step can be used """ class TestModel(BoringModel): + def __init__(self): + super().__init__() + self.automatic_optimization = False + def training_step(self, batch, batch_idx, optimizer_idx): # manual (opt_a, opt_b) = self.optimizers() @@ -205,10 +209,6 @@ def configure_optimizers(self): optimizer_2 = torch.optim.SGD(self.layer.parameters(), lr=0.1) return optimizer, optimizer_2 - @property - def automatic_optimization(self) -> bool: - return False - model = TestModel() model.val_dataloader = None @@ -239,6 +239,10 @@ def test_multiple_optimizers_manual_native_amp(tmpdir): Tests that only training_step can be used """ class TestModel(BoringModel): + def __init__(self): + super().__init__() + self.automatic_optimization = False + def training_step(self, batch, batch_idx, optimizer_idx): # manual (opt_a, opt_b) = self.optimizers() @@ -275,10 +279,6 @@ def configure_optimizers(self): optimizer_2 = torch.optim.SGD(self.layer.parameters(), lr=0.1) return optimizer, optimizer_2 - @property - def automatic_optimization(self) -> bool: - return False - model = TestModel() model.val_dataloader = None @@ -308,6 +308,10 @@ def test_multiple_optimizers_manual_apex(tmpdir): Tests that only training_step can be used """ class TestModel(BoringModel): + def __init__(self): + super().__init__() + self.automatic_optimization = False + def training_step(self, batch, batch_idx, optimizer_idx): # manual (opt_a, opt_b) = self.optimizers() @@ -348,10 +352,6 @@ def configure_optimizers(self): optimizer_2 = torch.optim.SGD(self.layer.parameters(), lr=0.1) return optimizer, optimizer_2 - @property - def automatic_optimization(self) -> bool: - return False - model = TestModel() model.val_dataloader = None @@ -381,6 +381,10 @@ class ManualOptimizationExtendedModel(BoringModel): called = collections.defaultdict(int) detach = 
False + def __init__(self): + super().__init__() + self.automatic_optimization = False + @property def should_update(self): return self.count % 2 == 0 @@ -428,10 +432,6 @@ def on_train_end(self): assert self.called["on_train_batch_start"] == 10 assert self.called["on_train_batch_end"] == 10 - @property - def automatic_optimization(self) -> bool: - return False - @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") @@ -503,6 +503,10 @@ class ExtendedModel(BoringModel): called = collections.defaultdict(int) detach = False + def __init__(self): + super().__init__() + self.automatic_optimization = False + @property def should_update(self): return self.count % 2 == 0 @@ -555,10 +559,6 @@ def on_train_end(self): assert self.called["on_train_batch_start"] == 20 assert self.called["on_train_batch_end"] == 20 - @property - def automatic_optimization(self) -> bool: - return False - model = ExtendedModel() model.training_step_end = None model.training_epoch_end = None @@ -587,6 +587,10 @@ class TestModel(BoringModel): called = False + def __init__(self): + super().__init__() + self.automatic_optimization = False + def on_after_backward(self): self.called = True norm = torch.nn.utils.clip_grad_norm_(self.parameters(), 2) @@ -629,10 +633,6 @@ def configure_optimizers(self): optimizer_2 = torch.optim.SGD(self.layer.parameters(), lr=0.1) return optimizer, optimizer_2 - @property - def automatic_optimization(self) -> bool: - return False - model = TestModel() model.val_dataloader = None @@ -666,6 +666,10 @@ class TestModel(BoringModel): _losses = [] + def __init__(self): + super().__init__() + self.automatic_optimization = False + def training_step(self, batch, batch_idx): # manual @@ -713,10 +717,6 @@ def configure_optimizers(self): optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1) return optimizer - @property - def automatic_optimization(self) -> bool: - return False - model = TestModel() model.val_dataloader = None model.training_epoch_end = None @@ -743,6 +743,10 @@ def test_step_with_optimizer_closure_and_accumulated_grad(tmpdir): os.environ['PL_DEV_DEBUG'] = '1' class TestModel(BoringModel): + def __init__(self): + super().__init__() + self.automatic_optimization = False + def training_step(self, batch, batch_idx): # manual opt = self.optimizers() @@ -776,10 +780,6 @@ def configure_optimizers(self): optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1) return optimizer - @property - def automatic_optimization(self) -> bool: - return False - model = TestModel() model.val_dataloader = None model.training_epoch_end = None @@ -806,6 +806,10 @@ def test_step_with_optimizer_closure_and_extra_arguments(step_mock, tmpdir): os.environ['PL_DEV_DEBUG'] = '1' class TestModel(BoringModel): + def __init__(self): + super().__init__() + self.automatic_optimization = False + def training_step(self, batch, batch_idx): # manual opt = self.optimizers() @@ -831,10 +835,6 @@ def configure_optimizers(self): optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1) return optimizer - @property - def automatic_optimization(self) -> bool: - return False - model = TestModel() model.val_dataloader = None model.training_epoch_end = None @@ -863,6 +863,10 @@ def test_step_with_optimizer_closure_with_different_frequencies(mock_sgd_step, m os.environ['PL_DEV_DEBUG'] = '1' class TestModel(BoringModel): + def __init__(self): + super().__init__() + self.automatic_optimization = False + 
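All of the test changes in patch 098 follow the same pattern: the `automatic_optimization` property override is replaced by setting the attribute in `__init__`, enabled by the property setter noted in the CHANGELOG above. A minimal module written against that updated API, as a sketch only (the layer, loss and optimizer here are illustrative, not taken from the test suite):

    import torch
    from pytorch_lightning import LightningModule


    class ManualOptModule(LightningModule):
        def __init__(self):
            super().__init__()
            self.layer = torch.nn.Linear(32, 2)
            # new style: plain attribute assignment instead of overriding the property
            self.automatic_optimization = False

        def training_step(self, batch, batch_idx):
            opt = self.optimizers()
            loss = self.layer(batch).sum()
            self.manual_backward(loss, opt)
            opt.step()
            opt.zero_grad()

        def configure_optimizers(self):
            return torch.optim.SGD(self.layer.parameters(), lr=0.1)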
def training_step(self, batch, batch_idx, optimizer_idx): # emulate gans training @@ -908,10 +912,6 @@ def configure_optimizers(self): optimizer_dis = torch.optim.Adam(self.layer.parameters(), lr=0.001) return [optimizer_gen, optimizer_dis] - @property - def automatic_optimization(self) -> bool: - return False - model = TestModel() model.val_dataloader = None model.training_epoch_end = None @@ -944,6 +944,9 @@ def test_step_with_optimizer_closure_with_different_frequencies_ddp(mock_sgd_ste os.environ['PL_DEV_DEBUG'] = '1' class TestModel(BoringModel): + def __init__(self): + super().__init__() + self.automatic_optimization = False def loss_ones(self, batch, prediction): # An arbitrary loss to have a loss that updates the model weights during `Trainer.fit` calls @@ -1013,10 +1016,6 @@ def configure_optimizers(self): optimizer_dis = torch.optim.Adam(self.layer.parameters(), lr=0.001) return [optimizer_gen, optimizer_dis] - @property - def automatic_optimization(self) -> bool: - return False - seed_everything(42) model = TestModel() From a9377e34cb5c202363b239e31eb703efa5ec1abd Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Wed, 13 Jan 2021 05:06:13 +0530 Subject: [PATCH 099/136] [Docs] fix on_after_backward example (#5278) * fix on_after_backward docs * doc fix --- pytorch_lightning/core/hooks.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/core/hooks.py b/pytorch_lightning/core/hooks.py index 4a1eeb4e9f608..a87ebbeb47199 100644 --- a/pytorch_lightning/core/hooks.py +++ b/pytorch_lightning/core/hooks.py @@ -285,10 +285,9 @@ def on_after_backward(self): if self.trainer.global_step % 25 == 0: # don't make the tf file huge params = self.state_dict() for k, v in params.items(): - grads = v - name = k - self.logger.experiment.add_histogram(tag=name, values=grads, - global_step=self.trainer.global_step) + self.logger.experiment.add_histogram( + tag=k, values=v.grad, global_step=self.trainer.global_step + ) """ From 36198ec128b205799ad86615c56fc45df6b85348 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 13 Jan 2021 13:53:19 +0100 Subject: [PATCH 100/136] fix typo in multi-gpu docs (#5402) Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- docs/source/multi_gpu.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/multi_gpu.rst b/docs/source/multi_gpu.rst index 7c8bba4621c5b..fff32850b9466 100644 --- a/docs/source/multi_gpu.rst +++ b/docs/source/multi_gpu.rst @@ -362,7 +362,7 @@ project module) you can use the following method: .. 
code-block:: python # train on 8 GPUs (same machine (ie: node)) - trainer = Trainer(gpus=8, accelerator='ddp') + trainer = Trainer(gpus=8, accelerator='ddp_spawn') We STRONGLY discourage this use because it has limitations (due to Python and PyTorch): From 4c7880445cb5405da4ca772c57fc06c7762f9392 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Wed, 13 Jan 2021 15:35:58 +0100 Subject: [PATCH 101/136] fix auto-label conditions (#5496) * fix auto-label conditions * labels Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- .github/workflows/events-ocasional.yml | 2 +- .github/workflows/events-recurrent.yml | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/events-ocasional.yml b/.github/workflows/events-ocasional.yml index a6cd43a8371ea..0ddec73eeffb6 100644 --- a/.github/workflows/events-ocasional.yml +++ b/.github/workflows/events-ocasional.yml @@ -20,7 +20,7 @@ jobs: GITHUB_TOKEN: "${{ secrets.GITHUB_TOKEN }}" DRY_RUN: "false" PR_FILTER: "labelled" - PR_LABELS: "0:] Ready-To-Go,has conflicts" + PR_LABELS: "0:] Ready-To-Go" MERGE_MSG: "Branch was auto-updated." RETRY_COUNT: "3" RETRY_SLEEP: "500" diff --git a/.github/workflows/events-recurrent.yml b/.github/workflows/events-recurrent.yml index 6b9382e29901e..74839696c6dd1 100644 --- a/.github/workflows/events-recurrent.yml +++ b/.github/workflows/events-recurrent.yml @@ -2,6 +2,9 @@ name: Recurrent events on: push: {} + pull_request: + types: [synchronize] + pull_request_target: {} jobs: From 83b1ff421dbc1a1c07786e322489086b055ffb01 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Wed, 13 Jan 2021 20:02:22 +0100 Subject: [PATCH 102/136] pipeline release CI (#5494) * pipeline release CI * trigger * trigger * . * t1 * t2 * t1 * t2 --- .github/workflows/release-pypi.yml | 35 ++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/.github/workflows/release-pypi.yml b/.github/workflows/release-pypi.yml index aaf5ef1c2e2cf..80594180abd09 100644 --- a/.github/workflows/release-pypi.yml +++ b/.github/workflows/release-pypi.yml @@ -10,9 +10,8 @@ on: # Trigger the workflow on push or pull request, but only for the master bra jobs: # based on https://github.com/pypa/gh-action-pypi-publish - build-publish: + build-package: runs-on: ubuntu-20.04 - steps: - uses: actions/checkout@v2 - uses: actions/setup-python@v2 @@ -28,6 +27,16 @@ jobs: python setup.py sdist bdist_wheel ls -lh dist/ + - uses: actions/upload-artifact@v2 + with: + name: pypi-packages + path: dist + + publish-package: + runs-on: ubuntu-20.04 + needs: build-package + steps: + - uses: actions/checkout@v2 - name: Upload to release if: startsWith(github.event.ref, 'refs/tags') || github.event_name == 'release' uses: svenstaro/upload-release-action@v2 @@ -62,6 +71,14 @@ jobs: user: __token__ password: ${{ secrets.pypi_password }} + create-legacy-ckpt: + runs-on: ubuntu-20.04 + needs: [build-package, publish-package] + steps: + - uses: actions/checkout@v2 + - uses: actions/setup-python@v2 + with: + python-version: 3.7 # Note: This uses an internal pip API and may not always work # https://github.com/actions/cache/blob/master/examples.md#multiple-oss-in-a-workflow - name: Cache pip @@ -74,7 +91,6 @@ jobs: - name: Install dependencies run: | pip install -r requirements.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --quiet - pip install virtualenv pip install awscli - name: Configure AWS credentials @@ -84,25 +100,26 @@ jobs: aws-secret-access-key: ${{ 
secrets.AWS_SECRET_KEY_ID }} aws-region: us-east-1 + - uses: actions/download-artifact@v2 + with: + name: pypi-packages + path: dist + - name: Pull files from S3 run: | aws s3 cp --recursive s3://pl-public-data/legacy/checkpoints/ legacy/checkpoints/ # --acl public-read ls -l legacy/checkpoints/ - name: Generate checkpoint - if: startsWith(github.event.ref, 'refs/tags') || github.event_name == 'release' + # if: startsWith(github.event.ref, 'refs/tags') || github.event_name == 'release' run: | - virtualenv vEnv --system-site-packages - source vEnv/bin/activate + ls -lh dist/ pip install dist/*.whl pl_ver=$(python -c "import pytorch_lightning as pl ; print(pl.__version__)" 2>&1) # generate checkpoint to this version bash legacy/generate_checkpoints.sh $pl_ver - deactivate - rm -rf vEnv - - name: Push files to S3 run: | aws s3 sync legacy/checkpoints/ s3://pl-public-data/legacy/checkpoints/ From d916973cdc8bffe8c8a07cd29d8be681f78ef62d Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Thu, 14 Jan 2021 02:00:01 +0530 Subject: [PATCH 103/136] Refactor setup_training and remove test_mode (#5388) * ref and fix call for on_pretrained_routine * avoid failing tests * unnecessary_call * unnecessary call in accelerators * tmpdir * rm test_mode * pep * updates * more ref * Revert "more ref" This reverts commit 5d9e95f87343a4d9853eb30ca883d1dbfba369c6. * more refac Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- pytorch_lightning/accelerators/accelerator.py | 5 ++ .../accelerators/cpu_accelerator.py | 10 --- .../accelerators/ddp2_accelerator.py | 6 +- .../accelerators/ddp_accelerator.py | 6 +- .../accelerators/ddp_cpu_spawn_accelerator.py | 6 +- .../accelerators/ddp_hpc_accelerator.py | 6 +- .../accelerators/ddp_spawn_accelerator.py | 6 +- .../accelerators/dp_accelerator.py | 10 --- .../accelerators/gpu_accelerator.py | 10 --- .../accelerators/horovod_accelerator.py | 3 +- .../accelerators/tpu_accelerator.py | 3 +- .../connectors/checkpoint_connector.py | 6 +- .../logger_connector/logger_connector.py | 12 ++-- pytorch_lightning/trainer/evaluation_loop.py | 60 +++++++++--------- pytorch_lightning/trainer/trainer.py | 62 ++++++++++++++----- pytorch_lightning/trainer/training_loop.py | 54 +++------------- pytorch_lightning/utilities/debugging.py | 6 +- tests/callbacks/test_callbacks.py | 2 - tests/core/test_datamodules.py | 18 +++--- tests/models/test_hooks.py | 2 - tests/models/test_torchscript.py | 6 +- .../test_eval_loop_dict_return.py | 19 +++--- .../test_eval_loop_logging_1_0.py | 2 +- .../optimization/test_manual_optimization.py | 10 +-- 24 files changed, 139 insertions(+), 191 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 77f30219ba8c0..8bb335f2e7847 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -52,6 +52,10 @@ def __init__(self, def setup(self, model): pass + def train(self): + self.trainer.setup_trainer(self.trainer.model) + return self.train_or_test() + def teardown(self): # Ensure if necessary all processes are finished self.barrier() @@ -66,6 +70,7 @@ def train_or_test(self): if self.trainer.testing: results = self.trainer.run_test() else: + self.trainer.train_loop.setup_training() results = self.trainer.train() return results diff --git a/pytorch_lightning/accelerators/cpu_accelerator.py b/pytorch_lightning/accelerators/cpu_accelerator.py index e034b209bf34c..997a3568daf2d 100644 --- 
a/pytorch_lightning/accelerators/cpu_accelerator.py +++ b/pytorch_lightning/accelerators/cpu_accelerator.py @@ -50,16 +50,6 @@ def setup(self, model): self.trainer.model = model - def train(self): - model = self.trainer.model - - # set up training routine - self.trainer.train_loop.setup_training(model) - - # train or test - results = self.train_or_test() - return results - def _step(self, model_step: Callable, args): if self.trainer.amp_backend == AMPType.NATIVE: with torch.cuda.amp.autocast(): diff --git a/pytorch_lightning/accelerators/ddp2_accelerator.py b/pytorch_lightning/accelerators/ddp2_accelerator.py index 68af3f579a6e8..373406589d855 100644 --- a/pytorch_lightning/accelerators/ddp2_accelerator.py +++ b/pytorch_lightning/accelerators/ddp2_accelerator.py @@ -186,9 +186,6 @@ def ddp_train(self, process_idx, mp_queue, model): self.ddp_plugin.on_after_setup_optimizers(self.trainer) - # set model properties before going into wrapper - self.trainer.model_connector.copy_trainer_model_properties(model) - # 16-bit model = self.trainer.precision_connector.connect(model) @@ -198,8 +195,7 @@ def ddp_train(self, process_idx, mp_queue, model): # allow user to configure ddp model = self.configure_ddp(model, device_ids) - # set up training routine - self.trainer.train_loop.setup_training(model) + self.trainer.setup_trainer(model) # train or test results = self.train_or_test() diff --git a/pytorch_lightning/accelerators/ddp_accelerator.py b/pytorch_lightning/accelerators/ddp_accelerator.py index f0d9f2171bf48..0fde9da158c94 100644 --- a/pytorch_lightning/accelerators/ddp_accelerator.py +++ b/pytorch_lightning/accelerators/ddp_accelerator.py @@ -285,9 +285,6 @@ def ddp_train(self, process_idx, model): # allow for lr schedulers as well self.setup_optimizers(model) - # set model properties before going into wrapper - self.trainer.model_connector.copy_trainer_model_properties(model) - # 16-bit model = self.trainer.precision_connector.connect(model) @@ -297,9 +294,8 @@ def ddp_train(self, process_idx, model): # allow user to configure ddp model = self.configure_ddp(model, device_ids) - # set up training routine self.barrier('ddp_setup') - self.trainer.train_loop.setup_training(model) + self.trainer.setup_trainer(model) # train or test results = self.train_or_test() diff --git a/pytorch_lightning/accelerators/ddp_cpu_spawn_accelerator.py b/pytorch_lightning/accelerators/ddp_cpu_spawn_accelerator.py index e7ef38c8df3b4..f9ccaa200bbf4 100644 --- a/pytorch_lightning/accelerators/ddp_cpu_spawn_accelerator.py +++ b/pytorch_lightning/accelerators/ddp_cpu_spawn_accelerator.py @@ -146,9 +146,6 @@ def ddp_train(self, process_idx, mp_queue, model): self.ddp_plugin.on_after_setup_optimizers(self.trainer) - # set model properties before going into wrapper - self.trainer.model_connector.copy_trainer_model_properties(model) - # 16-bit model = self.trainer.precision_connector.connect(model) @@ -158,8 +155,7 @@ def ddp_train(self, process_idx, mp_queue, model): # allow user to configure ddp model = self.configure_ddp(model, device_ids) - # set up training routine - self.trainer.train_loop.setup_training(model) + self.trainer.setup_trainer(model) # train or test results = self.train_or_test() diff --git a/pytorch_lightning/accelerators/ddp_hpc_accelerator.py b/pytorch_lightning/accelerators/ddp_hpc_accelerator.py index c25e082ee348d..bdc4631b5d017 100644 --- a/pytorch_lightning/accelerators/ddp_hpc_accelerator.py +++ b/pytorch_lightning/accelerators/ddp_hpc_accelerator.py @@ -177,9 +177,6 @@ def ddp_train(self, 
process_idx, model): self.ddp_plugin.on_after_setup_optimizers(self.trainer) - # set model properties before going into wrapper - self.trainer.model_connector.copy_trainer_model_properties(model) - # 16-bit model = self.trainer.precision_connector.connect(model) @@ -189,8 +186,7 @@ def ddp_train(self, process_idx, model): # allow user to configure ddp model = self.configure_ddp(model, device_ids) - # set up training routine - self.trainer.train_loop.setup_training(model) + self.trainer.setup_trainer(model) # train or test results = self.train_or_test() diff --git a/pytorch_lightning/accelerators/ddp_spawn_accelerator.py b/pytorch_lightning/accelerators/ddp_spawn_accelerator.py index 23783fada72f1..eb4ff24e39dd4 100644 --- a/pytorch_lightning/accelerators/ddp_spawn_accelerator.py +++ b/pytorch_lightning/accelerators/ddp_spawn_accelerator.py @@ -161,9 +161,6 @@ def ddp_train(self, process_idx, mp_queue, model, is_master=False, proc_offset=0 self.ddp_plugin.on_after_setup_optimizers(self.trainer) - # set model properties before going into wrapper - self.trainer.model_connector.copy_trainer_model_properties(model) - # 16-bit model = self.trainer.precision_connector.connect(model) @@ -173,8 +170,7 @@ def ddp_train(self, process_idx, mp_queue, model, is_master=False, proc_offset=0 # allow user to configure ddp model = self.configure_ddp(model, device_ids) - # set up training routine - self.trainer.train_loop.setup_training(model) + self.trainer.setup_trainer(model) # train or test results = self.train_or_test() diff --git a/pytorch_lightning/accelerators/dp_accelerator.py b/pytorch_lightning/accelerators/dp_accelerator.py index fc01c4686f04f..7517c774f51dd 100644 --- a/pytorch_lightning/accelerators/dp_accelerator.py +++ b/pytorch_lightning/accelerators/dp_accelerator.py @@ -101,16 +101,6 @@ def __init_nvidia_apex(self, model): return model - def train(self): - model = self.trainer.model - # set up training routine - self.trainer.train_loop.setup_training(model) - - # train or test - results = self.train_or_test() - - return results - def teardown(self): # replace the original fwd function self.trainer.model.forward = self.model_autocast_original_forward diff --git a/pytorch_lightning/accelerators/gpu_accelerator.py b/pytorch_lightning/accelerators/gpu_accelerator.py index 49f21e9e34816..d65b19bbd9bb1 100644 --- a/pytorch_lightning/accelerators/gpu_accelerator.py +++ b/pytorch_lightning/accelerators/gpu_accelerator.py @@ -56,16 +56,6 @@ def setup(self, model): self.trainer.model = model - def train(self): - model = self.trainer.model - - # set up training routine - self.trainer.train_loop.setup_training(model) - - # train or test - results = self.train_or_test() - return results - def _step(self, model_step: Callable, args): args[0] = self.to_device(args[0]) diff --git a/pytorch_lightning/accelerators/horovod_accelerator.py b/pytorch_lightning/accelerators/horovod_accelerator.py index 2013d75df7b1e..6e11a13064513 100644 --- a/pytorch_lightning/accelerators/horovod_accelerator.py +++ b/pytorch_lightning/accelerators/horovod_accelerator.py @@ -104,8 +104,7 @@ def train(self): # Synchronization will be performed explicitly following backward() stack.enter_context(optimizer.skip_synchronize()) - # set up training routine - self.trainer.train_loop.setup_training(self.trainer.model) + self.trainer.setup_trainer(self.trainer.model) # train or test results = self.train_or_test() diff --git a/pytorch_lightning/accelerators/tpu_accelerator.py b/pytorch_lightning/accelerators/tpu_accelerator.py index 
7dcfaae401ca7..286004bc0976e 100644 --- a/pytorch_lightning/accelerators/tpu_accelerator.py +++ b/pytorch_lightning/accelerators/tpu_accelerator.py @@ -134,8 +134,7 @@ def tpu_train_in_process(self, tpu_core_idx: int, model: LightningModule, traine # setup TPU training self.__setup_tpu_training(model, trainer) - # set up training routine - self.trainer.train_loop.setup_training(model) + self.trainer.setup_trainer(model) # train or test results = self.train_or_test() diff --git a/pytorch_lightning/trainer/connectors/checkpoint_connector.py b/pytorch_lightning/trainer/connectors/checkpoint_connector.py index fc9c70ba46d2e..03d46132fb177 100644 --- a/pytorch_lightning/trainer/connectors/checkpoint_connector.py +++ b/pytorch_lightning/trainer/connectors/checkpoint_connector.py @@ -13,8 +13,8 @@ # limitations under the License. import os -from pathlib import Path import re +from pathlib import Path from typing import Optional, Union import torch @@ -44,7 +44,7 @@ def __init__(self, trainer): # used to validate checkpointing logic self.has_trained = False - def restore_weights(self, model: LightningModule) -> None: + def restore_weights(self) -> None: """ Attempt to restore a checkpoint (e.g. weights) in this priority: 1. from HPC weights @@ -64,7 +64,7 @@ def restore_weights(self, model: LightningModule) -> None: rank_zero_info(f'restored hpc model from: {checkpoint_path}') # 2. Attempt to restore states from `resume_from_checkpoint` file - elif self.trainer.resume_from_checkpoint is not None and not self.trainer.testing: + elif self.trainer.resume_from_checkpoint is not None: self.restore(self.trainer.resume_from_checkpoint, on_gpu=self.trainer.on_gpu) # wait for all to catch up diff --git a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py index 6cf020aa65fa1..84e8a1bc68f05 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py @@ -208,9 +208,9 @@ def add_progress_bar_metrics(self, metrics): self.trainer.dev_debugger.track_pbar_metrics_history(metrics) - def track_metrics_deprecated(self, deprecated_eval_results, using_eval_result, test_mode): + def track_metrics_deprecated(self, deprecated_eval_results, using_eval_result): self._track_callback_metrics(deprecated_eval_results, using_eval_result) - self.__process_eval_epoch_end_results_and_log_legacy(deprecated_eval_results, test_mode) + self.__process_eval_epoch_end_results_and_log_legacy(deprecated_eval_results) def evaluation_epoch_end(self, testing): # reset dataloader idx @@ -239,7 +239,7 @@ def prepare_eval_loop_results(self): for dl_idx in range(self.trainer.evaluation_loop.num_dataloaders): self.add_to_eval_loop_results(dl_idx, has_been_initialized) - def get_evaluate_epoch_results(self, test_mode): + def get_evaluate_epoch_results(self): if not self.trainer.running_sanity_check: # log all the metrics as a single dict metrics_to_log = self.cached_results.get_epoch_log_metrics() @@ -249,7 +249,7 @@ def get_evaluate_epoch_results(self, test_mode): self.prepare_eval_loop_results() # log results of test - if test_mode and self.trainer.is_global_zero and self.trainer.verbose_test: + if self.trainer.testing and self.trainer.is_global_zero and self.trainer.verbose_test: print('-' * 80) for result_idx, results in enumerate(self.eval_loop_results): print(f'DATALOADER:{result_idx} TEST RESULTS') @@ -330,7 +330,7 @@ def 
__process_eval_epoch_end_results_and_log_legacy_update(self, prog_bar_metric if len(dataloader_result_metrics) > 0: self.eval_loop_results.append(dataloader_result_metrics) - def __process_eval_epoch_end_results_and_log_legacy(self, eval_results, test_mode): + def __process_eval_epoch_end_results_and_log_legacy(self, eval_results): if self.trainer.running_sanity_check: return @@ -350,7 +350,7 @@ def __process_eval_epoch_end_results_and_log_legacy(self, eval_results, test_mod callback_metrics = result.callback_metrics # in testing we don't need the callback metrics - if test_mode: + if self.trainer.testing: callback_metrics = {} else: _, prog_bar_metrics, log_metrics, callback_metrics, _ = self.trainer.process_dict_result(result) diff --git a/pytorch_lightning/trainer/evaluation_loop.py b/pytorch_lightning/trainer/evaluation_loop.py index 4b70917c8c43d..63f65bead2579 100644 --- a/pytorch_lightning/trainer/evaluation_loop.py +++ b/pytorch_lightning/trainer/evaluation_loop.py @@ -24,7 +24,6 @@ class EvaluationLoop(object): def __init__(self, trainer): self.trainer = trainer - self.testing = False self.outputs = [] self.step_metrics = [] self.predictions = None @@ -52,7 +51,7 @@ def get_evaluation_dataloaders(self, max_batches): model = self.trainer.get_model() # select dataloaders - if self.testing: + if self.trainer.testing: self.trainer.reset_test_dataloader(model) dataloaders = self.trainer.test_dataloaders @@ -85,34 +84,34 @@ def should_skip_evaluation(self, dataloaders, max_batches): return False def on_evaluation_start(self, *args, **kwargs): - if self.testing: + if self.trainer.testing: self.trainer.call_hook('on_test_start', *args, **kwargs) else: self.trainer.call_hook('on_validation_start', *args, **kwargs) def on_evaluation_model_eval(self, *args, **kwargs): model_ref = self.trainer.get_model() - if self.testing: + if self.trainer.testing: model_ref.on_test_model_eval() else: model_ref.on_validation_model_eval() def on_evaluation_model_train(self, *args, **kwargs): model_ref = self.trainer.get_model() - if self.testing: + if self.trainer.testing: model_ref.on_test_model_train() else: model_ref.on_validation_model_train() def on_evaluation_end(self, *args, **kwargs): - if self.testing: + if self.trainer.testing: self.trainer.call_hook('on_test_end', *args, **kwargs) else: self.trainer.call_hook('on_validation_end', *args, **kwargs) def reload_evaluation_dataloaders(self): model = self.trainer.get_model() - if self.testing: + if self.trainer.testing: self.trainer.reset_test_dataloader(model) else: self.trainer.reset_val_dataloader(model) @@ -123,9 +122,6 @@ def is_using_eval_results(self): return using_eval_result def setup(self, model, max_batches, dataloaders): - # copy properties for forward overrides - self.trainer.model_connector.copy_trainer_model_properties(model) - # bookkeeping self.outputs = [] self.predictions = PredictionCollection(self.trainer.global_rank, self.trainer.world_size) @@ -138,17 +134,23 @@ def setup(self, model, max_batches, dataloaders): self.num_dataloaders = self._get_num_dataloaders(dataloaders) def on_evaluation_epoch_start(self, *args, **kwargs): - if self.testing: + if self.trainer.testing: self.trainer.call_hook('on_test_epoch_start', *args, **kwargs) else: self.trainer.call_hook('on_validation_epoch_start', *args, **kwargs) - def build_args(self, test_mode, batch, batch_idx, dataloader_idx): + def _build_args(self, batch, batch_idx, dataloader_idx): # make dataloader_idx arg in validation_step optional args = [batch, batch_idx] - 
multiple_val_loaders = (not test_mode and self._get_num_dataloaders(self.trainer.val_dataloaders) > 1) - multiple_test_loaders = (test_mode and self._get_num_dataloaders(self.trainer.test_dataloaders) > 1) + multiple_val_loaders = ( + not self.trainer.testing + and self._get_num_dataloaders(self.trainer.val_dataloaders) > 1 + ) + multiple_test_loaders = ( + self.trainer.testing + and self._get_num_dataloaders(self.trainer.test_dataloaders) > 1 + ) if multiple_test_loaders or multiple_val_loaders: args.append(dataloader_idx) @@ -163,14 +165,14 @@ def _get_num_dataloaders(self, dataloaders): length = len(dataloaders[0]) return length - def evaluation_step(self, test_mode, batch, batch_idx, dataloader_idx): + def evaluation_step(self, batch, batch_idx, dataloader_idx): # configure args - args = self.build_args(test_mode, batch, batch_idx, dataloader_idx) + args = self._build_args(batch, batch_idx, dataloader_idx) model_ref = self.trainer.get_model() model_ref._results = Result() # run actual test step - if self.testing: + if self.trainer.testing: model_ref._current_fx_name = "test_step" output = self.trainer.accelerator_backend.test_step(args) else: @@ -192,7 +194,7 @@ def evaluation_step(self, test_mode, batch, batch_idx, dataloader_idx): return output def evaluation_step_end(self, *args, **kwargs): - if self.testing: + if self.trainer.testing: output = self.trainer.call_hook('test_step_end', *args, **kwargs) else: output = self.trainer.call_hook('validation_step_end', *args, **kwargs) @@ -200,7 +202,7 @@ def evaluation_step_end(self, *args, **kwargs): def evaluation_epoch_end(self): # unset dataloder_idx in model - self.trainer.logger_connector.evaluation_epoch_end(self.testing) + self.trainer.logger_connector.evaluation_epoch_end(self.trainer.testing) using_eval_result = self.is_using_eval_results() @@ -216,7 +218,7 @@ def evaluation_epoch_end(self): def log_epoch_metrics_on_evaluation_end(self): # get the final loop results - eval_loop_results = self.trainer.logger_connector.get_evaluate_epoch_results(self.testing) + eval_loop_results = self.trainer.logger_connector.get_evaluate_epoch_results() return eval_loop_results def __run_eval_epoch_end(self, num_dataloaders, using_eval_result): @@ -230,7 +232,7 @@ def __run_eval_epoch_end(self, num_dataloaders, using_eval_result): user_reduced = False - if self.testing: + if self.trainer.testing: if is_overridden('test_epoch_end', model=model): if using_eval_result: eval_results = self.__gather_epoch_end_eval_results(outputs) @@ -250,7 +252,7 @@ def __run_eval_epoch_end(self, num_dataloaders, using_eval_result): self.trainer.logger_connector.cache_logged_metrics() # depre warning if eval_results is not None and user_reduced: - step = 'testing_epoch_end' if self.testing else 'validation_epoch_end' + step = 'testing_epoch_end' if self.trainer.testing else 'validation_epoch_end' self.warning_cache.warn( f'The {step} should not return anything as of 9.1.' ' To log, use self.log(...) or self.write(...) 
directly in the LightningModule' @@ -263,7 +265,7 @@ def __run_eval_epoch_end(self, num_dataloaders, using_eval_result): eval_results = [eval_results] # track depreceated metrics - self.trainer.logger_connector.track_metrics_deprecated(eval_results, using_eval_result, self.testing) + self.trainer.logger_connector.track_metrics_deprecated(eval_results, using_eval_result) return eval_results @@ -300,15 +302,15 @@ def __auto_reduce_result_objs(self, outputs): def on_evaluation_batch_start(self, batch, batch_idx, dataloader_idx): # set dataloader_idx to model and track batch_size self.trainer.logger_connector.on_evaluation_batch_start( - self.testing, batch, dataloader_idx, self.num_dataloaders) + self.trainer.testing, batch, dataloader_idx, self.num_dataloaders) - if self.testing: + if self.trainer.testing: self.trainer.call_hook('on_test_batch_start', batch, batch_idx, dataloader_idx) else: self.trainer.call_hook('on_validation_batch_start', batch, batch_idx, dataloader_idx) def on_evaluation_batch_end(self, output, batch, batch_idx, dataloader_idx): - if self.testing: + if self.trainer.testing: self.trainer.call_hook('on_test_batch_end', output, batch, batch_idx, dataloader_idx) else: self.trainer.call_hook('on_validation_batch_end', output, batch, batch_idx, dataloader_idx) @@ -319,16 +321,16 @@ def on_evaluation_batch_end(self, output, batch, batch_idx, dataloader_idx): def store_predictions(self, output, batch_idx, dataloader_idx): # Add step predictions to prediction collection to write later if output is not None: - do_write_predictions = isinstance(output, Result) and self.testing + do_write_predictions = isinstance(output, Result) and self.trainer.testing if do_write_predictions: self.predictions.add(output.pop('predictions', None)) # track debug metrics - self.trainer.dev_debugger.track_eval_loss_history(self.testing, batch_idx, dataloader_idx, output) + self.trainer.dev_debugger.track_eval_loss_history(batch_idx, dataloader_idx, output) def on_evaluation_epoch_end(self, *args, **kwargs): # call the callback hook - if self.testing: + if self.trainer.testing: self.trainer.call_hook('on_test_epoch_end', *args, **kwargs) else: self.trainer.call_hook('on_validation_epoch_end', *args, **kwargs) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 2c1867a21552d..c3ef0e507789e 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -15,9 +15,9 @@ """Trainer to automate the training.""" import os +import warnings from pathlib import Path from typing import Dict, Iterable, List, Optional, Union -import warnings import torch from torch.utils.data import DataLoader @@ -57,7 +57,7 @@ from pytorch_lightning.trainer.training_loop import TrainLoop from pytorch_lightning.trainer.training_tricks import TrainerTrainingTricksMixin from pytorch_lightning.tuner.tuning import Tuner -from pytorch_lightning.utilities import DeviceType, rank_zero_warn +from pytorch_lightning.utilities import AMPType, DeviceType, rank_zero_warn from pytorch_lightning.utilities.cloud_io import load as pl_load from pytorch_lightning.utilities.debugging import InternalDebugger from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -412,6 +412,46 @@ def __init__( # Callback system self.on_init_end() + def setup_trainer(self, model: LightningModule): + """ + Sanity check a few things before starting actual training or testing. + + Args: + model: The model to run sanity test on. 
+ """ + # -------------------------- + # Setup?? + # -------------------------- + ref_model = self.get_model() + + # set the ranks and devices + self.accelerator_backend.dist.rank = self.global_rank + self.accelerator_backend.dist.device = ref_model.device + + # set local properties on the model + self.model_connector.copy_trainer_model_properties(model) + + # init amp. Must be done here instead of __init__ to allow ddp to work + if self.amp_backend == AMPType.NATIVE and self.precision == 16 and not self.use_tpu: + self.scaler = self.precision_connector.backend.scaler + + # log hyper-parameters + if self.logger is not None: + # save exp to get started (this is where the first experiment logs are written) + self.logger.log_hyperparams(ref_model.hparams_initial) + self.logger.log_graph(ref_model) + self.logger.save() + + # wait for all to join if on distributed + self.accelerator_backend.barrier("setup_trainer") + + # register auto-resubmit when on SLURM + self.slurm_connector.register_slurm_signal_handlers() + + # track model now. + # if cluster resets state, the model will update with the saved weights + self.model = model + def fit( self, model: LightningModule, @@ -446,10 +486,6 @@ def fit( # hook self.data_connector.prepare_data(model) - # bookkeeping - # we reuse fit in .test() but change its behavior using this flag - self.testing = os.environ.get('PL_TESTING_MODE', self.testing) - # ---------------------------- # SET UP TRAINING # ---------------------------- @@ -554,13 +590,13 @@ def train(self): # hook self.train_loop.on_train_end() - def run_evaluation(self, test_mode: bool = False, max_batches=None): + def run_evaluation(self, max_batches=None): # used to know if we are logging for val, test + reset cached results - self.logger_connector.set_stage(test_mode, reset=True) + self.logger_connector.set_stage(self.testing, reset=True) # bookkeeping - self.evaluation_loop.testing = test_mode + self.evaluation_loop.testing = self.testing # prepare dataloaders dataloaders, max_batches = self.evaluation_loop.get_evaluation_dataloaders(max_batches) @@ -606,7 +642,7 @@ def run_evaluation(self, test_mode: bool = False, max_batches=None): # lightning module methods with self.profiler.profile("evaluation_step_and_end"): - output = self.evaluation_loop.evaluation_step(test_mode, batch, batch_idx, dataloader_idx) + output = self.evaluation_loop.evaluation_step(batch, batch_idx, dataloader_idx) output = self.evaluation_loop.evaluation_step_end(output) # hook + store predictions @@ -659,7 +695,7 @@ def run_test(self): # only load test dataloader for testing # self.reset_test_dataloader(ref_model) with self.profiler.profile("run_test_evaluation"): - eval_loop_results, _ = self.run_evaluation(test_mode=True) + eval_loop_results, _ = self.run_evaluation() if len(eval_loop_results) == 0: return 1 @@ -690,7 +726,7 @@ def run_sanity_check(self, ref_model): self.on_sanity_check_start() # run eval step - _, eval_results = self.run_evaluation(test_mode=False, max_batches=self.num_sanity_val_batches) + _, eval_results = self.run_evaluation(max_batches=self.num_sanity_val_batches) # allow no returns from eval if eval_results is not None and len(eval_results) > 0: @@ -794,11 +830,9 @@ def __test_using_best_weights(self, ckpt_path, test_dataloaders): # run tests self.tested_ckpt_path = ckpt_path self.testing = True - os.environ['PL_TESTING_MODE'] = '1' self.model = model results = self.fit(model) self.testing = False - del os.environ['PL_TESTING_MODE'] # teardown if self.is_function_implemented('teardown'): 
diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 3c8a8d45d0411..47e254606af93 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -124,64 +124,26 @@ def setup_fit(self, model, train_dataloader, val_dataloaders, datamodule): # check that model is configured correctly self.trainer.config_validator.verify_loop_configurations(model) - def setup_training(self, model: LightningModule): - """Sanity check a few things before starting actual training. - - Args: - model: The model to run sanity test on. + def setup_training(self): + """ + Sanity check a few things before starting actual training. """ - # -------------------------- - # Setup?? - # -------------------------- - ref_model = model - if self.trainer.data_parallel: - ref_model = model.module - - # set the ranks and devices - self.trainer.accelerator_backend.dist.rank = self.trainer.global_rank - self.trainer.accelerator_backend.dist.device = ref_model.device - - # give model convenience properties - ref_model.trainer = self.trainer - - # set local properties on the model - self.trainer.model_connector.copy_trainer_model_properties(ref_model) - - # init amp. Must be done here instead of __init__ to allow ddp to work - if self.trainer.amp_backend == AMPType.NATIVE and self.trainer.precision == 16 and not self.trainer.use_tpu: - self.trainer.scaler = self.trainer.precision_connector.backend.scaler - - # log hyper-parameters - if self.trainer.logger is not None: - # save exp to get started (this is where the first experiment logs are written) - self.trainer.logger.log_hyperparams(ref_model.hparams_initial) - self.trainer.logger.log_graph(ref_model) - self.trainer.logger.save() - - # wait for all to join if on distributed - self.trainer.accelerator_backend.barrier("setup_training") - - # register auto-resubmit when on SLURM - self.trainer.slurm_connector.register_slurm_signal_handlers() - # -------------------------- # Pre-train # -------------------------- + ref_model = self.trainer.get_model() + # on pretrain routine start self.trainer.on_pretrain_routine_start(ref_model) if self.trainer.is_function_implemented("on_pretrain_routine_start"): ref_model.on_pretrain_routine_start() # print model summary - if self.trainer.is_global_zero and not self.trainer.testing: + if self.trainer.is_global_zero: ref_model.summarize(mode=self.trainer.weights_summary) - # track model now. 
- # if cluster resets state, the model will update with the saved weights - self.trainer.model = model - # restore training state and model weights before hpc is called - self.trainer.checkpoint_connector.restore_weights(model) + self.trainer.checkpoint_connector.restore_weights() # on pretrain routine end self.trainer.on_pretrain_routine_end(ref_model) @@ -597,7 +559,7 @@ def run_training_epoch(self): # ----------------------------------------- should_check_val = self.should_check_val_fx(batch_idx, is_last_batch) if should_check_val: - self.trainer.run_evaluation(test_mode=False) + self.trainer.run_evaluation() # reset stage to train self.trainer.logger_connector.set_stage("train") diff --git a/pytorch_lightning/utilities/debugging.py b/pytorch_lightning/utilities/debugging.py index 9264e2a49810d..c9fac5cc04a45 100644 --- a/pytorch_lightning/utilities/debugging.py +++ b/pytorch_lightning/utilities/debugging.py @@ -16,7 +16,7 @@ import time from collections import Counter from functools import wraps -from typing import Callable, Any, Optional +from typing import Any, Callable, Optional def enabled_only(fn: Callable): @@ -133,7 +133,7 @@ def track_lr_schedulers_update(self, batch_idx, interval, scheduler_idx, old_lr, self.saved_lr_scheduler_updates.append(loss_dict) @enabled_only - def track_eval_loss_history(self, test_mode, batch_idx, dataloader_idx, output): + def track_eval_loss_history(self, batch_idx, dataloader_idx, output): loss_dict = { 'sanity_check': self.trainer.running_sanity_check, 'dataloader_idx': dataloader_idx, @@ -142,7 +142,7 @@ def track_eval_loss_history(self, test_mode, batch_idx, dataloader_idx, output): 'output': output } - if test_mode: + if self.trainer.testing: self.saved_test_losses.append(loss_dict) else: self.saved_val_losses.append(loss_dict) diff --git a/tests/callbacks/test_callbacks.py b/tests/callbacks/test_callbacks.py index 53debcebeb7cd..c9baf0db6976d 100644 --- a/tests/callbacks/test_callbacks.py +++ b/tests/callbacks/test_callbacks.py @@ -109,8 +109,6 @@ def test_trainer_callback_system(torch_save): call.on_init_end(trainer), call.setup(trainer, model, 'test'), call.on_fit_start(trainer, model), - call.on_pretrain_routine_start(trainer, model), - call.on_pretrain_routine_end(trainer, model), call.on_test_start(trainer, model), call.on_test_epoch_start(trainer, model), call.on_test_batch_start(trainer, model, ANY, 0, 0), diff --git a/tests/core/test_datamodules.py b/tests/core/test_datamodules.py index d286bbf3a9de6..64dc25101eae6 100644 --- a/tests/core/test_datamodules.py +++ b/tests/core/test_datamodules.py @@ -13,21 +13,21 @@ # limitations under the License. 
import pickle from argparse import ArgumentParser -from unittest.mock import MagicMock from typing import Optional +from unittest.mock import MagicMock import pytest import torch from torch.utils.data import DataLoader, random_split -from pytorch_lightning import LightningDataModule, Trainer, seed_everything +from pytorch_lightning import LightningDataModule, seed_everything, Trainer +from pytorch_lightning.accelerators.gpu_accelerator import GPUAccelerator +from pytorch_lightning.callbacks import ModelCheckpoint +from pytorch_lightning.utilities.model_utils import is_overridden from tests.base import EvalModelTemplate -from tests.base.datasets import TrialMNIST from tests.base.datamodules import TrialMNISTDataModule +from tests.base.datasets import TrialMNIST from tests.base.develop_utils import reset_seed -from pytorch_lightning.utilities.model_utils import is_overridden -from pytorch_lightning.accelerators.gpu_accelerator import GPUAccelerator -from pytorch_lightning.callbacks import ModelCheckpoint def test_can_prepare_data(tmpdir): @@ -170,14 +170,14 @@ def test_data_hooks_called_with_stage_kwarg(tmpdir): def test_dm_add_argparse_args(tmpdir): parser = ArgumentParser() parser = TrialMNISTDataModule.add_argparse_args(parser) - args = parser.parse_args(['--data_dir', './my_data']) - assert args.data_dir == './my_data' + args = parser.parse_args(['--data_dir', str(tmpdir)]) + assert args.data_dir == str(tmpdir) def test_dm_init_from_argparse_args(tmpdir): parser = ArgumentParser() parser = TrialMNISTDataModule.add_argparse_args(parser) - args = parser.parse_args(['--data_dir', './my_data']) + args = parser.parse_args(['--data_dir', str(tmpdir)]) dm = TrialMNISTDataModule.from_argparse_args(args) dm.prepare_data() dm.setup() diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index 8a5d2f667bc32..5352e749c5e55 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -392,8 +392,6 @@ def on_test_model_train(self): expected = [ 'on_fit_start', - 'on_pretrain_routine_start', - 'on_pretrain_routine_end', 'on_test_model_eval', 'on_test_epoch_start', 'on_test_batch_start', diff --git a/tests/models/test_torchscript.py b/tests/models/test_torchscript.py index 3c43b201f52e4..75e1ec7724967 100644 --- a/tests/models/test_torchscript.py +++ b/tests/models/test_torchscript.py @@ -18,7 +18,7 @@ from tests.base import BoringModel from tests.base.datamodules import TrialMNISTDataModule -from tests.base.models import ParityModuleRNN, BasicGAN +from tests.base.models import BasicGAN, ParityModuleRNN @pytest.mark.parametrize("modelclass", [ @@ -116,10 +116,10 @@ def test_torchscript_retain_training_state(): ParityModuleRNN, BasicGAN, ]) -def test_torchscript_properties(modelclass): +def test_torchscript_properties(tmpdir, modelclass): """ Test that scripted LightningModule has unnecessary methods removed. 
""" model = modelclass() - model.datamodule = TrialMNISTDataModule() + model.datamodule = TrialMNISTDataModule(tmpdir) script = model.to_torchscript() assert not hasattr(script, "datamodule") assert not hasattr(model, "batch_size") or hasattr(script, "batch_size") diff --git a/tests/trainer/legacy_deprecate_flow_log_tests/test_eval_loop_dict_return.py b/tests/trainer/legacy_deprecate_flow_log_tests/test_eval_loop_dict_return.py index 9e2023d27d928..3a9a87f84e5d9 100644 --- a/tests/trainer/legacy_deprecate_flow_log_tests/test_eval_loop_dict_return.py +++ b/tests/trainer/legacy_deprecate_flow_log_tests/test_eval_loop_dict_return.py @@ -15,8 +15,9 @@ Tests to ensure that the training loop works with a dict """ import os -from pytorch_lightning.core.lightning import LightningModule + from pytorch_lightning import Trainer +from pytorch_lightning.core.lightning import LightningModule from tests.base.deterministic_model import DeterministicModel @@ -43,7 +44,7 @@ def backward(self, loss, optimizer, optimizer_idx): # out are the results of the full loop # eval_results are output of _evaluate - out, eval_results = trainer.run_evaluation(test_mode=False) + out, eval_results = trainer.run_evaluation() assert len(out) == 1 assert len(eval_results) == 0 @@ -74,7 +75,7 @@ def test_validation_step_scalar_return(tmpdir): # out are the results of the full loop # eval_results are output of _evaluate - out, eval_results = trainer.run_evaluation(test_mode=False) + out, eval_results = trainer.run_evaluation() assert len(out) == 1 assert len(eval_results) == 2 assert eval_results[0] == 171 and eval_results[1] == 171 @@ -106,7 +107,7 @@ def test_validation_step_arbitrary_dict_return(tmpdir): # out are the results of the full loop # eval_results are output of _evaluate - callback_metrics, eval_results = trainer.run_evaluation(test_mode=False) + callback_metrics, eval_results = trainer.run_evaluation() assert len(callback_metrics) == 1 assert len(eval_results) == 2 assert eval_results[0]['some'] == 171 @@ -144,7 +145,7 @@ def test_validation_step_dict_return(tmpdir): # out are the results of the full loop # eval_results are output of _evaluate - callback_metrics, eval_results = trainer.run_evaluation(test_mode=False) + callback_metrics, eval_results = trainer.run_evaluation() assert len(callback_metrics) == 1 assert len(callback_metrics[0]) == 5 assert len(eval_results) == 2 @@ -186,7 +187,7 @@ def test_val_step_step_end_no_return(tmpdir): # out are the results of the full loop # eval_results are output of _evaluate - callback_metrics, eval_results = trainer.run_evaluation(test_mode=False) + callback_metrics, eval_results = trainer.run_evaluation() assert len(callback_metrics) == 1 assert len(eval_results) == 0 @@ -218,7 +219,7 @@ def test_val_step_step_end(tmpdir): # out are the results of the full loop # eval_results are output of _evaluate - callback_metrics, eval_results = trainer.run_evaluation(test_mode=False) + callback_metrics, eval_results = trainer.run_evaluation() assert len(callback_metrics) == 1 assert len(callback_metrics[0]) == 6 @@ -264,7 +265,7 @@ def test_no_val_step_end(tmpdir): # out are the results of the full loop # eval_results are output of _evaluate - callback_metrics, eval_results = trainer.run_evaluation(test_mode=False) + callback_metrics, eval_results = trainer.run_evaluation() assert len(callback_metrics) == 1 assert len(callback_metrics[0]) == 6 assert len(eval_results) == 1 @@ -308,7 +309,7 @@ def test_full_val_loop(tmpdir): # out are the results of the full loop # eval_results are 
output of _evaluate - callback_metrics, eval_results = trainer.run_evaluation(test_mode=False) + callback_metrics, eval_results = trainer.run_evaluation() assert len(callback_metrics) == 1 assert len(callback_metrics[0]) == 7 assert len(eval_results) == 1 diff --git a/tests/trainer/logging_tests/test_eval_loop_logging_1_0.py b/tests/trainer/logging_tests/test_eval_loop_logging_1_0.py index da08ffe710e75..53636bed66f56 100644 --- a/tests/trainer/logging_tests/test_eval_loop_logging_1_0.py +++ b/tests/trainer/logging_tests/test_eval_loop_logging_1_0.py @@ -292,7 +292,7 @@ def validation_epoch_end(self, outputs) -> None: max_epochs=1, log_every_n_steps=1, weights_summary=None, - callbacks=[ModelCheckpoint(dirpath='val_loss')], + callbacks=[ModelCheckpoint(dirpath=tmpdir)], ) trainer.fit(model) diff --git a/tests/trainer/optimization/test_manual_optimization.py b/tests/trainer/optimization/test_manual_optimization.py index 50463f5c4b5e2..2fc6cb60c7fb0 100644 --- a/tests/trainer/optimization/test_manual_optimization.py +++ b/tests/trainer/optimization/test_manual_optimization.py @@ -656,11 +656,11 @@ def configure_optimizers(self): assert model.called +@mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"}) def test_step_with_optimizer_closure(tmpdir): """ Tests that `step` works with optimizer_closure """ - os.environ['PL_DEV_DEBUG'] = '1' class TestModel(BoringModel): @@ -736,11 +736,11 @@ def configure_optimizers(self): assert trainer.logger_connector.progress_bar_metrics["train_loss_epoch"] == torch.stack(model._losses).mean() +@mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"}) def test_step_with_optimizer_closure_and_accumulated_grad(tmpdir): """ Tests that `step` works with optimizer_closure and accumulated_grad """ - os.environ['PL_DEV_DEBUG'] = '1' class TestModel(BoringModel): def __init__(self): @@ -798,12 +798,12 @@ def configure_optimizers(self): assert trainer.dev_debugger.count_events('backward_call') == limit_train_batches * 2 +@mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"}) @patch("torch.optim.SGD.step") def test_step_with_optimizer_closure_and_extra_arguments(step_mock, tmpdir): """ Tests that `step` works with optimizer_closure and extra arguments """ - os.environ['PL_DEV_DEBUG'] = '1' class TestModel(BoringModel): def __init__(self): @@ -854,13 +854,13 @@ def configure_optimizers(self): step_mock.assert_has_calls(expected_calls) +@mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"}) @patch("torch.optim.Adam.step") @patch("torch.optim.SGD.step") def test_step_with_optimizer_closure_with_different_frequencies(mock_sgd_step, mock_adam_step, tmpdir): """ Tests that `step` works with optimizer_closure and different accumulated_gradient frequency """ - os.environ['PL_DEV_DEBUG'] = '1' class TestModel(BoringModel): def __init__(self): @@ -933,6 +933,7 @@ def configure_optimizers(self): mock_adam_step.assert_has_calls(expected_calls) +@mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"}) @patch("torch.optim.Adam.step") @patch("torch.optim.SGD.step") @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") @@ -941,7 +942,6 @@ def test_step_with_optimizer_closure_with_different_frequencies_ddp(mock_sgd_ste """ Tests that `step` works with optimizer_closure and different accumulated_gradient frequency """ - os.environ['PL_DEV_DEBUG'] = '1' class TestModel(BoringModel): def __init__(self): From 94b7d84b9e3208d4d8651db013097b0e57d77484 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Thu, 14 Jan 2021 01:29:29 +0100 Subject: [PATCH 104/136] add section & 
add testing ckpt 1.1.4 (#5495) * add section * test legacy Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- CHANGELOG.md | 19 ++++++++++++++++++- .../checkpointing/test_legacy_checkpoints.py | 1 + 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 066745b4c57cb..93f3505eb1eb1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,8 +4,25 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). +## [unreleased.Bugfixes] - YYYY-MM-DD -## [1.1.4] - YYYY-MM-DD +### Added + + +### Changed + + +### Deprecated + + +### Removed + + +### Fixed + + + +## [1.1.4] - 2021-01-12 ### Added diff --git a/tests/checkpointing/test_legacy_checkpoints.py b/tests/checkpointing/test_legacy_checkpoints.py index 42623cb4df1ec..48f5a53733214 100644 --- a/tests/checkpointing/test_legacy_checkpoints.py +++ b/tests/checkpointing/test_legacy_checkpoints.py @@ -45,6 +45,7 @@ "1.1.1", "1.1.2", "1.1.3", + "1.1.4", ]) def test_resume_legacy_checkpoints(tmpdir, pl_version): path_dir = os.path.join(LEGACY_CHECKPOINTS_PATH, pl_version) From 71d5cc11f13c1338fbe3f74a8d12e438cc6fddef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 14 Jan 2021 03:25:12 +0100 Subject: [PATCH 105/136] Fix visual progress bar bug / properly reset progress bar (#4579) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * reset * fix reset * changelog * update chlog * typing Co-authored-by: Carlos Mocholí Co-authored-by: Carlos Mocholí Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: Jirka Borovec --- CHANGELOG.md | 2 ++ pytorch_lightning/callbacks/progress.py | 19 +++++++++++++------ 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 93f3505eb1eb1..2011626e67bbc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
### Fixed +- Fixed a visual bug in the progress bar display initialization ([#4579](https://github.com/PyTorchLightning/pytorch-lightning/pull/4579)) + ## [1.1.4] - 2021-01-12 diff --git a/pytorch_lightning/callbacks/progress.py b/pytorch_lightning/callbacks/progress.py index 3ed5c11fd75d7..639a988bf3856 100644 --- a/pytorch_lightning/callbacks/progress.py +++ b/pytorch_lightning/callbacks/progress.py @@ -24,6 +24,8 @@ # check if ipywidgets is installed before importing tqdm.auto # to ensure it won't fail and a progress bar is displayed +from typing import Optional, Union + if importlib.util.find_spec('ipywidgets') is not None: from tqdm.auto import tqdm else: @@ -306,7 +308,7 @@ def init_test_tqdm(self) -> tqdm: def on_sanity_check_start(self, trainer, pl_module): super().on_sanity_check_start(trainer, pl_module) self.val_progress_bar = self.init_sanity_tqdm() - self.val_progress_bar.total = convert_inf(sum(trainer.num_sanity_val_batches)) + reset(self.val_progress_bar, sum(trainer.num_sanity_val_batches)) self.main_progress_bar = tqdm(disable=True) # dummy progress bar def on_sanity_check_end(self, trainer, pl_module): @@ -327,8 +329,7 @@ def on_epoch_start(self, trainer, pl_module): val_checks_per_epoch = total_train_batches // trainer.val_check_batch total_val_batches = total_val_batches * val_checks_per_epoch total_batches = total_train_batches + total_val_batches - if not self.main_progress_bar.disable: - self.main_progress_bar.reset(convert_inf(total_batches)) + reset(self.main_progress_bar, total_batches) self.main_progress_bar.set_description(f'Epoch {trainer.current_epoch}') def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx): @@ -342,7 +343,7 @@ def on_validation_start(self, trainer, pl_module): if not trainer.running_sanity_check: self._update_bar(self.main_progress_bar) # fill up remaining self.val_progress_bar = self.init_validation_tqdm() - self.val_progress_bar.total = convert_inf(self.total_val_batches) + reset(self.val_progress_bar, self.total_val_batches) def on_validation_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx): super().on_validation_batch_end(trainer, pl_module, outputs, batch, batch_idx, dataloader_idx) @@ -362,7 +363,7 @@ def on_train_end(self, trainer, pl_module): def on_test_start(self, trainer, pl_module): super().on_test_start(trainer, pl_module) self.test_progress_bar = self.init_test_tqdm() - self.test_progress_bar.total = convert_inf(self.total_test_batches) + reset(self.test_progress_bar, self.total_test_batches) def on_test_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx): super().on_test_batch_end(trainer, pl_module, outputs, batch, batch_idx, dataloader_idx) @@ -387,8 +388,14 @@ def _update_bar(self, bar): bar.update(delta) -def convert_inf(x): +def convert_inf(x: Optional[Union[int, float]]) -> Optional[Union[int, float]]: """ The tqdm doesn't support inf values. We have to convert it to None. """ if x == float('inf'): return None return x + + +def reset(bar: tqdm, total: Optional[int] = None) -> None: + """ Resets the tqdm bar to 0 progress with a new total, unless it is disabled. 
""" + if not bar.disable: + bar.reset(total=convert_inf(total)) From 24fb75a1f4f024fed5be81e9b2aeccdb10ba6bb8 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Thu, 14 Jan 2021 16:53:12 +0100 Subject: [PATCH 106/136] reconfigure mergify (#5499) * configure mergify * drop gha * drop commented section Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: Roger Shieh --- .github/workflows/events-ocasional.yml | 26 ------- .github/workflows/events-recurrent.yml | 23 ------ .mergify.yml | 98 +++++++++++--------------- 3 files changed, 42 insertions(+), 105 deletions(-) delete mode 100644 .github/workflows/events-ocasional.yml delete mode 100644 .github/workflows/events-recurrent.yml diff --git a/.github/workflows/events-ocasional.yml b/.github/workflows/events-ocasional.yml deleted file mode 100644 index 0ddec73eeffb6..0000000000000 --- a/.github/workflows/events-ocasional.yml +++ /dev/null @@ -1,26 +0,0 @@ -name: Ocasional events - -on: - push: - branches: [master, "release/*"] - pull_request_target: {} - -jobs: - - # autoupdate is a GitHub Action that auto-updates pull requests branches whenever changes land on their destination branch. - # see: https://github.com/marketplace/actions/auto-update - pr-auto-update: - name: Auto-update PR - runs-on: ubuntu-18.04 - steps: - - uses: docker://chinthakagodawita/autoupdate-action:v1 - # todo: this shall be resolved with https://github.com/chinthakagodawita/autoupdate/issues/100 - continue-on-error: true - env: - GITHUB_TOKEN: "${{ secrets.GITHUB_TOKEN }}" - DRY_RUN: "false" - PR_FILTER: "labelled" - PR_LABELS: "0:] Ready-To-Go" - MERGE_MSG: "Branch was auto-updated." - RETRY_COUNT: "3" - RETRY_SLEEP: "500" diff --git a/.github/workflows/events-recurrent.yml b/.github/workflows/events-recurrent.yml deleted file mode 100644 index 74839696c6dd1..0000000000000 --- a/.github/workflows/events-recurrent.yml +++ /dev/null @@ -1,23 +0,0 @@ -name: Recurrent events - -on: - push: {} - pull_request: - types: [synchronize] - pull_request_target: {} - -jobs: - - # This label will then be managed by this action. - # It will be added to PRs with merge conflicts and removed from PRs without conflicts. - # https://github.com/mschilde/auto-label-merge-conflicts - pr-label-conflicts: - name: Label PR conflits - runs-on: ubuntu-20.04 - steps: - - uses: mschilde/auto-label-merge-conflicts@v2.0 - with: - CONFLICT_LABEL_NAME: "has conflicts" - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - MAX_RETRIES: 3 - WAIT_MS: 5000 diff --git a/.mergify.yml b/.mergify.yml index cb5ef3ec7519a..4ca323347104e 100644 --- a/.mergify.yml +++ b/.mergify.yml @@ -12,59 +12,45 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-#pull_request_rules: -# -# - name: Automatic merge on approval -# conditions: -# - base=master -# # number of review approvals -# - "#approved-reviews-by>=3" -# # no waiting or assigned review -# - "#review-requested=0" -# # no requested chnages from any reviewer -# - "#changes-requested-reviews-by=0" -# # this serves as ALL check has to pass as we have actually around 40 tests in total -# - "#status-success>=54" -# # this is just in case since we rely on GPU tests (note: redundand to the above) -# - status-success=continuous-integration/drone/pr -# - "status-success=ci/circleci: TPU-tests" -# # this is patter-like, unofrunatly serves as `any(...)` (note: redundand to the above) -# #- "status-success~=^ci/circleci:" -# # no conflict with master branch -# - -conflict -# # was not closed yet -# - -closed -# # filter-out GH draft PRs -# - -draft -# actions: -# delete_head_branch: {} -# merge: -# # https://doc.mergify.io/merge-action.html#strict-merge -# # (on head branch) $ git merge --no-ff base -# # (on head branch) # Wait for CI to go green -# # (on head branch) # Squash all commits -# # (on base branch) $ git merge --ff head -# strict: true -# method: squash -# comment: -# message: Great job! =) -# -# - name: warn on conflicts -# conditions: -# - conflict -# # filter-out GH draft PRs -# - -draft -# actions: -# comment: -# message: This pull request is now in conflict... :( -# -# - name: add core reviewer -# conditions: -# # filter-out GH draft PRs -# - -draft -# # number of review approvals -# - "#approved-reviews-by<3" -# actions: -# request_reviews: -# teams: -# - core-contributors +pull_request_rules: + + - name: warn on conflicts + conditions: + - conflict + - -draft # filter-out GH draft PRs + - -label="has conflicts" + actions: + # comment: + # message: This pull request is now in conflict... :( + label: + add: [ "has conflicts" ] + + - name: resolved conflicts + conditions: + - -conflict + - label="has conflicts" + - -draft # filter-out GH draft PRs + - -merged # not merged yet + - -closed + actions: + label: + remove: [ "has conflicts" ] + + - name: update PR + conditions: + - conflict + - -draft # filter-out GH draft PRs + - label="0:] Ready-To-Go" + actions: + update: {} + + - name: add core reviewer + conditions: + - -conflict # skip if conflict + - -draft # filter-out GH draft PRs + - label="0:] Ready-To-Go" + - "#approved-reviews-by<3" # number of review approvals + actions: + request_reviews: + teams: + - core-contributors From d15f7a0a9d730930426bdf419a0d9814b464064c Mon Sep 17 00:00:00 2001 From: lacrosse91 Date: Fri, 15 Jan 2021 07:17:33 +0900 Subject: [PATCH 107/136] Fix Wrong exception message (#5492) * Update accelerator_connector.py * Apply suggestions from code review Co-authored-by: Rohit Gupta Co-authored-by: Jirka Borovec Co-authored-by: Rohit Gupta --- pytorch_lightning/accelerators/accelerator_connector.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 4d899da2b0ec2..c911225d0b29f 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -342,8 +342,8 @@ def set_distributed_mode(self): # throw error to force user ddp or ddp2 choice if self.trainer.num_nodes > 1 and not (self.trainer.use_ddp2 or self.trainer.use_ddp): raise MisconfigurationException( - 'DataParallel does not support num_nodes > 1. Switching to DistributedDataParallel for you. 
' - 'To silence this warning set `accelerator="ddp"` or `accelerator="ddp2"`' + 'DataParallel does not support num_nodes > 1. ' + 'To avoid this exception, set `accelerator="ddp"` or `accelerator="ddp2"`' ) rank_zero_info(f'GPU available: {torch.cuda.is_available()}, used: {self.trainer.on_gpu}') From d62ca82f6e6e3b414d7821b0fd7f7d2b002b12d8 Mon Sep 17 00:00:00 2001 From: Skyy93 Date: Fri, 15 Jan 2021 06:28:58 +0100 Subject: [PATCH 108/136] Tensorboard Docu about Hyperparams saving (#5158) * Add documentation to tensorboard * Remove unnecessary whitespaces * Update pytorch_lightning/loggers/tensorboard.py Co-authored-by: Jirka Borovec * Add metrics to tensorboard logger * Whitespace removed Co-authored-by: Jirka Borovec --- pytorch_lightning/loggers/tensorboard.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/loggers/tensorboard.py b/pytorch_lightning/loggers/tensorboard.py index b3365de25b384..f8e984c6ff5bc 100644 --- a/pytorch_lightning/loggers/tensorboard.py +++ b/pytorch_lightning/loggers/tensorboard.py @@ -144,8 +144,21 @@ def experiment(self) -> SummaryWriter: return self._experiment @rank_zero_only - def log_hyperparams(self, params: Union[Dict[str, Any], Namespace], - metrics: Optional[Dict[str, Any]] = None) -> None: + def log_hyperparams( + self, + params: Union[Dict[str, Any], Namespace], + metrics: Optional[Dict[str, Any]] = None, + ) -> None: + """ + Record hyperparameters. TensorBoard logs with and without saved hyperparameters + are incompatible, the hyperparameters are then not displayed in the TensorBoard. + Please delete or move the previously saved logs to display the new ones with hyperparameters. + + Args: + params: a dictionary-like container with the hyperparameters + metrics: Dictionary with metric names as keys and measured quantities as values + """ + params = self._convert_params(params) # store params to output From 7f352cb69a8202e3f829419657597697ca5d99e2 Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Fri, 15 Jan 2021 20:52:45 +0530 Subject: [PATCH 109/136] fix reinit_schedulers with correct optimizer (#5519) * update test * syntax * fix * update test * scheduler * only apex * fix * rev drone * chlog --- CHANGELOG.md | 5 +++- .../accelerators/dp_accelerator.py | 24 ------------------- pytorch_lightning/trainer/optimizers.py | 10 ++++---- tests/models/test_amp.py | 22 ++++++++++++----- 4 files changed, 25 insertions(+), 36 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2011626e67bbc..e3c2a33469f3b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed a visual bug in the progress bar display initialization ([#4579](https://github.com/PyTorchLightning/pytorch-lightning/pull/4579)) +- Fixed `reinit_scheduler_properties` with correct optimizer ([#5519](https://github.com/PyTorchLightning/pytorch-lightning/pull/5519)) + + ## [1.1.4] - 2021-01-12 ### Added @@ -38,7 +41,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed `transfer_batch_to_device` for DDP with `len(devices_ids) == 1` ([#5195](https://github.com/PyTorchLightning/pytorch-lightning/pull/5195)) - Logging only on `not should_accumulate()` during training ([#5417](https://github.com/PyTorchLightning/pytorch-lightning/pull/5417)) -- Resolve interpolation bug with Hydra ([#5406](https://github.com/PyTorchLightning/pytorch-lightning/pull/5406)) +- Resolve interpolation bug with Hydra ([#5406](https://github.com/PyTorchLightning/pytorch-lightning/pull/5406)) - Check environ before selecting a seed to prevent warning message ([#4743](https://github.com/PyTorchLightning/pytorch-lightning/pull/4743)) diff --git a/pytorch_lightning/accelerators/dp_accelerator.py b/pytorch_lightning/accelerators/dp_accelerator.py index 7517c774f51dd..03c9ebb442fb2 100644 --- a/pytorch_lightning/accelerators/dp_accelerator.py +++ b/pytorch_lightning/accelerators/dp_accelerator.py @@ -144,30 +144,6 @@ def test_step_end(self, output): output = output.mean() return output - def reinit_scheduler_properties(self, optimizers: list, schedulers: list): - """ - Reinitialize optimizer.step properties added by schedulers - """ - for scheduler in schedulers: - scheduler = scheduler['scheduler'] - - for optimizer in optimizers: - # check that we dont mix users optimizers and schedulers - if scheduler.optimizer == optimizer: - # Find the mro belonging to the base lr scheduler class - for i, mro in enumerate(scheduler.__class__.__mro__): - is_regular_scheduler = optim.lr_scheduler._LRScheduler - is_lr_reduce_on_plateau = optim.lr_scheduler.ReduceLROnPlateau - if is_regular_scheduler or is_lr_reduce_on_plateau: - idx = i - state = scheduler.state_dict() - else: - state = None - - scheduler.__class__.__mro__[idx].__init__(scheduler, optimizer) - if state is not None: - scheduler.load_state_dict(state) - def get_reference_model(self, model) -> LightningModule: if isinstance(model, LightningDataParallel): return model.module diff --git a/pytorch_lightning/trainer/optimizers.py b/pytorch_lightning/trainer/optimizers.py index a8cb1e279984f..81f6eb64a4ab0 100644 --- a/pytorch_lightning/trainer/optimizers.py +++ b/pytorch_lightning/trainer/optimizers.py @@ -145,6 +145,7 @@ def reinit_scheduler_properties(self, optimizers: list, schedulers: list): # Reinitialize optimizer.step properties added by schedulers for scheduler in schedulers: scheduler = scheduler['scheduler'] + state = None for optimizer in optimizers: # check that we dont mix users optimizers and schedulers @@ -152,14 +153,13 @@ def reinit_scheduler_properties(self, optimizers: list, schedulers: list): # Find the mro belonging to the base lr scheduler class for i, mro in enumerate(scheduler.__class__.__mro__): if mro in (optim.lr_scheduler._LRScheduler, optim.lr_scheduler.ReduceLROnPlateau): - idx = i state = scheduler.state_dict() - else: - state = None + scheduler.__class__.__mro__[i].__init__(scheduler, optimizer) + scheduler.load_state_dict(state) + break - scheduler.__class__.__mro__[idx].__init__(scheduler, optimizer) if state is not None: - scheduler.load_state_dict(state) + break class _MockOptimizer(Optimizer): diff --git a/tests/models/test_amp.py b/tests/models/test_amp.py index 60da3ba55eba4..214c3951c80dd 100644 --- a/tests/models/test_amp.py +++ b/tests/models/test_amp.py @@ -16,15 +16,16 @@ import pytest import torch +from torch import optim +import tests.base.develop_pipelines as tpipes +import tests.base.develop_utils as tutils from pytorch_lightning import Trainer from pytorch_lightning.loggers import 
WandbLogger from pytorch_lightning.trainer.states import TrainerState from pytorch_lightning.utilities import APEX_AVAILABLE from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.base import EvalModelTemplate -import tests.base.develop_pipelines as tpipes -import tests.base.develop_utils as tutils @pytest.mark.skip(reason='dp + amp not supported currently') # TODO @@ -189,9 +190,15 @@ def test_amp_without_apex(tmpdir): @pytest.mark.skipif(not APEX_AVAILABLE, reason="test requires apex") def test_amp_with_apex(tmpdir): """Check calling apex scaling in training.""" - - model = EvalModelTemplate() - + class CustomModel(EvalModelTemplate): + def configure_optimizers(self): + optimizer1 = optim.Adam(self.parameters(), lr=self.learning_rate) + optimizer2 = optim.SGD(self.parameters(), lr=self.learning_rate) + lr_scheduler1 = optim.lr_scheduler.StepLR(optimizer1, 1, gamma=0.1) + lr_scheduler2 = optim.lr_scheduler.StepLR(optimizer2, 1, gamma=0.1) + return [optimizer1, optimizer2], [lr_scheduler1, lr_scheduler2] + + model = CustomModel() trainer = Trainer( default_root_dir=tmpdir, max_epochs=1, @@ -202,4 +209,7 @@ def test_amp_with_apex(tmpdir): assert str(trainer.amp_backend) == "AMPType.APEX" trainer.fit(model) assert trainer.state == TrainerState.FINISHED - assert trainer.dev_debugger.count_events('AMP') == 10 + assert trainer.dev_debugger.count_events('AMP') == 20 + + assert isinstance(trainer.lr_schedulers[0]['scheduler'].optimizer, optim.Adam) + assert isinstance(trainer.lr_schedulers[1]['scheduler'].optimizer, optim.SGD) From 6926b849372fe8f7bc6d5fa8c9eb3ba856645534 Mon Sep 17 00:00:00 2001 From: ananthsub Date: Sat, 16 Jan 2021 02:57:30 -0800 Subject: [PATCH 110/136] [bugfix] Fix signature mismatch in DDPCPUHPCAccelerator's model_to_device (#5505) * Update ddp_cpu_hpc_accelerator.py * Update CHANGELOG.md --- CHANGELOG.md | 1 + pytorch_lightning/accelerators/ddp_cpu_hpc_accelerator.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e3c2a33469f3b..7bc448667a86e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -43,6 +43,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Logging only on `not should_accumulate()` during training ([#5417](https://github.com/PyTorchLightning/pytorch-lightning/pull/5417)) - Resolve interpolation bug with Hydra ([#5406](https://github.com/PyTorchLightning/pytorch-lightning/pull/5406)) - Check environ before selecting a seed to prevent warning message ([#4743](https://github.com/PyTorchLightning/pytorch-lightning/pull/4743)) +- Fixed signature mismatch in `model_to_device` of `DDPCPUHPCAccelerator` ([#5505](https://github.com/PyTorchLightning/pytorch-lightning/pull/5505)) ## [1.1.3] - 2021-01-05 diff --git a/pytorch_lightning/accelerators/ddp_cpu_hpc_accelerator.py b/pytorch_lightning/accelerators/ddp_cpu_hpc_accelerator.py index b9a71ed271744..4694a31438ca6 100644 --- a/pytorch_lightning/accelerators/ddp_cpu_hpc_accelerator.py +++ b/pytorch_lightning/accelerators/ddp_cpu_hpc_accelerator.py @@ -42,7 +42,7 @@ def __init__(self, super().__init__(trainer, cluster_environment, ddp_plugin) self.nickname = 'ddp_cpu' - def model_to_device(self, model, process_idx): + def model_to_device(self, model): model.cpu() def get_device_ids(self): From c80e45d2fe0af94d059dc59e6b77417c48becf6d Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Mon, 18 Jan 2021 13:54:27 +0530 Subject: [PATCH 111/136] Fix val_check_interval with fast_dev_run (#5540) * fix val_check_interval with fast_dev_run * chlog --- CHANGELOG.md | 3 + .../trainer/connectors/debugging_connector.py | 2 +- tests/trainer/flags/test_fast_dev_run.py | 59 +++++++++++++------ 3 files changed, 45 insertions(+), 19 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7bc448667a86e..422f2f4385f69 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed `reinit_scheduler_properties` with correct optimizer ([#5519](https://github.com/PyTorchLightning/pytorch-lightning/pull/5519)) +- Fixed `val_check_interval` with `fast_dev_run` ([#5540](https://github.com/PyTorchLightning/pytorch-lightning/pull/5540)) + + ## [1.1.4] - 2021-01-12 ### Added diff --git a/pytorch_lightning/trainer/connectors/debugging_connector.py b/pytorch_lightning/trainer/connectors/debugging_connector.py index ecba35d5dbf55..1fa1f4f319289 100644 --- a/pytorch_lightning/trainer/connectors/debugging_connector.py +++ b/pytorch_lightning/trainer/connectors/debugging_connector.py @@ -59,7 +59,7 @@ def on_init_start( self.trainer.max_steps = fast_dev_run self.trainer.num_sanity_val_steps = 0 self.trainer.max_epochs = 1 - self.trainer.val_check_interval = 1.0 + val_check_interval = 1.0 self.trainer.check_val_every_n_epoch = 1 self.trainer.logger = DummyLogger() diff --git a/tests/trainer/flags/test_fast_dev_run.py b/tests/trainer/flags/test_fast_dev_run.py index 624b3cc6ac9c2..2eaa6fd7f888d 100644 --- a/tests/trainer/flags/test_fast_dev_run.py +++ b/tests/trainer/flags/test_fast_dev_run.py @@ -36,30 +36,59 @@ def test_callbacks_and_logger_not_called_with_fastdevrun(tmpdir, fast_dev_run): class FastDevRunModel(BoringModel): def __init__(self): super().__init__() - self.training_step_called = False - self.validation_step_called = False - self.test_step_called = False + self.training_step_call_count = 0 + self.training_epoch_end_call_count = 0 + self.validation_step_call_count = 0 + self.validation_epoch_end_call_count = 0 + self.test_step_call_count = 0 def training_step(self, batch, batch_idx): self.log('some_metric', torch.tensor(7.)) self.logger.experiment.dummy_log('some_distribution', torch.randn(7) + batch_idx) - 
self.training_step_called = True + self.training_step_call_count += 1 return super().training_step(batch, batch_idx) + def training_epoch_end(self, outputs): + self.training_epoch_end_call_count += 1 + super().training_epoch_end(outputs) + def validation_step(self, batch, batch_idx): - self.validation_step_called = True + self.validation_step_call_count += 1 return super().validation_step(batch, batch_idx) + def validation_epoch_end(self, outputs): + self.validation_epoch_end_call_count += 1 + super().validation_epoch_end(outputs) + + def test_step(self, batch, batch_idx): + self.test_step_call_count += 1 + return super().test_step(batch, batch_idx) + checkpoint_callback = ModelCheckpoint() early_stopping_callback = EarlyStopping() trainer_config = dict( fast_dev_run=fast_dev_run, + val_check_interval=2, logger=True, log_every_n_steps=1, callbacks=[checkpoint_callback, early_stopping_callback], ) - def _make_fast_dev_run_assertions(trainer): + def _make_fast_dev_run_assertions(trainer, model): + # check the call count for train/val/test step/epoch + assert model.training_step_call_count == fast_dev_run + assert model.training_epoch_end_call_count == 1 + assert model.validation_step_call_count == 0 if model.validation_step is None else fast_dev_run + assert model.validation_epoch_end_call_count == 0 if model.validation_step is None else 1 + assert model.test_step_call_count == fast_dev_run + + # check trainer arguments + assert trainer.max_steps == fast_dev_run + assert trainer.num_sanity_val_steps == 0 + assert trainer.max_epochs == 1 + assert trainer.val_check_interval == 1.0 + assert trainer.check_val_every_n_epoch == 1 + # there should be no logger with fast_dev_run assert isinstance(trainer.logger, DummyLogger) assert len(trainer.dev_debugger.logged_metrics) == fast_dev_run @@ -76,13 +105,10 @@ def _make_fast_dev_run_assertions(trainer): train_val_step_model = FastDevRunModel() trainer = Trainer(**trainer_config) results = trainer.fit(train_val_step_model) - assert results + trainer.test(ckpt_path=None) - # make sure both training_step and validation_step were called - assert train_val_step_model.training_step_called - assert train_val_step_model.validation_step_called - - _make_fast_dev_run_assertions(trainer) + assert results + _make_fast_dev_run_assertions(trainer, train_val_step_model) # ----------------------- # also called once with no val step @@ -92,10 +118,7 @@ def _make_fast_dev_run_assertions(trainer): trainer = Trainer(**trainer_config) results = trainer.fit(train_step_only_model) - assert results + trainer.test(ckpt_path=None) - # make sure only training_step was called - assert train_step_only_model.training_step_called - assert not train_step_only_model.validation_step_called - - _make_fast_dev_run_assertions(trainer) + assert results + _make_fast_dev_run_assertions(trainer, train_step_only_model) From a56f745391e64ebeac076529fb79144a54984bc0 Mon Sep 17 00:00:00 2001 From: Jeff Yang Date: Mon, 18 Jan 2021 17:35:17 +0630 Subject: [PATCH 112/136] Remove unused `beta` argument in precision/recall (#5532) Co-authored-by: Jirka Borovec --- pytorch_lightning/metrics/classification/precision_recall.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pytorch_lightning/metrics/classification/precision_recall.py b/pytorch_lightning/metrics/classification/precision_recall.py index 6c2bf64a1ecfc..c5a577a5d45e3 100644 --- a/pytorch_lightning/metrics/classification/precision_recall.py +++ b/pytorch_lightning/metrics/classification/precision_recall.py @@ -42,7 +42,6 @@ class 
Precision(Metric): Args: num_classes: Number of classes in the dataset. - beta: Beta coefficient in the F measure. threshold: Threshold value for binary or multi-label logits. default: 0.5 @@ -135,7 +134,6 @@ class Recall(Metric): Args: num_classes: Number of classes in the dataset. - beta: Beta coefficient in the F measure. threshold: Threshold value for binary or multi-label logits. default: 0.5 From 18d2ae82b444dac1817f66015158561168ea38a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Mon, 18 Jan 2021 14:31:17 +0100 Subject: [PATCH 113/136] Fix logging on_train_batch_end in a callback with multiple optimizers (#5521) * Start with the failing test * Then fix the failing test * Update CHANGELOG --- CHANGELOG.md | 3 ++ .../logger_connector/epoch_result_store.py | 8 +--- .../optimization/test_multiple_optimizers.py | 45 +++++++++++-------- 3 files changed, 30 insertions(+), 26 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 422f2f4385f69..65c086b2b6670 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed a visual bug in the progress bar display initialization ([#4579](https://github.com/PyTorchLightning/pytorch-lightning/pull/4579)) +- Fixed logging on_train_batch_end in a callback with multiple optimizers ([#5521](https://github.com/PyTorchLightning/pytorch-lightning/pull/5521)) + + - Fixed `reinit_scheduler_properties` with correct optimizer ([#5519](https://github.com/PyTorchLightning/pytorch-lightning/pull/5519)) diff --git a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py index dd12a2970727a..2e27f8cf61ab3 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py @@ -203,13 +203,7 @@ def auto_reduce_results_on_epoch_end(self) -> None: epoch_metrics = self._internals[dl_idx] if self._internal_type == ResultStoreType.INSIDE_BATCH_TRAIN_LOOP: - - num_opt_idx = len(self._internals[dl_idx]) - 1 - - # Make sure we didn't create key - assert num_opt_idx >= 0 - - for opt_idx in range(num_opt_idx + 1): + for opt_idx in list(epoch_metrics): # TODO: Figure out to reduce memory # TODO: How to start training in middle of epoch opt_outputs = epoch_metrics[opt_idx] diff --git a/tests/trainer/optimization/test_multiple_optimizers.py b/tests/trainer/optimization/test_multiple_optimizers.py index 78b6f8f7ff84a..a26accfab106f 100644 --- a/tests/trainer/optimization/test_multiple_optimizers.py +++ b/tests/trainer/optimization/test_multiple_optimizers.py @@ -22,23 +22,18 @@ def test_unbalanced_logging_with_multiple_optimizers(tmpdir): """ - This tests ensures reduction works in un-balanced logging settings + This tests ensures reduction works in unbalanced logging settings, + even when a Callback also logs. 
""" class TestModel(BoringModel): - - loss_1 = [] - loss_2 = [] + actual = {0: [], 1: []} def training_step(self, batch, batch_idx, optimizer_idx): - output = self.layer(batch) - loss = self.loss(batch, output) - if optimizer_idx == 0 and self.trainer.global_step > 10: - self.log("loss_1", loss, on_epoch=True, prog_bar=True) - self.loss_1.append(loss.detach().clone()) - elif optimizer_idx == 1: - self.log("loss_2", loss, on_epoch=True, prog_bar=True) - self.loss_2.append(loss.detach().clone()) - return {"loss": loss} + out = super().training_step(batch, batch_idx) + loss = out["loss"] + self.log(f"loss_{optimizer_idx}", loss, on_epoch=True) + self.actual[optimizer_idx].append(loss) + return out def configure_optimizers(self): optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.001) @@ -48,16 +43,28 @@ def configure_optimizers(self): model = TestModel() model.training_epoch_end = None + class TestCallback(pl.Callback): + def on_train_batch_end(self, trainer, pl_module, output, batch, batch_idx, dl_idx): + # when this is called, the EpochResultStore state has not been reset yet because we are still + # "INSIDE_BATCH_TRAIN_LOOP" and the LoggerConnector runs its `on_train_batch_end` after the + # Callback (see `TrainLoop.on_train_batch_end`). For this reason, opt_idx here is the index + # of the last optimizer updated (the second, index 1). This produced a KeyError as reported in #5459 + pl_module.log("test_train_batch_end", trainer.logger_connector.cached_results._opt_idx) + # Initialize a trainer trainer = pl.Trainer( default_root_dir=tmpdir, max_epochs=1, + limit_train_batches=5, + limit_val_batches=5, + callbacks=[TestCallback()], + weights_summary=None, ) - trainer.fit(model) - assert torch.equal(trainer.callback_metrics["loss_2_step"], model.loss_2[-1]) - assert torch.equal(trainer.callback_metrics["loss_1_step"], model.loss_1[-1]) - # test loss are properly reduced - assert torch.abs(trainer.callback_metrics["loss_2_epoch"] - torch.FloatTensor(model.loss_2).mean()) < 1e-6 - assert torch.abs(trainer.callback_metrics["loss_1_epoch"] - torch.FloatTensor(model.loss_1).mean()) < 1e-6 + for k, v in model.actual.items(): + assert torch.equal(trainer.callback_metrics[f"loss_{k}_step"], v[-1]) + # test loss is properly reduced + torch.testing.assert_allclose(trainer.callback_metrics[f"loss_{k}_epoch"], torch.tensor(v).mean()) + + assert trainer.callback_metrics["test_train_batch_end"] == len(model.optimizers()) - 1 From 18bba250fcf05e5f271a5726ac345056cdec0f27 Mon Sep 17 00:00:00 2001 From: Sidhant Sundrani Date: Tue, 19 Jan 2021 13:42:30 +0530 Subject: [PATCH 114/136] fix command line run for refinforce_learn_qnet in pl_examples (#5414) * fix wrong argument in argparse * remove wrong default arg in argparser * disable add help argparse --- pl_examples/domain_templates/reinforce_learn_Qnet.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/pl_examples/domain_templates/reinforce_learn_Qnet.py b/pl_examples/domain_templates/reinforce_learn_Qnet.py index 21583dad1f086..c817f69ee205d 100644 --- a/pl_examples/domain_templates/reinforce_learn_Qnet.py +++ b/pl_examples/domain_templates/reinforce_learn_Qnet.py @@ -393,17 +393,13 @@ def add_model_specific_args(parent_parser): # pragma: no-cover help="how many frames do we update the target network") parser.add_argument("--replay_size", type=int, default=1000, help="capacity of the replay buffer") - parser.add_argument("--warm_start_size", type=int, default=1000, + parser.add_argument("--warm_start_steps", type=int, 
default=1000, help="how many samples do we use to fill our buffer at the start of training") parser.add_argument("--eps_last_frame", type=int, default=1000, help="what frame should epsilon stop decaying") parser.add_argument("--eps_start", type=float, default=1.0, help="starting value of epsilon") parser.add_argument("--eps_end", type=float, default=0.01, help="final value of epsilon") parser.add_argument("--episode_length", type=int, default=200, help="max length of an episode") - parser.add_argument("--max_episode_reward", type=int, default=200, - help="max episode reward in the environment") - parser.add_argument("--warm_start_steps", type=int, default=1000, - help="max episode reward in the environment") return parser @@ -424,7 +420,7 @@ def main(args) -> None: torch.manual_seed(0) np.random.seed(0) - parser = argparse.ArgumentParser() + parser = argparse.ArgumentParser(add_help=False) parser = DQNLightning.add_model_specific_args(parser) args = parser.parse_args() From 389186c7ad2ef6572d2b93430d17ec08e3a186f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Tue, 19 Jan 2021 11:59:26 +0100 Subject: [PATCH 115/136] Drop greetings comment (#5563) Co-authored-by: chaton --- .github/workflows/greetings.yml | 14 -------------- 1 file changed, 14 deletions(-) delete mode 100644 .github/workflows/greetings.yml diff --git a/.github/workflows/greetings.yml b/.github/workflows/greetings.yml deleted file mode 100644 index bdcabdcf69cbf..0000000000000 --- a/.github/workflows/greetings.yml +++ /dev/null @@ -1,14 +0,0 @@ -name: Greetings -# https://github.com/marketplace/actions/first-interaction - -on: [issues] # pull_request - -jobs: - greeting: - runs-on: ubuntu-20.04 - steps: - - uses: actions/first-interaction@v1 - with: - repo-token: ${{ secrets.GITHUB_TOKEN }} - issue-message: 'Hi! thanks for your contribution!, great first issue!' - pr-message: 'Hey thanks for the input! Please give us a bit of time to review it!' 
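Both argparse fixes in this series (the DQN example above and the semantic segmentation example later in the series) come down to creating the top-level parser with `add_help=False` before handing it to `add_model_specific_args`. When a child parser is built from a parent (these scripts appear to use `ArgumentParser(parents=[parent_parser])`, though the construction is not visible in every hunk), two parsers end up registering `-h/--help` and argparse raises a "conflicting option strings" error. A small self-contained illustration with made-up option names, not the actual Lightning argument sets:

import argparse

def add_model_specific_args(parent_parser):
    # the child inherits the parent's options; if the parent also owned -h/--help,
    # constructing this parser would raise "conflicting option strings: -h, --help"
    parser = argparse.ArgumentParser(parents=[parent_parser])
    parser.add_argument("--lr", type=float, default=1e-2)
    return parser

parent = argparse.ArgumentParser(add_help=False)  # as done in the patches above
parent.add_argument("--data_dir", type=str, default=".")
args = add_model_specific_args(parent).parse_args(["--lr", "0.1"])
print(args.lr, args.data_dir)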
From 486f682c4ac34f4e6d5033fe24c009219bf22b8b Mon Sep 17 00:00:00 2001 From: Tobias Maier Date: Tue, 19 Jan 2021 13:58:32 +0100 Subject: [PATCH 116/136] Fix root node resolution in slurm environment Co-authored-by: Jirka Borovec Co-authored-by: Nicki Skafte Co-authored-by: chaton --- pytorch_lightning/cluster_environments/slurm_environment.py | 2 +- pytorch_lightning/trainer/connectors/slurm_connector.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/cluster_environments/slurm_environment.py b/pytorch_lightning/cluster_environments/slurm_environment.py index 6df1cf680c57f..cb8db4d440178 100644 --- a/pytorch_lightning/cluster_environments/slurm_environment.py +++ b/pytorch_lightning/cluster_environments/slurm_environment.py @@ -26,7 +26,7 @@ def __init__(self): def master_address(self): # figure out the root node addr try: - root_node = os.environ["SLURM_NODELIST"].split(" ")[0] + root_node = os.environ["SLURM_NODELIST"].split(" ")[0].split(",")[0] except Exception: root_node = "127.0.0.1" diff --git a/pytorch_lightning/trainer/connectors/slurm_connector.py b/pytorch_lightning/trainer/connectors/slurm_connector.py index 4cb954a8e92fc..9c04c10559b6d 100644 --- a/pytorch_lightning/trainer/connectors/slurm_connector.py +++ b/pytorch_lightning/trainer/connectors/slurm_connector.py @@ -137,7 +137,7 @@ def connect_ddp(self, global_rank: int, world_size: int) -> None: # figure out the root node addr try: - root_node = os.environ["SLURM_NODELIST"].split(" ")[0] + root_node = os.environ["SLURM_NODELIST"].split(" ")[0].split(",")[0] except Exception: root_node = "127.0.0.1" From 3825ce4547b61b83b39927784cc9b864cf2bfc87 Mon Sep 17 00:00:00 2001 From: Sidhant Sundrani Date: Tue, 19 Jan 2021 22:22:52 +0530 Subject: [PATCH 117/136] fix argparse conflicting options error (#5569) Co-authored-by: Jirka Borovec --- pl_examples/domain_templates/semantic_segmentation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pl_examples/domain_templates/semantic_segmentation.py b/pl_examples/domain_templates/semantic_segmentation.py index 2e718a37ac4b0..8ffd539b80aaf 100644 --- a/pl_examples/domain_templates/semantic_segmentation.py +++ b/pl_examples/domain_templates/semantic_segmentation.py @@ -284,7 +284,7 @@ def main(hparams: Namespace): if __name__ == '__main__': cli_lightning_logo() - parser = ArgumentParser() + parser = ArgumentParser(add_help=False) parser = SegModel.add_model_specific_args(parser) hparams = parser.parse_args() From 088b3528ff82699a05fecd3b75e8752ddd935ab1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Tue, 19 Jan 2021 21:03:17 +0100 Subject: [PATCH 118/136] Prepare 1.1.5 release (#5576) Co-authored-by: Rohit Gupta --- CHANGELOG.md | 21 ++------------------- pytorch_lightning/__init__.py | 2 +- 2 files changed, 3 insertions(+), 20 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 65c086b2b6670..10bf0211351ce 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,32 +4,15 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
-## [unreleased.Bugfixes] - YYYY-MM-DD - -### Added - - -### Changed - - -### Deprecated - - -### Removed +## [1.1.5] - 2021-01-19 ### Fixed - Fixed a visual bug in the progress bar display initialization ([#4579](https://github.com/PyTorchLightning/pytorch-lightning/pull/4579)) - - -- Fixed logging on_train_batch_end in a callback with multiple optimizers ([#5521](https://github.com/PyTorchLightning/pytorch-lightning/pull/5521)) - - +- Fixed logging `on_train_batch_end` in a callback with multiple optimizers ([#5521](https://github.com/PyTorchLightning/pytorch-lightning/pull/5521)) - Fixed `reinit_scheduler_properties` with correct optimizer ([#5519](https://github.com/PyTorchLightning/pytorch-lightning/pull/5519)) - - - Fixed `val_check_interval` with `fast_dev_run` ([#5540](https://github.com/PyTorchLightning/pytorch-lightning/pull/5540)) diff --git a/pytorch_lightning/__init__.py b/pytorch_lightning/__init__.py index 1f672dcd9aac8..8b8e3328375bb 100644 --- a/pytorch_lightning/__init__.py +++ b/pytorch_lightning/__init__.py @@ -5,7 +5,7 @@ import time _this_year = time.strftime("%Y") -__version__ = '1.1.4' +__version__ = '1.1.5' __author__ = 'William Falcon et al.' __author_email__ = 'waf2107@columbia.edu' __license__ = 'Apache-2.0' From f477c2fd2980ad128bfe79a3b859e0b81b435507 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Tue, 19 Jan 2021 23:24:14 +0100 Subject: [PATCH 119/136] Add new CHANGELOG section (#5580) --- CHANGELOG.md | 20 ++++++++++++++++++- .../checkpointing/test_legacy_checkpoints.py | 1 + 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 10bf0211351ce..5fd70e3583c01 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,11 +5,29 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
-## [1.1.5] - 2021-01-19 +## [unreleased.Bugfixes] - YYYY-MM-DD + +### Added + + +### Changed + + +### Deprecated + + +### Removed + ### Fixed + + +## [1.1.5] - 2021-01-19 + +### Fixed + - Fixed a visual bug in the progress bar display initialization ([#4579](https://github.com/PyTorchLightning/pytorch-lightning/pull/4579)) - Fixed logging `on_train_batch_end` in a callback with multiple optimizers ([#5521](https://github.com/PyTorchLightning/pytorch-lightning/pull/5521)) - Fixed `reinit_scheduler_properties` with correct optimizer ([#5519](https://github.com/PyTorchLightning/pytorch-lightning/pull/5519)) diff --git a/tests/checkpointing/test_legacy_checkpoints.py b/tests/checkpointing/test_legacy_checkpoints.py index 48f5a53733214..577362e65f1c9 100644 --- a/tests/checkpointing/test_legacy_checkpoints.py +++ b/tests/checkpointing/test_legacy_checkpoints.py @@ -46,6 +46,7 @@ "1.1.2", "1.1.3", "1.1.4", + "1.1.5", ]) def test_resume_legacy_checkpoints(tmpdir, pl_version): path_dir = os.path.join(LEGACY_CHECKPOINTS_PATH, pl_version) From a376b652548ef51c1d9dcc770eb1b1ab553842f3 Mon Sep 17 00:00:00 2001 From: Kartik Sharma Date: Sun, 22 Nov 2020 23:05:44 +0530 Subject: [PATCH 120/136] :zap: Added initial setup to calculate model size --- pytorch_lightning/core/memory.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/core/memory.py b/pytorch_lightning/core/memory.py index faafc0a0f0584..757af952f670c 100644 --- a/pytorch_lightning/core/memory.py +++ b/pytorch_lightning/core/memory.py @@ -213,6 +213,23 @@ def out_sizes(self) -> List: def param_nums(self) -> List[int]: return [layer.num_parameters for layer in self._layer_summary.values()] + @property + def total_out_params(self) -> int: + _map_out_size_prod = map(lambda out_size: np.prod(out_size), self.out_sizes) + return sum([out_size_prod for out_size_prod in _map_out_size_prod]) + + @property + def total_params(self) -> int: + return sum(self.param_nums) + + def _get_total_size(self, input_size: tuple) -> int: + # TODO(kartik4949) : get precision. + _precision = 32.0 / 8.0 # 1 byte -> 8 bits + total_input_dsize = abs(np.prod(np.array(input_size))) * _precision / (1024 ** 2.0) + total_output_dsize = abs(2.0 * self.total_out_params * _precision / (1024 ** 2.0)) + total_params_dsize = abs(self.total_params * _precision / (1024 ** 2.0)) + return total_params_dsize + total_output_dsize + total_input_dsize + def summarize(self) -> Dict[str, LayerSummary]: summary = OrderedDict((name, LayerSummary(module)) for name, module in self.named_modules) if self._model.example_input_array is not None: @@ -321,7 +338,7 @@ def _format_summary_table(total_parameters: int, trainable_parameters: int, *col def get_memory_profile(mode: str) -> Union[Dict[str, int], Dict[int, int]]: - """ Get a profile of the current memory usage. + """Get a profile of the current memory usage. 
Args: mode: There are two modes: @@ -367,9 +384,7 @@ def get_gpu_memory_map() -> Dict[str, int]: # Convert lines into a dictionary gpu_memory = [float(x) for x in result.stdout.strip().split(os.linesep)] - gpu_memory_map = { - f"gpu_id: {gpu_id}/memory.used (MB)": memory for gpu_id, memory in enumerate(gpu_memory) - } + gpu_memory_map = {f"gpu_id: {gpu_id}/memory.used (MB)": memory for gpu_id, memory in enumerate(gpu_memory)} return gpu_memory_map From ef4d36b79d18a12bc3c2e5c3c3f410f2605ed125 Mon Sep 17 00:00:00 2001 From: Kartik Sharma Date: Mon, 23 Nov 2020 17:31:18 +0530 Subject: [PATCH 121/136] :hammer: minor refactor --- pytorch_lightning/core/memory.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/core/memory.py b/pytorch_lightning/core/memory.py index 757af952f670c..7e5ecf2ed7169 100644 --- a/pytorch_lightning/core/memory.py +++ b/pytorch_lightning/core/memory.py @@ -216,13 +216,16 @@ def param_nums(self) -> List[int]: @property def total_out_params(self) -> int: _map_out_size_prod = map(lambda out_size: np.prod(out_size), self.out_sizes) - return sum([out_size_prod for out_size_prod in _map_out_size_prod]) + return sum(list(_map_out_size_prod)) @property def total_params(self) -> int: return sum(self.param_nums) - def _get_total_size(self, input_size: tuple) -> int: + def model_size(self, input_size: tuple) -> float: + return self._get_total_size(input_size) + + def _get_total_size(self, input_size: tuple) -> float: # TODO(kartik4949) : get precision. _precision = 32.0 / 8.0 # 1 byte -> 8 bits total_input_dsize = abs(np.prod(np.array(input_size))) * _precision / (1024 ** 2.0) From 1863c4e38165359b336ed850982d7e6ea43fda7e Mon Sep 17 00:00:00 2001 From: Kartik Sharma Date: Mon, 30 Nov 2020 20:10:48 +0530 Subject: [PATCH 122/136] :zap: Model size for different input sizes --- pytorch_lightning/core/memory.py | 56 +++++++++++++++++++++++++------- 1 file changed, 44 insertions(+), 12 deletions(-) diff --git a/pytorch_lightning/core/memory.py b/pytorch_lightning/core/memory.py index 7e5ecf2ed7169..da9b2434d6379 100644 --- a/pytorch_lightning/core/memory.py +++ b/pytorch_lightning/core/memory.py @@ -215,22 +215,52 @@ def param_nums(self) -> List[int]: @property def total_out_params(self) -> int: - _map_out_size_prod = map(lambda out_size: np.prod(out_size), self.out_sizes) - return sum(list(_map_out_size_prod)) + _total_out_params = 0 + + def _get_out_size_params(out_sizes): + nonlocal _total_out_params + if not any(isinstance(i, list) for i in out_sizes): + try: + # try to find prod, i.e check for unknown sizes. + _total_out_params += np.prod(out_sizes) + except: + # do nothing if could not find product. + pass + else: + _ = [_get_out_size_params(out_size) for out_size in out_sizes if isinstance(out_size, list)] + + _get_out_size_params(self.out_sizes) + + return _total_out_params @property def total_params(self) -> int: return sum(self.param_nums) - def model_size(self, input_size: tuple) -> float: - return self._get_total_size(input_size) + @property + def model_size(self, input_size=None) -> float: + if isinstance(self._model.example_input_array, (list, tuple)): + in_features = ( + sum( + [ + input_array.numel() if isinstance(input_array, torch.Tensor) else torch.tensor(input_array) + for input_array in self._model.example_input_array + ] + ), + ) + + elif isinstance(self._model.example_input_array, dict): + # TODO (kartik4949): write input_feature for dict input array. 
+ in_features = (1,) + else: + in_features = (self._model.example_input_array.numel(),) + return self._get_total_size(in_features if not input_size else input_size) def _get_total_size(self, input_size: tuple) -> float: - # TODO(kartik4949) : get precision. - _precision = 32.0 / 8.0 # 1 byte -> 8 bits - total_input_dsize = abs(np.prod(np.array(input_size))) * _precision / (1024 ** 2.0) - total_output_dsize = abs(2.0 * self.total_out_params * _precision / (1024 ** 2.0)) - total_params_dsize = abs(self.total_params * _precision / (1024 ** 2.0)) + _precision_bytes = self._model.precision / 8.0 # 1 byte -> 8 bits + total_input_dsize = abs(np.prod(np.array(input_size))) * _precision_bytes / (1024 ** 2.0) + total_output_dsize = abs(2.0 * self.total_out_params * _precision_bytes / (1024 ** 2.0)) + total_params_dsize = abs(self.total_params * _precision_bytes / (1024 ** 2.0)) return total_params_dsize + total_output_dsize + total_input_dsize def summarize(self) -> Dict[str, LayerSummary]: @@ -282,8 +312,8 @@ def __str__(self): trainable_parameters = sum(p.numel() for p in self._model.parameters() if p.requires_grad) total_parameters = sum(p.numel() for p in self._model.parameters()) - - return _format_summary_table(total_parameters, trainable_parameters, *arrays) + total_model_size = self.model_size + return _format_summary_table(total_parameters, trainable_parameters, total_model_size, *arrays) def __repr__(self): return str(self) @@ -300,7 +330,7 @@ def parse_batch_shape(batch: Any) -> Union[str, List]: return UNKNOWN_SIZE -def _format_summary_table(total_parameters: int, trainable_parameters: int, *cols) -> str: +def _format_summary_table(total_parameters: int, trainable_parameters: int, total_model_size: float, *cols) -> str: """ Takes in a number of arrays, each specifying a column in the summary table, and combines them all into one big @@ -336,6 +366,8 @@ def _format_summary_table(total_parameters: int, trainable_parameters: int, *col summary += "Non-trainable params" summary += "\n" + s.format(get_human_readable_count(total_parameters), 10) summary += "Total params" + summary += "\n" + s.format(total_model_size, 10) + summary += "Total Estimated Model Size (MB)" return summary From 2fb4bdc4630e12e4de3539b09dfca879be7acd35 Mon Sep 17 00:00:00 2001 From: Kartik Sharma Date: Mon, 30 Nov 2020 20:11:11 +0530 Subject: [PATCH 123/136] :zap: added tests --- tests/core/test_memory.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tests/core/test_memory.py b/tests/core/test_memory.py index 142159fa48fd8..b004a014b6c89 100644 --- a/tests/core/test_memory.py +++ b/tests/core/test_memory.py @@ -21,6 +21,11 @@ from tests.base.models import ParityModuleRNN +def almost_equals(a, b, rel_tol=0.0, abs_tol=0.0): + _almost_close = lambda a, b: abs(a - b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol) + return _almost_close(a, b) + + class EmptyModule(LightningModule): """ A module that has no layers """ @@ -36,6 +41,8 @@ def forward(self, *args, **kwargs): class UnorderedModel(LightningModule): """ A model in which the layers not defined in order of execution """ + pre_calculated_model_size = 0.000870 + def __init__(self): super().__init__() # note: the definition order is intentionally scrambled for this test @@ -59,6 +66,8 @@ def forward(self, x, y): class MixedDtypeModel(LightningModule): """ The parameters and inputs of this model have different dtypes. 
""" + pre_calculated_model_size = 0.00182 + def __init__(self): super().__init__() self.embed = nn.Embedding(10, 20) # expects dtype long as input @@ -124,10 +133,20 @@ def test_linear_model_summary_shapes(device, mode): assert model.device == device +@pytest.mark.skipif(not torch.cuda.is_available(), reason="Test requires GPU.") +def test_linear_model_summary_shapes(device, mode): + """ Test that the model size is correctly calculated.""" + model = UnorderedModel().to(device) + model.train() + summary = model.summarize(mode=mode) + assert almost_equals(summary.model_size, model.pre_calculated_model_size, rel_tol=1e-4, abs_tol=1e-4) + + def test_mixed_dtype_model_summary(): """ Test that the model summary works with models that have mixed input- and parameter dtypes. """ model = MixedDtypeModel() summary = model.summarize() + assert almost_equals(summary.model_size, model.pre_calculated_model_size, rel_tol=1e-4, abs_tol=1e-4) assert summary.in_sizes == [ [2, 3], # embed [2, 3, 20], # reduce From 9fd363f6dd72017c23505e7ab014a5526415dc90 Mon Sep 17 00:00:00 2001 From: Kartik Sharma Date: Mon, 30 Nov 2020 20:19:08 +0530 Subject: [PATCH 124/136] :bug: make model_size method --- pytorch_lightning/core/memory.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pytorch_lightning/core/memory.py b/pytorch_lightning/core/memory.py index da9b2434d6379..180c6fae01474 100644 --- a/pytorch_lightning/core/memory.py +++ b/pytorch_lightning/core/memory.py @@ -237,7 +237,6 @@ def _get_out_size_params(out_sizes): def total_params(self) -> int: return sum(self.param_nums) - @property def model_size(self, input_size=None) -> float: if isinstance(self._model.example_input_array, (list, tuple)): in_features = ( From 9a493f9afd902a67458525d30b36680e5a2d3f2e Mon Sep 17 00:00:00 2001 From: Kartik Sharma Date: Mon, 30 Nov 2020 20:23:55 +0530 Subject: [PATCH 125/136] :bug: call model_size --- pytorch_lightning/core/memory.py | 2 +- tests/core/test_memory.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/core/memory.py b/pytorch_lightning/core/memory.py index 180c6fae01474..f895472d342ca 100644 --- a/pytorch_lightning/core/memory.py +++ b/pytorch_lightning/core/memory.py @@ -311,7 +311,7 @@ def __str__(self): trainable_parameters = sum(p.numel() for p in self._model.parameters() if p.requires_grad) total_parameters = sum(p.numel() for p in self._model.parameters()) - total_model_size = self.model_size + total_model_size = self.model_size() return _format_summary_table(total_parameters, trainable_parameters, total_model_size, *arrays) def __repr__(self): diff --git a/tests/core/test_memory.py b/tests/core/test_memory.py index b004a014b6c89..933fbf3270569 100644 --- a/tests/core/test_memory.py +++ b/tests/core/test_memory.py @@ -139,14 +139,14 @@ def test_linear_model_summary_shapes(device, mode): model = UnorderedModel().to(device) model.train() summary = model.summarize(mode=mode) - assert almost_equals(summary.model_size, model.pre_calculated_model_size, rel_tol=1e-4, abs_tol=1e-4) + assert almost_equals(summary.model_size(), model.pre_calculated_model_size, rel_tol=1e-4, abs_tol=1e-4) def test_mixed_dtype_model_summary(): """ Test that the model summary works with models that have mixed input- and parameter dtypes. 
""" model = MixedDtypeModel() summary = model.summarize() - assert almost_equals(summary.model_size, model.pre_calculated_model_size, rel_tol=1e-4, abs_tol=1e-4) + assert almost_equals(summary.model_size(), model.pre_calculated_model_size, rel_tol=1e-4, abs_tol=1e-4) assert summary.in_sizes == [ [2, 3], # embed [2, 3, 20], # reduce From 419ecdf007b5c4999752f134204c50375ee8728d Mon Sep 17 00:00:00 2001 From: Kartik Sharma Date: Mon, 30 Nov 2020 23:13:38 +0530 Subject: [PATCH 126/136] :hammer: model size summary refactor --- pytorch_lightning/core/memory.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/core/memory.py b/pytorch_lightning/core/memory.py index f895472d342ca..0f39810c772ee 100644 --- a/pytorch_lightning/core/memory.py +++ b/pytorch_lightning/core/memory.py @@ -365,7 +365,7 @@ def _format_summary_table(total_parameters: int, trainable_parameters: int, tota summary += "Non-trainable params" summary += "\n" + s.format(get_human_readable_count(total_parameters), 10) summary += "Total params" - summary += "\n" + s.format(total_model_size, 10) + summary += "\n" + s.format(get_formatted_model_size(total_model_size), 10) summary += "Total Estimated Model Size (MB)" return summary @@ -421,6 +421,8 @@ def get_gpu_memory_map() -> Dict[str, int]: gpu_memory_map = {f"gpu_id: {gpu_id}/memory.used (MB)": memory for gpu_id, memory in enumerate(gpu_memory)} return gpu_memory_map +def get_formatted_model_size(total_model_size: float) -> float: + return "{:.4f}".format(total_model_size) def get_human_readable_count(number: int) -> str: """ From 80cb6994fe77cdb845afbbb585b66bc5f38076f9 Mon Sep 17 00:00:00 2001 From: Kartik Sharma Date: Tue, 1 Dec 2020 01:24:57 +0530 Subject: [PATCH 127/136] :hammer: Simplified tests --- tests/core/test_memory.py | 83 ++++++++++++++++++++++++++++++++------- 1 file changed, 69 insertions(+), 14 deletions(-) diff --git a/tests/core/test_memory.py b/tests/core/test_memory.py index 933fbf3270569..a26e2bc26523d 100644 --- a/tests/core/test_memory.py +++ b/tests/core/test_memory.py @@ -26,6 +26,29 @@ def almost_equals(a, b, rel_tol=0.0, abs_tol=0.0): return _almost_close(a, b) +class KnownNet(LightningModule): + """ Pre calculated known model """ + + def __init__(self): + super().__init__() + self.conv1 = nn.Conv2d(1, 10, kernel_size=5) + self.conv2 = nn.Conv2d(10, 20, kernel_size=5) + self.conv3 = nn.Conv2d(20, 30, kernel_size=3) + self.conv4 = nn.Conv2d(30, 30, kernel_size=3) + self.fc1 = nn.Linear(10, 50) + self.fc2 = nn.Linear(50, 10) + + def forward(self, x): + x = self.conv1(x) + x = self.conv2(x) + x = self.conv3(x) + x = self.conv4(x) + x = x.view(-1, 10) + x = self.fc1(x) + x = self.fc2(x) + return x + + class EmptyModule(LightningModule): """ A module that has no layers """ @@ -41,8 +64,6 @@ def forward(self, *args, **kwargs): class UnorderedModel(LightningModule): """ A model in which the layers not defined in order of execution """ - pre_calculated_model_size = 0.000870 - def __init__(self): super().__init__() # note: the definition order is intentionally scrambled for this test @@ -66,8 +87,6 @@ def forward(self, x, y): class MixedDtypeModel(LightningModule): """ The parameters and inputs of this model have different dtypes. 
""" - pre_calculated_model_size = 0.00182 - def __init__(self): super().__init__() self.embed = nn.Embedding(10, 20) # expects dtype long as input @@ -133,20 +152,10 @@ def test_linear_model_summary_shapes(device, mode): assert model.device == device -@pytest.mark.skipif(not torch.cuda.is_available(), reason="Test requires GPU.") -def test_linear_model_summary_shapes(device, mode): - """ Test that the model size is correctly calculated.""" - model = UnorderedModel().to(device) - model.train() - summary = model.summarize(mode=mode) - assert almost_equals(summary.model_size(), model.pre_calculated_model_size, rel_tol=1e-4, abs_tol=1e-4) - - def test_mixed_dtype_model_summary(): """ Test that the model summary works with models that have mixed input- and parameter dtypes. """ model = MixedDtypeModel() summary = model.summarize() - assert almost_equals(summary.model_size(), model.pre_calculated_model_size, rel_tol=1e-4, abs_tol=1e-4) assert summary.in_sizes == [ [2, 3], # embed [2, 3, 20], # reduce @@ -232,6 +241,7 @@ def test_summary_layer_types(mode): ] +<<<<<<< HEAD @pytest.mark.parametrize(['mode'], [ pytest.param(ModelSummary.MODE_FULL), pytest.param(ModelSummary.MODE_TOP), @@ -245,6 +255,51 @@ def test_summary_layer_types(mode): pytest.param([torch.zeros(2, 3), torch.zeros(4, 5)], [[2, 3], [4, 5]]), pytest.param((torch.zeros(2, 3), torch.zeros(4, 5)), [[2, 3], [4, 5]]), ]) +======= +@pytest.mark.parametrize( + ["mode"], + [ + pytest.param(ModelSummary.MODE_FULL), + pytest.param(ModelSummary.MODE_TOP), + ], +) +@pytest.mark.parametrize( + ["example_input", "expected_model_size"], + [ + pytest.param(torch.zeros(1, 1, 28, 28), 0.668), + pytest.param(torch.zeros(1, 1, 224, 224), 93.57), + pytest.param(torch.zeros(10, 1, 512, 512), 5176.78), + ], +) +def test_known_model_sizes(example_input, expected_model_size, mode): + """ Test the knownet model on example input arrays and corresponding known model size """ + + model = KnownNet() + model.example_input_array = example_input + summary = model.summarize(mode=mode) + assert almost_equals(summary.model_size(), expected_model_size, rel_tol=1e-3, abs_tol=1e-3) + + +@pytest.mark.parametrize( + ["mode"], + [ + pytest.param(ModelSummary.MODE_FULL), + pytest.param(ModelSummary.MODE_TOP), + ], +) +@pytest.mark.parametrize( + ["example_input", "expected_size"], + [ + pytest.param([], UNKNOWN_SIZE), + pytest.param((1, 2, 3), [UNKNOWN_SIZE] * 3), + pytest.param(torch.tensor(0), UNKNOWN_SIZE), + pytest.param(dict(tensor=torch.zeros(1, 2, 3)), UNKNOWN_SIZE), + pytest.param(torch.zeros(2, 3, 4), [2, 3, 4]), + pytest.param([torch.zeros(2, 3), torch.zeros(4, 5)], [[2, 3], [4, 5]]), + pytest.param((torch.zeros(2, 3), torch.zeros(4, 5)), [[2, 3], [4, 5]]), + ], +) +>>>>>>> :hammer: Simplified tests def test_example_input_array_types(example_input, expected_size, mode): """ Test the types of example inputs supported for display in the summary. 
""" From 9b812ecd43077b1d77b9faded5ab78be50a97b6c Mon Sep 17 00:00:00 2001 From: Kartik Sharma Date: Tue, 1 Dec 2020 01:37:09 +0530 Subject: [PATCH 128/136] :hammer: dict input support for model size --- pytorch_lightning/core/memory.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/pytorch_lightning/core/memory.py b/pytorch_lightning/core/memory.py index 0f39810c772ee..3bcbcabb93976 100644 --- a/pytorch_lightning/core/memory.py +++ b/pytorch_lightning/core/memory.py @@ -180,6 +180,7 @@ def __init__(self, model, mode: str = MODE_DEFAULT): self._model = model self._mode = mode self._layer_summary = self.summarize() + self._precision_bytes = self._model.precision / 8.0 # 1 byte -> 8 bits @property def named_modules(self) -> List[Tuple[str, nn.Module]]: @@ -249,17 +250,16 @@ def model_size(self, input_size=None) -> float: ) elif isinstance(self._model.example_input_array, dict): - # TODO (kartik4949): write input_feature for dict input array. - in_features = (1,) + in_features = self._model.example_input_array["tensor"].numel() else: in_features = (self._model.example_input_array.numel(),) return self._get_total_size(in_features if not input_size else input_size) def _get_total_size(self, input_size: tuple) -> float: - _precision_bytes = self._model.precision / 8.0 # 1 byte -> 8 bits - total_input_dsize = abs(np.prod(np.array(input_size))) * _precision_bytes / (1024 ** 2.0) - total_output_dsize = abs(2.0 * self.total_out_params * _precision_bytes / (1024 ** 2.0)) - total_params_dsize = abs(self.total_params * _precision_bytes / (1024 ** 2.0)) + total_input_dsize = abs(np.prod(np.array(input_size))) * self._precision_bytes / (1024 ** 2.0) + # 2x for gradients. + total_output_dsize = abs(2.0 * self.total_out_params * self._precision_bytes / (1024 ** 2.0)) + total_params_dsize = abs(self.total_params * self._precision_bytes / (1024 ** 2.0)) return total_params_dsize + total_output_dsize + total_input_dsize def summarize(self) -> Dict[str, LayerSummary]: @@ -421,8 +421,10 @@ def get_gpu_memory_map() -> Dict[str, int]: gpu_memory_map = {f"gpu_id: {gpu_id}/memory.used (MB)": memory for gpu_id, memory in enumerate(gpu_memory)} return gpu_memory_map + def get_formatted_model_size(total_model_size: float) -> float: - return "{:.4f}".format(total_model_size) + return "{:.3f}".format(total_model_size) + def get_human_readable_count(number: int) -> str: """ From 869098fe3ffc352c32f46d51386444db3bd5c1f1 Mon Sep 17 00:00:00 2001 From: Kartik Sharma Date: Tue, 1 Dec 2020 01:39:20 +0530 Subject: [PATCH 129/136] :hammer: use param_nums property for total_params calc. 
--- pytorch_lightning/core/memory.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/core/memory.py b/pytorch_lightning/core/memory.py index 3bcbcabb93976..c1f5e7322b9b0 100644 --- a/pytorch_lightning/core/memory.py +++ b/pytorch_lightning/core/memory.py @@ -310,7 +310,7 @@ def __str__(self): arrays.append(["Out sizes", self.out_sizes]) trainable_parameters = sum(p.numel() for p in self._model.parameters() if p.requires_grad) - total_parameters = sum(p.numel() for p in self._model.parameters()) + total_parameters = self.total_params total_model_size = self.model_size() return _format_summary_table(total_parameters, trainable_parameters, total_model_size, *arrays) From b897ef95f5220a49481b1d495fbb8ece91bf1948 Mon Sep 17 00:00:00 2001 From: Kartik Sharma Date: Tue, 1 Dec 2020 12:22:31 +0530 Subject: [PATCH 130/136] :hammer: fix minor issues --- pytorch_lightning/core/memory.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/core/memory.py b/pytorch_lightning/core/memory.py index c1f5e7322b9b0..e7cb57895093e 100644 --- a/pytorch_lightning/core/memory.py +++ b/pytorch_lightning/core/memory.py @@ -251,8 +251,12 @@ def model_size(self, input_size=None) -> float: elif isinstance(self._model.example_input_array, dict): in_features = self._model.example_input_array["tensor"].numel() - else: + elif isinstance(self._model.example_input_array, torch.Tensor): in_features = (self._model.example_input_array.numel(),) + else: + # if example_input_array is NoneType + in_features = (1,) + return self._get_total_size(in_features if not input_size else input_size) def _get_total_size(self, input_size: tuple) -> float: From ca730ffcb759181a3c05ab2d1637785c3e5bc404 Mon Sep 17 00:00:00 2001 From: Kartik Sharma Date: Tue, 1 Dec 2020 12:26:10 +0530 Subject: [PATCH 131/136] :hammer: better Exception --- pytorch_lightning/core/memory.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/core/memory.py b/pytorch_lightning/core/memory.py index e7cb57895093e..1bf03f131661d 100644 --- a/pytorch_lightning/core/memory.py +++ b/pytorch_lightning/core/memory.py @@ -224,8 +224,8 @@ def _get_out_size_params(out_sizes): try: # try to find prod, i.e check for unknown sizes. _total_out_params += np.prod(out_sizes) - except: - # do nothing if could not find product. + except TypeError: + # do nothing if tried to find prod on unknown type. 
pass else: _ = [_get_out_size_params(out_size) for out_size in out_sizes if isinstance(out_size, list)] From aafd89d0cc25017a3057a1ce5fcb6011b25af5b3 Mon Sep 17 00:00:00 2001 From: Kartik Sharma Date: Tue, 1 Dec 2020 14:32:25 +0530 Subject: [PATCH 132/136] :hammer: refactore and minor bug fixes --- pytorch_lightning/core/memory.py | 89 +++++++++++++++++++++++++------- tests/core/test_memory.py | 7 +-- 2 files changed, 75 insertions(+), 21 deletions(-) diff --git a/pytorch_lightning/core/memory.py b/pytorch_lightning/core/memory.py index 1bf03f131661d..7bad047780c3d 100644 --- a/pytorch_lightning/core/memory.py +++ b/pytorch_lightning/core/memory.py @@ -16,7 +16,7 @@ import shutil import subprocess from collections import OrderedDict -from typing import Tuple, Dict, Union, List, Any +from typing import Tuple, Optional, Dict, Union, List, Any import numpy as np import torch @@ -181,6 +181,9 @@ def __init__(self, model, mode: str = MODE_DEFAULT): self._mode = mode self._layer_summary = self.summarize() self._precision_bytes = self._model.precision / 8.0 # 1 byte -> 8 bits + self._precision_megabytes = self._precision_bytes / (1024 ** 2.0) + self.total_output_dsize = 0.0 + self.total_params_dsize = 0.0 @property def named_modules(self) -> List[Tuple[str, nn.Module]]: @@ -214,14 +217,18 @@ def out_sizes(self) -> List: def param_nums(self) -> List[int]: return [layer.num_parameters for layer in self._layer_summary.values()] - @property - def total_out_params(self) -> int: + def total_out_params(self, batch_size_dim: int) -> int: + """ finds total output parameters to calculate forward/backward pass size. """ _total_out_params = 0 - def _get_out_size_params(out_sizes): + # recursive traversal to calculate output size. + # recursive is used to handle nested output sizes i.e [[[1,2,3],[[12,2,3], [1,3,4]]], [2,3,4]]. + def _get_out_size_params(out_sizes, batch_size_dim=batch_size_dim): nonlocal _total_out_params if not any(isinstance(i, list) for i in out_sizes): try: + if out_sizes: + out_sizes = out_sizes[:batch_size_dim] + [-1] + out_sizes[batch_size_dim + 1 :] # try to find prod, i.e check for unknown sizes. _total_out_params += np.prod(out_sizes) except TypeError: @@ -230,7 +237,10 @@ def _get_out_size_params(out_sizes): else: _ = [_get_out_size_params(out_size) for out_size in out_sizes if isinstance(out_size, list)] - _get_out_size_params(self.out_sizes) + import copy + + _out_sizes = copy.deepcopy(self.out_sizes) + _get_out_size_params(_out_sizes) return _total_out_params @@ -238,7 +248,23 @@ def _get_out_size_params(out_sizes): def total_params(self) -> int: return sum(self.param_nums) - def model_size(self, input_size=None) -> float: + def model_size(self, batch_size_dim: Optional[int] = 0) -> float: + """ + Estimates total model size i.e input_size + forward/backward pass size + total params size in MBs + total params size gives model size in accounting total model params. + forward/backward model size accounts model size acounting output shape of individual layers. + input size gives the total input size in MBs including multiple inputs, batch size, etc. + :: + + Example: + >> model = LitModel() + >> summary = ModelSummary(model, mode='top') # doctest: +NORMALIZE_WHITESPACE + >> summary.model_size() + + Returns: + float: Total estimated model size(MB) if example input array is passed else Total Model Params Size(MB). 
+ """ + if isinstance(self._model.example_input_array, (list, tuple)): in_features = ( sum( @@ -255,16 +281,28 @@ def model_size(self, input_size=None) -> float: in_features = (self._model.example_input_array.numel(),) else: # if example_input_array is NoneType - in_features = (1,) + in_features = None + + return self._get_total_size(in_features, batch_size_dim) - return self._get_total_size(in_features if not input_size else input_size) + def _get_total_size(self, input_size: tuple, batch_size_dim: int) -> float: + """_get_total_size. + helper function to find total model size MB - def _get_total_size(self, input_size: tuple) -> float: - total_input_dsize = abs(np.prod(np.array(input_size))) * self._precision_bytes / (1024 ** 2.0) + Args: + input_size (tuple): input_size to calculate model input size (MB) + + Returns: + float: Total estimated model size if example input array is passed else Total Model Params Size. + """ + self.total_params_dsize = abs(self.total_params * self._precision_megabytes) + if not input_size: + self.total_output_dsize = 0.0 + return self.total_params_dsize + self.total_input_dsize = abs(np.prod(np.array(input_size)) * self._precision_megabytes) # 2x for gradients. - total_output_dsize = abs(2.0 * self.total_out_params * self._precision_bytes / (1024 ** 2.0)) - total_params_dsize = abs(self.total_params * self._precision_bytes / (1024 ** 2.0)) - return total_params_dsize + total_output_dsize + total_input_dsize + self.total_output_dsize = abs(2.0 * self.total_out_params(batch_size_dim) * self._precision_megabytes) + return self.total_params_dsize + self.total_output_dsize + self.total_input_dsize def summarize(self) -> Dict[str, LayerSummary]: summary = OrderedDict((name, LayerSummary(module)) for name, module in self.named_modules) @@ -275,7 +313,7 @@ def summarize(self) -> Dict[str, LayerSummary]: return summary def _forward_example_input(self) -> None: - """ Run the example input through each layer to get input- and output sizes. """ + """ Run the example input through each layer to get input and output sizes. 
""" model = self._model trainer = self._model.trainer @@ -315,8 +353,12 @@ def __str__(self): trainable_parameters = sum(p.numel() for p in self._model.parameters() if p.requires_grad) total_parameters = self.total_params - total_model_size = self.model_size() - return _format_summary_table(total_parameters, trainable_parameters, total_model_size, *arrays) + total_model_dsize = self.model_size() + total_params_dsize = self.total_params_dsize + total_output_dsize = self.total_output_dsize + return _format_summary_table( + total_parameters, trainable_parameters, total_model_dsize, total_output_dsize, total_params_dsize, *arrays + ) def __repr__(self): return str(self) @@ -333,7 +375,14 @@ def parse_batch_shape(batch: Any) -> Union[str, List]: return UNKNOWN_SIZE -def _format_summary_table(total_parameters: int, trainable_parameters: int, total_model_size: float, *cols) -> str: +def _format_summary_table( + total_parameters: int, + trainable_parameters: int, + total_model_dsize: float, + total_output_dsize: float, + total_params_dsize: float, + *cols, +) -> str: """ Takes in a number of arrays, each specifying a column in the summary table, and combines them all into one big @@ -369,7 +418,11 @@ def _format_summary_table(total_parameters: int, trainable_parameters: int, tota summary += "Non-trainable params" summary += "\n" + s.format(get_human_readable_count(total_parameters), 10) summary += "Total params" - summary += "\n" + s.format(get_formatted_model_size(total_model_size), 10) + summary += "\n" + s.format(get_formatted_model_size(total_params_dsize), 10) + summary += "Total Estimated Params Size (MB)" + summary += "\n" + s.format(get_formatted_model_size(total_output_dsize), 10) + summary += "Total Estimated Forward/Backward Size (MB)" + summary += "\n" + s.format(get_formatted_model_size(total_model_dsize), 10) summary += "Total Estimated Model Size (MB)" return summary diff --git a/tests/core/test_memory.py b/tests/core/test_memory.py index a26e2bc26523d..37546ccfc41ed 100644 --- a/tests/core/test_memory.py +++ b/tests/core/test_memory.py @@ -266,9 +266,10 @@ def test_summary_layer_types(mode): @pytest.mark.parametrize( ["example_input", "expected_model_size"], [ - pytest.param(torch.zeros(1, 1, 28, 28), 0.668), - pytest.param(torch.zeros(1, 1, 224, 224), 93.57), - pytest.param(torch.zeros(10, 1, 512, 512), 5176.78), + pytest.param(torch.zeros(1, 1, 28, 28), 0.318), + pytest.param(torch.zeros(1, 1, 224, 224), 31.84), + pytest.param(torch.zeros(10, 1, 512, 512), 183.425), + pytest.param(None, 0.075), ], ) def test_known_model_sizes(example_input, expected_model_size, mode): From 6fc8eb4af3884cd590894cdd9631a0b7bdbd1005 Mon Sep 17 00:00:00 2001 From: Kartik Sharma Date: Tue, 1 Dec 2020 14:56:38 +0530 Subject: [PATCH 133/136] :hammer: doc test summary fix --- pytorch_lightning/core/memory.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pytorch_lightning/core/memory.py b/pytorch_lightning/core/memory.py index 7bad047780c3d..31b8bade34fc2 100644 --- a/pytorch_lightning/core/memory.py +++ b/pytorch_lightning/core/memory.py @@ -159,6 +159,9 @@ class ModelSummary(object): 132 K Trainable params 0 Non-trainable params 132 K Total params + 0.506 Total Estimated Params Size (MB) + 0.004 Total Estimated Forward/Backward Size (MB) + 0.520 Total Estimated Model Size (MB) >>> ModelSummary(model, mode='full') # doctest: +NORMALIZE_WHITESPACE | Name | Type | Params | In sizes | Out sizes -------------------------------------------------------------- @@ -169,6 +172,9 @@ class 
ModelSummary(object): 132 K Trainable params 0 Non-trainable params 132 K Total params + 0.506 Total Estimated Params Size (MB) + 0.004 Total Estimated Forward/Backward Size (MB) + 0.520 Total Estimated Model Size (MB) """ MODE_TOP = "top" From 937a94cdd41b290b84cb1a71597fa9324dd7112e Mon Sep 17 00:00:00 2001 From: Kartik Sharma Date: Tue, 1 Dec 2020 17:53:01 +0530 Subject: [PATCH 134/136] :zap: Only full mode support. --- pytorch_lightning/core/memory.py | 60 +++++++++++++++++--------------- tests/core/test_memory.py | 31 +++++++++++++++++ 2 files changed, 63 insertions(+), 28 deletions(-) diff --git a/pytorch_lightning/core/memory.py b/pytorch_lightning/core/memory.py index 31b8bade34fc2..c4c0f54f606ca 100644 --- a/pytorch_lightning/core/memory.py +++ b/pytorch_lightning/core/memory.py @@ -159,9 +159,6 @@ class ModelSummary(object): 132 K Trainable params 0 Non-trainable params 132 K Total params - 0.506 Total Estimated Params Size (MB) - 0.004 Total Estimated Forward/Backward Size (MB) - 0.520 Total Estimated Model Size (MB) >>> ModelSummary(model, mode='full') # doctest: +NORMALIZE_WHITESPACE | Name | Type | Params | In sizes | Out sizes -------------------------------------------------------------- @@ -173,8 +170,8 @@ class ModelSummary(object): 0 Non-trainable params 132 K Total params 0.506 Total Estimated Params Size (MB) - 0.004 Total Estimated Forward/Backward Size (MB) - 0.520 Total Estimated Model Size (MB) + 0.012 Total Estimated Forward/Backward Size (MB) + 0.527 Total Estimated Model Size (MB) """ MODE_TOP = "top" @@ -223,10 +220,21 @@ def out_sizes(self) -> List: def param_nums(self) -> List[int]: return [layer.num_parameters for layer in self._layer_summary.values()] + @property + def total_params(self) -> int: + _total_params = sum(p.numel() for p in self._model.parameters()) + return _total_params + def total_out_params(self, batch_size_dim: int) -> int: """ finds total output parameters to calculate forward/backward pass size. """ _total_out_params = 0 + _out_sizes = [ + l_size + for l_type, l_size in zip(self.layer_types, self.out_sizes) + if not isinstance(l_type, torch.nn.Sequential) + ] + # recursive traversal to calculate output size. # recursive is used to handle nested output sizes i.e [[[1,2,3],[[12,2,3], [1,3,4]]], [2,3,4]]. def _get_out_size_params(out_sizes, batch_size_dim=batch_size_dim): @@ -243,28 +251,24 @@ def _get_out_size_params(out_sizes, batch_size_dim=batch_size_dim): else: _ = [_get_out_size_params(out_size) for out_size in out_sizes if isinstance(out_size, list)] - import copy - - _out_sizes = copy.deepcopy(self.out_sizes) _get_out_size_params(_out_sizes) return _total_out_params - @property - def total_params(self) -> int: - return sum(self.param_nums) - def model_size(self, batch_size_dim: Optional[int] = 0) -> float: """ Estimates total model size i.e input_size + forward/backward pass size + total params size in MBs total params size gives model size in accounting total model params. forward/backward model size accounts model size acounting output shape of individual layers. input size gives the total input size in MBs including multiple inputs, batch size, etc. + + NOTE: Currently only Supported in Full Mode. 
+ :: Example: >> model = LitModel() - >> summary = ModelSummary(model, mode='top') # doctest: +NORMALIZE_WHITESPACE + >> summary = ModelSummary(model, mode='full') # doctest: +NORMALIZE_WHITESPACE >> summary.model_size() Returns: @@ -359,12 +363,13 @@ def __str__(self): trainable_parameters = sum(p.numel() for p in self._model.parameters() if p.requires_grad) total_parameters = self.total_params - total_model_dsize = self.model_size() - total_params_dsize = self.total_params_dsize - total_output_dsize = self.total_output_dsize - return _format_summary_table( - total_parameters, trainable_parameters, total_model_dsize, total_output_dsize, total_params_dsize, *arrays - ) + model_size = None + if self._mode == self.MODE_FULL: + total_model_dsize = self.model_size() + total_params_dsize = self.total_params_dsize + total_output_dsize = self.total_output_dsize + model_size = (total_params_dsize, total_output_dsize, total_model_dsize) + return _format_summary_table(total_parameters, trainable_parameters, model_size, *arrays) def __repr__(self): return str(self) @@ -384,9 +389,7 @@ def parse_batch_shape(batch: Any) -> Union[str, List]: def _format_summary_table( total_parameters: int, trainable_parameters: int, - total_model_dsize: float, - total_output_dsize: float, - total_params_dsize: float, + model_size: tuple, *cols, ) -> str: """ @@ -424,12 +427,13 @@ def _format_summary_table( summary += "Non-trainable params" summary += "\n" + s.format(get_human_readable_count(total_parameters), 10) summary += "Total params" - summary += "\n" + s.format(get_formatted_model_size(total_params_dsize), 10) - summary += "Total Estimated Params Size (MB)" - summary += "\n" + s.format(get_formatted_model_size(total_output_dsize), 10) - summary += "Total Estimated Forward/Backward Size (MB)" - summary += "\n" + s.format(get_formatted_model_size(total_model_dsize), 10) - summary += "Total Estimated Model Size (MB)" + if model_size: + summary += "\n" + s.format(get_formatted_model_size(model_size[0]), 10) + summary += "Total Estimated Params Size (MB)" + summary += "\n" + s.format(get_formatted_model_size(model_size[1]), 10) + summary += "Total Estimated Forward/Backward Size (MB)" + summary += "\n" + s.format(get_formatted_model_size(model_size[2]), 10) + summary += "Total Estimated Model Size (MB)" return summary diff --git a/tests/core/test_memory.py b/tests/core/test_memory.py index 37546ccfc41ed..af4274d98a64e 100644 --- a/tests/core/test_memory.py +++ b/tests/core/test_memory.py @@ -26,6 +26,15 @@ def almost_equals(a, b, rel_tol=0.0, abs_tol=0.0): return _almost_close(a, b) +class LitModel(LightningModule): + def __init__(self): + super().__init__() + self.net = nn.Sequential(nn.Linear(256, 512), nn.BatchNorm1d(512)) + + def forward(self, x): + return self.net(x) + + class KnownNet(LightningModule): """ Pre calculated known model """ @@ -281,6 +290,28 @@ def test_known_model_sizes(example_input, expected_model_size, mode): assert almost_equals(summary.model_size(), expected_model_size, rel_tol=1e-3, abs_tol=1e-3) +@pytest.mark.parametrize( + ["mode"], + [ + pytest.param(ModelSummary.MODE_FULL), + ], +) +@pytest.mark.parametrize( + ["example_input", "expected_model_size"], + [ + pytest.param(torch.zeros(10, 256), 0.527), + pytest.param(None, 0.505), + ], +) +def test_nested_seq_model_sizes(example_input, expected_model_size, mode): + """ Test the knownet model on example input arrays and corresponding known model size """ + + model = LitModel() + model.example_input_array = example_input + summary = 
model.summarize(mode=mode) + assert almost_equals(summary.model_size(), expected_model_size, rel_tol=1e-3, abs_tol=1e-3) + + @pytest.mark.parametrize( ["mode"], [ From d42352b30b59c63822156aa3ccc82d9c40b419c1 Mon Sep 17 00:00:00 2001 From: Kartik Sharma Date: Tue, 1 Dec 2020 19:11:13 +0530 Subject: [PATCH 135/136] :hammer: core memory refactor --- pytorch_lightning/core/memory.py | 40 ++++++++++++++++---------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/pytorch_lightning/core/memory.py b/pytorch_lightning/core/memory.py index c4c0f54f606ca..a74eb3c8089b4 100644 --- a/pytorch_lightning/core/memory.py +++ b/pytorch_lightning/core/memory.py @@ -222,21 +222,13 @@ def param_nums(self) -> List[int]: @property def total_params(self) -> int: - _total_params = sum(p.numel() for p in self._model.parameters()) - return _total_params + return sum(p.numel() for p in self._model.parameters()) def total_out_params(self, batch_size_dim: int) -> int: """ finds total output parameters to calculate forward/backward pass size. """ - _total_out_params = 0 - - _out_sizes = [ - l_size - for l_type, l_size in zip(self.layer_types, self.out_sizes) - if not isinstance(l_type, torch.nn.Sequential) - ] # recursive traversal to calculate output size. - # recursive is used to handle nested output sizes i.e [[[1,2,3],[[12,2,3], [1,3,4]]], [2,3,4]]. + # recursive is used to handle nested output sizes i.e [[[1,2,3], [[12,2,3], [1,3,4]]], [2,3,4]]. def _get_out_size_params(out_sizes, batch_size_dim=batch_size_dim): nonlocal _total_out_params if not any(isinstance(i, list) for i in out_sizes): @@ -249,7 +241,17 @@ def _get_out_size_params(out_sizes, batch_size_dim=batch_size_dim): # do nothing if tried to find prod on unknown type. pass else: - _ = [_get_out_size_params(out_size) for out_size in out_sizes if isinstance(out_size, list)] + for out_size in out_sizes: + if isinstance(out_size, list): + _get_out_size_params(out_size) + + _total_out_params = 0 + + _out_sizes = [ + l_size + for l_type, l_size in zip(self.layer_types, self.out_sizes) + if not isinstance(l_type, torch.nn.Sequential) + ] _get_out_size_params(_out_sizes) @@ -264,15 +266,13 @@ def model_size(self, batch_size_dim: Optional[int] = 0) -> float: NOTE: Currently only Supported in Full Mode. - :: - - Example: + Example:: >> model = LitModel() >> summary = ModelSummary(model, mode='full') # doctest: +NORMALIZE_WHITESPACE >> summary.model_size() Returns: - float: Total estimated model size(MB) if example input array is passed else Total Model Params Size(MB). + Total estimated model size(MB) if example input array is passed else Total Model Params Size(MB). """ if isinstance(self._model.example_input_array, (list, tuple)): @@ -295,15 +295,15 @@ def model_size(self, batch_size_dim: Optional[int] = 0) -> float: return self._get_total_size(in_features, batch_size_dim) - def _get_total_size(self, input_size: tuple, batch_size_dim: int) -> float: + def _get_total_size(self, input_size: Tuple[int], batch_size_dim: int) -> float: """_get_total_size. - helper function to find total model size MB + Function to find total model size (MB) Args: - input_size (tuple): input_size to calculate model input size (MB) + input_size : input_size to calculate model input size (MB) Returns: - float: Total estimated model size if example input array is passed else Total Model Params Size. + Total estimated model size if example input array is passed else Total Model Params Size. 
""" self.total_params_dsize = abs(self.total_params * self._precision_megabytes) if not input_size: @@ -490,7 +490,7 @@ def get_gpu_memory_map() -> Dict[str, int]: def get_formatted_model_size(total_model_size: float) -> float: - return "{:.3f}".format(total_model_size) + return f"{total_model_size:.3f}" def get_human_readable_count(number: int) -> str: From 63010a8d11765b42a1eb3befca24eb1fe9cd7dc4 Mon Sep 17 00:00:00 2001 From: Kartik Sharma Date: Wed, 20 Jan 2021 19:12:50 +0530 Subject: [PATCH 136/136] Simplified Model size --- pytorch_lightning/core/memory.py | 160 +++++-------------------------- tests/core/test_memory.py | 138 +++++++------------------- 2 files changed, 59 insertions(+), 239 deletions(-) diff --git a/pytorch_lightning/core/memory.py b/pytorch_lightning/core/memory.py index a74eb3c8089b4..de551cabd30df 100644 --- a/pytorch_lightning/core/memory.py +++ b/pytorch_lightning/core/memory.py @@ -16,7 +16,7 @@ import shutil import subprocess from collections import OrderedDict -from typing import Tuple, Optional, Dict, Union, List, Any +from typing import Tuple, Dict, Union, List, Any import numpy as np import torch @@ -33,17 +33,13 @@ class LayerSummary(object): """ Summary class for a single layer in a :class:`~pytorch_lightning.core.lightning.LightningModule`. It collects the following information: - - Type of the layer (e.g. Linear, BatchNorm1d, ...) - Input shape - Output shape - Number of parameters - The input and output shapes are only known after the example input array was passed through the model. - Example:: - >>> model = torch.nn.Conv2d(3, 8, 3) >>> summary = LayerSummary(model) >>> summary.num_parameters @@ -55,10 +51,8 @@ class LayerSummary(object): [1, 3, 5, 5] >>> summary.out_size [1, 8, 3, 3] - Args: module: A module to summarize - """ def __init__(self, module: nn.Module): @@ -76,7 +70,6 @@ def _register_hook(self) -> RemovableHandle: Registers a hook on the module that computes the input- and output size(s) on the first forward pass. If the hook is called, it will remove itself from the from the module, meaning that recursive models will only record their input- and output shapes once. - Return: A handle for the installed hook. """ @@ -120,25 +113,19 @@ def num_parameters(self) -> int: class ModelSummary(object): """ Generates a summary of all layers in a :class:`~pytorch_lightning.core.lightning.LightningModule`. - Args: model: The model to summarize (also referred to as the root module) mode: Can be one of - - `top` (default): only the top-level modules will be recorded (the children of the root module) - `full`: summarizes all layers and their submodules in the root module - The string representation of this summary prints a table with columns containing - the name, type and number of parameters for each layer. - + the name type and number of parameters for each layer. The root module may also have an attribute ``example_input_array`` as shown in the example below. If present, the root module will be called with it as input to determine the intermediate input- and output shapes of all layers. Supported are tensors and nested lists and tuples of tensors. All other types of inputs will be skipped and show as `?` in the summary table. The summary will also display `?` for layers not used in the forward pass. - Example:: - >>> import pytorch_lightning as pl >>> class LitModel(pl.LightningModule): ... 
@@ -169,9 +156,7 @@ class ModelSummary(object): 132 K Trainable params 0 Non-trainable params 132 K Total params - 0.506 Total Estimated Params Size (MB) - 0.012 Total Estimated Forward/Backward Size (MB) - 0.527 Total Estimated Model Size (MB) + 0.506 Total estimated model params size (MB) """ MODE_TOP = "top" @@ -183,10 +168,7 @@ def __init__(self, model, mode: str = MODE_DEFAULT): self._model = model self._mode = mode self._layer_summary = self.summarize() - self._precision_bytes = self._model.precision / 8.0 # 1 byte -> 8 bits - self._precision_megabytes = self._precision_bytes / (1024 ** 2.0) - self.total_output_dsize = 0.0 - self.total_params_dsize = 0.0 + self._precision_megabytes = (self._model.precision / 8.0) / (1024 ** 2.0) # 1 byte -> 8 bits) @property def named_modules(self) -> List[Tuple[str, nn.Module]]: @@ -221,50 +203,19 @@ def param_nums(self) -> List[int]: return [layer.num_parameters for layer in self._layer_summary.values()] @property - def total_params(self) -> int: + def total_parameters(self) -> int: return sum(p.numel() for p in self._model.parameters()) - def total_out_params(self, batch_size_dim: int) -> int: - """ finds total output parameters to calculate forward/backward pass size. """ - - # recursive traversal to calculate output size. - # recursive is used to handle nested output sizes i.e [[[1,2,3], [[12,2,3], [1,3,4]]], [2,3,4]]. - def _get_out_size_params(out_sizes, batch_size_dim=batch_size_dim): - nonlocal _total_out_params - if not any(isinstance(i, list) for i in out_sizes): - try: - if out_sizes: - out_sizes = out_sizes[:batch_size_dim] + [-1] + out_sizes[batch_size_dim + 1 :] - # try to find prod, i.e check for unknown sizes. - _total_out_params += np.prod(out_sizes) - except TypeError: - # do nothing if tried to find prod on unknown type. - pass - else: - for out_size in out_sizes: - if isinstance(out_size, list): - _get_out_size_params(out_size) - - _total_out_params = 0 - - _out_sizes = [ - l_size - for l_type, l_size in zip(self.layer_types, self.out_sizes) - if not isinstance(l_type, torch.nn.Sequential) - ] - - _get_out_size_params(_out_sizes) - - return _total_out_params + @property + def trainable_parameters(self) -> int: + return sum(p.numel() for p in self._model.parameters() if p.requires_grad) - def model_size(self, batch_size_dim: Optional[int] = 0) -> float: + def model_size(self) -> float: """ - Estimates total model size i.e input_size + forward/backward pass size + total params size in MBs + Estimates total model size i.e total params size in MBs total params size gives model size in accounting total model params. - forward/backward model size accounts model size acounting output shape of individual layers. - input size gives the total input size in MBs including multiple inputs, batch size, etc. - NOTE: Currently only Supported in Full Mode. + NOTE: Currently only Supported total params size. Example:: >> model = LitModel() @@ -272,47 +223,9 @@ def model_size(self, batch_size_dim: Optional[int] = 0) -> float: >> summary.model_size() Returns: - Total estimated model size(MB) if example input array is passed else Total Model Params Size(MB). + Total estimated model size(MB). 
""" - - if isinstance(self._model.example_input_array, (list, tuple)): - in_features = ( - sum( - [ - input_array.numel() if isinstance(input_array, torch.Tensor) else torch.tensor(input_array) - for input_array in self._model.example_input_array - ] - ), - ) - - elif isinstance(self._model.example_input_array, dict): - in_features = self._model.example_input_array["tensor"].numel() - elif isinstance(self._model.example_input_array, torch.Tensor): - in_features = (self._model.example_input_array.numel(),) - else: - # if example_input_array is NoneType - in_features = None - - return self._get_total_size(in_features, batch_size_dim) - - def _get_total_size(self, input_size: Tuple[int], batch_size_dim: int) -> float: - """_get_total_size. - Function to find total model size (MB) - - Args: - input_size : input_size to calculate model input size (MB) - - Returns: - Total estimated model size if example input array is passed else Total Model Params Size. - """ - self.total_params_dsize = abs(self.total_params * self._precision_megabytes) - if not input_size: - self.total_output_dsize = 0.0 - return self.total_params_dsize - self.total_input_dsize = abs(np.prod(np.array(input_size)) * self._precision_megabytes) - # 2x for gradients. - self.total_output_dsize = abs(2.0 * self.total_out_params(batch_size_dim) * self._precision_megabytes) - return self.total_params_dsize + self.total_output_dsize + self.total_input_dsize + return self.total_parameters * self._precision_megabytes def summarize(self) -> Dict[str, LayerSummary]: summary = OrderedDict((name, LayerSummary(module)) for name, module in self.named_modules) @@ -323,7 +236,7 @@ def summarize(self) -> Dict[str, LayerSummary]: return summary def _forward_example_input(self) -> None: - """ Run the example input through each layer to get input and output sizes. """ + """ Run the example input through each layer to get input- and output sizes. 
""" model = self._model trainer = self._model.trainer @@ -348,7 +261,6 @@ def _forward_example_input(self) -> None: def __str__(self): """ Makes a summary listing with: - Layer Name, Layer Type, Number of Parameters, Input Sizes, Output Sizes """ arrays = [ @@ -360,15 +272,10 @@ def __str__(self): if self._model.example_input_array is not None: arrays.append(["In sizes", self.in_sizes]) arrays.append(["Out sizes", self.out_sizes]) + total_parameters = self.total_parameters + trainable_parameters = self.trainable_parameters + model_size = self.model_size() - trainable_parameters = sum(p.numel() for p in self._model.parameters() if p.requires_grad) - total_parameters = self.total_params - model_size = None - if self._mode == self.MODE_FULL: - total_model_dsize = self.model_size() - total_params_dsize = self.total_params_dsize - total_output_dsize = self.total_output_dsize - model_size = (total_params_dsize, total_output_dsize, total_model_dsize) return _format_summary_table(total_parameters, trainable_parameters, model_size, *arrays) def __repr__(self): @@ -386,12 +293,7 @@ def parse_batch_shape(batch: Any) -> Union[str, List]: return UNKNOWN_SIZE -def _format_summary_table( - total_parameters: int, - trainable_parameters: int, - model_size: tuple, - *cols, -) -> str: +def _format_summary_table(total_parameters: int, trainable_parameters: int, model_size: float, *cols) -> str: """ Takes in a number of arrays, each specifying a column in the summary table, and combines them all into one big @@ -427,31 +329,22 @@ def _format_summary_table( summary += "Non-trainable params" summary += "\n" + s.format(get_human_readable_count(total_parameters), 10) summary += "Total params" - if model_size: - summary += "\n" + s.format(get_formatted_model_size(model_size[0]), 10) - summary += "Total Estimated Params Size (MB)" - summary += "\n" + s.format(get_formatted_model_size(model_size[1]), 10) - summary += "Total Estimated Forward/Backward Size (MB)" - summary += "\n" + s.format(get_formatted_model_size(model_size[2]), 10) - summary += "Total Estimated Model Size (MB)" + summary += "\n" + s.format(get_formatted_model_size(model_size), 10) + summary += "Total Estimated Params Size (MB)" return summary def get_memory_profile(mode: str) -> Union[Dict[str, int], Dict[int, int]]: - """Get a profile of the current memory usage. - + """ Get a profile of the current memory usage. Args: mode: There are two modes: - - 'all' means return memory for all gpus - 'min_max' means return memory for max and min - Return: A dictionary in which the keys are device ids as integers and values are memory usage as integers in MB. If mode is 'min_max', the dictionary will also contain two additional keys: - - 'min_gpu_mem': the minimum memory usage in MB - 'max_gpu_mem': the maximum memory usage in MB """ @@ -469,7 +362,6 @@ def get_memory_profile(mode: str) -> Union[Dict[str, int], Dict[int, int]]: def get_gpu_memory_map() -> Dict[str, int]: """ Get the current gpu usage. - Return: A dictionary in which the keys are device ids as integers and values are memory usage as integers in MB. 
@@ -485,19 +377,18 @@ def get_gpu_memory_map() -> Dict[str, int]: # Convert lines into a dictionary gpu_memory = [float(x) for x in result.stdout.strip().split(os.linesep)] - gpu_memory_map = {f"gpu_id: {gpu_id}/memory.used (MB)": memory for gpu_id, memory in enumerate(gpu_memory)} + gpu_memory_map = { + f"gpu_id: {gpu_id}/memory.used (MB)": memory for gpu_id, memory in enumerate(gpu_memory) + } return gpu_memory_map - def get_formatted_model_size(total_model_size: float) -> float: return f"{total_model_size:.3f}" - def get_human_readable_count(number: int) -> str: """ Abbreviates an integer number with K, M, B, T for thousands, millions, billions and trillions, respectively. - Examples: >>> get_human_readable_count(123) '123 ' @@ -511,13 +402,10 @@ def get_human_readable_count(number: int) -> str: '400 T' >>> get_human_readable_count(5e15) # (more than trillion) '5,000 T' - Args: number: a positive integer number - Return: A string formatted according to the pattern described above. - """ assert number >= 0 labels = PARAMETER_NUM_UNITS diff --git a/tests/core/test_memory.py b/tests/core/test_memory.py index af4274d98a64e..cb68ad04459e8 100644 --- a/tests/core/test_memory.py +++ b/tests/core/test_memory.py @@ -21,54 +21,33 @@ from tests.base.models import ParityModuleRNN -def almost_equals(a, b, rel_tol=0.0, abs_tol=0.0): - _almost_close = lambda a, b: abs(a - b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol) - return _almost_close(a, b) - +class EmptyModule(LightningModule): + """ A module that has no layers """ -class LitModel(LightningModule): def __init__(self): super().__init__() - self.net = nn.Sequential(nn.Linear(256, 512), nn.BatchNorm1d(512)) + self.parameter = torch.rand(3, 3, requires_grad=True) + self.example_input_array = torch.zeros(1, 2, 3, 4, 5) - def forward(self, x): - return self.net(x) + def forward(self, *args, **kwargs): + return {'loss': self.parameter.sum()} -class KnownNet(LightningModule): - """ Pre calculated known model """ +class PreCalculatedModel(LightningModule): + """ A module with precalculated total params size in MB. 
""" def __init__(self): super().__init__() - self.conv1 = nn.Conv2d(1, 10, kernel_size=5) - self.conv2 = nn.Conv2d(10, 20, kernel_size=5) - self.conv3 = nn.Conv2d(20, 30, kernel_size=3) - self.conv4 = nn.Conv2d(30, 30, kernel_size=3) - self.fc1 = nn.Linear(10, 50) - self.fc2 = nn.Linear(50, 10) + self.layer1 = nn.Linear(10, 100) + self.layer2 = nn.Linear(100, 2) + self.pre_calculated_model_size = 0.005 def forward(self, x): - x = self.conv1(x) - x = self.conv2(x) - x = self.conv3(x) - x = self.conv4(x) - x = x.view(-1, 10) - x = self.fc1(x) - x = self.fc2(x) + x = self.layer1(x) + x = self.layer2(x) return x -class EmptyModule(LightningModule): - """ A module that has no layers """ - - def __init__(self): - super().__init__() - self.parameter = torch.rand(3, 3, requires_grad=True) - self.example_input_array = torch.zeros(1, 2, 3, 4, 5) - - def forward(self, *args, **kwargs): - return {'loss': self.parameter.sum()} - class UnorderedModel(LightningModule): """ A model in which the layers not defined in order of execution """ @@ -250,7 +229,6 @@ def test_summary_layer_types(mode): ] -<<<<<<< HEAD @pytest.mark.parametrize(['mode'], [ pytest.param(ModelSummary.MODE_FULL), pytest.param(ModelSummary.MODE_TOP), @@ -264,74 +242,6 @@ def test_summary_layer_types(mode): pytest.param([torch.zeros(2, 3), torch.zeros(4, 5)], [[2, 3], [4, 5]]), pytest.param((torch.zeros(2, 3), torch.zeros(4, 5)), [[2, 3], [4, 5]]), ]) -======= -@pytest.mark.parametrize( - ["mode"], - [ - pytest.param(ModelSummary.MODE_FULL), - pytest.param(ModelSummary.MODE_TOP), - ], -) -@pytest.mark.parametrize( - ["example_input", "expected_model_size"], - [ - pytest.param(torch.zeros(1, 1, 28, 28), 0.318), - pytest.param(torch.zeros(1, 1, 224, 224), 31.84), - pytest.param(torch.zeros(10, 1, 512, 512), 183.425), - pytest.param(None, 0.075), - ], -) -def test_known_model_sizes(example_input, expected_model_size, mode): - """ Test the knownet model on example input arrays and corresponding known model size """ - - model = KnownNet() - model.example_input_array = example_input - summary = model.summarize(mode=mode) - assert almost_equals(summary.model_size(), expected_model_size, rel_tol=1e-3, abs_tol=1e-3) - - -@pytest.mark.parametrize( - ["mode"], - [ - pytest.param(ModelSummary.MODE_FULL), - ], -) -@pytest.mark.parametrize( - ["example_input", "expected_model_size"], - [ - pytest.param(torch.zeros(10, 256), 0.527), - pytest.param(None, 0.505), - ], -) -def test_nested_seq_model_sizes(example_input, expected_model_size, mode): - """ Test the knownet model on example input arrays and corresponding known model size """ - - model = LitModel() - model.example_input_array = example_input - summary = model.summarize(mode=mode) - assert almost_equals(summary.model_size(), expected_model_size, rel_tol=1e-3, abs_tol=1e-3) - - -@pytest.mark.parametrize( - ["mode"], - [ - pytest.param(ModelSummary.MODE_FULL), - pytest.param(ModelSummary.MODE_TOP), - ], -) -@pytest.mark.parametrize( - ["example_input", "expected_size"], - [ - pytest.param([], UNKNOWN_SIZE), - pytest.param((1, 2, 3), [UNKNOWN_SIZE] * 3), - pytest.param(torch.tensor(0), UNKNOWN_SIZE), - pytest.param(dict(tensor=torch.zeros(1, 2, 3)), UNKNOWN_SIZE), - pytest.param(torch.zeros(2, 3, 4), [2, 3, 4]), - pytest.param([torch.zeros(2, 3), torch.zeros(4, 5)], [[2, 3], [4, 5]]), - pytest.param((torch.zeros(2, 3), torch.zeros(4, 5)), [[2, 3], [4, 5]]), - ], -) ->>>>>>> :hammer: Simplified tests def test_example_input_array_types(example_input, expected_size, mode): """ Test the types of 
example inputs supported for display in the summary. """ @@ -353,3 +263,25 @@ def forward(self, *args, **kwargs): model.example_input_array = example_input summary = model.summarize(mode=mode) assert summary.in_sizes == [expected_size] + +@pytest.mark.parametrize(['mode'], [ + pytest.param(ModelSummary.MODE_FULL), + pytest.param(ModelSummary.MODE_TOP), +]) +def test_model_size(mode): + """ Test that model size is calculated correctly. """ + model = PreCalculatedModel() + summary = model.summarize(mode=mode) + pre_calculated_model_size = torch.tensor(model.pre_calculated_model_size) + model_size = torch.tensor(summary.model_size()) + assert torch.isclose(model_size, pre_calculated_model_size, atol=1e-4) + +@pytest.mark.parametrize(['mode'], [ + pytest.param(ModelSummary.MODE_FULL), + pytest.param(ModelSummary.MODE_TOP), +]) +def test_empty_model_size(mode): + """ Test that empty model size is zero. """ + model = EmptyModule() + summary = model.summarize(mode=mode) + assert 0.0 == summary.model_size()
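
The expected value in the new test_model_size can be sanity-checked by hand: PreCalculatedModel holds two nn.Linear layers with (10*100 + 100) + (100*2 + 2) = 1302 parameters, and the simplified model_size() is just the total parameter count scaled to megabytes at the current precision. The sketch below reproduces the 0.005 MB figure outside the test suite; it is a minimal back-of-the-envelope check that assumes 32-bit parameters and a decimal megabyte (1e6 bytes) rather than calling the internal _precision_megabytes constant, so the exact rounding is illustrative only.

    import torch
    from torch import nn

    # Stand-in for PreCalculatedModel's layers: Linear(10, 100) followed by Linear(100, 2).
    model = nn.Sequential(nn.Linear(10, 100), nn.Linear(100, 2))

    total_params = sum(p.numel() for p in model.parameters())  # (10*100 + 100) + (100*2 + 2) = 1302
    bytes_per_param = torch.finfo(torch.float32).bits // 8     # 4 bytes per parameter at 32-bit precision
    estimated_mb = total_params * bytes_per_param / 1e6        # ~0.0052 MB, which formats as 0.005

    print(f"{total_params} parameters -> ~{estimated_mb:.3f} MB")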
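
For context on where the simplified estimate surfaces, the following usage sketch shows the flow the new tests exercise. It relies only on calls visible in this patch (summarize(mode=...), ModelSummary.MODE_FULL, summary.model_size()) plus the assumption that the summary utilities live in pytorch_lightning.core.memory; treat the import path as an assumption rather than a guarantee.

    from pytorch_lightning.core.memory import ModelSummary

    # Assumes PreCalculatedModel from the test diff above is defined/importable in this scope.
    model = PreCalculatedModel()

    summary = model.summarize(mode=ModelSummary.MODE_FULL)
    print(summary)               # layer table ending with the single "Total Estimated Params Size (MB)" row
    print(summary.model_size())  # float: total parameter count scaled to megabytes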