Update Plugins doc #12440

Merged
merged 16 commits on Mar 29, 2022
10 changes: 2 additions & 8 deletions docs/source/advanced/model_parallel.rst
@@ -296,7 +296,6 @@ Below we show an example of running `ZeRO-Offload <https://www.deepspeed.ai/tuto
.. code-block:: python

from pytorch_lightning import Trainer
from pytorch_lightning.strategies import DeepSpeedStrategy

model = MyModel()
trainer = Trainer(accelerator="gpu", devices=4, strategy="deepspeed_stage_2_offload", precision=16)
@@ -341,7 +340,6 @@ For even more speed benefit, DeepSpeed offers an optimized CPU version of ADAM c

import pytorch_lightning
from pytorch_lightning import Trainer
from pytorch_lightning.strategies import DeepSpeedStrategy
from deepspeed.ops.adam import DeepSpeedCPUAdam
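
The rest of this example is collapsed in the diff. As a minimal sketch (``MyModel`` and the layer sizes are placeholders, not taken from the original docs), wiring ``DeepSpeedCPUAdam`` into a ``LightningModule`` typically looks like this:

.. code-block:: python

import torch.nn as nn
from deepspeed.ops.adam import DeepSpeedCPUAdam
from pytorch_lightning import LightningModule, Trainer


class MyModel(LightningModule):  # placeholder module, not from the original docs
    def __init__(self):
        super().__init__()
        self.layer = nn.Linear(32, 2)

    def configure_optimizers(self):
        # DeepSpeedCPUAdam performs the optimizer step on CPU, which pairs with ZeRO offload
        return DeepSpeedCPUAdam(self.parameters(), lr=1e-3)


trainer = Trainer(accelerator="gpu", devices=4, strategy="deepspeed_stage_2_offload", precision=16)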


@@ -385,7 +383,6 @@ Also please have a look at our :ref:`deepspeed-zero-stage-3-tips` which contains
.. code-block:: python

from pytorch_lightning import Trainer
from pytorch_lightning.strategies import DeepSpeedStrategy
from deepspeed.ops.adam import FusedAdam
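
The remainder of this Stage 3 example is collapsed. A minimal sketch, assuming the usual pattern of returning the optimizer from ``configure_optimizers`` (``MyModel`` and its single layer are placeholders):

.. code-block:: python

import torch.nn as nn
from deepspeed.ops.adam import FusedAdam
from pytorch_lightning import LightningModule, Trainer


class MyModel(LightningModule):  # placeholder, not from the original docs
    def __init__(self):
        super().__init__()
        self.layer = nn.Linear(32, 2)

    def configure_optimizers(self):
        # FusedAdam is DeepSpeed's fused GPU Adam, commonly used with ZeRO Stage 3
        return FusedAdam(self.parameters(), lr=1e-3)


trainer = Trainer(accelerator="gpu", devices=4, strategy="deepspeed_stage_3", precision=16)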


@@ -409,7 +406,6 @@ You can also use the Lightning Trainer to run predict or evaluate with DeepSpeed
.. code-block:: python

from pytorch_lightning import Trainer
from pytorch_lightning.strategies import DeepSpeedStrategy


class MyModel(pl.LightningModule):
@@ -435,7 +431,6 @@ This reduces the time taken to initialize very large models, as well as ensure w

import torch.nn as nn
from pytorch_lightning import Trainer
from pytorch_lightning.strategies import DeepSpeedStrategy
from deepspeed.ops.adam import FusedAdam
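
The full example is collapsed here. Assuming this hunk belongs to the section on sharding the model during instantiation, a minimal sketch of the ``configure_sharded_model`` hook (the module and layer sizes are placeholders):

.. code-block:: python

import torch.nn as nn
from deepspeed.ops.adam import FusedAdam
from pytorch_lightning import LightningModule, Trainer


class MyModel(LightningModule):  # placeholder, not from the original docs
    def configure_sharded_model(self):
        # Layers created in this hook are instantiated shard-by-shard across processes,
        # so the full model never has to be materialized on a single device at once.
        self.block = nn.Sequential(nn.Linear(32, 32), nn.ReLU(), nn.Linear(32, 2))

    def configure_optimizers(self):
        return FusedAdam(self.parameters(), lr=1e-3)


trainer = Trainer(accelerator="gpu", devices=4, strategy="deepspeed_stage_3", precision=16)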


@@ -549,7 +544,6 @@ This saves memory when training larger models, however requires using a checkpoi
.. code-block:: python

from pytorch_lightning import Trainer
from pytorch_lightning.strategies import DeepSpeedStrategy
import deepspeed
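
The example body is collapsed in this hunk. A minimal sketch of DeepSpeed activation checkpointing inside ``forward`` (the blocks and shapes are placeholders):

.. code-block:: python

import deepspeed
import torch.nn as nn
from pytorch_lightning import LightningModule


class MyModel(LightningModule):  # placeholder, not from the original docs
    def __init__(self):
        super().__init__()
        self.block_1 = nn.Sequential(nn.Linear(32, 32), nn.ReLU())
        self.block_2 = nn.Linear(32, 2)

    def forward(self, x):
        # Activations of block_1 are recomputed in the backward pass instead of stored,
        # trading extra compute for lower memory use.
        x = deepspeed.checkpointing.checkpoint(self.block_1, x)
        return self.block_2(x)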


@@ -686,7 +680,7 @@ In some cases you may want to define your own DeepSpeed Config, to access all pa
}

model = MyModel()
trainer = Trainer(accelerator="gpu", devices=4, strategy=DeepSpeedStrategy(deepspeed_config), precision=16)
trainer = Trainer(accelerator="gpu", devices=4, strategy=DeepSpeedStrategy(config=deepspeed_config), precision=16)
trainer.fit(model)
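
The top of ``deepspeed_config`` is collapsed in this hunk. Purely as an illustration (the keys below are a small subset of what DeepSpeed accepts, not the original example), such a dictionary might look like:

.. code-block:: python

deepspeed_config = {
    "zero_optimization": {
        "stage": 2,  # which ZeRO stage to run; further DeepSpeed sections can be added alongside
    },
}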


@@ -699,7 +693,7 @@ We support taking the config as a json formatted file:

model = MyModel()
trainer = Trainer(
accelerator="gpu", devices=4, strategy=DeepSpeedStrategy("/path/to/deepspeed_config.json"), precision=16
accelerator="gpu", devices=4, strategy=DeepSpeedStrategy(config="/path/to/deepspeed_config.json"), precision=16
)
trainer.fit(model)
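
If the configuration starts out as a Python dictionary, one way to produce the JSON file referenced above is to dump it before constructing the Trainer; a trivial sketch (the path and keys are placeholders):

.. code-block:: python

import json

with open("/path/to/deepspeed_config.json", "w") as f:
    json.dump({"zero_optimization": {"stage": 2}}, f)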

3 changes: 1 addition & 2 deletions docs/source/advanced/training_tricks.rst
@@ -331,8 +331,7 @@ However, for in-memory datasets, that means that each process will hold a (redun
For example, when training Graph Neural Networks, a common strategy is to load the entire graph into CPU memory for fast access to the entire graph structure and its features, and to then perform neighbor sampling to obtain mini-batches that fit onto the GPU.

A simple way to prevent redundant dataset replicas is to rely on :obj:`torch.multiprocessing` to share the `data automatically between spawned processes via shared memory <https://pytorch.org/docs/stable/notes/multiprocessing.html>`_.
For this, all data pre-loading should be done on the main process inside :meth:`DataModule.__init__`. As a result, all tensor-data will get automatically shared when using the :class:`~pytorch_lightning.plugins.strategies.ddp_spawn.DDPSpawnStrategy`
training type strategy:
For this, all data pre-loading should be done on the main process inside :meth:`DataModule.__init__`. As a result, all tensor-data will get automatically shared when using the :class:`~pytorch_lightning.plugins.strategies.ddp_spawn.DDPSpawnStrategy` strategy.
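
As a loose illustration of the paragraph above (the dataset, shapes and batch size are invented, not taken from the original docs), pre-loading tensors in ``__init__`` so that processes spawned by the DDP spawn strategy share them via shared memory could look like:

.. code-block:: python

import torch
from pytorch_lightning import LightningDataModule, Trainer
from torch.utils.data import DataLoader, TensorDataset


class InMemoryDataModule(LightningDataModule):  # hypothetical example
    def __init__(self):
        super().__init__()
        # Loaded once in the main process; torch.multiprocessing shares these tensors
        # with the spawned worker processes instead of copying them.
        self.data = torch.randn(10_000, 128)
        self.targets = torch.randint(0, 2, (10_000,))

    def train_dataloader(self):
        return DataLoader(TensorDataset(self.data, self.targets), batch_size=32)


trainer = Trainer(accelerator="cpu", devices=4, strategy="ddp_spawn")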

.. warning::

3 changes: 2 additions & 1 deletion docs/source/common/checkpointing.rst
@@ -315,6 +315,7 @@ and the Lightning Team will be happy to integrate/help integrate it.

-----------

.. _customize_checkpointing:

***********************
Customize Checkpointing
@@ -392,7 +393,7 @@ Custom Checkpoint IO Plugin

.. note::

Some ``TrainingTypePlugins`` like ``DeepSpeedStrategy`` do not support custom ``CheckpointIO`` as checkpointing logic is not modifiable.
Some strategies like :class:`~pytorch_lightning.strategies.deepspeed.DeepSpeedStrategy` do not support custom :class:`~pytorch_lightning.plugins.io.checkpoint_plugin.CheckpointIO` as checkpointing logic is not modifiable.
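
For orientation (this sketch is not part of the PR; the subclass and the print statement are invented), a custom ``CheckpointIO`` usually extends one of the built-ins and is handed to the Trainer through ``plugins``:

.. code-block:: python

from pytorch_lightning import Trainer
from pytorch_lightning.plugins.io import TorchCheckpointIO


class LoggingCheckpointIO(TorchCheckpointIO):  # hypothetical plugin
    def save_checkpoint(self, checkpoint, path, storage_options=None):
        # Wrap the default torch.save based implementation with custom behaviour.
        print(f"saving checkpoint to {path}")
        super().save_checkpoint(checkpoint, path, storage_options=storage_options)


trainer = Trainer(plugins=[LoggingCheckpointIO()])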

-----------

2 changes: 1 addition & 1 deletion docs/source/common/lightning_module.rst
@@ -1056,7 +1056,7 @@ automatic_optimization
When set to ``False``, Lightning does not automate the optimization process. This means you are responsible for handling
your optimizers. However, we do take care of precision and any accelerators used.

See :ref:`manual optimization<common/optimization:Manual optimization>` for details.
See :ref:`manual optimization <common/optimization:Manual optimization>` for details.

.. code-block:: python
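# The original snippet is collapsed in this diff. As a hedged sketch (not the original
# code), the manual optimization pattern inside a LightningModule usually looks like:
def __init__(self):
    super().__init__()
    self.automatic_optimization = False


def training_step(self, batch, batch_idx):
    opt = self.optimizers()
    opt.zero_grad()
    loss = self.compute_loss(batch)  # placeholder for your own loss computation
    self.manual_backward(loss)
    opt.step()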

4 changes: 2 additions & 2 deletions docs/source/common/trainer.rst
@@ -1445,7 +1445,7 @@ checkpoint, training will start from the beginning of the next epoch.
strategy
^^^^^^^^

Supports passing different training strategies with aliases (ddp, ddp_spawn, etc) as well as custom training type plugins.
Supports passing different training strategies with aliases (ddp, ddp_spawn, etc) as well as custom strategies.

.. code-block:: python

@@ -1455,7 +1455,7 @@ Supports passing different training strategies with aliases (ddp, ddp_spawn, etc
# Training with the DDP Spawn strategy using 4 cpu processes
trainer = Trainer(strategy="ddp_spawn", accelerator="cpu", devices=4)

.. note:: Additionally, you can pass your custom training type plugins to the ``strategy`` argument.
.. note:: Additionally, you can pass your custom strategy to the ``strategy`` argument.

.. code-block:: python
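# Collapsed in this diff. A hedged sketch of passing a strategy instance instead of an alias
# (the DDPStrategy arguments are only an example):
from pytorch_lightning import Trainer
from pytorch_lightning.strategies import DDPStrategy

trainer = Trainer(strategy=DDPStrategy(find_unused_parameters=False), accelerator="gpu", devices=2)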

90 changes: 49 additions & 41 deletions docs/source/extensions/plugins.rst
@@ -6,54 +6,32 @@ Plugins

.. include:: ../links.rst

Plugins allow custom integrations to the internals of the Trainer such as a custom precision or
distributed implementation.
Plugins allow custom integrations to the internals of the Trainer such as custom precision, checkpointing or
cluster environment implementation.

Under the hood, the Lightning Trainer is using plugins in the training routine, added automatically
depending on the provided Trainer arguments. For example:
depending on the provided Trainer arguments.

.. code-block:: python

# accelerator: GPUAccelerator
# training strategy: DDPStrategy
# precision: NativeMixedPrecisionPlugin
trainer = Trainer(accelerator="gpu", devices=4, precision=16)


We expose Accelerators and Plugins mainly for expert users that want to extend Lightning for:

- New hardware (like TPU plugin)
- Distributed backends (e.g. a backend not yet supported by
`PyTorch <https://pytorch.org/docs/stable/distributed.html#backends>`_ itself)
- Clusters (e.g. customized access to the cluster's environment interface)

There are two types of Plugins in Lightning with different responsibilities:

Strategy
--------

- Launching and teardown of training processes (if applicable)
- Setup communication between processes (NCCL, GLOO, MPI, ...)
- Provide a unified communication interface for reduction, broadcast, etc.
- Provide access to the wrapped LightningModule
There are three types of Plugins in Lightning with different responsibilities:

- Precision Plugins
- CheckpointIO Plugins
- Cluster Environments

Furthermore, for multi-node training Lightning provides cluster environment plugins that allow the advanced user
to configure Lightning to integrate with a :ref:`custom-cluster`.

*****************
Precision Plugins
*****************

.. image:: ../_static/images/accelerator/overview.svg


The full list of built-in plugins is listed below.

We provide precision plugins so you can benefit from numerical representations with lower precision than
32-bit floating-point, or with higher precision such as 64-bit floating-point.

.. warning:: The Plugin API is in beta and subject to change.
For help setting up custom plugins/accelerators, please reach out to us at **support@pytorchlightning.ai**
.. code-block:: python

# Training with 16-bit precision
trainer = Trainer(precision=16)

Precision Plugins
-----------------
The full list of built-in precision plugins is listed below.

.. currentmodule:: pytorch_lightning.plugins.precision

@@ -74,9 +52,39 @@ Precision Plugins
TPUBf16PrecisionPlugin
TPUPrecisionPlugin

More information regarding precision with Lightning can be found :doc:`here <../advanced/precision>`
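
If you need more control than the ``precision`` flag gives you, a precision plugin instance can also be passed to the Trainer directly. A sketch that assumes the 1.6-era ``NativeMixedPrecisionPlugin(precision, device)`` signature:

.. code-block:: python

from pytorch_lightning import Trainer
from pytorch_lightning.plugins.precision import NativeMixedPrecisionPlugin

# Roughly equivalent to Trainer(precision=16) on GPU, but with the plugin object exposed
# so it can be subclassed or configured directly.
trainer = Trainer(accelerator="gpu", devices=1, plugins=[NativeMixedPrecisionPlugin(16, "cuda")])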

-----------

********************
CheckpointIO Plugins
********************

As part of our commitment to extensibility, we have abstracted Lightning's checkpointing logic into the :class:`~pytorch_lightning.plugins.io.CheckpointIO` plugin.
With this, you have the ability to customize the checkpointing logic to match the needs of your infrastructure.

Below is a list of built-in plugins for checkpointing.

.. currentmodule:: pytorch_lightning.plugins.io

.. autosummary::
:nosignatures:
:template: classtemplate.rst

CheckpointIO
HPUCheckpointIO
TorchCheckpointIO
XLACheckpointIO

You can learn more about custom checkpointing with Lightning :ref:`here <customize_checkpointing>`.

-----------

********************
Cluster Environments
--------------------
********************

You can define the interface of your own cluster environment based on the requirements of your infrastructure.

.. currentmodule:: pytorch_lightning.plugins.environments

@@ -85,8 +93,8 @@ Cluster Environments
:template: classtemplate.rst

ClusterEnvironment
KubeflowEnvironment
LightningEnvironment
LSFEnvironment
TorchElasticEnvironment
KubeflowEnvironment
SLURMEnvironment
TorchElasticEnvironment
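
To illustrate the extension point described above (this sketch is not part of the PR; it assumes the 1.6-era ``main_address``/``main_port`` properties, and the environment variables are invented), a custom environment can subclass one of the built-ins and be passed through ``plugins``:

.. code-block:: python

import os

from pytorch_lightning import Trainer
from pytorch_lightning.plugins.environments import LightningEnvironment


class MyClusterEnvironment(LightningEnvironment):  # hypothetical
    @property
    def main_address(self) -> str:
        # Read the rendezvous address from whatever your scheduler exposes.
        return os.environ["MY_CLUSTER_HEAD_NODE"]

    @property
    def main_port(self) -> int:
        return int(os.environ.get("MY_CLUSTER_PORT", 29500))


trainer = Trainer(accelerator="gpu", devices=2, strategy="ddp", plugins=[MyClusterEnvironment()])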
2 changes: 1 addition & 1 deletion docs/source/starter/lightning_lite.rst
@@ -387,7 +387,7 @@ Choose a training strategy: ``"dp"``, ``"ddp"``, ``"ddp_spawn"``, ``"tpu_spawn"`
lite = Lite(strategy="ddp_spawn", accelerator="cpu", devices=4)


Additionally, you can pass in your custom training type strategy by configuring additional parameters.
Additionally, you can pass in your custom strategy by configuring additional parameters.

.. code-block:: python
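# Collapsed in this diff. A hedged sketch (``Lite`` is the LightningLite subclass defined
# earlier on this page; the DDPStrategy arguments are only an example):
from pytorch_lightning.strategies import DDPStrategy

lite = Lite(strategy=DDPStrategy(find_unused_parameters=False), accelerator="gpu", devices=2)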

2 changes: 1 addition & 1 deletion pytorch_lightning/loops/optimization/optimizer_loop.py
@@ -235,7 +235,7 @@ def _run_optimization(
closure = self._make_closure(split_batch, batch_idx, opt_idx, optimizer)

if (
# when the training type plugin handles accumulation, we want to always call the optimizer step
# when the strategy handles accumulation, we want to always call the optimizer step
not self.trainer.strategy.handles_gradient_accumulation
and self.trainer.fit_loop._should_accumulate()
):
3 changes: 1 addition & 2 deletions pytorch_lightning/strategies/strategy.py
@@ -40,8 +40,7 @@


class Strategy(ABC):
"""Base class for all training type plugins that change the behaviour of the training, validation and test-
loop."""
"""Base class for all strategies that change the behaviour of the training, validation and test- loop."""

def __init__(
self,
10 changes: 5 additions & 5 deletions pytorch_lightning/trainer/trainer.py
@@ -401,7 +401,7 @@ def __init__(
Please pass the path to ``Trainer.fit(..., ckpt_path=...)`` instead.

strategy: Supports different training strategies with aliases
as well custom training type plugins.
as well as custom strategies.
Default: ``None``.

sync_batchnorm: Synchronize batch norm layers between process groups/whole world.
@@ -1152,7 +1152,7 @@ def _run(
if hasattr(model, "hparams"):
parsing.clean_namespace(model.hparams)

# attach model to the training type plugin
# attach model to the strategy
self.strategy.connect(model)

self._callback_connector._attach_model_callbacks()
@@ -2035,17 +2035,17 @@ def global_rank(self) -> int:

@property
def local_rank(self) -> int:
# some training types define a local rank
# some strategies define a local rank
return getattr(self.strategy, "local_rank", 0)

@property
def node_rank(self) -> int:
# some training types define a node rank
# some strategies define a node rank
return getattr(self.strategy, "node_rank", 0)

@property
def world_size(self) -> int:
# some training types define a world size
# some strategies define a world size
return getattr(self.strategy, "world_size", 1)

@property
4 changes: 2 additions & 2 deletions tests/models/test_amp.py
@@ -79,7 +79,7 @@ def _assert_autocast_enabled(self):
@pytest.mark.parametrize("precision", [16, "bf16"])
@pytest.mark.parametrize("devices", [1, 2])
def test_amp_cpus(tmpdir, strategy, precision, devices):
"""Make sure combinations of AMP and training types work if supported."""
"""Make sure combinations of AMP and strategies work if supported."""
tutils.reset_seed()

trainer = Trainer(
@@ -104,7 +104,7 @@ def test_amp_cpus(tmpdir, strategy, precision, devices):
@pytest.mark.parametrize("precision", [16, "bf16"])
@pytest.mark.parametrize("devices", [1, 2])
def test_amp_gpus(tmpdir, strategy, precision, devices):
"""Make sure combinations of AMP and training types work if supported."""
"""Make sure combinations of AMP and strategies work if supported."""
tutils.reset_seed()

trainer = Trainer(
2 changes: 1 addition & 1 deletion tests/strategies/test_ddp_spawn_strategy.py
@@ -55,7 +55,7 @@ def get_from_queue(self, queue) -> None:
def test_ddp_cpu():
"""Tests if device is set correctly when training for DDPSpawnStrategy."""
trainer = Trainer(devices=2, accelerator="cpu", fast_dev_run=True)
# assert training type plugin attributes for device setting
# assert strategy attributes for device setting

assert isinstance(trainer.strategy, DDPSpawnStrategy)
assert trainer.strategy.root_device == torch.device("cpu")
2 changes: 1 addition & 1 deletion tests/strategies/test_ddp_strategy.py
@@ -37,7 +37,7 @@ def on_train_start(self) -> None:
def test_ddp_with_2_gpus():
"""Tests if device is set correctly when training and after teardown for DDPStrategy."""
trainer = Trainer(accelerator="gpu", devices=2, strategy="ddp", fast_dev_run=True)
# assert training type plugin attributes for device setting
# assert strategy attributes for device setting
assert isinstance(trainer.strategy, DDPStrategy)
local_rank = trainer.strategy.local_rank
assert trainer.strategy.root_device == torch.device(f"cuda:{local_rank}")
2 changes: 1 addition & 1 deletion tests/trainer/test_trainer.py
@@ -1496,7 +1496,7 @@ def write_on_batch_end(self, trainer, pl_module, prediction, batch_indices, *arg


def test_spawn_predict_return_predictions(tmpdir):
"""Test that `return_predictions=True` raise a MisconfigurationException with spawn training type plugins."""
"""Test that `return_predictions=True` raise a MisconfigurationException with spawn strategies."""
model = BoringModel()
trainer = Trainer(default_root_dir=tmpdir, accelerator="cpu", strategy="ddp_spawn", devices=2, fast_dev_run=True)
assert isinstance(trainer.strategy, DDPSpawnStrategy)