diff --git a/docs/source/advanced/model_parallel.rst b/docs/source/advanced/model_parallel.rst index 18c83bde743c8..5cf7556be1efd 100644 --- a/docs/source/advanced/model_parallel.rst +++ b/docs/source/advanced/model_parallel.rst @@ -296,7 +296,6 @@ Below we show an example of running `ZeRO-Offload `_. -For this, all data pre-loading should be done on the main process inside :meth:`DataModule.__init__`. As a result, all tensor-data will get automatically shared when using the :class:`~pytorch_lightning.plugins.strategies.ddp_spawn.DDPSpawnStrategy` -training type strategy: +For this, all data pre-loading should be done on the main process inside :meth:`DataModule.__init__`. As a result, all tensor-data will get automatically shared when using the :class:`~pytorch_lightning.plugins.strategies.ddp_spawn.DDPSpawnStrategy` strategy. .. warning:: diff --git a/docs/source/common/checkpointing.rst b/docs/source/common/checkpointing.rst index 2371964d1f278..31824e828cc7d 100644 --- a/docs/source/common/checkpointing.rst +++ b/docs/source/common/checkpointing.rst @@ -315,6 +315,7 @@ and the Lightning Team will be happy to integrate/help integrate it. ----------- +.. _customize_checkpointing: *********************** Customize Checkpointing @@ -392,7 +393,7 @@ Custom Checkpoint IO Plugin .. note:: - Some ``TrainingTypePlugins`` like ``DeepSpeedStrategy`` do not support custom ``CheckpointIO`` as checkpointing logic is not modifiable. + Some strategies like :class:`~pytorch_lightning.strategies.deepspeed.DeepSpeedStrategy` do not support custom :class:`~pytorch_lightning.plugins.io.checkpoint_plugin.CheckpointIO` as checkpointing logic is not modifiable. ----------- diff --git a/docs/source/common/lightning_module.rst b/docs/source/common/lightning_module.rst index 935e788310d7c..fd9de11f601d8 100644 --- a/docs/source/common/lightning_module.rst +++ b/docs/source/common/lightning_module.rst @@ -1056,7 +1056,7 @@ automatic_optimization When set to ``False``, Lightning does not automate the optimization process. This means you are responsible for handling your optimizers. However, we do take care of precision and any accelerators used. -See :ref:`manual optimization` for details. +See :ref:`manual optimization ` for details. .. code-block:: python diff --git a/docs/source/common/trainer.rst b/docs/source/common/trainer.rst index 819c5cb4897b4..56f8cb07ee787 100644 --- a/docs/source/common/trainer.rst +++ b/docs/source/common/trainer.rst @@ -1445,7 +1445,7 @@ checkpoint, training will start from the beginning of the next epoch. strategy ^^^^^^^^ -Supports passing different training strategies with aliases (ddp, ddp_spawn, etc) as well as custom training type plugins. +Supports passing different training strategies with aliases (ddp, ddp_spawn, etc) as well as custom strategies. .. code-block:: python @@ -1455,7 +1455,7 @@ Supports passing different training strategies with aliases (ddp, ddp_spawn, etc # Training with the DDP Spawn strategy using 4 cpu processes trainer = Trainer(strategy="ddp_spawn", accelerator="cpu", devices=4) -.. note:: Additionally, you can pass your custom training type plugins to the ``strategy`` argument. +.. note:: Additionally, you can pass your custom strategy to the ``strategy`` argument. .. code-block:: python diff --git a/docs/source/extensions/plugins.rst b/docs/source/extensions/plugins.rst index 3bfa7ad24b29c..601de26b39531 100644 --- a/docs/source/extensions/plugins.rst +++ b/docs/source/extensions/plugins.rst @@ -6,54 +6,32 @@ Plugins .. 
include:: ../links.rst -Plugins allow custom integrations to the internals of the Trainer such as a custom precision or -distributed implementation. +Plugins allow custom integrations to the internals of the Trainer such as custom precision, checkpointing or +cluster environment implementation. Under the hood, the Lightning Trainer is using plugins in the training routine, added automatically -depending on the provided Trainer arguments. For example: +depending on the provided Trainer arguments. -.. code-block:: python - - # accelerator: GPUAccelerator - # training strategy: DDPStrategy - # precision: NativeMixedPrecisionPlugin - trainer = Trainer(accelerator="gpu", devices=4, precision=16) - - -We expose Accelerators and Plugins mainly for expert users that want to extend Lightning for: - -- New hardware (like TPU plugin) -- Distributed backends (e.g. a backend not yet supported by - `PyTorch `_ itself) -- Clusters (e.g. customized access to the cluster's environment interface) - -There are two types of Plugins in Lightning with different responsibilities: - -Strategy --------- - -- Launching and teardown of training processes (if applicable) -- Setup communication between processes (NCCL, GLOO, MPI, ...) -- Provide a unified communication interface for reduction, broadcast, etc. -- Provide access to the wrapped LightningModule +There are three types of Plugins in Lightning with different responsibilities: +- Precision Plugins +- CheckpointIO Plugins +- Cluster Environments -Furthermore, for multi-node training Lightning provides cluster environment plugins that allow the advanced user -to configure Lightning to integrate with a :ref:`custom-cluster`. +***************** +Precision Plugins +***************** -.. image:: ../_static/images/accelerator/overview.svg - - -The full list of built-in plugins is listed below. - +We provide precision plugins for you to benefit from numerical representations with lower precision than +32-bit floating-point or higher precision, such as 64-bit floating-point. -.. warning:: The Plugin API is in beta and subject to change. - For help setting up custom plugins/accelerators, please reach out to us at **support@pytorchlightning.ai** +.. code-block:: python + # Training with 16-bit precision + trainer = Trainer(precision=16) -Precision Plugins ------------------ +The full list of built-in precision plugins is listed below. .. currentmodule:: pytorch_lightning.plugins.precision @@ -74,9 +52,39 @@ Precision Plugins TPUBf16PrecisionPlugin TPUPrecisionPlugin +More information regarding precision with Lightning can be found :doc:`here <../advanced/precision>` + +----------- + +******************** +CheckpointIO Plugins +******************** +As part of our commitment to extensibility, we have abstracted Lightning's checkpointing logic into the :class:`~pytorch_lightning.plugins.io.CheckpointIO` plugin. +With this, you have the ability to customize the checkpointing logic to match the needs of your infrastructure. + +Below is a list of built-in plugins for checkpointing. + +.. currentmodule:: pytorch_lightning.plugins.io + +.. autosummary:: + :nosignatures: + :template: classtemplate.rst + + CheckpointIO + HPUCheckpointIO + TorchCheckpointIO + XLACheckpointIO + +You could learn more about custom checkpointing with Lightning :ref:`here `. + +----------- + +******************** Cluster Environments --------------------- +******************** + +You can define the interface of your own cluster environment based on the requirements of your infrastructure. .. 
currentmodule:: pytorch_lightning.plugins.environments @@ -85,8 +93,8 @@ Cluster Environments :template: classtemplate.rst ClusterEnvironment + KubeflowEnvironment LightningEnvironment LSFEnvironment - TorchElasticEnvironment - KubeflowEnvironment SLURMEnvironment + TorchElasticEnvironment diff --git a/docs/source/starter/lightning_lite.rst b/docs/source/starter/lightning_lite.rst index 2a838d75a4fa4..860bd60511efd 100644 --- a/docs/source/starter/lightning_lite.rst +++ b/docs/source/starter/lightning_lite.rst @@ -387,7 +387,7 @@ Choose a training strategy: ``"dp"``, ``"ddp"``, ``"ddp_spawn"``, ``"tpu_spawn"` lite = Lite(strategy="ddp_spawn", accelerator="cpu", devices=4) -Additionally, you can pass in your custom training type strategy by configuring additional parameters. +Additionally, you can pass in your custom strategy by configuring additional parameters. .. code-block:: python diff --git a/pytorch_lightning/loops/optimization/optimizer_loop.py b/pytorch_lightning/loops/optimization/optimizer_loop.py index bab025466789a..f9068b87b653d 100644 --- a/pytorch_lightning/loops/optimization/optimizer_loop.py +++ b/pytorch_lightning/loops/optimization/optimizer_loop.py @@ -235,7 +235,7 @@ def _run_optimization( closure = self._make_closure(split_batch, batch_idx, opt_idx, optimizer) if ( - # when the training type plugin handles accumulation, we want to always call the optimizer step + # when the strategy handles accumulation, we want to always call the optimizer step not self.trainer.strategy.handles_gradient_accumulation and self.trainer.fit_loop._should_accumulate() ): diff --git a/pytorch_lightning/strategies/strategy.py b/pytorch_lightning/strategies/strategy.py index db33c4ec72d72..87c5c171d0ece 100644 --- a/pytorch_lightning/strategies/strategy.py +++ b/pytorch_lightning/strategies/strategy.py @@ -40,8 +40,7 @@ class Strategy(ABC): - """Base class for all training type plugins that change the behaviour of the training, validation and test- - loop.""" + """Base class for all strategies that change the behaviour of the training, validation and test loop.""" def __init__( self, diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 53b16af117e34..c0ea6f6f38dbd 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -401,7 +401,7 @@ def __init__( Please pass the path to ``Trainer.fit(..., ckpt_path=...)`` instead. strategy: Supports different training strategies with aliases - as well custom training type plugins. + as well as custom strategies. Default: ``None``. sync_batchnorm: Synchronize batch norm layers between process groups/whole world.
@@ -1152,7 +1152,7 @@ def _run( if hasattr(model, "hparams"): parsing.clean_namespace(model.hparams) - # attach model to the training type plugin + # attach model to the strategy self.strategy.connect(model) self._callback_connector._attach_model_callbacks() @@ -2035,17 +2035,17 @@ def global_rank(self) -> int: @property def local_rank(self) -> int: - # some training types define a local rank + # some strategies define a local rank return getattr(self.strategy, "local_rank", 0) @property def node_rank(self) -> int: - # some training types define a node rank + # some strategies define a node rank return getattr(self.strategy, "node_rank", 0) @property def world_size(self) -> int: - # some training types define a world size + # some strategies define a world size return getattr(self.strategy, "world_size", 1) @property diff --git a/tests/models/test_amp.py b/tests/models/test_amp.py index 3fb42fb0ce29e..0130270a5ac78 100644 --- a/tests/models/test_amp.py +++ b/tests/models/test_amp.py @@ -79,7 +79,7 @@ def _assert_autocast_enabled(self): @pytest.mark.parametrize("precision", [16, "bf16"]) @pytest.mark.parametrize("devices", [1, 2]) def test_amp_cpus(tmpdir, strategy, precision, devices): - """Make sure combinations of AMP and training types work if supported.""" + """Make sure combinations of AMP and strategies work if supported.""" tutils.reset_seed() trainer = Trainer( @@ -104,7 +104,7 @@ def test_amp_cpus(tmpdir, strategy, precision, devices): @pytest.mark.parametrize("precision", [16, "bf16"]) @pytest.mark.parametrize("devices", [1, 2]) def test_amp_gpus(tmpdir, strategy, precision, devices): - """Make sure combinations of AMP and training types work if supported.""" + """Make sure combinations of AMP and strategies work if supported.""" tutils.reset_seed() trainer = Trainer( diff --git a/tests/strategies/test_ddp_spawn_strategy.py b/tests/strategies/test_ddp_spawn_strategy.py index 74ceb08058eb4..c7ce848376e0d 100644 --- a/tests/strategies/test_ddp_spawn_strategy.py +++ b/tests/strategies/test_ddp_spawn_strategy.py @@ -55,7 +55,7 @@ def get_from_queue(self, queue) -> None: def test_ddp_cpu(): """Tests if device is set correctly when training for DDPSpawnStrategy.""" trainer = Trainer(devices=2, accelerator="cpu", fast_dev_run=True) - # assert training type plugin attributes for device setting + # assert strategy attributes for device setting assert isinstance(trainer.strategy, DDPSpawnStrategy) assert trainer.strategy.root_device == torch.device("cpu") diff --git a/tests/strategies/test_ddp_strategy.py b/tests/strategies/test_ddp_strategy.py index d34617b4b2664..3e62c17bc4ecd 100644 --- a/tests/strategies/test_ddp_strategy.py +++ b/tests/strategies/test_ddp_strategy.py @@ -37,7 +37,7 @@ def on_train_start(self) -> None: def test_ddp_with_2_gpus(): """Tests if device is set correctly when training and after teardown for DDPStrategy.""" trainer = Trainer(accelerator="gpu", devices=2, strategy="ddp", fast_dev_run=True) - # assert training type plugin attributes for device setting + # assert strategy attributes for device setting assert isinstance(trainer.strategy, DDPStrategy) local_rank = trainer.strategy.local_rank assert trainer.strategy.root_device == torch.device(f"cuda:{local_rank}") diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 6f76e79bd284c..18083b2868889 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -1496,7 +1496,7 @@ def write_on_batch_end(self, trainer, pl_module, prediction, batch_indices, *arg def 
test_spawn_predict_return_predictions(tmpdir): - """Test that `return_predictions=True` raise a MisconfigurationException with spawn training type plugins.""" + """Test that `return_predictions=True` raises a MisconfigurationException with spawn strategies.""" model = BoringModel() trainer = Trainer(default_root_dir=tmpdir, accelerator="cpu", strategy="ddp_spawn", devices=2, fast_dev_run=True) assert isinstance(trainer.strategy, DDPSpawnStrategy)
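
The new ``CheckpointIO Plugins`` section added to ``docs/source/extensions/plugins.rst`` above explains that checkpointing logic can be customized but shows no code. As a rough illustration only, here is a minimal sketch assuming the ``CheckpointIO`` interface documented in ``docs/source/common/checkpointing.rst`` (``save_checkpoint`` / ``load_checkpoint`` / ``remove_checkpoint``) and that an instance can be passed through the ``Trainer``'s ``plugins`` argument; the class name ``PlainTorchCheckpointIO`` is hypothetical.

.. code-block:: python

    import os

    import torch

    from pytorch_lightning import Trainer
    from pytorch_lightning.plugins.io import CheckpointIO


    class PlainTorchCheckpointIO(CheckpointIO):
        """Hypothetical plugin that round-trips checkpoints with plain ``torch.save`` / ``torch.load``."""

        def save_checkpoint(self, checkpoint, path, storage_options=None):
            # ``checkpoint`` is the state dict already assembled by the Trainer
            torch.save(checkpoint, path)

        def load_checkpoint(self, path, storage_options=None):
            return torch.load(path)

        def remove_checkpoint(self, path):
            if os.path.exists(path):
                os.remove(path)


    # the plugin is picked up from the ``plugins`` argument
    trainer = Trainer(plugins=[PlainTorchCheckpointIO()])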
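
Similarly, the ``trainer.rst`` and ``lightning_lite.rst`` hunks note that a custom strategy object can be passed to the ``strategy`` argument. A minimal sketch, assuming ``DDPStrategy`` forwards extra keyword arguments such as ``find_unused_parameters`` to ``DistributedDataParallel``:

.. code-block:: python

    from pytorch_lightning import Trainer
    from pytorch_lightning.strategies import DDPStrategy

    # configure the DDP strategy explicitly instead of relying on the "ddp" alias
    trainer = Trainer(
        strategy=DDPStrategy(find_unused_parameters=False),
        accelerator="gpu",
        devices=2,
    )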