diff --git a/docs/source/advanced/model_parallel.rst b/docs/source/advanced/model_parallel.rst index 18c83bde743c8..5cf7556be1efd 100644 --- a/docs/source/advanced/model_parallel.rst +++ b/docs/source/advanced/model_parallel.rst @@ -296,7 +296,6 @@ Below we show an example of running `ZeRO-Offload `_. -For this, all data pre-loading should be done on the main process inside :meth:`DataModule.__init__`. As a result, all tensor-data will get automatically shared when using the :class:`~pytorch_lightning.plugins.strategies.ddp_spawn.DDPSpawnStrategy` -training type strategy: +For this, all data pre-loading should be done on the main process inside :meth:`DataModule.__init__`. As a result, all tensor-data will get automatically shared when using the :class:`~pytorch_lightning.plugins.strategies.ddp_spawn.DDPSpawnStrategy` strategy. .. warning:: diff --git a/docs/source/common/checkpointing.rst b/docs/source/common/checkpointing.rst index 2371964d1f278..31824e828cc7d 100644 --- a/docs/source/common/checkpointing.rst +++ b/docs/source/common/checkpointing.rst @@ -315,6 +315,7 @@ and the Lightning Team will be happy to integrate/help integrate it. ----------- +.. _customize_checkpointing: *********************** Customize Checkpointing @@ -392,7 +393,7 @@ Custom Checkpoint IO Plugin .. note:: - Some ``TrainingTypePlugins`` like ``DeepSpeedStrategy`` do not support custom ``CheckpointIO`` as checkpointing logic is not modifiable. + Some strategies like :class:`~pytorch_lightning.strategies.deepspeed.DeepSpeedStrategy` do not support custom :class:`~pytorch_lightning.plugins.io.checkpoint_plugin.CheckpointIO` as checkpointing logic is not modifiable. ----------- diff --git a/docs/source/common/lightning_module.rst b/docs/source/common/lightning_module.rst index 935e788310d7c..fd9de11f601d8 100644 --- a/docs/source/common/lightning_module.rst +++ b/docs/source/common/lightning_module.rst @@ -1056,7 +1056,7 @@ automatic_optimization When set to ``False``, Lightning does not automate the optimization process. This means you are responsible for handling your optimizers. However, we do take care of precision and any accelerators used. -See :ref:`manual optimization` for details. +See :ref:`manual optimization ` for details. .. code-block:: python diff --git a/docs/source/common/trainer.rst b/docs/source/common/trainer.rst index 819c5cb4897b4..56f8cb07ee787 100644 --- a/docs/source/common/trainer.rst +++ b/docs/source/common/trainer.rst @@ -1445,7 +1445,7 @@ checkpoint, training will start from the beginning of the next epoch. strategy ^^^^^^^^ -Supports passing different training strategies with aliases (ddp, ddp_spawn, etc) as well as custom training type plugins. +Supports passing different training strategies with aliases (ddp, ddp_spawn, etc) as well as custom strategies. .. code-block:: python @@ -1455,7 +1455,7 @@ Supports passing different training strategies with aliases (ddp, ddp_spawn, etc # Training with the DDP Spawn strategy using 4 cpu processes trainer = Trainer(strategy="ddp_spawn", accelerator="cpu", devices=4) -.. note:: Additionally, you can pass your custom training type plugins to the ``strategy`` argument. +.. note:: Additionally, you can pass your custom strategy to the ``strategy`` argument. .. code-block:: python diff --git a/docs/source/extensions/plugins.rst b/docs/source/extensions/plugins.rst index 3bfa7ad24b29c..601de26b39531 100644 --- a/docs/source/extensions/plugins.rst +++ b/docs/source/extensions/plugins.rst @@ -6,54 +6,32 @@ Plugins .. 
include:: ../links.rst -Plugins allow custom integrations to the internals of the Trainer such as a custom precision or -distributed implementation. +Plugins allow custom integrations to the internals of the Trainer such as custom precision, checkpointing or +cluster environment implementation. Under the hood, the Lightning Trainer is using plugins in the training routine, added automatically -depending on the provided Trainer arguments. For example: +depending on the provided Trainer arguments. -.. code-block:: python - - # accelerator: GPUAccelerator - # training strategy: DDPStrategy - # precision: NativeMixedPrecisionPlugin - trainer = Trainer(accelerator="gpu", devices=4, precision=16) - - -We expose Accelerators and Plugins mainly for expert users that want to extend Lightning for: - -- New hardware (like TPU plugin) -- Distributed backends (e.g. a backend not yet supported by - `PyTorch `_ itself) -- Clusters (e.g. customized access to the cluster's environment interface) - -There are two types of Plugins in Lightning with different responsibilities: - -Strategy --------- - -- Launching and teardown of training processes (if applicable) -- Setup communication between processes (NCCL, GLOO, MPI, ...) -- Provide a unified communication interface for reduction, broadcast, etc. -- Provide access to the wrapped LightningModule +There are three types of Plugins in Lightning with different responsibilities: +- Precision Plugins +- CheckpointIO Plugins +- Cluster Environments -Furthermore, for multi-node training Lightning provides cluster environment plugins that allow the advanced user -to configure Lightning to integrate with a :ref:`custom-cluster`. +***************** +Precision Plugins +***************** -.. image:: ../_static/images/accelerator/overview.svg - - -The full list of built-in plugins is listed below. - +We provide precision plugins for you to benefit from numerical representations with lower precision than +32-bit floating-point or higher precision, such as 64-bit floating-point. -.. warning:: The Plugin API is in beta and subject to change. - For help setting up custom plugins/accelerators, please reach out to us at **support@pytorchlightning.ai** +.. code-block:: python + # Training with 16-bit precision + trainer = Trainer(precision=16) -Precision Plugins ------------------ +The full list of built-in precision plugins is listed below. .. currentmodule:: pytorch_lightning.plugins.precision @@ -74,9 +52,39 @@ Precision Plugins TPUBf16PrecisionPlugin TPUPrecisionPlugin +More information regarding precision with Lightning can be found :doc:`here <../advanced/precision>` + +----------- + +******************** +CheckpointIO Plugins +******************** +As part of our commitment to extensibility, we have abstracted Lightning's checkpointing logic into the :class:`~pytorch_lightning.plugins.io.CheckpointIO` plugin. +With this, you have the ability to customize the checkpointing logic to match the needs of your infrastructure. + +Below is a list of built-in plugins for checkpointing. + +.. currentmodule:: pytorch_lightning.plugins.io + +.. autosummary:: + :nosignatures: + :template: classtemplate.rst + + CheckpointIO + HPUCheckpointIO + TorchCheckpointIO + XLACheckpointIO + +You could learn more about custom checkpointing with Lightning :ref:`here `. + +----------- + +******************** Cluster Environments --------------------- +******************** + +You can define the interface of your own cluster environment based on the requirements of your infrastructure. .. 
currentmodule:: pytorch_lightning.plugins.environments @@ -85,8 +93,8 @@ Cluster Environments :template: classtemplate.rst ClusterEnvironment + KubeflowEnvironment LightningEnvironment LSFEnvironment - TorchElasticEnvironment - KubeflowEnvironment SLURMEnvironment + TorchElasticEnvironment diff --git a/docs/source/starter/lightning_lite.rst b/docs/source/starter/lightning_lite.rst index 2a838d75a4fa4..860bd60511efd 100644 --- a/docs/source/starter/lightning_lite.rst +++ b/docs/source/starter/lightning_lite.rst @@ -387,7 +387,7 @@ Choose a training strategy: ``"dp"``, ``"ddp"``, ``"ddp_spawn"``, ``"tpu_spawn"` lite = Lite(strategy="ddp_spawn", accelerator="cpu", devices=4) -Additionally, you can pass in your custom training type strategy by configuring additional parameters. +Additionally, you can pass in your custom strategy by configuring additional parameters. .. code-block:: python diff --git a/pytorch_lightning/loops/optimization/optimizer_loop.py b/pytorch_lightning/loops/optimization/optimizer_loop.py index bab025466789a..f9068b87b653d 100644 --- a/pytorch_lightning/loops/optimization/optimizer_loop.py +++ b/pytorch_lightning/loops/optimization/optimizer_loop.py @@ -235,7 +235,7 @@ def _run_optimization( closure = self._make_closure(split_batch, batch_idx, opt_idx, optimizer) if ( - # when the training type plugin handles accumulation, we want to always call the optimizer step + # when the strategy handles accumulation, we want to always call the optimizer step not self.trainer.strategy.handles_gradient_accumulation and self.trainer.fit_loop._should_accumulate() ): diff --git a/pytorch_lightning/strategies/strategy.py b/pytorch_lightning/strategies/strategy.py index db33c4ec72d72..87c5c171d0ece 100644 --- a/pytorch_lightning/strategies/strategy.py +++ b/pytorch_lightning/strategies/strategy.py @@ -40,8 +40,7 @@ class Strategy(ABC): - """Base class for all training type plugins that change the behaviour of the training, validation and test- - loop.""" + """Base class for all strategies that change the behaviour of the training, validation and test loop.""" def __init__( self, diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 53b16af117e34..c0ea6f6f38dbd 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -401,7 +401,7 @@ def __init__( Please pass the path to ``Trainer.fit(..., ckpt_path=...)`` instead. strategy: Supports different training strategies with aliases - as well custom training type plugins. + as well as custom strategies. Default: ``None``. sync_batchnorm: Synchronize batch norm layers between process groups/whole world.
@@ -1152,7 +1152,7 @@ def _run( if hasattr(model, "hparams"): parsing.clean_namespace(model.hparams) - # attach model to the training type plugin + # attach model to the strategy self.strategy.connect(model) self._callback_connector._attach_model_callbacks() @@ -2035,17 +2035,17 @@ def global_rank(self) -> int: @property def local_rank(self) -> int: - # some training types define a local rank + # some strategies define a local rank return getattr(self.strategy, "local_rank", 0) @property def node_rank(self) -> int: - # some training types define a node rank + # some strategies define a node rank return getattr(self.strategy, "node_rank", 0) @property def world_size(self) -> int: - # some training types define a world size + # some strategies define a world size return getattr(self.strategy, "world_size", 1) @property diff --git a/tests/models/test_amp.py b/tests/models/test_amp.py index 3fb42fb0ce29e..0130270a5ac78 100644 --- a/tests/models/test_amp.py +++ b/tests/models/test_amp.py @@ -79,7 +79,7 @@ def _assert_autocast_enabled(self): @pytest.mark.parametrize("precision", [16, "bf16"]) @pytest.mark.parametrize("devices", [1, 2]) def test_amp_cpus(tmpdir, strategy, precision, devices): - """Make sure combinations of AMP and training types work if supported.""" + """Make sure combinations of AMP and strategies work if supported.""" tutils.reset_seed() trainer = Trainer( @@ -104,7 +104,7 @@ def test_amp_cpus(tmpdir, strategy, precision, devices): @pytest.mark.parametrize("precision", [16, "bf16"]) @pytest.mark.parametrize("devices", [1, 2]) def test_amp_gpus(tmpdir, strategy, precision, devices): - """Make sure combinations of AMP and training types work if supported.""" + """Make sure combinations of AMP and strategies work if supported.""" tutils.reset_seed() trainer = Trainer( diff --git a/tests/strategies/test_ddp_spawn_strategy.py b/tests/strategies/test_ddp_spawn_strategy.py index 74ceb08058eb4..c7ce848376e0d 100644 --- a/tests/strategies/test_ddp_spawn_strategy.py +++ b/tests/strategies/test_ddp_spawn_strategy.py @@ -55,7 +55,7 @@ def get_from_queue(self, queue) -> None: def test_ddp_cpu(): """Tests if device is set correctly when training for DDPSpawnStrategy.""" trainer = Trainer(devices=2, accelerator="cpu", fast_dev_run=True) - # assert training type plugin attributes for device setting + # assert strategy attributes for device setting assert isinstance(trainer.strategy, DDPSpawnStrategy) assert trainer.strategy.root_device == torch.device("cpu") diff --git a/tests/strategies/test_ddp_strategy.py b/tests/strategies/test_ddp_strategy.py index d34617b4b2664..3e62c17bc4ecd 100644 --- a/tests/strategies/test_ddp_strategy.py +++ b/tests/strategies/test_ddp_strategy.py @@ -37,7 +37,7 @@ def on_train_start(self) -> None: def test_ddp_with_2_gpus(): """Tests if device is set correctly when training and after teardown for DDPStrategy.""" trainer = Trainer(accelerator="gpu", devices=2, strategy="ddp", fast_dev_run=True) - # assert training type plugin attributes for device setting + # assert strategy attributes for device setting assert isinstance(trainer.strategy, DDPStrategy) local_rank = trainer.strategy.local_rank assert trainer.strategy.root_device == torch.device(f"cuda:{local_rank}") diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 6f76e79bd284c..18083b2868889 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -1496,7 +1496,7 @@ def write_on_batch_end(self, trainer, pl_module, prediction, batch_indices, *arg def 
test_spawn_predict_return_predictions(tmpdir): - """Test that `return_predictions=True` raise a MisconfigurationException with spawn training type plugins.""" + """Test that `return_predictions=True` raises a MisconfigurationException with spawn strategies.""" model = BoringModel() trainer = Trainer(default_root_dir=tmpdir, accelerator="cpu", strategy="ddp_spawn", devices=2, fast_dev_run=True) assert isinstance(trainer.strategy, DDPSpawnStrategy)
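
The new ``CheckpointIO Plugins`` section added to ``docs/source/extensions/plugins.rst`` above explains that checkpointing logic can be customized but shows no code. As a rough illustration only, here is a minimal sketch assuming the ``CheckpointIO`` interface documented in ``docs/source/common/checkpointing.rst`` (``save_checkpoint`` / ``load_checkpoint`` / ``remove_checkpoint``) and that an instance can be passed through the ``Trainer``'s ``plugins`` argument; the class name ``PlainTorchCheckpointIO`` is hypothetical.

.. code-block:: python

    import os

    import torch

    from pytorch_lightning import Trainer
    from pytorch_lightning.plugins.io import CheckpointIO


    class PlainTorchCheckpointIO(CheckpointIO):
        """Hypothetical plugin that round-trips checkpoints with plain ``torch.save`` / ``torch.load``."""

        def save_checkpoint(self, checkpoint, path, storage_options=None):
            # ``checkpoint`` is the state dict already assembled by the Trainer
            torch.save(checkpoint, path)

        def load_checkpoint(self, path, storage_options=None):
            return torch.load(path)

        def remove_checkpoint(self, path):
            if os.path.exists(path):
                os.remove(path)


    # the plugin is picked up from the ``plugins`` argument
    trainer = Trainer(plugins=[PlainTorchCheckpointIO()])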
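
Similarly, the ``trainer.rst`` and ``lightning_lite.rst`` hunks note that a custom strategy object can be passed to the ``strategy`` argument. A minimal sketch, assuming ``DDPStrategy`` forwards extra keyword arguments such as ``find_unused_parameters`` to ``DistributedDataParallel``:

.. code-block:: python

    from pytorch_lightning import Trainer
    from pytorch_lightning.strategies import DDPStrategy

    # configure the DDP strategy explicitly instead of relying on the "ddp" alias
    trainer = Trainer(
        strategy=DDPStrategy(find_unused_parameters=False),
        accelerator="gpu",
        devices=2,
    )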