Deprecate nvidia/apex (#16039)
carmocca committed Dec 20, 2022
1 parent a3ac162 commit 6ce4c77
Showing 57 changed files with 509 additions and 443 deletions.
15 changes: 8 additions & 7 deletions docs/source-pytorch/accelerators/gpu_intermediate.rst
@@ -469,25 +469,26 @@ Validation and test step have the same option when using DP.
Distributed and 16-bit precision
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Due to an issue with Apex and DataParallel (PyTorch and NVIDIA issue), Lightning does
not allow 16-bit and DP training. We tried to get this to work, but it's an issue on their end.

Below are the possible configurations we support.

+-------+---------+-----+-----+--------+-----------------------------------------------------------------------+
| 1 GPU | 1+ GPUs | DP | DDP | 16-bit | command |
| 1 GPU | 1+ GPUs | DDP | DP | 16-bit | command |
+=======+=========+=====+=====+========+=======================================================================+
| Y | | | | | `Trainer(accelerator="gpu", devices=1)` |
+-------+---------+-----+-----+--------+-----------------------------------------------------------------------+
| Y | | | | Y | `Trainer(accelerator="gpu", devices=1, precision=16)` |
+-------+---------+-----+-----+--------+-----------------------------------------------------------------------+
| | Y | Y | | | `Trainer(accelerator="gpu", devices=k, strategy='dp')` |
| | Y | Y | | | `Trainer(accelerator="gpu", devices=k, strategy='ddp')` |
+-------+---------+-----+-----+--------+-----------------------------------------------------------------------+
| | Y | Y | | Y | `Trainer(accelerator="gpu", devices=k, strategy='ddp', precision=16)` |
+-------+---------+-----+-----+--------+-----------------------------------------------------------------------+
| | Y | | Y | | `Trainer(accelerator="gpu", devices=k, strategy='ddp')` |
| | Y | | Y | | `Trainer(accelerator="gpu", devices=k, strategy='dp')` |
+-------+---------+-----+-----+--------+-----------------------------------------------------------------------+
| | Y | | Y | Y | `Trainer(accelerator="gpu", devices=k, strategy='ddp', precision=16)` |
| | Y | | Y | Y | `Trainer(accelerator="gpu", devices=k, strategy='dp', precision=16)` |
+-------+---------+-----+-----+--------+-----------------------------------------------------------------------+

DDP and DP can also be used with 1 GPU, but there's no reason to do so other than debugging distributed-related issues.
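
For reference, the supported combinations above map to ``Trainer`` arguments as follows (an illustrative sketch, not part of this diff; assumes a machine with 4 CUDA GPUs):

    from pytorch_lightning import Trainer

    # single GPU, 16-bit mixed precision
    trainer = Trainer(accelerator="gpu", devices=1, precision=16)

    # 4 GPUs with DDP and 16-bit mixed precision (the recommended multi-GPU setup)
    trainer = Trainer(accelerator="gpu", devices=4, strategy="ddp", precision=16)

    # 4 GPUs with DataParallel and 16-bit mixed precision, possible now that apex is gone
    trainer = Trainer(accelerator="gpu", devices=4, strategy="dp", precision=16)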


Implement Your Own Distributed (DDP) training
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
3 changes: 1 addition & 2 deletions docs/source-pytorch/api_references.rst
@@ -184,15 +184,14 @@ precision
:nosignatures:
:template: classtemplate.rst

ApexMixedPrecisionPlugin
ColossalAIPrecisionPlugin
DeepSpeedPrecisionPlugin
DoublePrecisionPlugin
FullyShardedNativeMixedPrecisionPlugin
FullyShardedNativeNativeMixedPrecisionPlugin
HPUPrecisionPlugin
IPUPrecisionPlugin
NativeMixedPrecisionPlugin
MixedPrecisionPlugin
PrecisionPlugin
ShardedNativeMixedPrecisionPlugin
TPUBf16PrecisionPlugin
2 changes: 1 addition & 1 deletion docs/source-pytorch/common/checkpointing_basic.rst
@@ -186,5 +186,5 @@ If you don't just want to load weights, but instead restore the full training, d
model = LitModel()
trainer = Trainer()
# automatically restores model, epoch, step, LR schedulers, apex, etc...
# automatically restores model, epoch, step, LR schedulers, etc...
trainer.fit(model, ckpt_path="some/path/to/my_checkpoint.ckpt")
4 changes: 0 additions & 4 deletions docs/source-pytorch/common/optimization.rst
@@ -151,7 +151,6 @@ For example, here step optimizer A every batch and optimizer B every 2 batches.
optimizer_idx,
optimizer_closure,
on_tpu=False,
using_native_amp=False,
using_lbfgs=False,
):
# update generator every step
@@ -183,7 +182,6 @@ Here we add a manual learning rate warm-up without an lr scheduler.
optimizer_idx,
optimizer_closure,
on_tpu=False,
using_native_amp=False,
using_lbfgs=False,
):
# update params
@@ -215,7 +213,6 @@ to perform a step, Lightning won't be able to support accelerators, precision an
optimizer_idx,
optimizer_closure,
on_tpu=False,
using_native_amp=False,
using_lbfgs=False,
):
optimizer.step(closure=optimizer_closure)
@@ -232,7 +229,6 @@ to perform a step, Lightning won't be able to support accelerators, precision an
optimizer_idx,
optimizer_closure,
on_tpu=False,
using_native_amp=False,
using_lbfgs=False,
):
optimizer = optimizer.optimizer
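
For context, an ``optimizer_step`` override now looks roughly like this once ``using_native_amp`` is dropped from the hook signature (a sketch assembled from the hunks above):

    import pytorch_lightning as pl


    class LitModel(pl.LightningModule):
        def optimizer_step(
            self,
            epoch,
            batch_idx,
            optimizer,
            optimizer_idx,
            optimizer_closure,
            on_tpu=False,
            using_lbfgs=False,
        ):
            # default behaviour: run the closure (forward + backward) and step
            optimizer.step(closure=optimizer_closure)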
41 changes: 1 addition & 40 deletions docs/source-pytorch/common/precision_intermediate.rst
@@ -58,6 +58,7 @@ FP16 Mixed Precision
********************

In most cases, mixed precision uses FP16. Supported `PyTorch operations <https://pytorch.org/docs/stable/amp.html#op-specific-behavior>`__ automatically run in FP16, saving memory and improving throughput on the supported accelerators.
Since computation happens in FP16, there is a chance of numerical instability during training. This is handled internally by a dynamic grad scaler which skips invalid steps and adjusts the scaler to ensure subsequent steps fall within a finite range. For more information `see the autocast docs <https://pytorch.org/docs/stable/amp.html#gradient-scaling>`__.


.. note::
@@ -69,46 +70,6 @@ In most cases, mixed precision uses FP16. Supported `PyTorch operations <https:/

Trainer(accelerator="gpu", devices=1, precision=16)
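
For readers unfamiliar with the dynamic grad scaler mentioned above, this is roughly what happens under the hood when ``precision=16`` (a hand-rolled ``torch.cuda.amp`` sketch of what Lightning does for you; requires a CUDA device):

    import torch

    model = torch.nn.Linear(4, 4).cuda()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    scaler = torch.cuda.amp.GradScaler()

    data = torch.randn(8, 4, device="cuda")
    optimizer.zero_grad()
    with torch.cuda.amp.autocast():
        loss = model(data).sum()   # forward runs in FP16 where supported

    scaler.scale(loss).backward()  # backward on the scaled loss
    scaler.step(optimizer)         # the step is skipped if gradients are inf/nan
    scaler.update()                # the scale is adjusted for the next iteration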


PyTorch Native
--------------

PyTorch 1.6 release introduced mixed precision functionality into their core as the AMP package, `torch.cuda.amp <https://pytorch.org/docs/stable/amp.html>`__. It is more flexible and intuitive compared to `NVIDIA APEX <https://github.com/NVIDIA/apex>`__.
Since computation happens in FP16, there is a chance of numerical instability during training. This is handled internally by a dynamic grad scaler which skips invalid steps and adjusts the scaler to ensure subsequent steps fall within a finite range. For more information `see the autocast docs <https://pytorch.org/docs/stable/amp.html#gradient-scaling>`__.
Lightning uses native amp by default with ``precision=16|"bf16"``. You can also set it using:

.. testcode::

Trainer(precision=16, amp_backend="native")


NVIDIA APEX
-----------

.. warning::

We strongly recommend using the above native mixed precision rather than NVIDIA APEX unless you require more refined control.

`NVIDIA APEX <https://github.com/NVIDIA/apex>`__ offers additional flexibility in setting mixed precision. This can be useful when trying out different precision configurations, such as keeping most of your weights in FP16 and running computation in FP16.

.. testcode::
:skipif: not _APEX_AVAILABLE or not torch.cuda.is_available()

Trainer(accelerator="gpu", devices=1, amp_backend="apex", precision=16)

Set the `NVIDIA optimization level <https://nvidia.github.io/apex/amp.html#opt-levels>`__ via the precision plugin.

.. testcode::
:skipif: not _APEX_AVAILABLE or not torch.cuda.is_available()

from pytorch_lightning.plugins import ApexMixedPrecisionPlugin


apex_plugin = ApexMixedPrecisionPlugin(amp_level="O3")
Trainer(accelerator="gpu", devices=1, precision=16, plugins=[apex_plugin])

----

************************
BFloat16 Mixed Precision
************************
42 changes: 0 additions & 42 deletions docs/source-pytorch/common/trainer.rst
@@ -289,27 +289,6 @@ Example::
# no accumulation for epochs 1-4. accumulate 3 for epochs 5-10. accumulate 20 after that
trainer = Trainer(accumulate_grad_batches={5: 3, 10: 20})

amp_backend
^^^^^^^^^^^

.. raw:: html

<video width="50%" max-width="400px" controls
poster="https://pl-bolts-doc-images.s3.us-east-2.amazonaws.com/pl_docs/trainer_flags/thumb/amp_backend.jpg"
src="https://pl-bolts-doc-images.s3.us-east-2.amazonaws.com/pl_docs/trainer_flags/amp_backend.mp4"></video>

|
Use PyTorch AMP ('native'), or NVIDIA apex ('apex').

.. testcode::

# using PyTorch built-in AMP, default used by the Trainer
trainer = Trainer(amp_backend="native")

# using NVIDIA Apex
trainer = Trainer(amp_backend="apex")

auto_scale_batch_size
^^^^^^^^^^^^^^^^^^^^^

@@ -1156,27 +1135,6 @@ Half precision, or mixed precision, is the combined use of 32 and 16 bit floatin

.. note:: When running on TPUs, torch.bfloat16 will be used but tensor printing will still show torch.float32.

.. admonition:: If you are interested in using Apex 16-bit training:
:class: dropdown

NVIDIA Apex and DDP have instability problems. We recommend using the native AMP for 16-bit precision with multiple GPUs.
To use Apex 16-bit training:

1. `Install apex. <https://github.com/NVIDIA/apex#quick-start>`__

2. Set the ``precision`` trainer flag to 16. You can customize the `Apex optimization level <https://nvidia.github.io/apex/amp.html#opt-levels>`_ by setting the ``amp_level`` flag
in the precision plugin.

.. testcode::
:skipif: not _APEX_AVAILABLE or not torch.cuda.is_available()

from pytorch_lightning.plugins import ApexMixedPrecisionPlugin


apex_plugin = ApexMixedPrecisionPlugin(amp_level="O2")
# turn on 16-bit
trainer = Trainer(accelerator="gpu", devices=1, precision=16, plugins=[apex_plugin])
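
With the apex path removed, 16-bit and bfloat16 training rely on native AMP alone; for illustration (consistent with the ``precision`` flag documented above):

    from pytorch_lightning import Trainer

    # FP16 mixed precision via native AMP
    trainer = Trainer(accelerator="gpu", devices=1, precision=16)

    # BFloat16 mixed precision (e.g. on Ampere GPUs or TPUs)
    trainer = Trainer(accelerator="gpu", devices=1, precision="bf16")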

profiler
^^^^^^^^

1 change: 0 additions & 1 deletion docs/source-pytorch/conf.py
@@ -398,7 +398,6 @@ def package_list_from_file(file):
from pytorch_lightning.callbacks import Callback
from pytorch_lightning.cli import _JSONARGPARSE_SIGNATURES_AVAILABLE as _JSONARGPARSE_AVAILABLE
from pytorch_lightning.utilities import (
_APEX_AVAILABLE,
_TORCHVISION_AVAILABLE,
)
from pytorch_lightning.loggers.neptune import _NEPTUNE_AVAILABLE
3 changes: 1 addition & 2 deletions docs/source-pytorch/extensions/plugins.rst
@@ -52,15 +52,14 @@ The full list of built-in precision plugins is listed below.
:nosignatures:
:template: classtemplate.rst

ApexMixedPrecisionPlugin
ColossalAIPrecisionPlugin
DeepSpeedPrecisionPlugin
DoublePrecisionPlugin
FullyShardedNativeMixedPrecisionPlugin
FullyShardedNativeNativeMixedPrecisionPlugin
HPUPrecisionPlugin
IPUPrecisionPlugin
NativeMixedPrecisionPlugin
MixedPrecisionPlugin
PrecisionPlugin
ShardedNativeMixedPrecisionPlugin
TPUBf16PrecisionPlugin
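
When finer control is needed than the ``precision`` flag, one of the plugins listed above can be passed explicitly. A sketch, assuming the renamed ``MixedPrecisionPlugin`` keeps the ``(precision, device)`` constructor of the old ``NativeMixedPrecisionPlugin``:

    from pytorch_lightning import Trainer
    from pytorch_lightning.plugins import MixedPrecisionPlugin

    # hypothetical explicit configuration, roughly equivalent to precision=16 on GPU
    precision_plugin = MixedPrecisionPlugin(precision=16, device="cuda")
    trainer = Trainer(accelerator="gpu", devices=1, plugins=[precision_plugin])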
2 changes: 1 addition & 1 deletion docs/source-pytorch/model/manual_optimization.rst
@@ -319,4 +319,4 @@ Here is an example using a closure function.
opt.step(closure=closure)

.. warning::
The :class:`~torch.optim.LBFGS` optimizer is not supported for apex AMP, native AMP, IPUs, or DeepSpeed.
The :class:`~torch.optim.LBFGS` optimizer is not supported for AMP, IPUs, or DeepSpeed.
6 changes: 3 additions & 3 deletions src/lightning_fabric/connector.py
@@ -26,7 +26,7 @@
from lightning_fabric.plugins import (
CheckpointIO,
DeepSpeedPrecision,
NativeMixedPrecision,
MixedPrecision,
Precision,
TPUBf16Precision,
TPUPrecision,
@@ -452,7 +452,7 @@ def _check_and_init_precision(self) -> Precision:
)
return TPUBf16Precision()
if isinstance(self.strategy, DeepSpeedStrategy):
return DeepSpeedPrecision(self._precision_input, amp_type="native", amp_level=None) # type: ignore
return DeepSpeedPrecision(self._precision_input) # type: ignore

if self._precision_input == 32:
return Precision()
@@ -476,7 +476,7 @@ def _check_and_init_precision(self) -> Precision:

if isinstance(self.strategy, FSDPStrategy):
return FSDPPrecision(precision=self._precision_input, device=device)
return NativeMixedPrecision(precision=self._precision_input, device=device)
return MixedPrecision(precision=self._precision_input, device=device)

raise RuntimeError("No precision set")

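
From the caller's side, the renamed plugin is now constructed with just a precision and a device, matching the connector calls above (illustrative sketch):

    from lightning_fabric.plugins import MixedPrecision

    # same arguments the connector passes for non-FSDP GPU strategies above
    amp = MixedPrecision(precision=16, device="cuda")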
4 changes: 2 additions & 2 deletions src/lightning_fabric/plugins/__init__.py
@@ -18,7 +18,7 @@
from lightning_fabric.plugins.precision.deepspeed import DeepSpeedPrecision
from lightning_fabric.plugins.precision.double import DoublePrecision
from lightning_fabric.plugins.precision.fsdp import FSDPPrecision
from lightning_fabric.plugins.precision.native_amp import NativeMixedPrecision
from lightning_fabric.plugins.precision.native_amp import MixedPrecision
from lightning_fabric.plugins.precision.precision import Precision
from lightning_fabric.plugins.precision.tpu import TPUPrecision
from lightning_fabric.plugins.precision.tpu_bf16 import TPUBf16Precision
@@ -31,7 +31,7 @@
"Precision",
"DeepSpeedPrecision",
"DoublePrecision",
"NativeMixedPrecision",
"MixedPrecision",
"TPUPrecision",
"TPUBf16Precision",
"FSDPPrecision",
4 changes: 2 additions & 2 deletions src/lightning_fabric/plugins/precision/__init__.py
@@ -14,15 +14,15 @@
from lightning_fabric.plugins.precision.deepspeed import DeepSpeedPrecision
from lightning_fabric.plugins.precision.double import DoublePrecision
from lightning_fabric.plugins.precision.fsdp import FSDPPrecision
from lightning_fabric.plugins.precision.native_amp import NativeMixedPrecision
from lightning_fabric.plugins.precision.native_amp import MixedPrecision
from lightning_fabric.plugins.precision.precision import Precision
from lightning_fabric.plugins.precision.tpu import TPUPrecision
from lightning_fabric.plugins.precision.tpu_bf16 import TPUBf16Precision

__all__ = [
"DeepSpeedPrecision",
"DoublePrecision",
"NativeMixedPrecision",
"MixedPrecision",
"Precision",
"TPUPrecision",
"TPUBf16Precision",
24 changes: 3 additions & 21 deletions src/lightning_fabric/plugins/precision/deepspeed.py
@@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Any, Optional, TYPE_CHECKING
from typing import Any, TYPE_CHECKING

import torch
from lightning_utilities.core.imports import RequirementCache
@@ -20,11 +20,10 @@

from lightning_fabric.plugins.precision.precision import Precision
from lightning_fabric.plugins.precision.utils import _convert_fp_tensor
from lightning_fabric.utilities.enums import AMPType, PrecisionType
from lightning_fabric.utilities.enums import PrecisionType
from lightning_fabric.utilities.types import Steppable

_DEEPSPEED_AVAILABLE = RequirementCache("deepspeed")
_APEX_AVAILABLE = RequirementCache("apex")
if TYPE_CHECKING and _DEEPSPEED_AVAILABLE:
import deepspeed

@@ -34,28 +33,13 @@ class DeepSpeedPrecision(Precision):
Args:
precision: Full precision (32), half precision (16) or bfloat16 precision (bf16).
amp_type: The mixed precision backend to use ("native" or "apex").
amp_level: The optimization level to use (O1, O2, etc...). By default it will be set to "O2"
if ``amp_type`` is set to "apex".
Raises:
MisconfigurationException:
If using ``bfloat16`` precision and ``deepspeed<v0.6``.
ValueError:
If unsupported ``precision`` is provided.
"""

def __init__(self, precision: Literal[16, 32, "bf16"], amp_type: str, amp_level: Optional[str] = None) -> None:
if amp_type == AMPType.APEX:
if not _APEX_AVAILABLE:
raise ModuleNotFoundError(
"You have asked for Apex AMP but `apex` is not installed."
" Install `apex` using this guide: https://github.com/NVIDIA/apex"
)

amp_level = amp_level or "O2"

def __init__(self, precision: Literal[16, 32, "bf16"]) -> None:
supported_precision = (PrecisionType.HALF, PrecisionType.FLOAT, PrecisionType.BFLOAT)
if precision not in supported_precision:
raise ValueError(
@@ -65,8 +49,6 @@ def __init__(self, precision: Literal[16, 32, "bf16"], amp_type: str, amp_level:

super().__init__()
self.precision = precision
self.amp_type = amp_type
self.amp_level = amp_level

def convert_input(self, data: Tensor) -> Tensor:
precision_to_type = {"bf16": torch.bfloat16, 16: torch.float16, 32: torch.float32}
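
A minimal sketch of the slimmed-down plugin (illustrative; ``amp_type``/``amp_level`` are gone and only 16, 32 and "bf16" are accepted):

    import torch

    from lightning_fabric.plugins import DeepSpeedPrecision

    plugin = DeepSpeedPrecision(16)               # no amp_type / amp_level arguments anymore
    x = plugin.convert_input(torch.randn(2, 2))   # floating-point inputs are cast to the target dtype
    assert x.dtype == torch.float16

    # DeepSpeedPrecision(64) would raise ValueError: only 16, 32 and "bf16" are supported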
12 changes: 6 additions & 6 deletions src/lightning_fabric/plugins/precision/fsdp.py
@@ -16,16 +16,16 @@
import torch
from typing_extensions import Literal

from lightning_fabric.plugins.precision.native_amp import NativeMixedPrecision
from lightning_fabric.plugins.precision.native_amp import MixedPrecision
from lightning_fabric.utilities.enums import PrecisionType
from lightning_fabric.utilities.imports import _TORCH_GREATER_EQUAL_1_12

if TYPE_CHECKING:
from torch.distributed.fsdp.fully_sharded_data_parallel import MixedPrecision
from torch.distributed.fsdp.fully_sharded_data_parallel import MixedPrecision as TorchMixedPrecision
from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler


class FSDPPrecision(NativeMixedPrecision):
class FSDPPrecision(MixedPrecision):
"""AMP for Fully Sharded Data Parallel training."""

def __init__(
@@ -43,16 +43,16 @@ def __init__(
)

@property
def mixed_precision_config(self) -> "MixedPrecision":
from torch.distributed.fsdp.fully_sharded_data_parallel import MixedPrecision
def mixed_precision_config(self) -> "TorchMixedPrecision":
from torch.distributed.fsdp.fully_sharded_data_parallel import MixedPrecision as TorchMixedPrecision

if self.precision == PrecisionType.HALF:
dtype = torch.float16
elif self.precision == PrecisionType.BFLOAT:
dtype = torch.bfloat16
else:
raise ValueError(f"Was unable to infer precision type, received {self.precision!r}.")
return MixedPrecision(
return TorchMixedPrecision(
param_dtype=dtype,
reduce_dtype=dtype,
buffer_dtype=dtype,
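
For illustration, the property above can be inspected directly (a sketch; assumes torch>=1.12 so that ``torch.distributed.fsdp`` is importable, and mirrors the ``FSDPPrecision(precision=..., device=...)`` call in the connector):

    import torch

    from lightning_fabric.plugins import FSDPPrecision

    plugin = FSDPPrecision(precision=16, device="cuda")
    config = plugin.mixed_precision_config  # a torch.distributed.fsdp MixedPrecision instance
    assert config.param_dtype == config.reduce_dtype == config.buffer_dtype == torch.float16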
