Add Bagua Strategy #11146

Merged
merged 89 commits into master from wangraying:bagua-plugin on Feb 4, 2022
Changes from 84 commits
Commits
89 commits
b231a81
init commit
wangraying Dec 14, 2021
0db1584
.
wangraying Dec 15, 2021
33646db
support qadam and remove bagua algorithm enum
wangraying Dec 16, 2021
433e2ed
add bagua communication api
wangraying Dec 17, 2021
b3b23ff
allow multiple wraps
wangraying Dec 17, 2021
709d311
remove
wangraying Dec 17, 2021
a9472bd
.
wangraying Dec 17, 2021
d351050
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 17, 2021
39ddbb9
update reduce
wangraying Dec 17, 2021
174a8c5
Merge branch 'bagua-plugin' of https://github.com/wangraying/pytorch-…
wangraying Dec 17, 2021
cac63db
minor fix
wangraying Dec 18, 2021
cd75ee9
Merge branch 'master' into bagua-plugin
awaelchli Dec 23, 2021
f390d3a
update bagua to strategy api
awaelchli Dec 23, 2021
ff9ed13
merge pre_dispatch and setup()
awaelchli Dec 23, 2021
110b6e8
move start_training to setup()
awaelchli Dec 23, 2021
71eff2a
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 23, 2021
d2c1bf2
fix undefined import
awaelchli Dec 23, 2021
d5b03d8
update for bagua strategy
wangraying Dec 27, 2021
9f3df78
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 27, 2021
ad8aac5
update doc
wangraying Dec 27, 2021
385020c
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 27, 2021
2d6b8c9
ci
wangraying Dec 30, 2021
10f2fff
.
wangraying Dec 30, 2021
32f2e12
Apply suggestions from code review
wangraying Jan 4, 2022
0e2e839
Merge branch 'bagua-plugin' of https://github.com/wangraying/pytorch-…
wangraying Jan 4, 2022
466fa52
add tests
wangraying Jan 4, 2022
3b48243
Merge branch 'bagua-plugin' of https://github.com/wangraying/pytorch-…
wangraying Jan 4, 2022
66e9a2c
update ci
wangraying Jan 4, 2022
6ec6990
Merge branch 'master' into bagua-plugin
wangraying Jan 4, 2022
c2d3b40
update to master
wangraying Jan 4, 2022
7319a5f
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 4, 2022
de3cffd
fix mypy
wangraying Jan 4, 2022
65097de
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 4, 2022
69ccbd8
ci
wangraying Jan 4, 2022
90f4ff4
Merge branch 'bagua-plugin' of https://github.com/wangraying/pytorch-…
wangraying Jan 4, 2022
d9c7e9c
update test
wangraying Jan 4, 2022
9432714
add assertion to test
wangraying Jan 5, 2022
aa622cf
Merge branch 'master' into bagua-plugin
wangraying Jan 5, 2022
b0568f8
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 5, 2022
d7966a7
small update
wangraying Jan 6, 2022
fc321ee
small update
wangraying Jan 6, 2022
42d5baa
Merge branch 'master' into bagua-plugin
wangraying Jan 17, 2022
53f3a28
fix for ci failure
wangraying Jan 17, 2022
19732f1
for ci
wangraying Jan 17, 2022
0d92130
fmt
wangraying Jan 17, 2022
94786b8
add doc
wangraying Jan 18, 2022
a4e2332
update
wangraying Jan 18, 2022
dbe9268
.
wangraying Jan 18, 2022
5cf1bce
.
wangraying Jan 18, 2022
76a1350
refine doc
wangraying Jan 18, 2022
c498608
update doc
wangraying Jan 18, 2022
9e4abe0
Apply suggestions from code review
wangraying Jan 18, 2022
54a30b2
Update docs/source/accelerators/gpu.rst
wangraying Jan 18, 2022
90b9405
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 18, 2022
38e665d
update doc
wangraying Jan 18, 2022
b0a9f51
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 18, 2022
3815fc7
add test
wangraying Jan 18, 2022
202c3bd
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 18, 2022
b8aad03
ci
wangraying Jan 18, 2022
0ee1802
Merge branch 'bagua-plugin' of https://github.com/wangraying/pytorch-…
wangraying Jan 18, 2022
fc3f2c8
Update docs/source/accelerators/gpu.rst
wangraying Jan 18, 2022
75b548f
update doc
wangraying Jan 18, 2022
062ffc1
add brief intro
wangraying Jan 18, 2022
3bae9ed
fix test
wangraying Jan 18, 2022
c251e74
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 18, 2022
fc6d415
typo
wangraying Jan 18, 2022
7d74281
Apply suggestions from code review
wangraying Jan 19, 2022
0f0f1bc
apply suggestions
wangraying Jan 19, 2022
78c4f6b
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 19, 2022
e13e794
ci
wangraying Jan 19, 2022
53a0ad0
Apply suggestions from code review
wangraying Jan 19, 2022
54e052e
Apply suggestions from code review
wangraying Jan 20, 2022
997d22d
update doc and code
wangraying Jan 20, 2022
b2df6b1
Merge branch 'master' into bagua-plugin
wangraying Jan 21, 2022
8dcb9d0
fix for refactor
wangraying Jan 21, 2022
de55851
move property to top
awaelchli Jan 24, 2022
c840157
fix typo
awaelchli Jan 24, 2022
a1660ab
update
wangraying Jan 25, 2022
2812635
Update pytorch_lightning/strategies/bagua.py
wangraying Jan 25, 2022
8ccd249
Merge remote-tracking branch 'origin/master' into bagua-plugin
wangraying Jan 25, 2022
f378851
Merge branch 'master' into bagua-plugin
carmocca Feb 3, 2022
a05b745
minor updates
awaelchli Feb 4, 2022
dce2d8a
Merge branch 'master' into bagua-plugin
awaelchli Feb 4, 2022
347dd9d
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 4, 2022
0a1b278
fix
awaelchli Feb 4, 2022
716adf6
Merge branch 'bagua-plugin' of github.com:wangraying/pytorch-lightnin…
awaelchli Feb 4, 2022
0881ffb
minor changes
wangraying Feb 4, 2022
29aab45
Update RunIf
carmocca Feb 4, 2022
8d19cbe
Remove unnecessary function
carmocca Feb 4, 2022
1 change: 1 addition & 0 deletions .azure-pipelines/gpu-tests.yml
@@ -52,6 +52,7 @@ jobs:
python -c "fname = 'requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if 'horovod' not in line] ; open(fname, 'w').writelines(lines)"
pip install fairscale==0.4.0
pip install deepspeed==0.5.7
pip install bagua-cuda102==0.9.0
pip install . --requirement requirements/devel.txt
pip list
displayName: 'Install dependencies'
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -86,6 +86,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Added a warning when using `DistributedSampler` during validation/testing ([#11479](https://github.com/PyTorchLightning/pytorch-lightning/pull/11479))


- Added support for `Bagua` training strategy ([#11146](https://github.com/PyTorchLightning/pytorch-lightning/pull/11146))


### Changed

- Set the `prog_bar` flag to False in `LightningModule.log_grad_norm` ([#11472](https://github.com/PyTorchLightning/pytorch-lightning/pull/11472))
122 changes: 122 additions & 0 deletions docs/source/accelerators/gpu.rst
@@ -282,6 +282,7 @@ Lightning allows multiple ways of training
- DistributedDataParallel (``strategy='ddp_spawn'``) (multiple-gpus across many machines (spawn based)).
- DistributedDataParallel 2 (``strategy='ddp2'``) (DP in a machine, DDP across machines).
- Horovod (``strategy='horovod'``) (multi-machine, multi-gpu, configured at runtime)
- Bagua (``strategy='bagua'``) (multiple-gpus across many machines with advanced training algorithms)
- TPUs (``tpu_cores=8|x``) (tpu or TPU pod)

.. note::
@@ -489,6 +490,127 @@ number of worker processes:
See the official `Horovod documentation <https://horovod.readthedocs.io/en/stable>`_ for details
on installation and performance tuning.


Bagua
^^^^^
`Bagua <https://github.com/BaguaSys/bagua>`_ is a deep learning training acceleration framework which supports
multiple advanced distributed training algorithms including:

- `Gradient AllReduce <https://tutorials.baguasys.com/algorithms/gradient-allreduce>`_ for centralized synchronous communication, where gradients are averaged among all workers.
- `Decentralized SGD <https://tutorials.baguasys.com/algorithms/decentralized>`_ for decentralized synchronous communication, where each worker exchanges data with one or a few specific workers.
- `ByteGrad <https://tutorials.baguasys.com/algorithms/bytegrad>`_ and `QAdam <https://tutorials.baguasys.com/algorithms/q-adam>`_ for low precision communication, where data is compressed into low precision before communication.
- `Asynchronous Model Average <https://tutorials.baguasys.com/algorithms/async-model-average>`_ for asynchronous communication, where workers are not required to be synchronized in the same iteration in a lock-step style.

By default, Bagua uses the *Gradient AllReduce* algorithm, which is also the algorithm implemented in Distributed Data Parallel and Horovod,
but Bagua can usually achieve higher training throughput thanks to its backend written in Rust.

.. code-block:: python

# train on 4 GPUs (using Bagua mode)
trainer = Trainer(strategy="bagua", accelerator="gpu", devices=4)


By specifying the ``algorithm`` argument of ``BaguaStrategy``, you can select the more advanced training algorithms offered by Bagua:


.. code-block:: python

# train on 4 GPUs, using Bagua Gradient AllReduce algorithm
trainer = Trainer(
    strategy=BaguaStrategy(algorithm="gradient_allreduce"),
    accelerator="gpu",
    devices=4,
)

# train on 4 GPUs, using Bagua ByteGrad algorithm
trainer = Trainer(
    strategy=BaguaStrategy(algorithm="bytegrad"),
    accelerator="gpu",
    devices=4,
)

# train on 4 GPUs, using Bagua Decentralized SGD
trainer = Trainer(
    strategy=BaguaStrategy(algorithm="decentralized"),
    accelerator="gpu",
    devices=4,
)

# train on 4 GPUs, using Bagua Low Precision Decentralized SGD
trainer = Trainer(
    strategy=BaguaStrategy(algorithm="low_precision_decentralized"),
    accelerator="gpu",
    devices=4,
)

# train on 4 GPUs, using Asynchronous Model Average algorithm, with a synchronization interval of 100ms
trainer = Trainer(
strategy=BaguaStrategy(algorithm="async", sync_interval_ms=100),
accelerator="gpu",
devices=4,
)

To use *QAdam*, we need to initialize
`QAdamOptimizer <https://bagua.readthedocs.io/en/latest/autoapi/bagua/torch_api/algorithms/q_adam/index.html#bagua.torch_api.algorithms.q_adam.QAdamOptimizer>`_ first:

.. code-block:: python

import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.strategies import BaguaStrategy
from bagua.torch_api.algorithms.q_adam import QAdamOptimizer


class MyModel(pl.LightningModule):
    ...

    def configure_optimizers(self):
        # initialize QAdam Optimizer
        return QAdamOptimizer(self.parameters(), lr=0.05, warmup_steps=100)


model = MyModel()
trainer = Trainer(
    accelerator="gpu",
    devices=4,
    strategy=BaguaStrategy(algorithm="qadam"),
)
trainer.fit(model)

Bagua relies on its own `launcher <https://tutorials.baguasys.com/getting-started/#launch-job>`_ to schedule jobs.
The examples below use ``bagua.distributed.launch``, which follows the ``torch.distributed.launch`` API:

.. code-block:: bash

# start training with 8 GPUs on a single node
python -m bagua.distributed.launch --nproc_per_node=8 train.py

# Run on node1 to start training on two nodes (node1 and node2), 8 GPUs per node
python -m bagua.distributed.launch --nproc_per_node=8 --nnodes=2 --node_rank=0 --master_addr=hostname1 --master_port=port1 train.py

# Run on node2 to start training on two nodes (node1 and node2), 8 GPUs per node
python -m bagua.distributed.launch --nproc_per_node=8 --nnodes=2 --node_rank=1 --master_addr=hostname1 --master_port=port1 train.py


If an SSH service with passwordless login is available on each node, you can launch the distributed job from a
single node with ``baguarun``, which has a syntax similar to ``mpirun``. When starting the job, ``baguarun``
automatically spawns new processes on each training node listed in the ``--host_list`` option, where each node
is described as an IP address followed by an SSH port.

.. code-block:: bash

# Run on node1 (or node2) to start training on two nodes (node1 and node2), 8 GPUs per node
baguarun --host_list hostname1:ssh_port1,hostname2:ssh_port2 --nproc_per_node=8 --master_port=port1 train.py


.. note:: You can also start training in the same way as with Distributed Data Parallel. However, system optimizations like
`Bagua-Net <https://tutorials.baguasys.com/more-optimizations/bagua-net>`_ and
`Performance autotuning <https://tutorials.baguasys.com/performance-autotuning/>`_ can only be enabled through the Bagua
launcher. Notably, with ``Bagua-Net``, even plain Distributed Data Parallel can achieve
better performance without modifying the training script.
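
For illustration, a hedged sketch of enabling ``Bagua-Net`` through the Bagua launcher. The ``--enable_bagua_net`` flag name is an assumption taken from the Bagua-Net tutorial, so verify it against your installed Bagua version:

.. code-block:: bash

    # Sketch only: the flag name below is assumed from the Bagua-Net tutorial;
    # confirm the exact option with `python -m bagua.distributed.launch --help`.
    python -m bagua.distributed.launch --enable_bagua_net --nproc_per_node=8 train.py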


See `Bagua Tutorials <https://tutorials.baguasys.com/>`_ for more details on installation and advanced features.


DP/DDP2 caveats
^^^^^^^^^^^^^^^
In DP and DDP2 each GPU within a machine sees a portion of a batch.
1 change: 1 addition & 0 deletions docs/source/api_references.rst
@@ -43,6 +43,7 @@ Strategy API
:nosignatures:
:template: classtemplate.rst

BaguaStrategy
DDP2Strategy
DDPFullyShardedStrategy
DDPShardedStrategy
1 change: 1 addition & 0 deletions docs/source/extensions/plugins.rst
@@ -107,6 +107,7 @@ Training Strategies
DDPShardedStrategy
DDPSpawnShardedStrategy
DDPSpawnStrategy
BaguaStrategy
DeepSpeedStrategy
HorovodStrategy
SingleTPUStrategy
1 change: 1 addition & 0 deletions pytorch_lightning/plugins/environments/__init__.py
@@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pytorch_lightning.plugins.environments.bagua_environment import BaguaEnvironment # noqa: F401
from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment # noqa: F401
from pytorch_lightning.plugins.environments.kubeflow_environment import KubeflowEnvironment # noqa: F401
from pytorch_lightning.plugins.environments.lightning_environment import LightningEnvironment # noqa: F401
62 changes: 62 additions & 0 deletions pytorch_lightning/plugins/environments/bagua_environment.py
@@ -0,0 +1,62 @@
# Copyright The PyTorch Lightning team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import os

from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment

log = logging.getLogger(__name__)


class BaguaEnvironment(ClusterEnvironment):
    """Environment for distributed training with `Bagua <https://tutorials.baguasys.com/>`_"""

    @property
    def creates_processes_externally(self) -> bool:
        return True

    @property
    def main_address(self) -> str:
        return os.environ.get("MASTER_ADDR", "127.0.0.1")

    @property
    def main_port(self) -> int:
        return int(os.environ.get("MASTER_PORT", -1))

    @property
    def service_port(self) -> int:
        return int(os.environ.get("BAGUA_SERVICE_PORT", -1))

    @staticmethod
    def detect() -> bool:
        return "BAGUA_SERVICE_PORT" in os.environ

    def world_size(self) -> int:
        return int(os.environ["WORLD_SIZE"])

    def set_world_size(self, size: int) -> None:
        log.debug("`BaguaEnvironment.set_world_size` was called, but setting world size is not allowed. Ignored.")

    def global_rank(self) -> int:
        return int(os.environ["RANK"])

    def set_global_rank(self, rank: int) -> None:
        log.debug("`BaguaEnvironment.set_global_rank` was called, but setting global rank is not allowed. Ignored.")

    def local_rank(self) -> int:
        return int(os.environ.get("LOCAL_RANK", 0))

    def node_rank(self) -> int:
        return int(os.environ.get("NODE_RANK", 0))
1 change: 1 addition & 0 deletions pytorch_lightning/strategies/__init__.py
@@ -1,5 +1,6 @@
from pathlib import Path

from pytorch_lightning.strategies.bagua import BaguaStrategy # noqa: F401
from pytorch_lightning.strategies.ddp import DDPStrategy # noqa: F401
from pytorch_lightning.strategies.ddp2 import DDP2Strategy # noqa: F401
from pytorch_lightning.strategies.ddp_spawn import DDPSpawnStrategy # noqa: F401