From 084f2a99d4c8d3a51b9a7e3133de133ff825db5f Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Tue, 14 Jun 2022 11:24:04 +0000 Subject: [PATCH 1/7] add sparse SyncBatchNorm --- .../paddle/incubate/sparse/nn/layer/norm.py | 50 +++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/python/paddle/incubate/sparse/nn/layer/norm.py b/python/paddle/incubate/sparse/nn/layer/norm.py index 4d4cf7df2f2e4..cc094487c465c 100644 --- a/python/paddle/incubate/sparse/nn/layer/norm.py +++ b/python/paddle/incubate/sparse/nn/layer/norm.py @@ -157,3 +157,53 @@ def forward(self, input): batch_norm_out, shape=input.shape, stop_gradient=input.stop_gradient) + + +class SyncBatchNorm(paddle.nn.SyncBatchNorm): + + def __init__(self, + num_features, + momentum=0.9, + epsilon=1e-05, + weight_attr=None, + bias_attr=None, + data_format='NCHW', + name=None): + super(SyncBatchNorm, + self).__init__(num_features, momentum, epsilon, weight_attr, + bias_attr, data_format, None, name) + + def forward(self, x): + out = super(SyncBatchNorm, self).forward(x.values()) + return paddle.incubate.sparse.sparse_coo_tensor( + x.indices(), out, shape=x.shape, stop_gradient=x.stop_gradient) + + @classmethod + def convert_sync_batchnorm(cls, layer): + layer_output = layer + if isinstance(layer, _BatchNormBase): + if layer._weight_attr != None and not isinstance( + layer._weight_attr, + bool) and layer._weight_attr.name != None: + layer._weight_attr.name = layer._weight_attr.name + '_sync' + if layer._bias_attr != None and not isinstance( + layer._bias_attr, bool) and layer._bias_attr.name != None: + layer._bias_attr.name = layer._bias_attr.name + '_sync' + + layer_output = SyncBatchNorm(layer._num_features, layer._momentum, + layer._epsilon, layer._weight_attr, + layer._bias_attr, layer._data_format, + layer._name) + + if layer._weight_attr != False and layer._bias_attr != False: + with no_grad(): + layer_output.weight = layer.weight + layer_output.bias = layer.bias + layer_output._mean = layer._mean + layer_output._variance = layer._variance + + for name, sublayer in layer.named_children(): + layer_output.add_sublayer(name, + cls.convert_sync_batchnorm(sublayer)) + del layer + return layer_output From f8c44d242c925fef56e70f81fbd1ed1c17f24539 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Tue, 14 Jun 2022 11:58:26 +0000 Subject: [PATCH 2/7] add sparse SyncBatchNorm --- python/paddle/incubate/sparse/nn/__init__.py | 3 ++- python/paddle/incubate/sparse/nn/layer/norm.py | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/python/paddle/incubate/sparse/nn/__init__.py b/python/paddle/incubate/sparse/nn/__init__.py index be4985e694b4b..e0d9b0d6524f8 100644 --- a/python/paddle/incubate/sparse/nn/__init__.py +++ b/python/paddle/incubate/sparse/nn/__init__.py @@ -15,7 +15,7 @@ from . 
import functional from .layer.activation import ReLU -from .layer.norm import BatchNorm +from .layer.norm import BatchNorm, SyncBatchNorm from .layer.conv import Conv3D from .layer.conv import SubmConv3D from .layer.pooling import MaxPool3D @@ -23,6 +23,7 @@ __all__ = [ 'ReLU', 'BatchNorm', + 'SyncBatchNorm', 'Conv3D', 'SubmConv3D', 'MaxPool3D', diff --git a/python/paddle/incubate/sparse/nn/layer/norm.py b/python/paddle/incubate/sparse/nn/layer/norm.py index cc094487c465c..5a35aaabe0076 100644 --- a/python/paddle/incubate/sparse/nn/layer/norm.py +++ b/python/paddle/incubate/sparse/nn/layer/norm.py @@ -27,6 +27,8 @@ import paddle import warnings +from paddle.nn.layer.norm import _BatchNormBase +from paddle.framework import no_grad class BatchNorm(paddle.nn.BatchNorm1D): @@ -171,7 +173,7 @@ def __init__(self, name=None): super(SyncBatchNorm, self).__init__(num_features, momentum, epsilon, weight_attr, - bias_attr, data_format, None, name) + bias_attr, data_format, name) def forward(self, x): out = super(SyncBatchNorm, self).forward(x.values()) From 0b27ba5ab85856aaade58f1c6e17f9a0a4a63d9a Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Tue, 14 Jun 2022 12:26:37 +0000 Subject: [PATCH 3/7] test SyncBatchNorm --- .../tests/unittests/test_sparse_norm_op.py | 27 +++++++++++++++---- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_sparse_norm_op.py b/python/paddle/fluid/tests/unittests/test_sparse_norm_op.py index 8eccefed6ef64..3283d6317a620 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_norm_op.py @@ -16,6 +16,7 @@ import unittest import numpy as np import paddle +from paddle.incubate.sparse import nn import paddle.fluid as fluid from paddle.fluid.framework import _test_eager_guard import copy @@ -56,11 +57,10 @@ def test(self): # test backward sparse_y.backward(sparse_y) - assert np.allclose( - dense_x.grad.flatten().numpy(), - sparse_x.grad.values().flatten().numpy(), - atol=1e-5, - rtol=1e-5) + assert np.allclose(dense_x.grad.flatten().numpy(), + sparse_x.grad.values().flatten().numpy(), + atol=1e-5, + rtol=1e-5) fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) def test_error_layout(self): @@ -86,5 +86,22 @@ def test2(self): # [1, 6, 6, 6, 3] +class TestConvertSyncBatchNorm(unittest.TestCase): + + def test_convert(self): + base_model = paddle.nn.Sequential(nn.Conv3D(3, 5, 3), nn.BatchNorm(5), + nn.BatchNorm(5)) + + model = paddle.nn.Sequential( + nn.Conv3D(3, 5, 3), nn.BatchNorm(5), + nn.BatchNorm(5, + weight_attr=fluid.ParamAttr(name='bn.scale'), + bias_attr=fluid.ParamAttr(name='bn.bias'))) + model = nn.SyncBatchNorm.convert_sync_batchnorm(model) + for idx, sublayer in enumerate(base_model.sublayers()): + if isinstance(sublayer, nn.BatchNorm): + self.assertEqual(isinstance(model[idx], nn.SyncBatchNorm), True) + + if __name__ == "__main__": unittest.main() From d1b8e39b8af94973f9372711717b9a2df442d3b7 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Tue, 26 Jul 2022 09:00:20 +0000 Subject: [PATCH 4/7] SyncBatchNorm distinguishes between dense and sparse --- python/paddle/incubate/sparse/nn/layer/norm.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/python/paddle/incubate/sparse/nn/layer/norm.py b/python/paddle/incubate/sparse/nn/layer/norm.py index 5a35aaabe0076..7fa63dc9965d4 100644 --- a/python/paddle/incubate/sparse/nn/layer/norm.py +++ b/python/paddle/incubate/sparse/nn/layer/norm.py @@ -192,10 +192,19 @@ def 
convert_sync_batchnorm(cls, layer): layer._bias_attr, bool) and layer._bias_attr.name != None: layer._bias_attr.name = layer._bias_attr.name + '_sync' - layer_output = SyncBatchNorm(layer._num_features, layer._momentum, - layer._epsilon, layer._weight_attr, - layer._bias_attr, layer._data_format, - layer._name) + #sparse + if isinstance(layer, BatchNorm): + layer_output = SyncBatchNorm(layer._num_features, + layer._momentum, layer._epsilon, + layer._weight_attr, + layer._bias_attr, + layer._data_format, layer._name) + #dense + else: + layer_output = paddle.nn.SyncBatchNorm( + layer._num_features, layer._momentum, layer._epsilon, + layer._weight_attr, layer._bias_attr, layer._data_format, + layer._name) if layer._weight_attr != False and layer._bias_attr != False: with no_grad(): From ef0fe0819ddfd66d6123b9a9c6c8090ab0537afb Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Wed, 27 Jul 2022 07:54:12 +0000 Subject: [PATCH 5/7] add doc --- .../paddle/incubate/sparse/nn/layer/norm.py | 110 +++++++++++++++++- 1 file changed, 108 insertions(+), 2 deletions(-) diff --git a/python/paddle/incubate/sparse/nn/layer/norm.py b/python/paddle/incubate/sparse/nn/layer/norm.py index 7fa63dc9965d4..3229fe86f04da 100644 --- a/python/paddle/incubate/sparse/nn/layer/norm.py +++ b/python/paddle/incubate/sparse/nn/layer/norm.py @@ -162,6 +162,91 @@ def forward(self, input): class SyncBatchNorm(paddle.nn.SyncBatchNorm): + r""" + This interface is used to construct a callable object of the ``SyncBatchNorm`` class. + It implements the function of the Cross-GPU Synchronized Batch Normalization Layer, and can + be used as a normalizer function for other operations, such as conv2d and fully connected + operations. + The data is normalized by the mean and variance of the channel based on whole mini-batch + , which including data in all gpus. + Refer to `Batch Normalization: Accelerating Deep Network Training by Reducing + Internal Covariate Shift `_ + for more details. + + When model in training mode, the :math:`\\mu_{\\beta}` + and :math:`\\sigma_{\\beta}^{2}` are the statistics of whole mini-batch data in all gpus. + Calculated as follows: + + .. math:: + + \mu_{\beta} &\gets \frac{1}{m} \sum_{i=1}^{m} x_i \qquad &//\ + \ mini-batch\ mean \\ + \sigma_{\beta}^{2} &\gets \frac{1}{m} \sum_{i=1}^{m}(x_i - \ + \mu_{\beta})^2 \qquad &//\ mini-batch\ variance \\ + + - :math:`x` : whole mini-batch data in all gpus + - :math:`m` : the size of the whole mini-batch data + + When model in evaluation mode, the :math:`\\mu_{\\beta}` + and :math:`\sigma_{\beta}^{2}` are global statistics (moving_mean and moving_variance, + which usually got from the pre-trained model). Global statistics calculated as follows: + + .. math:: + moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global \ mean \\ + moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global \ variance \\ + + The formula of normalization is as follows: + + .. 
math:: + + \hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{\ + \sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\ + y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift + + - :math:`\epsilon` : add a smaller value to the variance to prevent division by zero + - :math:`\gamma` : trainable scale parameter vector + - :math:`\beta` : trainable shift parameter vector + + Note: + If you want to use container to pack your model and has ``SyncBatchNorm`` in the + evaluation phase, please use ``nn.LayerList`` or ``nn.Sequential`` instead of + ``list`` to pack the model. + + Parameters: + num_features(int): Indicate the number of channels of the input ``Tensor``. + epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5. + momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. + weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale` + of this layer. If it is set to None or one attribute of ParamAttr, this layerr + will create ParamAttr as param_attr. If the Initializer of the param_attr + is not set, the parameter is initialized with Xavier. If it is set to False, + this layer will not have trainable scale parameter. Default: None. + bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of this layer. + If it is set to None or one attribute of ParamAttr, this layer + will create ParamAttr as bias_attr. If the Initializer of the bias_attr + is not set, the bias is initialized zero. If it is set to False, this layer will not + have trainable bias parameter. Default: None. + + Shapes: + input: Tensor that the dimension from 2 to 5. + output: Tensor with the same shape as input. + + Examples: + .. code-block:: python + + import paddle + import paddle.incubate.sparse.nn as nn + import numpy as np + + x = np.array([[[[0.3, 0.4], [0.3, 0.07]], [[0.83, 0.37], [0.18, 0.93]]]]).astype('float32') + x = paddle.to_tensor(x) + + if paddle.is_compiled_with_cuda(): + sync_batch_norm = nn.SyncBatchNorm(2) + hidden1 = sync_batch_norm(x) + print(hidden1) + # [[[[0.26824948, 1.0936325],[0.26824948, -1.6301316]],[[ 0.8095662, -0.665287],[-1.2744656, 1.1301866 ]]]] + """ def __init__(self, num_features, @@ -176,12 +261,33 @@ def __init__(self, bias_attr, data_format, name) def forward(self, x): + assert x.is_sparse_coo( + ), "SyncBatchNorm only support SparseTensor in COO format." out = super(SyncBatchNorm, self).forward(x.values()) return paddle.incubate.sparse.sparse_coo_tensor( x.indices(), out, shape=x.shape, stop_gradient=x.stop_gradient) @classmethod def convert_sync_batchnorm(cls, layer): + """ + Helper function to convert :class: `paddle.incubate.sparse.nn.BatchNorm` layers in the model to :class: `paddle.incubate.sparse.nn.SyncBatchNorm` layers. + + Parameters: + layer(paddle.nn.Layer): model containing one or more `BatchNorm` layers. + + Returns: + The original model with converted SyncBatchNorm layers. If BatchNorm layer in the model, use SyncBatchNorm layer instead. + + Examples: + + .. 
code-block:: python + import paddle + import paddle.incubate.sparse.nn as nn + + model = paddle.nn.Sequential(nn.Conv3D(3, 5, 3), nn.BatchNorm(5)) + sync_model = nn.SyncBatchNorm.convert_sync_batchnorm(model) + + """ layer_output = layer if isinstance(layer, _BatchNormBase): if layer._weight_attr != None and not isinstance( @@ -192,14 +298,14 @@ def convert_sync_batchnorm(cls, layer): layer._bias_attr, bool) and layer._bias_attr.name != None: layer._bias_attr.name = layer._bias_attr.name + '_sync' - #sparse + #convert sparse BatchNorm if isinstance(layer, BatchNorm): layer_output = SyncBatchNorm(layer._num_features, layer._momentum, layer._epsilon, layer._weight_attr, layer._bias_attr, layer._data_format, layer._name) - #dense + #convert dense BatchNorm else: layer_output = paddle.nn.SyncBatchNorm( layer._num_features, layer._momentum, layer._epsilon, From af7d68d3e12aa7cb71279e0a9979951e15c37e76 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Thu, 28 Jul 2022 08:30:16 +0000 Subject: [PATCH 6/7] add unittest --- .../fluid/tests/unittests/test_sparse_norm_op.py | 14 +++++++++++++- python/paddle/incubate/sparse/nn/layer/norm.py | 10 +++++++++- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_sparse_norm_op.py b/python/paddle/fluid/tests/unittests/test_sparse_norm_op.py index 3283d6317a620..31d3e380c5196 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_norm_op.py @@ -86,7 +86,19 @@ def test2(self): # [1, 6, 6, 6, 3] -class TestConvertSyncBatchNorm(unittest.TestCase): +class TestSyncBatchNorm(unittest.TestCase): + + def test_sync_batch_norm(self): + with _test_eager_guard(): + x = np.array([[[[0.3, 0.4], [0.3, 0.07]], + [[0.83, 0.37], [0.18, 0.93]]]]).astype('float32') + x = paddle.to_tensor(x) + x = x.to_sparse_coo(len(x.shape) - 1) + + if paddle.is_compiled_with_cuda(): + sync_batch_norm = nn.SyncBatchNorm(2) + hidden1 = sync_batch_norm(x) + print(hidden1) def test_convert(self): base_model = paddle.nn.Sequential(nn.Conv3D(3, 5, 3), nn.BatchNorm(5), diff --git a/python/paddle/incubate/sparse/nn/layer/norm.py b/python/paddle/incubate/sparse/nn/layer/norm.py index 3229fe86f04da..4d8c1380a6e69 100644 --- a/python/paddle/incubate/sparse/nn/layer/norm.py +++ b/python/paddle/incubate/sparse/nn/layer/norm.py @@ -240,12 +240,20 @@ class SyncBatchNorm(paddle.nn.SyncBatchNorm): x = np.array([[[[0.3, 0.4], [0.3, 0.07]], [[0.83, 0.37], [0.18, 0.93]]]]).astype('float32') x = paddle.to_tensor(x) + x = x.to_sparse_coo(len(x.shape)-1) if paddle.is_compiled_with_cuda(): sync_batch_norm = nn.SyncBatchNorm(2) hidden1 = sync_batch_norm(x) print(hidden1) - # [[[[0.26824948, 1.0936325],[0.26824948, -1.6301316]],[[ 0.8095662, -0.665287],[-1.2744656, 1.1301866 ]]]] + # Tensor(shape=[1, 2, 2, 2], dtype=paddle.float32, place=Place(gpu:0), stop_gradient=True, + # indices=[[0, 0, 0, 0], + # [0, 0, 1, 1], + # [0, 1, 0, 1]], + # values=[[-0.40730840, -0.13725480], + # [-0.40730840, -1.20299828], + # [ 1.69877410, -0.23414057], + # [-0.88415730, 1.57439375]]) """ def __init__(self, From bf899b1b84c01698f89681739fd9896456e4c207 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Fri, 29 Jul 2022 03:01:04 +0000 Subject: [PATCH 7/7] fix example --- python/paddle/incubate/sparse/nn/layer/norm.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/paddle/incubate/sparse/nn/layer/norm.py b/python/paddle/incubate/sparse/nn/layer/norm.py index 4d8c1380a6e69..2dbefcd4bfedc 100644 --- 
a/python/paddle/incubate/sparse/nn/layer/norm.py +++ b/python/paddle/incubate/sparse/nn/layer/norm.py @@ -234,6 +234,7 @@ class SyncBatchNorm(paddle.nn.SyncBatchNorm): Examples: .. code-block:: python + # required: gpu import paddle import paddle.incubate.sparse.nn as nn import numpy as np
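
A minimal usage sketch assembled from the docstring and unit-test examples in the patches above, for reviewers who want to exercise the new API end to end. It assumes a CUDA build of Paddle that already includes this change (``paddle.incubate.sparse.nn.SyncBatchNorm`` and ``convert_sync_batchnorm``); the printed values are illustrative only, and actual cross-GPU synchronization additionally requires a multi-GPU ``paddle.distributed`` launch.

.. code-block:: python

    # Sketch based on the docstring and test examples in this patch series;
    # assumes a CUDA build of Paddle containing paddle.incubate.sparse.nn.
    import numpy as np
    import paddle
    import paddle.incubate.sparse.nn as nn

    if paddle.is_compiled_with_cuda():
        # Dense channels-last input converted to a COO sparse tensor
        # (the last dimension is kept as the dense values dimension).
        x = np.array([[[[0.3, 0.4], [0.3, 0.07]],
                       [[0.83, 0.37], [0.18, 0.93]]]]).astype('float32')
        x = paddle.to_tensor(x)
        x = x.to_sparse_coo(len(x.shape) - 1)

        # Normalize the non-zero values; the output is again a sparse COO tensor
        # with the same indices and shape as the input.
        sync_batch_norm = nn.SyncBatchNorm(2)
        hidden = sync_batch_norm(x)
        print(hidden)

        # Convert every sparse nn.BatchNorm in an existing model to SyncBatchNorm,
        # as the new classmethod and the added unit test do.
        model = paddle.nn.Sequential(nn.Conv3D(3, 5, 3), nn.BatchNorm(5))
        sync_model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
        print(sync_model)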