From 084f2a99d4c8d3a51b9a7e3133de133ff825db5f Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Tue, 14 Jun 2022 11:24:04 +0000 Subject: [PATCH 1/7] add sparse SyncBatchNorm --- .../paddle/incubate/sparse/nn/layer/norm.py | 50 +++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/python/paddle/incubate/sparse/nn/layer/norm.py b/python/paddle/incubate/sparse/nn/layer/norm.py index 4d4cf7df2f2e4..cc094487c465c 100644 --- a/python/paddle/incubate/sparse/nn/layer/norm.py +++ b/python/paddle/incubate/sparse/nn/layer/norm.py @@ -157,3 +157,53 @@ def forward(self, input): batch_norm_out, shape=input.shape, stop_gradient=input.stop_gradient) + + +class SyncBatchNorm(paddle.nn.SyncBatchNorm): + + def __init__(self, + num_features, + momentum=0.9, + epsilon=1e-05, + weight_attr=None, + bias_attr=None, + data_format='NCHW', + name=None): + super(SyncBatchNorm, + self).__init__(num_features, momentum, epsilon, weight_attr, + bias_attr, data_format, None, name) + + def forward(self, x): + out = super(SyncBatchNorm, self).forward(x.values()) + return paddle.incubate.sparse.sparse_coo_tensor( + x.indices(), out, shape=x.shape, stop_gradient=x.stop_gradient) + + @classmethod + def convert_sync_batchnorm(cls, layer): + layer_output = layer + if isinstance(layer, _BatchNormBase): + if layer._weight_attr != None and not isinstance( + layer._weight_attr, + bool) and layer._weight_attr.name != None: + layer._weight_attr.name = layer._weight_attr.name + '_sync' + if layer._bias_attr != None and not isinstance( + layer._bias_attr, bool) and layer._bias_attr.name != None: + layer._bias_attr.name = layer._bias_attr.name + '_sync' + + layer_output = SyncBatchNorm(layer._num_features, layer._momentum, + layer._epsilon, layer._weight_attr, + layer._bias_attr, layer._data_format, + layer._name) + + if layer._weight_attr != False and layer._bias_attr != False: + with no_grad(): + layer_output.weight = layer.weight + layer_output.bias = layer.bias + layer_output._mean = layer._mean + layer_output._variance = layer._variance + + for name, sublayer in layer.named_children(): + layer_output.add_sublayer(name, + cls.convert_sync_batchnorm(sublayer)) + del layer + return layer_output From f8c44d242c925fef56e70f81fbd1ed1c17f24539 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Tue, 14 Jun 2022 11:58:26 +0000 Subject: [PATCH 2/7] add sparse SyncBatchNorm --- python/paddle/incubate/sparse/nn/__init__.py | 3 ++- python/paddle/incubate/sparse/nn/layer/norm.py | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/python/paddle/incubate/sparse/nn/__init__.py b/python/paddle/incubate/sparse/nn/__init__.py index be4985e694b4b..e0d9b0d6524f8 100644 --- a/python/paddle/incubate/sparse/nn/__init__.py +++ b/python/paddle/incubate/sparse/nn/__init__.py @@ -15,7 +15,7 @@ from . 
import functional from .layer.activation import ReLU -from .layer.norm import BatchNorm +from .layer.norm import BatchNorm, SyncBatchNorm from .layer.conv import Conv3D from .layer.conv import SubmConv3D from .layer.pooling import MaxPool3D @@ -23,6 +23,7 @@ __all__ = [ 'ReLU', 'BatchNorm', + 'SyncBatchNorm', 'Conv3D', 'SubmConv3D', 'MaxPool3D', diff --git a/python/paddle/incubate/sparse/nn/layer/norm.py b/python/paddle/incubate/sparse/nn/layer/norm.py index cc094487c465c..5a35aaabe0076 100644 --- a/python/paddle/incubate/sparse/nn/layer/norm.py +++ b/python/paddle/incubate/sparse/nn/layer/norm.py @@ -27,6 +27,8 @@ import paddle import warnings +from paddle.nn.layer.norm import _BatchNormBase +from paddle.framework import no_grad class BatchNorm(paddle.nn.BatchNorm1D): @@ -171,7 +173,7 @@ def __init__(self, name=None): super(SyncBatchNorm, self).__init__(num_features, momentum, epsilon, weight_attr, - bias_attr, data_format, None, name) + bias_attr, data_format, name) def forward(self, x): out = super(SyncBatchNorm, self).forward(x.values()) From 0b27ba5ab85856aaade58f1c6e17f9a0a4a63d9a Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Tue, 14 Jun 2022 12:26:37 +0000 Subject: [PATCH 3/7] test SyncBatchNorm --- .../tests/unittests/test_sparse_norm_op.py | 27 +++++++++++++++---- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_sparse_norm_op.py b/python/paddle/fluid/tests/unittests/test_sparse_norm_op.py index 8eccefed6ef64..3283d6317a620 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_norm_op.py @@ -16,6 +16,7 @@ import unittest import numpy as np import paddle +from paddle.incubate.sparse import nn import paddle.fluid as fluid from paddle.fluid.framework import _test_eager_guard import copy @@ -56,11 +57,10 @@ def test(self): # test backward sparse_y.backward(sparse_y) - assert np.allclose( - dense_x.grad.flatten().numpy(), - sparse_x.grad.values().flatten().numpy(), - atol=1e-5, - rtol=1e-5) + assert np.allclose(dense_x.grad.flatten().numpy(), + sparse_x.grad.values().flatten().numpy(), + atol=1e-5, + rtol=1e-5) fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) def test_error_layout(self): @@ -86,5 +86,22 @@ def test2(self): # [1, 6, 6, 6, 3] +class TestConvertSyncBatchNorm(unittest.TestCase): + + def test_convert(self): + base_model = paddle.nn.Sequential(nn.Conv3D(3, 5, 3), nn.BatchNorm(5), + nn.BatchNorm(5)) + + model = paddle.nn.Sequential( + nn.Conv3D(3, 5, 3), nn.BatchNorm(5), + nn.BatchNorm(5, + weight_attr=fluid.ParamAttr(name='bn.scale'), + bias_attr=fluid.ParamAttr(name='bn.bias'))) + model = nn.SyncBatchNorm.convert_sync_batchnorm(model) + for idx, sublayer in enumerate(base_model.sublayers()): + if isinstance(sublayer, nn.BatchNorm): + self.assertEqual(isinstance(model[idx], nn.SyncBatchNorm), True) + + if __name__ == "__main__": unittest.main() From d1b8e39b8af94973f9372711717b9a2df442d3b7 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Tue, 26 Jul 2022 09:00:20 +0000 Subject: [PATCH 4/7] SyncBatchNorm distinguishes between dense and sparse --- python/paddle/incubate/sparse/nn/layer/norm.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/python/paddle/incubate/sparse/nn/layer/norm.py b/python/paddle/incubate/sparse/nn/layer/norm.py index 5a35aaabe0076..7fa63dc9965d4 100644 --- a/python/paddle/incubate/sparse/nn/layer/norm.py +++ b/python/paddle/incubate/sparse/nn/layer/norm.py @@ -192,10 +192,19 @@ def 
convert_sync_batchnorm(cls, layer): layer._bias_attr, bool) and layer._bias_attr.name != None: layer._bias_attr.name = layer._bias_attr.name + '_sync' - layer_output = SyncBatchNorm(layer._num_features, layer._momentum, - layer._epsilon, layer._weight_attr, - layer._bias_attr, layer._data_format, - layer._name) + #sparse + if isinstance(layer, BatchNorm): + layer_output = SyncBatchNorm(layer._num_features, + layer._momentum, layer._epsilon, + layer._weight_attr, + layer._bias_attr, + layer._data_format, layer._name) + #dense + else: + layer_output = paddle.nn.SyncBatchNorm( + layer._num_features, layer._momentum, layer._epsilon, + layer._weight_attr, layer._bias_attr, layer._data_format, + layer._name) if layer._weight_attr != False and layer._bias_attr != False: with no_grad(): From ef0fe0819ddfd66d6123b9a9c6c8090ab0537afb Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Wed, 27 Jul 2022 07:54:12 +0000 Subject: [PATCH 5/7] add doc --- .../paddle/incubate/sparse/nn/layer/norm.py | 110 +++++++++++++++++- 1 file changed, 108 insertions(+), 2 deletions(-) diff --git a/python/paddle/incubate/sparse/nn/layer/norm.py b/python/paddle/incubate/sparse/nn/layer/norm.py index 7fa63dc9965d4..3229fe86f04da 100644 --- a/python/paddle/incubate/sparse/nn/layer/norm.py +++ b/python/paddle/incubate/sparse/nn/layer/norm.py @@ -162,6 +162,91 @@ def forward(self, input): class SyncBatchNorm(paddle.nn.SyncBatchNorm): + r""" + This interface is used to construct a callable object of the ``SyncBatchNorm`` class. + It implements the function of the Cross-GPU Synchronized Batch Normalization Layer, and can + be used as a normalizer function for other operations, such as conv2d and fully connected + operations. + The data is normalized by the mean and variance of the channel based on whole mini-batch + , which including data in all gpus. + Refer to `Batch Normalization: Accelerating Deep Network Training by Reducing + Internal Covariate Shift `_ + for more details. + + When model in training mode, the :math:`\\mu_{\\beta}` + and :math:`\\sigma_{\\beta}^{2}` are the statistics of whole mini-batch data in all gpus. + Calculated as follows: + + .. math:: + + \mu_{\beta} &\gets \frac{1}{m} \sum_{i=1}^{m} x_i \qquad &//\ + \ mini-batch\ mean \\ + \sigma_{\beta}^{2} &\gets \frac{1}{m} \sum_{i=1}^{m}(x_i - \ + \mu_{\beta})^2 \qquad &//\ mini-batch\ variance \\ + + - :math:`x` : whole mini-batch data in all gpus + - :math:`m` : the size of the whole mini-batch data + + When model in evaluation mode, the :math:`\\mu_{\\beta}` + and :math:`\sigma_{\beta}^{2}` are global statistics (moving_mean and moving_variance, + which usually got from the pre-trained model). Global statistics calculated as follows: + + .. math:: + moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global \ mean \\ + moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global \ variance \\ + + The formula of normalization is as follows: + + .. 
math:: + + \hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{\ + \sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\ + y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift + + - :math:`\epsilon` : add a smaller value to the variance to prevent division by zero + - :math:`\gamma` : trainable scale parameter vector + - :math:`\beta` : trainable shift parameter vector + + Note: + If you want to use container to pack your model and has ``SyncBatchNorm`` in the + evaluation phase, please use ``nn.LayerList`` or ``nn.Sequential`` instead of + ``list`` to pack the model. + + Parameters: + num_features(int): Indicate the number of channels of the input ``Tensor``. + epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5. + momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. + weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale` + of this layer. If it is set to None or one attribute of ParamAttr, this layerr + will create ParamAttr as param_attr. If the Initializer of the param_attr + is not set, the parameter is initialized with Xavier. If it is set to False, + this layer will not have trainable scale parameter. Default: None. + bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of this layer. + If it is set to None or one attribute of ParamAttr, this layer + will create ParamAttr as bias_attr. If the Initializer of the bias_attr + is not set, the bias is initialized zero. If it is set to False, this layer will not + have trainable bias parameter. Default: None. + + Shapes: + input: Tensor that the dimension from 2 to 5. + output: Tensor with the same shape as input. + + Examples: + .. code-block:: python + + import paddle + import paddle.incubate.sparse.nn as nn + import numpy as np + + x = np.array([[[[0.3, 0.4], [0.3, 0.07]], [[0.83, 0.37], [0.18, 0.93]]]]).astype('float32') + x = paddle.to_tensor(x) + + if paddle.is_compiled_with_cuda(): + sync_batch_norm = nn.SyncBatchNorm(2) + hidden1 = sync_batch_norm(x) + print(hidden1) + # [[[[0.26824948, 1.0936325],[0.26824948, -1.6301316]],[[ 0.8095662, -0.665287],[-1.2744656, 1.1301866 ]]]] + """ def __init__(self, num_features, @@ -176,12 +261,33 @@ def __init__(self, bias_attr, data_format, name) def forward(self, x): + assert x.is_sparse_coo( + ), "SyncBatchNorm only support SparseTensor in COO format." out = super(SyncBatchNorm, self).forward(x.values()) return paddle.incubate.sparse.sparse_coo_tensor( x.indices(), out, shape=x.shape, stop_gradient=x.stop_gradient) @classmethod def convert_sync_batchnorm(cls, layer): + """ + Helper function to convert :class: `paddle.incubate.sparse.nn.BatchNorm` layers in the model to :class: `paddle.incubate.sparse.nn.SyncBatchNorm` layers. + + Parameters: + layer(paddle.nn.Layer): model containing one or more `BatchNorm` layers. + + Returns: + The original model with converted SyncBatchNorm layers. If BatchNorm layer in the model, use SyncBatchNorm layer instead. + + Examples: + + .. 
code-block:: python + import paddle + import paddle.incubate.sparse.nn as nn + + model = paddle.nn.Sequential(nn.Conv3D(3, 5, 3), nn.BatchNorm(5)) + sync_model = nn.SyncBatchNorm.convert_sync_batchnorm(model) + + """ layer_output = layer if isinstance(layer, _BatchNormBase): if layer._weight_attr != None and not isinstance( @@ -192,14 +298,14 @@ def convert_sync_batchnorm(cls, layer): layer._bias_attr, bool) and layer._bias_attr.name != None: layer._bias_attr.name = layer._bias_attr.name + '_sync' - #sparse + #convert sparse BatchNorm if isinstance(layer, BatchNorm): layer_output = SyncBatchNorm(layer._num_features, layer._momentum, layer._epsilon, layer._weight_attr, layer._bias_attr, layer._data_format, layer._name) - #dense + #convert dense BatchNorm else: layer_output = paddle.nn.SyncBatchNorm( layer._num_features, layer._momentum, layer._epsilon, From af7d68d3e12aa7cb71279e0a9979951e15c37e76 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Thu, 28 Jul 2022 08:30:16 +0000 Subject: [PATCH 6/7] add unittest --- .../fluid/tests/unittests/test_sparse_norm_op.py | 14 +++++++++++++- python/paddle/incubate/sparse/nn/layer/norm.py | 10 +++++++++- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_sparse_norm_op.py b/python/paddle/fluid/tests/unittests/test_sparse_norm_op.py index 3283d6317a620..31d3e380c5196 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_norm_op.py @@ -86,7 +86,19 @@ def test2(self): # [1, 6, 6, 6, 3] -class TestConvertSyncBatchNorm(unittest.TestCase): +class TestSyncBatchNorm(unittest.TestCase): + + def test_sync_batch_norm(self): + with _test_eager_guard(): + x = np.array([[[[0.3, 0.4], [0.3, 0.07]], + [[0.83, 0.37], [0.18, 0.93]]]]).astype('float32') + x = paddle.to_tensor(x) + x = x.to_sparse_coo(len(x.shape) - 1) + + if paddle.is_compiled_with_cuda(): + sync_batch_norm = nn.SyncBatchNorm(2) + hidden1 = sync_batch_norm(x) + print(hidden1) def test_convert(self): base_model = paddle.nn.Sequential(nn.Conv3D(3, 5, 3), nn.BatchNorm(5), diff --git a/python/paddle/incubate/sparse/nn/layer/norm.py b/python/paddle/incubate/sparse/nn/layer/norm.py index 3229fe86f04da..4d8c1380a6e69 100644 --- a/python/paddle/incubate/sparse/nn/layer/norm.py +++ b/python/paddle/incubate/sparse/nn/layer/norm.py @@ -240,12 +240,20 @@ class SyncBatchNorm(paddle.nn.SyncBatchNorm): x = np.array([[[[0.3, 0.4], [0.3, 0.07]], [[0.83, 0.37], [0.18, 0.93]]]]).astype('float32') x = paddle.to_tensor(x) + x = x.to_sparse_coo(len(x.shape)-1) if paddle.is_compiled_with_cuda(): sync_batch_norm = nn.SyncBatchNorm(2) hidden1 = sync_batch_norm(x) print(hidden1) - # [[[[0.26824948, 1.0936325],[0.26824948, -1.6301316]],[[ 0.8095662, -0.665287],[-1.2744656, 1.1301866 ]]]] + # Tensor(shape=[1, 2, 2, 2], dtype=paddle.float32, place=Place(gpu:0), stop_gradient=True, + # indices=[[0, 0, 0, 0], + # [0, 0, 1, 1], + # [0, 1, 0, 1]], + # values=[[-0.40730840, -0.13725480], + # [-0.40730840, -1.20299828], + # [ 1.69877410, -0.23414057], + # [-0.88415730, 1.57439375]]) """ def __init__(self, From bf899b1b84c01698f89681739fd9896456e4c207 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Fri, 29 Jul 2022 03:01:04 +0000 Subject: [PATCH 7/7] fix example --- python/paddle/incubate/sparse/nn/layer/norm.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/paddle/incubate/sparse/nn/layer/norm.py b/python/paddle/incubate/sparse/nn/layer/norm.py index 4d8c1380a6e69..2dbefcd4bfedc 100644 --- 
a/python/paddle/incubate/sparse/nn/layer/norm.py +++ b/python/paddle/incubate/sparse/nn/layer/norm.py @@ -234,6 +234,7 @@ class SyncBatchNorm(paddle.nn.SyncBatchNorm): Examples: .. code-block:: python + # required: gpu import paddle import paddle.incubate.sparse.nn as nn import numpy as np
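
A minimal usage sketch assembled from the docstring and unit-test examples in the patches above, for reviewers who want to exercise the new API end to end. It assumes a CUDA build of Paddle that already includes this change (``paddle.incubate.sparse.nn.SyncBatchNorm`` and ``convert_sync_batchnorm``); the printed values are illustrative only, and actual cross-GPU synchronization additionally requires a multi-GPU ``paddle.distributed`` launch.

.. code-block:: python

    # Sketch based on the docstring and test examples in this patch series;
    # assumes a CUDA build of Paddle containing paddle.incubate.sparse.nn.
    import numpy as np
    import paddle
    import paddle.incubate.sparse.nn as nn

    if paddle.is_compiled_with_cuda():
        # Dense channels-last input converted to a COO sparse tensor
        # (the last dimension is kept as the dense values dimension).
        x = np.array([[[[0.3, 0.4], [0.3, 0.07]],
                       [[0.83, 0.37], [0.18, 0.93]]]]).astype('float32')
        x = paddle.to_tensor(x)
        x = x.to_sparse_coo(len(x.shape) - 1)

        # Normalize the non-zero values; the output is again a sparse COO tensor
        # with the same indices and shape as the input.
        sync_batch_norm = nn.SyncBatchNorm(2)
        hidden = sync_batch_norm(x)
        print(hidden)

        # Convert every sparse nn.BatchNorm in an existing model to SyncBatchNorm,
        # as the new classmethod and the added unit test do.
        model = paddle.nn.Sequential(nn.Conv3D(3, 5, 3), nn.BatchNorm(5))
        sync_model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
        print(sync_model)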