From 92db47f7ac02578f831cf487cb6b3ad0f150f2d7 Mon Sep 17 00:00:00 2001
From: Dmytro Doroshenko <dimdoroshenko@gmail.com>
Date: Mon, 14 Sep 2020 21:59:06 +0300
Subject: [PATCH 01/23] cosface loss

---
 catalyst/contrib/nn/criterion/__init__.py   |  1 +
 catalyst/contrib/nn/criterion/cosface.py    | 96 +++++++++++++++++++++
 catalyst/contrib/nn/tests/test_criterion.py | 46 ++++++++++
 3 files changed, 143 insertions(+)
 create mode 100644 catalyst/contrib/nn/criterion/cosface.py

diff --git a/catalyst/contrib/nn/criterion/__init__.py b/catalyst/contrib/nn/criterion/__init__.py
index 70d4be2f77..843e3fc1a3 100644
--- a/catalyst/contrib/nn/criterion/__init__.py
+++ b/catalyst/contrib/nn/criterion/__init__.py
@@ -36,3 +36,4 @@
     TripletMarginLossWithSampler,
 )
 from catalyst.contrib.nn.criterion.wing import WingLoss
+from catalyst.contrib.nn.criterion.cosface import CosFaceLoss
diff --git a/catalyst/contrib/nn/criterion/cosface.py b/catalyst/contrib/nn/criterion/cosface.py
new file mode 100644
index 0000000000..2ca1ac7f43
--- /dev/null
+++ b/catalyst/contrib/nn/criterion/cosface.py
@@ -0,0 +1,96 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn.modules.loss import _WeightedLoss
+
+
+class CosFaceLoss(_WeightedLoss):
+    """Implementation of CosFace loss for metric learning.
+
+    .. _CosFace: Large Margin Cosine Loss for Deep Face Recognition
+        https://arxiv.org/abs/1801.09414
+    """
+
+    def __init__(
+        self,
+        embedding_size: int,
+        num_classes: int,
+        s: float = 64.0,
+        m: float = 0.35,
+        weight: torch.Tensor = None,
+        size_average=None,
+        ignore_index: int = -100,
+        reduce=None,
+        reduction: str = "mean",
+    ):
+        """
+        Args:
+            embedding_size (int): size of each input sample.
+            num_classes (int): size of each output sample.
+            s (float): norm of input feature,
+                Default: ``64.0``.
+            m (float): margin.
+                Default: ``0.35``.
+            weight (float, optional): – a manual rescaling weight given to each class.
+                If given, has to be a Tensor of size `num_classes`.
+                Default: ``None``.
+            size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
+                the losses are averaged over each loss element in the batch. Note that for
+                some losses, there are multiple elements per sample. If the field :attr:`size_average`
+                is set to ``False``, the losses are instead summed for each minibatch. Ignored
+                when reduce is ``False``.
+                Default: ``True``
+            reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
+                losses are averaged or summed over observations for each minibatch depending
+                on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
+                batch element instead and ignores :attr:`size_average`.
+                Default: ``True``
+            reduction (string, optional): Specifies the reduction to apply to the output:
+                ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
+                ``'mean'``: the sum of the output will be divided by the number of
+                elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
+                and :attr:`reduce` are in the process of being deprecated, and in the meantime,
+                specifying either of those two args will override :attr:`reduction`.
+                Default: ``'mean'``
+        """
+        super(CosFaceLoss, self).__init__(
+            weight, size_average, reduce, reduction
+        )
+        self.ignore_index = ignore_index
+        self.embedding_size = embedding_size
+        self.num_classes = num_classes
+        self.s = s
+        self.m = m
+
+        self.projection = nn.Parameter(
+            torch.FloatTensor(num_classes, embedding_size)
+        )
+        nn.init.xavier_uniform_(self.projection)
+
+    def forward(
+        self, input: torch.Tensor, target: torch.Tensor
+    ) -> torch.Tensor:
+        """
+        Args:
+            input (torch.Tensor): input features,
+                expected shapes BxF.
+            target (torch.Tensor): target classes,
+                expected shapes B.
+
+        Returns:
+            torch.Tensor with loss value.
+        """
+        cosine = F.linear(F.normalize(input), F.normalize(self.projection))
+        phi = cosine - self.m
+        one_hot = torch.zeros(cosine.size()).to(input.device)
+        one_hot.scatter_(1, target.view(-1, 1).long(), 1)
+        logits = (one_hot * phi) + ((1.0 - one_hot) * cosine)
+        logits *= self.s
+
+        return F.cross_entropy(
+            logits,
+            target,
+            weight=self.weight,
+            ignore_index=self.ignore_index,
+            reduction=self.reduction,
+        )
diff --git a/catalyst/contrib/nn/tests/test_criterion.py b/catalyst/contrib/nn/tests/test_criterion.py
index 01cadc42cb..3be51b937a 100644
--- a/catalyst/contrib/nn/tests/test_criterion.py
+++ b/catalyst/contrib/nn/tests/test_criterion.py
@@ -1,7 +1,10 @@
+import torch
+import numpy as np
 from catalyst.contrib.nn import criterion as module
 from catalyst.contrib.nn.criterion import (
     CircleLoss,
     TripletMarginLossWithSampler,
+    CosFaceLoss,
 )
 from catalyst.data import AllTripletsSampler
 
@@ -19,3 +22,46 @@ def test_criterion_init():
             else:
                 instance = module_class()
             assert instance is not None
+
+
+def test_cosface_loss():
+    emb_size = 4
+    n_classes = 3
+    s = 3.0
+    m = 0.1
+    loss_fn = CosFaceLoss(emb_size, n_classes, s, m)
+
+    features = np.array(
+        [
+            [1, 2, 3, 4],
+            [5, 6, 7, 8],
+        ],
+        dtype="f",
+    )
+    target = np.array([0, 2], dtype="l")
+    projection = np.array(
+        [
+            [0.1, 0.2, 0.3, 0.4],
+            [1.1, 3.2, 5.3, 0.4],
+            [0.1, 0.2, 6.3, 0.4],
+        ],
+        dtype="f",
+    )
+
+    loss_fn.projection.data = torch.from_numpy(projection)
+
+    def normalize(matr):
+        return matr / np.sqrt((matr ** 2).sum(axis=1))[:, np.newaxis]
+
+    normalized_features = normalize(features)
+    normalized_projection = normalize(projection)
+
+    cosine = normalized_features @ normalized_projection.T
+    phi = cosine - m
+
+    mask = np.array([[1, 0, 0], [0, 0, 1]], dtype="l")
+    feats = (mask * phi + (1.0 - mask) * cosine) * s
+
+    expected_loss = 1.3651
+    actual = loss_fn(torch.from_numpy(features), torch.LongTensor(target))
+    assert abs(expected_loss - actual.item()) < 1e-5

From e63eedfcd1fa9b993b9090f1adfc00226308b493 Mon Sep 17 00:00:00 2001
From: Dmytro Doroshenko <dimdoroshenko@gmail.com>
Date: Mon, 14 Sep 2020 22:24:27 +0300
Subject: [PATCH 02/23] pep fixes

---
 catalyst/contrib/nn/criterion/cosface.py | 47 +++++++++++++++---------
 1 file changed, 30 insertions(+), 17 deletions(-)

diff --git a/catalyst/contrib/nn/criterion/cosface.py b/catalyst/contrib/nn/criterion/cosface.py
index 2ca1ac7f43..cb4fb8704b 100644
--- a/catalyst/contrib/nn/criterion/cosface.py
+++ b/catalyst/contrib/nn/criterion/cosface.py
@@ -1,7 +1,7 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from torch.nn.modules.loss import _WeightedLoss
+from torch.nn.modules.loss import _WeightedLoss  # noqa
 
 
 class CosFaceLoss(_WeightedLoss):
@@ -31,26 +31,39 @@ def __init__(
                 Default: ``64.0``.
             m (float): margin.
                 Default: ``0.35``.
-            weight (float, optional): – a manual rescaling weight given to each class.
-                If given, has to be a Tensor of size `num_classes`.
+            weight (float, optional): – a manual rescaling weight
+                given to each class. If given, has to be a Tensor
+                of size `num_classes`.
                 Default: ``None``.
-            size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
-                the losses are averaged over each loss element in the batch. Note that for
-                some losses, there are multiple elements per sample. If the field :attr:`size_average`
-                is set to ``False``, the losses are instead summed for each minibatch. Ignored
+            size_average (bool, optional):
+                Deprecated (see :attr:`reduction`).
+                By default, the losses are averaged over each
+                loss element in the batch. Note that for
+                some losses, there are multiple elements
+                per sample. If the field :attr:`size_average`
+                is set to ``False``, the losses are instead
+                summed for each minibatch. Ignored
                 when reduce is ``False``.
                 Default: ``True``
-            reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
-                losses are averaged or summed over observations for each minibatch depending
-                on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
-                batch element instead and ignores :attr:`size_average`.
+            reduce (bool, optional):
+                Deprecated (see :attr:`reduction`).
+                By default, the losses are averaged or summed
+                over observations for each minibatch depending
+                on :attr:`size_average`. When :attr:`reduce` is
+                ``False``, returns a loss per batch element
+                instead and ignores :attr:`size_average`.
                 Default: ``True``
-            reduction (string, optional): Specifies the reduction to apply to the output:
-                ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
-                ``'mean'``: the sum of the output will be divided by the number of
-                elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
-                and :attr:`reduce` are in the process of being deprecated, and in the meantime,
-                specifying either of those two args will override :attr:`reduction`.
+            reduction (string, optional): Specifies the reduction
+                to apply to the output:
+                ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``:
+                no reduction will be applied, ``'mean'``: the sum
+                of the output will be divided by the number of
+                elements in the output, ``'sum'``: the output will
+                be summed. Note: :attr:`size_average`
+                and :attr:`reduce` are in the process of being
+                deprecated, and in the meantime, specifying either
+                of those two args will
+                override :attr:`reduction`.
                 Default: ``'mean'``
         """
         super(CosFaceLoss, self).__init__(

From 5539c7f08314e282ba1ededcc48f222c3b14c4c0 Mon Sep 17 00:00:00 2001
From: Dmytro Doroshenko <dimdoroshenko@gmail.com>
Date: Mon, 14 Sep 2020 22:28:19 +0300
Subject: [PATCH 03/23] fixed link

---
 catalyst/contrib/nn/criterion/cosface.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/catalyst/contrib/nn/criterion/cosface.py b/catalyst/contrib/nn/criterion/cosface.py
index cb4fb8704b..57b063162d 100644
--- a/catalyst/contrib/nn/criterion/cosface.py
+++ b/catalyst/contrib/nn/criterion/cosface.py
@@ -7,7 +7,7 @@
 class CosFaceLoss(_WeightedLoss):
     """Implementation of CosFace loss for metric learning.
 
-    .. _CosFace: Large Margin Cosine Loss for Deep Face Recognition
+    .. _CosFace: Large Margin Cosine Loss for Deep Face Recognition:
         https://arxiv.org/abs/1801.09414
     """
 

From df4a32f4f9f5c4ad6e5afccbb287b891a786135a Mon Sep 17 00:00:00 2001
From: Dmytro Doroshenko <dimdoroshenko@gmail.com>
Date: Mon, 14 Sep 2020 23:22:32 +0300
Subject: [PATCH 04/23] more tests

---
 catalyst/contrib/nn/tests/test_criterion.py | 58 +++++++++++++++++----
 1 file changed, 47 insertions(+), 11 deletions(-)

diff --git a/catalyst/contrib/nn/tests/test_criterion.py b/catalyst/contrib/nn/tests/test_criterion.py
index 3be51b937a..41c0e906b8 100644
--- a/catalyst/contrib/nn/tests/test_criterion.py
+++ b/catalyst/contrib/nn/tests/test_criterion.py
@@ -29,7 +29,6 @@ def test_cosface_loss():
     n_classes = 3
     s = 3.0
     m = 0.1
-    loss_fn = CosFaceLoss(emb_size, n_classes, s, m)
 
     features = np.array(
         [
@@ -48,20 +47,57 @@ def test_cosface_loss():
         dtype="f",
     )
 
+    loss_fn = CosFaceLoss(emb_size, n_classes, s, m, reduction="none")
     loss_fn.projection.data = torch.from_numpy(projection)
 
     def normalize(matr):
-        return matr / np.sqrt((matr ** 2).sum(axis=1))[:, np.newaxis]
+        return (
+            matr / np.sqrt((matr ** 2).sum(axis=1))[:, np.newaxis]
+        )  # for each row
 
-    normalized_features = normalize(features)
-    normalized_projection = normalize(projection)
+    def softmax(x):
+        e_x = np.exp(x - np.max(x))
+        return e_x / e_x.sum(1)[:, np.newaxis]  # for each row
 
-    cosine = normalized_features @ normalized_projection.T
-    phi = cosine - m
+    def cross_entropy(preds, targs, axis=None):
+        print(softmax(preds))
+        return -(targs * np.log(softmax(preds))).sum(axis)
 
-    mask = np.array([[1, 0, 0], [0, 0, 1]], dtype="l")
-    feats = (mask * phi + (1.0 - mask) * cosine) * s
+    normalized_features = normalize(features)  # 2x4
+    normalized_projection = normalize(projection)  # 3x4
 
-    expected_loss = 1.3651
-    actual = loss_fn(torch.from_numpy(features), torch.LongTensor(target))
-    assert abs(expected_loss - actual.item()) < 1e-5
+    cosine = normalized_features @ normalized_projection.T  # 2x4 * 4x3 = 2x3
+    phi = cosine - m  # 2x3
+
+    mask = np.array([[1, 0, 0], [0, 0, 1]], dtype="l")  # one_hot(target)
+    feats = (mask * phi + (1.0 - mask) * cosine) * s  # 2x3
+
+    expected_loss = cross_entropy(feats, mask, 1)
+    actual = (
+        loss_fn(torch.from_numpy(features), torch.LongTensor(target))
+        .detach()
+        .numpy()
+    )
+    assert np.allclose(expected_loss, actual)
+
+    loss_fn = CosFaceLoss(emb_size, n_classes, s, m, reduction="mean")
+    loss_fn.projection.data = torch.from_numpy(projection)
+
+    expected_loss = cross_entropy(feats, mask, 1)
+    actual = (
+        loss_fn(torch.from_numpy(features), torch.LongTensor(target))
+        .detach()
+        .numpy()
+    )
+    assert np.isclose(expected_loss.mean(), actual)
+
+    loss_fn = CosFaceLoss(emb_size, n_classes, s, m, reduction="sum")
+    loss_fn.projection.data = torch.from_numpy(projection)
+
+    expected_loss = cross_entropy(feats, mask, 1)
+    actual = (
+        loss_fn(torch.from_numpy(features), torch.LongTensor(target))
+        .detach()
+        .numpy()
+    )
+    assert np.isclose(expected_loss.sum(), actual)
\ No newline at end of file

From a9734e119a6c63ba1c789a88e90f87353193152d Mon Sep 17 00:00:00 2001
From: Dmytro Doroshenko <dimdoroshenko@gmail.com>
Date: Mon, 14 Sep 2020 23:24:46 +0300
Subject: [PATCH 05/23] ignore flake

---
 catalyst/contrib/nn/tests/test_criterion.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/catalyst/contrib/nn/tests/test_criterion.py b/catalyst/contrib/nn/tests/test_criterion.py
index 41c0e906b8..c7ba8137b8 100644
--- a/catalyst/contrib/nn/tests/test_criterion.py
+++ b/catalyst/contrib/nn/tests/test_criterion.py
@@ -1,3 +1,4 @@
+# flake8: noqa
 import torch
 import numpy as np
 from catalyst.contrib.nn import criterion as module

From e6c1588c8184c961aa0e95a7f1414150369f2422 Mon Sep 17 00:00:00 2001
From: Dmytro Doroshenko <dimdoroshenko@gmail.com>
Date: Sun, 20 Sep 2020 16:11:09 +0300
Subject: [PATCH 06/23] cosface now is a layer, softmax, cosface, tests

---
 catalyst/contrib/nn/criterion/__init__.py   |   1 -
 catalyst/contrib/nn/criterion/cosface.py    | 109 ----------
 catalyst/contrib/nn/modules/__init__.py     |   4 +
 catalyst/contrib/nn/modules/arcface.py      |  86 ++++++++
 catalyst/contrib/nn/modules/cosface.py      |  75 +++++++
 catalyst/contrib/nn/modules/softmax.py      |  49 +++++
 catalyst/contrib/nn/tests/test_criterion.py |  84 +-------
 catalyst/contrib/nn/tests/test_modules.py   | 212 ++++++++++++++++++++
 8 files changed, 429 insertions(+), 191 deletions(-)
 delete mode 100644 catalyst/contrib/nn/criterion/cosface.py
 create mode 100644 catalyst/contrib/nn/modules/arcface.py
 create mode 100644 catalyst/contrib/nn/modules/cosface.py
 create mode 100644 catalyst/contrib/nn/modules/softmax.py
 create mode 100644 catalyst/contrib/nn/tests/test_modules.py

diff --git a/catalyst/contrib/nn/criterion/__init__.py b/catalyst/contrib/nn/criterion/__init__.py
index 843e3fc1a3..70d4be2f77 100644
--- a/catalyst/contrib/nn/criterion/__init__.py
+++ b/catalyst/contrib/nn/criterion/__init__.py
@@ -36,4 +36,3 @@
     TripletMarginLossWithSampler,
 )
 from catalyst.contrib.nn.criterion.wing import WingLoss
-from catalyst.contrib.nn.criterion.cosface import CosFaceLoss
diff --git a/catalyst/contrib/nn/criterion/cosface.py b/catalyst/contrib/nn/criterion/cosface.py
deleted file mode 100644
index 57b063162d..0000000000
--- a/catalyst/contrib/nn/criterion/cosface.py
+++ /dev/null
@@ -1,109 +0,0 @@
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from torch.nn.modules.loss import _WeightedLoss  # noqa
-
-
-class CosFaceLoss(_WeightedLoss):
-    """Implementation of CosFace loss for metric learning.
-
-    .. _CosFace: Large Margin Cosine Loss for Deep Face Recognition:
-        https://arxiv.org/abs/1801.09414
-    """
-
-    def __init__(
-        self,
-        embedding_size: int,
-        num_classes: int,
-        s: float = 64.0,
-        m: float = 0.35,
-        weight: torch.Tensor = None,
-        size_average=None,
-        ignore_index: int = -100,
-        reduce=None,
-        reduction: str = "mean",
-    ):
-        """
-        Args:
-            embedding_size (int): size of each input sample.
-            num_classes (int): size of each output sample.
-            s (float): norm of input feature,
-                Default: ``64.0``.
-            m (float): margin.
-                Default: ``0.35``.
-            weight (float, optional): – a manual rescaling weight
-                given to each class. If given, has to be a Tensor
-                of size `num_classes`.
-                Default: ``None``.
-            size_average (bool, optional):
-                Deprecated (see :attr:`reduction`).
-                By default, the losses are averaged over each
-                loss element in the batch. Note that for
-                some losses, there are multiple elements
-                per sample. If the field :attr:`size_average`
-                is set to ``False``, the losses are instead
-                summed for each minibatch. Ignored
-                when reduce is ``False``.
-                Default: ``True``
-            reduce (bool, optional):
-                Deprecated (see :attr:`reduction`).
-                By default, the losses are averaged or summed
-                over observations for each minibatch depending
-                on :attr:`size_average`. When :attr:`reduce` is
-                ``False``, returns a loss per batch element
-                instead and ignores :attr:`size_average`.
-                Default: ``True``
-            reduction (string, optional): Specifies the reduction
-                to apply to the output:
-                ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``:
-                no reduction will be applied, ``'mean'``: the sum
-                of the output will be divided by the number of
-                elements in the output, ``'sum'``: the output will
-                be summed. Note: :attr:`size_average`
-                and :attr:`reduce` are in the process of being
-                deprecated, and in the meantime, specifying either
-                of those two args will
-                override :attr:`reduction`.
-                Default: ``'mean'``
-        """
-        super(CosFaceLoss, self).__init__(
-            weight, size_average, reduce, reduction
-        )
-        self.ignore_index = ignore_index
-        self.embedding_size = embedding_size
-        self.num_classes = num_classes
-        self.s = s
-        self.m = m
-
-        self.projection = nn.Parameter(
-            torch.FloatTensor(num_classes, embedding_size)
-        )
-        nn.init.xavier_uniform_(self.projection)
-
-    def forward(
-        self, input: torch.Tensor, target: torch.Tensor
-    ) -> torch.Tensor:
-        """
-        Args:
-            input (torch.Tensor): input features,
-                expected shapes BxF.
-            target (torch.Tensor): target classes,
-                expected shapes B.
-
-        Returns:
-            torch.Tensor with loss value.
-        """
-        cosine = F.linear(F.normalize(input), F.normalize(self.projection))
-        phi = cosine - self.m
-        one_hot = torch.zeros(cosine.size()).to(input.device)
-        one_hot.scatter_(1, target.view(-1, 1).long(), 1)
-        logits = (one_hot * phi) + ((1.0 - one_hot) * cosine)
-        logits *= self.s
-
-        return F.cross_entropy(
-            logits,
-            target,
-            weight=self.weight,
-            ignore_index=self.ignore_index,
-            reduction=self.reduction,
-        )
diff --git a/catalyst/contrib/nn/modules/__init__.py b/catalyst/contrib/nn/modules/__init__.py
index c4f4d015eb..96401825eb 100644
--- a/catalyst/contrib/nn/modules/__init__.py
+++ b/catalyst/contrib/nn/modules/__init__.py
@@ -31,3 +31,7 @@
     scSE,
     cSE,
 )
+
+from catalyst.contrib.nn.modules.softmax import SoftMax
+from catalyst.contrib.nn.modules.arcface import ArcFace
+from catalyst.contrib.nn.modules.cosface import CosFace
diff --git a/catalyst/contrib/nn/modules/arcface.py b/catalyst/contrib/nn/modules/arcface.py
new file mode 100644
index 0000000000..3a86d1674b
--- /dev/null
+++ b/catalyst/contrib/nn/modules/arcface.py
@@ -0,0 +1,86 @@
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class ArcFace(nn.Module):
+    """Implementation of ArcFace loss for metric learning.
+
+    .. _ArcFace: Additive Angular Margin Loss for Deep Face Recognition:
+        https://arxiv.org/abs/1801.07698v1
+
+    Example:
+        >>> layer = ArcFace(5, 10, s=1.31, m=0.5)
+        >>> loss_fn = nn.CrossEntropyLoss()
+        >>> embedding = torch.randn(3, 5, requires_grad=True)
+        >>> target = torch.empty(3, dtype=torch.long).random_(5)
+        >>> output = layer(embedding, target)
+        >>> loss = loss_fn(output, target)
+        >>> loss.backward()
+
+    """
+
+    def __init__(
+        self,
+        in_features: int,
+        out_features: int,
+        s: float = 64.0,
+        m: float = 0.5,
+    ):
+        """
+        Args:
+            in_features (int): size of each input sample.
+            out_features (int): size of each output sample.
+            s (float, optional): norm of input feature,
+                Default: ``64.0``.
+            m (float, optional): margin.
+                Default: ``0.5``.
+        """
+        super(ArcFace, self).__init__()
+        self.in_features = in_features
+        self.out_features = out_features
+        self.s = s
+        self.m = m
+
+        self.cos_m = math.cos(m)
+        self.sin_m = math.sin(m)
+        self.th = math.cos(math.pi - m)
+        self.mm = math.sin(math.pi - m) * m
+
+        self.weight = nn.Parameter(
+            torch.FloatTensor(out_features, in_features)
+        )
+        nn.init.xavier_uniform_(self.weight)
+
+    def __repr__(self) -> str:
+        return "ArcFace(in_features={},out_features={},s={},m={})".format(
+            self.in_features, self.out_features, self.s, self.m
+        )
+
+    def forward(
+        self, input: torch.Tensor, target: torch.Tensor
+    ) -> torch.Tensor:
+        """
+        Args:
+            input (torch.Tensor): input features,
+                expected shapes BxF.
+            target (torch.Tensor): target classes,
+                expected shapes B.
+
+        Returns:
+            torch.Tensor with logits scaled by ``s``.
+        """
+        cosine = F.linear(F.normalize(input), F.normalize(self.weight))
+        sine = torch.sqrt(1.0 - torch.pow(cosine, 2))
+        phi = cosine * self.cos_m - sine * self.sin_m
+
+        phi = torch.where(cosine > self.th, phi, cosine - self.mm)
+
+        one_hot = torch.zeros(cosine.size()).to(input.device)
+        one_hot.scatter_(1, target.view(-1, 1).long(), 1)
+        logits = (one_hot * phi) + ((1.0 - one_hot) * cosine)
+        logits *= self.s
+
+        return logits
diff --git a/catalyst/contrib/nn/modules/cosface.py b/catalyst/contrib/nn/modules/cosface.py
new file mode 100644
index 0000000000..5c1b4aa720
--- /dev/null
+++ b/catalyst/contrib/nn/modules/cosface.py
@@ -0,0 +1,75 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class CosFace(nn.Module):
+    """Implementation of CosFace loss for metric learning.
+
+    .. _CosFace: Large Margin Cosine Loss for Deep Face Recognition:
+        https://arxiv.org/abs/1801.09414
+
+    Example:
+        >>> layer = CosFace(5, 10, s=1.31, m=0.1)
+        >>> loss_fn = nn.CrossEntropyLoss()
+        >>> embedding = torch.randn(3, 5, requires_grad=True)
+        >>> target = torch.empty(3, dtype=torch.long).random_(5)
+        >>> output = layer(embedding, target)
+        >>> loss = loss_fn(output, target)
+        >>> loss.backward()
+
+    """
+
+    def __init__(
+        self,
+        in_features: int,
+        out_features: int,
+        s: float = 64.0,
+        m: float = 0.35,
+    ):
+        """
+        Args:
+            in_features (int): size of each input sample.
+            out_features (int): size of each output sample.
+            s (float, optional): norm of input feature,
+                Default: ``64.0``.
+            m (float, optional): margin.
+                Default: ``0.35``.
+        """
+        super(CosFace, self).__init__()
+        self.in_features = in_features
+        self.out_features = out_features
+        self.s = s
+        self.m = m
+
+        self.weight = nn.Parameter(
+            torch.FloatTensor(out_features, in_features)
+        )
+        nn.init.xavier_uniform_(self.weight)
+
+    def __repr__(self) -> str:
+        return "CosFace(in_features={},out_features={},s={},m={})".format(
+            self.in_features, self.out_features, self.s, self.m
+        )
+
+    def forward(
+        self, input: torch.Tensor, target: torch.Tensor
+    ) -> torch.Tensor:
+        """
+        Args:
+            input (torch.Tensor): input features,
+                expected shapes BxF.
+            target (torch.Tensor): target classes,
+                expected shapes B.
+
+        Returns:
+            torch.Tensor with logits scaled by ``s``.
+        """
+        cosine = F.linear(F.normalize(input), F.normalize(self.weight))
+        phi = cosine - self.m
+        one_hot = torch.zeros(cosine.size()).to(input.device)
+        one_hot.scatter_(1, target.view(-1, 1).long(), 1)
+        logits = (one_hot * phi) + ((1.0 - one_hot) * cosine)
+        logits *= self.s
+
+        return logits
diff --git a/catalyst/contrib/nn/modules/softmax.py b/catalyst/contrib/nn/modules/softmax.py
new file mode 100644
index 0000000000..1f094cd50e
--- /dev/null
+++ b/catalyst/contrib/nn/modules/softmax.py
@@ -0,0 +1,49 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class SoftMax(nn.Module):
+    """Implementation of SoftMax head for metric learning.
+
+    Example:
+        >>> layer = SoftMax(5, 10)
+        >>> loss_fn = nn.CrossEntropyLoss()
+        >>> embedding = torch.randn(3, 5, requires_grad=True)
+        >>> target = torch.empty(3, dtype=torch.long).random_(5)
+        >>> output = layer(embedding)
+        >>> loss = loss_fn(output, target)
+        >>> loss.backward()
+
+    """
+
+    def __init__(self, in_features: int, num_classes: int):
+        """
+        Args:
+            in_features (int): size of each input sample.
+            num_classes (int): size of each output sample.
+        """
+        super(SoftMax, self).__init__()
+        self.in_features = in_features
+        self.out_features = num_classes
+        self.weight = nn.Parameter(torch.FloatTensor(num_classes, in_features))
+        self.bias = nn.Parameter(torch.FloatTensor(num_classes))
+
+        nn.init.xavier_uniform_(self.weight)
+        nn.init.zeros_(self.bias)
+
+    def __repr__(self) -> str:
+        return "SoftMax(in_features={},out_features={})".format(
+            self.in_features, self.out_features
+        )
+
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            input (torch.Tensor): input features,
+                expected shapes BxF.
+
+        Returns:
+            torch.Tensor with logits (linear projection of input).
+        """
+        return F.linear(input, self.weight, self.bias)
diff --git a/catalyst/contrib/nn/tests/test_criterion.py b/catalyst/contrib/nn/tests/test_criterion.py
index c7ba8137b8..f230071790 100644
--- a/catalyst/contrib/nn/tests/test_criterion.py
+++ b/catalyst/contrib/nn/tests/test_criterion.py
@@ -1,11 +1,12 @@
 # flake8: noqa
-import torch
 import numpy as np
+
+import torch
+
 from catalyst.contrib.nn import criterion as module
 from catalyst.contrib.nn.criterion import (
     CircleLoss,
     TripletMarginLossWithSampler,
-    CosFaceLoss,
 )
 from catalyst.data import AllTripletsSampler
 
@@ -23,82 +24,3 @@ def test_criterion_init():
             else:
                 instance = module_class()
             assert instance is not None
-
-
-def test_cosface_loss():
-    emb_size = 4
-    n_classes = 3
-    s = 3.0
-    m = 0.1
-
-    features = np.array(
-        [
-            [1, 2, 3, 4],
-            [5, 6, 7, 8],
-        ],
-        dtype="f",
-    )
-    target = np.array([0, 2], dtype="l")
-    projection = np.array(
-        [
-            [0.1, 0.2, 0.3, 0.4],
-            [1.1, 3.2, 5.3, 0.4],
-            [0.1, 0.2, 6.3, 0.4],
-        ],
-        dtype="f",
-    )
-
-    loss_fn = CosFaceLoss(emb_size, n_classes, s, m, reduction="none")
-    loss_fn.projection.data = torch.from_numpy(projection)
-
-    def normalize(matr):
-        return (
-            matr / np.sqrt((matr ** 2).sum(axis=1))[:, np.newaxis]
-        )  # for each row
-
-    def softmax(x):
-        e_x = np.exp(x - np.max(x))
-        return e_x / e_x.sum(1)[:, np.newaxis]  # for each row
-
-    def cross_entropy(preds, targs, axis=None):
-        print(softmax(preds))
-        return -(targs * np.log(softmax(preds))).sum(axis)
-
-    normalized_features = normalize(features)  # 2x4
-    normalized_projection = normalize(projection)  # 3x4
-
-    cosine = normalized_features @ normalized_projection.T  # 2x4 * 4x3 = 2x3
-    phi = cosine - m  # 2x3
-
-    mask = np.array([[1, 0, 0], [0, 0, 1]], dtype="l")  # one_hot(target)
-    feats = (mask * phi + (1.0 - mask) * cosine) * s  # 2x3
-
-    expected_loss = cross_entropy(feats, mask, 1)
-    actual = (
-        loss_fn(torch.from_numpy(features), torch.LongTensor(target))
-        .detach()
-        .numpy()
-    )
-    assert np.allclose(expected_loss, actual)
-
-    loss_fn = CosFaceLoss(emb_size, n_classes, s, m, reduction="mean")
-    loss_fn.projection.data = torch.from_numpy(projection)
-
-    expected_loss = cross_entropy(feats, mask, 1)
-    actual = (
-        loss_fn(torch.from_numpy(features), torch.LongTensor(target))
-        .detach()
-        .numpy()
-    )
-    assert np.isclose(expected_loss.mean(), actual)
-
-    loss_fn = CosFaceLoss(emb_size, n_classes, s, m, reduction="sum")
-    loss_fn.projection.data = torch.from_numpy(projection)
-
-    expected_loss = cross_entropy(feats, mask, 1)
-    actual = (
-        loss_fn(torch.from_numpy(features), torch.LongTensor(target))
-        .detach()
-        .numpy()
-    )
-    assert np.isclose(expected_loss.sum(), actual)
\ No newline at end of file
diff --git a/catalyst/contrib/nn/tests/test_modules.py b/catalyst/contrib/nn/tests/test_modules.py
new file mode 100644
index 0000000000..e971f1bfbe
--- /dev/null
+++ b/catalyst/contrib/nn/tests/test_modules.py
@@ -0,0 +1,212 @@
+# flake8: noqa
+import numpy as np
+
+import torch
+import torch.nn as nn
+
+from catalyst.contrib.nn.modules import ArcFace, CosFace, SoftMax
+
+
+def normalize(m: np.ndarray) -> np.ndarray:
+    m_s = np.sqrt((m ** 2).sum(axis=1))[:, np.newaxis]  # for each row
+    return m / m_s
+
+
+def softmax(x: np.ndarray) -> np.ndarray:
+    e_x = np.exp(x - np.max(x))
+    return e_x / e_x.sum(1)[:, np.newaxis]  # for each row
+
+
+def cross_entropy(
+    preds: np.ndarray, targs: np.ndarray, axis: int = 1
+) -> float:
+    return -(targs * np.log(softmax(preds))).sum(axis)
+
+
+def test_softmax():
+    emb_size = 4
+    n_classes = 3
+
+    # fmt: off
+    features = np.array(
+        [
+            [1, 2, 3, 4],
+            [5, 6, 7, 8],
+        ],
+        dtype="f",
+    )
+    target = np.array([0, 2], dtype="l")
+    weight = np.array(
+        [
+            [0.1, 0.2, 0.3, 0.4],
+            [1.1, 3.2, 5.3, 0.4],
+            [0.1, 0.2, 6.3, 0.4],
+        ],
+        dtype="f",
+    )
+    bias = np.array([0.2, 0.01, 0.1], dtype="f")
+    # fmt: on
+
+    layer = SoftMax(emb_size, n_classes)
+    layer.weight.data = torch.from_numpy(weight)
+    layer.bias.data = torch.from_numpy(bias)
+
+    expected = features @ weight.T + bias
+    actual = layer(torch.from_numpy(features)).detach().numpy()
+    assert np.allclose(expected, actual)
+
+
+def test_arcface_with_cross_entropy_loss():
+    emb_size = 4
+    n_classes = 3
+    s = 3.0
+    m = 0.5
+
+    # fmt: off
+    features = np.array(
+        [
+            [1, 2, 3, 4],
+            [5, 6, 7, 8],
+        ],
+        dtype="f",
+    )
+    target = np.array([0, 2], dtype="l")
+    weight = np.array(
+        [
+            [0.1, 0.2, 0.3, 0.4],
+            [1.1, 3.2, 5.3, 0.4],
+            [0.1, 0.2, 6.3, 0.4],
+        ],
+        dtype="f",
+    )
+    # fmt: on
+
+    layer = ArcFace(emb_size, n_classes, s, m)
+    layer.weight.data = torch.from_numpy(weight)
+    loss_fn = nn.CrossEntropyLoss(reduction="none")
+
+    normalized_features = normalize(features)  # 2x4
+    normalized_projection = normalize(weight)  # 3x4
+
+    cosine = normalized_features @ normalized_projection.T  # 2x4 * 4x3 = 2x3
+    sine = np.sqrt(1 - np.power(cosine, 2))  # 2x3
+    phi = cosine * np.cos(m) - sine * np.sin(m)  # 2x3
+    phi = np.where(
+        cosine > np.cos(np.pi - m), phi, cosine - np.sin(np.pi - m) * m
+    )  # 2x3
+
+    mask = np.array([[1, 0, 0], [0, 0, 1]], dtype="l")  # one_hot(target)
+    feats = (mask * phi + (1.0 - mask) * cosine) * s  # 2x3
+
+    expected_loss = cross_entropy(feats, mask, 1)
+    actual = (
+        loss_fn(
+            layer(torch.from_numpy(features), torch.LongTensor(target)),
+            torch.LongTensor(target),
+        )
+        .detach()
+        .numpy()
+    )
+    assert np.allclose(expected_loss, actual)
+
+    loss_fn = nn.CrossEntropyLoss(reduction="mean")
+
+    expected_loss = cross_entropy(feats, mask, 1)
+    actual = (
+        loss_fn(
+            layer(torch.from_numpy(features), torch.LongTensor(target)),
+            torch.LongTensor(target),
+        )
+        .detach()
+        .numpy()
+    )
+    assert np.isclose(expected_loss.mean(), actual)
+
+    loss_fn = nn.CrossEntropyLoss(reduction="sum")
+
+    expected_loss = cross_entropy(feats, mask, 1)
+    actual = (
+        loss_fn(
+            layer(torch.from_numpy(features), torch.LongTensor(target)),
+            torch.LongTensor(target),
+        )
+        .detach()
+        .numpy()
+    )
+    assert np.isclose(expected_loss.sum(), actual)
+
+
+def test_cosface_with_cross_entropy_loss():
+    emb_size = 4
+    n_classes = 3
+    s = 3.0
+    m = 0.1
+
+    # fmt: off
+    features = np.array(
+        [
+            [1, 2, 3, 4],
+            [5, 6, 7, 8],
+        ],
+        dtype="f",
+    )
+    target = np.array([0, 2], dtype="l")
+    weight = np.array(
+        [
+            [0.1, 0.2, 0.3, 0.4],
+            [1.1, 3.2, 5.3, 0.4],
+            [0.1, 0.2, 6.3, 0.4],
+        ],
+        dtype="f",
+    )
+    # fmt: on
+
+    layer = CosFace(emb_size, n_classes, s, m)
+    layer.weight.data = torch.from_numpy(weight)
+    loss_fn = nn.CrossEntropyLoss(reduction="none")
+
+    normalized_features = normalize(features)  # 2x4
+    normalized_projection = normalize(weight)  # 3x4
+
+    cosine = normalized_features @ normalized_projection.T  # 2x4 * 4x3 = 2x3
+    phi = cosine - m  # 2x3
+
+    mask = np.array([[1, 0, 0], [0, 0, 1]], dtype="l")  # one_hot(target)
+    feats = (mask * phi + (1.0 - mask) * cosine) * s  # 2x3
+
+    expected_loss = cross_entropy(feats, mask, 1)
+    actual = (
+        loss_fn(
+            layer(torch.from_numpy(features), torch.LongTensor(target)),
+            torch.LongTensor(target),
+        )
+        .detach()
+        .numpy()
+    )
+    assert np.allclose(expected_loss, actual)
+
+    loss_fn = nn.CrossEntropyLoss(reduction="mean")
+
+    expected_loss = cross_entropy(feats, mask, 1)
+    actual = (
+        loss_fn(
+            layer(torch.from_numpy(features), torch.LongTensor(target)),
+            torch.LongTensor(target),
+        )
+        .detach()
+        .numpy()
+    )
+    assert np.isclose(expected_loss.mean(), actual)
+
+    loss_fn = nn.CrossEntropyLoss(reduction="sum")
+
+    expected_loss = cross_entropy(feats, mask, 1)
+    actual = (
+        loss_fn(
+            layer(torch.from_numpy(features), torch.LongTensor(target)),
+            torch.LongTensor(target),
+        )
+        .detach()
+        .numpy()
+    )
+    assert np.isclose(expected_loss.sum(), actual)

From 2893c1bdc4986c91050b21e1ee2f72154cb1e17d Mon Sep 17 00:00:00 2001
From: Dmytro Doroshenko <dimdoroshenko@gmail.com>
Date: Sun, 20 Sep 2020 16:19:55 +0300
Subject: [PATCH 07/23] docs

---
 docs/api/contrib.rst | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/docs/api/contrib.rst b/docs/api/contrib.rst
index 7039a8a812..ebfd99dfd7 100644
--- a/docs/api/contrib.rst
+++ b/docs/api/contrib.rst
@@ -250,6 +250,13 @@ Wing
 Modules
 ~~~~~~~~~~~~~~~~
 
+ArcFace: Additive Angular Margin Loss for Deep Face Recognition
+"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+.. automodule:: catalyst.contrib.nn.modules.arcface
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
 Common modules
 """"""""""""""""""""""""""""""""""""""""""
 .. automodule:: catalyst.contrib.nn.modules.common
@@ -257,6 +264,13 @@ Common modules
     :undoc-members:
     :show-inheritance:
 
+CosFace: Large Margin Cosine Loss for Deep Face Recognition
+"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+.. automodule:: catalyst.contrib.nn.modules.cosface
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
 Last-Mean-Average-Attention (LAMA)-Pooling
 """"""""""""""""""""""""""""""""""""""""""
 .. automodule:: catalyst.contrib.nn.modules.lama
@@ -285,6 +299,12 @@ SqueezeAndExcitation
     :undoc-members:
     :show-inheritance:
 
+SoftMax
+""""""""""""""""""""""""""""""""""""""""""
+.. automodule:: catalyst.contrib.nn.modules.softmax
+    :members:
+    :undoc-members:
+    :show-inheritance:
 
 
 Optimizers

From e8ae3f75c658df9ca33c9ed9750845e179344fc7 Mon Sep 17 00:00:00 2001
From: Dmytro Doroshenko <dimdoroshenko@gmail.com>
Date: Sun, 20 Sep 2020 16:21:34 +0300
Subject: [PATCH 08/23] softmax, cosface, arcface layers

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5c225eb0f9..cf0782374b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,6 +10,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 ### Added
 
 - Runner registry support for Config API ([#936](https://github.com/catalyst-team/catalyst/pull/936))
+- SoftMax, CosFace, ArcFace layers to contrib ([#939](https://github.com/catalyst-team/catalyst/pull/939))
 
 ### Changed
 

From 2f401d2034e840c0313a8c99666ebb69eb1732b1 Mon Sep 17 00:00:00 2001
From: Dmytro Doroshenko <dimdoroshenko@gmail.com>
Date: Sun, 20 Sep 2020 20:16:08 +0300
Subject: [PATCH 09/23] docs for __repr__

---
 catalyst/contrib/nn/modules/arcface.py | 1 +
 catalyst/contrib/nn/modules/cosface.py | 1 +
 catalyst/contrib/nn/modules/softmax.py | 1 +
 3 files changed, 3 insertions(+)

diff --git a/catalyst/contrib/nn/modules/arcface.py b/catalyst/contrib/nn/modules/arcface.py
index 3a86d1674b..afa023ad89 100644
--- a/catalyst/contrib/nn/modules/arcface.py
+++ b/catalyst/contrib/nn/modules/arcface.py
@@ -55,6 +55,7 @@ def __init__(
         nn.init.xavier_uniform_(self.weight)
 
     def __repr__(self) -> str:
+        """ArcFace representation."""
         return "ArcFace(in_features={},out_features={},s={},m={})".format(
             self.in_features, self.out_features, self.s, self.m
         )
diff --git a/catalyst/contrib/nn/modules/cosface.py b/catalyst/contrib/nn/modules/cosface.py
index 5c1b4aa720..59337b2a9e 100644
--- a/catalyst/contrib/nn/modules/cosface.py
+++ b/catalyst/contrib/nn/modules/cosface.py
@@ -48,6 +48,7 @@ def __init__(
         nn.init.xavier_uniform_(self.weight)
 
     def __repr__(self) -> str:
+        """CosFace representation."""
         return "CosFace(in_features={},out_features={},s={},m={})".format(
             self.in_features, self.out_features, self.s, self.m
         )
diff --git a/catalyst/contrib/nn/modules/softmax.py b/catalyst/contrib/nn/modules/softmax.py
index 1f094cd50e..26ec19e487 100644
--- a/catalyst/contrib/nn/modules/softmax.py
+++ b/catalyst/contrib/nn/modules/softmax.py
@@ -33,6 +33,7 @@ def __init__(self, in_features: int, num_classes: int):
         nn.init.zeros_(self.bias)
 
     def __repr__(self) -> str:
+        """SoftMax representation."""
         return "SoftMax(in_features={},out_features={})".format(
             self.in_features, self.out_features
         )

From ee26d7a1980795aa386265ca11ca0d856d0c9b53 Mon Sep 17 00:00:00 2001
From: Dmytro Doroshenko <dimdoroshenko@gmail.com>
Date: Sun, 20 Sep 2020 20:40:44 +0300
Subject: [PATCH 10/23] another docs fix

---
 catalyst/contrib/nn/modules/arcface.py | 2 +-
 catalyst/contrib/nn/modules/cosface.py | 2 +-
 catalyst/contrib/nn/modules/softmax.py | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/catalyst/contrib/nn/modules/arcface.py b/catalyst/contrib/nn/modules/arcface.py
index afa023ad89..88cbaac625 100644
--- a/catalyst/contrib/nn/modules/arcface.py
+++ b/catalyst/contrib/nn/modules/arcface.py
@@ -55,7 +55,7 @@ def __init__(
         nn.init.xavier_uniform_(self.weight)
 
     def __repr__(self) -> str:
-        """ArcFace representation."""
+        """Object representation."""
         return "ArcFace(in_features={},out_features={},s={},m={})".format(
             self.in_features, self.out_features, self.s, self.m
         )
diff --git a/catalyst/contrib/nn/modules/cosface.py b/catalyst/contrib/nn/modules/cosface.py
index 59337b2a9e..cb51ec5fd7 100644
--- a/catalyst/contrib/nn/modules/cosface.py
+++ b/catalyst/contrib/nn/modules/cosface.py
@@ -48,7 +48,7 @@ def __init__(
         nn.init.xavier_uniform_(self.weight)
 
     def __repr__(self) -> str:
-        """CosFace representation."""
+        """Object representation."""
         return "CosFace(in_features={},out_features={},s={},m={})".format(
             self.in_features, self.out_features, self.s, self.m
         )
diff --git a/catalyst/contrib/nn/modules/softmax.py b/catalyst/contrib/nn/modules/softmax.py
index 26ec19e487..49d84e716d 100644
--- a/catalyst/contrib/nn/modules/softmax.py
+++ b/catalyst/contrib/nn/modules/softmax.py
@@ -33,7 +33,7 @@ def __init__(self, in_features: int, num_classes: int):
         nn.init.zeros_(self.bias)
 
     def __repr__(self) -> str:
-        """SoftMax representation."""
+        """"Object representation."""
         return "SoftMax(in_features={},out_features={})".format(
             self.in_features, self.out_features
         )

From 0887d13181a026de7d99ad15ca268014b8bd48bb Mon Sep 17 00:00:00 2001
From: Dmytro Doroshenko <dimdoroshenko@gmail.com>
Date: Sun, 20 Sep 2020 20:58:43 +0300
Subject: [PATCH 11/23] and another docs fix

---
 catalyst/contrib/nn/modules/softmax.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/catalyst/contrib/nn/modules/softmax.py b/catalyst/contrib/nn/modules/softmax.py
index 49d84e716d..f17c5aa3ef 100644
--- a/catalyst/contrib/nn/modules/softmax.py
+++ b/catalyst/contrib/nn/modules/softmax.py
@@ -33,7 +33,7 @@ def __init__(self, in_features: int, num_classes: int):
         nn.init.zeros_(self.bias)
 
     def __repr__(self) -> str:
-        """"Object representation."""
+        """Object representation."""
         return "SoftMax(in_features={},out_features={})".format(
             self.in_features, self.out_features
         )

From e95ac6fa9df15f64e420a8766b03f92c22f1e061 Mon Sep 17 00:00:00 2001
From: Dmytro Doroshenko <dimdoroshenko@gmail.com>
Date: Fri, 25 Sep 2020 22:17:24 +0300
Subject: [PATCH 12/23] fixed arcface

---
 catalyst/contrib/nn/modules/arcface.py    | 41 +++++++++++++----------
 catalyst/contrib/nn/modules/cosface.py    |  9 +++--
 catalyst/contrib/nn/modules/softmax.py    |  9 +++--
 catalyst/contrib/nn/tests/test_modules.py | 15 ++++-----
 4 files changed, 44 insertions(+), 30 deletions(-)

diff --git a/catalyst/contrib/nn/modules/arcface.py b/catalyst/contrib/nn/modules/arcface.py
index 88cbaac625..f98d3df51a 100644
--- a/catalyst/contrib/nn/modules/arcface.py
+++ b/catalyst/contrib/nn/modules/arcface.py
@@ -28,6 +28,7 @@ def __init__(
         out_features: int,
         s: float = 64.0,
         m: float = 0.5,
+        eps: float = 1e-6,
     ):
         """
         Args:
@@ -37,17 +38,16 @@ def __init__(
                 Default: ``64.0``.
             m (float, optional): margin.
                 Default: ``0.5``.
+            eps (float, optional): operation accuracy.
+                Default: ``1e-6``.
         """
         super(ArcFace, self).__init__()
         self.in_features = in_features
         self.out_features = out_features
         self.s = s
         self.m = m
-
-        self.cos_m = math.cos(m)
-        self.sin_m = math.sin(m)
-        self.th = math.cos(math.pi - m)
-        self.mm = math.sin(math.pi - m) * m
+        self.threshold = math.pi - m
+        self.eps = eps
 
         self.weight = nn.Parameter(
             torch.FloatTensor(out_features, in_features)
@@ -56,8 +56,8 @@ def __init__(
 
     def __repr__(self) -> str:
         """Object representation."""
-        return "ArcFace(in_features={},out_features={},s={},m={})".format(
-            self.in_features, self.out_features, self.s, self.m
+        return "ArcFace(in_features={},out_features={},s={},m={},eps={})".format(
+            self.in_features, self.out_features, self.s, self.m, self.eps
         )
 
     def forward(
@@ -66,22 +66,29 @@ def forward(
         """
         Args:
             input (torch.Tensor): input features,
-                expected shapes BxF.
+                expected shapes ``BxF`` where ``B``
+                is batch dimension and ``F`` is an
+                input feature dimension.
             target (torch.Tensor): target classes,
-                expected shapes B.
+                expected shapes ``B`` where
+                ``B`` is batch dimension.
 
         Returns:
-            torch.Tensor with loss value.
+            logits tensor with shapes ``BxC`` where C is a number of classes.
         """
-        cosine = F.linear(F.normalize(input), F.normalize(self.weight))
-        sine = torch.sqrt(1.0 - torch.pow(cosine, 2))
-        phi = cosine * self.cos_m - sine * self.sin_m
-
-        phi = torch.where(cosine > self.th, phi, cosine - self.mm)
+        cos_theta = F.linear(F.normalize(input), F.normalize(self.weight))
+        theta = torch.acos(
+            torch.clamp(cos_theta, -1.0 + self.eps, 1.0 - self.eps)
+        )
 
-        one_hot = torch.zeros(cosine.size()).to(input.device)
+        one_hot = torch.zeros_like(cos_theta, device=input.device)
         one_hot.scatter_(1, target.view(-1, 1).long(), 1)
-        logits = (one_hot * phi) + ((1.0 - one_hot) * cosine)
+
+        mask = torch.where(
+            theta > self.threshold, torch.zeros_like(one_hot), one_hot
+        )
+
+        logits = torch.cos(torch.where(mask.bool(), theta + self.m, theta))
         logits *= self.s
 
         return logits
diff --git a/catalyst/contrib/nn/modules/cosface.py b/catalyst/contrib/nn/modules/cosface.py
index cb51ec5fd7..5a9847dafa 100644
--- a/catalyst/contrib/nn/modules/cosface.py
+++ b/catalyst/contrib/nn/modules/cosface.py
@@ -59,12 +59,15 @@ def forward(
         """
         Args:
             input (torch.Tensor): input features,
-                expected shapes BxF.
+                expected shapes ``BxF`` where ``B``
+                is batch dimension and ``F`` is an
+                input feature dimension.
             target (torch.Tensor): target classes,
-                expected shapes B.
+                expected shapes ``B`` where
+                ``B`` is batch dimension.
 
         Returns:
-            torch.Tensor with loss value.
+            logits tensor with shapes ``BxC`` where C is a number of classes.
         """
         cosine = F.linear(F.normalize(input), F.normalize(self.weight))
         phi = cosine - self.m
diff --git a/catalyst/contrib/nn/modules/softmax.py b/catalyst/contrib/nn/modules/softmax.py
index f17c5aa3ef..4a815afe25 100644
--- a/catalyst/contrib/nn/modules/softmax.py
+++ b/catalyst/contrib/nn/modules/softmax.py
@@ -42,9 +42,14 @@ def forward(self, input: torch.Tensor) -> torch.Tensor:
         """
         Args:
             input (torch.Tensor): input features,
-                expected shapes BxF.
+                expected shapes ``BxF`` where ``B``
+                is batch dimension and ``F`` is an
+                input feature dimension.
+            target (torch.Tensor): target classes,
+                expected shapes ``B`` where
+                ``B`` is batch dimension.
 
         Returns:
-            torch.Tensor with loss value.
+            logits tensor with shapes ``BxC`` where C is a number of classes.
         """
         return F.linear(input, self.weight, self.bias)
diff --git a/catalyst/contrib/nn/tests/test_modules.py b/catalyst/contrib/nn/tests/test_modules.py
index e971f1bfbe..a99ecd8140 100644
--- a/catalyst/contrib/nn/tests/test_modules.py
+++ b/catalyst/contrib/nn/tests/test_modules.py
@@ -61,6 +61,7 @@ def test_arcface_with_cross_entropy_loss():
     n_classes = 3
     s = 3.0
     m = 0.5
+    eps = 1e-8
 
     # fmt: off
     features = np.array(
@@ -81,7 +82,7 @@ def test_arcface_with_cross_entropy_loss():
     )
     # fmt: on
 
-    layer = ArcFace(emb_size, n_classes, s, m)
+    layer = ArcFace(emb_size, n_classes, s, m, eps)
     layer.weight.data = torch.from_numpy(weight)
     loss_fn = nn.CrossEntropyLoss(reduction="none")
 
@@ -89,14 +90,12 @@ def test_arcface_with_cross_entropy_loss():
     normalized_projection = normalize(weight)  # 3x4
 
     cosine = normalized_features @ normalized_projection.T  # 2x4 * 4x3 = 2x3
-    sine = np.sqrt(1 - np.power(cosine, 2))  # 2x3
-    phi = cosine * np.cos(m) - sine * np.sin(m)  # 2x3
-    phi = np.where(
-        cosine > np.cos(np.pi - m), phi, cosine - np.sin(np.pi - m) * m
-    )  # 2x3
+    theta = np.arccos(np.clip(cosine, -1 + eps, 1 - eps))  # 2x3
 
-    mask = np.array([[1, 0, 0], [0, 0, 1]], dtype="l")  # one_hot(target)
-    feats = (mask * phi + (1.0 - mask) * cosine) * s  # 2x3
+    # one_hot(target)
+    mask = np.array([[1, 0, 0], [0, 0, 1]], dtype="l")
+    mask = np.where(theta > (np.pi - m), np.zeros_like(mask), mask)  # 2x3
+    feats = np.cos(np.where(mask > 0, theta + m, theta)) * s  # 2x3
 
     expected_loss = cross_entropy(feats, mask, 1)
     actual = (

From 6c5c5d9cdf8e33d45183abec6c308651df8e43c4 Mon Sep 17 00:00:00 2001
From: Dmytro Doroshenko <dimdoroshenko@gmail.com>
Date: Sat, 26 Sep 2020 00:21:38 +0300
Subject: [PATCH 13/23] fix: docs

---
 catalyst/contrib/nn/modules/arcface.py | 3 ++-
 catalyst/contrib/nn/modules/cosface.py | 3 ++-
 catalyst/contrib/nn/modules/softmax.py | 3 ++-
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/catalyst/contrib/nn/modules/arcface.py b/catalyst/contrib/nn/modules/arcface.py
index f98d3df51a..57af46db9a 100644
--- a/catalyst/contrib/nn/modules/arcface.py
+++ b/catalyst/contrib/nn/modules/arcface.py
@@ -74,7 +74,8 @@ def forward(
                 ``B`` is batch dimension.
 
         Returns:
-            logits tensor with shapes ``BxC`` where C is a number of classes.
+            logits tensor with shapes ``BxC``
+            where C is a number of classes.
         """
         cos_theta = F.linear(F.normalize(input), F.normalize(self.weight))
         theta = torch.acos(
diff --git a/catalyst/contrib/nn/modules/cosface.py b/catalyst/contrib/nn/modules/cosface.py
index 5a9847dafa..2dcb68a38e 100644
--- a/catalyst/contrib/nn/modules/cosface.py
+++ b/catalyst/contrib/nn/modules/cosface.py
@@ -67,7 +67,8 @@ def forward(
                 ``B`` is batch dimension.
 
         Returns:
-            logits tensor with shapes ``BxC`` where C is a number of classes.
+            logits tensor with shapes ``BxC``
+            where C is a number of classes.
         """
         cosine = F.linear(F.normalize(input), F.normalize(self.weight))
         phi = cosine - self.m
diff --git a/catalyst/contrib/nn/modules/softmax.py b/catalyst/contrib/nn/modules/softmax.py
index 4a815afe25..e05d44f6f1 100644
--- a/catalyst/contrib/nn/modules/softmax.py
+++ b/catalyst/contrib/nn/modules/softmax.py
@@ -50,6 +50,7 @@ def forward(self, input: torch.Tensor) -> torch.Tensor:
                 ``B`` is batch dimension.
 
         Returns:
-            logits tensor with shapes ``BxC`` where C is a number of classes.
+            logits tensor with shapes ``BxC``
+            where C is a number of classes.
         """
         return F.linear(input, self.weight, self.bias)

From e27db82f35733e6e439c146ffb1ca74bf4f0669d Mon Sep 17 00:00:00 2001
From: Dmytro Doroshenko <dimdoroshenko@gmail.com>
Date: Sat, 26 Sep 2020 02:31:25 +0300
Subject: [PATCH 14/23] fixed docs

---
 catalyst/contrib/nn/modules/arcface.py | 18 ++++++++++++++----
 catalyst/contrib/nn/modules/cosface.py |  4 ++--
 catalyst/contrib/nn/modules/softmax.py |  7 ++-----
 3 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/catalyst/contrib/nn/modules/arcface.py b/catalyst/contrib/nn/modules/arcface.py
index 57af46db9a..4966ec653b 100644
--- a/catalyst/contrib/nn/modules/arcface.py
+++ b/catalyst/contrib/nn/modules/arcface.py
@@ -56,8 +56,18 @@ def __init__(
 
     def __repr__(self) -> str:
         """Object representation."""
-        return "ArcFace(in_features={},out_features={},s={},m={},eps={})".format(
-            self.in_features, self.out_features, self.s, self.m, self.eps
+        return (
+            "ArcFace("
+            + ",".join(
+                [
+                    f"in_features={self.in_features}",
+                    f"out_features={self.out_features}",
+                    f"s={self.s}",
+                    f"m={self.m}",
+                    f"eps={self.eps}",
+                ]
+            )
+            + ")"
         )
 
     def forward(
@@ -74,8 +84,8 @@ def forward(
                 ``B`` is batch dimension.
 
         Returns:
-            logits tensor with shapes ``BxC``
-            where C is a number of classes.
+            tensor (logits) with shapes ``BxC``
+            where ``C`` is a number of classes.
         """
         cos_theta = F.linear(F.normalize(input), F.normalize(self.weight))
         theta = torch.acos(
diff --git a/catalyst/contrib/nn/modules/cosface.py b/catalyst/contrib/nn/modules/cosface.py
index 2dcb68a38e..82ee6b2bdb 100644
--- a/catalyst/contrib/nn/modules/cosface.py
+++ b/catalyst/contrib/nn/modules/cosface.py
@@ -67,8 +67,8 @@ def forward(
                 ``B`` is batch dimension.
 
         Returns:
-            logits tensor with shapes ``BxC``
-            where C is a number of classes.
+            tensor (logits) with shapes ``BxC``
+            where ``C`` is a number of classes.
         """
         cosine = F.linear(F.normalize(input), F.normalize(self.weight))
         phi = cosine - self.m
diff --git a/catalyst/contrib/nn/modules/softmax.py b/catalyst/contrib/nn/modules/softmax.py
index e05d44f6f1..72d07dbb34 100644
--- a/catalyst/contrib/nn/modules/softmax.py
+++ b/catalyst/contrib/nn/modules/softmax.py
@@ -45,12 +45,9 @@ def forward(self, input: torch.Tensor) -> torch.Tensor:
                 expected shapes ``BxF`` where ``B``
                 is batch dimension and ``F`` is an
                 input feature dimension.
-            target (torch.Tensor): target classes,
-                expected shapes ``B`` where
-                ``B`` is batch dimension.
 
         Returns:
-            logits tensor with shapes ``BxC``
-            where C is a number of classes.
+            tensor (logits) with shapes ``BxC``
+            where ``C`` is a number of classes.
         """
         return F.linear(input, self.weight, self.bias)

From ef0d14712923958c1b774fcd85b1d2bba430d047 Mon Sep 17 00:00:00 2001
From: Dmytro Doroshenko <dimdoroshenko@gmail.com>
Date: Wed, 30 Sep 2020 23:19:54 +0300
Subject: [PATCH 15/23] new docs format & SubCenterArcFace

---
 catalyst/contrib/nn/modules/__init__.py |   2 +-
 catalyst/contrib/nn/modules/arcface.py  | 122 +++++++++++++++++++++---
 catalyst/contrib/nn/modules/cosface.py  |  21 ++--
 catalyst/contrib/nn/modules/softmax.py  |   9 +-
 4 files changed, 124 insertions(+), 30 deletions(-)

diff --git a/catalyst/contrib/nn/modules/__init__.py b/catalyst/contrib/nn/modules/__init__.py
index 96401825eb..8ec4d226c7 100644
--- a/catalyst/contrib/nn/modules/__init__.py
+++ b/catalyst/contrib/nn/modules/__init__.py
@@ -33,5 +33,5 @@
 )
 
 from catalyst.contrib.nn.modules.softmax import SoftMax
-from catalyst.contrib.nn.modules.arcface import ArcFace
+from catalyst.contrib.nn.modules.arcface import ArcFace, SubCenterArcFace
 from catalyst.contrib.nn.modules.cosface import CosFace
diff --git a/catalyst/contrib/nn/modules/arcface.py b/catalyst/contrib/nn/modules/arcface.py
index 4966ec653b..1a8e7c2908 100644
--- a/catalyst/contrib/nn/modules/arcface.py
+++ b/catalyst/contrib/nn/modules/arcface.py
@@ -6,11 +6,21 @@
 
 
 class ArcFace(nn.Module):
-    """Implementation of ArcFace loss for metric learning.
+    """Implementation of `ArcFace: Additive Angular Margin Loss for Deep Face Recognition`_.
 
-    .. _ArcFace: Additive Angular Margin Loss for Deep Face Recognition:
+    .. _ArcFace\: Additive Angular Margin Loss for Deep Face Recognition:
         https://arxiv.org/abs/1801.07698v1
 
+    Args:
+        in_features: size of each input sample.
+        out_features: size of each output sample.
+        s: norm of input feature.
+            Default: ``64.0``.
+        m: margin.
+            Default: ``0.5``.
+        eps: margin used to clamp cosine values before ``acos``.
+            Default: ``1e-6``.
+
     Example:
         >>> layer = ArcFace(5, 10, s=1.31, m=0.5)
         >>> loss_fn = nn.CrosEntropyLoss()
@@ -30,17 +40,6 @@ def __init__(
         m: float = 0.5,
         eps: float = 1e-6,
     ):
-        """
-        Args:
-            in_features (int): size of each input sample.
-            out_features (int): size of each output sample.
-            s (float, optional): norm of input feature,
-                Default: ``64.0``.
-            m (float, optional): margin.
-                Default: ``0.5``.
-            eps (float, optional): operation accuracy.
-                Default: ``1e-6``.
-        """
         super(ArcFace, self).__init__()
         self.in_features = in_features
         self.out_features = out_features
@@ -103,3 +102,100 @@ def forward(
         logits *= self.s
 
         return logits
+
+
+class SubCenterArcFace(nn.Module):
+    """Implementation of `Sub-center ArcFace: Boosting Face Recognition by Large-scale Noisy Web Faces`_.
+
+    .. _Sub-center ArcFace\: Boosting Face Recognition by Large-scale Noisy Web Faces:
+        https://ibug.doc.ic.ac.uk/media/uploads/documents/eccv_1445.pdf
+
+    Args:
+        in_features: size of each input sample.
+        out_features: size of each output sample.
+        s: norm of input feature.
+            Default: ``64.0``.
+        m: margin.
+            Default: ``0.5``.
+        k: number of possible class centroids.
+            Default: ``3``.
+        eps: margin used to clamp cosine values before ``acos``.
+            Default: ``1e-6``.
+
+    Example:
+        >>> layer = SubCenterArcFace(5, 10, s=1.31, m=0.35, k=2)
+        >>> loss_fn = nn.CrossEntropyLoss()
+        >>> embedding = torch.randn(3, 5, requires_grad=True)
+        >>> target = torch.empty(3, dtype=torch.long).random_(5)
+        >>> output = layer(embedding, target)
+        >>> loss = loss_fn(output, target)
+        >>> loss.backward()
+
+    """
+
+    def __init__(
+        self,
+        in_features: int,
+        out_features: int,
+        s: float = 64.0,
+        m: float = 0.50,
+        k: int = 3,
+        eps: float = 1e-6,
+    ):
+        super(SubCenterArcFace, self).__init__()
+        self.in_features = in_features
+        self.out_features = out_features
+
+        self.s = s
+        self.m = m
+        self.k = k
+        self.eps = eps
+
+        self.weight = nn.Parameter(
+            torch.FloatTensor(k, in_features, out_features)
+        )
+        nn.init.xavier_uniform_(self.weight)
+
+        self.threshold = math.pi - self.m
+
+    def __repr__(self) -> str:
+        """Object representation."""
+        return (
+            "SubCenterArcFace("
+            + ",".join(
+                [
+                    f"in_features={self.in_features}",
+                    f"out_features={self.out_features}",
+                    f"s={self.s}",
+                    f"m={self.m}",
+                    f"k={self.k}",
+                    f"eps={self.eps}",
+                ]
+            )
+            + ")"
+        )
+
+    def forward(self, input, label):
+        cos_theta = torch.bmm(
+            F.normalize(input)
+            .unsqueeze(0)
+            .expand(self.k, *input.shape),  # k*b*f
+            F.normalize(
+                self.weight, dim=1
+            ),  # normalize in_features dim   # k*f*c
+        )  # k*b*c
+        cos_theta = torch.max(cos_theta, dim=0)[0]  # b*c
+        theta = torch.acos(
+            torch.clamp(cos_theta, -1.0 + self.eps, 1.0 - self.eps)
+        )
+
+        one_hot = torch.zeros(cos_theta.size()).to(input.device)
+        one_hot.scatter_(1, label.view(-1, 1).long(), 1)
+
+        selected = torch.where(
+            theta > self.threshold, torch.zeros_like(one_hot), one_hot
+        )
+
+        output = torch.cos(torch.where(selected.bool(), theta + self.m, theta))
+        output *= self.s
+        return output
diff --git a/catalyst/contrib/nn/modules/cosface.py b/catalyst/contrib/nn/modules/cosface.py
index 82ee6b2bdb..6c159029b1 100644
--- a/catalyst/contrib/nn/modules/cosface.py
+++ b/catalyst/contrib/nn/modules/cosface.py
@@ -4,11 +4,19 @@
 
 
 class CosFace(nn.Module):
-    """Implementation of CosFace loss for metric learning.
+    """Implementation of `CosFace\: Large Margin Cosine Loss for Deep Face Recognition`_.
 
-    .. _CosFace: Large Margin Cosine Loss for Deep Face Recognition:
+    .. _CosFace\: Large Margin Cosine Loss for Deep Face Recognition:
         https://arxiv.org/abs/1801.09414
 
+    Args:
+        in_features: size of each input sample.
+        out_features: size of each output sample.
+        s: norm of input feature.
+            Default: ``64.0``.
+        m: margin.
+            Default: ``0.35``.
+
     Example:
         >>> layer = CosFaceLoss(5, 10, s=1.31, m=0.1)
         >>> loss_fn = nn.CrosEntropyLoss()
@@ -27,15 +35,6 @@ def __init__(
         s: float = 64.0,
         m: float = 0.35,
     ):
-        """
-        Args:
-            in_features (int): size of each input sample.
-            out_features (int): size of each output sample.
-            s (float, optional): norm of input feature,
-                Default: ``64.0``.
-            m (float, optional): margin.
-                Default: ``0.35``.
-        """
         super(CosFace, self).__init__()
         self.in_features = in_features
         self.out_features = out_features
diff --git a/catalyst/contrib/nn/modules/softmax.py b/catalyst/contrib/nn/modules/softmax.py
index 72d07dbb34..6d881b4039 100644
--- a/catalyst/contrib/nn/modules/softmax.py
+++ b/catalyst/contrib/nn/modules/softmax.py
@@ -6,6 +6,10 @@
 class SoftMax(nn.Module):
     """Implementation of SoftMax head for metric learning.
 
+    Args:
+        in_features (int): size of each input sample.
+        num_classes (int): size of each output sample.
+
     Example:
         >>> layer = SoftMax()
         >>> loss_fn = nn.CrosEntropyLoss()
@@ -18,11 +22,6 @@ class SoftMax(nn.Module):
     """
 
     def __init__(self, in_features: int, num_classes: int):
-        """
-        Args:
-            in_features (int): size of each input sample.
-            num_classes (int): size of each output sample.
-        """
         super(SoftMax, self).__init__()
         self.in_features = in_features
         self.out_features = num_classes

From 46508aad9203a46430638507b4e32f2aa9ed69a7 Mon Sep 17 00:00:00 2001
From: Dmytro Doroshenko <dimdoroshenko@gmail.com>
Date: Wed, 30 Sep 2020 23:21:00 +0300
Subject: [PATCH 16/23] arcface title

---
 docs/api/contrib.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/api/contrib.rst b/docs/api/contrib.rst
index c238f7c07f..d48bbbb9a7 100644
--- a/docs/api/contrib.rst
+++ b/docs/api/contrib.rst
@@ -250,7 +250,7 @@ Wing
 Modules
 ~~~~~~~~~~~~~~~~
 
-ArcFace: Additive Angular Margin Loss for Deep Face Recognition
+ArcFace and SubCenterArcFace
 """""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
 .. automodule:: catalyst.contrib.nn.modules.arcface
     :members:

From ed6a1c51060ba803d9c90c618073c67625d0f11f Mon Sep 17 00:00:00 2001
From: Dmytro Doroshenko <dimdoroshenko@gmail.com>
Date: Wed, 30 Sep 2020 23:54:13 +0300
Subject: [PATCH 17/23] docs

---
 catalyst/contrib/nn/modules/arcface.py | 22 ++++++++++++++++++----
 catalyst/contrib/nn/modules/cosface.py | 11 +++++++++--
 catalyst/contrib/nn/modules/softmax.py |  8 +++++++-
 3 files changed, 34 insertions(+), 7 deletions(-)

diff --git a/catalyst/contrib/nn/modules/arcface.py b/catalyst/contrib/nn/modules/arcface.py
index 1a8e7c2908..0a4b539c06 100644
--- a/catalyst/contrib/nn/modules/arcface.py
+++ b/catalyst/contrib/nn/modules/arcface.py
@@ -6,7 +6,8 @@
 
 
 class ArcFace(nn.Module):
-    """Implementation of `ArcFace: Additive Angular Margin Loss for Deep Face Recognition`_.
+    """Implementation of
+    `ArcFace: Additive Angular Margin Loss for Deep Face Recognition`_.
 
     .. _ArcFace\: Additive Angular Margin Loss for Deep Face Recognition:
         https://arxiv.org/abs/1801.07698v1
@@ -21,6 +22,12 @@ class ArcFace(nn.Module):
         eps: operation accuracy.
             Default: ``1e-6``.
 
+    Shape:
+        - Input: :math:`(batch, H_{in})` where
+          :math:`H_{in} = in\_features`.
+        - Output: :math:`(batch, H_{out})` where
+          :math:`H_{out} = out\_features`.
+
     Example:
         >>> layer = ArcFace(5, 10, s=1.31, m=0.5)
         >>> loss_fn = nn.CrosEntropyLoss()
@@ -39,7 +46,7 @@ def __init__(
         s: float = 64.0,
         m: float = 0.5,
         eps: float = 1e-6,
-    ):
+    ):  # noqa: D107
         super(ArcFace, self).__init__()
         self.in_features = in_features
         self.out_features = out_features
@@ -105,7 +112,8 @@ def forward(
 
 
 class SubCenterArcFace(nn.Module):
-    """Implementation of `Sub-center ArcFace: Boosting Face Recognition by Large-scale Noisy Web Faces`_.
+    """Implementation of
+    `Sub-center ArcFace: Boosting Face Recognition by Large-scale Noisy Web Faces`_.
 
     .. _Sub-center ArcFace\: Boosting Face Recognition by Large-scale Noisy Web Faces:
         https://ibug.doc.ic.ac.uk/media/uploads/documents/eccv_1445.pdf
@@ -122,6 +130,12 @@ class SubCenterArcFace(nn.Module):
         eps (float, optional): operation accuracy.
             Default: ``1e-6``.
 
+    Shape:
+        - Input: :math:`(batch, H_{in})` where
+          :math:`H_{in} = in\_features`.
+        - Output: :math:`(batch, H_{out})` where
+          :math:`H_{out} = out\_features`.
+
     Example:
         >>> layer = SubCenterArcFace(5, 10, s=1.31, m=0.35, k=2)
         >>> loss_fn = nn.CrosEntropyLoss()
@@ -141,7 +155,7 @@ def __init__(
         m: float = 0.50,
         k: int = 3,
         eps: float = 1e-6,
-    ):
+    ):  # noqa: D107
         super(SubCenterArcFace, self).__init__()
         self.in_features = in_features
         self.out_features = out_features
diff --git a/catalyst/contrib/nn/modules/cosface.py b/catalyst/contrib/nn/modules/cosface.py
index 6c159029b1..66b63bd4d3 100644
--- a/catalyst/contrib/nn/modules/cosface.py
+++ b/catalyst/contrib/nn/modules/cosface.py
@@ -4,7 +4,8 @@
 
 
 class CosFace(nn.Module):
-    """Implementation of `CosFace\: Large Margin Cosine Loss for Deep Face Recognition`_.
+    """Implementation of
+    `CosFace\: Large Margin Cosine Loss for Deep Face Recognition`_.
 
     .. _CosFace\: Large Margin Cosine Loss for Deep Face Recognition:
         https://arxiv.org/abs/1801.09414
@@ -17,6 +18,12 @@ class CosFace(nn.Module):
         m: margin.
             Default: ``0.35``.
 
+    Shape:
+        - Input: :math:`(batch, H_{in})` where
+          :math:`H_{in} = in\_features`.
+        - Output: :math:`(batch, H_{out})` where
+          :math:`H_{out} = out\_features`.
+
     Example:
         >>> layer = CosFaceLoss(5, 10, s=1.31, m=0.1)
         >>> loss_fn = nn.CrosEntropyLoss()
@@ -34,7 +41,7 @@ def __init__(
         out_features: int,
         s: float = 64.0,
         m: float = 0.35,
-    ):
+    ):  # noqa: D107
         super(CosFace, self).__init__()
         self.in_features = in_features
         self.out_features = out_features
diff --git a/catalyst/contrib/nn/modules/softmax.py b/catalyst/contrib/nn/modules/softmax.py
index 6d881b4039..bcf2644181 100644
--- a/catalyst/contrib/nn/modules/softmax.py
+++ b/catalyst/contrib/nn/modules/softmax.py
@@ -10,6 +10,12 @@ class SoftMax(nn.Module):
         in_features (int): size of each input sample.
         num_classes (int): size of each output sample.
 
+    Shape:
+        - Input: :math:`(batch, H_{in})` where
+          :math:`H_{in} = in\_features`.
+        - Output: :math:`(batch, H_{out})` where
+          :math:`H_{out} = out\_features`.
+
     Example:
         >>> layer = SoftMax()
         >>> loss_fn = nn.CrosEntropyLoss()
@@ -21,7 +27,7 @@ class SoftMax(nn.Module):
 
     """
 
-    def __init__(self, in_features: int, num_classes: int):
+    def __init__(self, in_features: int, num_classes: int):  # noqa: D107
         super(SoftMax, self).__init__()
         self.in_features = in_features
         self.out_features = num_classes

From c621648d5cc27975533e357b8eb81536e18c73b4 Mon Sep 17 00:00:00 2001
From: Dmytro Doroshenko <dimdoroshenko@gmail.com>
Date: Wed, 30 Sep 2020 23:55:28 +0300
Subject: [PATCH 18/23] docs for forward method

---
 catalyst/contrib/nn/modules/arcface.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/catalyst/contrib/nn/modules/arcface.py b/catalyst/contrib/nn/modules/arcface.py
index 0a4b539c06..1f5319d6ef 100644
--- a/catalyst/contrib/nn/modules/arcface.py
+++ b/catalyst/contrib/nn/modules/arcface.py
@@ -190,6 +190,20 @@ def __repr__(self) -> str:
         )
 
     def forward(self, input, label):
+        """
+        Args:
+            input (torch.Tensor): input features,
+                expected shapes ``BxF`` where ``B``
+                is batch dimension and ``F`` is an
+                input feature dimension.
+            target (torch.Tensor): target classes,
+                expected shapes ``B`` where
+                ``B`` is batch dimension.
+
+        Returns:
+            tensor (logits) with shapes ``BxC``
+            where ``C`` is a number of classes.
+        """
         cos_theta = torch.bmm(
             F.normalize(input)
             .unsqueeze(0)

From 90418ca2011cd4f10ab9269c4f7be5c624ef1c0e Mon Sep 17 00:00:00 2001
From: Dmytro Doroshenko <dimdoroshenko@gmail.com>
Date: Wed, 30 Sep 2020 23:58:19 +0300
Subject: [PATCH 19/23] typings & docs

---
 catalyst/contrib/nn/modules/arcface.py | 6 ++----
 catalyst/contrib/nn/modules/cosface.py | 4 +---
 catalyst/contrib/nn/modules/softmax.py | 2 +-
 3 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/catalyst/contrib/nn/modules/arcface.py b/catalyst/contrib/nn/modules/arcface.py
index 1f5319d6ef..8e93da081e 100644
--- a/catalyst/contrib/nn/modules/arcface.py
+++ b/catalyst/contrib/nn/modules/arcface.py
@@ -76,9 +76,7 @@ def __repr__(self) -> str:
             + ")"
         )
 
-    def forward(
-        self, input: torch.Tensor, target: torch.Tensor
-    ) -> torch.Tensor:
+    def forward(self, input, target):
         """
         Args:
             input (torch.Tensor): input features,
@@ -196,7 +194,7 @@ def forward(self, input, label):
                 expected shapes ``BxF`` where ``B``
                 is batch dimension and ``F`` is an
                 input feature dimension.
-            target (torch.Tensor): target classes,
+            label (torch.Tensor): target classes,
                 expected shapes ``B`` where
                 ``B`` is batch dimension.
 
diff --git a/catalyst/contrib/nn/modules/cosface.py b/catalyst/contrib/nn/modules/cosface.py
index 66b63bd4d3..72112cc363 100644
--- a/catalyst/contrib/nn/modules/cosface.py
+++ b/catalyst/contrib/nn/modules/cosface.py
@@ -59,9 +59,7 @@ def __repr__(self) -> str:
             self.in_features, self.out_features, self.s, self.m
         )
 
-    def forward(
-        self, input: torch.Tensor, target: torch.Tensor
-    ) -> torch.Tensor:
+    def forward(self, input, target):
         """
         Args:
             input (torch.Tensor): input features,
diff --git a/catalyst/contrib/nn/modules/softmax.py b/catalyst/contrib/nn/modules/softmax.py
index bcf2644181..bb3bc03677 100644
--- a/catalyst/contrib/nn/modules/softmax.py
+++ b/catalyst/contrib/nn/modules/softmax.py
@@ -43,7 +43,7 @@ def __repr__(self) -> str:
             self.in_features, self.out_features
         )
 
-    def forward(self, input: torch.Tensor) -> torch.Tensor:
+    def forward(self, input):
         """
         Args:
             input (torch.Tensor): input features,

From f4752df88f11ecd7e0bdc6567084253427a968f1 Mon Sep 17 00:00:00 2001
From: Dmytro Doroshenko <dimdoroshenko@gmail.com>
Date: Thu, 1 Oct 2020 00:00:09 +0300
Subject: [PATCH 20/23] moved noqa comment

---
 catalyst/contrib/nn/modules/arcface.py | 4 ++--
 catalyst/contrib/nn/modules/cosface.py | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/catalyst/contrib/nn/modules/arcface.py b/catalyst/contrib/nn/modules/arcface.py
index 8e93da081e..4be50a54ae 100644
--- a/catalyst/contrib/nn/modules/arcface.py
+++ b/catalyst/contrib/nn/modules/arcface.py
@@ -39,14 +39,14 @@ class ArcFace(nn.Module):
 
     """
 
-    def __init__(
+    def __init__(  # noqa: D107
         self,
         in_features: int,
         out_features: int,
         s: float = 64.0,
         m: float = 0.5,
         eps: float = 1e-6,
-    ):  # noqa: D107
+    ):
         super(ArcFace, self).__init__()
         self.in_features = in_features
         self.out_features = out_features
diff --git a/catalyst/contrib/nn/modules/cosface.py b/catalyst/contrib/nn/modules/cosface.py
index 72112cc363..7d46181ace 100644
--- a/catalyst/contrib/nn/modules/cosface.py
+++ b/catalyst/contrib/nn/modules/cosface.py
@@ -35,13 +35,13 @@ class CosFace(nn.Module):
 
     """
 
-    def __init__(
+    def __init__(  # noqa: D107
         self,
         in_features: int,
         out_features: int,
         s: float = 64.0,
         m: float = 0.35,
-    ):  # noqa: D107
+    ):
         super(CosFace, self).__init__()
         self.in_features = in_features
         self.out_features = out_features

From 63797404fb975c814300da3500ee20508250a3fb Mon Sep 17 00:00:00 2001
From: Dmytro Doroshenko <dimdoroshenko@gmail.com>
Date: Thu, 1 Oct 2020 00:05:59 +0300
Subject: [PATCH 21/23] fixed docs

---
 catalyst/contrib/nn/modules/arcface.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/catalyst/contrib/nn/modules/arcface.py b/catalyst/contrib/nn/modules/arcface.py
index 4be50a54ae..72536b61b4 100644
--- a/catalyst/contrib/nn/modules/arcface.py
+++ b/catalyst/contrib/nn/modules/arcface.py
@@ -111,9 +111,11 @@ def forward(self, input, target):
 
 class SubCenterArcFace(nn.Module):
     """Implementation of
-    `Sub-center ArcFace: Boosting Face Recognition by Large-scale Noisy Web Faces`_.
+    `Sub-center ArcFace: Boosting Face Recognition
+    by Large-scale Noisy Web Faces`_.
 
-    .. _Sub-center ArcFace\: Boosting Face Recognition by Large-scale Noisy Web Faces:
+    .. _Sub-center ArcFace\: Boosting Face Recognition \
+        by Large-scale Noisy Web Faces:
         https://ibug.doc.ic.ac.uk/media/uploads/documents/eccv_1445.pdf
 
     Args:
@@ -150,7 +152,7 @@ def __init__(
         in_features: int,
         out_features: int,
         s: float = 64.0,
-        m: float = 0.50,
+        m: float = 0.5,
         k: int = 3,
         eps: float = 1e-6,
     ):  # noqa: D107

From 1fdbc8917095caf79162e02a1610dfa1e32bfcab Mon Sep 17 00:00:00 2001
From: Dmytro Doroshenko <dimdoroshenko@gmail.com>
Date: Thu, 1 Oct 2020 00:07:02 +0300
Subject: [PATCH 22/23] fixed init docs

---
 catalyst/contrib/nn/modules/arcface.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/catalyst/contrib/nn/modules/arcface.py b/catalyst/contrib/nn/modules/arcface.py
index 72536b61b4..4c7bc0e107 100644
--- a/catalyst/contrib/nn/modules/arcface.py
+++ b/catalyst/contrib/nn/modules/arcface.py
@@ -147,7 +147,7 @@ class SubCenterArcFace(nn.Module):
 
     """
 
-    def __init__(
+    def __init__(  # noqa: D107
         self,
         in_features: int,
         out_features: int,
@@ -155,7 +155,7 @@ def __init__(
         m: float = 0.5,
         k: int = 3,
         eps: float = 1e-6,
-    ):  # noqa: D107
+    ):
         super(SubCenterArcFace, self).__init__()
         self.in_features = in_features
         self.out_features = out_features

From 800ae52c826ffdd55c6fe661d595d93f7ab509f0 Mon Sep 17 00:00:00 2001
From: Dmytro Doroshenko <dimdoroshenko@gmail.com>
Date: Fri, 2 Oct 2020 00:23:01 +0300
Subject: [PATCH 23/23] fixed examples

---
 catalyst/contrib/nn/modules/arcface.py | 11 ++++++-----
 catalyst/contrib/nn/modules/cosface.py |  2 +-
 catalyst/contrib/nn/modules/softmax.py | 16 +++++++++++-----
 3 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/catalyst/contrib/nn/modules/arcface.py b/catalyst/contrib/nn/modules/arcface.py
index 4c7bc0e107..74132f4be1 100644
--- a/catalyst/contrib/nn/modules/arcface.py
+++ b/catalyst/contrib/nn/modules/arcface.py
@@ -32,7 +32,7 @@ class ArcFace(nn.Module):
         >>> layer = ArcFace(5, 10, s=1.31, m=0.5)
         >>> loss_fn = nn.CrosEntropyLoss()
         >>> embedding = torch.randn(3, 5, requires_grad=True)
-        >>> target = torch.empty(3, dtype=torch.long).random_(5)
+        >>> target = torch.empty(3, dtype=torch.long).random_(10)
         >>> output = layer(embedding, target)
         >>> loss = loss_fn(output, target)
         >>> loss.backward()
@@ -140,7 +140,7 @@ class SubCenterArcFace(nn.Module):
         >>> layer = SubCenterArcFace(5, 10, s=1.31, m=0.35, k=2)
         >>> loss_fn = nn.CrosEntropyLoss()
         >>> embedding = torch.randn(3, 5, requires_grad=True)
-        >>> target = torch.empty(3, dtype=torch.long).random_(5)
+        >>> target = torch.empty(3, dtype=torch.long).random_(10)
         >>> output = layer(embedding, target)
         >>> loss = loss_fn(output, target)
         >>> loss.backward()
@@ -224,6 +224,7 @@ def forward(self, input, label):
             theta > self.threshold, torch.zeros_like(one_hot), one_hot
         )
 
-        output = torch.cos(torch.where(selected.bool(), theta + self.m, theta))
-        output *= self.s
-        return output
+        logits = torch.cos(torch.where(selected.bool(), theta + self.m, theta))
+        logits *= self.s
+
+        return logits
diff --git a/catalyst/contrib/nn/modules/cosface.py b/catalyst/contrib/nn/modules/cosface.py
index 7d46181ace..7e2ceb02f9 100644
--- a/catalyst/contrib/nn/modules/cosface.py
+++ b/catalyst/contrib/nn/modules/cosface.py
@@ -28,7 +28,7 @@ class CosFace(nn.Module):
         >>> layer = CosFaceLoss(5, 10, s=1.31, m=0.1)
         >>> loss_fn = nn.CrosEntropyLoss()
         >>> embedding = torch.randn(3, 5, requires_grad=True)
-        >>> target = torch.empty(3, dtype=torch.long).random_(5)
+        >>> target = torch.empty(3, dtype=torch.long).random_(10)
         >>> output = layer(embedding, target)
         >>> loss = loss_fn(output, target)
         >>> loss.backward()
diff --git a/catalyst/contrib/nn/modules/softmax.py b/catalyst/contrib/nn/modules/softmax.py
index bb3bc03677..7404b7f230 100644
--- a/catalyst/contrib/nn/modules/softmax.py
+++ b/catalyst/contrib/nn/modules/softmax.py
@@ -4,11 +4,17 @@
 
 
 class SoftMax(nn.Module):
-    """Implementation of SoftMax head for metric learning.
+    """Implementation of
+    `Significance of Softmax-based Features in Comparison to
+    Distance Metric Learning-based Features`_.
+
+    .. _Significance of Softmax-based Features in Comparison to \
+        Distance Metric Learning-based Features:
+        https://arxiv.org/abs/1712.10151
 
     Args:
-        in_features (int): size of each input sample.
-        num_classes (int): size of each output sample.
+        in_features: size of each input sample.
+        num_classes: size of each output sample.
 
     Shape:
         - Input: :math:`(batch, H_{in})` where
@@ -17,10 +23,10 @@ class SoftMax(nn.Module):
           :math:`H_{out} = out\_features`.
 
     Example:
-        >>> layer = SoftMax()
+        >>> layer = SoftMax(5, 10)
         >>> loss_fn = nn.CrosEntropyLoss()
         >>> embedding = torch.randn(3, 5, requires_grad=True)
-        >>> target = torch.empty(3, dtype=torch.long).random_(5)
+        >>> target = torch.empty(3, dtype=torch.long).random_(10)
         >>> output = layer(embedding, target)
         >>> loss = loss_fn(output, target)
         >>> loss.backward()