diff --git a/catalyst/core/engine.py b/catalyst/core/engine.py
index 567cf70253..a59c6ba11a 100644
--- a/catalyst/core/engine.py
+++ b/catalyst/core/engine.py
@@ -6,7 +6,7 @@


 @contextmanager
-def nullcontext(enter_result=None):
+def nullcontext(enter_result: Any = None):
     """Context handler."""
     yield enter_result

@@ -37,8 +37,7 @@ def rank(self) -> int:
     @property
     @abstractmethod
     def world_size(self) -> int:
-        """Process world size for distributed training."""
-        # only for ddp
+        """Process world size for distributed training."""
         pass

     @property
@@ -49,26 +48,28 @@ def is_ddp(self) -> bool:
     @property
     def is_master_process(self) -> bool:
         """Checks if a process is master process.
-        Should be implemented only for DDP setup in other cases should always return True.
+        Should be implemented only for distributed training (ddp).
+        For non-distributed training it should always return `True`.

         Returns:
-            `True` if current process is a master process, otherwise `False`.
+            `True` if the current process is the master process, otherwise `False`.
         """
         return True

     @property
     def is_worker_process(self) -> bool:
         """Checks if a process is worker process.
-        Should be implemented only for DDP setup in other cases should always return False.
+        Should be implemented only for distributed training (ddp).
+        For non-distributed training it should always return `False`.

         Returns:
-            `True` if current process is a worker process, otherwise `False`.
+            `True` if the current process is a worker process, otherwise `False`.
         """
         return False

     @abstractmethod
     def sync_device(self, tensor_or_module: Any) -> Any:
-        """Moves ``tensor_or_module`` to Engine's deivce.
+        """Moves ``tensor_or_module`` to Engine's device.

         Args:
             tensor_or_module: tensor to mode
@@ -89,23 +90,50 @@ def init_components(

     @abstractmethod
     def deinit_components(self):
-        """Deinits the runs components."""
-        # only for ddp
+        """Deinits the run's components.
+        In distributed mode it should destroy the process group.
+        """
         pass

     @abstractmethod
     def zero_grad(self, loss, model, optimizer) -> None:
-        """Abstraction over ``model.zero_grad()`` step."""
+        """Abstraction over ``model.zero_grad()`` step.
+        Should be overloaded if you need to pass arguments
+        to ``model.zero_grad()``, such as ``set_to_none=True``,
+        or to use a custom scheme that replaces/improves
+        the ``.zero_grad()`` method.
+
+        Args:
+            loss: tensor with loss value.
+            model: model module.
+            optimizer: model optimizer.
+        """
         pass

     @abstractmethod
     def backward_loss(self, loss, model, optimizer) -> None:
-        """Abstraction over ``loss.backward()`` step."""
+        """Abstraction over ``loss.backward()`` step.
+        Should be overloaded when loss scaling is required,
+        for example with APEX or AMP.
+
+        Args:
+            loss: tensor with loss value.
+            model: model module.
+            optimizer: model optimizer.
+        """
         pass

     @abstractmethod
     def optimizer_step(self, loss, model, optimizer) -> None:
-        """Abstraction over ``optimizer.step()`` step."""
+        """Abstraction over ``optimizer.step()`` step.
+        Should be overloaded when gradient scaling is required,
+        for example with AMP.
+
+        Args:
+            loss: tensor with loss value.
+            model: model module.
+            optimizer: model optimizer.
+        """
         pass

     @abstractmethod
@@ -174,7 +202,8 @@ def load_checkpoint(self, path: str) -> Dict:
         pass

     def autocast(self, *args, **kwargs):
-        """AMP scaling context. Default autocast context does not scale anything.
+        """AMP scaling context.
+        Default autocast context does not scale anything.

         Args:
             *args: some args
diff --git a/catalyst/engines/amp.py b/catalyst/engines/amp.py
index 5db1b91a9c..1d0f850e3b 100644
--- a/catalyst/engines/amp.py
+++ b/catalyst/engines/amp.py
@@ -10,6 +10,35 @@ class AMPEngine(DeviceEngine):

     Args:
         device: used device, default is `"cuda"`.
+
+    Examples:
+
+    .. code-block:: python
+
+        from catalyst import dl
+
+        class MyRunner(dl.IRunner):
+            # ...
+            def get_engine(self):
+                return dl.AMPEngine("cuda:1")
+            # ...
+
+    .. code-block:: yaml
+
+        args:
+            logs: ...
+
+        model:
+            _target_: ...
+            ...
+
+        engine:
+            _target_: AMPEngine
+            device: cuda:1
+
+        stages:
+            ...
+
     """

     def __init__(self, device: str = "cuda"):
@@ -36,7 +65,36 @@ def autocast(self):


 class DataParallelAMPEngine(AMPEngine):
-    """AMP multi-gpu training device engine."""
+    """AMP multi-gpu training device engine.
+
+    Examples:
+
+    .. code-block:: python
+
+        from catalyst import dl
+
+        class MyRunner(dl.IRunner):
+            # ...
+            def get_engine(self):
+                return dl.DataParallelAMPEngine()
+            # ...
+
+    .. code-block:: yaml
+
+        args:
+            logs: ...
+
+        model:
+            _target_: ...
+            ...
+
+        engine:
+            _target_: DataParallelAMPEngine
+
+        stages:
+            ...
+
+    """

     def __init__(self):
         """Init."""
@@ -75,10 +133,42 @@ class DistributedDataParallelAMPEngine(DistributedDataParallelEngine):
     """Distributed AMP multi-gpu training device engine.

     Args:
-        address: process address to use (required for PyTorch backend), default is `"localhost"`.
-        port: process port to listen (required for PyTorch backend), default is `"12345"`.
-        backend: multiprocessing backend to use, default is `"nccl"`.
+        address: process address to use
+            (required for PyTorch backend), default is `"localhost"`.
+        port: process port to listen
+            (required for PyTorch backend), default is `"12345"`.
+        backend: multiprocessing backend to use,
+            default is `"nccl"`.
         world_size: number of processes.
+
+    Examples:
+
+    .. code-block:: python
+
+        from catalyst import dl
+
+        class MyRunner(dl.IRunner):
+            # ...
+            def get_engine(self):
+                return dl.DistributedDataParallelAMPEngine(port=12345)
+            # ...
+
+    .. code-block:: yaml
+
+        args:
+            logs: ...
+
+        model:
+            _target_: ...
+            ...
+
+        engine:
+            _target_: DistributedDataParallelAMPEngine
+            port: 12345
+
+        stages:
+            ...
+
     """

     def __init__(
diff --git a/catalyst/engines/apex.py b/catalyst/engines/apex.py
index 56aade0c7f..c46609afad 100644
--- a/catalyst/engines/apex.py
+++ b/catalyst/engines/apex.py
@@ -126,23 +126,55 @@ class APEXEngine(DeviceEngine):

     Args:
         device: use device, default is `"cuda"`.
-        opt_level: optimization level, should be one of "O0", "O1", "O2", "O3" or "O4".
+        opt_level: optimization level, should be one of ``"O0"``,
+            ``"O1"``, ``"O2"`` or ``"O3"``.

-            - "O0" - no-op training
-            - "O1" - mixed precision (FP16) training (default)
-            - "O2" - "almost" mixed precision training
-            - "O3" - another implementation of mixed precision training
+            - ``"O0"`` - no-op training
+            - ``"O1"`` - mixed precision (FP16) training (default)
+            - ``"O2"`` - "almost" mixed precision training
+            - ``"O3"`` - another implementation of mixed precision training

             Details about levels can be found here:
             https://nvidia.github.io/apex/amp.html#opt-levels

-        keep_batchnorm_fp32: To enhance precision and enable cudnn batchnorm
+        keep_batchnorm_fp32: To enhance precision and enable cuDNN batchnorm
             (which improves performance),
             it’s often beneficial to keep batchnorm weights in FP32
             even if the rest of the model is FP16.
         loss_scale: If loss_scale is a float value,
-            use this value as the static (fixed) loss scale. If loss_scale is the string "dynamic",
+            use this value as the static (fixed) loss scale.
+            If loss_scale is the string "dynamic",
             adaptively adjust the loss scale over time.
             Dynamic loss scale adjustments are performed by Amp automatically.
+
+    Examples:
+
+    .. code-block:: python
+
+        from catalyst import dl
+
+        class MyRunner(dl.IRunner):
+            # ...
+            def get_engine(self):
+                return dl.APEXEngine(opt_level="O1", keep_batchnorm_fp32=False)
+            # ...
+
+    .. code-block:: yaml
+
+        args:
+            logs: ...
+
+        model:
+            _target_: ...
+            ...
+
+        engine:
+            _target_: APEXEngine
+            opt_level: O1
+            keep_batchnorm_fp32: false
+
+        stages:
+            ...
+
     """

     def __init__(
@@ -264,7 +296,49 @@ def unpack_checkpoint(


 class DataParallelApexEngine(APEXEngine):
-    """Apex multi-gpu training device engine."""
+    """Apex multi-gpu training device engine.
+
+    Args:
+        opt_level: optimization level, should be one of ``"O0"``,
+            ``"O1"``, ``"O2"`` or ``"O3"``.
+
+            - ``"O0"`` - no-op training
+            - ``"O1"`` - mixed precision (FP16) training (default)
+            - ``"O2"`` - "almost" mixed precision training
+            - ``"O3"`` - another implementation of mixed precision training
+
+            Details about levels can be found here:
+            https://nvidia.github.io/apex/amp.html#opt-levels
+
+    Examples:
+
+    .. code-block:: python
+
+        from catalyst import dl
+
+        class MyRunner(dl.IRunner):
+            # ...
+            def get_engine(self):
+                return dl.DataParallelApexEngine(opt_level="O1")
+            # ...
+
+    .. code-block:: yaml
+
+        args:
+            logs: ...
+
+        model:
+            _target_: ...
+            ...
+
+        engine:
+            _target_: DataParallelApexEngine
+            opt_level: O1
+
+        stages:
+            ...
+
+    """

     def __init__(self, opt_level: str = "O1"):
         """Init."""
@@ -303,29 +377,67 @@ class DistributedDataParallelApexEngine(DistributedDataParallelEngine):
     """Distributed Apex MultiGPU training device engine.

     Args:
-        address: process address to use (required for PyTorch backend), default is `"localhost"`.
-        port: process port to listen (required for PyTorch backend), default is `"12345"`.
-        backend: multiprocessing backend to use, default is `"nccl"`.
+        address: process address to use
+            (required for PyTorch backend), default is `"localhost"`.
+        port: process port to listen
+            (required for PyTorch backend), default is `"12345"`.
+        backend: multiprocessing backend to use,
+            default is `"nccl"`.
         world_size: number of processes.
-        opt_level: optimization level, should be one of "O0", "O1", "O2", "O3" or "O4".
+        opt_level: optimization level, should be one of ``"O0"``,
+            ``"O1"``, ``"O2"`` or ``"O3"``.

-            - "O0" - no-op training
-            - "O1" - mixed precision (FP16) training (default)
-            - "O2" - "almost" mixed precision training
-            - "O3" - another implementation of mixed precision training
+            - ``"O0"`` - no-op training
+            - ``"O1"`` - mixed precision (FP16) training (default)
+            - ``"O2"`` - "almost" mixed precision training
+            - ``"O3"`` - another implementation of mixed precision training

             Details about levels can be found here:
             https://nvidia.github.io/apex/amp.html#opt-levels

-        keep_batchnorm_fp32: To enhance precision and enable cudnn batchnorm
-            (which improves performance),
+        keep_batchnorm_fp32: To enhance precision and
+            enable cuDNN batchnorm (which improves performance),
             it’s often beneficial to keep batchnorm weights in FP32
             even if the rest of the model is FP16.
         loss_scale: If loss_scale is a float value,
-            use this value as the static (fixed) loss scale. If loss_scale is the string "dynamic",
+            use this value as the static (fixed) loss scale.
+            If loss_scale is the string "dynamic",
             adaptively adjust the loss scale over time.
             Dynamic loss scale adjustments are performed by Amp automatically.
-        delay_all_reduce: boolean flag for delayed all reduce
+        delay_all_reduce: boolean flag for delayed all reduce,
+            default is `True`.
+
+    Examples:
+
+    .. code-block:: python
+
+        from catalyst import dl
+
+        class MyRunner(dl.IRunner):
+            # ...
+            def get_engine(self):
+                return dl.DistributedDataParallelApexEngine(
+                    port=12345,
+                    opt_level="O1"
+                )
+            # ...
+
+    .. code-block:: yaml
+
+        args:
+            logs: ...
+
+        model:
+            _target_: ...
+            ...
+
+        engine:
+            _target_: DistributedDataParallelApexEngine
+            port: 12345
+            opt_level: O1
+
+        stages:
+            ...
     """

     def __init__(
diff --git a/catalyst/engines/torch.py b/catalyst/engines/torch.py
index 4ff041a601..176c9d1c34 100644
--- a/catalyst/engines/torch.py
+++ b/catalyst/engines/torch.py
@@ -22,7 +22,36 @@ class DeviceEngine(IEngine):
     """Single training device engine.

     Args:
-        device (str, optional): use device, default is `"cpu"`.
+        device: use device, default is `"cpu"`.
+
+    Examples:
+
+    .. code-block:: python
+
+        from catalyst import dl
+
+        class MyRunner(dl.IRunner):
+            # ...
+            def get_engine(self):
+                return dl.DeviceEngine("cuda:1")
+            # ...
+
+    .. code-block:: yaml
+
+        args:
+            logs: ...
+
+        model:
+            _target_: ...
+            ...
+
+        engine:
+            _target_: DeviceEngine
+            device: cuda:1
+
+        stages:
+            ...
+
     """

     def __init__(self, device: str = None):
@@ -40,7 +69,7 @@ def rank(self) -> int:

     @property
     def world_size(self) -> int:
-        """Process world size for distributed training."""
+        """Process world size for distributed training."""
         return 1

     def sync_device(
@@ -167,7 +196,36 @@ def load_checkpoint(self, path: str):


 class DataParallelEngine(DeviceEngine):
-    """MultiGPU training device engine."""
+    """MultiGPU training device engine.
+
+    Examples:
+
+    .. code-block:: python
+
+        from catalyst import dl
+
+        class MyRunner(dl.IRunner):
+            # ...
+            def get_engine(self):
+                return dl.DataParallelEngine()
+            # ...
+
+    .. code-block:: yaml
+
+        args:
+            logs: ...
+
+        model:
+            _target_: ...
+            ...
+
+        engine:
+            _target_: DataParallelEngine
+
+        stages:
+            ...
+
+    """

     def __init__(self):
         """Init"""
@@ -206,10 +264,42 @@ class DistributedDataParallelEngine(DeviceEngine):
     """Distributed MultiGPU training device engine.

     Args:
-        address: process address to use (required for PyTorch backend), default is `"localhost"`.
-        port: process port to listen (required for PyTorch backend), default is `"12345"`.
-        backend: multiprocessing backend to use, default is `"nccl"`.
+        address: process address to use
+            (required for PyTorch backend), default is `"localhost"`.
+        port: process port to listen
+            (required for PyTorch backend), default is `"12345"`.
+        backend: multiprocessing backend to use,
+            default is `"nccl"`.
         world_size: number of processes.
+
+    Examples:
+
+    .. code-block:: python
+
+        from catalyst import dl
+
+        class MyRunner(dl.IRunner):
+            # ...
+            def get_engine(self):
+                return dl.DistributedDataParallelEngine(port=12345)
+            # ...
+
+    .. code-block:: yaml
+
+        args:
+            logs: ...
+
+        model:
+            _target_: ...
+            ...
+
+        engine:
+            _target_: DistributedDataParallelEngine
+            port: 12345
+
+        stages:
+            ...
+
     """

     def __init__(
@@ -266,7 +356,13 @@ def is_worker_process(self) -> bool:
         return self._rank > 0

     def setup_process(self, rank: int = -1, world_size: int = 1):
-        """Initialize DDP variables and processes."""
+        """Initialize DDP variables and processes.
+
+        Args:
+            rank: process rank. Default is `-1`.
+            world_size: number of devices in the network to expect for training.
+                Default is `1`.
+ """ self._rank = rank self._world_size = world_size os.environ["MASTER_ADDR"] = str(self.address)