diff --git a/catalyst/core/engine.py b/catalyst/core/engine.py
index 567cf70253..a59c6ba11a 100644
--- a/catalyst/core/engine.py
+++ b/catalyst/core/engine.py
@@ -6,7 +6,7 @@


 @contextmanager
-def nullcontext(enter_result=None):
+def nullcontext(enter_result: Any = None):
     """Context handler."""
     yield enter_result

@@ -37,8 +37,7 @@ def rank(self) -> int:
     @property
     @abstractmethod
     def world_size(self) -> int:
-        """Process world size for distributed training."""
-        # only for ddp
+        """Process world size for distributed training."""
         pass

     @property
@@ -49,26 +48,28 @@ def is_ddp(self) -> bool:
     @property
     def is_master_process(self) -> bool:
         """Checks if a process is master process.
-        Should be implemented only for DDP setup in other cases should always return True.
+        Should be implemented only for distributed training (ddp).
+        For non-distributed training it should always return `True`.

         Returns:
-            `True` if current process is a master process, otherwise `False`.
+            `True` if the current process is the master process, otherwise `False`.
         """
         return True

     @property
     def is_worker_process(self) -> bool:
         """Checks if a process is worker process.
-        Should be implemented only for DDP setup in other cases should always return False.
+        Should be implemented only for distributed training (ddp).
+        For non-distributed training it should always return `False`.

         Returns:
-            `True` if current process is a worker process, otherwise `False`.
+            `True` if the current process is a worker process, otherwise `False`.
         """
         return False

     @abstractmethod
     def sync_device(self, tensor_or_module: Any) -> Any:
-        """Moves ``tensor_or_module`` to Engine's deivce.
+        """Moves ``tensor_or_module`` to Engine's device.

         Args:
             tensor_or_module: tensor to mode
@@ -89,23 +90,50 @@ def init_components(

     @abstractmethod
     def deinit_components(self):
-        """Deinits the runs components."""
-        # only for ddp
+        """Deinits the run's components.
+        In distributed mode it should destroy the process group.
+        """
         pass

     @abstractmethod
     def zero_grad(self, loss, model, optimizer) -> None:
-        """Abstraction over ``model.zero_grad()`` step."""
+        """Abstraction over ``model.zero_grad()`` step.
+        Should be overloaded if you need to pass arguments
+        to ``model.zero_grad()``, such as ``set_to_none=True``,
+        or to use a custom scheme that replaces/improves
+        the ``.zero_grad()`` method.
+
+        Args:
+            loss: tensor with loss value.
+            model: model module.
+            optimizer: model optimizer.
+        """
         pass

     @abstractmethod
     def backward_loss(self, loss, model, optimizer) -> None:
-        """Abstraction over ``loss.backward()`` step."""
+        """Abstraction over ``loss.backward()`` step.
+        Should be overloaded when loss scaling is required,
+        for example with APEX or AMP.
+
+        Args:
+            loss: tensor with loss value.
+            model: model module.
+            optimizer: model optimizer.
+        """
         pass

     @abstractmethod
     def optimizer_step(self, loss, model, optimizer) -> None:
-        """Abstraction over ``optimizer.step()`` step."""
+        """Abstraction over ``optimizer.step()`` step.
+        Should be overloaded when gradient scaling is required,
+        for example with AMP.
+
+        Args:
+            loss: tensor with loss value.
+            model: model module.
+            optimizer: model optimizer.
+        """
         pass

     @abstractmethod
@@ -174,7 +202,8 @@ def load_checkpoint(self, path: str) -> Dict:
         pass

     def autocast(self, *args, **kwargs):
-        """AMP scaling context. Default autocast context does not scale anything.
+        """AMP scaling context.
+        Default autocast context does not scale anything.

         Args:
             *args: some args
diff --git a/catalyst/engines/amp.py b/catalyst/engines/amp.py
index 5db1b91a9c..1d0f850e3b 100644
--- a/catalyst/engines/amp.py
+++ b/catalyst/engines/amp.py
@@ -10,6 +10,35 @@ class AMPEngine(DeviceEngine):

     Args:
         device: used device, default is `"cuda"`.
+
+    Examples:
+
+    .. code-block:: python
+
+        from catalyst import dl
+
+        class MyRunner(dl.IRunner):
+            # ...
+            def get_engine(self):
+                return dl.AMPEngine("cuda:1")
+            # ...
+
+    .. code-block:: yaml
+
+        args:
+            logs: ...
+
+        model:
+            _target_: ...
+            ...
+
+        engine:
+            _target_: AMPEngine
+            device: cuda:1
+
+        stages:
+            ...
+
     """

     def __init__(self, device: str = "cuda"):
@@ -36,7 +65,36 @@ def autocast(self):


 class DataParallelAMPEngine(AMPEngine):
-    """AMP multi-gpu training device engine."""
+    """AMP multi-gpu training device engine.
+
+    Examples:
+
+    .. code-block:: python
+
+        from catalyst import dl
+
+        class MyRunner(dl.IRunner):
+            # ...
+            def get_engine(self):
+                return dl.DataParallelAMPEngine()
+            # ...
+
+    .. code-block:: yaml
+
+        args:
+            logs: ...
+
+        model:
+            _target_: ...
+            ...
+
+        engine:
+            _target_: DataParallelAMPEngine
+
+        stages:
+            ...
+
+    """

     def __init__(self):
         """Init."""
@@ -75,10 +133,42 @@ class DistributedDataParallelAMPEngine(DistributedDataParallelEngine):
     """Distributed AMP multi-gpu training device engine.

     Args:
-        address: process address to use (required for PyTorch backend), default is `"localhost"`.
-        port: process port to listen (required for PyTorch backend), default is `"12345"`.
-        backend: multiprocessing backend to use, default is `"nccl"`.
+        address: process address to use
+            (required for PyTorch backend), default is `"localhost"`.
+        port: process port to listen
+            (required for PyTorch backend), default is `"12345"`.
+        backend: multiprocessing backend to use,
+            default is `"nccl"`.
         world_size: number of processes.
+
+    Examples:
+
+    .. code-block:: python
+
+        from catalyst import dl
+
+        class MyRunner(dl.IRunner):
+            # ...
+            def get_engine(self):
+                return dl.DistributedDataParallelAMPEngine(port=12345)
+            # ...
+
+    .. code-block:: yaml
+
+        args:
+            logs: ...
+
+        model:
+            _target_: ...
+            ...
+
+        engine:
+            _target_: DistributedDataParallelAMPEngine
+            port: 12345
+
+        stages:
+            ...
+
     """

     def __init__(
diff --git a/catalyst/engines/apex.py b/catalyst/engines/apex.py
index 56aade0c7f..c46609afad 100644
--- a/catalyst/engines/apex.py
+++ b/catalyst/engines/apex.py
@@ -126,23 +126,55 @@ class APEXEngine(DeviceEngine):

     Args:
         device: use device, default is `"cuda"`.
-        opt_level: optimization level, should be one of "O0", "O1", "O2", "O3" or "O4".
+        opt_level: optimization level, should be one of ``"O0"``,
+            ``"O1"``, ``"O2"`` or ``"O3"``.

-            - "O0" - no-op training
-            - "O1" - mixed precision (FP16) training (default)
-            - "O2" - "almost" mixed precision training
-            - "O3" - another implementation of mixed precision training
+            - ``"O0"`` - no-op training
+            - ``"O1"`` - mixed precision (FP16) training (default)
+            - ``"O2"`` - "almost" mixed precision training
+            - ``"O3"`` - another implementation of mixed precision training

             Details about levels can be found here:
             https://nvidia.github.io/apex/amp.html#opt-levels

-        keep_batchnorm_fp32: To enhance precision and enable cudnn batchnorm
+        keep_batchnorm_fp32: To enhance precision and enable cuDNN batchnorm
             (which improves performance),
             it’s often beneficial to keep batchnorm weights in FP32
             even if the rest of the model is FP16.
         loss_scale: If loss_scale is a float value,
-            use this value as the static (fixed) loss scale. If loss_scale is the string "dynamic",
+            use this value as the static (fixed) loss scale.
+            If loss_scale is the string "dynamic",
             adaptively adjust the loss scale over time.
             Dynamic loss scale adjustments are performed by Amp automatically.
+
+    Examples:
+
+    .. code-block:: python
+
+        from catalyst import dl
+
+        class MyRunner(dl.IRunner):
+            # ...
+            def get_engine(self):
+                return dl.APEXEngine(opt_level="O1", keep_batchnorm_fp32=False)
+            # ...
+
+    .. code-block:: yaml
+
+        args:
+            logs: ...
+
+        model:
+            _target_: ...
+            ...
+
+        engine:
+            _target_: APEXEngine
+            opt_level: O1
+            keep_batchnorm_fp32: false
+
+        stages:
+            ...
+
     """

     def __init__(
@@ -264,7 +296,49 @@ def unpack_checkpoint(


 class DataParallelApexEngine(APEXEngine):
-    """Apex multi-gpu training device engine."""
+    """Apex multi-gpu training device engine.
+
+    Args:
+        opt_level: optimization level, should be one of ``"O0"``,
+            ``"O1"``, ``"O2"`` or ``"O3"``.
+
+            - ``"O0"`` - no-op training
+            - ``"O1"`` - mixed precision (FP16) training (default)
+            - ``"O2"`` - "almost" mixed precision training
+            - ``"O3"`` - another implementation of mixed precision training
+
+            Details about levels can be found here:
+            https://nvidia.github.io/apex/amp.html#opt-levels
+
+    Examples:
+
+    .. code-block:: python
+
+        from catalyst import dl
+
+        class MyRunner(dl.IRunner):
+            # ...
+            def get_engine(self):
+                return dl.DataParallelApexEngine(opt_level="O1")
+            # ...
+
+    .. code-block:: yaml
+
+        args:
+            logs: ...
+
+        model:
+            _target_: ...
+            ...
+
+        engine:
+            _target_: DataParallelApexEngine
+            opt_level: O1
+
+        stages:
+            ...
+
+    """

     def __init__(self, opt_level: str = "O1"):
         """Init."""
@@ -303,29 +377,67 @@ class DistributedDataParallelApexEngine(DistributedDataParallelEngine):
     """Distributed Apex MultiGPU training device engine.

     Args:
-        address: process address to use (required for PyTorch backend), default is `"localhost"`.
-        port: process port to listen (required for PyTorch backend), default is `"12345"`.
-        backend: multiprocessing backend to use, default is `"nccl"`.
+        address: process address to use
+            (required for PyTorch backend), default is `"localhost"`.
+        port: process port to listen
+            (required for PyTorch backend), default is `"12345"`.
+        backend: multiprocessing backend to use,
+            default is `"nccl"`.
         world_size: number of processes.
-        opt_level: optimization level, should be one of "O0", "O1", "O2", "O3" or "O4".
+        opt_level: optimization level, should be one of ``"O0"``,
+            ``"O1"``, ``"O2"`` or ``"O3"``.

-            - "O0" - no-op training
-            - "O1" - mixed precision (FP16) training (default)
-            - "O2" - "almost" mixed precision training
-            - "O3" - another implementation of mixed precision training
+            - ``"O0"`` - no-op training
+            - ``"O1"`` - mixed precision (FP16) training (default)
+            - ``"O2"`` - "almost" mixed precision training
+            - ``"O3"`` - another implementation of mixed precision training

             Details about levels can be found here:
             https://nvidia.github.io/apex/amp.html#opt-levels

-        keep_batchnorm_fp32: To enhance precision and enable cudnn batchnorm
-            (which improves performance),
+        keep_batchnorm_fp32: To enhance precision and
+            enable cuDNN batchnorm (which improves performance),
             it’s often beneficial to keep batchnorm weights in FP32
             even if the rest of the model is FP16.
         loss_scale: If loss_scale is a float value,
-            use this value as the static (fixed) loss scale. If loss_scale is the string "dynamic",
+            use this value as the static (fixed) loss scale.
+            If loss_scale is the string "dynamic",
             adaptively adjust the loss scale over time.
             Dynamic loss scale adjustments are performed by Amp automatically.
-        delay_all_reduce: boolean flag for delayed all reduce
+        delay_all_reduce: boolean flag for delayed all reduce,
+            default is `True`.
+
+    Examples:
+
+    .. code-block:: python
+
+        from catalyst import dl
+
+        class MyRunner(dl.IRunner):
+            # ...
+            def get_engine(self):
+                return dl.DistributedDataParallelApexEngine(
+                    port=12345,
+                    opt_level="O1"
+                )
+            # ...
+
+    .. code-block:: yaml
+
+        args:
+            logs: ...
+
+        model:
+            _target_: ...
+            ...
+
+        engine:
+            _target_: DistributedDataParallelApexEngine
+            port: 12345
+            opt_level: O1
+
+        stages:
+            ...
     """

     def __init__(
diff --git a/catalyst/engines/torch.py b/catalyst/engines/torch.py
index 4ff041a601..176c9d1c34 100644
--- a/catalyst/engines/torch.py
+++ b/catalyst/engines/torch.py
@@ -22,7 +22,36 @@ class DeviceEngine(IEngine):
     """Single training device engine.

     Args:
-        device (str, optional): use device, default is `"cpu"`.
+        device: use device, default is `"cpu"`.
+
+    Examples:
+
+    .. code-block:: python
+
+        from catalyst import dl
+
+        class MyRunner(dl.IRunner):
+            # ...
+            def get_engine(self):
+                return dl.DeviceEngine("cuda:1")
+            # ...
+
+    .. code-block:: yaml
+
+        args:
+            logs: ...
+
+        model:
+            _target_: ...
+            ...
+
+        engine:
+            _target_: DeviceEngine
+            device: cuda:1
+
+        stages:
+            ...
+
     """

     def __init__(self, device: str = None):
@@ -40,7 +69,7 @@ def rank(self) -> int:

     @property
     def world_size(self) -> int:
-        """Process world size for distributed training."""
+        """Process world size for distributed training."""
         return 1

     def sync_device(
@@ -167,7 +196,36 @@ def load_checkpoint(self, path: str):


 class DataParallelEngine(DeviceEngine):
-    """MultiGPU training device engine."""
+    """MultiGPU training device engine.
+
+    Examples:
+
+    .. code-block:: python
+
+        from catalyst import dl
+
+        class MyRunner(dl.IRunner):
+            # ...
+            def get_engine(self):
+                return dl.DataParallelEngine()
+            # ...
+
+    .. code-block:: yaml
+
+        args:
+            logs: ...
+
+        model:
+            _target_: ...
+            ...
+
+        engine:
+            _target_: DataParallelEngine
+
+        stages:
+            ...
+
+    """

     def __init__(self):
         """Init"""
@@ -206,10 +264,42 @@ class DistributedDataParallelEngine(DeviceEngine):
     """Distributed MultiGPU training device engine.

     Args:
-        address: process address to use (required for PyTorch backend), default is `"localhost"`.
-        port: process port to listen (required for PyTorch backend), default is `"12345"`.
-        backend: multiprocessing backend to use, default is `"nccl"`.
+        address: process address to use
+            (required for PyTorch backend), default is `"localhost"`.
+        port: process port to listen
+            (required for PyTorch backend), default is `"12345"`.
+        backend: multiprocessing backend to use,
+            default is `"nccl"`.
         world_size: number of processes.
+
+    Examples:
+
+    .. code-block:: python
+
+        from catalyst import dl
+
+        class MyRunner(dl.IRunner):
+            # ...
+            def get_engine(self):
+                return dl.DistributedDataParallelEngine(port=12345)
+            # ...
+
+    .. code-block:: yaml
+
+        args:
+            logs: ...
+
+        model:
+            _target_: ...
+            ...
+
+        engine:
+            _target_: DistributedDataParallelEngine
+            port: 12345
+
+        stages:
+            ...
+
     """

     def __init__(
@@ -266,7 +356,13 @@ def is_worker_process(self) -> bool:
         return self._rank > 0

     def setup_process(self, rank: int = -1, world_size: int = 1):
-        """Initialize DDP variables and processes."""
+        """Initialize DDP variables and processes.
+
+        Args:
+            rank: process rank. Default is `-1`.
+            world_size: number of devices in the network to expect for training.
+                Default is `1`.
+ """ self._rank = rank self._world_size = world_size os.environ["MASTER_ADDR"] = str(self.address)