Profiler callback #1226

Merged: 25 commits, Jun 27, 2021
Changes from 23 commits
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -14,6 +14,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- `utils.ddp_sync_run` function for synchronous ddp run
- CIFAR10 and CIFAR100 datasets from torchvision (no cv-based requirements)
- [Catalyst Engines demo](https://github.com/catalyst-team/catalyst/tree/master/examples/engines)
- Profiler callback ([#1226](https://github.com/catalyst-team/catalyst/pull/1226))

### Changed

7 changes: 7 additions & 0 deletions catalyst/callbacks/__init__.py
@@ -1,5 +1,9 @@
# flake8: noqa

from distutils.version import LooseVersion

import torch

from catalyst.settings import SETTINGS

from catalyst.core.callback import (
@@ -48,6 +52,9 @@
if SETTINGS.quantization_required:
    from catalyst.callbacks.quantization import QuantizationCallback

if LooseVersion(torch.__version__) >= LooseVersion("1.8.1"):
    from catalyst.callbacks.profiler import ProfilerCallback

from catalyst.callbacks.scheduler import (
    ISchedulerCallback,
    SchedulerCallback,
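
Since the import above is version-gated, ``ProfilerCallback`` is only available when ``torch>=1.8.1``; user code that must keep working on older torch versions may want to mirror the same guard. A minimal sketch:

.. code-block:: python

    from distutils.version import LooseVersion

    import torch

    callbacks = []
    if LooseVersion(torch.__version__) >= LooseVersion("1.8.1"):
        from catalyst.callbacks import ProfilerCallback

        callbacks.append(ProfilerCallback(loader_key="train", epoch=1))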
224 changes: 224 additions & 0 deletions catalyst/callbacks/profiler.py
@@ -0,0 +1,224 @@
from typing import Any, Dict, Sequence
import os
from tempfile import TemporaryDirectory

import torch

from catalyst.core.callback import Callback, CallbackNode, CallbackOrder
from catalyst.core.runner import IRunner


class ProfilerCallback(Callback):
"""Profile specified epoch or some fixed number of batches.

Args:
loader_key: name of the loader to use for profiling.
If ``None`` then will be used first loader from experiment.
epoch: epoch number to use for profiling.
num_batches: number of batches to use in epoch to do a profiling.
If ``None`` then will be used all batches in loader.
profiler_kwargs: arguments to pass to a profiler.
To get more info about possible arguments please use PyTorch
`profiler docs`_.
tensorboard_path: path where should be stored logs for tensorboard.
If ``None`` then will be ignored.
export_chrome_trace_path: path to export chrome trace.
If ``None`` then will be ignored exporting chrome trace to a file.
export_stacks_kwargs: arguments to pass to a ``profiler.export_stacks`` method.
If ``None`` then triggering ``profiler.export_stacks`` will be avoided.

Example of using **FlameGraph** tool:

.. code-block:: bash

git clone https://github.com/brendangregg/FlameGraph
cd FlameGraph
./flamegraph.pl –title “CPU time” –countname “us.” profiler.stacks > perf_viz.svg
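
    The ``profiler.stacks`` file consumed above can be produced through
    ``export_stacks_kwargs``; a minimal sketch (the path and metric values are
    illustrative):

    .. code-block:: python

        profiler_callback = ProfilerCallback(
            loader_key="train",
            epoch=3,
            profiler_kwargs=dict(with_stack=True),  # stacks are recorded only with with_stack=True
            export_stacks_kwargs=dict(path="profiler.stacks", metric="self_cpu_time_total"),
        )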

    .. note::
        Export to tensorboard and export to a chrome trace are mutually exclusive;
        specifying both of them will raise an error.

    Example:

    .. code-block:: python

        import os

        import torch
        from torch import nn
        from torch.utils.data import DataLoader

        from catalyst import dl
        from catalyst.data import ToTensor
        from catalyst.contrib.datasets import MNIST
        from catalyst.contrib.nn.modules import Flatten

        loaders = {
            "train": DataLoader(
                MNIST(os.getcwd(), train=False, download=True, transform=ToTensor()),
                batch_size=32,
            ),
            "valid": DataLoader(
                MNIST(os.getcwd(), train=False, download=True, transform=ToTensor()),
                batch_size=32,
            ),
        }

        model = nn.Sequential(Flatten(), nn.Linear(784, 512), nn.ReLU(), nn.Linear(512, 10))
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)
        runner = dl.SupervisedRunner()
        runner.train(
            model=model,
            callbacks=[
                dl.ProfilerCallback(
                    loader_key="train",
                    epoch=3,
                    profiler_kwargs=dict(
                        activities=[
                            torch.profiler.ProfilerActivity.CPU,
                            torch.profiler.ProfilerActivity.CUDA,
                        ],
                        on_trace_ready=torch.profiler.tensorboard_trace_handler(
                            "./logs/tb_profile"
                        ),
                        with_stack=True,
                        with_flops=True,
                    ),
                )
            ],
            loaders=loaders,
            criterion=criterion,
            optimizer=optimizer,
            num_epochs=5,
            logdir="./logs",
        )

    .. _profiler docs: https://pytorch.org/docs/stable/profiler.html

"""

    def __init__(
        self,
        loader_key: str = None,
        epoch: int = 1,
        num_batches: int = None,
        profiler_kwargs: Dict[str, Any] = None,
        tensorboard_path: str = None,
        export_chrome_trace_path: str = None,
        export_stacks_kwargs: Dict[str, Any] = None,
    ):
        super().__init__(order=CallbackOrder.Internal, node=CallbackNode.Master)

        self.loader_key = loader_key
        self.epoch = epoch
        self.num_batches = num_batches
        self.batch_cnt = 0

        self.profiler_kwargs = {} if profiler_kwargs is None else profiler_kwargs
        # both exports at once are not supported (see the class docstring note)
        if tensorboard_path is not None and export_chrome_trace_path is not None:
            raise ValueError(
                "'tensorboard_path' and 'export_chrome_trace_path' are mutually exclusive, "
                "please specify only one of them!"
            )
        # NOTE: check against self.profiler_kwargs - the argument itself may be None
        if tensorboard_path is not None and "on_trace_ready" not in self.profiler_kwargs:
            self.profiler_kwargs["on_trace_ready"] = torch.profiler.tensorboard_trace_handler(
                tensorboard_path
            )
        self.export_chrome_trace_path = export_chrome_trace_path
        self.export_stacks_kwargs = export_stacks_kwargs
        self.profiler = None
        self.stats = None

    def on_experiment_start(self, runner: IRunner) -> None:
"""
On batch end action

Args:
runner: current runner
"""
if self.loader_key is None:
self.loader_key = runner.loader_key # use first loader for profile

    def _should_use_profiler(self, loader_key: str, epoch: int):
        if self.loader_key == loader_key and self.epoch == epoch:
            if self.num_batches is not None:
                return self.batch_cnt < self.num_batches
            return True
        return False

    def _enter_profiler(self, runner: IRunner) -> None:
        loader_key = runner.loader_key
        epoch = runner.stage_epoch_step

        if not self._should_use_profiler(loader_key, epoch):
            return

        # lazily create the profiler and enter its context once,
        # it is kept alive across the profiled batches
        if self.profiler is None:
            self.profiler = torch.profiler.profile(**self.profiler_kwargs)
            self.profiler.__enter__()

    def _exit_profiler(self, runner: IRunner) -> None:
        loader_key = runner.loader_key
        epoch = runner.stage_epoch_step

        if not self._should_use_profiler(loader_key, epoch) or self.profiler is None:
            return

        # export the results only once - on the first exit after profiling is done
        if self.stats is None:
            self.profiler.__exit__(None, None, None)

            if "on_trace_ready" not in self.profiler_kwargs and self.export_chrome_trace_path:
                self.profiler.export_chrome_trace(self.export_chrome_trace_path)

            if self.export_stacks_kwargs is not None:
                self.profiler.export_stacks(**self.export_stacks_kwargs)

            self.stats = self.profiler.key_averages()
            table_txt = self.stats.table(sort_by="cpu_time_total")

            with TemporaryDirectory() as tmp_dir:
                artifact_path = os.path.join(tmp_dir, "profiler_table.txt")
                with open(artifact_path, "w") as f:
                    f.write(table_txt)
                runner.log_artifact(
                    tag="profiler", artifact="profiler.txt", path_to_artifact=artifact_path,
                )

            print(table_txt)

    def on_loader_start(self, runner: IRunner) -> None:
        """
        On loader start action

        Args:
            runner: current runner
        """
        self._enter_profiler(runner)

    def on_loader_end(self, runner: IRunner) -> None:
        """
        On loader end action

        Args:
            runner: current runner
        """
        self._exit_profiler(runner)

    def on_batch_start(self, runner: IRunner) -> None:
"""
On batch start action

Args:
runner: current runner
"""
self._enter_profiler(runner)

    def on_batch_end(self, runner: IRunner) -> None:
"""
On batch end action

Args:
runner: current runner
"""
if self.profiler is None:
return

if self.num_batches is not None and self.batch_cnt < self.num_batches:
# do a profiling step after each batch
self.profiler.step()
self.batch_cnt += 1

self._exit_profiler(runner)
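
As ``on_batch_end`` above shows, profiling can be limited to the first few batches of an epoch rather than a whole loader pass; a minimal sketch (the batch count, loader key, and trace path are illustrative):

.. code-block:: python

    profiler_callback = ProfilerCallback(
        loader_key="train",
        epoch=1,
        num_batches=10,  # profile only the first 10 batches of epoch 1
        export_chrome_trace_path="./logs/trace.json",
    )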
7 changes: 7 additions & 0 deletions docs/api/callbacks.rst
@@ -134,6 +134,13 @@ PruningCallback
    :exclude-members: __init__, on_experiment_start, on_stage_start, on_epoch_start, on_loader_start, on_batch_start, on_batch_end, on_loader_end, on_epoch_end, on_stage_end, on_experiment_end
    :show-inheritance:

ProfilerCallback
~~~~~~~~~~~~~~~~
.. autoclass:: catalyst.callbacks.profiler.ProfilerCallback
    :members:
    :exclude-members: __init__, on_experiment_start, on_stage_start, on_epoch_start, on_loader_start, on_batch_start, on_batch_end, on_loader_end, on_epoch_end, on_stage_end, on_experiment_end
    :show-inheritance:

QuantizationCallback
~~~~~~~~~~~~~~~~~~~~
.. autoclass:: catalyst.callbacks.quantization.QuantizationCallback