add VERBOSE logging level
edward-io committed Dec 9, 2021
1 parent 46f718d commit f0ffc81
Showing 4 changed files with 31 additions and 0 deletions.
12 changes: 12 additions & 0 deletions pytorch_lightning/__init__.py
@@ -4,6 +4,18 @@

from pytorch_lightning.__about__ import * # noqa: F401, F403

VERBOSE = 15 # between logging.INFO and logging.DEBUG, used for logging in production use cases


def verbose(self, message, *args, **kws):
if self.isEnabledFor(VERBOSE):
self._log(VERBOSE, message, args, **kws)


logging.addLevelName(VERBOSE, "VERBOSE")
logging.verbose = verbose
logging.Logger.verbose = verbose

_root_logger = logging.getLogger()
_logger = logging.getLogger(__name__)
_logger.setLevel(logging.INFO)
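
The hunk above registers a VERBOSE level (15) between DEBUG (10) and INFO (20) and patches a verbose() method onto logging.Logger. A minimal usage sketch, not part of this commit (the logger names and message below are illustrative):

import logging

import pytorch_lightning as pl  # importing the package registers VERBOSE and patches Logger.verbose

# Illustrative only: lower the levels so VERBOSE records are not filtered out.
logging.basicConfig(level=pl.VERBOSE)                        # root logger / your own modules
logging.getLogger("pytorch_lightning").setLevel(pl.VERBOSE)  # Lightning's logger defaults to INFO

log = logging.getLogger(__name__)
log.verbose("more detail than INFO, less noise than DEBUG")
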
6 changes: 6 additions & 0 deletions pytorch_lightning/plugins/training_type/ddp.py
@@ -100,6 +100,7 @@ def __init__(
checkpoint_io=checkpoint_io,
precision_plugin=precision_plugin,
)
log.verbose(f"Initializing DDP: {self.__class__.__name__}")
self.interactive_ddp_procs = []
self._num_nodes = 1
self.sync_batchnorm = False
@@ -222,6 +223,7 @@ def _call_children_scripts(self):
self._rank_0_has_called_call_children_scripts = True

def setup_distributed(self):
log.verbose(f"{self.__class__.__name__}: setting up distributed...")
reset_seed()

# determine which process we are and world size
@@ -329,6 +331,7 @@ def _reinit_optimizers_with_post_localSGD(self, warmup_steps: int):
trainer.convert_to_lightning_optimizers()

def configure_ddp(self) -> None:
log.verbose(f"{self.__class__.__name__}: configuring DDP...")
self.pre_configure_ddp()
self._model = self._setup_model(LightningDistributedModule(self.model))
self._register_ddp_hooks()
@@ -377,6 +380,7 @@ def pre_backward(self, closure_loss: torch.Tensor) -> None:
prepare_for_backward(self.model, closure_loss)

def model_to_device(self):
log.verbose(f"{self.__class__.__name__}: moving model to device [{self.root_device}]...")
self.model.to(self.root_device)

def reduce(self, tensor, group: Optional[Any] = None, reduce_op: Union[ReduceOp, str] = "mean") -> torch.Tensor:
@@ -497,12 +501,14 @@ def reconciliate_processes(self, trace: str) -> None:
raise DeadlockDetectedException(f"DeadLock detected from rank: {self.global_rank} \n {trace}")

def teardown(self) -> None:
log.verbose(f"{self.__class__.__name__}: tearing down plugin...")
super().teardown()
if isinstance(self.model, DistributedDataParallel):
self.model = self.lightning_module

if self.on_gpu:
# GPU teardown
log.verbose(f"{self.__class__.__name__}: moving model to CPU...")
self.lightning_module.cpu()
# clean up memory
torch.cuda.empty_cache()
8 changes: 8 additions & 0 deletions pytorch_lightning/plugins/training_type/fully_sharded.py
@@ -13,6 +13,7 @@
# limitations under the License.
import contextlib
from typing import Dict, Generator, List, Optional
import logging

import torch

@@ -30,6 +31,8 @@
from fairscale.nn import default_auto_wrap_policy, enable_wrap
from fairscale.nn.data_parallel import FullyShardedDataParallel

log = logging.getLogger(__name__)


class DDPFullyShardedPlugin(DDPPlugin):

@@ -129,6 +132,7 @@ def setup_distributed(self) -> None:

@contextlib.contextmanager
def model_sharded_context(self) -> Generator:
log.verbose(f"{self.__class__.__name__}: entered model_sharded_context.")
precision = self.precision_plugin.precision

def wrap_policy(*args, **kwargs):
@@ -150,7 +154,10 @@ def wrap_policy(*args, **kwargs):
):
yield

log.verbose(f"{self.__class__.__name__}: exiting model_sharded_context.")

def configure_ddp(self) -> None:
log.verbose(f"{self.__class__.__name__}: configuring DDP... (cpu_offload: [{self.cpu_offload}])")
if not self.cpu_offload:
# When using CPU Offload, FSDP will manage the CUDA movement for us.
# Note: this would be problematic for large model (which could not fit in one GPU)
@@ -170,6 +177,7 @@ def pre_dispatch(self, trainer: "pl.Trainer") -> None:
self.setup_optimizers(trainer)

def model_to_device(self) -> None:
log.verbose(f"{self.__class__.__name__}: moving model to device [{self.root_device}]...")
# ensure we update the device type in the lightning module
self.lightning_module.to(self.root_device)

5 changes: 5 additions & 0 deletions pytorch_lightning/trainer/connectors/checkpoint_connector.py
@@ -14,6 +14,7 @@

import os
import re
import logging
from typing import Any, Dict, Optional

import torch
@@ -35,6 +36,9 @@
from omegaconf import Container


log: logging.Logger = logging.getLogger(__name__)


class CheckpointConnector:
def __init__(self, trainer: "pl.Trainer", resume_from_checkpoint: Optional[_PATH] = None) -> None:
self.trainer = trainer
@@ -70,6 +74,7 @@ def resume_start(self, checkpoint_path: Optional[_PATH] = None) -> None:
self.resume_checkpoint_path = self.hpc_resume_path or checkpoint_path
checkpoint_path = self.resume_checkpoint_path
if not checkpoint_path:
log.info("`checkpoint_path` not specified. Skipping checkpoint loading.")
return

rank_zero_info(f"Restoring states from the checkpoint path at {checkpoint_path}")
