add VERBOSE logging level
edward-io committed Dec 9, 2021
1 parent 46f718d commit f0ffc81
Showing 4 changed files with 31 additions and 0 deletions.
12 changes: 12 additions & 0 deletions pytorch_lightning/__init__.py
@@ -4,6 +4,18 @@

from pytorch_lightning.__about__ import * # noqa: F401, F403

VERBOSE = 15 # between logging.INFO and logging.DEBUG, used for logging in production use cases


def verbose(self, message, *args, **kws):
if self.isEnabledFor(VERBOSE):
self._log(VERBOSE, message, args, **kws)


logging.addLevelName(VERBOSE, "VERBOSE")
logging.verbose = verbose
logging.Logger.verbose = verbose

_root_logger = logging.getLogger()
_logger = logging.getLogger(__name__)
_logger.setLevel(logging.INFO)
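
The hunk above registers a VERBOSE level (15) between DEBUG (10) and INFO (20) and patches a verbose() method onto logging.Logger. A minimal usage sketch, not part of this commit (the logger names and message below are illustrative):

import logging

import pytorch_lightning as pl  # importing the package registers VERBOSE and patches Logger.verbose

# Illustrative only: lower the levels so VERBOSE records are not filtered out.
logging.basicConfig(level=pl.VERBOSE)                        # root logger / your own modules
logging.getLogger("pytorch_lightning").setLevel(pl.VERBOSE)  # Lightning's logger defaults to INFO

log = logging.getLogger(__name__)
log.verbose("more detail than INFO, less noise than DEBUG")
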
6 changes: 6 additions & 0 deletions pytorch_lightning/plugins/training_type/ddp.py
@@ -100,6 +100,7 @@ def __init__(
checkpoint_io=checkpoint_io,
precision_plugin=precision_plugin,
)
log.verbose(f"Initializing DDP: {self.__class__.__name__}")
self.interactive_ddp_procs = []
self._num_nodes = 1
self.sync_batchnorm = False
@@ -222,6 +223,7 @@ def _call_children_scripts(self):
self._rank_0_has_called_call_children_scripts = True

def setup_distributed(self):
log.verbose(f"{self.__class__.__name__}: setting up distributed...")
reset_seed()

# determine which process we are and world size
@@ -329,6 +331,7 @@ def _reinit_optimizers_with_post_localSGD(self, warmup_steps: int):
trainer.convert_to_lightning_optimizers()

def configure_ddp(self) -> None:
log.verbose(f"{self.__class__.__name__}: configuring DDP...")
self.pre_configure_ddp()
self._model = self._setup_model(LightningDistributedModule(self.model))
self._register_ddp_hooks()
@@ -377,6 +380,7 @@ def pre_backward(self, closure_loss: torch.Tensor) -> None:
prepare_for_backward(self.model, closure_loss)

def model_to_device(self):
log.verbose(f"{self.__class__.__name__}: moving model to device [{self.root_device}]...")
self.model.to(self.root_device)

def reduce(self, tensor, group: Optional[Any] = None, reduce_op: Union[ReduceOp, str] = "mean") -> torch.Tensor:
@@ -497,12 +501,14 @@ def reconciliate_processes(self, trace: str) -> None:
raise DeadlockDetectedException(f"DeadLock detected from rank: {self.global_rank} \n {trace}")

def teardown(self) -> None:
log.verbose(f"{self.__class__.__name__}: tearing down plugin...")
super().teardown()
if isinstance(self.model, DistributedDataParallel):
self.model = self.lightning_module

if self.on_gpu:
# GPU teardown
log.verbose(f"{self.__class__.__name__}: moving model to CPU...")
self.lightning_module.cpu()
# clean up memory
torch.cuda.empty_cache()
8 changes: 8 additions & 0 deletions pytorch_lightning/plugins/training_type/fully_sharded.py
@@ -13,6 +13,7 @@
# limitations under the License.
import contextlib
from typing import Dict, Generator, List, Optional
import logging

import torch

@@ -30,6 +31,8 @@
from fairscale.nn import default_auto_wrap_policy, enable_wrap
from fairscale.nn.data_parallel import FullyShardedDataParallel

log = logging.getLogger(__name__)


class DDPFullyShardedPlugin(DDPPlugin):

@@ -129,6 +132,7 @@ def setup_distributed(self) -> None:

@contextlib.contextmanager
def model_sharded_context(self) -> Generator:
log.verbose(f"{self.__class__.__name__}: entered model_sharded_context.")
precision = self.precision_plugin.precision

def wrap_policy(*args, **kwargs):
@@ -150,7 +154,10 @@ def wrap_policy(*args, **kwargs):
):
yield

log.verbose(f"{self.__class__.__name__}: exiting model_sharded_context.")

def configure_ddp(self) -> None:
log.verbose(f"{self.__class__.__name__}: configuring DDP... (cpu_offload: [{self.cpu_offload}])")
if not self.cpu_offload:
# When using CPU Offload, FSDP will manage the CUDA movement for us.
# Note: this would be problematic for large model (which could not fit in one GPU)
@@ -170,6 +177,7 @@ def pre_dispatch(self, trainer: "pl.Trainer") -> None:
self.setup_optimizers(trainer)

def model_to_device(self) -> None:
log.verbose(f"{self.__class__.__name__}: moving model to device [{self.root_device}]...")
# ensure we update the device type in the lightning module
self.lightning_module.to(self.root_device)

5 changes: 5 additions & 0 deletions pytorch_lightning/trainer/connectors/checkpoint_connector.py
@@ -14,6 +14,7 @@

import os
import re
import logging
from typing import Any, Dict, Optional

import torch
@@ -35,6 +36,9 @@
from omegaconf import Container


log: logging.Logger = logging.getLogger(__name__)


class CheckpointConnector:
def __init__(self, trainer: "pl.Trainer", resume_from_checkpoint: Optional[_PATH] = None) -> None:
self.trainer = trainer
@@ -70,6 +74,7 @@ def resume_start(self, checkpoint_path: Optional[_PATH] = None) -> None:
self.resume_checkpoint_path = self.hpc_resume_path or checkpoint_path
checkpoint_path = self.resume_checkpoint_path
if not checkpoint_path:
log.info("`checkpoint_path` not specified. Skipping checkpoint loading.")
return

rank_zero_info(f"Restoring states from the checkpoint path at {checkpoint_path}")
