How to train in a multi-node environment? #18423
Unanswered
rahaazad2 asked this question in DDP / multi-GPU / multi-node
I want to train a pytorch-lightning model on a cluster of 6 nodes (1 GPU per node). Here's the training code (abridged):

```python
import argparse
import json
import os
import pytorch_lightning as pl
import src.data_loaders as module_data
import torch
from pytorch_lightning.callbacks import ModelCheckpoint
from src.utils import get_model_and_tokenizer
from torch.nn import functional as F
from torch.utils.data import DataLoader


class MyClassifier(pl.LightningModule):
    def __init__(self, config):
        super().__init__()
        self.save_hyperparameters()
        self.num_classes = config["arch"]["args"]["num_classes"]
        self.model_args = config["arch"]["args"]
        self.model, self.tokenizer = get_model_and_tokenizer(**self.model_args)
        self.bias_loss = False


def cli_main():
    pl.seed_everything(1234)
    # ... data loading, Trainer construction, and trainer.fit() are omitted here


if __name__ == "__main__":
    cli_main()
```

It works fine on a single node with 4 GPUs, but in the multi-node setting there seems to be no difference from single-node training. Specifically, the logs of the nodes show that RANK is set correctly (RANK 0 for the master and RANK 1 to 5 for the workers). However, there are two issues: …

I run the code using this command:

```
python train.py --config PATH
```
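For context, a minimal sketch of how a multi-node run is usually configured in pytorch-lightning is shown below. The `num_nodes`, `devices`, and `strategy` values are assumptions matching the 6-node, 1-GPU-per-node cluster described above; they are not taken from the original post, which omits the `Trainer` setup.

```python
import pytorch_lightning as pl


def cli_main():
    pl.seed_everything(1234)
    # model = MyClassifier(config)  # hypothetical: built from the parsed config

    trainer = pl.Trainer(
        accelerator="gpu",
        devices=1,      # GPUs per node (assumed: 1 GPU on each of the 6 nodes)
        num_nodes=6,    # total number of nodes; defaults to 1 if not set,
                        # in which case every node trains independently
        strategy="ddp",
    )
    # trainer.fit(model, train_dataloader, val_dataloader)
```

If `num_nodes` is left at its default of 1, each node simply runs its own single-node job, which would match the "no difference from single-node" symptom. Note also that outside a SLURM cluster Lightning does not start the other nodes for you: the same `python train.py --config PATH` command has to be launched once per node, with `MASTER_ADDR`, `MASTER_PORT`, and `NODE_RANK` set in each node's environment (e.g. `NODE_RANK=0` on the master and `NODE_RANK=1` through `5` on the workers). On a SLURM cluster, launching with `srun` lets Lightning read this information from the SLURM environment automatically.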
Replies: 2 comments

- hi @rahaazad2 👋! are you changing the …

- I had a similar problem, which was caused by an interaction between argparse and the way PyTorch Lightning launches processes for multi-GPU runs. To solve it, I adjusted the parsing: …
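The comment above is truncated, but a common workaround for argparse clashing with Lightning's multi-GPU launching is to make the parser tolerate arguments it does not recognize, since re-launched worker processes can receive a command line the parser was not written for. The sketch below is an assumed reconstruction of that kind of fix, not the commenter's actual code:

```python
import argparse


def parse_config():
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", type=str, required=True)
    # parse_known_args() returns (namespace, leftover) and, unlike parse_args(),
    # does not abort on arguments it does not recognize.
    args, _unknown = parser.parse_known_args()
    return args
```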