I have a cluster with 4 GPUs on a single node. I need to run DDP training using all the GPUs available on that node. From the docs, I set num_workers=1
and devices=4, num_nodes=1
for PyTorch Lightning. But it’s not working as expected. Am I missing something?
Here is a minimal code for it.
import torch
from lightning.pytorch import LightningModule, Trainer
from torch.utils.data import DataLoader, Dataset
from argparse import ArgumentParser
import ray.train.lightning
import ray.train.torch
from ray.train.torch import TorchTrainer
from ray.train import ScalingConfig
from ray import train
import time
class RandomDataset(Dataset):
def __init__(self, size, length):
self.len = length
self.data = torch.randn(length, size)
def __getitem__(self, index):
return self.data[index]
def __len__(self):
return self.len
class BoringModel(LightningModule):
def __init__(self):
super().__init__()
self.layer = torch.nn.Linear(32, 2048)
self.layer2 = torch.nn.Linear(2048, 2)
def forward(self, x):
return self.layer2(self.layer(x))
def training_step(self, batch, batch_idx):
loss = self(batch).sum()
return loss
def configure_optimizers(self):
return torch.optim.SGD(self.parameters(), lr=0.1)
def run(config):
train_data = DataLoader(RandomDataset(32, 64), batch_size=2)
model = BoringModel()
plugins = [
ray.train.lightning.RayLightningEnvironment(),
]
trainer = Trainer(
strategy=ray.train.lightning.RayDDPStrategy(),
plugins=plugins,
enable_model_summary=False,
profiler='simple',
**config,
)
trainer.fit(model, train_dataloaders=train_data)
def _main():
config = {}
config["num_nodes"] = 1
config["devices"] = 4
trainer = TorchTrainer(
train_loop_per_worker=run,
train_loop_config=config,
scaling_config=ScalingConfig(
num_workers=1,
use_gpu=True,
),
)
trainer.fit()
if __name__ == "__main__":
_main()