How to run multi-GPU single node training with ray and PyTorch Lightning?

I have a cluster with 4 GPUs on a single node. I need to run DDP training using all the GPUs available on that node. From the docs, I set num_workers=1 and devices=4, num_nodes=1 for PyTorch Lightning. But it’s not working as expected. Am I missing something?

Here is a minimal code for it.

import torch
from lightning.pytorch import LightningModule, Trainer
from torch.utils.data import DataLoader, Dataset
from argparse import ArgumentParser
import ray.train.lightning
import ray.train.torch
from ray.train.torch import TorchTrainer
from ray.train import ScalingConfig
from ray import train
import time


class RandomDataset(Dataset):
    def __init__(self, size, length):
        self.len = length
        self.data = torch.randn(length, size)

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return self.len


class BoringModel(LightningModule):
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 2048)
        self.layer2 = torch.nn.Linear(2048, 2)

    def forward(self, x):
        return self.layer2(self.layer(x))

    def training_step(self, batch, batch_idx):
        loss = self(batch).sum()
        return loss

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.1)


def run(config):
    train_data = DataLoader(RandomDataset(32, 64), batch_size=2)

    model = BoringModel()
    plugins = [
        ray.train.lightning.RayLightningEnvironment(),
    ]

    trainer = Trainer(
        strategy=ray.train.lightning.RayDDPStrategy(),
        plugins=plugins,
        enable_model_summary=False,
        profiler='simple',
        **config,
    )
    trainer.fit(model, train_dataloaders=train_data)

def _main():
    config = {}
    config["num_nodes"] = 1
    config["devices"] = 4

    trainer = TorchTrainer(
        train_loop_per_worker=run,
        train_loop_config=config,
        scaling_config=ScalingConfig(
            num_workers=1,
            use_gpu=True,
        ),
    )

    trainer.fit()


if __name__ == "__main__":
    _main()