PyTorch DDP with Ray Tune fails

I followed the MNIST DDP example and adapted it for my segmentation use case.

def main(args, num_samples=10, cpu_per_trial=8, gpus_per_trial=2):
    config = {
        "lr": tune.loguniform(1e-4, 1e-1),
        "weight_decay": tune.loguniform(6e-6, 1e-3),
        "lr_warmup_epochs": tune.choice([2, 4]),
        "lr_warmup_method": tune.choice(['linear', 'constant']),
        "lr_warmup_decay": tune.loguniform(1e-3, 1e-1)
    }
    scheduler = ASHAScheduler(
        metric="iou",
        mode="max",
        max_t=MAX_NUM_EPOCHS,
        grace_period=GRACE_PERIOD,
        reduction_factor=2)
    reporter = CLIReporter(metric_columns=["loss", "f1", "iou", "training_iteration"])

    distributed_train = DistributedTrainableCreator(
        train_seg_tuner(args, config),
        use_gpu=True,
        num_workers=2,  # number of parallel workers to use
        num_cpus_per_worker=cpu_per_trial
    )
    result = tune.run(
        distributed_train,
        resources_per_trial={"cpu": cpu_per_trial, "gpu": gpus_per_trial},
        num_samples=num_samples,
        scheduler=scheduler,
        local_dir='outputs/raytune_result',
        keep_checkpoints_num=1,
        checkpoint_score_attr='max-iou',
        progress_reporter=reporter)

Error Traceback:

WARNING:torch.distributed.run:
*****************************************
Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
*****************************************
Saving trained model name: Result_<ray.tune.sample.Float object at 0x7f1fa0eb5910>_5_dice_loss
| distributed init (rank 0): env://
Saving trained model name: Result_<ray.tune.sample.Float object at 0x7f1523460e80>_5_dice_loss
| distributed init (rank 1): env://
Namespace(device='cuda', dist_backend='nccl', dist_url='env://', distributed=True, gpu=0, rank=0, world_size=2)
Traceback (most recent call last):
  File "train_seg_tuner.py", line 356, in <module>
    main(args, num_samples=4, cpu_per_trial=8, gpus_per_trial=2)
  File "train_seg_tuner.py", line 292, in main
    train_seg_tuner(args, config),
  File "train_seg_tuner.py", line 196, in train_seg_tuner
    optimizer = optim.AdamW(net.parameters(), lr=config["lr"], weight_decay=config["weight_decay"], amsgrad=True)
  File "/home/username/anaconda3/envs/bop/lib/python3.8/site-packages/torch/optim/adamw.py", line 62, in __init__
    if not 0.0 <= lr:
TypeError: '<=' not supported between instances of 'float' and 'Float'

The relevant part of the training function:
@wandb_mixin
def train_seg_tuner(args, config, checkpoint_dir="Results"):
    fname = "Result_" + str(config["lr"]) + '_' + str(epochs) + '_' + exp
    print(f"Saving trained model name: {fname}")
    
    # init for distributed
    init_distributed_mode(args)
    print(args)
    device = torch.device(args.device)
    
    if args.rank == 0:  # only on main process
        wandb.init(project="v2-tuning", entity="my-entity")
        wandb.config.update(args)
        wandb.config.update(config)
    
    # setup model
    net = CurveNet(num_masks=NUM_MASKS)
    net.to(device)
    if args.distributed:
        net = torch.nn.SyncBatchNorm.convert_sync_batchnorm(net)

    if args.distributed:
        net = torch.nn.parallel.DistributedDataParallel(net, device_ids=[args.gpu])

    # setup datasets -> split 80%-20% train-val dataset
    train_files = glob.glob("/path/to/dataset/", recursive=True)
    val_files = glob.glob("/path/to/dataset/", recursive=True)

    train_set = MyDataset(files=train_files)
    print("Training samples:", len(train_set))
    val_set = MyDataset(files=val_files)
    print("Validation samples:", len(val_set))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_set)
        val_sampler = torch.utils.data.distributed.DistributedSampler(val_set)
    else:
        train_sampler = torch.utils.data.RandomSampler(train_set)
        val_sampler = torch.utils.data.SequentialSampler(val_set)
    
    # setup dataloaders
    trainloader = torch.utils.data.DataLoader(
        train_set,
        batch_size=16,
        sampler=train_sampler,
        num_workers=4,
        collate_fn=collate_fn,
        drop_last=True,
        pin_memory=True,
    )
 
    valloader = torch.utils.data.DataLoader(
        val_set,
        batch_size=16,
        sampler=val_sampler,
        num_workers=4,
        collate_fn=collate_fn,
        pin_memory=True
    )

    criterion = dice # set this variable to selected loss

    # setup optimizers
    optimizer = optim.AdamW(net.parameters(), lr=config["lr"], weight_decay=config["weight_decay"], amsgrad=True)
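
Not shown in the snippet above: the CLIReporter columns ("loss", "f1", "iou") and checkpoint_score_attr='max-iou' are only populated by what the trainable reports back each iteration. Below is a minimal sketch of the epoch loop that typically follows inside such a function, using the Ray 1.x function API; train_one_epoch and evaluate are hypothetical helpers, and with the distributed trainable wrapper Ray's examples use a distributed-aware checkpoint helper rather than plain tune.checkpoint_dir.

    # Sketch only: train_one_epoch and evaluate are hypothetical helpers.
    for epoch in range(epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)  # reshuffle the shards each epoch
        train_one_epoch(net, criterion, optimizer, trainloader, device)
        loss, f1, iou = evaluate(net, criterion, valloader, device)

        with tune.checkpoint_dir(step=epoch) as checkpoint_dir:
            torch.save(net.state_dict(), os.path.join(checkpoint_dir, "model.pt"))
        tune.report(loss=loss, f1=f1, iou=iou)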

Launching the script with torchrun --nproc-per-node=2 gives the above error.
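
For context on the TypeError: config["lr"] is still a ray.tune.sample.Float search-space object at this point (it is also what shows up in the "Saving trained model name" log line), because train_seg_tuner(args, config) calls the training function immediately instead of handing Tune a callable; the samples are only resolved to concrete floats once Tune itself invokes the trainable. A sketch of what passing a callable can look like with this (older, since-deprecated) DistributedTrainableCreator API, using tune.with_parameters to bind args; it assumes the function signature is reordered to train_seg_tuner(config, checkpoint_dir=None, args=None) and has not been verified against that Ray version:

    distributed_train = DistributedTrainableCreator(
        tune.with_parameters(train_seg_tuner, args=args),  # pass the function, do not call it
        use_gpu=True,
        num_workers=2,
        num_cpus_per_worker=cpu_per_trial,
    )
    result = tune.run(
        distributed_train,
        # resources_per_trial is omitted in this sketch; the creator declares worker resources itself
        num_samples=num_samples,
        scheduler=scheduler,
        local_dir='outputs/raytune_result',
        keep_checkpoints_num=1,
        checkpoint_score_attr='max-iou',
        progress_reporter=reporter,
    )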

Hi @dudeperf3ct, maybe you can try running this with Ray Train instead? It comes with a Ray Tune integration.

@matthewdeng I was able to run Ray Tune with Ray Train. Thank you for the awesome library!
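
For readers landing on this thread later, here is a rough sketch of what the Ray Train + Ray Tune combination can look like with the current Ray 2.x API (newer than the tune.run / DistributedTrainableCreator code above, so class and module names may differ from the version used at the time). CurveNet, NUM_MASKS, MyDataset, collate_fn, dice, train_files, MAX_NUM_EPOCHS and GRACE_PERIOD are taken from the original script.

import torch
from torch import optim
from ray import train, tune
from ray.train import ScalingConfig
from ray.train.torch import TorchTrainer, prepare_model, prepare_data_loader
from ray.tune.schedulers import ASHAScheduler

def train_loop_per_worker(config):
    # Ray Train starts the workers and sets up the process group; prepare_model
    # wraps the model in DDP and prepare_data_loader adds a DistributedSampler
    # and moves batches to the right device.
    net = prepare_model(CurveNet(num_masks=NUM_MASKS))
    loader = prepare_data_loader(torch.utils.data.DataLoader(
        MyDataset(files=train_files), batch_size=16,
        collate_fn=collate_fn, shuffle=True))
    optimizer = optim.AdamW(net.parameters(), lr=config["lr"],
                            weight_decay=config["weight_decay"], amsgrad=True)
    for epoch in range(config["epochs"]):
        for images, targets in loader:
            loss = dice(net(images), targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        # report validation metrics here instead of the placeholder iou
        train.report({"loss": loss.item(), "iou": 0.0})

trainer = TorchTrainer(
    train_loop_per_worker,
    scaling_config=ScalingConfig(num_workers=2, use_gpu=True),
)
tuner = tune.Tuner(
    trainer,
    param_space={"train_loop_config": {
        "lr": tune.loguniform(1e-4, 1e-1),
        "weight_decay": tune.loguniform(6e-6, 1e-3),
        "epochs": MAX_NUM_EPOCHS,
    }},
    tune_config=tune.TuneConfig(
        metric="iou", mode="max", num_samples=4,
        scheduler=ASHAScheduler(max_t=MAX_NUM_EPOCHS, grace_period=GRACE_PERIOD, reduction_factor=2),
    ),
)
results = tuner.fit()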

Is ray.train the only way to use ray.tune for PyTorch DDP training?

Maybe it's about how the argparse.Namespace and the config dict are handled; you can do it as shown here:
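
The example being referred to is not linked above; purely as an illustration (not necessarily what was meant), one common pattern is to merge the sampled Tune config into the argparse Namespace at the top of the trainable, so the rest of the code keeps reading hyperparameters from args:

def train_seg_tuner(config, checkpoint_dir=None, args=None):
    # Overwrite/extend the CLI Namespace with the values Tune sampled for this trial.
    for key, value in config.items():
        setattr(args, key, value)
    # ... the rest of the training code can keep using args.lr, args.weight_decay, etc.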