PyTorch DDP with Ray Tune fails

I followed the MNIST DDP example and adapted it for my segmentation use case.

def main(args, num_samples=10, cpu_per_trial=8, gpus_per_trial=2):
    config = {
        "lr": tune.loguniform(1e-4, 1e-1),
        "weight_decay": tune.loguniform(6e-6, 1e-3),
        "lr_warmup_epochs": tune.choice([2, 4]),
        "lr_warmup_method": tune.choice(['linear', 'constant']),
        "lr_warmup_decay": tune.loguniform(1e-3, 1e-1)
    }
    scheduler = ASHAScheduler(
        metric="iou",
        mode="max",
        max_t=MAX_NUM_EPOCHS,
        grace_period=GRACE_PERIOD,
        reduction_factor=2)
    reporter = CLIReporter(metric_columns=["loss", "f1", "iou", "training_iteration"])

    distributed_train = DistributedTrainableCreator(
        train_seg_tuner(args, config),
        use_gpu=True,
        num_workers=2,  # number of parallel workers to use
        num_cpus_per_worker=cpu_per_trial
    )
    result = tune.run(
        distributed_train,
        resources_per_trial={"cpu": cpu_per_trial, "gpu": gpus_per_trial},
        num_samples=num_samples,
        scheduler=scheduler,
        local_dir='outputs/raytune_result',
        keep_checkpoints_num=1,
        checkpoint_score_attr='max-iou',
        progress_reporter=reporter)

Error Traceback:

WARNING:torch.distributed.run:
*****************************************
Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
*****************************************
Saving trained model name: Result_<ray.tune.sample.Float object at 0x7f1fa0eb5910>_5_dice_loss
| distributed init (rank 0): env://
Saving trained model name: Result_<ray.tune.sample.Float object at 0x7f1523460e80>_5_dice_loss
| distributed init (rank 1): env://
Namespace(device='cuda', dist_backend='nccl', dist_url='env://', distributed=True, gpu=0, rank=0, world_size=2)
Traceback (most recent call last):
  File "train_seg_tuner.py", line 356, in <module>
    main(args, num_samples=4, cpu_per_trial=8, gpus_per_trial=2)
  File "train_seg_tuner.py", line 292, in main
    train_seg_tuner(args, config),
  File "train_seg_tuner.py", line 196, in train_seg_tuner
    optimizer = optim.AdamW(net.parameters(), lr=config["lr"], weight_decay=config["weight_decay"], amsgrad=True)
  File "/home/username/anaconda3/envs/bop/lib/python3.8/site-packages/torch/optim/adamw.py", line 62, in __init__
    if not 0.0 <= lr:
TypeError: '<=' not supported between instances of 'float' and 'Float'

And here is the training function:

@wandb_mixin
def train_seg_tuner(args, config, checkpoint_dir="Results"):
    fname = "Result_" + str(config["lr"]) + '_' + str(epochs) + '_' + exp
    print(f"Saving trained model name: {fname}")
    
    # init for distributed
    init_distributed_mode(args)
    print(args)
    device = torch.device(args.device)
    
    if args.rank == 0:  # only on main process
        wandb.init(project="v2-tuning", entity="my-entity")
        wandb.config.update(args)
        wandb.config.update(config)
    
    # setup model
    net = CurveNet(num_masks=NUM_MASKS)
    net.to(device)
    if args.distributed:
        net = torch.nn.SyncBatchNorm.convert_sync_batchnorm(net)

    if args.distributed:
        net = torch.nn.parallel.DistributedDataParallel(net, device_ids=[args.gpu])

    # setup datasets -> split 80%-20% train-val dataset
    train_files = glob.glob("/path/to/dataset/", recursive=True)
    val_files = glob.glob("/path/to/dataset/", recursive=True)

    train_set = MyDataset(files=train_files)
    print("Training samples:", len(train_set))
    val_set = MyDataset(files=val_files)
    print("Validation samples:", len(val_set))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_set)
        val_sampler = torch.utils.data.distributed.DistributedSampler(val_set)
    else:
        train_sampler = torch.utils.data.RandomSampler(train_set)
        val_sampler = torch.utils.data.SequentialSampler(val_set)
    
    # setup dataloaders
    trainloader = torch.utils.data.DataLoader(
        train_set,
        batch_size=16,
        sampler=train_sampler,
        num_workers=4,
        collate_fn=collate_fn,
        drop_last=True,
        pin_memory=True,
    )
 
    valloader = torch.utils.data.DataLoader(
        val_set,
        batch_size=16,
        sampler=val_sampler,
        num_workers=4,
        collate_fn=collate_fn,
        pin_memory=True
    )

    criterion = dice # set this variable to selected loss

    # setup optimizers
    optimizer = optim.AdamW(net.parameters(), lr=config["lr"], weight_decay=config["weight_decay"], amsgrad=True)

Launching the script with torchrun --nproc-per-node=2 gives the error above.
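
For context on where the Float object comes from: in main(), DistributedTrainableCreator receives train_seg_tuner(args, config), i.e. the function is executed immediately with the raw search-space definitions (tune.loguniform(...) returns a sample object, which is also why the saved model name prints as Result_<ray.tune.sample.Float object ...>). Tune only substitutes concrete values when it invokes the trainable itself, so the creator has to be given a callable rather than the result of calling one. A rough, untested sketch of what that could look like, using a hypothetical trainable wrapper defined inside main() that closes over args and assumes DistributedTrainableCreator calls it as (config, checkpoint_dir):

def trainable(config, checkpoint_dir=None):
    # args and cpu_per_trial are captured from the enclosing main() scope;
    # here config holds the concrete values sampled by Tune for this trial
    train_seg_tuner(args, config, checkpoint_dir=checkpoint_dir)

distributed_train = DistributedTrainableCreator(
    trainable,                        # pass the function itself, do not call it
    use_gpu=True,
    num_workers=2,
    num_cpus_per_worker=cpu_per_trial,
)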

Hi @dudeperf3ct, maybe you can try running this with Ray Train instead? It comes with a Ray Tune integration.
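
For anyone landing here later, a rough, untested sketch of what that integration can look like with the Ray 2.x TorchTrainer API (train_loop_per_worker is an illustrative name, the reported metric values are placeholders, CurveNet / NUM_MASKS / MAX_NUM_EPOCHS / GRACE_PERIOD are the names from the original script, and the exact imports and reporting API differ between Ray versions):

from torch import optim
from ray import train, tune
from ray.train import ScalingConfig
from ray.train.torch import TorchTrainer
from ray.tune.schedulers import ASHAScheduler

def train_loop_per_worker(config):
    # Ray Train sets up the torch.distributed process group for each worker;
    # config holds the concrete hyperparameter values sampled by Tune.
    net = CurveNet(num_masks=NUM_MASKS)
    net = train.torch.prepare_model(net)  # moves the model to its device and wraps it in DDP
    optimizer = optim.AdamW(net.parameters(), lr=config["lr"],
                            weight_decay=config["weight_decay"], amsgrad=True)
    for epoch in range(MAX_NUM_EPOCHS):
        # ... training and validation loops go here ...
        train.report({"loss": 0.0, "f1": 0.0, "iou": 0.0})  # report the real per-epoch metrics

trainer = TorchTrainer(
    train_loop_per_worker,
    scaling_config=ScalingConfig(num_workers=2, use_gpu=True),
)

tuner = tune.Tuner(
    trainer,
    param_space={
        "train_loop_config": {
            "lr": tune.loguniform(1e-4, 1e-1),
            "weight_decay": tune.loguniform(6e-6, 1e-3),
        }
    },
    tune_config=tune.TuneConfig(
        metric="iou",
        mode="max",
        num_samples=4,
        scheduler=ASHAScheduler(max_t=MAX_NUM_EPOCHS, grace_period=GRACE_PERIOD, reduction_factor=2),
    ),
)
results = tuner.fit()

With this setup the script is launched as a plain python script; Ray starts the distributed workers itself, so torchrun is not needed.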

@matthewdeng I was able to run Ray Tune with Ray Train. Thank you for the awesome library!