I followed the MNIST DDP example and adapted it for my segmentation use case. Here is the tuning entry point:
def main(args, num_samples=10, cpu_per_trial=8, gpus_per_trial=2):
    config = {
        "lr": tune.loguniform(1e-4, 1e-1),
        "weight_decay": tune.loguniform(6e-6, 1e-3),
        "lr_warmup_epochs": tune.choice([2, 4]),
        "lr_warmup_method": tune.choice(['linear', 'constant']),
        "lr_warmup_decay": tune.loguniform(1e-3, 1e-1)
    }
    scheduler = ASHAScheduler(
        metric="iou",
        mode="max",
        max_t=MAX_NUM_EPOCHS,
        grace_period=GRACE_PERIOD,
        reduction_factor=2)
    reporter = CLIReporter(metric_columns=["loss", "f1", "iou", "training_iteration"])
    distributed_train = DistributedTrainableCreator(
        train_seg_tuner(args, config),
        use_gpu=True,
        num_workers=2,  # number of parallel workers to use
        num_cpus_per_worker=cpu_per_trial
    )
    result = tune.run(
        distributed_train,
        resources_per_trial={"cpu": cpu_per_trial, "gpu": gpus_per_trial},
        num_samples=num_samples,
        scheduler=scheduler,
        local_dir='outputs/raytune_result',
        keep_checkpoints_num=1,
        checkpoint_score_attr='max-iou',
        progress_reporter=reporter)
Error Traceback:
WARNING:torch.distributed.run:
*****************************************
Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
*****************************************
Saving trained model name: Result_<ray.tune.sample.Float object at 0x7f1fa0eb5910>_5_dice_loss
| distributed init (rank 0): env://
Saving trained model name: Result_<ray.tune.sample.Float object at 0x7f1523460e80>_5_dice_loss
| distributed init (rank 1): env://
Namespace(device='cuda', dist_backend='nccl', dist_url='env://', distributed=True, gpu=0, rank=0, world_size=2)
Traceback (most recent call last):
File "train_seg_tuner.py", line 356, in <module>
main(args, num_samples=4, cpu_per_trial=8, gpus_per_trial=2)
File "train_seg_tuner.py", line 292, in main
train_seg_tuner(args, config),
File "train_seg_tuner.py", line 196, in train_seg_tuner
optimizer = optim.AdamW(net.parameters(), lr=config["lr"], weight_decay=config["weight_decay"], amsgrad=True)
File "/home/username/anaconda3/envs/bop/lib/python3.8/site-packages/torch/optim/adamw.py", line 62, in __init__
if not 0.0 <= lr:
TypeError: '<=' not supported between instances of 'float' and 'Float'
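Note that the "Saving trained model name" lines above already show <ray.tune.sample.Float object ...> where a learning-rate value should be, which matches the TypeError: inside train_seg_tuner, config["lr"] is still a Ray Tune search-space (Domain) object rather than a sampled float. A minimal check outside the training script (my own sketch, not taken from either example) shows the same kind of object:

from ray import tune

# The raw search-space dict holds Domain objects, not numbers.
config = {"lr": tune.loguniform(1e-4, 1e-1)}
print(type(config["lr"]))  # <class 'ray.tune.sample.Float'> on my Ray version
# Comparing it against a float fails the same way AdamW's `0.0 <= lr` check does:
# 0.0 <= config["lr"]  ->  TypeError: '<=' not supported between 'float' and 'Float'

For completeness, here is the trainable that the traceback points into (line 196 is the AdamW call):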
@wandb_mixin
def train_seg_tuner(args, config, checkpoint_dir="Results"):
    fname = "Result_" + str(config["lr"]) + '_' + str(epochs) + '_' + exp
    print(f"Saving trained model name: {fname}")

    # init for distributed
    init_distributed_mode(args)
    print(args)
    device = torch.device(args.device)

    if args.rank == 0:  # only on main process
        wandb.init(project="v2-tuning", entity="my-entity")
        wandb.config.update(args)
        wandb.config.update(config)

    # setup model
    net = CurveNet(num_masks=NUM_MASKS)
    net.to(device)
    if args.distributed:
        net = torch.nn.SyncBatchNorm.convert_sync_batchnorm(net)
    if args.distributed:
        net = torch.nn.parallel.DistributedDataParallel(net, device_ids=[args.gpu])

    # setup datasets -> split 80%-20% train-val dataset
    train_files = glob.glob("/path/to/dataset/", recursive=True)
    val_files = glob.glob("/path/to/dataset/", recursive=True)
    train_set = MyDataset(files=train_files)
    print("Training samples:", len(train_set))
    val_set = MyDataset(files=val_files)
    print("Validation samples:", len(val_set))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_set)
        val_sampler = torch.utils.data.distributed.DistributedSampler(val_set)
    else:
        train_sampler = torch.utils.data.RandomSampler(train_set)
        val_sampler = torch.utils.data.SequentialSampler(val_set)

    # setup dataloaders
    trainloader = torch.utils.data.DataLoader(
        train_set,
        batch_size=16,
        sampler=train_sampler,
        num_workers=4,
        collate_fn=collate_fn,
        drop_last=True,
        pin_memory=True,
    )
    valloader = torch.utils.data.DataLoader(
        val_set,
        batch_size=16,
        sampler=val_sampler,
        num_workers=4,
        collate_fn=collate_fn,
        pin_memory=True
    )

    criterion = dice  # set this variable to selected loss

    # setup optimizers
    optimizer = optim.AdamW(net.parameters(), lr=config["lr"], weight_decay=config["weight_decay"], amsgrad=True)
Launching the script with torchrun --nproc-per-node=2 produces the error above.
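For comparison, my understanding of the MNIST DDP tuning example I started from is that the training function is passed to DistributedTrainableCreator uncalled, and the search space is handed to tune.run via config=, so Tune substitutes sampled values into config before the function runs. A rough sketch of that wiring as I read it (function and variable names here are illustrative, and binding my extra args argument, e.g. with functools.partial, is left out):

from ray import tune
from ray.tune.integration.torch import DistributedTrainableCreator

def train_fn(config, checkpoint_dir=None):
    # config["lr"], config["weight_decay"], ... arrive here as plain floats sampled by Tune
    ...

config = {
    "lr": tune.loguniform(1e-4, 1e-1),
    "weight_decay": tune.loguniform(6e-6, 1e-3),
}

distributed_train = DistributedTrainableCreator(
    train_fn,                 # the function itself, not its return value
    use_gpu=True,
    num_workers=2,
    num_cpus_per_worker=8,
)
result = tune.run(
    distributed_train,
    config=config,            # the search space goes here; Tune samples it per trial
    num_samples=4,
)

Is that the intended way to get the sampled config into the distributed trainable, or am I missing something about how the config reaches the workers?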