I am using Ray Train for hyperparameter tuning. The config is shown below:
def main(args, num_samples=2):
    """Run a Ray Tune search over ``train_tuner`` and report the best trial.

    A Ray Train ``Trainer`` using the "torch" backend is converted into a
    Tune trainable and passed to ``tune.run`` together with the search
    space defined below. After the run, the trial with the lowest
    ``class_error`` is selected and its config, last reported metrics and
    checkpoint location are printed.

    Args:
        args: Parsed command-line namespace. ``args.num_workers`` and
            ``args.num_gpus`` are read here.
        num_samples: Forwarded to ``tune.run`` as ``num_samples``.
    """
    # Integer division: the per-worker GPU share is num_gpus // num_workers,
    # so it evaluates to 0 whenever num_gpus < num_workers.
    gpus_per_worker = args.num_gpus // args.num_workers
    trainer = Trainer(
        "torch",
        num_workers=args.num_workers,
        use_gpu=True,
        resources_per_worker={"GPU": gpus_per_worker, "CPU": 20},
    )
    config = {
        # model parameters
        "lr": tune.loguniform(1e-4, 1e-1),
        "lr_backbone": tune.loguniform(1e-6, 1e-4),
        "weight_decay": tune.loguniform(1e-6, 1e-2),
        "epochs": 100,
        "lr_drop": 75,  # adjust this according to epochs, after how many epochs to drop lr
        "num_classes": 2,
        "device": "cuda",
        # backbone
        "clip_max_norm": 0.1,
        "frozen_weights": None,
        "backbone": tune.choice(["resnet18"]),
        "dilation": False,
        "position_embedding": "learned",  # default is sine    "learned" not working yet
        "enc_layers": tune.choice([3, 4, 5, 6]),
        "dec_layers": tune.choice([3, 4, 5, 6]),
        "dim_feedforward": tune.choice([512, 256, 1024]),
        "hidden_dim": tune.choice([128, 256, 512]),
        "dropout": tune.choice([0.1, 0.3, 0.5]),
        "nheads": tune.choice([4, 8]),
        "num_queries": tune.choice([10, 50, 100]),
        "pre_norm": False,
        # segmentation
        "masks": False,
        # loss
        "aux_loss": False,
        # matcher
        "set_cost_class": 1,
        "set_cost_bbox": 5,
        "set_cost_giou": 2,
        # * Loss coefficients
        "mask_loss_coef": 1,
        "dice_loss_coef": 1,
        "bbox_loss_coef": 5,
        "giou_loss_coef": 2,
        "eos_coef": 0.1,
        "root": "data/SH5_Bb_Frac",  # adjust this to your data folder
        "output_dir": "ray_tune_results/",  # adjust this to your output folder
        "seed": 42,
        "eval": False,
        "amp": True,
    }
    reporter = CLIReporter(metric_columns=["loss", "class_error", "training_iteration"])
    trainable = trainer.to_tune_trainable(train_tuner)
    result = tune.run(
        trainable,
        config=config,
        num_samples=num_samples,
        local_dir="ray_tune_results",
        keep_checkpoints_num=1,
        progress_reporter=reporter,
    )
    # Select the trial by minimum class_error, then print its summary.
    best_trial = result.get_best_trial("class_error", "min")
    print("Best trial config: {}".format(best_trial.config))
    print("Best trial final validation loss: {}".format(best_trial.last_result["loss"]))
    print(
        "Best trial final validation class_error: {}".format(
            best_trial.last_result["class_error"]
        )
    )
    best_checkpoint_dir = best_trial.checkpoint.value
    print("Best checkpoint dir", best_checkpoint_dir)
if __name__ == "__main__":
    # Script entry point: parse the CLI flags, start Ray, run the search
    # and print the wall-clock duration in minutes.
    parser = argparse.ArgumentParser(description="Hyperparams")
    parser.add_argument(
        "--num-gpus",
        "-g",
        type=int,
        default=2,
        # main() divides this value by --num-workers to get the per-worker
        # GPU share, so the flag is a total, not a per-worker count.
        help="The total number of GPUs, divided evenly among the workers.",
    )
    parser.add_argument(
        "--num-workers",
        "-n",
        type=int,
        default=2,
        help="The number of workers for training.",
    )
    args = parser.parse_args()
    start = time.time()
    ray.init()
    # num_samples -> number of random search experiments to run
    main(args, num_samples=4)
    stop = time.time()
    print("Total execution time is {} min".format((stop - start) / (60)))
System Configuration: 4 x A100 80GB GPU and 128 core CPU
python3 train_tuner.py -n 2 -g 4
With batch_size = 2 and num_workers for dataloader = 10
(BaseWorkerMixin pid=3784043) [LOCAL RANK 0 | WORLD RANK 0]:
(BaseWorkerMixin pid=3784043) ------------------------------ EPOCH: 1 | 100 ------------------------------                                                                                                                                                                 
(BaseWorkerMixin pid=3784043)                                                                                                                                                                                                                                              
Training:   0%|          | 0/1189 [00:00<?, ?it/s]                                                                                                                                                                                                                         
(BaseWorkerMixin pid=3784038) [LOCAL RANK 1 | WORLD RANK 1]:                                                                                                                                                                                                               
(BaseWorkerMixin pid=3784038) ------------------------------ EPOCH: 1 | 100 ------------------------------                                                                                                                                                                 
(BaseWorkerMixin pid=3784038)                                                                                                                                                                                                                                              
Training:   0%|          | 0/1189 [00:00<?, ?it/s]                                                                                                                                                                                                                         
Training:   0%|          | 0/1189 [00:01<?, ?it/s]                                                                                                                                                                                                                         
(BaseWorkerMixin pid=3784060) 2022-05-30 13:10:13,364   INFO torch.py:244 -- Moving model to device: cuda:0                                                                                                                                                                
(BaseWorkerMixin pid=3784060) number of params: 32916998                                                                                                                                                                                                                   
(BaseWorkerMixin pid=3784060) 2022-05-30 13:10:13,462   INFO torch.py:247 -- Wrapping provided model in DDP.                                                                                                                                                               
Training:   0%|          | 0/1189 [00:00<?, ?it/s]                                                                                                                                                                                                                         
Training:   0%|          | 0/1189 [00:00<?, ?it/s]                                                                                                                                                                                                                         
(BaseWorkerMixin pid=3784034) [LOCAL RANK 1 | WORLD RANK 1]:                                                                                                                                                                                                               
(BaseWorkerMixin pid=3784034) ------------------------------ EPOCH: 1 | 100 ------------------------------                                                                                                                                                                 
(BaseWorkerMixin pid=3784034)                                                                                                                                                                                                                                              
(BaseWorkerMixin pid=3784060) [LOCAL RANK 0 | WORLD RANK 0]:                                                                                                                                                                                                               
(BaseWorkerMixin pid=3784060) ------------------------------ EPOCH: 1 | 100 ------------------------------                                                                                                                                                                 
(BaseWorkerMixin pid=3784060)                                                                                                                                                                                                                                              
Training:   0%|          | 0/1189 [00:00<?, ?it/s]                                                                                                                                                                                                                         
== Status ==                                                                                                                                                                                                                                                               
Current time: 2022-05-30 13:10:15 (running for 00:00:21.21)                                                                                                                                                                                                                
Memory usage on this node: 47.2/503.7 GiB                                                                                                                                                                                                                                  
Using FIFO scheduling algorithm.                                                                                                                                                                                                                                           
Resources requested: 6.0/128 CPUs, 4.0/5 GPUs, 0.0/330.38 GiB heap, 0.0/145.58 GiB objects                                                                                                                                                                                 
Result logdir: /home/org/cced_internal/ray_tune_results/tune_function_2022-05-30_13-09-54                                                                                                                                                                            
Number of trials: 4/4 (2 PENDING, 2 RUNNING)                                                                                                                                                                                                                               
+---------------------------+----------+-----------------------+------------+--------------+-------------------+-----------+--------------+--------------+-----------+---------------+----------+---------------+----------------+                                         
| Trial name                | status   | loc                   | backbone   |   dec_layers |   dim_feedforward |   dropout |   enc_layers |   hidden_dim |        lr |   lr_backbone |   nheads |   num_queries |   weight_decay |                                         
|---------------------------+----------+-----------------------+------------+--------------+-------------------+-----------+--------------+--------------+-----------+---------------+----------+---------------+----------------|                                         
| tune_function_cc2ab_00000 | RUNNING  | 5.178.113.239:3784047 | resnet18   |            4 |               256 |       0.3 |            4 |          128 | 0.0180957 |   8.28432e-05 |        8 |            50 |    7.30983e-05 |                                         
| tune_function_cc2ab_00001 | RUNNING  | 5.178.113.239:3784075 | resnet18   |            5 |               512 |       0.3 |            5 |          512 | 0.0101051 |   9.06266e-06 |        4 |           100 |    0.000146256 |                                         
| tune_function_cc2ab_00002 | PENDING  |                       | resnet18   |            6 |              1024 |       0.1 |            6 |          128 | 0.0403562 |   2.03414e-06 |        4 |            50 |    0.00119554  |                                         
| tune_function_cc2ab_00003 | PENDING  |                       | resnet18   |            5 |               256 |       0.1 |            3 |          512 | 0.0402638 |   2.79748e-05 |        8 |            50 |    0.00215873  |                                         
+---------------------------+----------+-----------------------+------------+--------------+-------------------+-----------+--------------+--------------+-----------+---------------+----------+---------------+----------------+                                         
                                                                                                                                                                                                                                                                           
                                                                                                                                                                                                                                                                           
== Status ==                                                                                                                                                                                                                                                               
Current time: 2022-05-30 13:10:20 (running for 00:00:26.22)                                                                                                                                                                                                                
Memory usage on this node: 46.9/503.7 GiB                                                                                                                                                                                                                                  
Using FIFO scheduling algorithm.                                                                                                                                                                                                                                           
Resources requested: 6.0/128 CPUs, 4.0/5 GPUs, 0.0/330.38 GiB heap, 0.0/145.58 GiB objects                                                                                                                                                                                 
Result logdir: /home/org/cced_internal/ray_tune_results/tune_function_2022-05-30_13-09-54                                                                                                                                                                            
Number of trials: 4/4 (2 PENDING, 2 RUNNING)                                                                                                                                                                                                                               
+---------------------------+----------+-----------------------+------------+--------------+-------------------+-----------+--------------+--------------+-----------+---------------+----------+---------------+----------------+                                         
| Trial name                | status   | loc                   | backbone   |   dec_layers |   dim_feedforward |   dropout |   enc_layers |   hidden_dim |        lr |   lr_backbone |   nheads |   num_queries |   weight_decay |                                         
|---------------------------+----------+-----------------------+------------+--------------+-------------------+-----------+--------------+--------------+-----------+---------------+----------+---------------+----------------|                                         
| tune_function_cc2ab_00000 | RUNNING  | 5.178.113.239:3784047 | resnet18   |            4 |               256 |       0.3 |            4 |          128 | 0.0180957 |   8.28432e-05 |        8 |            50 |    7.30983e-05 |                                         
| tune_function_cc2ab_00001 | RUNNING  | 5.178.113.239:3784075 | resnet18   |            5 |               512 |       0.3 |            5 |          512 | 0.0101051 |   9.06266e-06 |        4 |           100 |    0.000146256 |                                         
| tune_function_cc2ab_00002 | PENDING  |                       | resnet18   |            6 |              1024 |       0.1 |            6 |          128 | 0.0403562 |   2.03414e-06 |        4 |            50 |    0.00119554  |                                         
| tune_function_cc2ab_00003 | PENDING  |                       | resnet18   |            5 |               256 |       0.1 |            3 |          512 | 0.0402638 |   2.79748e-05 |        8 |            50 |    0.00215873  |                                         
+---------------------------+----------+-----------------------+------------+--------------+-------------------+-----------+--------------+--------------+-----------+---------------+----------+---------------+----------------+ 
== Status ==                                                                                                                                                                                                                                                               
Current time: 2022-05-30 13:10:30 (running for 00:00:36.24)                                                                                                                                                                                                                
Memory usage on this node: 46.8/503.7 GiB                                                                                                                                                                                                                                  
Using FIFO scheduling algorithm.                                                                                                                                                                                                                                           
Resources requested: 6.0/128 CPUs, 4.0/5 GPUs, 0.0/330.38 GiB heap, 0.0/145.58 GiB objects                                                                                                                                                                                 
Result logdir: /home/org/cced_internal/ray_tune_results/tune_function_2022-05-30_13-09-54                                                                                                                                                                            
Number of trials: 4/4 (2 PENDING, 2 RUNNING)                                                                                                                                                                                                                               
+---------------------------+----------+-----------------------+------------+--------------+-------------------+-----------+--------------+--------------+-----------+---------------+----------+---------------+----------------+                                         
| Trial name                | status   | loc                   | backbone   |   dec_layers |   dim_feedforward |   dropout |   enc_layers |   hidden_dim |        lr |   lr_backbone |   nheads |   num_queries |   weight_decay |                                         
|---------------------------+----------+-----------------------+------------+--------------+-------------------+-----------+--------------+--------------+-----------+---------------+----------+---------------+----------------|                                         
| tune_function_cc2ab_00000 | RUNNING  | 5.178.113.239:3784047 | resnet18   |            4 |               256 |       0.3 |            4 |          128 | 0.0180957 |   8.28432e-05 |        8 |            50 |    7.30983e-05 |                                         
| tune_function_cc2ab_00001 | RUNNING  | 5.178.113.239:3784075 | resnet18   |            5 |               512 |       0.3 |            5 |          512 | 0.0101051 |   9.06266e-06 |        4 |           100 |    0.000146256 |                                         
| tune_function_cc2ab_00002 | PENDING  |                       | resnet18   |            6 |              1024 |       0.1 |            6 |          128 | 0.0403562 |   2.03414e-06 |        4 |            50 |    0.00119554  |                                         
| tune_function_cc2ab_00003 | PENDING  |                       | resnet18   |            5 |               256 |       0.1 |            3 |          512 | 0.0402638 |   2.79748e-05 |        8 |            50 |    0.00215873  |                                         
+---------------------------+----------+-----------------------+------------+--------------+-------------------+-----------+--------------+--------------+-----------+---------------+----------+---------------+----------------+                                         
                                                                                                                                                                                                                                                                           
                                                                                                                                                                                                                                                                           
== Status ==                                                                                                                                                                                                                                                               
Current time: 2022-05-30 13:10:35 (running for 00:00:41.25)                                                                                                                                                                                                                
Memory usage on this node: 46.8/503.7 GiB
Using FIFO scheduling algorithm.
Resources requested: 6.0/128 CPUs, 4.0/5 GPUs, 0.0/330.38 GiB heap, 0.0/145.58 GiB objects
Result logdir: /home/org/cced_internal/ray_tune_results/tune_function_2022-05-30_13-09-54
Number of trials: 4/4 (2 PENDING, 2 RUNNING)
+---------------------------+----------+-----------------------+------------+--------------+-------------------+-----------+--------------+--------------+-----------+---------------+----------+---------------+----------------+
| Trial name                | status   | loc                   | backbone   |   dec_layers |   dim_feedforward |   dropout |   enc_layers |   hidden_dim |        lr |   lr_backbone |   nheads |   num_queries |   weight_decay |
|---------------------------+----------+-----------------------+------------+--------------+-------------------+-----------+--------------+--------------+-----------+---------------+----------+---------------+----------------|
| tune_function_cc2ab_00000 | RUNNING  | 5.178.113.239:3784047 | resnet18   |            4 |               256 |       0.3 |            4 |          128 | 0.0180957 |   8.28432e-05 |        8 |            50 |    7.30983e-05 |
| tune_function_cc2ab_00001 | RUNNING  | 5.178.113.239:3784075 | resnet18   |            5 |               512 |       0.3 |            5 |          512 | 0.0101051 |   9.06266e-06 |        4 |           100 |    0.000146256 |
| tune_function_cc2ab_00002 | PENDING  |                       | resnet18   |            6 |              1024 |       0.1 |            6 |          128 | 0.0403562 |   2.03414e-06 |        4 |            50 |    0.00119554  |
| tune_function_cc2ab_00003 | PENDING  |                       | resnet18   |            5 |               256 |       0.1 |            3 |          512 | 0.0402638 |   2.79748e-05 |        8 |            50 |    0.00215873  |
+---------------------------+----------+-----------------------+------------+--------------+-------------------+-----------+--------------+--------------+-----------+---------------+----------+---------------+----------------+
== Status ==
Current time: 2022-05-30 13:10:40 (running for 00:00:46.27)
Memory usage on this node: 46.8/503.7 GiB
Using FIFO scheduling algorithm.
Resources requested: 6.0/128 CPUs, 4.0/5 GPUs, 0.0/330.38 GiB heap, 0.0/145.58 GiB objects
Result logdir: /home/org/cced_internal/ray_tune_results/tune_function_2022-05-30_13-09-54
Number of trials: 4/4 (2 PENDING, 2 RUNNING)
+---------------------------+----------+-----------------------+------------+--------------+-------------------+-----------+--------------+--------------+-----------+---------------+----------+---------------+----------------+
| Trial name                | status   | loc                   | backbone   |   dec_layers |   dim_feedforward |   dropout |   enc_layers |   hidden_dim |        lr |   lr_backbone |   nheads |   num_queries |   weight_decay |
|---------------------------+----------+-----------------------+------------+--------------+-------------------+-----------+--------------+--------------+-----------+---------------+----------+---------------+----------------|
| tune_function_cc2ab_00000 | RUNNING  | 5.178.113.239:3784047 | resnet18   |            4 |               256 |       0.3 |            4 |          128 | 0.0180957 |   8.28432e-05 |        8 |            50 |    7.30983e-05 |
| tune_function_cc2ab_00001 | RUNNING  | 5.178.113.239:3784075 | resnet18   |            5 |               512 |       0.3 |            5 |          512 | 0.0101051 |   9.06266e-06 |        4 |           100 |    0.000146256 |
| tune_function_cc2ab_00002 | PENDING  |                       | resnet18   |            6 |              1024 |       0.1 |            6 |          128 | 0.0403562 |   2.03414e-06 |        4 |            50 |    0.00119554  |
| tune_function_cc2ab_00003 | PENDING  |                       | resnet18   |            5 |               256 |       0.1 |            3 |          512 | 0.0402638 |   2.79748e-05 |        8 |            50 |    0.00215873  |
+---------------------------+----------+-----------------------+------------+--------------+-------------------+-----------+--------------+--------------+-----------+---------------+----------+---------------+----------------+
And there's only a repetition of the above table for a long time. I verified that the model is not training after keeping it running for a long time.
Graph utilization for 2 experiments
dgxstation-a100      Mon May 30 13:11:02 2022  450.119.04
[0] A100-SXM-80GB    | 38'C, 100 % | 37833 / 81252 MB | org(2287M) org(2871M)
[1] A100-SXM-80GB    | 37'C,   0 % |  5519 / 81252 MB | org(2795M)
[2] A100-SXM-80GB    | 38'C, 100 % |  4895 / 81252 MB | org(2287M) org(2605M)
[3] DGX Display      | 39'C,   0 % |     3 /  3911 MB |
[4] A100-SXM-80GB    | 37'C,   0 % |  2572 / 81252 MB | org(2569M)
With batch_size = 8
Same thing happens here as well.
For comparison, I ran the script without Ray Train: I trained a simple distributed PyTorch model, and everything worked fine with batch size = 128, with utilization of all GPUs at about 40%. Attaching utilization graph below.
Is there a rule of thumb for the number of GPUs vs. the number of workers? How can I run experiments one after another, so that one experiment utilizes all resources before the next one starts? I tried setting n=1 and g=4, but it runs all experiments simultaneously. How can I best monitor resources and use Ray efficiently for hyperparameter tuning?

