Ray Train hangs for a long time

I am using Ray Train for hyperparameter tuning. The config is shown below

def main(args, num_samples=2):
    """Run a Ray Tune search over a Ray Train torch trainer and report the best trial.

    Builds a ``Trainer`` with ``args.num_workers`` workers, converts the
    ``train_tuner`` training function into a Tune trainable, runs
    ``tune.run`` over the search space below, and prints the config, loss,
    class_error and checkpoint of the trial with the lowest ``class_error``.

    Args:
        args: Parsed command-line namespace; ``num_workers`` and ``num_gpus``
            are read from it.
        num_samples: Number of samples passed to ``tune.run``.
    """
    trainer = Trainer(
        "torch",
        num_workers=args.num_workers,
        use_gpu=True,
        # Each worker gets an equal integer share of the requested GPUs.
        # (The original read `arg.num_gpus`, an undefined name -> NameError.)
        resources_per_worker={"GPU": args.num_gpus // args.num_workers, "CPU": 20},
    )
    config = {
        # model parameters
        "lr": tune.loguniform(1e-4, 1e-1),
        "lr_backbone": tune.loguniform(1e-6, 1e-4),
        "weight_decay": tune.loguniform(1e-6, 1e-2),
        "epochs": 100,
        "lr_drop": 75,  # adjust this according to epochs, after how many epochs to drop lr
        "num_classes": 2,
        "device": "cuda",
        # backbone
        "clip_max_norm": 0.1,
        "frozen_weights": None,
        "backbone": tune.choice(["resnet18"]),
        "dilation": False,
        "position_embedding": "learned",  # default is sine    "learned" not working yet
        "enc_layers": tune.choice([3, 4, 5, 6]),
        "dec_layers": tune.choice([3, 4, 5, 6]),
        "dim_feedforward": tune.choice([512, 256, 1024]),
        "hidden_dim": tune.choice([128, 256, 512]),
        "dropout": tune.choice([0.1, 0.3, 0.5]),
        "nheads": tune.choice([4, 8]),
        "num_queries": tune.choice([10, 50, 100]),
        "pre_norm": False,
        # segmentation
        "masks": False,
        # loss
        "aux_loss": False,
        # matcher
        "set_cost_class": 1,
        "set_cost_bbox": 5,
        "set_cost_giou": 2,
        # * Loss coefficients
        "mask_loss_coef": 1,
        "dice_loss_coef": 1,
        "bbox_loss_coef": 5,
        "giou_loss_coef": 2,
        "eos_coef": 0.1,
        "root": "data/SH5_Bb_Frac",  # adjust this to your data folder
        "output_dir": "ray_tune_results/",  # adjust this to your output folder
        "seed": 42,
        "eval": False,
        "amp": True,
    }
    # Only these metrics are shown as columns in the periodic CLI status table.
    reporter = CLIReporter(metric_columns=["loss", "class_error", "training_iteration"])

    trainable = trainer.to_tune_trainable(train_tuner)
    result = tune.run(
        trainable,
        config=config,
        num_samples=num_samples,
        local_dir="ray_tune_results",
        keep_checkpoints_num=1,
        progress_reporter=reporter,
    )

    # Select the trial with the lowest class_error and summarise it.
    best_trial = result.get_best_trial("class_error", "min")
    print("Best trial config: {}".format(best_trial.config))
    print("Best trial final validation loss: {}".format(best_trial.last_result["loss"]))
    print(
        "Best trial final validation class_error: {}".format(
            best_trial.last_result["class_error"]
        )
    )
    best_checkpoint_dir = best_trial.checkpoint.value
    print("Best checkpoint dir", best_checkpoint_dir)


if __name__ == "__main__":
    # Script entry point: parse resource options, start Ray, run the search
    # and print the total wall-clock time in minutes.
    parser = argparse.ArgumentParser(description="Hyperparams")
    parser.add_argument(
        "--num-gpus",
        "-g",
        type=int,
        default=2,
        # main() divides this value by --num-workers to get each worker's
        # share, so it is a total rather than a per-worker count.
        help="The total number of GPUs, split evenly across the workers.",
    )
    parser.add_argument(
        "--num-workers",
        "-n",
        type=int,
        default=2,
        help="The number of workers for training.",
    )
    args = parser.parse_args()

    start = time.time()
    ray.init()
    # num_samples -> number of random search experiments to run
    main(args, num_samples=4)
    stop = time.time()
    print("Total execution time is {} min".format((stop - start) / (60)))

System Configuration: 4 x A100 80GB GPU and 128 core CPU

python3 train_tuner.py -n 2 -g 4

With batch_size = 2 and num_workers for dataloader = 10

BaseWorkerMixin pid=3784043) [LOCAL RANK 0 | WORLD RANK 0]:                                                                                                                                                                                                      [52/1816]
(BaseWorkerMixin pid=3784043) ------------------------------ EPOCH: 1 | 100 ------------------------------                                                                                                                                                                 
(BaseWorkerMixin pid=3784043)                                                                                                                                                                                                                                              
Training:   0%|          | 0/1189 [00:00<?, ?it/s]                                                                                                                                                                                                                         
(BaseWorkerMixin pid=3784038) [LOCAL RANK 1 | WORLD RANK 1]:                                                                                                                                                                                                               
(BaseWorkerMixin pid=3784038) ------------------------------ EPOCH: 1 | 100 ------------------------------                                                                                                                                                                 
(BaseWorkerMixin pid=3784038)                                                                                                                                                                                                                                              
Training:   0%|          | 0/1189 [00:00<?, ?it/s]                                                                                                                                                                                                                         
Training:   0%|          | 0/1189 [00:01<?, ?it/s]                                                                                                                                                                                                                         
(BaseWorkerMixin pid=3784060) 2022-05-30 13:10:13,364   INFO torch.py:244 -- Moving model to device: cuda:0                                                                                                                                                                
(BaseWorkerMixin pid=3784060) number of params: 32916998                                                                                                                                                                                                                   
(BaseWorkerMixin pid=3784060) 2022-05-30 13:10:13,462   INFO torch.py:247 -- Wrapping provided model in DDP.                                                                                                                                                               
Training:   0%|          | 0/1189 [00:00<?, ?it/s]                                                                                                                                                                                                                         
Training:   0%|          | 0/1189 [00:00<?, ?it/s]                                                                                                                                                                                                                         
(BaseWorkerMixin pid=3784034) [LOCAL RANK 1 | WORLD RANK 1]:                                                                                                                                                                                                               
(BaseWorkerMixin pid=3784034) ------------------------------ EPOCH: 1 | 100 ------------------------------                                                                                                                                                                 
(BaseWorkerMixin pid=3784034)                                                                                                                                                                                                                                              
(BaseWorkerMixin pid=3784060) [LOCAL RANK 0 | WORLD RANK 0]:                                                                                                                                                                                                               
(BaseWorkerMixin pid=3784060) ------------------------------ EPOCH: 1 | 100 ------------------------------                                                                                                                                                                 
(BaseWorkerMixin pid=3784060)                                                                                                                                                                                                                                              
Training:   0%|          | 0/1189 [00:00<?, ?it/s]                                                                                                                                                                                                                         
== Status ==                                                                                                                                                                                                                                                               
Current time: 2022-05-30 13:10:15 (running for 00:00:21.21)                                                                                                                                                                                                                
Memory usage on this node: 47.2/503.7 GiB                                                                                                                                                                                                                                  
Using FIFO scheduling algorithm.                                                                                                                                                                                                                                           
Resources requested: 6.0/128 CPUs, 4.0/5 GPUs, 0.0/330.38 GiB heap, 0.0/145.58 GiB objects                                                                                                                                                                                 
Result logdir: /home/org/cced_internal/ray_tune_results/tune_function_2022-05-30_13-09-54                                                                                                                                                                            
Number of trials: 4/4 (2 PENDING, 2 RUNNING)                                                                                                                                                                                                                               
+---------------------------+----------+-----------------------+------------+--------------+-------------------+-----------+--------------+--------------+-----------+---------------+----------+---------------+----------------+                                         
| Trial name                | status   | loc                   | backbone   |   dec_layers |   dim_feedforward |   dropout |   enc_layers |   hidden_dim |        lr |   lr_backbone |   nheads |   num_queries |   weight_decay |                                         
|---------------------------+----------+-----------------------+------------+--------------+-------------------+-----------+--------------+--------------+-----------+---------------+----------+---------------+----------------|                                         
| tune_function_cc2ab_00000 | RUNNING  | 5.178.113.239:3784047 | resnet18   |            4 |               256 |       0.3 |            4 |          128 | 0.0180957 |   8.28432e-05 |        8 |            50 |    7.30983e-05 |                                         
| tune_function_cc2ab_00001 | RUNNING  | 5.178.113.239:3784075 | resnet18   |            5 |               512 |       0.3 |            5 |          512 | 0.0101051 |   9.06266e-06 |        4 |           100 |    0.000146256 |                                         
| tune_function_cc2ab_00002 | PENDING  |                       | resnet18   |            6 |              1024 |       0.1 |            6 |          128 | 0.0403562 |   2.03414e-06 |        4 |            50 |    0.00119554  |                                         
| tune_function_cc2ab_00003 | PENDING  |                       | resnet18   |            5 |               256 |       0.1 |            3 |          512 | 0.0402638 |   2.79748e-05 |        8 |            50 |    0.00215873  |                                         
+---------------------------+----------+-----------------------+------------+--------------+-------------------+-----------+--------------+--------------+-----------+---------------+----------+---------------+----------------+                                         
                                                                                                                                                                                                                                                                           
                                                                                                                                                                                                                                                                           
== Status ==                                                                                                                                                                                                                                                               
Current time: 2022-05-30 13:10:20 (running for 00:00:26.22)                                                                                                                                                                                                                
Memory usage on this node: 46.9/503.7 GiB                                                                                                                                                                                                                                  
Using FIFO scheduling algorithm.                                                                                                                                                                                                                                           
Resources requested: 6.0/128 CPUs, 4.0/5 GPUs, 0.0/330.38 GiB heap, 0.0/145.58 GiB objects                                                                                                                                                                                 
Result logdir: /home/org/cced_internal/ray_tune_results/tune_function_2022-05-30_13-09-54                                                                                                                                                                            
Number of trials: 4/4 (2 PENDING, 2 RUNNING)                                                                                                                                                                                                                               
+---------------------------+----------+-----------------------+------------+--------------+-------------------+-----------+--------------+--------------+-----------+---------------+----------+---------------+----------------+                                         
| Trial name                | status   | loc                   | backbone   |   dec_layers |   dim_feedforward |   dropout |   enc_layers |   hidden_dim |        lr |   lr_backbone |   nheads |   num_queries |   weight_decay |                                         
|---------------------------+----------+-----------------------+------------+--------------+-------------------+-----------+--------------+--------------+-----------+---------------+----------+---------------+----------------|                                         
| tune_function_cc2ab_00000 | RUNNING  | 5.178.113.239:3784047 | resnet18   |            4 |               256 |       0.3 |            4 |          128 | 0.0180957 |   8.28432e-05 |        8 |            50 |    7.30983e-05 |                                         
| tune_function_cc2ab_00001 | RUNNING  | 5.178.113.239:3784075 | resnet18   |            5 |               512 |       0.3 |            5 |          512 | 0.0101051 |   9.06266e-06 |        4 |           100 |    0.000146256 |                                         
| tune_function_cc2ab_00002 | PENDING  |                       | resnet18   |            6 |              1024 |       0.1 |            6 |          128 | 0.0403562 |   2.03414e-06 |        4 |            50 |    0.00119554  |                                         
| tune_function_cc2ab_00003 | PENDING  |                       | resnet18   |            5 |               256 |       0.1 |            3 |          512 | 0.0402638 |   2.79748e-05 |        8 |            50 |    0.00215873  |                                         
+---------------------------+----------+-----------------------+------------+--------------+-------------------+-----------+--------------+--------------+-----------+---------------+----------+---------------+----------------+ 

== Status ==                                                                                                                                                                                                                                                               
Current time: 2022-05-30 13:10:30 (running for 00:00:36.24)                                                                                                                                                                                                                
Memory usage on this node: 46.8/503.7 GiB                                                                                                                                                                                                                                  
Using FIFO scheduling algorithm.                                                                                                                                                                                                                                           
Resources requested: 6.0/128 CPUs, 4.0/5 GPUs, 0.0/330.38 GiB heap, 0.0/145.58 GiB objects                                                                                                                                                                                 
Result logdir: /home/org/cced_internal/ray_tune_results/tune_function_2022-05-30_13-09-54                                                                                                                                                                            
Number of trials: 4/4 (2 PENDING, 2 RUNNING)                                                                                                                                                                                                                               
+---------------------------+----------+-----------------------+------------+--------------+-------------------+-----------+--------------+--------------+-----------+---------------+----------+---------------+----------------+                                         
| Trial name                | status   | loc                   | backbone   |   dec_layers |   dim_feedforward |   dropout |   enc_layers |   hidden_dim |        lr |   lr_backbone |   nheads |   num_queries |   weight_decay |                                         
|---------------------------+----------+-----------------------+------------+--------------+-------------------+-----------+--------------+--------------+-----------+---------------+----------+---------------+----------------|                                         
| tune_function_cc2ab_00000 | RUNNING  | 5.178.113.239:3784047 | resnet18   |            4 |               256 |       0.3 |            4 |          128 | 0.0180957 |   8.28432e-05 |        8 |            50 |    7.30983e-05 |                                         
| tune_function_cc2ab_00001 | RUNNING  | 5.178.113.239:3784075 | resnet18   |            5 |               512 |       0.3 |            5 |          512 | 0.0101051 |   9.06266e-06 |        4 |           100 |    0.000146256 |                                         
| tune_function_cc2ab_00002 | PENDING  |                       | resnet18   |            6 |              1024 |       0.1 |            6 |          128 | 0.0403562 |   2.03414e-06 |        4 |            50 |    0.00119554  |                                         
| tune_function_cc2ab_00003 | PENDING  |                       | resnet18   |            5 |               256 |       0.1 |            3 |          512 | 0.0402638 |   2.79748e-05 |        8 |            50 |    0.00215873  |                                         
+---------------------------+----------+-----------------------+------------+--------------+-------------------+-----------+--------------+--------------+-----------+---------------+----------+---------------+----------------+                                         
                                                                                                                                                                                                                                                                           
                                                                                                                                                                                                                                                                           
== Status ==                                                                                                                                                                                                                                                               
Current time: 2022-05-30 13:10:35 (running for 00:00:41.25)                                                                                                                                                                                                                
Memory usage on this node: 46.8/503.7 GiB
Using FIFO scheduling algorithm.
Resources requested: 6.0/128 CPUs, 4.0/5 GPUs, 0.0/330.38 GiB heap, 0.0/145.58 GiB objects
Result logdir: /home/org/cced_internal/ray_tune_results/tune_function_2022-05-30_13-09-54
Number of trials: 4/4 (2 PENDING, 2 RUNNING)
+---------------------------+----------+-----------------------+------------+--------------+-------------------+-----------+--------------+--------------+-----------+---------------+----------+---------------+----------------+
| Trial name                | status   | loc                   | backbone   |   dec_layers |   dim_feedforward |   dropout |   enc_layers |   hidden_dim |        lr |   lr_backbone |   nheads |   num_queries |   weight_decay |
|---------------------------+----------+-----------------------+------------+--------------+-------------------+-----------+--------------+--------------+-----------+---------------+----------+---------------+----------------|
| tune_function_cc2ab_00000 | RUNNING  | 5.178.113.239:3784047 | resnet18   |            4 |               256 |       0.3 |            4 |          128 | 0.0180957 |   8.28432e-05 |        8 |            50 |    7.30983e-05 |
| tune_function_cc2ab_00001 | RUNNING  | 5.178.113.239:3784075 | resnet18   |            5 |               512 |       0.3 |            5 |          512 | 0.0101051 |   9.06266e-06 |        4 |           100 |    0.000146256 |
| tune_function_cc2ab_00002 | PENDING  |                       | resnet18   |            6 |              1024 |       0.1 |            6 |          128 | 0.0403562 |   2.03414e-06 |        4 |            50 |    0.00119554  |
| tune_function_cc2ab_00003 | PENDING  |                       | resnet18   |            5 |               256 |       0.1 |            3 |          512 | 0.0402638 |   2.79748e-05 |        8 |            50 |    0.00215873  |
+---------------------------+----------+-----------------------+------------+--------------+-------------------+-----------+--------------+--------------+-----------+---------------+----------+---------------+----------------+


== Status ==
Current time: 2022-05-30 13:10:40 (running for 00:00:46.27)
Memory usage on this node: 46.8/503.7 GiB
Using FIFO scheduling algorithm.
Resources requested: 6.0/128 CPUs, 4.0/5 GPUs, 0.0/330.38 GiB heap, 0.0/145.58 GiB objects
Result logdir: /home/org/cced_internal/ray_tune_results/tune_function_2022-05-30_13-09-54
Number of trials: 4/4 (2 PENDING, 2 RUNNING)
+---------------------------+----------+-----------------------+------------+--------------+-------------------+-----------+--------------+--------------+-----------+---------------+----------+---------------+----------------+
| Trial name                | status   | loc                   | backbone   |   dec_layers |   dim_feedforward |   dropout |   enc_layers |   hidden_dim |        lr |   lr_backbone |   nheads |   num_queries |   weight_decay |
|---------------------------+----------+-----------------------+------------+--------------+-------------------+-----------+--------------+--------------+-----------+---------------+----------+---------------+----------------|
| tune_function_cc2ab_00000 | RUNNING  | 5.178.113.239:3784047 | resnet18   |            4 |               256 |       0.3 |            4 |          128 | 0.0180957 |   8.28432e-05 |        8 |            50 |    7.30983e-05 |
| tune_function_cc2ab_00001 | RUNNING  | 5.178.113.239:3784075 | resnet18   |            5 |               512 |       0.3 |            5 |          512 | 0.0101051 |   9.06266e-06 |        4 |           100 |    0.000146256 |
| tune_function_cc2ab_00002 | PENDING  |                       | resnet18   |            6 |              1024 |       0.1 |            6 |          128 | 0.0403562 |   2.03414e-06 |        4 |            50 |    0.00119554  |
| tune_function_cc2ab_00003 | PENDING  |                       | resnet18   |            5 |               256 |       0.1 |            3 |          512 | 0.0402638 |   2.79748e-05 |        8 |            50 |    0.00215873  |
+---------------------------+----------+-----------------------+------------+--------------+-------------------+-----------+--------------+--------------+-----------+---------------+----------+---------------+----------------+

After that, only the table above is repeated for a long time. I verified that the model is not training, even after keeping it running for a long time.

Graph utilization for 2 experiments

dgxstation-a100      Mon May 30 13:11:02 2022  450.119.04
[0] A100-SXM-80GB    | 38'C, 100 % | 37833 / 81252 MB | org(2287M) org(2871M)
[1] A100-SXM-80GB    | 37'C,   0 % |  5519 / 81252 MB | org(2795M)
[2] A100-SXM-80GB    | 38'C, 100 % |  4895 / 81252 MB | org(2287M) org(2605M)
[3] DGX Display      | 39'C,   0 % |     3 /  3911 MB |
[4] A100-SXM-80GB    | 37'C,   0 % |  2572 / 81252 MB | org(2569M)

With batch_size = 8

Same thing happens here as well.

To compare against a script without Ray Train, I trained a simple distributed PyTorch model, and everything worked fine with batch size = 128; utilization of all GPUs was about 40%. Attaching the utilization graph below.

Is there a rule of thumb for the number of GPUs vs. the number of workers? How can I run one experiment after another, where each experiment utilizes all resources before moving on to the next experiment? I tried setting n=1 and g=4, but it runs all experiments simultaneously. How can I best monitor resources and use Ray for hyperparameter tuning efficiently?

Hi @dudeperf3ct,

If the trials are RUNNING, it means that Ray was able to acquire the resources, start the remote trainer process, and kick off the train() function in the trainable. Thus, it hangs either somewhere in the initialization process or in the actual train script.

The part we’re missing in your code snippet is the train_tuner function you pass to to_tune_trainable(). Can you give us this code? You can abbreviate it if you don’t want to share it.

The one thing that is interesting is that your python file also seems to be called train_tuner.py, so if you’re just doing something like import train_tuner, this may not work - it is important that calling train_tuner(some_config) runs the training function. See e.g. here for an example: Ray Train User Guide — Ray 1.12.1

@kai added a gist here: train_tuner.py · GitHub

Thanks for the code. It seems to hang somewhere in train_one_epoch as the TQDM is still printed but nothing within or after the call to this function.

Can you maybe add a few print outputs in that function to see where it starts hanging? Also, if it’s running, you could use py-spy dump --pid <pid> to find out where exactly it hangs? To find the correct PID you can take a look at the print outputs (e.g. (BaseWorkerMixin pid=3784060)) or ps aux | grep ray to find the correct worker process.

I added few print statements for printing loss and running_loss. It just runs first batch and then hangs.

output of py-spy dump

attaching log as it exceeds the max count of words

https://controlc.com/72066a4c

There are a lot of ray::IDLE processes (many omitted here to save word count) in the output of ps -aux | grep ray

deepkap+   37046  0.2  0.0 154197140 85440 pts/6 Sl+  09:30   0:01 ray::IDLE
deepkap+   37048  0.2  0.0 154197140 85160 pts/6 Sl+  09:30   0:01 ray::IDLE
deepkap+   37051  0.2  0.0 154197140 85212 pts/6 Sl+  09:30   0:01 ray::IDLE
deepkap+   37053  0.2  0.0 154197204 84884 pts/6 Sl+  09:30   0:01 ray::IDLE
deepkap+   37057  0.2  0.0 154197140 84956 pts/6 Sl+  09:30   0:01 ray::IDLE
deepkap+   37058  0.3  0.0 157611112 362876 pts/6 Sl+ 09:30   0:02 ray::BackendExecutor
deepkap+   37060  0.5  0.0 158119584 449124 pts/6 Sl+ 09:30   0:03 ray::BackendExecutor.get_next_results()
deepkap+   37061  0.2  0.0 154197144 85212 pts/6 Sl+  09:30   0:01 ray::IDLE
deepkap+   37062  0.2  0.0 154197140 85216 pts/6 Sl+  09:30   0:01 ray::IDLE
deepkap+   37065  0.2  0.0 154197144 85524 pts/6 Sl+  09:30   0:01 ray::IDLE
deepkap+   43221  0.3  0.8 174352924 4283332 pts/6 Sl+ 09:31   0:02 ray::BaseWorkerMixin._BaseWorkerMixin__execute()                                                                                                                                                [0/880]
deepkap+   43478  0.3  0.8 174352936 4282868 pts/6 Sl+ 09:31   0:02 ray::BaseWorkerMixin._BaseWorkerMixin__execute()
deepkap+   43734  0.3  0.8 174352948 4283092 pts/6 Sl+ 09:31   0:01 ray::BaseWorkerMixin._BaseWorkerMixin__execute()
deepkap+   43863  0.3  0.8 174352960 4283228 pts/6 Sl+ 09:31   0:02 ray::BaseWorkerMixin._BaseWorkerMixin__execute()
deepkap+   44118  0.3  0.8 174352972 4282868 pts/6 Sl+ 09:31   0:01 ray::BaseWorkerMixin._BaseWorkerMixin__execute()
deepkap+   44246  0.3  0.8 174352984 4283044 pts/6 Sl+ 09:31   0:01 ray::BaseWorkerMixin._BaseWorkerMixin__execute()
deepkap+   45299  0.2  0.8 174418364 4279700 pts/6 Sl+ 09:31   0:01 ray::BaseWorkerMixin._BaseWorkerMixin__execute()
deepkap+   45629  0.3  0.8 174418376 4279520 pts/6 Sl+ 09:31   0:02 ray::BaseWorkerMixin._BaseWorkerMixin__execute()
deepkap+   45885  0.3  0.8 174418388 4279424 pts/6 Sl+ 09:31   0:01 ray::BaseWorkerMixin._BaseWorkerMixin__execute()
deepkap+   46013  0.3  0.8 174418400 4280140 pts/6 Sl+ 09:31   0:02 ray::BaseWorkerMixin._BaseWorkerMixin__execute()
deepkap+   46141  0.3  0.8 174418412 4279884 pts/6 Sl+ 09:31   0:01 ray::BaseWorkerMixin._BaseWorkerMixin__execute()
deepkap+   46397  0.3  0.8 174418424 4279424 pts/6 Sl+ 09:31   0:01 ray::BaseWorkerMixin._BaseWorkerMixin__execute()
deepkap+   46654  0.3  0.8 174418436 4279640 pts/6 Sl+ 09:31   0:01 ray::BaseWorkerMixin._BaseWorkerMixin__execute()
deepkap+   46914  0.2  0.8 174418448 4279720 pts/6 Sl+ 09:31   0:01 ray::BaseWorkerMixin._BaseWorkerMixin__execute()
deepkap+   47043  0.3  0.8 174418460 4279708 pts/6 Sl+ 09:31   0:02 ray::BaseWorkerMixin._BaseWorkerMixin__execute()
deepkap+   47299  0.3  0.8 174418472 4280164 pts/6 Sl+ 09:31   0:01 ray::BaseWorkerMixin._BaseWorkerMixin__execute()
root       54089  0.0  0.0  17664  2512 pts/20   S+   09:40   0:00 grep --color=auto ray
deepkap+ 3223566  0.0  1.0 179188844 5428656 pts/6 Sl May29   0:01 ray::BaseWorkerMixin._BaseWorkerMixin__execute()
deepkap+ 3224083  0.0  1.0 179188868 5428724 pts/6 Sl May29   0:02 ray::BaseWorkerMixin._BaseWorkerMixin__execute()
deepkap+ 3224339  0.0  1.0 179188880 5428732 pts/6 Sl May29   0:02 ray::BaseWorkerMixin._BaseWorkerMixin__execute()
deepkap+ 3224595  0.0  1.0 179188892 5429076 pts/6 Sl May29   0:01 ray::BaseWorkerMixin._BaseWorkerMixin__execute()
deepkap+ 3224851  0.0  1.0 179188904 5429016 pts/6 Sl May29   0:01 ray::BaseWorkerMixin._BaseWorkerMixin__execute()
deepkap+ 3225107  0.0  1.0 179188916 5428732 pts/6 Sl May29   0:02 ray::BaseWorkerMixin._BaseWorkerMixin__execute()
deepkap+ 3225363  0.0  1.0 179188928 5428740 pts/6 Sl May29   0:02 ray::BaseWorkerMixin._BaseWorkerMixin__execute()

Hm, I’m wondering a few things

  1. What happens if you only use one worker (and one CPU per worker)?
  2. What happens if you use amp=False?

Where exactly? From the py-spy dump it seems it is able to get into the second loop but hangs on scaler.step()

To me, it looks like instead of setting up your own AMP scaler, you should maybe use the Ray Train utilities for this. See this part of the User guide: Ray Train User Guide — Ray 1.12.1

APIs:

https://docs.ray.io/en/latest/train/api.html#train-torch-backward
https://docs.ray.io/en/latest/train/api.html#train-torch-accelerate
https://docs.ray.io/en/latest/train/api.html#train-torch-prepare-optimizer

so something like

train.torch.accelerate(amp=True)
...
optimizer = optim.AdamW(param_dicts, lr=args.lr, weight_decay=args.weight_decay)
optimizer = train.torch.prepare_optimizer(optimizer)
...

train.torch.backward(losses)

and get rid of the manual handling of amp=True

Thank you @kai for the suggestions.

  1. What happens if you only use one worker (and one CPU per worker)?

Can you guide me with setting the parameters for this? Should the trainer be modified as shown below?

here python3 train_tuner.py --num_workers 1 --num_gpus 4

    trainer = Trainer(
        "torch",
        num_workers=args.num_workers,
        use_gpu=True,
        resources_per_worker={"GPU": args.num_gpus, "CPU": 1},
    )
  2. What happens if you use amp=False?

I have revised the gist. Updated gist with print statements.

Now there is an error when I set amp=True and another error when I set amp=False.

Case 1: amp = True

(TrainTrainable pid=143785)   File "train_tuner.py", line 168, in train_tuner
(TrainTrainable pid=143785)     model = train.torch.prepare_model(model)
(TrainTrainable pid=143785)   File "/home/deepkapha/anaconda3/envs/bop/lib/python3.8/site-packages/ray/train/torch.py", line 596, in prepare_model
(TrainTrainable pid=143785)     return get_accelerator(TorchAccelerator).prepare_model(
(TrainTrainable pid=143785)   File "/home/deepkapha/anaconda3/envs/bop/lib/python3.8/site-packages/ray/train/torch.py", line 122, in prepare_model
(TrainTrainable pid=143785)     assert not hasattr(model, "__getstate__")

Case 2 : amp=False

(BackendExecutor pid=291030)   File "train_tuner.py", line 193, in train_tuner                                                                                                                                                                                             
(BackendExecutor pid=291030)     x, y = next(iter(trainloader))                                                                                                                                                                                                            
(BackendExecutor pid=291030)   File "/home/deepkapha/anaconda3/envs/bop/lib/python3.8/site-packages/ray/train/torch.py", line 513, in __next__                                                                                                                             
(BackendExecutor pid=291030)     self._wait_for_batch(next_batch)                                                                                                                                                                                                          
(BackendExecutor pid=291030)   File "/home/deepkapha/anaconda3/envs/bop/lib/python3.8/site-packages/ray/train/torch.py", line 495, in _wait_for_batch                                                                                                                      
(BackendExecutor pid=291030)     i.record_stream(curr_stream)                                                                                                                                                                                                              
(BackendExecutor pid=291030) AttributeError: 'list' object has no attribute 'record_stream'                                                                                                                                                                                
(TrainTrainable pid=290838)     x, y = next(iter(trainloader))                                                                                                                                                                                                             
(TrainTrainable pid=290838)   File "/home/deepkapha/anaconda3/envs/bop/lib/python3.8/site-packages/ray/train/torch.py", line 513, in __next__                                                                                                                              
(TrainTrainable pid=290838)     self._wait_for_batch(next_batch)                                                                                                                                                                                                           
(TrainTrainable pid=290838)   File "/home/deepkapha/anaconda3/envs/bop/lib/python3.8/site-packages/ray/train/torch.py", line 495, in _wait_for_batch                                                                                                                       
(TrainTrainable pid=290838)     i.record_stream(curr_stream)                                                                                                                                                                                                               
(TrainTrainable pid=290838) AttributeError: 'list' object has no attribute 'record_stream'  

Focusing on amp=False case first, i added debugging statements

    # before ray wrapping
    x, y = next(iter(trainloader))
    print(type(x), type(y))
    print(x.shape, y)

    # required for ray train
    trainloader = train.torch.prepare_data_loader(trainloader)
    valloader = train.torch.prepare_data_loader(valloader)
    
    # after ray wrapping
    x, y = next(iter(trainloader))
    print(type(x), type(y))
    print(x.shape, y)

Output of dataset and error show above in case of amp=False:

(BaseWorkerMixin pid=261004) <class 'torch.Tensor'> <class 'list'>   #(input shape, label shape)                                                                                                                                                                                                      
(BaseWorkerMixin pid=261004) torch.Size([128, 1, 100, 270])    # input                                                                                                                                                                                                                
(BaseWorkerMixin pid=261004) [{'labels': tensor([0, 0, 0, 0]), 'sinusoid': tensor([[0.0200, 0.1924, 0.8860],                                                                                                                                                               
(BaseWorkerMixin pid=261004)         [0.3400, 0.2136, 0.9099],                                                                                                                                                                                                             
(BaseWorkerMixin pid=261004)         [0.5400, 0.2116, 0.9133],                                                                                                                                                                                                             
(BaseWorkerMixin pid=261004)         [0.9800, 0.2248, 0.9295]])}, {'labels': tensor([]), 'sinusoid': tensor([], size=(0, 3))}, {'labels': tensor([]), 'sinusoid': tensor([], size=(0, 3))}, {'labels': tensor([0]), 'sinusoid': tensor([[0.3700, 0.4236, 0.9242]])}, {'labe
ls': tensor([0, 1]), 'sinusoid': tensor([[0.1000, 0.2037, 0.8158],                                                                                                                                                                                                         
(BaseWorkerMixin pid=261004)         [0.4200, 0.9570, 0.3725]])}, {'labels': tensor([0]), 'sinusoid': tensor([[0.7100, 0.3406, 0.8934]])}, {'labels': tensor([1]), 'sinusoid': tensor([[0.8700, 0.9206, 0.0691]])}, {'labels': tensor([1]), 'sinusoid': tensor([[0.5200, 0.
7303, 0.2487]])}, {'labels': tensor([1]), 'sinusoid': tensor([[0.4400, 0.9757, 0.0673]])}, {'labels': tensor([]), 'sinusoid': tensor([], size=(0, 3))}, {'labels': tensor([]), 'sinusoid': tensor([], size=(0, 3))}, {'labels': tensor([0, 0, 0]), 'sinusoid': tensor([[0.4
100, 0.2464, 0.8929],

Looking into the source code ray/torch.py at 029517a037b1219423ab45af79db1e9296bc39c7 · ray-project/ray · GitHub, it seems both the images and the labels are expected to be tensors, but in my case the inputs are tensors and the labels are a list of dicts of tensors, as shown above.

Hey @dudeperf3ct, for the amp=False case can you use the latest Ray nightly wheel? This has been fixed on master, and will be included in the next Ray release.

For the amp=True case, that is a bug on our end. It should be fixed by this PR: [Train] Support amp for models with a custom `__getstate__` method by amogkam · Pull Request #25335 · ray-project/ray · GitHub!

Also, I would recommend using 1 GPU per worker, and just increasing the number of workers to increase parallelism. Using multiple GPUs per worker is only useful if your code is actually leveraging multiple GPUs, for example if you want data parallelism+model parallelism.

Thanks @amogkam. One last question, can you explain the relation between num_workers used by ray, resources per trial by each workers and num_workers used by pytorch dataloaders?

What is optimal number that can be used to get most out of the system? Since i have access to 4 x A100 80 GB GPU with 128 CPU cores, I was thinking I can run 2 experiments (num_samples=2) and for each experiment, I will have one worker for each experiment (num_workers=1) and each worker will have access to (2 GPUs and 20 CPU cores).

    trainer = Trainer(
        "torch",
        num_workers=2,
        use_gpu=True,
        resources_per_worker={"GPU": 2, "CPU": 20},
    )

    result = tune.run(
        trainable,
        config=config,
        num_samples=2,
        local_dir="ray_tune_results",
        keep_checkpoints_num=1,
        progress_reporter=reporter,
    )

Since ray is taking care of parallelism, I am assuming that I get benefits of both data and model parallelism.

I installed nightly version and ran the code setting amp=False

Failure # 1 (occurred at 2022-06-01_17-00-42)
ray::TrainTrainable.train() (pid=146145, ip=5.178.113.239, repr=tune_function)
  File "/home/deepkapha/anaconda3/envs/bop/lib/python3.8/site-packages/ray/tune/trainable.py", line 360, in train
    result = self.step()
  File "/home/deepkapha/anaconda3/envs/bop/lib/python3.8/site-packages/ray/tune/function_runner.py", line 404, in step
    self._report_thread_runner_error(block=True)
  File "/home/deepkapha/anaconda3/envs/bop/lib/python3.8/site-packages/ray/tune/function_runner.py", line 574, in _report_thread_runner_error
    raise e
  File "/home/deepkapha/anaconda3/envs/bop/lib/python3.8/site-packages/ray/tune/function_runner.py", line 277, in run
    self._entrypoint()
  File "/home/deepkapha/anaconda3/envs/bop/lib/python3.8/site-packages/ray/tune/function_runner.py", line 349, in entrypoint
    return self._trainable_func(
  File "/home/deepkapha/anaconda3/envs/bop/lib/python3.8/site-packages/ray/tune/function_runner.py", line 645, in _trainable_func
    output = fn()
  File "/home/deepkapha/anaconda3/envs/bop/lib/python3.8/site-packages/ray/train/trainer.py", line 888, in tune_function
    for results in iterator:
  File "/home/deepkapha/anaconda3/envs/bop/lib/python3.8/site-packages/ray/train/trainer.py", line 752, in __next__
    self._final_results = self._run_with_error_handling(
  File "/home/deepkapha/anaconda3/envs/bop/lib/python3.8/site-packages/ray/train/trainer.py", line 713, in _run_with_error_handling
    return func()
  File "/home/deepkapha/anaconda3/envs/bop/lib/python3.8/site-packages/ray/train/trainer.py", line 824, in _finish_training
    return self._backend_executor.finish_training()
  File "/home/deepkapha/anaconda3/envs/bop/lib/python3.8/site-packages/ray/train/utils.py", line 168, in <lambda>
    return lambda *args, **kwargs: ray.get(actor_method.remote(*args, **kwargs))
ray.exceptions.RayTaskError(RuntimeError): ray::BackendExecutor.finish_training() (pid=146313, ip=5.178.113.239, repr=<ray.train.backend.BackendExecutor object at 0x7f5e17f26d90>)
  File "/home/deepkapha/anaconda3/envs/bop/lib/python3.8/site-packages/ray/train/backend.py", line 498, in finish_training
    results = self.get_with_failure_handling(futures)
  File "/home/deepkapha/anaconda3/envs/bop/lib/python3.8/site-packages/ray/train/backend.py", line 517, in get_with_failure_handling
    success = check_for_failure(remote_values)
  File "/home/deepkapha/anaconda3/envs/bop/lib/python3.8/site-packages/ray/train/utils.py", line 50, in check_for_failure
    ray.get(object_ref)
ray.exceptions.RayTaskError(RuntimeError): ray::BaseWorkerMixin._BaseWorkerMixin__execute() (pid=146362, ip=5.178.113.239, repr=<ray.train.worker_group.BaseWorkerMixin object at 0x7f5ffc617c70>)
  File "/home/deepkapha/anaconda3/envs/bop/lib/python3.8/site-packages/ray/train/worker_group.py", line 26, in __execute
    return func(*args, **kwargs)
  File "/home/deepkapha/anaconda3/envs/bop/lib/python3.8/site-packages/ray/train/backend.py", line 489, in end_training
    output = session.finish()
  File "/home/deepkapha/anaconda3/envs/bop/lib/python3.8/site-packages/ray/train/session.py", line 118, in finish
    func_output = self.training_thread.join()
  File "/home/deepkapha/anaconda3/envs/bop/lib/python3.8/site-packages/ray/train/utils.py", line 96, in join
    raise self.exc
  File "/home/deepkapha/anaconda3/envs/bop/lib/python3.8/site-packages/ray/train/utils.py", line 89, in run
    self.ret = self._target(*self._args, **self._kwargs)
  File "/home/deepkapha/anaconda3/envs/bop/lib/python3.8/site-packages/ray/train/utils.py", line 138, in <lambda>
    return lambda: train_func(config)
  File "train_tuner.py", line 165, in train_tuner
    model = train.torch.prepare_model(model)
  File "/home/deepkapha/anaconda3/envs/bop/lib/python3.8/site-packages/ray/train/torch.py", line 614, in prepare_model
    return get_accelerator(TorchAccelerator).prepare_model(
  File "/home/deepkapha/anaconda3/envs/bop/lib/python3.8/site-packages/ray/train/torch.py", line 95, in prepare_model
    torch.cuda.set_device(device)
  File "/home/deepkapha/anaconda3/envs/bop/lib/python3.8/site-packages/torch/cuda/__init__.py", line 311, in set_device
    torch._C._cuda_setDevice(device)
RuntimeError: CUDA error: invalid device ordinal
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.


Surprisingly, even with the error shown above, one experiment keeps running through its epochs while the other 3 experiments fail with this error.

Result logdir: /home/deepkapha/cced_internal/ray_tune_results/tune_function_2022-06-01_17-00-32                                                                                                                                                                    [0/1837]
Number of trials: 4/4 (3 ERROR, 1 RUNNING)
+---------------------------+----------+----------------------+------------+--------------+-------------------+-----------+--------------+--------------+-------------+---------------+----------+---------------+----------------+---------+---------------+----------------------+
| Trial name                | status   | loc                  | backbone   |   dec_layers |   dim_feedforward |   dropout |   enc_layers |   hidden_dim |          lr |   lr_backbone |   nheads |   num_queries |   weight_decay |    loss |   class_error |   training_iteration |
|---------------------------+----------+----------------------+------------+--------------+-------------------+-----------+--------------+--------------+-------------+---------------+----------+---------------+----------------+---------+---------------+----------------------|
| tune_function_59152_00000 | RUNNING  | 5.178.113.239:146103 | resnet18   |            5 |               512 |       0.5 |            3 |          256 | 0.0600561   |   1.11388e-06 |        4 |            10 |    2.50995e-05 | 4.28803 |           100 |                   39 |
| tune_function_59152_00001 | ERROR    | 5.178.113.239:146145 | resnet18   |            3 |              1024 |       0.5 |            5 |          512 | 0.0861891   |   4.40916e-06 |        4 |           100 |    0.00584165  |         |               |                      |
| tune_function_59152_00002 | ERROR    | 5.178.113.239:146509 | resnet18   |            6 |               256 |       0.1 |            3 |          128 | 0.038247    |   1.23455e-06 |        4 |            10 |    0.000960137 |         |               |                      |
| tune_function_59152_00003 | ERROR    | 5.178.113.239:149868 | resnet18   |            6 |               512 |       0.1 |            6 |          512 | 0.000105594 |   2.79434e-06 |        8 |           100 |    0.000310172 |         |               |                      |
+---------------------------+----------+----------------------+------------+--------------+-------------------+-----------+--------------+--------------+-------------+---------------+----------+---------------+----------------+---------+---------------+----------------------+
Number of errored trials: 3
+---------------------------+--------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                |   # failures | error file                               |
|---------------------------+--------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| tune_function_59152_00001 |            1 | /home/deepkapha/cced_internal/ray_tune_results/tune_function_2022-06-01_17-00-32/tune_function_59152_00001_1_backbone=resnet18,dec_layers=3,dim_feedforward=1024,dropout=0.5000,enc_layers=5,hidden_dim=512,lr=0.08_2022-06-01_17-00-35/error.txt |
| tune_function_59152_00002 |            1 | /home/deepkapha/cced_internal/ray_tune_results/tune_function_2022-06-01_17-00-32/tune_function_59152_00002_2_backbone=resnet18,dec_layers=6,dim_feedforward=256,dropout=0.1000,enc_layers=3,hidden_dim=128,lr=0.038_2022-06-01_17-00-43/error.txt |
| tune_function_59152_00003 |            1 | /home/deepkapha/cced_internal/ray_tune_results/tune_function_2022-06-01_17-00-32/tune_function_59152_00003_3_backbone=resnet18,dec_layers=6,dim_feedforward=512,dropout=0.1000,enc_layers=6,hidden_dim=512,lr=0.000_2022-06-01_17-00-51/error.txt |
+---------------------------+--------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+

Validation: 100%|██████████| 3/3 [00:00<00:00,  4.80it/s]
Training:   0%|          | 0/19 [00:00<?, ?it/s]
(BaseWorkerMixin pid=146235) [LOCAL RANK 1 | WORLD RANK 1]: 
(BaseWorkerMixin pid=146235) Phase: val | Loss: 4.2880 | Class Error: 100.0000
(BaseWorkerMixin pid=146235) [LOCAL RANK 1 | WORLD RANK 1]: 
(BaseWorkerMixin pid=146235) ------------------------------ EPOCH: 40 | 100 ------------------------------
(BaseWorkerMixin pid=146235) 
(BaseWorkerMixin pid=146234) [LOCAL RANK 0 | WORLD RANK 0]: 
(BaseWorkerMixin pid=146234) Phase: val | Loss: 4.2880 | Class Error: 100.0000
(BaseWorkerMixin pid=146234) [LOCAL RANK 0 | WORLD RANK 0]: 
(BaseWorkerMixin pid=146234) ------------------------------ EPOCH: 40 | 100 ------------------------------
(BaseWorkerMixin pid=146234) 
Validation: 100%|██████████| 3/3 [00:00<00:00,  4.83it/s]

All the txt file contains same error of RuntimeError: CUDA error: invalid device ordinal.

Code: added support for ray amp instead to torch amp scaler · GitHub

Hey @dudeperf3ct, the num_workers for Ray Train is unrelated to num_workers for PyTorch DataLoaders.

For Ray Train, num_workers refers to the number of processes used for data parallel training. Each process will have its own model replica and will each train on a different batch of data. After every batch, the gradients will be synchronized across all model replicas.

In most cases you would want to have 1 gpu per worker (just set use_gpu=True). Having more than 1 GPU per worker is useful only if your training function actually uses multiple GPUs. For example, if you want to shard each model replica across multiple GPUs. Ray Train does not automatically do this model parallelism for you.

For your case, if you want num_samples=2, then I would recommend setting num_workers=2 and 1 GPU per worker.

The num_workers used for PyTorch DataLoaders is how many processes to use for loading in data, which is unrelated to data parallel training.

1 Like

Thanks for the code. Let me investigate further into this and will get back to you.