I’m running HPO with ASHA. All trials finished quite fast except the last two, which show as RUNNING but hang indefinitely in the training loop. Despite the warning, I have 179 GB in the /tmp dir. GPU memory stays occupied, and Ray status shows everything is healthy.
Here’s my code and log:
direction = "max"
max_t = 40 * 60 if tune_unit == "time" else 0.5  # seconds (40 min) or eval iterations
if data_args.task_name == "mrpc":
    max_t = 30 * 60 if tune_unit == "time" else 12
grace_period = 5 * 60 if tune_unit == "time" else 0.5
time_attr = "time_total_s" if tune_unit == "time" else "training_iteration"
scheduler = ASHAScheduler(
    time_attr=time_attr,
    max_t=max_t,
    metric=task_to_metric[data_args.task_name],
    mode=direction,
    grace_period=grace_period,
)
reporter = CLIReporter(
    parameter_columns=["learning_rate", "per_device_train_batch_size", "weight_decay"],
    metric_columns=["train_loss", "eval_loss", task_to_metric[data_args.task_name], "training_iteration"],
    max_progress_rows=9,
    max_report_frequency=9,
)
# Do hyperparam optimization with Ray Tune
best_run = trainer.hyperparameter_search(
    hp_space=lambda _: param_space,
    backend="ray",
    n_trials=n_trials,  # under the hood it calls ray.tune.run(num_samples=n_trials, ...)
    scheduler=scheduler,
    keep_checkpoints_num=0,
    checkpoint_score_attr="min-" + task_to_metric[data_args.task_name],  # rank in decreasing order
    progress_reporter=reporter,
    resources_per_trial={"cpu": 1, "gpu": 0.5},
    local_dir="ray_results",
    name=os.environ["WANDB_RUN_GROUP"],
    max_failures=50,  # tolerate OOM
    # callbacks=[WandbLoggerCallback(project=os.environ["WANDB_PROJECT"], group=os.environ["WANDB_RUN_GROUP"])],
    direction="maximize",
    resume=args.resume_tune,
)
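For context, my understanding is that the call above roughly expands to something like the following on the Ray side (a rough sketch only, not the actual transformers internals; train_func is a placeholder for the wrapped Trainer objective, and param_space / n_trials / scheduler / reporter are the same objects defined above):

# Rough sketch of the ray.tune.run call the n_trials comment refers to;
# not the real transformers integration, just how I picture the wiring.
import os
from ray import tune

def train_func(config):
    # placeholder: the real objective trains the model and reports eval metrics
    tune.report(objective=0.0)

analysis = tune.run(
    train_func,
    config=param_space,
    num_samples=n_trials,
    scheduler=scheduler,
    progress_reporter=reporter,
    resources_per_trial={"cpu": 1, "gpu": 0.5},
    local_dir="ray_results",
    name=os.environ["WANDB_RUN_GROUP"],
    resume=args.resume_tune,
)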