How severely does this issue affect your experience of using Ray?
- High: It blocks me from completing my task.
I’m already using WandbLoggerCallback, but I can’t see any hyperparameters uploaded to the wandb run config. This makes it impossible to inspect the best-performing hyperparameters, even though training losses etc. are logged fine.
Here’s my code:
# Imports for this snippet (trainer, training_args, peft_config, args and
# get_hpo_metric come from my own training setup and aren't shown):
import os
from functools import partial

from ray import tune
from ray.tune.schedulers import ASHAScheduler
from ray.air.integrations.wandb import WandbLoggerCallback  # ray.tune.integration.wandb on older Ray versions

# PEFT monarch search space
param_space = {
    # "nblocks": tune.choice(['sqrt(n)', 4]),
    "seed": training_args.seed,
    # "num_train_epochs": tune.choice([20, 25]),
    "learning_rate": tune.quniform(8e-5, 6e-4, 2e-5),
    "gradient_accumulation_steps": tune.choice([16, 32]),  # will OOM if we tune batch size instead
    "weight_decay": tune.choice([0]),
    "lr_scheduler_type": tune.choice(["cosine", "linear"]),  # linear mostly underperforms
    "blk_r": peft_config["blk_r"],
    "nblocks": peft_config["nblocks"],
}
n_trials = args.n_trials
# Set up scheduler and reporter etc.
direction = "min"
tune_unit = "iter"
max_t = 40 * 60 if tune_unit == "time" else 7
metric = "train_mmlu_eval_accuracy"
grace_period = 4 * 60 if tune_unit == "time" else 2
time_attr = "time_total_s" if tune_unit == "time" else "training_iteration"
scheduler = ASHAScheduler(
    time_attr=time_attr,
    max_t=max_t,
    metric=metric,
    mode=direction,
    grace_period=grace_period,
)
# Do hyperparameter optimization with Ray Tune
best_run = trainer.hyperparameter_search(
    hp_space=lambda _: param_space,
    backend="ray",
    n_trials=n_trials,  # under the hood it calls ray.tune.run(num_samples=n_trials, ...)
    scheduler=scheduler,
    keep_checkpoints_num=None,
    resources_per_trial={"cpu": 1, "gpu": 1},
    name=os.environ["WANDB_RUN_GROUP"],
    local_dir="/fly/ray_results",
    max_failures=9999,  # tolerate OOM
    direction="maximize" if direction == "max" else "minimize",
    compute_objective=partial(get_hpo_metric, metric),
    resume=args.resume,
    callbacks=[
        WandbLoggerCallback(
            project=os.environ["WANDB_PROJECT"],
            group=os.environ["WANDB_RUN_GROUP"],
            log_config=True,
        )
    ],
)