I’m trying to run a hyperparameter search over a PyTorch Lightning model with Ray Tune, but none of the trials ever seem to start. The CLI reporter only ever shows the trials as PENDING; they never change to RUNNING. Its output stays the same the whole time, looking like this:
== Status ==
Memory usage on this node: 1.3/12.3 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 0/8 CPUs, 0/0 GPUs, 0.0/7.21 GiB heap, 0.0/3.61 GiB objects
Result logdir: /home/echols14/ray_results/tune_training_asha
Number of trials: 10/10 (10 PENDING)
+-------------------+----------+-------+-------------+--------------+-----------+-----------+
| Trial name | status | loc | lr | batch_size | dropout | d_model |
|-------------------+----------+-------+-------------+--------------+-----------+-----------|
| train_46f66_00000 | PENDING | | 0.000134998 | 256 | 0.155438 | 128 |
| train_46f66_00001 | PENDING | | 0.00143964 | 512 | 0.170543 | 128 |
| train_46f66_00002 | PENDING | | 9.20564e-05 | 512 | 0.325629 | 256 |
| train_46f66_00003 | PENDING | | 0.000815963 | 512 | 0.152794 | 128 |
| train_46f66_00004 | PENDING | | 3.95896e-05 | 512 | 0.0834858 | 512 |
| train_46f66_00005 | PENDING | | 0.00305493 | 64 | 0.0672461 | 128 |
| train_46f66_00006 | PENDING | | 3.34226e-05 | 512 | 0.116613 | 512 |
| train_46f66_00007 | PENDING | | 0.000262342 | 512 | 0.156197 | 512 |
| train_46f66_00008 | PENDING | | 3.96747e-05 | 256 | 0.142566 | 512 |
| train_46f66_00009 | PENDING | | 7.67698e-05 | 128 | 0.328324 | 128 |
+-------------------+----------+-------+-------------+--------------+-----------+-----------+
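One line that stands out to me is "Resources requested: 0/8 CPUs, 0/0 GPUs", which makes me wonder whether Ray sees a GPU at all. As a sanity check (a minimal sketch run in the same environment, assuming the same default ray.init() that tune.run would otherwise perform implicitly), this is what I can use to see what Ray and PyTorch each think is available:

import ray
import torch

ray.init(ignore_reinit_error=True)   # same kind of default init tune.run would do
print(ray.cluster_resources())       # total resources Ray detected, e.g. {'CPU': 8.0, 'GPU': ...}
print(ray.available_resources())     # what is currently free for new trials
print(torch.cuda.is_available(), torch.cuda.device_count())  # what PyTorch sees locally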
Here’s the function that kicks it all off:
def tune_hyperparams(config: Dict, num_samples=10, num_epochs=10, gpus_per_trial=1):
    """call ray.tune methods to find the best hyperparameters
    Parameters
    ----------
    config : Dict
        a dictionary of configuration variables to be passed into train_heatmap_predictor.
        values associated with certain keys will be written over by ray.tune for tuning
    num_samples : int, optional
        how many hyperparameter configurations to sample, by default 10
    num_epochs : int, optional
        the number of epochs to run each trial, by default 10
    gpus_per_trial : int, optional
        the number of GPUs each trial can use, by default 1
    """
    tune_config = {
        # "tune": True,  # triggers tuning things in the train method
        "batch_size": tune.choice([64, 128, 256, 512]),
        "lr": tune.loguniform(1e-5, 1e-2),
        "d_model": tune.choice([128, 256, 512]),
        "n_heads": tune.choice([4, 6, 8]),
        "dim_transf_ff": tune.choice([256, 512, 1024, 2048]),
        "n_transf_layers": tune.choice([4, 6, 8]),
        "dropout": tune.uniform(0.05, 0.35),
    }
    config.update(tune_config)  # any shared values will be overwritten by the tune value
    scheduler = ASHAScheduler(max_t=num_epochs, grace_period=1, reduction_factor=2)
    reporter = CLIReporter(
        parameter_columns=["lr", "batch_size", "dropout", "d_model"],
        metric_columns=["train_loss", "val_loss", "val_accuracy", "training_iteration"])
    analysis = tune.run(
        tune.with_parameters(train, num_epochs=num_epochs, num_gpus=gpus_per_trial, hyperparam_tuning=True),
        resources_per_trial={"cpu": 1, "gpu": gpus_per_trial},
        metric="accuracy",
        mode="max",
        # config=config,
        config=tune_config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter,
        name="tune_training_asha")
    print("Best hyperparameters found were: ", analysis.best_config)
tune.run uses the function below as its trainable, but since "IN THE TRAIN FUNCTION" never appears anywhere in the output, I don’t think it’s getting called at all.
def train(config: Dict, num_epochs=10, num_gpus=0, hyperparam_tuning=False):
    """this function can be called by a ray.tune hyperparameter search"""
    print("IN THE TRAIN FUNCTION")
    # general setup
    if not os.path.isdir(config["model_dir"]):
        os.mkdir(config["model_dir"])
    seed_everything(config["seed"], workers=True)
    # get data
    dataset_train = PaletteDataset(config["train_data"])
    dataloader_train = DataLoader(dataset_train, config["batch_size"], shuffle=True,
                                  num_workers=config["cpus"])
    dataset_val = PaletteDataset(config["val_data"])
    dataloader_val = DataLoader(dataset_val, config["batch_size"], shuffle=False,
                                num_workers=config["cpus"])
    # make the model
    model = PaletteToScalarPL(config)
    callback_list = list()
    # set up custom checkpointing
    if hyperparam_tuning:
        tune_callback = TuneReportCallback(
            metrics={
                "loss": "ptl/val_loss",
                "accuracy": "ptl/val_accuracy"
            },
            on="validation_end"
        )
        callback_list.append(tune_callback)
        tb_logger = TensorBoardLogger(save_dir=tune.get_trial_dir(), name="", version=".")
    else:
        tb_logger = TensorBoardLogger(save_dir=config["model_dir"], name="lightning_logs")
    checkpoint_callback = ModelCheckpoint(
        monitor="val_accuracy",
        dirpath=os.path.join(tb_logger.log_dir, "checkpoints"),
        filename="{epoch}-{val_accuracy:.2f}",
        save_top_k=1,
        mode="max",
    )
    callback_list.append(checkpoint_callback)
    # send outputs to where sagemaker expects them (or the provided dir)
    output_data_dir = config["output_data_dir"]
    # train it
    trainer = Trainer(max_epochs=num_epochs, gpus=num_gpus, deterministic=True,
                      logger=tb_logger, callbacks=callback_list,
                      default_root_dir=output_data_dir)
    trainer.fit(model, train_dataloader=dataloader_train, val_dataloaders=dataloader_val)
    if not hyperparam_tuning:
        with open(os.path.join(config["model_dir"], "final_p_to_scalar.pth"), "wb") as f:
            torch.save(model.p_to_scalar.state_dict(), f)
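To work out whether the problem is in my Lightning code or in Tune’s scheduling itself, the next thing I plan to try is a bare-bones trainable with no Lightning involved, keeping the same resources_per_trial (a minimal sketch; the dummy_train name and the random metric are just for illustration):

import random
from ray import tune

def dummy_train(config):
    # report a fake metric a few times so the scheduler has something to act on
    for _ in range(10):
        tune.report(accuracy=random.random())

tune.run(
    dummy_train,
    resources_per_trial={"cpu": 1, "gpu": 1},  # same request as the real run
    metric="accuracy",
    mode="max",
    num_samples=2,
)

If even this stays PENDING with "gpu": 1 but runs with "gpu": 0, that would point at the GPU request rather than at my train() function.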
Any ideas why no training jobs are starting?