I’m trying to run a hyperparameter search over a PyTorch Lightning model with Ray Tune, but none of the trials ever actually start. The CLI reporter shows every trial as PENDING and none of them ever moves to RUNNING; its output never changes and looks like this:
== Status ==
Memory usage on this node: 1.3/12.3 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 0/8 CPUs, 0/0 GPUs, 0.0/7.21 GiB heap, 0.0/3.61 GiB objects
Result logdir: /home/echols14/ray_results/tune_training_asha
Number of trials: 10/10 (10 PENDING)
+-------------------+----------+-------+-------------+--------------+-----------+-----------+
| Trial name | status | loc | lr | batch_size | dropout | d_model |
|-------------------+----------+-------+-------------+--------------+-----------+-----------|
| train_46f66_00000 | PENDING | | 0.000134998 | 256 | 0.155438 | 128 |
| train_46f66_00001 | PENDING | | 0.00143964 | 512 | 0.170543 | 128 |
| train_46f66_00002 | PENDING | | 9.20564e-05 | 512 | 0.325629 | 256 |
| train_46f66_00003 | PENDING | | 0.000815963 | 512 | 0.152794 | 128 |
| train_46f66_00004 | PENDING | | 3.95896e-05 | 512 | 0.0834858 | 512 |
| train_46f66_00005 | PENDING | | 0.00305493 | 64 | 0.0672461 | 128 |
| train_46f66_00006 | PENDING | | 3.34226e-05 | 512 | 0.116613 | 512 |
| train_46f66_00007 | PENDING | | 0.000262342 | 512 | 0.156197 | 512 |
| train_46f66_00008 | PENDING | | 3.96747e-05 | 256 | 0.142566 | 512 |
| train_46f66_00009 | PENDING | | 7.67698e-05 | 128 | 0.328324 | 128 |
+-------------------+----------+-------+-------------+--------------+-----------+-----------+
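One thing that stands out to me is the "Resources requested: 0/8 CPUs, 0/0 GPUs" line, i.e. Ray doesn't seem to see any GPUs at all. In case it's relevant, here is a minimal sketch of how the detected resources can be checked (ray.cluster_resources() and ray.available_resources() are the standard calls; the exact dict contents depend on the Ray version):

import ray

ray.init()  # same process that later calls tune.run
# if "GPU" is missing from these dicts, a trial that requests a GPU
# cannot be scheduled and will sit in PENDING
print(ray.cluster_resources())
print(ray.available_resources())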
Here’s the function that kicks it all off, along with the imports it uses:
import os
from typing import Dict

import torch
from torch.utils.data import DataLoader
from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler
from ray.tune.integration.pytorch_lightning import TuneReportCallback
# PaletteDataset and PaletteToScalarPL come from my own code (imports omitted here)


def tune_hyperparams(config: Dict, num_samples=10, num_epochs=10, gpus_per_trial=1):
    """call ray.tune methods to find the best hyperparameters

    Parameters
    ----------
    config : Dict
        a dictionary of configuration variables to be passed into train.
        values associated with certain keys will be written over by ray.tune for tuning
    num_samples : int, optional
        how many hyperparameter configurations to sample (i.e. the number of trials), by default 10
    num_epochs : int, optional
        the number of epochs to run each trial, by default 10
    gpus_per_trial : int, optional
        the number of GPUs each trial can use, by default 1
    """
    tune_config = {
        # "tune": True,  # triggers tuning things in the train method
        "batch_size": tune.choice([64, 128, 256, 512]),
        "lr": tune.loguniform(1e-5, 1e-2),
        "d_model": tune.choice([128, 256, 512]),
        "n_heads": tune.choice([4, 6, 8]),
        "dim_transf_ff": tune.choice([256, 512, 1024, 2048]),
        "n_transf_layers": tune.choice([4, 6, 8]),
        "dropout": tune.uniform(0.05, 0.35),
    }
    config.update(tune_config)  # any shared values will be overwritten by the tune value
    scheduler = ASHAScheduler(max_t=num_epochs, grace_period=1, reduction_factor=2)
    reporter = CLIReporter(
        parameter_columns=["lr", "batch_size", "dropout", "d_model"],
        metric_columns=["train_loss", "val_loss", "val_accuracy", "training_iteration"])
    analysis = tune.run(
        tune.with_parameters(train, num_epochs=num_epochs, num_gpus=gpus_per_trial, hyperparam_tuning=True),
        resources_per_trial={"cpu": 1, "gpu": gpus_per_trial},
        metric="accuracy",
        mode="max",
        # config=config,
        config=tune_config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter,
        name="tune_training_asha")
    print("Best hyperparameters found were: ", analysis.best_config)
tune.run is given this function as its trainable, but since the "IN THE TRAIN FUNCTION" print never appears anywhere in the output, I don't think it's getting called at all:
def train(config: Dict, num_epochs=10, num_gpus=0, hyperparam_tuning=False):
    """this function can be called by a ray.tune hyperparameter search"""
    print("IN THE TRAIN FUNCTION")
    # general setup
    if not os.path.isdir(config["model_dir"]):
        os.mkdir(config["model_dir"])
    seed_everything(config["seed"], workers=True)
    # get data
    dataset_train = PaletteDataset(config["train_data"])
    dataloader_train = DataLoader(dataset_train, config["batch_size"], shuffle=True,
                                  num_workers=config["cpus"])
    dataset_val = PaletteDataset(config["val_data"])
    dataloader_val = DataLoader(dataset_val, config["batch_size"], shuffle=False,
                                num_workers=config["cpus"])
    # make the model
    model = PaletteToScalarPL(config)
    callback_list = list()
    # set up custom checkpointing
    if hyperparam_tuning:
        tune_callback = TuneReportCallback(
            metrics={
                "loss": "ptl/val_loss",
                "accuracy": "ptl/val_accuracy"
            },
            on="validation_end"
        )
        callback_list.append(tune_callback)
        tb_logger = TensorBoardLogger(save_dir=tune.get_trial_dir(), name="", version=".")
    else:
        tb_logger = TensorBoardLogger(save_dir=config["model_dir"], name="lightning_logs")
    checkpoint_callback = ModelCheckpoint(
        monitor="val_accuracy",
        dirpath=os.path.join(tb_logger.log_dir, "checkpoints"),
        filename="{epoch}-{val_accuracy:.2f}",
        save_top_k=1,
        mode="max",
    )
    callback_list.append(checkpoint_callback)
    # send outputs to where sagemaker expects them (or the provided dir)
    output_data_dir = config["output_data_dir"]
    # train it
    trainer = Trainer(max_epochs=num_epochs, gpus=num_gpus, deterministic=True,
                      logger=tb_logger, callbacks=callback_list,
                      default_root_dir=output_data_dir)
    trainer.fit(model, train_dataloader=dataloader_train, val_dataloaders=dataloader_val)
    if not hyperparam_tuning:
        with open(os.path.join(config["model_dir"], "final_p_to_scalar.pth"), "wb") as f:
            torch.save(model.p_to_scalar.state_dict(), f)
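One more piece of context: the TuneReportCallback above maps the logged metrics "ptl/val_loss" and "ptl/val_accuracy" to the reported names "loss" and "accuracy", so the LightningModule has to log those exact "ptl/..." keys at validation time. A stripped-down sketch of that logging pattern (a hypothetical module, not my actual PaletteToScalarPL) would be:

import pytorch_lightning as pl
import torch

class SketchModule(pl.LightningModule):
    # assumes validation_step returns {"val_loss": ..., "val_accuracy": ...}
    def validation_epoch_end(self, outputs):
        avg_loss = torch.stack([o["val_loss"] for o in outputs]).mean()
        avg_acc = torch.stack([o["val_accuracy"] for o in outputs]).mean()
        # these are the keys TuneReportCallback reads at on="validation_end"
        self.log("ptl/val_loss", avg_loss)
        self.log("ptl/val_accuracy", avg_acc)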
Any ideas why no training jobs are starting?
