Hi,
I’m attempting to do a hyperparameter search with PyTorch Lightning. For that, I followed the documentation here and the accompanying Medium post here.
However, this raises an error:
(pid=9734) 2021-04-18 17:01:40,951 ERROR function_runner.py:254 – Runner Thread raised error.
(pid=9734) Traceback (most recent call last):
(pid=9734) File “/venv/lib/python3.6/site-packages/ray/tune/function_runner.py”, line 248, in run
(pid=9734) self._entrypoint()
(pid=9734) File “/venv/lib/python3.6/site-packages/ray/tune/function_runner.py”, line 316, in entrypoint
(pid=9734) self._status_reporter.get_checkpoint())
(pid=9734) File “/venv/lib/python3.6/site-packages/ray/tune/function_runner.py”, line 576, in _trainable_func
(pid=9734) output = fn()
(pid=9734) File “/venv/lib/python3.6/site-packages/ray/tune/function_runner.py”, line 651, in _inner
(pid=9734) inner(config, checkpoint_dir=None)
(pid=9734) File “/venv/lib/python3.6/site-packages/ray/tune/function_runner.py”, line 645, in inner
(pid=9734) fn(config, **fn_kwargs)
(pid=9734) File “/MBART.py”, line 363, in train_tune
(pid=9734) trainer.fit(model)
(pid=9734) File “/venv/lib/python3.6/site-packages/pytorch_lightning/trainer/states.py”, line 48, in wrapped_fn
(pid=9734) result = fn(self, *args, **kwargs)
(pid=9734) File “/venv/lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py”, line 1073, in fit
(pid=9734) results = self.accelerator_backend.train(model)
(pid=9734) File “/venv/lib/python3.6/site-packages/pytorch_lightning/accelerators/gpu_backend.py”, line 51, in train
(pid=9734) results = self.trainer.run_pretrain_routine(model)
(pid=9734) File “/venv/lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py”, line 1239, in run_pretrain_routine
(pid=9734) self.train()
(pid=9734) File “/venv/lib/python3.6/site-packages/pytorch_lightning/trainer/training_loop.py”, line 394, in train
(pid=9734) self.run_training_epoch()
(pid=9734) File “/venv/lib/python3.6/site-packages/pytorch_lightning/trainer/training_loop.py”, line 516, in run_training_epoch
(pid=9734) self.run_evaluation(test_mode=False)
(pid=9734) File “/venv/lib/python3.6/site-packages/pytorch_lightning/trainer/evaluation_loop.py”, line 582, in run_evaluation
(pid=9734) eval_results = self._evaluate(self.model, dataloaders, max_batches, test_mode)
(pid=9734) File “/venv/lib/python3.6/site-packages/pytorch_lightning/trainer/evaluation_loop.py”, line 369, in _evaluate
(pid=9734) self.on_validation_batch_end(batch, batch_idx, dataloader_idx)
(pid=9734) File “/venv/lib/python3.6/site-packages/pytorch_lightning/trainer/callback_hook.py”, line 156, in on_validation_batch_end
(pid=9734) callback.on_validation_batch_end(self, self.get_model(), batch, batch_idx, dataloader_idx)
(pid=9734) TypeError: on_validation_batch_end() missing 1 required positional argument: ‘dataloader_idx’
This is the code:
from ray.tune.integration.pytorch_lightning import TuneReportCallback
# Lightning callback that reports metrics to Ray Tune at "validation_end".
# The mapping keys ("loss", "rouge1") are the names reported to Tune; the
# values ("val_loss", "val_rouge1") are the metric names looked up on the
# Lightning side.
# NOTE(review): the traceback shows Lightning invoking
# on_validation_batch_end(trainer, model, batch, batch_idx, dataloader_idx)
# while the callback requires one more positional argument -- this looks like
# a version mismatch between ray[tune] and pytorch-lightning; confirm the
# installed versions are compatible.
ray_callback = TuneReportCallback(
    dict(loss="val_loss", rouge1="val_rouge1"),
    on="validation_end",
)
# Configuration handed to ``tune.run``. The first three entries are search
# spaces sampled by Ray Tune for every trial; all remaining entries are fixed
# values shared by every trial.
config = {
    # --- Sampled hyperparameters ---
    "learning_rate": tune.loguniform(1e-7, 5e-4),
    "weight_decay": tune.choice([0, 0.1, 0.2, 0.3]),
    "warmup_steps": tune.choice([0, 100, 300, 500]),
    # --- Fixed settings ---
    "adam_epsilon": 1e-08,
    "early_stop_callback": False,
    "eval_batch_size": 2,
    "fp_16": False,
    "freeze_embeds": False,
    "freeze_encoder": False,
    "gradient_accumulation_steps": 8,
    "label_smoothing": 0,
    "max_grad_norm": 1.0,
    "max_input_length": 512,
    "max_output_length": 150,
    "model_name_or_path": model_name,
    "n_gpu": 1,
    "n_test": 100,
    "n_train": 1000,
    "n_val": 100,
    "num_train_epochs": 1,
    "opt_level": "O1",
    # The closing quote was missing here, which made the whole script a
    # SyntaxError. 'output_path' is a placeholder for the real directory.
    "output_dir": "output_path",
    "resume_from_checkpoint": None,
    "seed": 42,
    "tokenizer_name_or_path": tokenizer,
    "train_batch_size": 2,
    "val_check_interval": 0.05,
}
# Keyword arguments later unpacked into ``pl.Trainer(**train_params)``.
# Values prefixed with ``args.`` are read from the ``args`` object defined
# elsewhere in the script.
train_params = {
    "num_sanity_val_steps": 0,
    "accumulate_grad_batches": 1,
    "gpus": 1,
    "max_epochs": args.num_train_epochs,
    # 16-bit precision only when fp_16 is enabled, otherwise full 32-bit.
    "precision": 16 if args.fp_16 else 32,
    "amp_level": args.opt_level,
    "resume_from_checkpoint": args.resume_from_checkpoint,
    "gradient_clip_val": args.max_grad_norm,
    "checkpoint_callback": checkpoint_callback,
    "val_check_interval": args.val_check_interval,
    "logger": wandb_logger,
    # Attach the Ray Tune reporting callback so trials report their metrics.
    "callbacks": [ray_callback],
}
def train_tune(config):
    """Run a single training trial for Ray Tune.

    Builds a ``ModelFineTuner`` from ``config``, creates a ``pl.Trainer``
    from the module-level ``train_params`` and fits the model.

    Args:
        config: Trial configuration, forwarded unchanged to
            ``ModelFineTuner``.

    Returns:
        None.
    """
    fine_tuner = ModelFineTuner(config)
    lightning_trainer = pl.Trainer(**train_params)
    lightning_trainer.fit(fine_tuner)
from functools import partial
# Launch the hyperparameter search: each trial calls ``train_tune`` with a
# configuration drawn from ``config`` and is allotted 6 CPUs and 1 GPU.
analysis = tune.run(
    tune.with_parameters(train_tune),
    config=config,
    resources_per_trial=dict(cpu=6, gpu=1),
)
Any ideas on how to solve the error?
Thanks!