Hello,
I am new to Tune and am getting this error in my code, and I can't seem to figure out where it is coming from:
ValueError: Trial returned a result which did not include the specified metric(s) `eval_acc` that `PopulationBasedTraining` expects. Make sure your calls to `tune.report()` include the metric, or set the TUNE_DISABLE_STRICT_METRIC_CHECKING environment variable to 1. Result: {'trial_id': 'bbc62_00000', 'date': '2024-07-17_17-46-33', 'timestamp': 1721231193, 'pid': 63296, 'hostname': 'linglit48', 'node_ip': '130.83.252.48', 'done': True, 'config/model_name': 'Helsinki-NLP/opus-mt-ar-en', 'config/task_name': 'rte', 'config/data_dir': '/home/saughmon/IREP_Project/Pipeline/src/fine_tuning/data/RTE', 'config/per_gpu_val_batch_size': 32, 'config/per_gpu_train_batch_size': 16, 'config/learning_rate': 4.709216008487399e-05, 'config/weight_decay': 0.11923011344361316, 'config/num_epochs': 2, 'config/max_steps': -1, 'config/wandb/project': 'pbt_transformers', 'config/wandb/reinit': True, 'config/wandb/allow_val_change': True}
Here is my code:
class TuneTransformerTrainer(transformers.Trainer):
    """Trainer that reports to Tune and checkpoints on every evaluation.

    Each call to ``evaluate`` runs the prediction loop, logs the metrics,
    saves the model/optimizer/scheduler state into a Tune checkpoint
    directory, and forwards the metrics to ``tune.report``.
    """

    def get_optimizers(self, num_training_steps):
        """Build the optimizer and scheduler via the parent class.

        The pair is also stored on the instance as ``current_optimizer``
        and ``current_scheduler`` so that ``save_state`` can serialize
        them later.

        Args:
            num_training_steps: Forwarded unchanged to the parent
                ``get_optimizers``.

        Returns:
            A ``(optimizer, scheduler)`` tuple.
        """
        built = super().get_optimizers(num_training_steps)
        self.current_optimizer, self.current_scheduler = built
        return (self.current_optimizer, self.current_scheduler)

    def evaluate(self, eval_dataset=None):
        """Evaluate, checkpoint, and report the metrics to Tune.

        Args:
            eval_dataset: Passed to ``get_eval_dataloader``; defaults to
                ``None``.

        Returns:
            The ``metrics`` attribute of the prediction-loop output.
        """
        loader = self.get_eval_dataloader(eval_dataset)
        prediction = self._prediction_loop(loader, description="Evaluation")
        metrics = prediction.metrics

        self._log(metrics)
        # Checkpoint before reporting, in that order.
        self.save_state()
        # NOTE(review): every key in ``metrics`` becomes a reported Tune
        # metric. A scheduler that is configured with a metric name
        # (e.g. ``eval_acc``) needs that key to be present here --
        # confirm that the metrics computation actually produces it.
        tune.report(**metrics)
        return metrics

    def save_state(self):
        """Write model, optimizer and scheduler state to a Tune checkpoint.

        Side effect: ``self.args.output_dir`` is repointed at the Tune
        checkpoint directory for the current ``global_step``.
        """
        with tune.checkpoint_dir(step=self.global_step) as tune_ckpt_dir:
            self.args.output_dir = tune_ckpt_dir
            # This is the directory name that Huggingface requires.
            step_dirname = f"{PREFIX_CHECKPOINT_DIR}-{self.global_step}"
            target_dir = os.path.join(self.args.output_dir, step_dirname)
            self.save_model(target_dir)

            # Only the world-master process writes optimizer/scheduler state.
            if not self.is_world_master():
                return

            # Attributes are looked up lazily, one at a time, so the
            # optimizer is written before the scheduler is touched.
            for attr_name, filename in (
                ("current_optimizer", "optimizer.pt"),
                ("current_scheduler", "scheduler.pt"),
            ):
                state = getattr(self, attr_name).state_dict()
                torch.save(state, os.path.join(target_dir, filename))