Trial returned a result which did not include the specified metric(s) `eval_acc` that `PopulationBasedTraining` expects

  • High: It blocks me from completing my task.

Hello Ray Community!
I’ve fine-tuned DeBERTa on my own data and would like to do some hyperparameter optimization with Ray Tune, but every trial fails with the same metric error. Can anybody help me, please?

I haven’t been able to fix this or find a standard Ray solution that uses trainer.hyperparameter_search(), and as a newbie I’m having a lot of difficulty.

Here is my custom code (adapted from Using 🤗 Huggingface Transformers with Tune — Ray 2.3.0):

import os

import numpy as np
import pandas as pd
import torch
from datasets import load_metric
import sentencepiece  # required by the DeBERTa tokenizer

import ray
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import PopulationBasedTraining
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)


model_hf = 'microsoft/deberta-v3-base'
dataset_file = 'postagged_clean.csv'
dataset_size = 1000

def load_dataset(dataset_file, dataset_size):
  data = pd.read_csv(dataset_file)  # use the parameter instead of a hardcoded filename
  data = data[:dataset_size]
  data['label'] = data['label'].astype(int)
  data['premise'] = data['premise'].astype(str)
  data['hypothesis'] = data['hypothesis'].astype(str)
  return data


class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])  


def prepare_dataset(dataset_file, dataset_size, tokenizer):
  df = load_dataset(dataset_file, dataset_size)
  
  training_size = int(dataset_size * 0.8)
  #val_size = dataset_size - training_size
  
  premise = list(df['premise'])
  hypothesis = list(df['hypothesis'])
  y = list(df['label'])

  X_train_tokenized = tokenizer(premise[0:training_size], hypothesis[0:training_size], padding=True, truncation=True, max_length=256)
  y_train = y[0:training_size]

  X_val_tokenized = tokenizer(premise[training_size:dataset_size], hypothesis[training_size:dataset_size], padding=True, truncation=True, max_length=256)
  y_val = y[training_size:dataset_size]

  train_dataset = Dataset(X_train_tokenized, y_train)
  val_dataset = Dataset(X_val_tokenized, y_val)
  return train_dataset, val_dataset


def compute_metrics(eval_preds):
  logits, labels = eval_preds
  predictions = np.argmax(logits, axis=-1)
  accuracy_metric = load_metric("accuracy")
  return accuracy_metric.compute(predictions=predictions, references=labels)

def tune_transformer(num_samples=8, gpus_per_trial=0, smoke_test=False):
    data_dir_name = "./data" if not smoke_test else "./test_data"
    data_dir = os.path.abspath(os.path.join(os.getcwd(), data_dir_name))
    if not os.path.exists(data_dir):
        os.mkdir(data_dir, 0o755)

    model_hf = 'microsoft/deberta-v3-base'

    # Download and cache tokenizer, model, and features
    print("Downloading and caching Tokenizer")
    tokenizer = AutoTokenizer.from_pretrained(model_hf)

    
    train_dataset, val_dataset = prepare_dataset(dataset_file, dataset_size, tokenizer)

    # Change these as needed.
    model_name = model_hf
    task_name = "zero-shot-classification"
    task_data_dir = os.path.join(data_dir, task_name.upper())
    num_labels = 3
    config = AutoConfig.from_pretrained(
        model_name, num_labels=num_labels, finetuning_task=task_name
    )

    
    # Triggers model download to cache
    print("Downloading and caching pre-trained model")
    AutoModelForSequenceClassification.from_pretrained(
        model_hf,
        config=config,
    )

    def get_model():
        return AutoModelForSequenceClassification.from_pretrained(
            model_hf,
            config=config,
        )

    # NOTE: this only creates a local Python variable; it does NOT set the
    # TUNE_DISABLE_STRICT_METRIC_CHECKING environment variable that Ray reads
    # (that would be os.environ["TUNE_DISABLE_STRICT_METRIC_CHECKING"] = "1").
    TUNE_DISABLE_STRICT_METRIC_CHECKING = 1

    training_args = TrainingArguments(
        output_dir=".",
        learning_rate=1e-5,  # config
        do_train=True,
        do_eval=True,
        no_cuda=gpus_per_trial <= 0,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        num_train_epochs=2,  # config
        max_steps=-1,
        per_device_train_batch_size=16,  # config
        per_device_eval_batch_size=16,  # config
        warmup_steps=0,
        weight_decay=0.1,  # config
        logging_dir="./logs",
        skip_memory_metrics=True,
        metric_for_best_model='eval_acc',
        report_to="none"
    )

    trainer = Trainer(
        model_init=get_model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics
    )

    tune_config = {
        "per_device_train_batch_size": 32,
        "per_device_eval_batch_size": 32,
        "num_train_epochs": tune.choice([2, 3, 4, 5]),
        "max_steps": 1 if smoke_test else -1,  # Used for smoke test.
    }

    scheduler = PopulationBasedTraining(
        time_attr="training_iteration",
        metric="eval_acc",
        mode="max",
        perturbation_interval=1,
        hyperparam_mutations={
            "weight_decay": tune.uniform(0.0, 0.3),
            "learning_rate": tune.uniform(1e-5, 5e-5),
            "per_device_train_batch_size": [12, 16, 32, 64],
            "num_train_epochs": tune.choice([5])
        },
    )

    reporter = CLIReporter(
        parameter_columns={
            "weight_decay": "w_decay",
            "learning_rate": "lr",
            "per_device_train_batch_size": "train_bs/gpu",
            "num_train_epochs": "num_epochs",
        },
        metric_columns=["eval_acc", "eval_loss", "epoch", "training_iteration"],
    )

    best_run = trainer.hyperparameter_search(
        hp_space=lambda _: tune_config,
        backend="ray",
        n_trials=num_samples,
        resources_per_trial={"cpu": 1, "gpu": gpus_per_trial},
        scheduler=scheduler,
        keep_checkpoints_num=1,
        checkpoint_score_attr="training_iteration",
        stop={"training_iteration": 1} if smoke_test else None,
        progress_reporter=reporter,
        local_dir="~/ray_results/",
        name="tune_transformer_pbt",
        log_to_file=True,
    )
    print("Best hyperparameters found were: ", best_run.hyperparameters)


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--smoke-test",
        default=True,  # NOTE: combined with action="store_true", this makes smoke_test always True
        action="store_true",
        help="Finish quickly for testing",
    )
    args, _ = parser.parse_known_args()

    #ray.init()

    if args.smoke_test:
        tune_transformer(num_samples=3, gpus_per_trial=0, smoke_test=True)
    else:
        # You can change the number of GPUs here:
        tune_transformer(num_samples=8, gpus_per_trial=1)

    ray.shutdown()     

This is the error that appears:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
/usr/local/lib/python3.9/dist-packages/ray/tune/execution/trial_runner.py in _wait_and_handle_event(self, next_trial)
    923                     if event.type == _ExecutorEventType.TRAINING_RESULT:
--> 924                         self._on_training_result(
    925                             trial, result[_ExecutorEvent.KEY_FUTURE_RESULT]

10 frames
/usr/local/lib/python3.9/dist-packages/ray/tune/execution/trial_runner.py in _on_training_result(self, trial, result)
   1046         with warn_if_slow("process_trial_result"):
-> 1047             self._process_trial_results(trial, result)
   1048 

/usr/local/lib/python3.9/dist-packages/ray/tune/execution/trial_runner.py in _process_trial_results(self, trial, results)
   1129                 with warn_if_slow("process_trial_result"):
-> 1130                     decision = self._process_trial_result(trial, result)
   1131                 if decision is None:

/usr/local/lib/python3.9/dist-packages/ray/tune/execution/trial_runner.py in _process_trial_result(self, trial, result)
   1166         flat_result = flatten_dict(result)
-> 1167         self._validate_result_metrics(flat_result)
   1168 

/usr/local/lib/python3.9/dist-packages/ray/tune/execution/trial_runner.py in _validate_result_metrics(self, result)
   1262             if report_metric:
-> 1263                 raise ValueError(
   1264                     "Trial returned a result which did not include the "

ValueError: Trial returned a result which did not include the specified metric(s) `eval_acc` that `PopulationBasedTraining` expects. Make sure your calls to `tune.report()` include the metric, or set the TUNE_DISABLE_STRICT_METRIC_CHECKING environment variable to 1. Result: {'objective': 0.0, 'eval_loss': 1.1676169633865356, 'eval_accuracy': 0.0, 'eval_runtime': 31.6168, 'eval_samples_per_second': 6.326, 'eval_steps_per_second': 0.221, 'epoch': 0.04, 'time_this_iter_s': 56.38709497451782, 'should_checkpoint': True, 'done': False, 'timesteps_total': None, 'episodes_total': None, 'training_iteration': 1, 'trial_id': '8fb01_00000', 'experiment_id': '13a71dfaee2e46c2bc06e5d5d51c54d7', 'date': '2023-03-27_20-19-40', 'timestamp': 1679948380, 'time_total_s': 56.38709497451782, 'pid': 52539, 'hostname': 'eb9cee7f9116', 'node_ip': '172.28.0.12', 'time_since_restore': 56.38709497451782, 'timesteps_since_restore': 0, 'iterations_since_restore': 1, 'warmup_time': 0.003461599349975586, 'config/per_device_train_batch_size': 32, 'config/per_device_eval_batch_size': 32, 'config/num_train_epochs': 4, 'config/max_steps': 1, 'config/weight_decay': 0.23896289605806983, 'config/learning_rate': 1.7337391594646555e-05}

During handling of the above exception, another exception occurred:

TuneError                                 Traceback (most recent call last)
<ipython-input-4-71fcf34ba5e4> in <module>
    228 
    229     if args.smoke_test:
--> 230         tune_transformer(num_samples=3, gpus_per_trial=0, smoke_test=True)
    231     else:
    232         # You can change the number of GPUs here:

<ipython-input-4-71fcf34ba5e4> in tune_transformer(num_samples, gpus_per_trial, smoke_test)
    192     )
    193 
--> 194     trainer.hyperparameter_search(
    195         hp_space=lambda _: tune_config,
    196         backend="ray",

/usr/local/lib/python3.9/dist-packages/transformers/trainer.py in hyperparameter_search(self, hp_space, compute_objective, n_trials, direction, backend, hp_name, **kwargs)
   2536             HPSearchBackend.WANDB: run_hp_search_wandb,
   2537         }
-> 2538         best_run = backend_dict[backend](self, n_trials, direction, **kwargs)
   2539 
   2540         self.hp_search_backend = None

/usr/local/lib/python3.9/dist-packages/transformers/integrations.py in run_hp_search_ray(trainer, n_trials, direction, **kwargs)
    340         dynamic_modules_import_trainable.__mixins__ = trainable.__mixins__
    341 
--> 342     analysis = ray.tune.run(
    343         dynamic_modules_import_trainable,
    344         config=trainer.hp_space(None),

/usr/local/lib/python3.9/dist-packages/ray/tune/tune.py in run(run_or_experiment, name, metric, mode, stop, time_budget_s, config, resources_per_trial, num_samples, local_dir, search_alg, scheduler, keep_checkpoints_num, checkpoint_score_attr, checkpoint_freq, checkpoint_at_end, verbose, progress_reporter, log_to_file, trial_name_creator, trial_dirname_creator, chdir_to_trial_dir, sync_config, export_formats, max_failures, fail_fast, restore, server_port, resume, reuse_actors, raise_on_failed_trial, callbacks, max_concurrent_trials, trial_executor, _experiment_checkpoint_dir, _remote, _remote_string_queue)
    754     )
    755     while not runner.is_finished() and not experiment_interrupted_event.is_set():
--> 756         runner.step()
    757         if has_verbosity(Verbosity.V1_EXPERIMENT):
    758             _report_progress(runner, progress_reporter)

/usr/local/lib/python3.9/dist-packages/ray/tune/execution/trial_runner.py in step(self)
    955             logger.debug(f"Got new trial to run: {next_trial}")
    956 
--> 957         self._wait_and_handle_event(next_trial)
    958 
    959         self._stop_experiment_if_needed()

/usr/local/lib/python3.9/dist-packages/ray/tune/execution/trial_runner.py in _wait_and_handle_event(self, next_trial)
    934                 raise e
    935             else:
--> 936                 raise TuneError(traceback.format_exc())
    937 
    938     def step(self):

TuneError: Traceback (most recent call last):
  File "/usr/local/lib/python3.9/dist-packages/ray/tune/execution/trial_runner.py", line 924, in _wait_and_handle_event
    self._on_training_result(
  File "/usr/local/lib/python3.9/dist-packages/ray/tune/execution/trial_runner.py", line 1047, in _on_training_result
    self._process_trial_results(trial, result)
  File "/usr/local/lib/python3.9/dist-packages/ray/tune/execution/trial_runner.py", line 1130, in _process_trial_results
    decision = self._process_trial_result(trial, result)
  File "/usr/local/lib/python3.9/dist-packages/ray/tune/execution/trial_runner.py", line 1167, in _process_trial_result
    self._validate_result_metrics(flat_result)
  File "/usr/local/lib/python3.9/dist-packages/ray/tune/execution/trial_runner.py", line 1263, in _validate_result_metrics
    raise ValueError(
ValueError: Trial returned a result which did not include the specified metric(s) `eval_acc` that `PopulationBasedTraining` expects. Make sure your calls to `tune.report()` include the metric, or set the TUNE_DISABLE_STRICT_METRIC_CHECKING environment variable to 1. Result: {'objective': 0.0, 'eval_loss': 1.1676169633865356, 'eval_accuracy': 0.0, 'eval_runtime': 31.6168, 'eval_samples_per_second': 6.326, 'eval_steps_per_second': 0.221, 'epoch': 0.04, 'time_this_iter_s': 56.38709497451782, 'should_checkpoint': True, 'done': False, 'timesteps_total': None, 'episodes_total': None, 'training_iteration': 1, 'trial_id': '8fb01_00000', 'experiment_id': '13a71dfaee2e46c2bc06e5d5d51c54d7', 'date': '2023-03-27_20-19-40', 'timestamp': 1679948380, 'time_total_s': 56.38709497451782, 'pid': 52539, 'hostname': 'eb9cee7f9116', 'node_ip': '172.28.0.12', 'time_since_restore': 56.38709497451782, 'timesteps_since_restore': 0, 'iterations_since_restore': 1, 'warmup_time': 0.003461599349975586, 'config/per_device_train_batch_size': 32, 'config/per_device_eval_batch_size': 32, 'config/num_train_epochs': 4, 'config/max_steps': 1, 'config/weight_decay': 0.23896289605806983, 'config/learning_rate': 1.7337391594646555e-05}

Thanks in advance! :hugs:

Hello, also new to Ray and transformer hyperparameter search. I ran into this myself, and I believe it’s because the results dictionary contains 'eval_accuracy', not the 'eval_acc' string you specified as the metric.
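
One quick way to double-check which keys the Trainer actually reports is to run a single evaluation outside of the search; a minimal sketch, reusing the trainer object from your script above:

metrics = trainer.evaluate()  # one pass over eval_dataset, outside the search
# The Trainer prefixes every key returned by compute_metrics with "eval_",
# so {"accuracy": ...} shows up as "eval_accuracy" alongside "eval_loss" etc.
print(sorted(metrics.keys()))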

Hi @Mel_Augusto,

what @Od-Lanir suggested should be correct! If you take a look at the error message:

ValueError: Trial returned a result which did not include the specified metric(s) `eval_acc` that `PopulationBasedTraining` expects. Make sure your calls to `tune.report()` include the metric, or set the TUNE_DISABLE_STRICT_METRIC_CHECKING environment variable to 1. Result: {'objective': 0.0, 'eval_loss': 1.1676169633865356, 'eval_accuracy': 0.0, 'eval_runtime': 31.6168, 'eval_samples_per_second': 6.326, 'eval_steps_per_second': 0.221, 'epoch': 0.04, 'time_this_iter_s': 56.38709497451782, 'should_checkpoint': True, 'done': False, 'timesteps_total': None, 'episodes_total': None, 'training_iteration': 1, 'trial_id': '8fb01_00000', 'experiment_id': '13a71dfaee2e46c2bc06e5d5d51c54d7', 'date': '2023-03-27_20-19-40', 'timestamp': 1679948380, 'time_total_s': 56.38709497451782, 'pid': 52539, 'hostname': 'eb9cee7f9116', 'node_ip': '172.28.0.12', 'time_since_restore': 56.38709497451782, 'timesteps_since_restore': 0, 'iterations_since_restore': 1, 'warmup_time': 0.003461599349975586, 'config/per_device_train_batch_size': 32, 'config/per_device_eval_batch_size': 32, 'config/num_train_epochs': 4, 'config/max_steps': 1, 'config/weight_decay': 0.23896289605806983, 'config/learning_rate': 1.7337391594646555e-05}

you can see that it reports 'eval_loss': 1.1676169633865356, 'eval_accuracy': 0.0 but not eval_acc.

So if you just change eval_acc to eval_accuracy in the rest of your code, you should be good!
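
Concretely, the key needs renaming everywhere it appears; a minimal sketch of the three affected settings, with everything else from your script unchanged:

training_args = TrainingArguments(
    output_dir=".",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_accuracy",  # was "eval_acc"
    report_to="none",
)

scheduler = PopulationBasedTraining(
    time_attr="training_iteration",
    metric="eval_accuracy",  # was "eval_acc"
    mode="max",
    perturbation_interval=1,
    hyperparam_mutations={
        "weight_decay": tune.uniform(0.0, 0.3),
        "learning_rate": tune.uniform(1e-5, 5e-5),
    },
)

reporter = CLIReporter(
    metric_columns=["eval_accuracy", "eval_loss", "epoch", "training_iteration"],
)

(Alternatively, as the error message suggests, you could set the TUNE_DISABLE_STRICT_METRIC_CHECKING environment variable to 1 to silence the strict check, but renaming the metric is the cleaner fix.)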

By the way, you may want to consider using eval_loss as the metric instead - it’s more fine-grained, since accuracy is often a lossy derivative of the loss.
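
If you do switch, remember that lower loss is better, so the optimization direction flips; a minimal sketch:

scheduler = PopulationBasedTraining(
    time_attr="training_iteration",
    metric="eval_loss",  # reported by the Trainer after every evaluation
    mode="min",          # minimize loss (accuracy used mode="max")
    perturbation_interval=1,
    hyperparam_mutations={
        "weight_decay": tune.uniform(0.0, 0.3),
        "learning_rate": tune.uniform(1e-5, 5e-5),
    },
)

You would also want metric_for_best_model="eval_loss" together with greater_is_better=False in TrainingArguments so the Trainer keeps the right checkpoint.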