- High: It blocks me from completing my task.
Hello Ray Community!
I’ve fine-tuned DeBERTa (microsoft/deberta-v3-base) on my own data and would like to do some hyperparameter optimization with Ray Tune. However, every trial fails with the same metric error. Can anybody help me, please?
I haven’t been able to fix it or find a standard Ray example that uses trainer.hyperparameter_search(), and as a newbie I’m having a lot of difficulty.
Here is my custom code (adapted from Using 🤗 Huggingface Transformers with Tune — Ray 2.3.0):
import os
from datasets import load_metric
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import sentencepiece
import numpy as np
import pandas as pd
import torch
import ray
from ray import tune
from ray.air import session
from ray.tune import CLIReporter
from ray.tune.examples.pbt_transformers.utils import (
    download_data,
)
from ray.tune.schedulers import PopulationBasedTraining
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)
model_hf = 'microsoft/deberta-v3-base'
dataset_file = 'postagged_clean.csv'
dataset_size = 1000
def load_dataset(dataset_file, dataset_size):
    data = pd.read_csv(dataset_file)
    data = data[:dataset_size]
    data['label'] = data['label'].astype(int)
    data['premise'] = data['premise'].astype(str)
    data['hypothesis'] = data['hypothesis'].astype(str)
    return data
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])
def prepare_dataset(dataset_file, dataset_size, tokenizer):
    df = load_dataset(dataset_file, dataset_size)
    training_size = int(dataset_size * 0.8)
    #val_size = dataset_size - training_size
    premise = list(df['premise'])
    hypothesis = list(df['hypothesis'])
    y = list(df['label'])
    X_train_tokenized = tokenizer(premise[0:training_size], hypothesis[0:training_size], padding=True, truncation=True, max_length=256)
    y_train = y[0:training_size]
    X_val_tokenized = tokenizer(premise[training_size:dataset_size], hypothesis[training_size:dataset_size], padding=True, truncation=True, max_length=256)
    y_val = y[training_size:dataset_size]
    train_dataset = Dataset(X_train_tokenized, y_train)
    val_dataset = Dataset(X_val_tokenized, y_val)
    return train_dataset, val_dataset
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    accuracy_metric = load_metric("accuracy")
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    return accuracy
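# As far as I understand, the Trainer prefixes the keys returned by compute_metrics
# with "eval_", so the "accuracy" key above should show up in the evaluation logs
# as "eval_accuracy" (please correct me if I've got that wrong).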
def tune_transformer(num_samples=8, gpus_per_trial=0, smoke_test=False):
    data_dir_name = "./data" if not smoke_test else "./test_data"
    data_dir = os.path.abspath(os.path.join(os.getcwd(), data_dir_name))
    if not os.path.exists(data_dir):
        os.mkdir(data_dir, 0o755)

    model_hf = 'microsoft/deberta-v3-base'

    # Download and cache tokenizer, model, and features
    print("Downloading and caching Tokenizer")
    tokenizer = AutoTokenizer.from_pretrained(model_hf)
    train_dataset, val_dataset = prepare_dataset(dataset_file, dataset_size, tokenizer)

    # Change these as needed.
    model_name = model_hf
    task_name = "zero-shot-classification"
    task_data_dir = os.path.join(data_dir, task_name.upper())

    num_labels = 3
    config = AutoConfig.from_pretrained(
        model_name, num_labels=num_labels, finetuning_task=task_name
    )

    # Triggers model download to cache
    print("Downloading and caching pre-trained model")
    AutoModelForSequenceClassification.from_pretrained(
        model_hf,
        config=config,
    )

    def get_model():
        return AutoModelForSequenceClassification.from_pretrained(
            model_hf,
            config=config,
        )

    TUNE_DISABLE_STRICT_METRIC_CHECKING=1
    training_args = TrainingArguments(
        output_dir=".",
        learning_rate=1e-5,  # config
        do_train=True,
        do_eval=True,
        no_cuda=gpus_per_trial <= 0,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        num_train_epochs=2,  # config
        max_steps=-1,
        per_device_train_batch_size=16,  # config
        per_device_eval_batch_size=16,  # config
        warmup_steps=0,
        weight_decay=0.1,  # config
        logging_dir="./logs",
        skip_memory_metrics=True,
        metric_for_best_model='eval_acc',
        report_to="none",
    )
    trainer = Trainer(
        model_init=get_model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )
    tune_config = {
        "per_device_train_batch_size": 32,
        "per_device_eval_batch_size": 32,
        "num_train_epochs": tune.choice([2, 3, 4, 5]),
        "max_steps": 1 if smoke_test else -1,  # Used for smoke test.
    }
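    # My understanding of the Transformers/Ray integration is that these keys are
    # matched against TrainingArguments attribute names and override them for each
    # trial, so they have to be spelled exactly like the argument names above.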
    scheduler = PopulationBasedTraining(
        time_attr="training_iteration",
        metric="eval_acc",
        mode="max",
        perturbation_interval=1,
        hyperparam_mutations={
            "weight_decay": tune.uniform(0.0, 0.3),
            "learning_rate": tune.uniform(1e-5, 5e-5),
            "per_device_train_batch_size": [12, 16, 32, 64],
            "num_train_epochs": tune.choice([5]),
        },
    )
    reporter = CLIReporter(
        parameter_columns={
            "weight_decay": "w_decay",
            "learning_rate": "lr",
            "per_device_train_batch_size": "train_bs/gpu",
            "num_train_epochs": "num_epochs",
        },
        metric_columns=["eval_acc", "eval_loss", "epoch", "training_iteration"],
    )
    best_run = trainer.hyperparameter_search(
        hp_space=lambda _: tune_config,
        backend="ray",
        n_trials=num_samples,
        resources_per_trial={"cpu": 1, "gpu": gpus_per_trial},
        scheduler=scheduler,
        keep_checkpoints_num=1,
        checkpoint_score_attr="training_iteration",
        stop={"training_iteration": 1} if smoke_test else None,
        progress_reporter=reporter,
        local_dir="~/ray_results/",
        name="tune_transformer_pbt",
        log_to_file=True,
    )

    print("Best hyperparameters found were: ", best_run.hyperparameters)
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--smoke-test",
        default=True,
        action="store_true",
        help="Finish quickly for testing",
    )
    args, _ = parser.parse_known_args()

    #ray.init()
    if args.smoke_test:
        tune_transformer(num_samples=3, gpus_per_trial=0, smoke_test=True)
    else:
        # You can change the number of GPUs here:
        tune_transformer(num_samples=8, gpus_per_trial=1)
    ray.shutdown()
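For reference, my (possibly wrong) understanding from the Tune docs is that the metric name given to the scheduler has to exactly match a key in the results that each trial reports. Here is a tiny standalone sketch of what I mean, with made-up names, just to illustrate the matching:

from ray import tune
from ray.air import session

def toy_trainable(config):
    # the key reported here must match the metric name Tune is asked to optimize
    session.report({"eval_acc": config["x"] * 0.5})

analysis = tune.run(
    toy_trainable,
    config={"x": tune.uniform(0.0, 1.0)},
    metric="eval_acc",
    mode="max",
    num_samples=2,
)

I just can't see where the names go out of sync in my actual script.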
This is the error that appears:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/usr/local/lib/python3.9/dist-packages/ray/tune/execution/trial_runner.py in _wait_and_handle_event(self, next_trial)
923 if event.type == _ExecutorEventType.TRAINING_RESULT:
--> 924 self._on_training_result(
925 trial, result[_ExecutorEvent.KEY_FUTURE_RESULT]
/usr/local/lib/python3.9/dist-packages/ray/tune/execution/trial_runner.py in _on_training_result(self, trial, result)
1046 with warn_if_slow("process_trial_result"):
-> 1047 self._process_trial_results(trial, result)
1048
/usr/local/lib/python3.9/dist-packages/ray/tune/execution/trial_runner.py in _process_trial_results(self, trial, results)
1129 with warn_if_slow("process_trial_result"):
-> 1130 decision = self._process_trial_result(trial, result)
1131 if decision is None:
/usr/local/lib/python3.9/dist-packages/ray/tune/execution/trial_runner.py in _process_trial_result(self, trial, result)
1166 flat_result = flatten_dict(result)
-> 1167 self._validate_result_metrics(flat_result)
1168
/usr/local/lib/python3.9/dist-packages/ray/tune/execution/trial_runner.py in _validate_result_metrics(self, result)
1262 if report_metric:
-> 1263 raise ValueError(
1264 "Trial returned a result which did not include the "
ValueError: Trial returned a result which did not include the specified metric(s) `eval_acc` that `PopulationBasedTraining` expects. Make sure your calls to `tune.report()` include the metric, or set the TUNE_DISABLE_STRICT_METRIC_CHECKING environment variable to 1. Result: {'objective': 0.0, 'eval_loss': 1.1676169633865356, 'eval_accuracy': 0.0, 'eval_runtime': 31.6168, 'eval_samples_per_second': 6.326, 'eval_steps_per_second': 0.221, 'epoch': 0.04, 'time_this_iter_s': 56.38709497451782, 'should_checkpoint': True, 'done': False, 'timesteps_total': None, 'episodes_total': None, 'training_iteration': 1, 'trial_id': '8fb01_00000', 'experiment_id': '13a71dfaee2e46c2bc06e5d5d51c54d7', 'date': '2023-03-27_20-19-40', 'timestamp': 1679948380, 'time_total_s': 56.38709497451782, 'pid': 52539, 'hostname': 'eb9cee7f9116', 'node_ip': '172.28.0.12', 'time_since_restore': 56.38709497451782, 'timesteps_since_restore': 0, 'iterations_since_restore': 1, 'warmup_time': 0.003461599349975586, 'config/per_device_train_batch_size': 32, 'config/per_device_eval_batch_size': 32, 'config/num_train_epochs': 4, 'config/max_steps': 1, 'config/weight_decay': 0.23896289605806983, 'config/learning_rate': 1.7337391594646555e-05}
During handling of the above exception, another exception occurred:
TuneError Traceback (most recent call last)
<ipython-input-4-71fcf34ba5e4> in <module>
228
229 if args.smoke_test:
--> 230 tune_transformer(num_samples=3, gpus_per_trial=0, smoke_test=True)
231 else:
232 # You can change the number of GPUs here:
<ipython-input-4-71fcf34ba5e4> in tune_transformer(num_samples, gpus_per_trial, smoke_test)
192 )
193
--> 194 trainer.hyperparameter_search(
195 hp_space=lambda _: tune_config,
196 backend="ray",
/usr/local/lib/python3.9/dist-packages/transformers/trainer.py in hyperparameter_search(self, hp_space, compute_objective, n_trials, direction, backend, hp_name, **kwargs)
2536 HPSearchBackend.WANDB: run_hp_search_wandb,
2537 }
-> 2538 best_run = backend_dict[backend](self, n_trials, direction, **kwargs)
2539
2540 self.hp_search_backend = None
/usr/local/lib/python3.9/dist-packages/transformers/integrations.py in run_hp_search_ray(trainer, n_trials, direction, **kwargs)
340 dynamic_modules_import_trainable.__mixins__ = trainable.__mixins__
341
--> 342 analysis = ray.tune.run(
343 dynamic_modules_import_trainable,
344 config=trainer.hp_space(None),
/usr/local/lib/python3.9/dist-packages/ray/tune/tune.py in run(run_or_experiment, name, metric, mode, stop, time_budget_s, config, resources_per_trial, num_samples, local_dir, search_alg, scheduler, keep_checkpoints_num, checkpoint_score_attr, checkpoint_freq, checkpoint_at_end, verbose, progress_reporter, log_to_file, trial_name_creator, trial_dirname_creator, chdir_to_trial_dir, sync_config, export_formats, max_failures, fail_fast, restore, server_port, resume, reuse_actors, raise_on_failed_trial, callbacks, max_concurrent_trials, trial_executor, _experiment_checkpoint_dir, _remote, _remote_string_queue)
754 )
755 while not runner.is_finished() and not experiment_interrupted_event.is_set():
--> 756 runner.step()
757 if has_verbosity(Verbosity.V1_EXPERIMENT):
758 _report_progress(runner, progress_reporter)
/usr/local/lib/python3.9/dist-packages/ray/tune/execution/trial_runner.py in step(self)
955 logger.debug(f"Got new trial to run: {next_trial}")
956
--> 957 self._wait_and_handle_event(next_trial)
958
959 self._stop_experiment_if_needed()
/usr/local/lib/python3.9/dist-packages/ray/tune/execution/trial_runner.py in _wait_and_handle_event(self, next_trial)
934 raise e
935 else:
--> 936 raise TuneError(traceback.format_exc())
937
938 def step(self):
TuneError: Traceback (most recent call last):
File "/usr/local/lib/python3.9/dist-packages/ray/tune/execution/trial_runner.py", line 924, in _wait_and_handle_event
self._on_training_result(
File "/usr/local/lib/python3.9/dist-packages/ray/tune/execution/trial_runner.py", line 1047, in _on_training_result
self._process_trial_results(trial, result)
File "/usr/local/lib/python3.9/dist-packages/ray/tune/execution/trial_runner.py", line 1130, in _process_trial_results
decision = self._process_trial_result(trial, result)
File "/usr/local/lib/python3.9/dist-packages/ray/tune/execution/trial_runner.py", line 1167, in _process_trial_result
self._validate_result_metrics(flat_result)
File "/usr/local/lib/python3.9/dist-packages/ray/tune/execution/trial_runner.py", line 1263, in _validate_result_metrics
raise ValueError(
ValueError: Trial returned a result which did not include the specified metric(s) `eval_acc` that `PopulationBasedTraining` expects. Make sure your calls to `tune.report()` include the metric, or set the TUNE_DISABLE_STRICT_METRIC_CHECKING environment variable to 1. Result: {'objective': 0.0, 'eval_loss': 1.1676169633865356, 'eval_accuracy': 0.0, 'eval_runtime': 31.6168, 'eval_samples_per_second': 6.326, 'eval_steps_per_second': 0.221, 'epoch': 0.04, 'time_this_iter_s': 56.38709497451782, 'should_checkpoint': True, 'done': False, 'timesteps_total': None, 'episodes_total': None, 'training_iteration': 1, 'trial_id': '8fb01_00000', 'experiment_id': '13a71dfaee2e46c2bc06e5d5d51c54d7', 'date': '2023-03-27_20-19-40', 'timestamp': 1679948380, 'time_total_s': 56.38709497451782, 'pid': 52539, 'hostname': 'eb9cee7f9116', 'node_ip': '172.28.0.12', 'time_since_restore': 56.38709497451782, 'timesteps_since_restore': 0, 'iterations_since_restore': 1, 'warmup_time': 0.003461599349975586, 'config/per_device_train_batch_size': 32, 'config/per_device_eval_batch_size': 32, 'config/num_train_epochs': 4, 'config/max_steps': 1, 'config/weight_decay': 0.23896289605806983, 'config/learning_rate': 1.7337391594646555e-05}
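I did try the workaround the error message suggests (the TUNE_DISABLE_STRICT_METRIC_CHECKING line in my script), but I suspect a plain Python variable is not the right way to do it and that it has to be set as an actual environment variable before the search starts, something like this:

import os
os.environ["TUNE_DISABLE_STRICT_METRIC_CHECKING"] = "1"  # not sure this is the right place to set it

Even if that silences the check, I assume the scheduler still would not see the eval_acc metric it is told to maximize, so I would rather understand what I am naming wrong.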
Thanks in advance!