I am trying Ray Tune with Ax. My validation code as follows:
# validation
model.eval()
val_losses = []
with torch.no_grad():
for inputs, targets in val_loader:
inputs, targets = inputs.to(device), targets.to(device)
outputs = model(inputs.permute(0, 2, 1))
loss = loss_fn(outputs, targets)
val_losses.append(loss.item())
# print progress
train_loss = sum(train_losses) / len(train_losses)
val_loss = sum(val_losses) / len(val_losses)
session.report({"val_loss" : val_loss})
return val_loss
ax = AxClient(enforce_sequential_optimization=False)
MINIMIZE = True # Whether we should be minimizing or maximizing the objective
ax.create_experiment(
name="trial_experiment",
parameters = [
{"name": "learning_rate", "type": "range", "bounds": [1e-2, 1e-1], "log_scale": True},
{"name": "batch_size", "type": "choice", "values": [64, 128]}
],
objective_name="val_loss",
minimize=MINIMIZE,
)
ax.experiment.optimization_config.objective.minimize
# Set up AxSearcher in RayTune
algo = AxSearch(ax_client=ax)
algo = tune.search.ConcurrencyLimiter(algo, max_concurrent=3)
tune.run(
tune.with_parameters(train, base_x_train=base_x_train, base_y_train=base_y_train,
base_x_val=base_x_val, base_y_val=base_y_val, base_x_test=base_x_test,
base_y_test=base_y_test, epochs=3, patience=0),
num_samples=5,
search_alg=algo,
verbose=1,
resources_per_trial={"cpu": 2, "gpu": 1},
)
The error is:
Trial returned a result which did not include the specified metric(s) `val_loss` that `SearchGenerator` expects. Make sure your calls to `tune.report()` include the metric, or set the TUNE_DISABLE_STRICT_METRIC_CHECKING environment variable to 1. Result: {'_metric': 0.4429260902106762, 'time_this_iter_s': 0.03015732765197754, 'done': False, 'training_iteration': 4, 'trial_id': '7e5666d1', 'date': '2023-05-24_19-20-06', 'timestamp': 1684956006, 'time_total_s': 31.46916675567627, 'pid': 38774, 'hostname': '35e35a1ebe64', 'node_ip': '172.28.0.12', 'time_since_restore': 31.46916675567627, 'iterations_since_restore': 4, 'config/hidden_layers': 2, 'config/num_neurons': 256, 'config/kernel_size': 3, 'config/dropout': 0.37662695646286015, 'config/learning_rate': 0.024095727162975665, 'config/batch_size': 64}
What mistake am I making here? Any guide please?