I get the following error when trying to run this code block.
2024-02-05 00:40:27,959 ERROR tune_controller.py:1374 – Trial task failed for trial train_cifar_d9b90_00000
Traceback (most recent call last):
File “c:\Users\Aurick\anaconda3\envs\Aurick\Lib\site-packages\ray\air\execution_internal\event_manager.py”, line 110, in resolve_future
result = ray.get(future)
^^^^^^^^^^^^^^^
File “c:\Users\Aurick\anaconda3\envs\Aurick\Lib\site-packages\ray_private\auto_init_hook.py”, line 22, in auto_init_wrapper
return fn(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^
File “c:\Users\Aurick\anaconda3\envs\Aurick\Lib\site-packages\ray_private\client_mode_hook.py”, line 103, in wrapper
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File “c:\Users\Aurick\anaconda3\envs\Aurick\Lib\site-packages\ray_private\worker.py”, line 2624, in get
raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(KeyError): ray::ImplicitFunc.train() (pid=41716, ip=127.0.0.1, actor_id=6b400c177e6b5b3db2a0418601000000, repr=func)
File “python\ray_raylet.pyx”, line 1813, in ray._raylet.execute_task
File “python\ray_raylet.pyx”, line 1754, in ray._raylet.execute_task.function_executor
File “c:\Users\Aurick\anaconda3\envs\Aurick\Lib\site-packages\ray_private\function_manager.py”, line 726, in actor_method_executor
return method(__ray_actor, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File “c:\Users\Aurick\anaconda3\envs\Aurick\Lib\site-packages\ray\util\tracing\tracing_helper.py”, line 467, in _resume_span
return method(self, *_args, **_kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File “c:\Users\Aurick\anaconda3\envs\Aurick\Lib\site-packages\ray\tune\trainable\trainable.py”, line 342, in train
raise skipped from exception_cause(skipped)
File “c:\Users\Aurick\anaconda3\envs\Aurick\Lib\site-packages\ray\air_internal\util.py”, line 88, in run
…
output = fn()
^^^^
File “C:\Users\Aurick\AppData\Local\Temp\ipykernel_22348\2015821930.py”, line 3, in train_cifar
KeyError: ‘dr’
TuneError: (‘Trials did not complete’, [train_cifar_d9b90_00000, train_cifar_d9b90_00001, train_cifar_d9b90_00002, train_cifar_d9b90_00003, train_cifar_d9b90_00004])
The code snippet I’m using is provided below:
def main(num_samples: int = 10, max_num_epochs: int = 10, gpus_per_trial: int = 2) -> None:
    """Run a Ray Tune hyperparameter search and save the best model.

    Samples ``num_samples`` configurations from the search space, trains each
    with ``train_cifar`` under an ASHA scheduler, then rebuilds the network
    from the best trial's config, restores its checkpointed weights and saves
    the model to ``Ray_GaN_Best_NoOpt.pth``.

    Args:
        num_samples: Number of configurations sampled from the search space.
        max_num_epochs: Upper bound on epochs per trial (ASHA ``max_t``); also
            forwarded to ``train_cifar``.
        gpus_per_trial: Number of GPUs reserved for each trial.
    """
    data_dir = resolve_path("My Models")
    # load_data(data_dir)

    # Every key that train_cifar reads from `config` must be defined here;
    # a missing key surfaces as a KeyError inside the trial (the reported
    # failure was KeyError: 'dr').
    config = {
        "l1": tune.choice([2**i for i in range(9)]),
        "l2": tune.choice([2**i for i in range(7)]),
        "l3": tune.choice([2**i for i in range(5)]),
        "lr": tune.loguniform(1e-6, 1e-4),
        "batch_size": tune.choice([4, 8, 16, 32]),
        # NOTE(review): 'dr' is presumably a dropout rate -- confirm against
        # train_cifar and adjust the sampling range to what the model expects.
        "dr": tune.uniform(0.0, 0.5),
    }

    scheduler = ASHAScheduler(
        metric="loss",
        mode="min",
        max_t=max_num_epochs,
        grace_period=1,
        reduction_factor=2,
    )

    result = tune.run(
        partial(train_cifar, data_dir=data_dir, max_num_epochs=max_num_epochs),
        resources_per_trial={"cpu": 18, "gpu": gpus_per_trial},
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
    )

    best_trial = result.get_best_trial("loss", "min", "last")
    print(f"Best trial config: {best_trial.config}")
    print(f"Best trial final validation loss: {best_trial.last_result['loss']}")
    # print(f"Best trial final validation accuracy: {best_trial.last_result['accuracy']}")

    # Rebuild the network with the winning layer sizes before loading weights.
    best_trained_model = NeuralNetwork(
        best_trial.config["l1"],
        best_trial.config["l2"],
        best_trial.config["l3"],
    )
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    # print(f"Using {device} device")
    best_trained_model.to(device)

    checkpoint_path = os.path.join(
        best_trial.checkpoint.to_directory(), "model.checkpoint"
    )
    # The checkpoint holds a (model_state, optimizer_state) pair; only the
    # model weights are needed to restore the trained network.
    model_state, optimizer_state = torch.load(checkpoint_path)
    best_trained_model.load_state_dict(model_state)
    torch.save(best_trained_model, "Ray_GaN_Best_NoOpt.pth")
    # test_acc = test_accuracy(best_trained_model, device)
    # print("Best trial test set accuracy: {}".format(test_acc))
# Run the search only when executed as a script, not when imported.
if __name__ == "__main__":
    # You can change the number of GPUs per trial here:
    main(num_samples=5, max_num_epochs=10, gpus_per_trial=1)
I would greatly appreciate any kind of help. Thanks in advance.