Hi all. I'm relatively new to Ray Tune and ML engineering, and I'm running into the following issue while tuning hyperparameters with tuner.fit(). The first epoch runs fine, but after the second epoch completes and session.report() is called, I hit the error below (a condensed view of the reporting loop follows the traceback).
TypeError Traceback (most recent call last)
File c:\Users\user\miniconda3\envs\fbm_v2\lib\site-packages\ray\tune\tune.py:1072, in run(run_or_experiment, name, metric, mode, stop, time_budget_s, config, resources_per_trial, num_samples, storage_path, search_alg, scheduler, keep_checkpoints_num, checkpoint_score_attr, checkpoint_freq, checkpoint_at_end, checkpoint_keep_all_ranks, checkpoint_upload_from_workers, verbose, progress_reporter, log_to_file, trial_name_creator, trial_dirname_creator, chdir_to_trial_dir, sync_config, export_formats, max_failures, fail_fast, restore, server_port, resume, reuse_actors, raise_on_failed_trial, callbacks, max_concurrent_trials, trial_executor, local_dir, _experiment_checkpoint_dir, _remote, _remote_string_queue, _entrypoint)
1071 if has_verbosity(Verbosity.V1_EXPERIMENT):
-> 1072 _report_progress(runner, progress_reporter)
1074 if air_verbosity is not None:
File c:\Users\user\miniconda3\envs\fbm_v2\lib\site-packages\ray\tune\tune.py:172, in _report_progress(runner, reporter, done)
171 if reporter.should_report(trials, done=done):
--> 172 sched_debug_str = runner.scheduler_alg.debug_string()
173 used_resources_str = runner._used_resources_string()
File c:\Users\user\miniconda3\envs\fbm_v2\lib\site-packages\ray\tune\schedulers\async_hyperband.py:171, in AsyncHyperBandScheduler.debug_string(self)
170 out = "Using AsyncHyperBand: num_stopped={}".format(self._num_stopped)
--> 171 out += "\n" + "\n".join([b.debug_str() for b in self._brackets])
172 return out
File c:\Users\user\miniconda3\envs\fbm_v2\lib\site-packages\ray\tune\schedulers\async_hyperband.py:171, in <listcomp>(.0)
170 out = "Using AsyncHyperBand: num_stopped={}".format(self._num_stopped)
--> 171 out += "\n" + "\n".join([b.debug_str() for b in self._brackets])
172 return out
File c:\Users\user\miniconda3\envs\fbm_v2\lib\site-packages\ray\tune\schedulers\async_hyperband.py:257, in _Bracket.debug_str(self)
254 def debug_str(self) -> str:
255 # TODO: fix up the output for this
...
215 time.monotonic() - times[0][0] < self._actor_force_cleanup_timeout
216 ) and self._actor_manager.is_actor_started(tracked_actor=times[0][1]):
217 # Even if force_all=True, we give the actors time to clean up
TypeError: '<' not supported between instances of 'TrackedActor' and 'TrackedActor'
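For quick reference, the trainable reports the validation loss at the end of every epoch, and the error shows up right after the second epoch's report. Condensed from the full code below, the loop looks roughly like this:

for epoch in range(10):
    training_loop(model, optimiser, train_loader, epoch)
    loss = validate(model, val_loader)       # average validation loss for the epoch
    session.report(metrics={"loss": loss})   # the error above appears after the second epoch's report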
Here is my code as well, for context. Any help would be appreciated. Thanks!
import os

from torchvision import transforms
from filelock import FileLock
from torch.utils.data import DataLoader, random_split

from FBMDataset import FBMDataset


def get_datasets(training_data_dir, cleaned_images_dir):
    transform_list = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406],
                             [0.229, 0.224, 0.225])
    ])
    with FileLock(os.path.expanduser("~/.data.lock")):
        dataset = FBMDataset(training_data_dir, cleaned_images_dir, transform=transform_list)
    # obtain the list of targets
    train_dataset, test_dataset, val_dataset = random_split(dataset, [0.7, 0.15, 0.15])
    return (train_dataset, test_dataset, val_dataset)
import torch
import os
from datetime import datetime

import torch.nn.functional as F
from torch.optim import SGD

from FBMClassifier import FBMClassifier

from ray import tune
from ray.air import Checkpoint, session

from torch.utils.tensorboard import SummaryWriter


def create_model_dir_path():
    parent_dir = 'model_evaluation'
    current_datetime = datetime.now().strftime('%y%m%d%H%M%S')
    child_dir = 'weights'
    path = os.path.join(os.getcwd(), parent_dir, current_datetime, child_dir)
    return path
def training_loop(model, optimiser, train_loader, epoch_num):
    # writer = SummaryWriter()
    batch_id = 0
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)
    # Set the model to run on the device
    model = model.to(device)
    model.train(True)
    print("Model device added")
    for batch in train_loader:
        # get features and labels from the batch
        features, labels = batch
        features = features.to(device)
        labels = labels.to(device)
        # loss.backward() accumulates gradients rather than overwriting them,
        # so reset the .grad of all optimised tensors to zero first
        optimiser.zero_grad()
        # make a prediction
        prediction = model(features)
        # calculate the loss
        criterion = F.cross_entropy(prediction, labels)
        # backward() computes the gradient of the loss w.r.t. the graph leaves
        criterion.backward()
        # print(criterion.item())
        # move each parameter in the opposite direction of its gradient,
        # proportional to the learning rate
        optimiser.step()
        # writer.add_scalar('Loss', criterion.item(), batch_id)
        batch_id += 1
    print("Completed")
def validate(model, val_loader):
    # Set the model to evaluation mode
    model.eval()
    running_vloss = 0.0
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    with torch.no_grad():
        for i, vdata in enumerate(val_loader):
            vinputs, vlabels = vdata
            vinputs = vinputs.to(device)
            vlabels = vlabels.to(device)
            voutputs = model(vinputs)
            vloss = F.cross_entropy(voutputs, vlabels)
            running_vloss += vloss
    avg_vloss = running_vloss / (i + 1)
    return avg_vloss
def train_fbm(config):
    model = FBMClassifier()
    optimiser = SGD(model.parameters(), lr=config["lr"])
    base_dir = "**mybasedir**"
    train_dataset, test_dataset, val_dataset = get_datasets(
        os.path.join(base_dir, "training_data.csv"),
        os.path.join(base_dir, "cleaned_images"))
    train_loader = DataLoader(train_dataset, batch_size=config["batch_size"], shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=config["batch_size"], shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=config["batch_size"], shuffle=True)
    path = create_model_dir_path()
    os.makedirs(path)
    for epoch in range(10):
        print(f"Beginning {epoch} ...")
        training_loop(model, optimiser, train_loader, epoch)
        loss = validate(model, val_loader)
        print(f'Epoch {epoch} - Average Loss: {loss}')
        session.report(metrics={"loss": loss})  # tune.report(mean_accuracy=loss)
        # torch.save(model.state_dict(), path + f'/epoch_{epoch}.pt')
        print(f"Ending {epoch} ...")
from ray.tune.schedulers import ASHAScheduler
from ray.air import RunConfig

config = {
    "lr": tune.loguniform(1e-4, 1e-1),
    "batch_size": tune.choice([2, 4, 8, 16])
}

scheduler = ASHAScheduler(
    max_t=10,
    grace_period=1,
    reduction_factor=2)

tuner = tune.Tuner(
    tune.with_resources(
        tune.with_parameters(train_fbm),
        resources={"cpu": 2, "gpu": 1}
    ),
    tune_config=tune.TuneConfig(
        metric="loss",
        mode="min",
        scheduler=scheduler,
        num_samples=3,
    ),
    param_space=config,
    run_config=RunConfig(verbose=3)
)

results = tuner.fit()

best_result = results.get_best_result("loss", "min")
print("Best trial config: {}".format(best_result.config))
print("Best trial final validation loss: {}".format(
    best_result.metrics["loss"]))
print("Best trial final validation accuracy: {}".format(
best_result.metrics["accuracy"]))