TypeError: '<' not supported between instances of 'TrackedActor' and 'TrackedActor'

Hi all. I'm relatively new to Ray Tune and ML engineering, and I'm running into the following issue when trying to tune hyperparameters with tuner.fit(). The first epoch runs fine, but when the second epoch completes and session.report() is called, I get the error below.

TypeError                                 Traceback (most recent call last)
File c:\Users\user\miniconda3\envs\fbm_v2\lib\site-packages\ray\tune\tune.py:1072, in run(run_or_experiment, name, metric, mode, stop, time_budget_s, config, resources_per_trial, num_samples, storage_path, search_alg, scheduler, keep_checkpoints_num, checkpoint_score_attr, checkpoint_freq, checkpoint_at_end, checkpoint_keep_all_ranks, checkpoint_upload_from_workers, verbose, progress_reporter, log_to_file, trial_name_creator, trial_dirname_creator, chdir_to_trial_dir, sync_config, export_formats, max_failures, fail_fast, restore, server_port, resume, reuse_actors, raise_on_failed_trial, callbacks, max_concurrent_trials, trial_executor, local_dir, _experiment_checkpoint_dir, _remote, _remote_string_queue, _entrypoint)
   1071 if has_verbosity(Verbosity.V1_EXPERIMENT):
-> 1072     _report_progress(runner, progress_reporter)
   1074 if air_verbosity is not None:

File c:\Users\user\miniconda3\envs\fbm_v2\lib\site-packages\ray\tune\tune.py:172, in _report_progress(runner, reporter, done)
    171 if reporter.should_report(trials, done=done):
--> 172     sched_debug_str = runner.scheduler_alg.debug_string()
    173     used_resources_str = runner._used_resources_string()

File c:\Users\user\miniconda3\envs\fbm_v2\lib\site-packages\ray\tune\schedulers\async_hyperband.py:171, in AsyncHyperBandScheduler.debug_string(self)
    170 out = "Using AsyncHyperBand: num_stopped={}".format(self._num_stopped)
--> 171 out += "\n" + "\n".join([b.debug_str() for b in self._brackets])
    172 return out

File c:\Users\user\miniconda3\envs\fbm_v2\lib\site-packages\ray\tune\schedulers\async_hyperband.py:171, in <listcomp>(.0)
    170 out = "Using AsyncHyperBand: num_stopped={}".format(self._num_stopped)
--> 171 out += "\n" + "\n".join([b.debug_str() for b in self._brackets])
    172 return out

File c:\Users\user\miniconda3\envs\fbm_v2\lib\site-packages\ray\tune\schedulers\async_hyperband.py:257, in _Bracket.debug_str(self)
    254 def debug_str(self) -> str:
    255     # TODO: fix up the output for this
...
    215         time.monotonic() - times[0][0] < self._actor_force_cleanup_timeout
    216     ) and self._actor_manager.is_actor_started(tracked_actor=times[0][1]):
    217         # Even if force_all=True, we give the actors time to clean up

TypeError: '<' not supported between instances of 'TrackedActor' and 'TrackedActor'

Here is my code as well, for context. Any help would be appreciated. Thanks!

import os

from torchvision import transforms
from filelock import FileLock
from torch.utils.data import DataLoader, random_split
from FBMDataset import FBMDataset

def get_datasets(training_data_dir, cleaned_images_dir):
    transform_list = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406],
                             [0.229, 0.224, 0.225]),
    ])
    with FileLock(os.path.expanduser("~/.data.lock")):
        dataset = FBMDataset(training_data_dir, cleaned_images_dir, transform=transform_list)
        # split the dataset into train/test/validation subsets
        train_dataset, test_dataset, val_dataset = random_split(dataset, [0.7, 0.15, 0.15])
    return (train_dataset, test_dataset, val_dataset)

import torch
import os
from datetime import datetime
import torch.nn.functional as F
from torch.optim import SGD
from FBMClassifier import FBMClassifier
from ray import tune
from ray.air import Checkpoint, session
from torch.utils.tensorboard import SummaryWriter

def create_model_dir_path():
    parent_dir = 'model_evaluation'
    current_datetime = datetime.now().strftime('%y%m%d%H%M%S')
    child_dir = 'weights'
    path = os.path.join(os.getcwd(), parent_dir, current_datetime, child_dir)
    return path

def training_loop(model, optimiser, train_loader, epoch_num):
    # writer = SummaryWriter()
    batch_id = 0
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)
    # Set the model to run on the device
    model = model.to(device)
    model.train(True)
    print("Model device added")
    for batch in train_loader:
        # get features and labels from the batch
        features, labels = batch
        features = features.to(device)
        labels = labels.to(device)
        # loss.backward does not overwrite gradients, it accumulates them,
        # so reset the .grad of all optimised tensors to zero first
        optimiser.zero_grad()
        # make a prediction
        prediction = model(features)
        # calculate loss
        criterion = F.cross_entropy(prediction, labels)
        # backward computes the gradient of the loss w.r.t. the graph leaves
        criterion.backward()
        # print(criterion.item())
        # move each parameter in the opposite direction of its gradient, proportional to the learning rate
        optimiser.step()
        # writer.add_scalar('Loss', criterion.item(), batch_id)
        batch_id += 1
    print("Completed")

def validate(model, val_loader):
    # Set the model to evaluation mode
    model.eval()
    running_vloss = 0.0
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    with torch.no_grad():
        for i, vdata in enumerate(val_loader):
            vinputs, vlabels = vdata                
            vinputs = vinputs.to(device)
            vlabels = vlabels.to(device)
            
            voutputs = model(vinputs)
            vloss = F.cross_entropy(voutputs, vlabels)
            running_vloss += vloss

    avg_vloss = running_vloss / (i + 1)
    return avg_vloss

def train_fbm(config):
    model = FBMClassifier()
    optimiser = SGD(model.parameters(), lr=config["lr"])
    base_dir = "**mybasedir**"
    train_dataset, test_dataset, val_dataset = get_datasets(
        os.path.join(base_dir, "training_data.csv"),
        os.path.join(base_dir, "cleaned_images"),
    )

    train_loader = DataLoader(train_dataset, batch_size=config["batch_size"], shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=config["batch_size"], shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=config["batch_size"], shuffle=True)

    path = create_model_dir_path()
    os.makedirs(path)

    for epoch in range(10):
        print(f"Beginning {epoch} ...")
        training_loop(model, optimiser, train_loader, epoch)
        loss = validate(model, val_loader)
        print(f'Epoch {epoch} - Average Loss: {loss}')
        session.report(metrics={"loss": loss})  # tune.report(mean_accuracy=loss)
        # torch.save(model.state_dict(), path + f'/epoch_{epoch}.pt')
        print(f"Ending {epoch} ...")

from ray.tune.schedulers import ASHAScheduler
from ray.air import RunConfig

config = {
    "lr": tune.loguniform(1e-4, 1e-1),
    "batch_size": tune.choice([2, 4, 8, 16]),
}
scheduler = ASHAScheduler(
    max_t=10,
    grace_period=1,
    reduction_factor=2,
)
tuner = tune.Tuner(
    tune.with_resources(
        tune.with_parameters(train_fbm),
        resources={"cpu": 2, "gpu": 1},
    ),
    tune_config=tune.TuneConfig(
        metric="loss",
        mode="min",
        scheduler=scheduler,
        num_samples=3,
    ),
    param_space=config,
    run_config=RunConfig(verbose=3),
)
results = tuner.fit()
best_result = results.get_best_result("loss", "min")
print("Best trial config: {}".format(best_result.config))
print("Best trial final validation loss: {}".format(best_result.metrics["loss"]))

Hi @PDDhillon and thanks for the issue.

This was a bug that has been fixed on master. The fix will be included in the upcoming Ray 2.6.0 release. Until then, you can either install the latest nightly wheels or patch the tune controller yourself to avoid the issue.
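For what it's worth, the error itself is a generic Python pattern rather than anything specific to your training code. The truncated frames in the traceback (times[0][0] / times[0][1]) suggest the controller keeps (timestamp, TrackedActor) tuples in an ordered structure such as a heap; when two timestamps tie, tuple comparison falls through to the TrackedActor objects, which define no ordering, and Python raises the TypeError you saw. Below is a minimal, self-contained sketch of that failure mode and the usual counter-based tiebreaker. TrackedActorStub and the heap layout are assumptions for illustration only, not Ray's actual internals.

import heapq
import itertools
import time

class TrackedActorStub:
    """Stand-in for an object that defines no ordering, like the TrackedActor in the traceback."""
    pass

now = time.monotonic()

# Failing pattern: tuples compare element by element, so when two timestamps
# are equal the comparison falls through to the second element and raises
# TypeError: '<' not supported between instances of 'TrackedActorStub' and 'TrackedActorStub'
# heap = []
# heapq.heappush(heap, (now, TrackedActorStub()))
# heapq.heappush(heap, (now, TrackedActorStub()))

# Usual fix: insert a unique, monotonically increasing counter as a tiebreaker
# so the comparison never reaches the un-orderable object.
heap = []
counter = itertools.count()
heapq.heappush(heap, (now, next(counter), TrackedActorStub()))
heapq.heappush(heap, (now, next(counter), TrackedActorStub()))
timestamp, _, actor = heap[0]
print(timestamp, actor)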

The relevant section is here:
