I am facing the following error while tuning with ASHAScheduler:
ValueError: Tracked actor is not managed by this event manager: <TrackedActor 146450608451337861391929918230712088053>
I have changed the code a bit from my original implementation (the old version is commented out at the bottom), but I am still facing the same issue and cannot find the root cause. I have the latest Ray version installed. I am new to Ray and cannot figure out what is causing this. Any help would be appreciated.
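For reference, the installed version can be confirmed with:

python -c "import ray; print(ray.__version__)"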
Here is my code:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

import ray
from ray import tune
from ray.tune import Tuner, TuneConfig, RunConfig  # on older Ray versions RunConfig lives in ray.train / ray.air
from ray.tune.schedulers import ASHAScheduler

ray.shutdown()
ray.init(ignore_reinit_error=True, log_to_driver=False)

def train_ray_tune(config, train_dataset=None, val_dataset=None, use_bn=True, max_epochs=25, device='cpu'):
    # Re-create the data loaders with the batch size from the config
    train_loader_tune = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True, num_workers=8)
    val_loader_tune = DataLoader(val_dataset, batch_size=config['batch_size'], shuffle=False, num_workers=8)
    model_tune = build_model(use_bn=use_bn)
    model_tune.to(device)
    criterion_tune = nn.CrossEntropyLoss()
    optimizer_tune = optim.SGD(model_tune.parameters(), lr=config['lr'], momentum=0.9, weight_decay=5e-4)
    # Fixed step_size for simplicity, tune only gamma
    scheduler_tune = optim.lr_scheduler.StepLR(optimizer_tune, step_size=15, gamma=config['lr_decay'])
    # Track best val_loss for reference (not needed by ASHA directly)
    best_val_loss = float("inf")
    for epoch in range(max_epochs):
        # Training
        model_tune.train()
        train_loss = 0.0
        train_correct = 0
        train_total = 0
        for inputs, labels in train_loader_tune:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer_tune.zero_grad()
            outputs = model_tune(inputs)
            loss = criterion_tune(outputs, labels)
            loss.backward()
            optimizer_tune.step()
            train_loss += loss.item() * inputs.size(0)
            _, preds = torch.max(outputs, 1)
            train_correct += (preds == labels).sum().item()
            train_total += inputs.size(0)
        train_loss /= train_total
        train_acc = train_correct / train_total
        # Validation
        model_tune.eval()
        val_loss = 0.0
        val_correct = 0
        val_total = 0
        with torch.no_grad():
            for inputs, labels in val_loader_tune:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model_tune(inputs)
                v_loss = criterion_tune(outputs, labels)
                val_loss += v_loss.item() * inputs.size(0)
                _, preds = torch.max(outputs, 1)
                val_correct += (preds == labels).sum().item()
                val_total += inputs.size(0)
        val_loss /= val_total
        val_acc = val_correct / val_total
        scheduler_tune.step()
        # Report intermediate results to Ray Tune
        # (newer Ray versions expect a metrics dict: tune.report({"loss": val_loss, "accuracy": val_acc}))
        tune.report(loss=val_loss, accuracy=val_acc)
search_space = {
    "lr": tune.loguniform(1e-4, 1e-2),
    "batch_size": tune.choice([32, 64, 128]),
    "lr_decay": tune.uniform(0.5, 0.99)  # gamma for LR decay
}

scheduler = ASHAScheduler(
    metric="loss",
    mode="min",
    max_t=25,            # maximum epochs
    grace_period=5,      # minimum epochs before pruning
    reduction_factor=2
)

from pathlib import Path

current_dir = Path.cwd()
storage_uri_bn = current_dir.joinpath('ray_results_bn').as_uri()
storage_uri_non_bn = current_dir.joinpath('ray_results_non_bn').as_uri()

tuner_bn = Tuner(
    tune.with_parameters(
        train_ray_tune,
        train_dataset=train_dataset,
        val_dataset=val_dataset,
        use_bn=True,
        max_epochs=25,
        device='cpu'
    ),
    param_space=search_space,
    tune_config=TuneConfig(
        scheduler=scheduler,
        num_samples=15
    ),
    run_config=RunConfig(
        name="bn_model_tuning",
        storage_path=storage_uri_bn,
        stop={"training_iteration": 25}
    ),
)
result_bn = tuner_bn.fit()
best_config_bn = result_bn.get_best_result(metric="loss", mode="min").config
'''
# Previous implementation (kept for reference):
tuner_bn = Tuner(
    tune.with_parameters(train_ray_tune, use_bn=True, max_epochs=25, device='cpu'),
    param_space=search_space,
    tune_config=TuneConfig(
        scheduler=scheduler,
        num_samples=15
    ),
    run_config=RunConfig(
        name="bn_model_tuning",
        storage_path=storage_uri_bn,  # Short path
        stop={"training_iteration": 25}  # Stop after 25 epochs if not pruned
    ),
)
result_bn = tuner_bn.fit()
best_config_bn = result_bn.get_best_result(metric="loss", mode="min").config

# Example for the non-BN model
tuner_non_bn = Tuner(
    tune.with_parameters(train_ray_tune, use_bn=False, max_epochs=25, device='cpu'),
    param_space=search_space,
    tune_config=TuneConfig(
        scheduler=scheduler,
        num_samples=15
    ),
    run_config=RunConfig(
        name="non_bn_model_tuning",
        storage_path=storage_uri_non_bn,  # Short path
        stop={"training_iteration": 25}
    ),
)
result_non_bn = tuner_non_bn.fit()
best_config_non_bn = result_non_bn.get_best_result(metric="loss", mode="min").config
'''
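In case it helps narrow things down, here is a minimal, self-contained sketch of the same Tuner/ASHAScheduler wiring with a dummy trainable; dummy_trainable and its fake loss values are placeholders, not my real model, so this only exercises the Tuner setup, not the training loop:

def dummy_trainable(config):
    # Report a fake, decreasing loss so ASHA has something to schedule on.
    for step in range(10):
        fake_loss = config['lr'] / (step + 1)
        # Newer Ray versions take a metrics dict; older ones used keyword arguments.
        tune.report({"loss": fake_loss})

smoke_tuner = Tuner(
    dummy_trainable,
    param_space={"lr": tune.loguniform(1e-4, 1e-2)},
    tune_config=TuneConfig(
        scheduler=ASHAScheduler(metric="loss", mode="min", max_t=10, grace_period=2),
        num_samples=4,
    ),
)
smoke_results = smoke_tuner.fit()
print(smoke_results.get_best_result(metric="loss", mode="min").config)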