I am facing the following error while tuning with ASHAScheduler:
ValueError: Tracked actor is not managed by this event manager: <TrackedActor 146450608451337861391929918230712088053>
I have changed the code a bit from my original implementation (the old version is commented out at the bottom), but I am still facing the same issue and cannot find the root cause. I have the latest Ray version installed. I am new to Ray and cannot figure out what is causing this. Any help would be appreciated.
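For reference, the installed version can be confirmed with:

python -c "import ray; print(ray.__version__)"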
Here is my code:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

import ray
from ray import tune
from ray.tune import Tuner, TuneConfig, RunConfig  # on older Ray versions RunConfig lives in ray.train / ray.air
from ray.tune.schedulers import ASHAScheduler

ray.shutdown()
ray.init(ignore_reinit_error=True, log_to_driver=False)

def train_ray_tune(config, train_dataset=None, val_dataset=None, use_bn=True, max_epochs=25, device='cpu'):
    # Re-create the data loaders with the batch size from the config
    train_loader_tune = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True, num_workers=8)
    val_loader_tune = DataLoader(val_dataset, batch_size=config['batch_size'], shuffle=False, num_workers=8)
    model_tune = build_model(use_bn=use_bn)
    model_tune.to(device)
    criterion_tune = nn.CrossEntropyLoss()
    optimizer_tune = optim.SGD(model_tune.parameters(), lr=config['lr'], momentum=0.9, weight_decay=5e-4)
    # Fixed step_size for simplicity, tune only gamma
    scheduler_tune = optim.lr_scheduler.StepLR(optimizer_tune, step_size=15, gamma=config['lr_decay'])
    # Track best val_loss for reference (not needed by ASHA directly)
    best_val_loss = float("inf")
    for epoch in range(max_epochs):
        # Training
        model_tune.train()
        train_loss = 0.0
        train_correct = 0
        train_total = 0
        for inputs, labels in train_loader_tune:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer_tune.zero_grad()
            outputs = model_tune(inputs)
            loss = criterion_tune(outputs, labels)
            loss.backward()
            optimizer_tune.step()
            train_loss += loss.item() * inputs.size(0)
            _, preds = torch.max(outputs, 1)
            train_correct += (preds == labels).sum().item()
            train_total += inputs.size(0)
        train_loss /= train_total
        train_acc = train_correct / train_total
        # Validation
        model_tune.eval()
        val_loss = 0.0
        val_correct = 0
        val_total = 0
        with torch.no_grad():
            for inputs, labels in val_loader_tune:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model_tune(inputs)
                v_loss = criterion_tune(outputs, labels)
                val_loss += v_loss.item() * inputs.size(0)
                _, preds = torch.max(outputs, 1)
                val_correct += (preds == labels).sum().item()
                val_total += inputs.size(0)
        val_loss /= val_total
        val_acc = val_correct / val_total
        scheduler_tune.step()
        # Report intermediate results to Ray Tune
        # (newer Ray versions expect a metrics dict: tune.report({"loss": val_loss, "accuracy": val_acc}))
        tune.report(loss=val_loss, accuracy=val_acc)
search_space = {
    "lr": tune.loguniform(1e-4, 1e-2),
    "batch_size": tune.choice([32, 64, 128]),
    "lr_decay": tune.uniform(0.5, 0.99)  # gamma for LR decay
}

scheduler = ASHAScheduler(
    metric="loss",
    mode="min",
    max_t=25,            # maximum epochs
    grace_period=5,      # minimum epochs before pruning
    reduction_factor=2
)

from pathlib import Path

current_dir = Path.cwd()
storage_uri_bn = current_dir.joinpath('ray_results_bn').as_uri()
storage_uri_non_bn = current_dir.joinpath('ray_results_non_bn').as_uri()

tuner_bn = Tuner(
    tune.with_parameters(
        train_ray_tune,
        train_dataset=train_dataset,
        val_dataset=val_dataset,
        use_bn=True,
        max_epochs=25,
        device='cpu'
    ),
    param_space=search_space,
    tune_config=TuneConfig(
        scheduler=scheduler,
        num_samples=15
    ),
    run_config=RunConfig(
        name="bn_model_tuning",
        storage_path=storage_uri_bn,
        stop={"training_iteration": 25}
    ),
)
result_bn = tuner_bn.fit()
best_config_bn = result_bn.get_best_result(metric="loss", mode="min").config
'''
# Previous implementation (kept for reference):
tuner_bn = Tuner(
    tune.with_parameters(train_ray_tune, use_bn=True, max_epochs=25, device='cpu'),
    param_space=search_space,
    tune_config=TuneConfig(
        scheduler=scheduler,
        num_samples=15
    ),
    run_config=RunConfig(
        name="bn_model_tuning",
        storage_path=storage_uri_bn,  # Short path
        stop={"training_iteration": 25}  # Stop after 25 epochs if not pruned
    ),
)
result_bn = tuner_bn.fit()
best_config_bn = result_bn.get_best_result(metric="loss", mode="min").config

# Example for the non-BN model
tuner_non_bn = Tuner(
    tune.with_parameters(train_ray_tune, use_bn=False, max_epochs=25, device='cpu'),
    param_space=search_space,
    tune_config=TuneConfig(
        scheduler=scheduler,
        num_samples=15
    ),
    run_config=RunConfig(
        name="non_bn_model_tuning",
        storage_path=storage_uri_non_bn,  # Short path
        stop={"training_iteration": 25}
    ),
)
result_non_bn = tuner_non_bn.fit()
best_config_non_bn = result_non_bn.get_best_result(metric="loss", mode="min").config
'''
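In case it helps narrow things down, here is a minimal, self-contained sketch of the same Tuner/ASHAScheduler wiring with a dummy trainable; dummy_trainable and its fake loss values are placeholders, not my real model, so this only exercises the Tuner setup, not the training loop:

def dummy_trainable(config):
    # Report a fake, decreasing loss so ASHA has something to schedule on.
    for step in range(10):
        fake_loss = config['lr'] / (step + 1)
        # Newer Ray versions take a metrics dict; older ones used keyword arguments.
        tune.report({"loss": fake_loss})

smoke_tuner = Tuner(
    dummy_trainable,
    param_space={"lr": tune.loguniform(1e-4, 1e-2)},
    tune_config=TuneConfig(
        scheduler=ASHAScheduler(metric="loss", mode="min", max_t=10, grace_period=2),
        num_samples=4,
    ),
)
smoke_results = smoke_tuner.fit()
print(smoke_results.get_best_result(metric="loss", mode="min").config)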