Hello there,
I received the error message “ray.tune.error.TuneError: Trial with unexpected good status encountered: PENDING” after my BOHB script had been running fine for more than an hour. Here is some reduced code showing the main parts of the project:
imports ...
class TrainableNN(Trainable):
    def setup(self, config, args, ds, ds_info):
        self.timestep = 0
        self.config = config
        self.args = args
        self.train_ds, self.val_ds = split_datasets(args, ds)

        torch.manual_seed(args.random_seed)
        if args.model.name == 'resnet18_deterministic':
            self.model = deterministic.resnet.ResNet18(ds_info['n_classes'])
            self.criterion = nn.CrossEntropyLoss()
            self.optimizer = torch.optim.SGD(
                self.model.parameters(),
                lr=config['learning_rate'],
                weight_decay=config['weight_decay'],
                momentum=args.model.optimizer.momentum,
                nesterov=True,
            )
            self.lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(self.optimizer, T_max=args.model.n_epochs)

        self.device = args.device
        self.train_loader = torch.utils.data.DataLoader(self.train_ds, batch_size=config['batch_size'], shuffle=True, drop_last=True)
        self.val_loader = torch.utils.data.DataLoader(self.val_ds, batch_size=64, shuffle=False, drop_last=False)

    def step(self):
        # One Tune iteration = args.step_size training epochs followed by one evaluation
        for i in range(self.args.step_size):
            _ = self.train_one_epoch(self.train_loader)
            self.lr_scheduler.step()
            self.timestep += 1
        val_stats = self.evaluate(self.val_loader)
        return val_stats

    def train_one_epoch(self, dataloader, epoch=None, print_freq=200):
        # Just trains one epoch in a classical PyTorch manner
        ...

    @torch.no_grad()
    def evaluate(self, dataloader, dataloaders_ood=None):
        # Just evaluates the current model's performance
        ...

    @torch.no_grad()
    def collect_predictions(self, dataloader):
        all_logits = []
        all_targets = []
        for inputs, targets in dataloader:
            inputs = inputs.to(self.device)
            all_logits.append(self.model(inputs).cpu())
            all_targets.append(targets)
        logits = torch.cat(all_logits, dim=0)
        targets = torch.cat(all_targets, dim=0)
        return logits, targets

    def save_checkpoint(self, checkpoint_dir):
        checkpoint_path = os.path.join(checkpoint_dir, "model.pth")
        torch.save(self.model.state_dict(), checkpoint_path)
        return checkpoint_path

    def load_checkpoint(self, checkpoint_path):
        self.model.load_state_dict(torch.load(checkpoint_path))

    @classmethod
    def default_resource_request(cls, config):
        # Each trial requests 4 CPUs and half a GPU, i.e. up to 8 concurrent trials on 4 GPUs
        return PlacementGroupFactory([{"CPU": 4, "GPU": 0.5}])
@hydra.main(version_base=None, config_path="./configs", config_name="hparam_search")
def main(args):
    logger = logging.getLogger()
    logger.info('Using setup: %s', args)

    n_iterations = args.model.n_epochs // args.step_size

    # Init Ray; if we are running under slurm, set cpus and gpus accordingly
    address = 'auto' if args.distributed else None
    num_cpus = int(os.environ.get('SLURM_CPUS_PER_TASK', args.cpus_per_trial))
    num_gpus = torch.cuda.device_count()
    ray.init(address=address, num_cpus=num_cpus, num_gpus=num_gpus)

    search_space, points_to_evaluate = build_search_space(args)

    bohb_search = TuneBOHB(
        points_to_evaluate=points_to_evaluate,
        metric="test_acc1",
        mode="max"
    )
    bohb_search = tune.search.ConcurrencyLimiter(bohb_search, max_concurrent=args.max_concurrent)

    bohb_hyperband = HyperBandForBOHB(
        time_attr="training_iteration",
        max_t=n_iterations,
        reduction_factor=args.reduction_factor,
        stop_last_trials=False,
    )

    # Build the dataset once so it can be passed to the trials via the Ray object store
    ds, ds_info = datasets.cifar.build_cifar10('train', args.dataset_path, return_info=True)

    tuner = tune.Tuner(
        tune.with_parameters(TrainableNN, args=args, ds=ds, ds_info=ds_info),
        run_config=air.RunConfig(
            stop={
                "training_iteration": n_iterations,
            },
        ),
        tune_config=tune.TuneConfig(
            search_alg=bohb_search,
            scheduler=bohb_hyperband,
            num_samples=args.n_configs,
            metric="test_acc1",
            mode="max"
        ),
        param_space=search_space,
    )
    results = tuner.fit()

    print('Best NLL Stats: {}'.format(results.get_best_result().metrics))
    print('Best NLL Hyperparameter: {}'.format(results.get_best_result().config))
    print('Best Acc Hyperparameter: {}'.format(results.get_best_result(metric="test_acc1", mode="max").config))


def build_search_space(args):
    points_to_evaluate = None
    if args.model.name == 'resnet18_deterministic':
        search_space = {
            "learning_rate": tune.uniform(0, .1),
            "weight_decay": tune.uniform(0, .1),
            "batch_size": tune.choice([32, 64, 128])
        }
        points_to_evaluate = [
            {"learning_rate": 1e-1, "weight_decay": 5e-4, "batch_size": 64},
            {"learning_rate": 1e-2, "weight_decay": 5e-4, "batch_size": 64}
        ]
    else:
        raise NotImplementedError('Model {} not implemented.'.format(args.model.name))
    return search_space, points_to_evaluate


if __name__ == '__main__':
    main()
Here is the resulting stack trace:
Traceback (most recent call last):
File "/mnt/stud/home/phahn/.conda/envs/uncertainty_evaluation/lib/python3.9/site-packages/ray/tune/tuner.py", line 367, in fit
return self._local_tuner.fit()
File "/mnt/stud/home/phahn/.conda/envs/uncertainty_evaluation/lib/python3.9/site-packages/ray/tune/impl/tuner_internal.py", line 503, in fit
analysis = self._fit_internal(trainable, param_space)
File "/mnt/stud/home/phahn/.conda/envs/uncertainty_evaluation/lib/python3.9/site-packages/ray/tune/impl/tuner_internal.py", line 621, in _fit_internal
analysis = run(
File "/mnt/stud/home/phahn/.conda/envs/uncertainty_evaluation/lib/python3.9/site-packages/ray/tune/tune.py", line 904, in run
runner.step()
File "/mnt/stud/home/phahn/.conda/envs/uncertainty_evaluation/lib/python3.9/site-packages/ray/tune/execution/trial_runner.py", line 1342, in step
self._wait_and_handle_event(next_trial)
File "/mnt/stud/home/phahn/.conda/envs/uncertainty_evaluation/lib/python3.9/site-packages/ray/tune/execution/trial_runner.py", line 1411, in _wait_and_handle_event
raise TuneError(traceback.format_exc())
ray.tune.error.TuneError: Traceback (most recent call last):
File "/mnt/stud/home/phahn/.conda/envs/uncertainty_evaluation/lib/python3.9/site-packages/ray/tune/execution/trial_runner.py", line 1400, in _wait_and_handle_event
self._on_training_result(
File "/mnt/stud/home/phahn/.conda/envs/uncertainty_evaluation/lib/python3.9/site-packages/ray/tune/execution/trial_runner.py", line 694, in _on_training_result
self._process_trial_results(trial, result)
File "/mnt/stud/home/phahn/.conda/envs/uncertainty_evaluation/lib/python3.9/site-packages/ray/tune/execution/trial_runner.py", line 707, in _process_trial_results
decision = self._process_trial_result(trial, result)
File "/mnt/stud/home/phahn/.conda/envs/uncertainty_evaluation/lib/python3.9/site-packages/ray/tune/execution/trial_runner.py", line 750, in _process_trial_result
decision = self._scheduler_alg.on_trial_result(
File "/mnt/stud/home/phahn/.conda/envs/uncertainty_evaluation/lib/python3.9/site-packages/ray/tune/schedulers/hb_bohb.py", line 114, in on_trial_result
action = self._process_bracket(trial_runner, bracket)
File "/mnt/stud/home/phahn/.conda/envs/uncertainty_evaluation/lib/python3.9/site-packages/ray/tune/schedulers/hyperband.py", line 270, in _process_bracket
raise TuneError(
ray.tune.error.TuneError: Trial with unexpected good status encountered: PENDING
I was trying to run this code on a server with 4 GPUs and 40 CPUs to get 8 parallel trials, plus some spare CPU capacity if needed. Is there anything I am missing, or is this a bug on Ray's side?
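For context, the resource arithmetic behind the 8 parallel runs looks roughly like this (a minimal sketch, not part of the actual script; the per-trial numbers come from default_resource_request above and the totals from my node, as also reflected in the status output below):

# Per-trial request from default_resource_request: {"CPU": 4, "GPU": 0.5}.
# The GPU share is the binding constraint on this node.
total_cpus, total_gpus = 40, 4
cpus_per_trial, gpus_per_trial = 4, 0.5

max_parallel = int(total_gpus / gpus_per_trial)  # 8 trials at once
cpus_in_use = max_parallel * cpus_per_trial      # 32 of the 40 CPUs
spare_cpus = total_cpus - cpus_in_use            # 8 CPUs left for Ray/Tune overhead
print(max_parallel, cpus_in_use, spare_cpus)     # -> 8 32 8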
Additional Note:
Here is the last status update I received before the error occurred:
== Status ==
Current time: 2023-05-04 13:12:05 (running for 02:48:57.77)
Using HyperBand: num_stopped=26 total_brackets=4
Round #0:
Bracket(Max Size (n)=1, Milestone (r)=12, completed=56.2%): {RUNNING: 1, TERMINATED: 15}
Bracket(Max Size (n)=2, Milestone (r)=12, completed=49.0%): {RUNNING: 2, TERMINATED: 8}
Bracket(Max Size (n)=4, Milestone (r)=10, completed=37.1%): {RUNNING: 4, TERMINATED: 3}
Bracket(Max Size (n)=5, Milestone (r)=10, completed=61.3%): {PAUSED: 4, RUNNING: 1}
Logical resource usage: 32.0/40 CPUs, 4.0/4 GPUs (0.0/1.0 accelerator_type:V100)
Current best trial: 1e952b2e with test_acc1=92.30999755859375 and parameters={'learning_rate': 0.007364477289920681, 'weight_decay': 0.00026112245478438644, 'batch_size': 64}
Result logdir: /mnt/stud/home/phahn/ray_results/TrainableNN_2023-05-04_10-23-06
Number of trials: 38/100 (4 PAUSED, 8 RUNNING, 26 TERMINATED)
+----------------------+------------+---------------------+--------------+-----------------+----------------+--------+------------------+-------------+-------------+------------+
| Trial name | status | loc | batch_size | learning_rate | weight_decay | iter | total time (s) | test_loss | test_acc1 | test_nll |
|----------------------+------------+---------------------+--------------+-----------------+----------------+--------+------------------+-------------+-------------+------------|
| TrainableNN_271901b2 | RUNNING | 141.51.131.93:24944 | 64 | 0.01 | 0.0005 | 13 | 7455.04 | 0.311992 | 91.35 | 0.311992 |
| TrainableNN_27b322fa | RUNNING | 141.51.131.93:39254 | 128 | 0.0079678 | 0.00207703 | 11 | 3904.6 | 0.350977 | 89.23 | 0.350977 |
| TrainableNN_33aa0129 | RUNNING | 141.51.131.93:17092 | 64 | 0.0116685 | 0.00279521 | 5 | 1793.52 | 0.507012 | 82.63 | 0.507012 |
| TrainableNN_51b3eb29 | RUNNING | 141.51.131.93:30516 | 64 | 0.000517159 | 0.0012137 | 10 | 5734 | 0.393945 | 89.68 | 0.393945 |
| TrainableNN_53b90488 | RUNNING | 141.51.131.93:28080 | 64 | 0.00197385 | 0.00236213 | 5 | 1795.03 | 0.328827 | 89.65 | 0.328827 |
| TrainableNN_7376e736 | RUNNING | 141.51.131.93:37382 | 64 | 0.0342845 | 0.00132928 | 8 | 5478.45 | 0.65188 | 79.13 | 0.65188 |
| TrainableNN_7aef5d6d | RUNNING | 141.51.131.93:42069 | 64 | 0.0095709 | 0.00216552 | 6 | 2146.83 | 0.451911 | 84.87 | 0.451911 |
| TrainableNN_fff989ea | RUNNING | 141.51.131.93:32824 | 128 | 0.0269434 | 0.0249693 | 9 | 3156.96 | 1.49307 | 49.31 | 1.49307 |
| TrainableNN_039f9d2e | PAUSED | 141.51.131.93:34033 | 32 | 0.0103221 | 0.0833994 | 10 | 3221.64 | 2.30294 | 9.6 | 2.30294 |
| TrainableNN_1e952b2e | PAUSED | 141.51.131.93:29038 | 64 | 0.00736448 | 0.000261122 | 10 | 3735.09 | 0.327314 | 92.31 | 0.327314 |
| TrainableNN_ac0224fa | PAUSED | 141.51.131.93:31947 | 32 | 0.000788363 | 0.0581641 | 10 | 3755.57 | 0.683219 | 80.67 | 0.683219 |
| TrainableNN_e41809c7 | PAUSED | 141.51.131.93:32497 | 32 | 0.0901537 | 0.00247578 | 10 | 3213.99 | 1.39476 | 54.68 | 1.39476 |
| TrainableNN_17b308f3 | TERMINATED | 141.51.131.93:15765 | 128 | 0.0550693 | 0.0771374 | 2 | 1196.65 | 2.30409 | 9.86 | 2.30409 |
| TrainableNN_24cc2be6 | TERMINATED | 141.51.131.93:9284 | 64 | 0.0554148 | 0.0373097 | 1 | 353.842 | 2.07382 | 20.12 | 2.07382 |
| TrainableNN_24ec6a67 | TERMINATED | 141.51.131.93:17330 | 64 | 0.00109714 | 0.0757342 | 4 | 1467.55 | 0.843984 | 77.18 | 0.843984 |
| TrainableNN_25200616 | TERMINATED | 141.51.131.93:27100 | 64 | 0.065716 | 0.0965496 | 5 | 1805.84 | 2.30316 | 9.94 | 2.30316 |
| TrainableNN_34b86efe | TERMINATED | 141.51.131.93:27872 | 64 | 0.080713 | 0.0332946 | 5 | 3375.9 | 2.30384 | 9.94 | 2.30384 |
| TrainableNN_38d09b83 | TERMINATED | 141.51.131.93:11218 | 64 | 0.00253526 | 0.0677027 | 2 | 1409.73 | 1.01034 | 70.44 | 1.01034 |
| TrainableNN_43b48ec7 | TERMINATED | 141.51.131.93:17548 | 64 | 0.1 | 0.0005 | 4 | 2525.76 | 0.607998 | 79.21 | 0.607998 |
| TrainableNN_48df4487 | TERMINATED | 141.51.131.93:10732 | 32 | 0.0898256 | 0.0337691 | 1 | 343.068 | 2.3049 | 9.31 | 2.3049 |
+----------------------+------------+---------------------+--------------+-----------------+----------------+--------+------------------+-------------+-------------+------------+
... 18 more trials not shown (18 TERMINATED)