Here is an issue that occurs with the search algorithms HEBOSearch, AxSearch, BayesSearch, and BayesOptSearch; however, HyperOptSearch runs without any issues.
(1) Finish the first tuning run.
(2) Continue tuning on top of the first finished run using
algorithm.restore_from_dir
Step (1) finishes successfully, but step (2) fails with the following error:
File ~/local_pkgs/anaconda3/envs/brian2/lib/python3.10/site-packages/ray/tune/execution/insufficient_resources_manager.py:161, in _InsufficientResourcesManager.on_no_available_trials(self, all_trials)
157 return
159 # Otherwise, can fulfill none
160 msg = _get_insufficient_resources_warning_msg(
--> 161 for_train=self._for_train, trial=all_trials[0]
162 )
163 logger.warning(msg)
164 self._no_running_trials_since = time.monotonic()
IndexError: list index out of range
(END)
It seems the algorithm runs out of trials, but this error does not happen with HyperOptSearch. Any hints would be appreciated. The complete error message follows:
---- ASHAScheduler is used as the scheduler
--- using HEBOSearch ---
--- previous run exist, continue the tuning ----
... contine tuning with more samples .....
╭──────────────────────────────────────────────────────────╮
│ Configuration for experiment brian2 │
├──────────────────────────────────────────────────────────┤
│ Search algorithm SearchGenerator │
│ Scheduler AsyncHyperBandScheduler │
│ Number of trials 100 │
╰──────────────────────────────────────────────────────────╯
View detailed results here: /scratch/bell/wxie/1_hlayer/ray_log/brian2
To visualize your results with TensorBoard, run: `tensorboard --logdir /scratch/bell/wxie/1_hlayer/ray_log/brian2`
Trial status:
Current time: 2023-08-06 09:10:32. Total running time: 10min 0s
Logical resource usage: 0/300 CPUs, 0/0 GPUs
╭─────────────────────────╮
│ Trial name status │
├─────────────────────────┤
╰─────────────────────────╯
Trial status:
Current time: 2023-08-06 09:20:33. Total running time: 20min 1s
Logical resource usage: 0/300 CPUs, 0/0 GPUs
╭─────────────────────────╮
│ Trial name status │
├─────────────────────────┤
╰─────────────────────────╯
---------------------------------------------------------------------------
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
File ~/Brian2/hyper_param_tune/run_ray_tune.py:202
199 print('---- ASHAScheduler is used as the scheduler')
201 # run ray tune
--> 202 run_tune(args.n_hlayer, args.time_budget_s, args.run_option, scheduler, args.search_algo, args.debug)
File ~/Brian2/hyper_param_tune/run_ray_tune.py:145, in run_tune(n_hlayer, time_budget_s, resume_option, scheduler, algo_option, debug)
140 restart_errored = (resume_option=="restart_errored")
141 tuner = tune.Tuner.restore(
142 os.path.join(raw_log_dir, raw_log_name),
143 resume_errored, restart_errored)
--> 145 results = tuner.fit()
146 best_result = results.get_best_result(metric="eff_validation", mode="max")
147 print('----- best result information ------')
File ~/local_pkgs/anaconda3/envs/brian2/lib/python3.10/site-packages/ray/tune/tuner.py:347, in Tuner.fit(self)
345 if not self._is_ray_client:
346 try:
--> 347 return self._local_tuner.fit()
348 except TuneError as e:
349 raise TuneError(
350 _TUNER_FAILED_MSG.format(
351 path=self._local_tuner.get_experiment_checkpoint_dir()
352 )
353 ) from e
File ~/local_pkgs/anaconda3/envs/brian2/lib/python3.10/site-packages/ray/tune/impl/tuner_internal.py:588, in TunerInternal.fit(self)
586 param_space = copy.deepcopy(self.param_space)
587 if not self._is_restored:
--> 588 analysis = self._fit_internal(trainable, param_space)
589 else:
590 analysis = self._fit_resume(trainable, param_space)
File ~/local_pkgs/anaconda3/envs/brian2/lib/python3.10/site-packages/ray/tune/impl/tuner_internal.py:703, in TunerInternal._fit_internal(self, trainable, param_space)
689 """Fitting for a fresh Tuner."""
690 args = {
691 **self._get_tune_run_arguments(trainable),
692 **dict(
(...)
701 **self._tuner_kwargs,
702 }
--> 703 analysis = run(
704 **args,
705 )
706 self.clear_remote_string_queue()
707 return analysis
File ~/local_pkgs/anaconda3/envs/brian2/lib/python3.10/site-packages/ray/tune/tune.py:1107, in run(run_or_experiment, name, metric, mode, stop, time_budget_s, config, resources_per_trial, num_samples, storage_path, search_alg, scheduler, checkpoint_config, verbose, progress_reporter, log_to_file, trial_name_creator, trial_dirname_creator, chdir_to_trial_dir, sync_config, export_formats, max_failures, fail_fast, restore, server_port, resume, reuse_actors, raise_on_failed_trial, callbacks, max_concurrent_trials, keep_checkpoints_num, checkpoint_score_attr, checkpoint_freq, checkpoint_at_end, checkpoint_keep_all_ranks, checkpoint_upload_from_workers, trial_executor, local_dir, _experiment_checkpoint_dir, _remote, _remote_string_queue, _entrypoint)
1103 try:
1104 while (
1105 not runner.is_finished() and not experiment_interrupted_event.is_set()
1106 ):
-> 1107 runner.step()
1108 if has_verbosity(Verbosity.V1_EXPERIMENT):
1109 _report_progress(runner, progress_reporter)
File ~/local_pkgs/anaconda3/envs/brian2/lib/python3.10/site-packages/ray/tune/execution/tune_controller.py:290, in TuneController.step(self)
286 if not self._actor_manager.next(timeout=0.1):
287 # If there are no actors running, warn about potentially
288 # insufficient resources
289 if not self._actor_manager.num_live_actors:
--> 290 self._insufficient_resources_manager.on_no_available_trials(
291 self.get_trials()
292 )
294 # Maybe stop whole experiment
295 self._stop_experiment_if_needed()
File ~/local_pkgs/anaconda3/envs/brian2/lib/python3.10/site-packages/ray/tune/execution/insufficient_resources_manager.py:161, in _InsufficientResourcesManager.on_no_available_trials(self, all_trials)
157 return
159 # Otherwise, can fulfill none
160 msg = _get_insufficient_resources_warning_msg(
--> 161 for_train=self._for_train, trial=all_trials[0]
162 )
163 logger.warning(msg)
164 self._no_running_trials_since = time.monotonic()
IndexError: list index out of range
(END)