Hi all,
I am using Ray 2.5.1 on a Mac. Training runs fine and the checkpoints are saved successfully to local disk, but the process terminates at the end of `tuner.fit()` with the error shown below.
Does anyone know what is going wrong? I suspect it is a version issue, but I am not sure what is actually causing it.
in Experiment.run(self, **run_config) 264 json.dump(run_meta, ifile, indent=4, cls=RobustJSONEncoder) 266 tuner = tune.Tuner( 267 **tune_config 268 ) --> 269 analysis = tuner.fit() 270 best_result = analysis.get_best_result(metric='episode_reward_mean', mode='max') 271 best_checkpoint_path = best_result.best_checkpoints[-1][0].path File [~/.pyenv/versions/lib/python3.10/site-packages/ray/tune/tuner.py:347](https://file+.vscode-resource.vscode-cdn.net/Users/Documents/CodeProject/~/.pyenv/versions/lib/python3.10/site-packages/ray/tune/tuner.py:347), in Tuner.fit(self) 345 if not self._is_ray_client: 346 try: --> 347 return self._local_tuner.fit() 348 except TuneError as e: 349 raise TuneError( 350 _TUNER_FAILED_MSG.format( 351 path=self._local_tuner.get_experiment_checkpoint_dir() 352 ) 353 ) from e File [~/.pyenv/versions/lib/python3.10/site-packages/ray/tune/impl/tuner_internal.py:588](https://file+.vscode-resource.vscode-cdn.net/Users/Documents/CodeProject/des_rl/~/.pyenv/versions/lib/python3.10/site-packages/ray/tune/impl/tuner_internal.py:588), in TunerInternal.fit(self) 586 param_space = copy.deepcopy(self.param_space) 587 if not self._is_restored: --> 588 analysis = self._fit_internal(trainable, param_space) 589 else: 590 analysis = self._fit_resume(trainable, param_space) File [~/.pyenv/versions/lib/python3.10/site-packages/ray/tune/impl/tuner_internal.py:712](https://file+.vscode-resource.vscode-cdn.net/Users/Documents/CodeProject/~/.pyenv/versions/lib/python3.10/site-packages/ray/tune/impl/tuner_internal.py:712), in TunerInternal._fit_internal(self, trainable, param_space) 698 """Fitting for a fresh Tuner.""" 699 args = { 700 **self._get_tune_run_arguments(trainable), 701 **dict( (...) 
710 **self._tuner_kwargs, 711 } --> 712 analysis = run( 713 **args, 714 ) 715 self.clear_remote_string_queue() 716 return analysis File [~/.pyenv/versions/lib/python3.10/site-packages/ray/tune/tune.py:1131](https://file+.vscode-resource.vscode-cdn.net/Users/Documents/CodeProject/des_rl/~/.pyenv/versions/lib/python3.10/site-packages/ray/tune/tune.py:1131), in run(run_or_experiment, name, metric, mode, stop, time_budget_s, config, resources_per_trial, num_samples, storage_path, search_alg, scheduler, keep_checkpoints_num, checkpoint_score_attr, checkpoint_freq, checkpoint_at_end, checkpoint_keep_all_ranks, checkpoint_upload_from_workers, verbose, progress_reporter, log_to_file, trial_name_creator, trial_dirname_creator, chdir_to_trial_dir, sync_config, export_formats, max_failures, fail_fast, restore, server_port, resume, reuse_actors, raise_on_failed_trial, callbacks, max_concurrent_trials, trial_executor, local_dir, _experiment_checkpoint_dir, _remote, _remote_string_queue, _entrypoint) 1126 else: 1127 logger.warning( 1128 f"Experiment has been interrupted, but the most recent state was " 1129 f"saved.\nResume experiment with: {restore_entrypoint}" 1130 ) -> 1131 ea = ExperimentAnalysis( 1132 experiment_checkpoint, 1133 trials=all_trials, 1134 default_metric=metric, 1135 default_mode=mode, 1136 sync_config=sync_config, 1137 remote_storage_path=remote_path, 1138 ) 1140 return ea File [~/.pyenv/versions/lib/python3.10/site-packages/ray/tune/analysis/experiment_analysis.py:112](https://file+.vscode-resource.vscode-cdn.net/Users/Documents/CodeProject/des_rl/~/.pyenv/versions/lib/python3.10/site-packages/ray/tune/analysis/experiment_analysis.py:112), in ExperimentAnalysis.__init__(self, experiment_checkpoint_path, trials, default_metric, default_mode, remote_storage_path, sync_config) 110 self._experiment_states = [] 111 self._checkpoints_and_paths: List[Tuple[dict, os.PathLike]] = [] --> 112 self._load_checkpoints(experiment_checkpoint_path) 113 assert 
self._checkpoints_and_paths 115 self.trials = trials File [~/.pyenv/versions/lib/python3.10/site-packages/ray/tune/analysis/experiment_analysis.py:188](https://file+.vscode-resource.vscode-cdn.net/Users/Documents/CodeProject/des_rl/~/.pyenv/versions/lib/python3.10/site-packages/ray/tune/analysis/experiment_analysis.py:188), in ExperimentAnalysis._load_checkpoints(self, experiment_checkpoint_path) 186 def _load_checkpoints(self, experiment_checkpoint_path: str) -> List[str]: 187 # Get the latest checkpoints from the checkpoint_path. --> 188 latest_checkpoints = self._get_latest_checkpoint(experiment_checkpoint_path) 189 if not latest_checkpoints: 190 raise ValueError( 191 f"
{experiment_checkpoint_path} must either be a path to an " 192 "experiment checkpoint file, or a directory containing an experiment " 193 "checkpoint file." 194 ) File [~/.pyenv/versions/lib/python3.10/site-packages/ray/tune/analysis/experiment_analysis.py:308](https://file+.vscode-resource.vscode-cdn.net/Users/Documents/CodeProject/des_rl/~/.pyenv/versions/lib/python3.10/site-packages/ray/tune/analysis/experiment_analysis.py:308), in ExperimentAnalysis._get_latest_checkpoint(self, experiment_checkpoint_path) 294 def _get_latest_checkpoint(self, experiment_checkpoint_path: str) -> List[str]: 295 """Gets the latest experiment checkpoints corresponding to a given path. 296 297 Acceptable path inputs (either local or remote): (...) 306 file for each experiment corresponding to the given path. 307 """ --> 308 if is_directory(experiment_checkpoint_path): 309 return self._get_latest_checkpoint_from_dir(experiment_checkpoint_path) 311 local_experiment_checkpoint_path = self._maybe_download_experiment_checkpoint( 312 experiment_checkpoint_path 313 ) File [~/.pyenv/versions/lib/python3.10/site-packages/ray/air/_internal/remote_storage.py:572](https://file+.vscode-resource.vscode-cdn.net/Users/Documents/CodeProject/des_rl/~/.pyenv/versions/lib/python3.10/site-packages/ray/air/_internal/remote_storage.py:572), in is_directory(uri) 569 _assert_pyarrow_installed() 571 fs, bucket_path = get_fs_and_path(uri) --> 572 file_info = fs.get_file_info(bucket_path) 573 return not file_info.is_file AttributeError: 'NoneType' object has no attribute 'get_file_info'