# Search space + fixed RLlib trainer settings handed to tune.Tuner as param_space.
ppo_params = {
    # Tunable hyperparameters — sampled per trial by the search algorithm.
    "entropy_coeff": tune.loguniform(0.00000001, 0.1),
    "lr": tune.loguniform(5e-5, 1),
    "sgd_minibatch_size": tune.choice([32, 64, 128, 256, 512]),
    "lambda": tune.choice([0.1, 0.3, 0.5, 0.7, 0.9, 1.0]),
    # Fixed configuration shared by every trial.
    "framework": "torch",
    "num_workers": 2,
    "log_level": "DEBUG",
    "env": "StockTrading_train_env",
    "num_gpus": 1,
}
if __name__ == '__main__':
    # Start (or reuse) a local Ray cluster sized to this machine.
    ray.init(num_cpus=13, num_gpus=1, ignore_reinit_error=True)

    tuner = tune.Tuner(
        # trainable_with_resources,
        "PPO",
        param_space=ppo_params,
        tune_config=tune.TuneConfig(
            search_alg=optuna_search,
            num_samples=1,
            metric="episode_reward_mean",
            mode='max',
            reuse_actors=True,
        ),
        run_config=RunConfig(
            name="Trial Run",
            local_dir="trial_run",
            # max_failures=0 aborts the whole run on the first trial error;
            # raise it (or use -1) if you want retries instead.
            failure_config=FailureConfig(
                max_failures=0,
            ),
            # BUG FIX: the stop key must be "training_iteration" (singular).
            # Tune looks this key up in every reported result; the misspelled
            # "training_iterations" is not a reported metric, so the stopper /
            # trial.should_stop() blew up after the first result — that is the
            # TuneError in the pasted traceback, not the checkpoint config.
            stop={'training_iteration': 1000},
            checkpoint_config=CheckpointConfig(
                num_to_keep=None,
                # checkpoint_score_attribute="episode_reward_mean",
                # checkpoint_score_order='max',
                checkpoint_at_end=True,
            ),
            verbose=3,
        ),
    )

    rs = tuner.fit()
In the above code cell, I am building a Ray Tune pipeline for financial reinforcement learning. After running it, I get the following error:
TuneError Traceback (most recent call last)
File ~/.local/lib/python3.10/site-packages/ray/tune/execution/trial_runner.py:853, in TrialRunner._wait_and_handle_event(self, next_trial)
852 if event.type == _ExecutorEventType.TRAINING_RESULT:
--> 853 self._on_training_result(
854 trial, result[_ExecutorEvent.KEY_FUTURE_RESULT]
855 )
856 else:
File ~/.local/lib/python3.10/site-packages/ray/tune/execution/trial_runner.py:978, in TrialRunner._on_training_result(self, trial, result)
977 with warn_if_slow("process_trial_result"):
--> 978 self._process_trial_results(trial, result)
File ~/.local/lib/python3.10/site-packages/ray/tune/execution/trial_runner.py:1061, in TrialRunner._process_trial_results(self, trial, results)
1060 with warn_if_slow("process_trial_result"):
-> 1061 decision = self._process_trial_result(trial, result)
1062 if decision is None:
1063 # If we didn't get a decision, this means a
1064 # non-training future (e.g. a save) was scheduled.
1065 # We do not allow processing more results then.
File ~/.local/lib/python3.10/site-packages/ray/tune/execution/trial_runner.py:1100, in TrialRunner._process_trial_result(self, trial, result)
1098 self._validate_result_metrics(flat_result)
-> 1100 if self._stopper(trial.trial_id, result) or trial.should_stop(flat_result):
1101 decision = TrialScheduler.STOP
...
251 experiment_checkpoint_dir = ray.get(
252 self._remote_tuner.get_experiment_checkpoint_dir.remote()
253 )
TuneError: The Ray Tune run failed. Please inspect the previous error messages for a cause. After fixing the issue, you can restart the run from scratch or continue this run. To continue this run, you can use `tuner = Tuner.restore("/home/athekunal/Ray for FinRL/trial_run/Trial Run")`.
It starts tuning, but then fails after one sample. From the error trace, I suspect the checkpoint functionality is causing the error. Am I missing something here?
Version:
ray: 2.1.0
python: 3.10
WSL2 with Ubuntu 22.04
cuda: 11.6