This is part of my code; I've been trying many config options. The commented-out lines are the ones that were inactive in my last trial, which produced the Jupyter cell output in the image above.
I'm relatively new to Ray, so please excuse some of the comments in the code; they are my notes for understanding RLlib.
The env is non-deterministic, but I haven't set up a random seed.
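For context, here are (roughly) the imports the snippet assumes. RecordNetWorthCallback, NetWorthstopper, PrintCallback, RenderCallback and create_env are from my own modules, and the exact Ray module paths may differ slightly between Ray versions:

import time
import ray
from ray import air, tune
from ray.rllib.algorithms import ppo
from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.utils.exploration.epsilon_greedy import EpsilonGreedy
from ray.tune import CLIReporter
from ray.tune.stopper import CombinedStopper, MaximumIterationStopper, TrialPlateauStopper
from ray.tune.syncer import SyncConfig, DEFAULT_SYNC_PERIOD, DEFAULT_SYNC_TIMEOUT
from ray.air.config import CheckpointConfig, FailureConfig
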
self.env_train_config = {
"type": "train",
"window_size": 100,
"min_periods": 30,
#"render_env": True,
"log_name": self.log_name,
"log_dir": self.log_dir,
}
self.agent = ppo.PPO
self.algo_name = "PPO_1"
self.create_env = create_env
self.exploration_config = {
# The Exploration class to use
"type": EpsilonGreedy,
# Config for the Exploration class' constructor:
"initial_epsilon": 1.0,
"final_epsilon": 0.01,
"epsilon_timesteps": 1000, # Timesteps over which to anneal epsilon.
"warmup_timesteps": 300
}
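# NOTE: this exploration_config is currently unused (exploration_config is commented out in .exploration() below).
# PPO's default exploration is StochasticSampling; EpsilonGreedy is more typical for value-based algorithms like DQN.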
self.search_space = {
"LR" : 5e-5,
"GAMMA" : 0.99,
"LAMBDA" : 1.0,
"VF_LOSS_COEFF" : 1.0,
"ENTROPY_COEFF" : 0.0,
"KL_COEF" : 0.2,
"KL_TARGET" : 0.01,
"CLIP_PARAM" : 0.3,
"VF_CLIP_PARAM" : 10.0,
}
self.algo = (
PPOConfig()
.framework(
framework = "tf2",
eager_tracing = False)
.debugging(
#logger_creator = None,
#logger_config = None,
log_level = "ERROR",
log_sys_usage = True,
#fake_sampler = False,
#seed = 0,
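# NOTE: since the env is non-deterministic, uncommenting seed above (and seeding the env itself) would be the way to get reproducible runs.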
#worker_cls = None
)
.rollouts(
num_rollout_workers=1,
num_envs_per_worker=1,
create_env_on_local_worker=True,
#sample_collector = None,
#sample_async = False,
#enable_connectors = False
#rollout_fragment_length = None, # int | str | None = NotProvided
#batch_mode = 'complete_episodes',
#remote_worker_envs = False,
#remote_env_batch_wait_ms = None, # float | None = NotProvided
#validate_workers_after_construction = False,
ignore_worker_failures=False,
#recreate_failed_workers = False,
#restart_failed_sub_environments = False,
#num_consecutive_worker_failures_tolerance = None, # int | None = NotProvided
#preprocessor_pref = None, # str | None = NotProvided
observation_filter = "MeanStdFilter",
#synchronize_filter = False,
#compress_observations = False,
#enable_tf1_exec_eagerly = False,
#sampler_perf_stats_ema_coef = None, # float | None = NotProvided
#worker_health_probe_timeout_s = None, # int = NotProvided,
#worker_restore_timeout_s = None, # int = NotProvided,
)
.evaluation(
evaluation_interval = self.evaluation_frequency,
#evaluation_duration = None, # ValueError: must be an int and >0!
evaluation_duration_unit = "episodes",
#evaluation_sample_timeout_s = 180,
evaluation_parallel_to_training = False,
evaluation_config = {"env_config": self.env_train_config},  # eval overrides: env settings belong under "env_config", not at the top level
#off_policy_estimation_methods = False, # See Notes in Next Cell
ope_split_batch_by_episode = False,
evaluation_num_workers = 1,
#custom_evaluation_function = None,
always_attach_evaluation_results = False,
enable_async_evaluation = False)
.callbacks(callbacks_class = RecordNetWorthCallback)
.exploration(
explore = True,
#exploration_config = self.exploration_config # Dict
)
.reporting(
keep_per_episode_custom_metrics = False,
metrics_num_episodes_for_smoothing = 60,
# The options below are per-iteration minimums; sampling/training continues each iteration until every one is exceeded
min_time_s_per_iteration = 1,
min_train_timesteps_per_iteration = 1,
min_sample_timesteps_per_iteration = 1,)
.checkpointing(
export_native_model_files = False, # Bool # Used to restore just the NN models
checkpoint_trainable_policies_only = False) # Bool
.environment(
env="CreateEnv",
env_config= self.env_train_config,
#observation_space = None,
#action_space = None,
#env_task_fn = None,
render_env = True,
#clip_rewards = False,
#normalize_actions = True,
clip_actions = False,
#disable_env_checking = True,
#auto_wrap_old_gym_envs = True,
)
.training(
# Override some of AlgorithmConfig's default values with my values.
gamma= self.search_space["GAMMA"],
lr = self.search_space["LR"],
train_batch_size = 1000,
model= {
#"_use_default_native_models": True,
"use_lstm": True,
"lstm_cell_size": 100,
"fcnet_hiddens": [20,20],
"fcnet_activation": "relu",
'vf_share_layers': True,
'lstm_use_prev_action': False,
'lstm_use_prev_reward': False
},
#optimizer = {},
max_requests_in_flight_per_sampler_worker = None,
#rl_trainer_class = ppo.PPOTrainer,
#_enable_rl_trainer_api = False,
# Set PPO RL Trainer HPs
kl_coeff = self.search_space["KL_COEF"],
kl_target = self.search_space["KL_TARGET"],
use_critic = True,
clip_param = self.search_space["CLIP_PARAM"],
vf_clip_param = self.search_space["VF_CLIP_PARAM"],
entropy_coeff = self.search_space["ENTROPY_COEFF"],
#entropy_coeff = 10,
vf_loss_coeff = self.search_space["VF_LOSS_COEFF"],
# Experimental PPO Trainer HPs
#lr_schedule = lr_schedule,
entropy_coeff_schedule = None,
# PPO Config Specific HPs
use_gae = True,
lambda_ = self.search_space["LAMBDA"],
sgd_minibatch_size = 128,
num_sgd_iter = 30,
shuffle_sequences = False, # Unsure I want this for time series data
grad_clip = None,
)
)
stopper = CombinedStopper(
MaximumIterationStopper(max_iter=self.max_epoch),
NetWorthstopper(net_worth_mean=self.net_worth_threshold, patience=self.patience),
TrialPlateauStopper(metric="net_worth_max"),
)
stopper1 = {
"training_iteration": 10,
"timesteps_total": 2500,
"episode_reward_mean": 150,
}
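# NOTE: only stopper1 is actually passed to RunConfig below (stop=stopper1); stopper is currently unused.
# With a stop dict, Tune ends a trial as soon as any one of these thresholds is reached, and
# CombinedStopper likewise stops when any of its sub-stoppers triggers.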
checkpointer = CheckpointConfig(
num_to_keep= 5,
#checkpoint_score_attribute= None, # str | None
#checkpoint_score_order= MAX, # str,
checkpoint_frequency= 1,
checkpoint_at_end= True)
failure_check = FailureConfig(
max_failures= 0,
fail_fast= False)
sync_config = SyncConfig(
#upload_dir = None,
syncer = "auto",
sync_period = DEFAULT_SYNC_PERIOD,
sync_timeout = DEFAULT_SYNC_TIMEOUT,
sync_on_checkpoint = True)
progress = CLIReporter()  # CLIReporter is the concrete console reporter; ProgressReporter is only the base interface (currently unused, see run_config below)
callbacks=[
PrintCallback(),
RenderCallback(
self.evaluation_frequency,
self.log_name,
self.log_dir)]
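# NOTE: this callbacks list is defined but not passed to RunConfig below (callbacks= is commented out there).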
# train an agent
analysis = tune.Tuner(
trainable= "PPO",
param_space= self.algo.to_dict(),
tune_config= tune.TuneConfig(
mode = 'max',
metric= "episode_reward_mean",
#search_alg= None,
#scheduler= None,
num_samples= 10,
max_concurrent_trials= 1,
#time_budget_s = None,
reuse_actors = False,
#trial_name_creator = None,
#trial_dirname_creator = None,
chdir_to_trial_dir = True),
run_config= air.RunConfig(
name= self.algo_name,
#local_dir = None,
#callbacks= callbacks,
stop= stopper1,
failure_config= failure_check,
#sync_config= sync_config,
checkpoint_config = checkpointer,
#progress_reporter = progress,
verbose = 1,
log_to_file = True
),
#_tuner_kwargs = None, # Dict
#_tuner_internal= None # TunerInternal
).fit()
print(f"Best Trail log directory: {analysis.get_best_result()}")
ray.shutdown()
taken = time.time() - start
print(f"Time taken: {taken:.2f} seconds.")
self.best_logdir = analysis.get_best_result().checkpoint  # NOTE: this stores the best trial's Checkpoint object, not a log directory
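For completeness, this is roughly how I reload that checkpoint afterwards (a sketch assuming Ray 2.x's Algorithm.from_checkpoint; the custom env registered as "CreateEnv" still has to be registered in the session doing the restore):

from ray.rllib.algorithms.algorithm import Algorithm

restored_algo = Algorithm.from_checkpoint(self.best_logdir)  # accepts a Checkpoint object or a path
eval_results = restored_algo.evaluate()  # one round of evaluation with the restored policy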
Is this enough info?