- Medium: It contributes to significant difficulty to complete my task, but I can work around it.
Hi, I'm struggling to get the same results when evaluating a trained model compared to the output from training: the mean reward is much lower.
I have a custom env (sketched roughly below) where:
- each reset initializes the env to one of the 328 samples, incrementing one by one until it wraps around and repeats, and
- each episode is around 100-120 timesteps and only returns done on the last timestep.
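For context, this is roughly how the env behaves. This is only a minimal sketch with placeholder names (MyEnv, NUM_SAMPLES, random observations and rewards), not my actual env:

import gym
import numpy as np

NUM_SAMPLES = 328  # size of the sample dataset the env cycles through

class MyEnv(gym.Env):
    """Placeholder env: each reset() moves on to the next of the 328 samples."""

    def __init__(self, env_config=None):
        self.observation_space = gym.spaces.Box(-np.inf, np.inf, shape=(4,), dtype=np.float32)
        self.action_space = gym.spaces.Discrete(2)
        self.sample_idx = -1
        self.t = 0
        self.episode_len = 0

    def reset(self):
        # Cycle through the dataset, one sample per episode.
        self.sample_idx = (self.sample_idx + 1) % NUM_SAMPLES
        self.t = 0
        self.episode_len = 100 + (self.sample_idx % 21)  # roughly 100-120 steps
        return self.observation_space.sample()

    def step(self, action):
        self.t += 1
        done = self.t >= self.episode_len        # done only on the last timestep
        reward = float(np.random.rand())         # placeholder reward
        return self.observation_space.sample(), reward, done, {}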
My training is set up like so:
from ray import tune
from ray.tune.registry import register_env

SELECT_ENV = "my_env"
register_env(env_name, env_creator)

experiment = tune.run(
    "PPO",
    config={
        "env": SELECT_ENV,
        #"framework": "tf2",
        #"eager_tracing": True,
        #"lambda": 0.95,
        #"kl_coeff": 0.5,
        #"clip_rewards": True,
        #"clip_param": 0.3,
        #"vf_clip_param": 10.0,
        #"vf_share_layers": True,
        #"vf_loss_coeff": 1e-2,
        #"entropy_coeff": 0.01,
        #"train_batch_size": 10000,
        #"sample_batch_size": 130,
        #"sgd_minibatch_size": 130,
        #"num_sgd_iter": 10,
        "num_workers": 6,
        #"num_envs_per_worker": 16,
        #"lr": 0.0001,
        "gamma": 1.0,
        "batch_mode": "complete_episodes",
        "metrics_smoothing_episodes": 328,
        #"num_cpus": 4,
        #"model": {'_use_default_native_models': False, '_disable_preprocessor_api': False, '_disable_action_flattening': False, 'fcnet_hiddens': [512, 512], 'fcnet_activation': 'tanh', 'conv_filters': None, 'conv_activation': 'relu', 'post_fcnet_hiddens': [], 'post_fcnet_activation': 'relu', 'free_log_std': False, 'no_final_linear': False, 'vf_share_layers': False, 'use_lstm': False, 'max_seq_len': 20, 'lstm_cell_size': 256, 'lstm_use_prev_action': False, 'lstm_use_prev_reward': False, '_time_major': False, 'use_attention': False, 'attention_num_transformer_units': 1, 'attention_dim': 64, 'attention_num_heads': 1, 'attention_head_dim': 32, 'attention_memory_inference': 50, 'attention_memory_training': 50, 'attention_position_wise_mlp_dim': 32, 'attention_init_gru_gate_bias': 2.0, 'attention_use_n_prev_actions': 0, 'attention_use_n_prev_rewards': 0, 'framestack': True, 'dim': 84, 'grayscale': False, 'zero_mean': True, 'custom_model': None, 'custom_model_config': {}, 'custom_action_dist': None, 'custom_preprocessor': None, 'lstm_use_prev_action_reward': -1}
    },
    metric="episode_reward_mean",
    mode="max",
    stop={"training_iteration": 250},
    checkpoint_at_end=True,
)
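As a side note, I know RLlib can also run a separate evaluation during training with exploration disabled. A minimal sketch of the extra keys that I believe could be added to the config above (assuming a recent Ray 2.x release; I haven't verified the exact names against my version):

# Hypothetical additions to the tune.run() config above (Ray 2.x keys):
eval_settings = {
    "evaluation_interval": 10,            # evaluate every 10 training iterations
    "evaluation_duration": 328,           # run through the whole dataset
    "evaluation_duration_unit": "episodes",
    "evaluation_num_workers": 1,
    "evaluation_config": {
        "explore": False,                 # deterministic actions, like my test script
    },
}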
The test code, running on the SAME 328-sample dataset, looks like this:
from ray.rllib.algorithms import ppo   # Ray 2.x import path
from ray.tune.registry import register_env

register_env(env_name, env_creator)

config = ppo.PPOConfig()
config.explore = False
agent = config.build(env=env_name)
agent.restore(checkpoint_path)

env = env_creator(config)
state = env.reset()
sum_reward = 0
episodes = 1

while True:
    #action = agent.compute_single_action(state)
    action = agent.compute_action(state)
    state, reward, done, info = env.step(action)
    #if reward != 0:
    #    print(reward)
    sum_reward += reward
    if done:
        if episodes == 328:
            break
        else:
            state = env.reset()
            #print(env.current_state)
            episodes += 1

print(sum_reward)
print(episodes)
print(sum_reward / episodes)
=> 12736.807102917062
=> 328
=> 38.83172897230812
The mean reward from evaluation is roughly 38, while on TensorBoard and at the training checkpoint it is a much better mean reward of around 123…
check_point = experiment.get_trial_checkpoints_paths(
    trial=experiment.get_best_trial('episode_reward_mean'),
    metric='episode_reward_mean')
=> PPO_my_env_4cfa5_00000_0_2022-11-14_14-36-10\checkpoint_000250’, 123.2423709106124
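If it helps, a small variant of the evaluation loop above (hypothetical, using the same agent and env objects built in the test code) that records each episode's return separately would look like this, so the per-episode distribution can be inspected rather than just the overall mean:

import statistics

episode_returns = []          # one entry per completed episode
state = env.reset()
episode_return = 0.0

while len(episode_returns) < 328:
    action = agent.compute_single_action(state)
    state, reward, done, info = env.step(action)
    episode_return += reward
    if done:
        episode_returns.append(episode_return)
        episode_return = 0.0
        state = env.reset()

print("mean:", statistics.mean(episode_returns))
print("min: ", min(episode_returns))
print("max: ", max(episode_returns))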
Am I doing something wrong here? Thanks for any help, especially hands-on changes.