My PPO training result is not as good as expected: episode_reward_max is close to -14, but episode_reward_mean is about -20. Why are they so different, and how can I improve the performance?
algorithm = 'PPO'
# Resolve all output paths relative to this script's directory.
# os.path.join avoids the double slash that "+" concatenation produced
# (save_path already ended with "/" before "/rllib_best_checkpoint" was
# appended) and drops the pointless f-prefix on a placeholder-free string.
code_file_path = os.path.dirname(os.path.realpath(__file__))
save_path = os.path.join(code_file_path, "tmp", env_name)
user_checkpoint_dir = os.path.join(save_path, "rllib_best_checkpoint")
############################ Define algorithm config ############################
# Start from the trainable's default config, then customize environment,
# framework, rollout parallelism, and the policy/value network model.
base_config = get_trainable_cls(algorithm).get_default_config()
config = (
    base_config.environment(env_name)
    .framework("torch")
    .rollouts(num_rollout_workers=9)
    .training(
        model={
            "fcnet_hiddens": [512, 512],  # two fully-connected hidden layers
            "fcnet_activation": "tanh",
        },
    )
)
if algorithm == "PPO":
    # PPO-specific hyperparameters (removed the commented-out grid-search
    # leftovers — they were dead code, and one of them was not even valid
    # Python).
    # NOTE(review): a large gap between episode_reward_max and
    # episode_reward_mean usually indicates high variance across episodes;
    # consider tightening clip_param toward 0.3 and/or annealing
    # entropy_coeff via entropy_coeff_schedule late in training — confirm
    # against your environment's reward scale.
    config.training(
        lr_schedule=None,            # constant learning rate (no annealing)
        lr=5e-5,
        train_batch_size=4000,       # timesteps collected per training iteration
        # PPO specific settings:
        use_critic=True,
        use_gae=True,                # generalized advantage estimation
        lambda_=0.95,                # GAE lambda (bias/variance trade-off)
        use_kl_loss=True,
        kl_coeff=0.2,
        kl_target=0.01,
        sgd_minibatch_size=128,
        num_sgd_iter=30,             # SGD epochs over each train batch
        shuffle_sequences=True,
        vf_loss_coeff=1.0,
        entropy_coeff=0.01,          # exploration bonus on the policy entropy
        entropy_coeff_schedule=None,
        clip_param=0.5,              # fairly loose; RLlib's PPO default is 0.3
        vf_clip_param=10.0,
        grad_clip=None,
    )
############################ Train with Ray Tune ############################
results = tune.Tuner(
    algorithm,
    param_space=config,
    run_config=train.RunConfig(
        stop={
            "training_iteration": 100,
        },
        verbose=2,
        checkpoint_config=train.CheckpointConfig(
            num_to_keep=5,
            checkpoint_frequency=5,
            # Score checkpoints by the MEAN episode reward so the retained
            # checkpoints agree with the metric used to pick the best result
            # after training. The previous "episode_reward_max" kept
            # checkpoints that merely had one lucky episode — a likely cause
            # of a large max/mean reward gap at evaluation time.
            checkpoint_score_attribute="episode_reward_mean",
            checkpoint_score_order='max',
            checkpoint_at_end=True,
        ),
        storage_path=save_path,
    ),
).fit()
############################ Persist the best result ############################
# Pick the trial with the highest mean episode reward, restore its
# algorithm state, and save it to the user-facing checkpoint directory.
best_result = results.get_best_result(metric="episode_reward_mean", mode="max")
ckpt = best_result.checkpoint
algo = Algorithm.from_checkpoint(ckpt)
algo.save(user_checkpoint_dir)