PPO.train() returns an incorrect result
PPO.train() seems to return the number of training iterations in episode_reward_max and episode_reward_min instead of the actual episode rewards.
Am I doing something wrong, or is it a bug?
If it is a bug, how do I downgrade to Ray 2.2.0?
My current setup:
pip install -U "ray[default, tune, rllib, air, serve]" # 2.3.1
pip install tensorflow
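For reference, pinning an exact Ray version with pip should work like this (assuming 2.2.0 wheels exist for your Python version and platform):

pip install -U "ray[default, tune, rllib, air, serve]==2.2.0"  # pin to 2.2.0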
import gymnasium
from ray.rllib.algorithms.ppo import PPO, PPOConfig
from ray.rllib.env.env_context import EnvContext
from ray.tune.registry import register_env


class MockEnv(gymnasium.Env):
    """Minimal mock env: constant observation and reward, fixed episode length."""
    def __init__(self, env_config):
        self.episode_length = env_config["episode_length"]
        self.config = env_config
        self.i = 0
        self.observation_space = gymnasium.spaces.Discrete(20)
        self.action_space = gymnasium.spaces.Discrete(2)

    def reset(self, *, seed=None, options=None):
        self.i = 0
        return 0, {}

    def step(self, action):
        self.i += 1
        mock_obs = 12  # constant observation, always valid for Discrete(20)
        mock_reward = 9.0  # constant per-step reward
        # End the episode after episode_length steps.
        terminated = truncated = self.i >= self.episode_length
        return mock_obs, mock_reward, terminated, truncated, {}
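Stepping the env by hand shows what one episode should return (a quick sanity check, not part of the repro):

env = MockEnv({"episode_length": 3})
obs, info = env.reset()
total, terminated, truncated = 0.0, False, False
while not (terminated or truncated):
    obs, reward, terminated, truncated, info = env.step(env.action_space.sample())
    total += reward
print(total)  # 3 steps * 9.0 = 27.0, so that is the episode reward I expect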
if __name__ == '__main__':
    # ray.init(
    #     local_mode=True,
    #     # logging_level='DEBUG',
    #     ignore_reinit_error=True,
    #     # num_cpus=15,
    # )
    def env_creator(env_config: EnvContext):
        return MockEnv(env_config)

    env_id = "MockEnv01"
    register_env(env_id, env_creator)

    env_cfg = {
        'env_id': env_id,
        'env_creator': env_creator,
        'episode_length': 3,
    }
    ppoconfig = PPOConfig()
    ppoconfig.disable_env_checking = True
    ppoconfig.auto_wrap_old_gym_envs = False
    ppoconfig.train_batch_size = 10  # small, for speed
    ppoconfig.sgd_minibatch_size = 5  # small, for speed
    ppoconfig.environment(env=env_id)
    ppoconfig.env_config = env_cfg
    ppoconfig.log_level = "WARNING"  # or "DEBUG"
    ppoconfig.ignore_worker_failures = True
    # ppoconfig.framework_str = "torch"
    ppoconfig.framework_str = "tf2"
    ppoconfig.lr = 8e-6
    ppoconfig.num_gpus = 0
    ppoconfig.lr_schedule = [
        [0, 1e-1],
        [int(1e2), 1e-2],
        [int(1e3), 1e-3],
        [int(1e4), 1e-4],
        [int(1e5), 1e-5],
        [int(1e6), 1e-6],
        [int(1e7), 1e-7],
    ]
    ppoconfig.clip_rewards = True
    ppoconfig.gamma = 0.99
    ppoconfig.vf_loss_coeff = 0.5
    # vf_share_layers is a model option in Ray 2.x, not a top-level config key:
    ppoconfig.model["vf_share_layers"] = True
    ppoconfig.entropy_coeff = 0.01
    # The following are Tune run options, not PPOConfig settings; they have
    # no effect when calling algo.train() directly:
    # ppoconfig.checkpoint_freq = 1000
    # ppoconfig.keep_checkpoints_num = 3
    # ppoconfig.verbose = 1
    # ppoconfig.log_to_file = False
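Aside: in case the attribute-style assignments above are part of the problem, here is the same config expressed with PPOConfig's fluent builder methods (a sketch with the same values; I've kept only the settings that map directly, e.g. lr_schedule can also be passed to .training()):

ppoconfig = (
    PPOConfig()
    .environment(env=env_id, env_config=env_cfg,
                 disable_env_checking=True, clip_rewards=True,
                 auto_wrap_old_gym_envs=False)
    .training(train_batch_size=10, sgd_minibatch_size=5,
              lr=8e-6, gamma=0.99, vf_loss_coeff=0.5, entropy_coeff=0.01,
              model={"vf_share_layers": True})
    .framework("tf2")
    .resources(num_gpus=0)
    .debugging(log_level="WARNING")
)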
    algo = PPO(ppoconfig)
    result = algo.train()
    print(f'episode_length: {env_cfg["episode_length"]}')
    print(f'episode_reward_max: {result["episode_reward_max"]}, '
          f'episode_reward_min: {result["episode_reward_min"]}')
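With episode_length = 3 and a constant step reward of 9.0, I would expect both episode_reward_max and episode_reward_min to come out around 27.0 (as far as I understand, clip_rewards only affects the rewards used for training, not the reported episode metrics). Instead, the printed values track the training iteration count.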