PPO.train incorrect result

PPO.train() returns the number of iterations (i.e. the episode length) in episode_reward_max and episode_reward_min instead of the accumulated episode reward.

Am I doing something wrong, or is this a bug?

If it’s a bug, how do I install Ray 2.2.0?

pip install -U "ray[default, tune, rllib, air, serve]" # 2.3.1
pip install tensorflow
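(If downgrading is the answer, I assume pinning the version with a specifier on the same extras would work, e.g. the following, though I haven't verified it resolves cleanly:

pip install -U "ray[default, tune, rllib, air, serve]==2.2.0"
)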

import gymnasium
from ray.rllib.algorithms.ppo import PPO, PPOConfig
from ray.rllib.env.env_context import EnvContext
from ray.tune.registry import register_env


class MockEnv(gymnasium.Env):

    def __init__(self, env_config):
        self.episode_length = env_config["episode_length"]
        self.config = env_config
        self.i = 0
        self.observation_space = gymnasium.spaces.Discrete(20)
        self.action_space = gymnasium.spaces.Discrete(2)

    def reset(self, *, seed=None, options=None):
        self.i = 0
        return 0, {}

    def step(self, action):
        self.i += 1
        mock_obs = 12
        mock_reward = 9.0

        terminated = truncated = self.i >= self.episode_length
        return mock_obs, mock_reward, terminated, truncated, {}
		
if __name__ == '__main__':
    # ray.init(
    #     local_mode = True #local_mode
    #     # , logging_level = 'DEBUG',
    #     , ignore_reinit_error = True
    #     # , num_cpus = 15
    # )

    def env_creator(env_config:EnvContext):
        env = MockEnv(env_config)
        return env

    env_id = "MockEnv01"
    register_env(env_id, env_creator)

    env_cfg = {
        'env_id': env_id,
        'env_creator': env_creator,
        'episode_length': 3
    }

    ppoconfig = PPOConfig()
    ppoconfig.disable_env_checking = True
    ppoconfig.auto_wrap_old_gym_envs = False
    ppoconfig.train_batch_size =  10 # for speed
    ppoconfig.sgd_minibatch_size = 5 #for speed


    ppoconfig.environment(env=env_id)
    ppoconfig.env_config = env_cfg

    ppoconfig.log_level = "WARNING"  # "DEBUG"

    ppoconfig.ignore_worker_failures = True
    # ppoconfig.framework_str = "torch"
    ppoconfig.framework_str = "tf2"

    ppoconfig.lr = 8e-6
    ppoconfig.num_gpus = 0
    ppoconfig.lr_schedule =  [
                [0, 1e-1],
                [int(1e2), 1e-2],
                [int(1e3), 1e-3],
                [int(1e4), 1e-4],
                [int(1e5), 1e-5],
                [int(1e6), 1e-6],
                [int(1e7), 1e-7]
            ]
    ppoconfig.clip_rewards = True
    ppoconfig.gamma = 0.99
    ppoconfig.vf_loss_coeff = 0.5
    ppoconfig.vf_share_layers = True
    ppoconfig.entropy_coeff = 0.01

    # ppoconfig.checkpoint_freq = 1000
    ppoconfig.keep_checkpoints_num = 3
    ppoconfig.verbose = 1
    ppoconfig.log_to_file = False

    algo = PPO(ppoconfig)
    result = algo.train()

    print(f'episode_length:{env_cfg["episode_length"]}')
    print(f'episode_reward_max: {result["episode_reward_max"]}, episode_reward_min: {result["episode_reward_min"]}')

clip_rewards = True clips the reward at each timestep to -1.0, 0.0, or 1.0 based on its sign (see algorithm_config.py in the ray-project/ray GitHub repo). With your mock_reward of 9.0 clipped to 1.0 and episode_length = 3, every episode's return comes out to 3.0, which is why episode_reward_max and episode_reward_min look like the number of steps per episode.
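A minimal sketch of the arithmetic (the sign clipping itself happens inside RLlib; this just reproduces the effect):

import numpy as np

# with clip_rewards=True, each per-step reward is replaced by its sign
clipped_reward = np.sign(9.0)        # -> 1.0
episode_return = 3 * clipped_reward  # -> 3.0, i.e. the episode length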

As a side note, I would recommend using the AlgorithmConfig API (calling the config object's methods rather than assigning attributes directly) to avoid issues like these, since the method signatures document the arguments you pass to the config object.
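Roughly, the builder-style equivalent of your config would look something like this (method and argument names as of Ray 2.3; treat it as a sketch and double-check against your installed version):

from ray.rllib.algorithms.ppo import PPOConfig

config = (
    PPOConfig()
    .environment(
        env=env_id,
        env_config=env_cfg,
        disable_env_checking=True,
        clip_rewards=True,  # the setting that turns your 9.0 rewards into 1.0
    )
    .framework("tf2")
    .training(
        train_batch_size=10,
        sgd_minibatch_size=5,
        lr=8e-6,
        gamma=0.99,
        vf_loss_coeff=0.5,
        entropy_coeff=0.01,
    )
    .resources(num_gpus=0)
    .debugging(log_level="WARNING")
)

algo = config.build()
result = algo.train()

Dropping clip_rewards (or setting it to False) should restore the unclipped 9.0-per-step rewards; I left it in here so the mapping to your original settings stays one-to-one.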