PPO results do not match StableBaselines3 results with same settings

How severe does this issue affect your experience of using Ray?

  • High: It blocks me to complete my task.

Hi, I recently trained PPO on the Atari Alien environment with StableBaselines3 and got an average reward of more than 1000. However, when training with RLlib using the same settings, the average reward stays below 100, and I am not sure whether there is something wrong with my code or whether this is a bug. The policy and value networks are identical for RLlib and StableBaselines3 and follow the Nature DQN architecture. The environment settings (e.g., noop_max=0, frameskip=4) and the algorithm settings (e.g., gamma=0.99) are the same as well, and both runs train for 10M timesteps. My hunch is that something is wrong with my RLlib algorithm configuration: PPO generally performs well on Alien, and a mean reward of less than 100 suggests it is not learning at all.

Env Configs

  • Max episode frames: 108k
  • Mode: Default
  • Difficulty: Default
  • Obs type: Grayscale
  • Frameskip (w/ max pooling): 4
  • Repeat action probability: 0.25
  • Full action space: False
  • Noop reset: 0
  • Terminal on life loss: False
  • Resize: 84 x 84
  • Scale observation: [0,1)
  • Reward clipped: [-1, 1]
  • Frame stack: 4
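
In case it helps, below is the kind of quick check I run to confirm the two pipelines expose the same spaces (just a sketch reusing the wrappers from the two scripts further down, not part of the training runs; any mismatch in layout, dtype or scaling should show up in the printed spaces):

import numpy as np
import gymnasium as gym
from gymnasium.wrappers import AtariPreprocessing, TransformReward, FrameStack
from stable_baselines3.common.env_util import make_atari_env
from stable_baselines3.common.vec_env import VecFrameStack

# Same wrappers as in my RLlib env_creator
rllib_env = gym.make("ALE/Alien-v5", frameskip=1)
rllib_env = AtariPreprocessing(rllib_env, noop_max=0, scale_obs=True)
rllib_env = TransformReward(rllib_env, lambda r: np.clip(r, -1, 1))
rllib_env = FrameStack(rllib_env, 4)

# Same wrappers as in my StableBaselines3 script
sb3_env = make_atari_env(
    "AlienNoFrameskip-v4",
    n_envs=1,
    wrapper_kwargs={
        'noop_max': 0,
        'frame_skip': 4,
        'screen_size': 84,
        'terminal_on_life_loss': False,
        'clip_reward': True,
        'action_repeat_probability': 0.25,
    },
)
sb3_env = VecFrameStack(sb3_env, 4)

print(rllib_env.observation_space, rllib_env.action_space)
print(sb3_env.observation_space, sb3_env.action_space)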

Network Configs

  • Nature DQN Arch
  • Policy and Value Networks are shared
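
Both nets are the Nature CNN trunk (3 conv layers plus a 512-unit dense layer) with linear policy and value heads on top. As a rough sanity check that they really line up, I compare module printouts and parameter counts (sketch only; model and trainer are the objects built in the two scripts below):

# SB3: CnnPolicy = Nature CNN features extractor shared by linear pi/vf heads
print(model.policy)
sb3_params = sum(p.numel() for p in model.policy.parameters())

# RLlib: the custom TorchNature model registered below (old ModelV2 API)
rllib_model = trainer.get_policy().model
print(rllib_model)
rllib_params = sum(p.numel() for p in rllib_model.parameters())

# With identical trunks and heads the two counts should be (roughly) equal
print(sb3_params, rllib_params)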

Algorithm Configs

  • Horizon: 128
  • Adam learning rate: 0.00025
  • Num epochs: 3
  • Minibatch size: 32 x 8
  • Gamma: 0.99
  • GAE lambda: 0.95
  • Number of actors: 8
  • Clipping param: 0.1
  • VF Coeff: 1
  • Entropy Coeff: 0.01
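
For reference, the per-iteration batch arithmetic these settings imply (my own numbers, mirroring batch_size=32*8 in the SB3 script and train_batch_size=128*8 / sgd_minibatch_size=32*8 in the RLlib config):

horizon, num_actors = 128, 8
minibatch, epochs = 32 * 8, 3

rollout = horizon * num_actors             # 1024 timesteps collected per iteration
updates = (rollout // minibatch) * epochs  # 4 minibatches x 3 epochs = 12 SGD steps per iteration
assert (rollout, minibatch, updates) == (1024, 256, 12)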

StableBaselines3 Experiment Settings

import uuid
import time
import os
import numpy as np
import gym
from stable_baselines3.common.env_util import make_vec_env, make_atari_env
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy
from gym.wrappers import AtariPreprocessing, TransformReward, FrameStack
from stable_baselines3.common.logger import configure
from stable_baselines3 import DQN, PPO
import torch
import random
import argparse

def train_eval(config):
    # config gpu
    os.environ["CUDA_VISIBLE_DEVICES"] = f'{config.gpu}'

    if config.algo == 'ppo':
        # obs scaled in network
        env = make_atari_env(
            f"{config.env}NoFrameskip-v4", 
            n_envs=8,
            wrapper_kwargs={
                'noop_max': 0, 
                'frame_skip': 4,
                'screen_size': 84,
                'terminal_on_life_loss': False,
                'clip_reward': True, 
                'action_repeat_probability': 0.25,
            },
        )
        env = VecFrameStack(env, 4)

        # PPO hyperparameters matching the algorithm configs above
        model = PPO(
            'CnnPolicy',
            env,
            learning_rate=0.00025,
            n_steps=128,
            batch_size=32*8,
            n_epochs=3,
            gamma=0.99,
            gae_lambda=0.95,
            clip_range=0.1,
            clip_range_vf=None,
            normalize_advantage=False,
            ent_coef=0.01,
            vf_coef=1.0,
            max_grad_norm=float('inf'),
            use_sde=False,
            stats_window_size=100,
            policy_kwargs={
                'normalize_images': True,
            },
            verbose=1,
        )
        start = time.time()
        logger = configure(os.path.join(os.getcwd(), 'runs', 'sb3', 'ppo', f'train_eval_{config.env}_{uuid.uuid4()}'), ["csv"])
        model.set_logger(logger)
        model.learn(total_timesteps=10000000)
        end = time.time()



if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', type=int, help='Specify GPU index', required=True)
    parser.add_argument('--env', type=str, help='Specify gym environment to use', required=True)
    parser.add_argument('--algo', choices=['dqn', 'ppo'], type=str, help='Specify algorithm to use', required=True)
    args = parser.parse_args()
    train_eval(args)

RLlib Experiment Settings

import os
import argparse
import random
import time
import uuid
import numpy as np
import gymnasium as gym
from gymnasium.wrappers import AtariPreprocessing, TransformReward, FrameStack

import tensorflow as tf
import torch
import torch.nn as nn

import ray
from ray import air
from ray.tune import TuneConfig
from ray.tune.tuner import Tuner
from ray.tune.registry import register_env
from ray.rllib.policy.policy import Policy
from ray.rllib.algorithms.dqn.dqn import DQN, DQNConfig
from ray.rllib.algorithms.ppo import PPO, PPOConfig
from ray.rllib.algorithms.dqn.dqn_tf_policy import DQNTFPolicy 
from ray.rllib.algorithms.dqn.dqn_torch_policy import DQNTorchPolicy 
from ray.rllib.utils.typing import AlgorithmConfigDict, ModelConfigDict
from ray.rllib.algorithms.algorithm import Algorithm
from ray.rllib.algorithms.algorithm_config import AlgorithmConfig
from typing import Optional, Type
from ray.rllib.models.tf.tf_modelv2 import TFModelV2
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.rllib.models import ModelCatalog

from torch.utils.tensorboard import SummaryWriter

def train_eval(config):

    os.environ["CUDA_VISIBLE_DEVICES"] = f'{config.gpu}'

    # Atari preprocessing matching the env configs above
    def env_creator(env_config):
        env = gym.make(f"ALE/{config.env}-v5", frameskip=1, render_mode='rgb_array')
        env = AtariPreprocessing(env, noop_max=0, scale_obs=True)
        env = TransformReward(env, lambda x: np.clip(x, -1, 1))
        env = FrameStack(env, 4)
        return env

    if config.algo == 'ppo':
        class TorchNature(TorchModelV2, nn.Module):
            def __init__(self, obs_space: gym.spaces.Space, action_space: gym.spaces.Space, num_outputs: int, model_config: ModelConfigDict, name: str):
                super().__init__(obs_space, action_space, num_outputs, model_config, name)
                nn.Module.__init__(self)

                self._model = nn.Sequential(
                    nn.Conv2d(4, 32, 8, 4, 0),
                    nn.ReLU(),
                    nn.Conv2d(32, 64, 4, 2, 0),
                    nn.ReLU(),
                    nn.Conv2d(64, 64, 3, 1, 0),
                    nn.ReLU(),
                    nn.Flatten(),
                    nn.Linear(3136, 512),
                    nn.ReLU(),
                )
                self._pi = nn.Sequential(nn.Linear(512, num_outputs))
                self._vf = nn.Sequential(nn.Linear(512, 1))

            def forward(self, input_dict, state, seq_lens):
                self._out = self._model(input_dict['obs'].float())
                pi_out = self._pi(self._out)
                return pi_out, []

            def value_function(self):
                return torch.reshape(self._vf(self._out), [-1])

        # register env, models
        register_env(f"{config.env}_custom", env_creator=env_creator)
        ModelCatalog.register_custom_model("TorchNature", TorchNature)

        param_space = PPOConfig()
        param_space = param_space.training(
            gamma=0.99,
            lr=0.00025,
            grad_clip=None,
            train_batch_size=128*8,
            model={
                '_disable_preprocessor_api': True,
                'custom_model': 'TorchNature'
            },
            optimizer={'epsilon': 1e-8} if config.tf2 else {'eps': 1e-8},
            lr_schedule=None,
            use_critic=True,
            use_gae=True,
            lambda_=0.95,
            kl_coeff=0.0,
            sgd_minibatch_size=32*8,
            num_sgd_iter=3,
            vf_loss_coeff=1.0,
            entropy_coeff=0.01,
            entropy_coeff_schedule=None,
            clip_param=0.1,
            vf_clip_param=float('inf'),
        )
        param_space = param_space.environment(f"{config.env}_custom", render_env=False, clip_rewards=False, normalize_actions=False, clip_actions=False, auto_wrap_old_gym_envs=False)
        param_space = param_space.framework('torch', eager_tracing=True)
        param_space = param_space.rollouts(
            num_rollout_workers=8, 
            num_envs_per_worker=1, 
            create_env_on_local_worker=False,
            sample_async=False,
            rollout_fragment_length=128, 
            batch_mode='truncate_episodes', 
            preprocessor_pref=None, 
            observation_filter="NoFilter"
        )
        param_space = param_space.evaluation(evaluation_interval=None)
        param_space = param_space.experimental(_disable_preprocessor_api=True)
        param_space = param_space.debugging(logger_config={'type': "ray.tune.logger.NoopLogger"})
        param_space = param_space.resources(num_gpus=1, num_cpus_per_worker=1)
        trainer = PPO(config=param_space)

        train_steps = 10000000
        out_path = os.path.join(os.getcwd(), 'runs', 'torch', 'ppo', f'train_eval_{config.env}_{uuid.uuid4()}')
        writer = SummaryWriter(log_dir=out_path)
        past_episodes = 0
        start = time.time()
        while True:
            results = trainer.train()
            timesteps = results['timesteps_total']
            mean_reward = results['episode_reward_mean']
            episodes_total = results['episodes_total']
            if episodes_total > (past_episodes+100):
                past_episodes = episodes_total
                writer.add_scalar('Timestep/reward', mean_reward, timesteps)
            if timesteps >= train_steps:
                break
        writer.close()
        end = time.time()

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, help='Specify gym environment to use', required=True)
    parser.add_argument('--gpu', type=int, help='Specify GPU index', required=True)
    parser.add_argument('--algo', choices=['dqn', 'ppo'], type=str, help='Specify algorithm to use', required=True)
    parser.add_argument('--tf2', action='store_true', help='Use tf2 instead of torch (referenced by the optimizer kwargs above)')
    args = parser.parse_args()
    train_eval(args)