Custom Gym environment yields NaN episode_reward_mean

How severe does this issue affect your experience of using Ray?

  • High: It blocks me from completing my task.

I'm trying to get RLlib running with custom model and environment classes. With the latter I ran into really weird behaviour, so I reproduced it with a Gymnasium benchmark environment.

In essence, my code looks as follows:

import hydra
from omegaconf import DictConfig
import gymnasium as gym
from pydoc import locate
import ray
from ray import tune
from ray.rllib.algorithms.ppo import PPO, PPOConfig
from ray.tune.logger import pretty_print
from ray.rllib.models import ModelCatalog
from gymnasium.envs.classic_control.pendulum import PendulumEnv


class CustomPendulum(PendulumEnv):
    def __init__(self, env_config):
        super().__init__()


@hydra.main(config_path="configs", config_name="train.yaml")
def main(cfg: DictConfig):

    ray.init(local_mode=False)

    ModelCatalog.register_custom_model(
        model_name="model_wrapper",
        model_class=locate(cfg.wrapper._target_),
    )

    config = (  # 1. Configure the algorithm,
        PPOConfig()
        .environment("Pendulum-v1")
        .rollouts(num_rollout_workers=8)
        .framework("torch")
        .training(model={
                "custom_model": "model_wrapper",
                "custom_model_config": cfg.model,
            })
    )

    algo = config.build()  # 2. build the algorithm,

    for i in range(10):
        result = algo.train()
        print("Episode %i: episode_reward_mean = %g"%(i, result["episode_reward_mean"]))
              
    ray.shutdown()


if __name__ == "__main__":
    main()
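
For context, the relevant part of configs/train.yaml is equivalent to something like the OmegaConf object below; the class path and the model settings are placeholders, not my exact values:

from omegaconf import OmegaConf

# Placeholder stand-in for the relevant part of configs/train.yaml;
# the class path and the model settings below are illustrative only.
cfg = OmegaConf.create({
    "wrapper": {"_target_": "my_project.models.ModelWrapper"},  # resolved via pydoc.locate()
    "model": {"hidden_dim": 64},                                 # passed on as custom_model_config
})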

At first glance the training seems to work, as the average reward is trending upward:

Episode 0: episode_reward_mean = -1406.04
Episode 1: episode_reward_mean = -1463.69
Episode 2: episode_reward_mean = -1430.1
Episode 3: episode_reward_mean = -1429.89
Episode 4: episode_reward_mean = -1410.02
Episode 5: episode_reward_mean = -1395.62
Episode 6: episode_reward_mean = -1356.33
Episode 7: episode_reward_mean = -1326.2
Episode 8: episode_reward_mean = -1305.13
Episode 9: episode_reward_mean = -1313.29

However, if I change the configuration to

    config = (  # 1. Configure the algorithm,
        PPOConfig()
        .environment(CustomPendulum, env_config={})
        .rollouts(num_rollout_workers=8)
        .framework("torch")
        .training(model={
                "custom_model": "model_wrapper",
                "custom_model_config": cfg.model,
            })
    )

it breaks down, since the average reward is now NaN:

Episode 0: episode_reward_mean = nan
Episode 1: episode_reward_mean = nan
Episode 2: episode_reward_mean = nan
Episode 3: episode_reward_mean = nan
Episode 4: episode_reward_mean = nan
Episode 5: episode_reward_mean = nan
Episode 6: episode_reward_mean = nan
Episode 7: episode_reward_mean = nan
Episode 8: episode_reward_mean = nan
Episode 9: episode_reward_mean = nan
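
To narrow this down I also thought about logging how many episodes actually finish per training iteration; a minimal sketch of the extended loop, assuming the standard top-level RLlib result keys:

    for i in range(10):
        result = algo.train()
        # "episodes_this_iter" and "episode_len_mean" are standard keys of the RLlib result dict;
        # as far as I understand, episode_reward_mean is nan when no episode completes in the window.
        print(
            "Iteration %i: reward_mean = %g, episodes_this_iter = %i, len_mean = %g"
            % (
                i,
                result["episode_reward_mean"],
                result["episodes_this_iter"],
                result["episode_len_mean"],
            )
        )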

I already tried registering the environment via ray.tune.registry.register_env, which didn't work either.
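
Roughly, that attempt looked like the following (the registered name "custom_pendulum" is arbitrary):

from ray.tune.registry import register_env

# Register the custom env under a string name and refer to it by that name.
register_env("custom_pendulum", lambda env_config: CustomPendulum(env_config))

config = (
    PPOConfig()
    .environment("custom_pendulum", env_config={})
    .rollouts(num_rollout_workers=8)
    .framework("torch")
    .training(model={
        "custom_model": "model_wrapper",
        "custom_model_config": cfg.model,
    })
)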

Has anyone experienced a similar behaviour?
I’m using a freshly upgraded RLlib 2.5.

BR