Env with SuperSuit is not working properly

I tried applying SuperSuit (2.6.6) with Ray 2.0.0.dev0 to CartPole-v0/v1 to see whether it works. No luck so far with RLlib: the model doesn't improve.

A custom environment without SuperSuit (env_config = {'supersuit': False}) works well with both RLlib and Stable-Baselines3 under the default configuration; the agent easily learns to play. But env_config = {'supersuit': True} works only with SB3. I've tried several variations, and they all failed with RLlib. MyEnv itself seems OK, but… Any ideas or suggestions?
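
For context, here is what the frame stacking alone does to the spaces (a quick sketch, assuming SuperSuit 2.6.6 on the classic gym API; with a stack size of 2 I'd expect the 4-dim CartPole observation to widen to 8 dims):

import gym
import supersuit as ss

base = gym.make('CartPole-v1')
stacked = ss.frame_stack_v1(base, 2)
print(base.observation_space)     # Box with shape (4,)
print(stacked.observation_space)  # expected: Box with shape (8,), two frames side by side

The full script: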

import gym
import torch
import ray
from copy import deepcopy
from ray.rllib.agents.ppo import PPOTrainer, DEFAULT_CONFIG
# from ray.rllib.agents.pg import PGTrainer, DEFAULT_CONFIG
from ray.tune.logger import pretty_print
from ray.tune.registry import register_env
import supersuit as ss

# versions: torch 1.9.0+cu102, ray 2.0.0.dev0
class MyEnv(gym.Env):
    def __init__(self, env_config):
        self.env = gym.make('CartPole-v1')
        self.supersuit = env_config.get('supersuit', False)
        if self.supersuit:
            # stack the last `num_stacks` observations into one
            self.num_stacks = env_config.get('num_stacks', 2)
            self.env = ss.frame_stack_v1(self.env, self.num_stacks)
        # expose the (possibly stacked) spaces of the wrapped env
        self.observation_space = self.env.observation_space
        self.action_space = self.env.action_space
        
    def reset(self):
        obs = self.env.reset()
        if self.supersuit:
            # frame_stack_v1 resets the stack to copies of the initial frame;
            # step with random actions so the stack holds distinct frames
            for _ in range(self.num_stacks - 1):
                obs, reward, done, info = self.step(self.action_space.sample())
                if done:  # very unlikely this early in CartPole, but be safe
                    obs = self.env.reset()
        return obs
    
    def step(self, action):
        return self.env.step(action)
    
    def render(self):
        return self.env.render()
    
    def close(self):
        self.env.close()
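
A quick smoke test of the wrapper path (not part of the training run; with num_stacks=2 I'd expect both shapes below to come out as (8,)):

env = MyEnv({'supersuit': True, 'num_stacks': 2})
obs = env.reset()
print(env.observation_space.shape, obs.shape)  # should match, e.g. (8,) and (8,)
env.close()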

ray.init()
config = deepcopy(DEFAULT_CONFIG)
config['framework'] = 'torch'
env_config = {
    'supersuit': True,
    'num_stacks': 2,
}
config.update({
    'env_config': env_config
})

agent = PPOTrainer(config, env=MyEnv)

for i in range(1, 11):
    result = agent.train()
    print(f"{i:02d} reward: {result['episode_reward_mean']:>6.1f}")
#     print(pretty_print(result))

env = MyEnv(env_config)
obs = env.reset()
for i in range(1000):
    action = agent.compute_single_action(obs)
    obs, reward, done, info = env.step(action)
    env.render()
    if done:
        obs = env.reset()

env.close()
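
Side note: register_env is imported above but unused; registering the env under a string id should be equivalent to passing the class directly (a sketch; 'my_env' is an arbitrary name):

register_env('my_env', lambda cfg: MyEnv(cfg))
config['env'] = 'my_env'
# agent = PPOTrainer(config)  # should behave the same as PPOTrainer(config, env=MyEnv)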


################# Stable-Baselines3
from stable_baselines3 import PPO

env = MyEnv(env_config)
print(env.observation_space)

model = PPO("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=20000)

obs = env.reset()
for i in range(1000):
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    env.render()
    if done:
        obs = env.reset()

env.close()