I've been trying to test the PPO algorithm on a custom environment, the Tiger Problem in text form. I don't understand what is wrong with the custom environment; PPO runs fine on the stock Taxi-v3 env.
These are the library versions:
gymnasium: 0.28.1
ray: 2.9.1
torch: 2.2.0
I'm running the code in a Jupyter notebook.
import gymnasium as gym
from gymnasium.spaces import Discrete, Box
from gymnasium import spaces
from gymnasium.envs import register
import numpy as np
import random
import time
import sys
sys.modules[__name__]
import matplotlib.pyplot as plt
from IPython.display import clear_output
from time import sleep
OBS_START = [0]
OBS_GROWL_LEFT = [1]
OBS_GROWL_RIGHT = [2]
OBS_MAP = {
    OBS_START[0]: 'START',
    OBS_GROWL_LEFT[0]: 'GROWL_LEFT',
    OBS_GROWL_RIGHT[0]: 'GROWL_RIGHT',
}
ACTION_NONE = -1
ACTION_OPEN_LEFT = 0
ACTION_OPEN_RIGHT = 1
ACTION_LISTEN = 2
ACTION_MAP = {
    ACTION_OPEN_LEFT: 'OPEN_LEFT',
    ACTION_OPEN_RIGHT: 'OPEN_RIGHT',
    ACTION_LISTEN: 'LISTEN',
    ACTION_NONE: 'NONE',
}
class TigerEnv(gym.Env):
    metadata = {'render.modes': ['human'],
                'render_modes': ['human']}
    def __init__(self, reward_tiger=-100, reward_gold=10, reward_listen=-1,
                 obs_accuracy=.85, max_steps_per_episode=100):
        self.reward_tiger = reward_tiger
        self.reward_gold = reward_gold
        self.reward_listen = reward_listen
        self.obs_accuracy = obs_accuracy
        self.max_steps_per_episode = max_steps_per_episode
        self.curr_episode = -1  # Set to -1 b/c reset() adds 1 to episode
        self.action_episode_memory = []
        self.observation_episode_memory = []
        self.reward_episode_memory = []
        self.curr_step = 0
        self.reset()
        # LISTEN, OPEN_LEFT, OPEN_RIGHT
        self.action_space = spaces.Discrete(3)
        # GROWL_LEFT, GROWL_RIGHT, START
        self.observation_space = spaces.Discrete(3)
    def step(self, action):
        done = self.curr_step >= self.max_steps_per_episode
        if done:
            raise RuntimeError("Episode is done")
        self.curr_step += 1
        should_reset = self.take_action(action)
        done = self.curr_step >= self.max_steps_per_episode
        reward = self.get_reward()
        self.action_episode_memory[self.curr_episode].append(action)
        obs = self.get_obs()
        self.observation_episode_memory[self.curr_episode].append(obs)
        self.reward_episode_memory[self.curr_episode].append(reward)
        if should_reset:
            self.step_reset()
        infos = {}
        return obs, reward, done, infos
    def reset(self, *, seed=None, options=None):
        if seed is not None:
            np.random.seed(seed)
        self.curr_step = 0
        self.curr_episode += 1
        self.left_door_open = False
        self.right_door_open = False
        self.tiger_left = np.random.randint(0, 2)
        self.tiger_right = 1 - self.tiger_left
        initial_obs = OBS_START
        self.action_episode_memory.append([-1])
        self.observation_episode_memory.append([initial_obs])
        self.reward_episode_memory.append([0])
        infos = {}
        return initial_obs, infos
    def render(self, mode='human'):
        return
    def close(self):
        pass
    def translate_obs(self, obs):
        if obs[0] not in OBS_MAP:
            raise ValueError('Invalid observation: {}'.format(obs))
        else:
            return OBS_MAP[obs[0]]
    def translate_action(self, action):
        return ACTION_MAP[action]
    def take_action(self, action):
        should_reset = False
        if action == ACTION_OPEN_LEFT:
            self.left_door_open = True
            should_reset = True
        elif action == ACTION_OPEN_RIGHT:
            self.right_door_open = True
            should_reset = True
        elif action == ACTION_LISTEN:
            pass
        else:
            raise ValueError('Invalid action ', action)
        return should_reset
    def get_reward(self):
        if not (self.left_door_open or self.right_door_open):
            return self.reward_listen
        if self.left_door_open:
            if self.tiger_left:
                return self.reward_tiger
            else:
                return self.reward_gold
        if self.right_door_open:
            if self.tiger_right:
                return self.reward_tiger
            else:
                return self.reward_gold
        raise ValueError('Unreachable state reached.')
    def get_obs(self):
        last_action = self.action_episode_memory[self.curr_episode][-1]
        if last_action != ACTION_LISTEN:
            # Return accurate observation, but this won't be informative, since
            # the tiger will be reset afterwards.
            if self.tiger_left:
                return OBS_GROWL_LEFT
            else:
                return OBS_GROWL_RIGHT
        # Return accurate observation
        if np.random.rand() < self.obs_accuracy:
            if self.tiger_left:
                return OBS_GROWL_LEFT
            else:
                return OBS_GROWL_RIGHT
        # Return inaccurate observation
        else:
            if self.tiger_left:
                return OBS_GROWL_RIGHT
            else:
                return OBS_GROWL_LEFT
    def step_reset(self):
        # Make sure doors are closed
        self.left_door_open = False
        self.right_door_open = False
        self.tiger_left = np.random.randint(0, 2)
        self.tiger_right = 1 - self.tiger_left
register(id='Tiger-v0',
         entry_point="__main__:TigerEnv")
env = gym.make("Tiger-v0")
from ray.rllib.algorithms.ppo import PPOConfig
from ray import tune
tune.register_env("Tiger-v0", lambda config: TigerEnv())
config = PPOConfig()
config = config.training(gamma=0.99, lr=0.01, kl_coeff=0.3, train_batch_size=128)
config = config.resources(num_gpus=0)
config = config.rollouts(num_rollout_workers=1)
algo = config.build(env="Tiger-v0")
result = algo.train()
Error Message:
2024-02-05 00:00:13,573 ERROR actor_manager.py:506 -- Ray error, taking actor 1 out of service. The actor died because of an error raised in its creation task, ray::RolloutWorker.__init__() (pid=3606436, ip=192.168.0.185, actor_id=a452bc5d58d357be084549e701000000, repr=<ray.rllib.evaluation.rollout_worker.RolloutWorker object at 0x7f19e9028be0>)
ValueError: The two structures don't have the same nested structure.
First structure: type=list str=[0]
Second structure: type=int64 str=0
More specifically: Substructure "type=list str=[0]" is a sequence, while substructure "type=int64 str=0" is not
During handling of the above exception, another exception occurred:
ray::RolloutWorker.__init__() (pid=3606436, ip=192.168.0.185, actor_id=a452bc5d58d357be084549e701000000, repr=<ray.rllib.evaluation.rollout_worker.RolloutWorker object at 0x7f19e9028be0>)
…
(RolloutWorker pid=3606436) Entire second structure:
(RolloutWorker pid=3606436) .
(RolloutWorker pid=3606436)
(RolloutWorker pid=3606436) The above error has been found in your environment! We've added a module for checking your custom environments. It may cause your experiment to fail if your environment is not set up correctly. You can disable this behavior via calling config.environment(disable_env_checking=True). You can run the environment checking module standalone by calling ray.rllib.utils.check_env([your env]).
End of Error
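As the last part of the error suggests, the environment checker can also be run on its own before building PPO. Here is a minimal sketch, assuming the TigerEnv class from the cell above is available in the same notebook session and that check_env is importable from ray.rllib.utils as the error message indicates:

from ray.rllib.utils import check_env

# Run RLlib's standalone environment checker (referenced in the error above)
# against a fresh instance of the custom env; it validates that the values
# returned by reset()/step() match the declared observation space.
check_env(TigerEnv())

This should surface the same kind of nested-structure ValueError without starting any rollout workers, which makes it easier to iterate on the env from the notebook.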
How severely does this issue affect your experience of using Ray?
- High: It blocks me from completing my task.