How severely does this issue affect your experience of using Ray?
- High: It blocks me from completing my task.
I have implemented a multi-agent environment that inherits from the MultiAgentEnv class. The code defining the class is as follows:
import numpy as np
from numpy.random import default_rng
from gymnasium import spaces
from ray.rllib.env.multi_agent_env import MultiAgentEnv

class NPIComplianceEnv(MultiAgentEnv):
    # metadata = {'render.modes': ['human']}

    def __init__(self, num_agents=5):
        super(NPIComplianceEnv, self).__init__()
        self.num_agents = num_agents
        self.action_space = spaces.Discrete(3)  # Actions: 0: stay home, 1: go to public places, 2: go to work
        # Observation for each agent includes its own state and a view of the states of other agents
        self.observation_space = spaces.Dict({
            "Health_status": spaces.Discrete(5),  # SEIR+D model states
            "LSI_level": spaces.Discrete(4),
            "Nearby_infected": spaces.Box(low=0, high=num_agents, shape=(1,), dtype=np.int32)
        })
        self.states = {
            "Health_status": np.zeros(self.num_agents, dtype=int),
            "LSI_level": np.zeros(self.num_agents, dtype=int),
            "Nearby_infected": np.zeros((self.num_agents, 1), dtype=int)
        }
        self.rng = default_rng()
        print('============Initialization Done!==================')
        print("========== Observations:")
        print(type(self.observation_space))
        print(self.observation_space)
        print("========== States:")
        print(type(self.states))
        print(self.states)

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        # Initialize the states for all agents
        self.states = {
            "Health_status": self.rng.integers(0, 5, self.num_agents),
            "LSI_level": self.rng.integers(0, 4, self.num_agents),
            "Nearby_infected": self.rng.integers(0, 100, (self.num_agents, 1))  # Example data
        }
        print('===============The Environment was reset!===============')
        return self.states, {}

    def step(self, actions):
        print('==============Starting The Training================')
        rewards = np.zeros(self.num_agents)
        done = False
        info = {}
        print('==============================')
        print(type(self.states))
        print('==============================')
        # Update health statuses based on interactions and actions
        for i in range(self.num_agents):
            action = actions[i]
            if action == 1:  # Going to public places
                infection_risk = 0.01  # Simplified infection risk
                infected = self.rng.random() < infection_risk
                rewards[i] -= 10 if infected else 0  # Penalize getting infected
                self.states['Health_status'][i] = 2 if infected else self.states['Health_status'][i]
        # Update the global state based on actions
        print('==============Updating global state================')
        global_infection_update = np.sum(actions == 1) * 0.01  # Simplified global effect
        print(global_infection_update)
        self.states['LSI_level'] = (self.states['LSI_level'] + global_infection_update) % 4
        print('==============================')
        return self.states, rewards, done, False, info

    def render(self, mode='human'):
        print(f"Current states: {self.states}")

    def close(self):
        pass
    def seed(self, seed=None):
        # Re-seed the environment's random number generator
        self.rng = default_rng(seed)
The problem is that when trying to train the agents, the following error occurs:
============Initialization Done!==================
========== Observations:
<class 'gymnasium.spaces.dict.Dict'>
Dict('Health_status': Discrete(5), 'LSI_level': Discrete(4), 'Nearby_infected': Box(0, 5, (1,), int32))
========== States:
<class ‘dict’>
{'Health_status': array([0, 0, 0, 0, 0]), 'LSI_level': array([0, 0, 0, 0, 0]), 'Nearby_infected': array([[0],
[0],
[0],
[0],
[0]])}
2024-06-03 16:45:54,494 WARNING util.py:61 -- Install gputil for GPU system monitoring.
2024-06-03 16:45:54,500 ERROR actor_manager.py:523 -- Ray error, taking actor 1 out of service. ray::RolloutWorker.apply() (pid=2843903, ip=128.40.86.32, actor_id=8e23bcd3a9157fb746fa7e9c01000000, repr=<ray.rllib.evaluation.rollout_worker.RolloutWorker object at 0x7f87a5ca5e10>)
ValueError: The two structures don't have the same nested structure.
First structure: type=ndarray str=[2 3 0 4 0]
Second structure: type=OrderedDict str=OrderedDict([('Health_status', 4), ('LSI_level', 2), ('Nearby_infected', array([0], dtype=int32))])
More specifically: Substructure "type=OrderedDict str=OrderedDict([('Health_status', 4), ('LSI_level', 2), ('Nearby_infected', array([0], dtype=int32))])" is a sequence, while substructure "type=ndarray str=[2 3 0 4 0]" is not
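To make the mismatch concrete: the first structure in the error is one of the per-population arrays my env returns (all five agents' Health_status values), while the second looks like a single-agent sample drawn from the declared observation space. The small sanity check below (illustrative only, run against the class above) reproduces the comparison outside of RLlib:

# Sanity check (illustrative): compare what the declared space accepts
# with what reset() actually returns.
env = NPIComplianceEnv(num_agents=5)

print(env.observation_space.sample())
# -> a single-agent style dict, e.g.
#    OrderedDict([('Health_status', 4), ('LSI_level', 2),
#                 ('Nearby_infected', array([0], dtype=int32))])

obs, info = env.reset()
print(obs)
# -> one dict whose values are arrays covering all five agents, e.g.
#    {'Health_status': array([2, 3, 0, 4, 0]), 'LSI_level': array([1, 0, 3, 2, 1]), ...}

print(env.observation_space.contains(obs))
# -> False, because each value spans all agents instead of a single agent's observation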
The main function contains:
from ray.rllib.algorithms import ppo

agent = (ppo.PPOConfig()
         .environment(env="npi_compliance_env", env_config={"num_agents": 5})
         .multi_agent(policies={"policy_0": (None,
                                             NPIComplianceEnv().observation_space,
                                             NPIComplianceEnv().action_space,
                                             {})},
                      policy_mapping_fn=lambda agent_id, episode, worker, **kwargs: "policy_0")
         .resources(num_gpus=1)
         .framework("torch")
         .training(train_batch_size=4000)  # Example of additional config
         .build()
         )

for i in range(10):
    result = agent.train()
    print(f"Iteration {i}: reward = {result['episode_reward_mean']}")
Could you please let me know why the returned states and the observation space don't have the same structure, and how this can be fixed?