Hello everyone
I am trying to run a PPO algorithm on a multi-agent environment. The agents' action and observation spaces are continuous and defined as gym.Box spaces.
The environment definition is as follows:
import gym
#from gym.spaces import Discrete, MultiDiscrete
import numpy as np
import random
from ray.rllib.env.multi_agent_env import MultiAgentEnv
class MultiUnitAuction(MultiAgentEnv):
    def __init__(self, config=None):
        config = config or {}
        # per-agent marginal valuations (one entry per agent)
        self.marginal_valuation_vector = [1, 3, 4, 4, 4, 71, 7, 56, 88, 16, 25, 32, 39, 12, 48, 10, 40, 14, 12, 30]
        #self.marginal_valuation_vector = [5, 10, 12, 15, 20]
        #define a counter to end the episode
        #self.counter = 0
        # number of agents
        self.num_agents = 20
        # block size: the clearing price is the block_size-th highest bid
        self.block_size = 7
        # market clearing price
        self.p_c = 0
        # average accepted bids
        #define agents
        #self.agents = [User(i, max(marginal_valuation_vector)) for i in marginal_valuation_vector]
        #self.dones = set()
        # one 1-D Box per agent: the observation is the clearing price in [0, max valuation]
        self.observation_space = gym.spaces.Dict({
            #"agent_" + str(i): gym.spaces.Box(low=np.float32(0), high=np.float32(max(self.marginal_valuation_vector)), shape=(1,), dtype=np.float32) for i in range(0, self.num_agents)
            "agent_" + str(i): gym.spaces.Box(low=np.array([0]), high=np.array([max(self.marginal_valuation_vector)]), dtype=np.float32)
            for i in range(0, self.num_agents)
        })
        # one 1-D Box per agent: a bid bounded by that agent's own valuation
        self.action_space = gym.spaces.Dict({
            #"agent_" + str(i): gym.spaces.Box(low=np.float32(0), high=np.float32(self.marginal_valuation_vector[i]), shape=(1,), dtype=np.float32) for i in range(0, self.num_agents)
            "agent_" + str(i): gym.spaces.Box(low=np.array([0]), high=np.array([self.marginal_valuation_vector[i]]), dtype=np.float32)
            for i in range(0, self.num_agents)
        })
        self.reset()
    def reset(self):
        """Returns initial observation of next(!) episode."""
        # Return the initial observation in the new episode.
        return self._get_obs()
    def step(self, action: dict):
        """
        Returns (next observation, rewards, dones, infos) after having taken
        the given actions, e.g.
        `action={"agent_0": action_for_agent_0, "agent_1": action_for_agent_1, ...}`
        """
        # collect all bids
        bids = []
        for i in range(0, self.num_agents):
            bids.append(action["agent_" + str(i)])
        # the clearing price is the block_size-th highest bid
        self.p_c = sorted(bids)[len(bids) - self.block_size]
        # get observations (based on the new clearing price)
        obs = self._get_obs()
        # calculate rewards: losing bids get 0, winning bids get valuation minus bid
        r = []
        for i in range(0, self.num_agents):
            if bids[i] < self.p_c:
                r.append(0)
            else:
                r.append(self.marginal_valuation_vector[i] - bids[i])
        rewards = {
            "agent_" + str(i): float(r[i]) for i in range(0, self.num_agents)
        }
        is_done = True
        # Generate a `done` dict (per-agent and total); the episode ends after one round of bidding.
        dones = {"agent_" + str(i): is_done for i in range(0, self.num_agents)}
        # special `__all__` key indicates that the episode is done for all agents.
        dones["__all__"] = is_done
        return obs, rewards, dones, {}  # <- info dict (not needed here).
    def _get_obs(self):
        """
        Returns the obs dict, mapping each agent name to the current
        market clearing price.
        """
        return {
            "agent_" + str(i): [self.p_c] for i in range(0, self.num_agents)
        }
    def render(self, mode=None):
        pass
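For reference, this is roughly how I sanity-check the environment by hand outside of RLlib (the random .sample() actions are just placeholders for whatever the trainer would actually send):

env = MultiUnitAuction()
obs = env.reset()

# one random bid per agent, drawn from that agent's own Box space
actions = {aid: env.action_space[aid].sample() for aid in env.action_space.spaces}
obs, rewards, dones, infos = env.step(actions)

# compare what step() returns against the declared per-agent space
print(obs["agent_0"])
print(env.observation_space["agent_0"])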
The main problem comes from the _get_obs() function. When I define it as:
def _get_obs(self):
    """
    Returns the obs dict, mapping each agent name to the current
    market clearing price.
    """
    return {
        "agent_" + str(i): [self.p_c] for i in range(0, self.num_agents)
    }
I am getting the following error:
ValueError: ('Observation ({} dtype={}) outside given space ({})!', array([[6.075556]], dtype=float32), dtype('float32'), Box([0.], [88.], (1,), float32))
So, somehow the returned observation does not match the defined space, even though I have checked the environment many times. I used the .contains() function and it always reports that my values are inside the observation space.
Am I using it right?
How should I return the state variables?
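For example, this is the kind of check I ran (the hard-coded 6.07 is just a stand-in for the clearing price from the error above, so I may well be constructing the test value differently from what the environment actually returns):

import numpy as np

env = MultiUnitAuction()
space = env.observation_space["agent_0"]  # Box([0.], [88.], (1,), float32)
# a hand-built 1-D observation with the same value as in the error message
print(space.contains(np.array([6.07], dtype=np.float32)))  # -> True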
Also, the trainer config is as follows:
from ray.rllib.agents.ppo import PPOTrainer

game = MultiUnitAuction()

# one policy per agent, each with that agent's own observation and action space
policies = {
    "policy_" + str(i): (None, game.observation_space["agent_" + str(i)], game.action_space["agent_" + str(i)], {"gamma": 0.8})
    for i in range(0, game.num_agents)
}

def policy_mapping_fn(agent_id: str):
    # map "agent_<i>" to "policy_<i>"
    return "policy_" + agent_id.split("_")[-1]

config = {
    "env": MultiUnitAuction,  # "my_env" <- if we previously have registered the env with `tune.register_env("[name]", lambda config: [returns env object])`.
    "framework": "torch",
    "create_env_on_driver": True,
    "multiagent": {
        "policies": policies,
        "policy_mapping_fn": policy_mapping_fn,
    },
}
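and I launch training roughly like this (a minimal sketch; the ray.init() arguments and the number of iterations are just placeholders):

import ray

ray.init()
trainer = PPOTrainer(config=config)
for _ in range(5):
    result = trainer.train()
    print(result["episode_reward_mean"])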
Thanks