In theory it should work, but doesn’t ;(
Hi @lady,
Can you provide more information on the error you are getting, and a reproduction script?
Hi!
Thanks for answering so fast! Of course - here is the full script: sopt/MARL-Issue.ipynb at main · lady-pandas/sopt · GitHub (it's very simple). As you can see, the model is learning (each iteration is better), but inference on the already trained model fails miserably.
Almost the same code for a single agent works without any problems.
Best,
Marta
Hey @lady, thanks for sharing this finding. It's indeed strange. It could be that the action normalization or exploration settings are wrong for the inference loop. I'll take a look.
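(Side note on the exploration hypothesis: Trainer.compute_single_action accepts an explore flag, so you can rule out stochastic exploration at inference time by passing explore=False, e.g. with the trainer, agent_obs and policy_id from the script further down. As it turns out below, exploration was not the issue here.)

    action[agent_id] = trainer.compute_single_action(agent_obs, policy_id=policy_id, explore=False)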
Found the problem: We need to fix the default values of the unsquash_action (and also clip_action, but that doesn't matter here) args of the Trainer.compute_single_action() method.
By default, unsquash_action=None, so the actions computed by the underlying policy are still in their normalized state (somewhere between -1.0 and 1.0). Your env's action space has a range of 0.0 to 1.0, so you often get really wrong values (e.g. negative ones) as actions, and hence the bad reward.
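To make the mismatch concrete, here is roughly what the unsquashing step does for a bounded Box space (a minimal sketch for illustration, not RLlib's internal code verbatim):

    def unsquash(normalized_action, low=0.0, high=1.0):
        # Map a policy output from the normalized range [-1.0, 1.0]
        # back into the env's Box range [low, high].
        return low + (normalized_action + 1.0) * (high - low) / 2.0

    print(unsquash(-0.5))  # -> 0.25, a valid action for Box(0, 1)
    # Without this step, -0.5 would be sent to the env as-is, i.e. a negative
    # fraction of water demanded, which the reward function penalizes.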
A quick fix is to simply set the unsquash_action arg in your Trainer.compute_single_action() call to True:
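For example, inside the inference loop (with trainer, agent_obs and policy_id as defined in the full script below):

    action[agent_id] = trainer.compute_single_action(
        agent_obs,
        policy_id=policy_id,
        unsquash_action=True,  # map the normalized policy output back into Box(0, 1)
    )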
Fully working script:
from ray.rllib.env import MultiAgentEnv
from ray.tune.registry import register_env
from ray.rllib.agents.ppo import PPOTrainer
import ray
from ray import tune
import gym
import numpy as np
class IrrigationEnv(MultiAgentEnv):
    def __init__(self, return_agent_actions=False, part=False):
        self.num_agents = 5
        self.observation_space = gym.spaces.Box(low=200, high=800, shape=(1,))
        self.action_space = gym.spaces.Box(low=0, high=1, shape=(1,))

    def reset(self):
        obs = {}
        self.water = np.random.uniform(200, 800)
        for i in range(self.num_agents):
            obs[i] = np.array([self.water])
        return obs

    def cal_rewards(self, action_dict):
        self.curr_water = self.water
        reward = 0
        for i in range(self.num_agents):
            water_demanded = self.water * action_dict[i][0]
            if self.curr_water == 0:
                # No water is left in stream
                reward -= water_demanded * 100  # Penalty
            elif self.curr_water - water_demanded < 0:
                # Water in stream is less than water demanded, withdraw all left
                water_needed = water_demanded - self.curr_water
                water_withdrawn = self.curr_water
                self.curr_water = 0
                reward += -water_withdrawn**2 + 200 * water_withdrawn
                reward -= water_needed * 100  # Penalty
            else:
                # Water in stream is more than water demanded, withdraw water demanded
                self.curr_water -= water_demanded
                water_withdrawn = water_demanded
                reward += -water_withdrawn**2 + 200 * water_withdrawn
        return reward

    def step(self, action_dict):
        obs, rew, done, info = {}, {}, {}, {}
        reward = self.cal_rewards(action_dict)
        for i in range(self.num_agents):
            obs[i], rew[i], done[i], info[i] = np.array([self.curr_water]), reward, True, {}
        done["__all__"] = True
        return obs, rew, done, info

def env_creator(_):
    return IrrigationEnv()

single_env = IrrigationEnv()
env_name = "IrrigationEnv"
register_env(env_name, env_creator)

# Get environment obs, action spaces and number of agents
obs_space = single_env.observation_space
act_space = single_env.action_space
num_agents = single_env.num_agents

# Create a policy mapping (one policy per agent)
def gen_policy():
    return (None, obs_space, act_space, {})

policy_graphs = {}
for i in range(num_agents):
    policy_graphs['agent-' + str(i)] = gen_policy()

# Policy mapping fn using the new `(agent_id, episode, worker, **kwargs)` signature
def policy_mapping_fn(agent_id, episode, worker, **kwargs):
    return 'agent-' + str(agent_id)

config = {
    "log_level": "WARN",
    "num_workers": 1,
    "num_cpus_for_driver": 1,
    "num_cpus_per_worker": 1,
    "num_sgd_iter": 10,
    "train_batch_size": 128,
    "lr": 5e-3,
    "model": {"fcnet_hiddens": [8, 8]},
    "rollout_fragment_length": 128,
    "multiagent": {
        "policies": policy_graphs,
        "policy_mapping_fn": policy_mapping_fn,
    },
    "env": "IrrigationEnv",
}

trainer = PPOTrainer(config=config)

for i in range(15):
    results = trainer.train()
    print(f"Iter: {i}; avg. reward={results['episode_reward_mean']}")

# Inference:
env = IrrigationEnv()
obs = env.reset()
done = False
total_reward = 0.0

while True:
    action = {}
    for agent_id, agent_obs in obs.items():
        policy_id = config['multiagent']['policy_mapping_fn'](agent_id, None, None)
        action[agent_id] = trainer.compute_single_action(agent_obs, policy_id=policy_id, unsquash_action=True)
    obs, reward, done, info = env.step(action)
    done = done['__all__']
    total_reward += sum(reward.values())
    if done:
        print(f"total-reward={total_reward}")
        obs = env.reset()
        total_reward = 0.0
I'll create a fix for this problem. Thanks again for raising this, @lady!
Hi,
Thanks so much, the proposed fix helped!
Best,
Marta