I started from the CartPole client-server example. I'm working on a multi-agent game where I want one algorithm controlling some of the agents and another algorithm controlling the others. These agents interact in a common game (think "agent" in the agent-based-simulation sense, not agent = RL algorithm).
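For concreteness, I'll write the rest as if the agent ids have already been split into two groups, one per server. The ids below are just placeholders for whatever your simulation actually produces:
# Hypothetical agent-id split: which server is responsible for which agents.
agents_with_client_1 = ["agent_0", "agent_1"]  # handled by server 1 (PPO)
agents_with_client_2 = ["agent_2", "agent_3"]  # handled by server 2 (A2C)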
The client:
from ray.rllib.env.policy_client import PolicyClient

# Make two client objects, one for each server. Make sure they use different ports.
client_1 = PolicyClient("http://localhost:9900", inference_mode="local")
client_2 = PolicyClient("http://localhost:9910", inference_mode="local")
# Import your sim. I am using custom MultiAgentEnv simulations that I made myself,
# but this should work with any RLlib environment that can work with client-server.
import sim  # Pseudocode ;)

for _ in range(num_episodes):  # num_episodes: however many episodes you want to generate
    # Start data generation.
    obs = sim.reset()
    # Each client has to start its own episode.
    eid_1 = client_1.start_episode(training_enabled=True)
    eid_2 = client_2.start_episode(training_enabled=True)
    while True:
        # Combine actions from the two servers. Notice that I only give client 1 the
        # observations that are associated with that server, so that it only reports
        # actions for those agents. This is important because everything has to sync
        # up correctly. (Indexing a per-agent dict with a list of agent ids is
        # pseudocode here; see the helper sketch below.)
        action = {
            **client_1.get_action(eid_1, obs[agents_with_client_1]),
            **client_2.get_action(eid_2, obs[agents_with_client_2]),
        }
        # The actions are passed to the simulation, so all the agents interact in each step.
        obs, reward, done, info = sim.step(action)
        client_1.log_returns(eid_1, reward[agents_with_client_1])
        client_2.log_returns(eid_2, reward[agents_with_client_2])
        if done["__all__"]:
            break
    client_1.end_episode(eid_1, obs[agents_with_client_1])
    client_2.end_episode(eid_2, obs[agents_with_client_2])
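A note on the obs[agents_with_client_1] shorthand: the MultiAgentEnv dicts are keyed by agent id, so in practice I filter them with a small helper, something like this (the name is just for illustration):
def subset(agent_dict, agent_ids):
    # Keep only the entries of a per-agent dict that belong to the given agent ids.
    return {aid: val for aid, val in agent_dict.items() if aid in agent_ids}

# e.g. obs[agents_with_client_1] above really means:
# subset(obs, agents_with_client_1)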
Server 1:
# Server 1 will train with PPO.
import ray
from ray.rllib.agents import ppo
from ray.tune.logger import pretty_print

def _input(ioctx):
    # PolicyServerInputMA (not shown here) plays the role of RLlib's PolicyServerInput
    # for this multi-agent setup. Server 1 listens on port 9900, matching client_1.
    if ioctx.worker_index > 0 or ioctx.worker.num_workers == 0:
        return PolicyServerInputMA(
            ioctx,
            "localhost",
            9900,
            idle_timeout=3.0
        )
    # No InputReader (PolicyServerInput) needed.
    else:
        return None

# Here I have this algorithm training multiple policies.
# observation_space / action_space are the per-agent spaces your sim uses.
policies = {
    "policy_1": (None, observation_space, action_space, {}),
    "policy_2": (None, observation_space, action_space, {}),
}

def policy_mapping_fn(agent_id):
    ...  # Map agent ids to policy ids as you would for any multi-agent game (a sketch follows below).

ray.init()

config = {
    # Use the connector server to generate experiences.
    "input": _input,
    "env": None,
    "multiagent": {
        "policies": policies,
        "policy_mapping_fn": policy_mapping_fn,
        "count_steps_by": "agent_steps",
    },
    "num_workers": 0,
}
trainer = ppo.PPOTrainer(config=config)

# Serving and training loop.
ts = 0
for _ in range(args.stop_iters):
    results = trainer.train()
    print(pretty_print(results))
    if results["episode_reward_mean"] >= args.stop_reward or ts >= args.stop_timesteps:
        break
    ts += results["timesteps_total"]
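For the policy_mapping_fn on server 1, here is a minimal sketch, assuming agent ids of the form "agent_0", "agent_1", ... with even ids going to policy_1 and odd ids to policy_2. This is purely illustrative; use whatever split makes sense for your game:
def policy_mapping_fn(agent_id):
    # Illustrative only: assumes agent ids like "agent_0", "agent_1", ...
    idx = int(agent_id.split("_")[-1])
    return "policy_1" if idx % 2 == 0 else "policy_2"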
Server 2:
# Server 2 will train with A2C (A2CTrainer lives under the a3c module in RLlib).
import ray
from ray.rllib.agents.a3c import a2c
from ray.tune.logger import pretty_print

def _input(ioctx):
    # Same setup as server 1, but listening on port 9910 to match client_2.
    if ioctx.worker_index > 0 or ioctx.worker.num_workers == 0:
        return PolicyServerInputMA(
            ioctx,
            "localhost",
            9910,
            idle_timeout=3.0
        )
    # No InputReader (PolicyServerInput) needed.
    else:
        return None

# This algorithm only trains a single policy, but I use the multi-agent setup anyway so
# that the expected inputs and outputs follow the same format (dicts with agent ids as
# keys and obs/actions as values).
policies = {
    "policy_1": (None, observation_space, action_space, {}),
}

def policy_mapping_fn(agent_id):
    return "policy_1"

ray.init()

config = {
    # Use the connector server to generate experiences.
    "input": _input,
    "env": None,
    "multiagent": {
        "policies": policies,
        "policy_mapping_fn": policy_mapping_fn,
        "count_steps_by": "agent_steps",
    },
    "num_workers": 0,
}
trainer = a2c.A2CTrainer(config=config)

# Serving and training loop.
ts = 0
for _ in range(args.stop_iters):
    results = trainer.train()
    print(pretty_print(results))
    if results["episode_reward_mean"] >= args.stop_reward or ts >= args.stop_timesteps:
        break
    ts += results["timesteps_total"]
Then just run the two server scripts and the client script on the same machine as separate processes (servers first, so the client has something to connect to).
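If it helps, here is one way to launch all three from a single script; the file names server_1.py, server_2.py, and client.py are just placeholders for wherever you saved the snippets above:
import subprocess
import time

# Hypothetical file names; adjust to match your own scripts.
servers = [subprocess.Popen(["python", "server_1.py"]),
           subprocess.Popen(["python", "server_2.py"])]
time.sleep(10)  # give both servers a moment to come up before the client connects
client = subprocess.Popen(["python", "client.py"])
client.wait()
for s in servers:
    s.terminate()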
Like I said, I’m still debugging it, but hopefully it works. Please let me know if you have questions or if you get it to work for you!