Multi-agent training issues

How severely does this issue affect your experience of using Ray?

  • High: It blocks me from completing my task.

Hello, I just started (8 hours ago) moving from SB3 and PettingZoo to RLlib, because I hear great things about it once you get it working. I am having some issues with a custom multi-agent RLlib env I built, which I based on https://github.com/ray-project/ray/blob/master/rllib/examples/env/multi_agent.py.

Running the following code at the bottom of my env file works:

if __name__ == "__main__":
    num_agents = 2
    env = CustomMultiAgentEnv()
    # Gymnasium-style reset: returns (observations, infos).
    observations, infos = env.reset()
    while True:
        # Sample a random action for every agent.
        actions = {f"agent_{i}": env.action_space.sample() for i in range(num_agents)}
        observations, rewards, terminated, truncated, infos = env.step(actions)
        print(f"Observations: {observations}, Rewards: {rewards}, terminated: {terminated}, truncated: {truncated}")
        if all(truncated.values()) or all(terminated.values()):
            break

An example output from a single step is…

Observations: {'agent_0': [0.0642681013628425, 1, 0.09016049960829728, -0.01765509675238516, -7.225758699782538e-05, 0.03709053033458642, 0, 0, 94.22682344171437, -0.16013091315591885, -0.05798713945949394], 'agent_1': [0.14218447795654465, 1, 0.15722279679643467, -0.003532692514178509, 4.265491059653892e-05, 0.024667761852378383, 0, 0, 94.22718347629973, 0.8398697810502177, 0.05798691539824818]}, Rewards: {'agent_0': -2.5942241589647237, 'agent_1': -2.363275916592812}, terminated: {'agent_0': False, 'agent_1': False, 'all': False}, truncated: {'agent_0': False, 'agent_1': False, 'all': False}
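For reference, this is the per-step dict structure I understand RLlib's MultiAgentEnv wants back. One thing I'm not sure about: the printout above shows an 'all' key, while as far as I know RLlib looks specifically for '__all__'. A sketch of step() in that shape (the _get_obs/_get_reward helpers are placeholders, not my real code):

def step(self, action_dict):
    # Per-agent dicts keyed by agent id; _get_obs/_get_reward are hypothetical helpers.
    obs = {aid: self._get_obs(aid) for aid in self._agent_ids}
    rewards = {aid: self._get_reward(aid) for aid in self._agent_ids}
    terminateds = {aid: False for aid in self._agent_ids}
    truncateds = {aid: False for aid in self._agent_ids}
    infos = {aid: {} for aid in self._agent_ids}
    # RLlib checks the special "__all__" key to decide when the episode is over.
    terminateds["__all__"] = all(terminateds.values())
    truncateds["__all__"] = all(truncateds.values())
    return obs, rewards, terminateds, truncateds, infos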

Now, the env trains using this example I found in the docs… (though I believe this sets it up as single-agent)

import os

import ray
from ray import tune
from ray.rllib.algorithms.ppo import PPOConfig
from ray.tune.registry import register_env

# Import my custom multi-agent environment.
from CustRayEnv import CustomMultiAgentEnv


def env_creator(args):
    env = CustomMultiAgentEnv()
    # Compatibility wrappers I tried and commented out again:
    # env = MultiAgentEnvWrapper(env)
    # env = EnvCompatibility(env)  # gymnasium.wrappers.EnvCompatibility
    return env


if __name__ == "__main__":
    ray.init()

    env_name = "CustomRayEnv"

    # env_creator() already returns a CustomMultiAgentEnv, so register it directly.
    register_env(env_name, env_creator)
    
    config = (
        PPOConfig()
        .environment(env=env_name, clip_actions=True, disable_env_checking=False)
        .rollouts(num_rollout_workers=8, rollout_fragment_length=128)
        .training(
            train_batch_size=512,
            lr=2e-5,
            gamma=0.99,
            lambda_=0.9,
            use_gae=True,
            clip_param=0.4,
            grad_clip=None,
            entropy_coeff=0.1,
            vf_loss_coeff=0.25,
            sgd_minibatch_size=64,
            num_sgd_iter=10,
        )
        .debugging(log_level="ERROR")
        .framework(framework="torch")
        .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0")))
    )

    tune.run(
        "PPO",
        name="PPO",
        stop={"timesteps_total": 5000000 if not os.environ.get("CI") else 50000},
        checkpoint_freq=10,
        local_dir="~/ray_results/" + env_name,
        config=config.to_dict(),
    )
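If this first script really does train everything as one default policy, my understanding is that the multi-agent part could be added to the same PPOConfig, something like this (a sketch, untested, assuming the agent ids "agent_0"/"agent_1" from the printout above and one policy per agent):

# Sketch: possible multi-agent block for the PPOConfig above (untested).
config = config.multi_agent(
    policies={"policy_0", "policy_1"},
    # Map "agent_0" -> "policy_0", "agent_1" -> "policy_1".
    policy_mapping_fn=lambda agent_id, *args, **kwargs: agent_id.replace("agent", "policy"),
)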

Separately, I'm trying to get it to run using this code, based on the docs…

"""Simple example of setting up an agent-to-module mapping function.

How to run this script
----------------------
`python [script file name].py --enable-new-api-stack --num-agents=2`
python RayTest.py --enable-new-api-stack --num-agents=2
Control the number of agents and policies (RLModules) via --num-agents and
--num-policies.

For debugging, use the following additional command line options
`--no-tune --num-env-runners=0`
which should allow you to set breakpoints anywhere in the RLlib code and
have the execution stop there for inspection and debugging.

For logging to your WandB account, use:
`--wandb-key=[your WandB API key] --wandb-project=[some project name]
--wandb-run-name=[optional: WandB run name (within the defined project)]`
"""

from RemusRayEnv import CustomMultiAgentEnv
from ray.rllib.utils.test_utils import (
    add_rllib_example_script_args,
    run_rllib_example_script_experiment,
)
from ray.tune.registry import get_trainable_cls, register_env

parser = add_rllib_example_script_args(
    default_iters=200,
    default_timesteps=100000,
    default_reward=600.0,
)
# TODO (sven): This arg is currently ignored (hard-set to 2).
parser.add_argument("--num-policies", type=int, default=2)


if __name__ == "__main__":
    args = parser.parse_args()
    args.num_agents = 2
    # Register our environment with tune.
    if args.num_agents > 0:
        register_env(
            "env",
            lambda _: CustomMultiAgentEnv(),
        )

    base_config = (
        get_trainable_cls(args.algo)
        .get_default_config()
        .environment("env" if args.num_agents > 0 else "CartPole-v1")
        .rollouts(
            # TODO (sven): MAEnvRunner does not support vectorized envs yet
            #  due to gym's env checkers and non-compatibility with RLlib's
            #  MultiAgentEnv API.
            num_envs_per_worker=1
            if args.num_agents > 0
            else 20,
        )
    )

    # Add a simple multi-agent setup.
    if args.num_agents > 0:
        base_config.multi_agent(
            policies={f"p{i}" for i in range(args.num_agents)},
            policy_mapping_fn=lambda aid, *a, **kw: f"p{aid}",
        )

    run_rllib_example_script_experiment(base_config, args)

I get the following warnings…

(RolloutWorker pid=2176) 2024-04-08 21:43:56,976        WARNING multi_agent_env.py:180 -- observation_space_contains() of <CustomMultiAgentEnv instance> has not been implemented. You can either implement it yourself or bring the observation space into the preferred format of a mapping from agent ids to their individual observation spaces.
(RolloutWorker pid=2176) 2024-04-08 21:43:56,977        WARNING multi_agent_env.py:180 -- observation_space_contains() of <CustomMultiAgentEnv instance> has not been implemented. You can either implement it yourself or bring the observation space into the preferred format of a mapping from agent ids to their individual observation spaces.
(RolloutWorker pid=2176) 2024-04-08 21:43:56,977        WARNING multi_agent_env.py:180 -- observation_space_contains() of <CustomMultiAgentEnv instance> has not been implemented. You can either implement it yourself or bring the observation space into the preferred format of a mapping from agent ids to their individual observation spaces.
(RolloutWorker pid=2176) 2024-04-08 21:43:56,977        WARNING multi_agent_env.py:246 -- action_space_sample() of <CustomMultiAgentEnv instance> has not been implemented. You can either implement it yourself or bring the observation space into the preferred format of a mapping from agent ids to their individual observation spaces.
(RolloutWorker pid=2176) 2024-04-08 21:43:56,977        WARNING multi_agent_env.py:180 -- observation_space_contains() of <CustomMultiAgentEnv instance> has not been implemented. You can either implement it yourself or bring the observation space into the preferred format of a mapping from agent ids to their individual observation spaces.
(RolloutWorker pid=10176) 2024-04-08 21:43:57,358       WARNING multi_agent_env.py:282 -- observation_space_sample() of <CustomMultiAgentEnv instance> has not been implemented. You can either implement it yourself or bring the observation space into the preferred format of a mapping from agent ids to their individual observation spaces.
(RolloutWorker pid=10176) 2024-04-08 21:43:57,359       WARNING multi_agent_env.py:209 -- action_space_contains() of <CustomMultiAgentEnv instance> has not been implemented. You can either implement it yourself or bring the observation space into the preferred format of a mapping from agent ids to their individual observation spaces.
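Those warnings seem to be asking for the spaces to be declared as a per-agent mapping on the env. A minimal sketch of what I think that means (the shapes and bounds are guesses based on the 11 observation values above and the 2-dim continuous action in the error further down, not my real values):

import numpy as np
import gymnasium as gym
from ray.rllib.env.multi_agent_env import MultiAgentEnv

class CustomMultiAgentEnv(MultiAgentEnv):
    def __init__(self, config=None):
        super().__init__()
        self._agent_ids = {"agent_0", "agent_1"}
        # Placeholder per-agent spaces; bounds/shapes are assumptions.
        single_obs = gym.spaces.Box(-np.inf, np.inf, shape=(11,), dtype=np.float64)
        single_act = gym.spaces.Box(-1.0, 1.0, shape=(2,), dtype=np.float32)
        # The "preferred format" from the warning: a Dict mapping agent id -> space.
        self.observation_space = gym.spaces.Dict({aid: single_obs for aid in self._agent_ids})
        self.action_space = gym.spaces.Dict({aid: single_act for aid in self._agent_ids})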

Alongside the warnings above, I also get the following errors…

  File "python\ray\_raylet.pyx", line 1889, in ray._raylet.execute_task
  File "python\ray\_raylet.pyx", line 1830, in ray._raylet.execute_task.function_executor
  File "C:\Users\ljc5616\AppData\Local\Programs\Python\Python310\lib\site-packages\ray\_private\function_manager.py", line 724, in actor_method_executor
    return method(__ray_actor, *args, **kwargs)
  File "C:\Users\ljc5616\AppData\Local\Programs\Python\Python310\lib\site-packages\ray\util\tracing\tracing_helper.py", line 467, in _resume_span
    return method(self, *_args, **_kwargs)
  File "C:\Users\ljc5616\AppData\Local\Programs\Python\Python310\lib\site-packages\ray\rllib\utils\actor_manager.py", line 189, in apply
    raise e
  File "C:\Users\ljc5616\AppData\Local\Programs\Python\Python310\lib\site-packages\ray\rllib\utils\actor_manager.py", line 178, in apply
    return func(self, *args, **kwargs)
  File "C:\Users\ljc5616\AppData\Local\Programs\Python\Python310\lib\site-packages\ray\rllib\execution\rollout_ops.py", line 89, in <lambda>
    lambda w: w.sample(), local_worker=False, healthy_only=True
  File "C:\Users\ljc5616\AppData\Local\Programs\Python\Python310\lib\site-packages\ray\util\tracing\tracing_helper.py", line 467, in _resume_span
    return method(self, *_args, **_kwargs)
  File "C:\Users\ljc5616\AppData\Local\Programs\Python\Python310\lib\site-packages\ray\rllib\evaluation\rollout_worker.py", line 694, in sample
    batches = [self.input_reader.next()]
  File "C:\Users\ljc5616\AppData\Local\Programs\Python\Python310\lib\site-packages\ray\rllib\evaluation\sampler.py", line 91, in next
    batches = [self.get_data()]
  File "C:\Users\ljc5616\AppData\Local\Programs\Python\Python310\lib\site-packages\ray\rllib\evaluation\sampler.py", line 273, in get_data
    item = next(self._env_runner)
  File "C:\Users\ljc5616\AppData\Local\Programs\Python\Python310\lib\site-packages\ray\rllib\evaluation\env_runner_v2.py", line 348, in run
    outputs = self.step()
  File "C:\Users\ljc5616\AppData\Local\Programs\Python\Python310\lib\site-packages\ray\rllib\evaluation\env_runner_v2.py", line 374, in step
    active_envs, to_eval, outputs = self._process_observations(
  File "C:\Users\ljc5616\AppData\Local\Programs\Python\Python310\lib\site-packages\ray\rllib\evaluation\env_runner_v2.py", line 540, in _process_observations
    policy_id: PolicyID = episode.policy_for(agent_id)
  File "C:\Users\ljc5616\AppData\Local\Programs\Python\Python310\lib\site-packages\ray\rllib\evaluation\episode_v2.py", line 130, in policy_for
    raise KeyError(
KeyError: "policy_mapping_fn returned invalid policy id 'pagent_0'!"

This is just a snippet as the command log is quite lengthy…
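Looking at the KeyError more closely, it seems to come from the mapping function itself: the policies are registered as p0 and p1, but my agent ids are strings like agent_0, so f"p{aid}" produces "pagent_0". A mapping that at least returns a registered policy id (a sketch, assuming agent ids of the form agent_<i>) would be:

# Sketch: map "agent_<i>" -> "p<i>" so the result matches the policies
# {"p0", "p1"} defined above. Assumes agent ids like "agent_0"/"agent_1".
def policy_mapping_fn(agent_id, *args, **kwargs):
    return f"p{agent_id.split('_')[-1]}"

which would be passed as policy_mapping_fn=policy_mapping_fn in the .multi_agent(...) call. I haven't confirmed this is the only issue, though.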

Any help is appreciated!

Update

Well, actually, the script I thought was training ran into an error (the one I mentioned under "Now, the env trains using this example I found in the docs…").

Trial status: 1 RUNNING
Current time: 2024-04-08 22:32:02. Total running time: 28min 3s
Logical resource usage: 5.0/12 CPUs, 0/1 GPUs (0.0/1.0 accelerator_type:RTX)
╭───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮  
│ Trial name                     status       iter     total time (s)       ts     reward     episode_reward_max     episode_reward_min     episode_len_mean     episodes_this_iter │  
├───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤  
│ PPO_CustomRayEnv_6daa8_00000   RUNNING       472            1630.45   241664   -3058.19               -2278.63               -4961.49                  500                      0 │  
╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯  

Trial PPO_CustomRayEnv_6daa8_00000 finished iteration 473 at 2024-04-08 22:32:04. Total running time: 28min 4s
╭────────────────────────────────────────────────────────╮
│ Trial PPO_CustomRayEnv_6daa8_00000 result              │
├────────────────────────────────────────────────────────┤
│ episodes_total                                     487 │
│ num_env_steps_sampled                           242176 │
│ num_env_steps_trained                           242176 │
│ sampler_results/episode_len_mean                   500 │
│ sampler_results/episode_reward_mean           -3038.48 │
╰────────────────────────────────────────────────────────╯
2024-04-08 22:32:06,548 ERROR tune_controller.py:1332 -- Trial task failed for trial PPO_CustomRayEnv_6daa8_00000
ValueError: Expected parameter loc (Tensor of shape (64, 2)) of distribution Normal(loc: torch.Size([64, 2]), scale: torch.Size([64, 2])) to satisfy the constraint Real(), but found invalid values:
tensor([[nan, nan],
        [nan, nan],
                 .
                 .
                 .
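My guess is that the NaNs in the Normal distribution's loc mean the policy network's outputs diverged at some point during training. If it turns out to be just hyperparameters, the knobs I would try first in the config above are grad_clip (currently None) and entropy_coeff (currently 0.1), e.g. something like this (untested guesses):

# Sketch: untested hyperparameter changes to try against the NaN error,
# applied to the same PPOConfig as above.
config = config.training(
    grad_clip=0.5,       # was None
    entropy_coeff=0.01,  # was 0.1
)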