How to define a randomly action taking multi-agent team

How severe does this issue affect your experience of using Ray?

  • Medium: It contributes to significant difficulty to complete my task, but I can work around it.

Assume that each agent in the team observes at random steps (as a part of their behavior). This is somewhat similar to the example code in “SometimesZeroAgentsMultiAgent”, but differs from the fact that all the agents must step to reach a non-action taking (or an unavailable) mode. So I changed the example code as follows. But it throws ValueError: Data from episode 688144930894468330 does not show any agent interactions. Hint: Make sure for at least one timestep in the episode, env.step() returns non-empty values.. Not sure what I am doing wrong.

Any help on this would be really appreciated!

Reproduction code:

import argparse
import gymnasium as gym
import numpy as np
import random

import os
from ray.rllib.env.multi_agent_env import MultiAgentEnv

class MockEnv(gym.Env):
    """Mock environment for testing purposes.
    Observation=0, reward=1.0, episode-len is configurable.
    Actions are ignored.

    def __init__(self, episode_length, config=None):
        self.episode_length = episode_length
        self.config = config
        self.i = 0
        self.observation_space = gym.spaces.Discrete(1)
        self.action_space = gym.spaces.Discrete(2)
        self.available = True
        self.unavailable_count = 0

    def reset(self, *, seed=None, options=None):
        self.i = 0
        return 0, {}
        self.available = True
    def step(self, action):
        self.i += 1
        """agent defined random-availability"""
        self.available = np.random.rand(1) > 0.5
        terminated = truncated = self.i >= self.episode_length
        return 0, 1.0, terminated, truncated, {}

class SometimesZeroAgentsMultiAgent(MultiAgentEnv):
    """Multi-agent env in which sometimes, no agent acts."""

    def __init__(self, num=2):
        self.num_agents = num
        self._agent_ids = set(range(self.num_agents))
        # self.agents = [MockEnv(25) for _ in range(self.num_agents)]

        self.agents = [MockEnv(25) for i in range(self.num_agents)]
        self._observations = {}
        self._infos = {}
        self.terminateds = set()
        self.truncateds = set()
        self.observation_space = gym.spaces.Discrete(2)
        self.action_space = gym.spaces.Discrete(2)
        self._avail = set()
        self.action_requested = set()
    def reset(self, *, seed=None, options=None):
        self.terminateds = set()
        self.truncateds = set()
        self._observations = {}
        self._infos = {}
        for aid in self._get_random_agents():
            self._observations[aid], self._infos[aid] = self.agents[aid].reset()
        return self._observations, self._infos

    def step(self, action_dict):
        rew, terminated, truncated = {}, {}, {}

        """every agent must step"""
        for aid in self._agent_ids:
            if action_dict.get(aid) is not None:
                ) = self.agents[aid].step(action_dict[aid])
                (_,rew[aid],terminated[aid], truncated[aid], _ ) = self.agents[aid].step(None)

            if terminated[aid]:
            if truncated[aid]:
        terminated["__all__"] = len(self.terminateds) == self.num_agents
        truncated["__all__"] = len(self.truncateds) == self.num_agents

        """get the availability of the agents"""
        aid_avialabe = set()
        for aid in self._agent_ids:
            if self.agents[aid].available:
        obs = {}
        infos = {}

        """Here we only require actions for the available agents"""
        for aid in aid_avialabe:
            if aid not in self._observations:
                self._observations[aid] = self.observation_space.sample()
                self._infos[aid] = {"fourty-two": 42}
            obs[aid] = self._observations.pop(aid)
            infos[aid] = self._infos.pop(aid)
        """ Override some of the rewards."""
        for aid in self._get_random_agents():
            rew[aid] = np.random.rand()

        return obs, rew, terminated, truncated, infos

    def _get_random_agents(self):
        num_observing_agents = np.random.randint(self.num_agents)
        aids = np.random.permutation(self.num_agents)[:num_observing_agents]
        return {
            for aid in aids
            if aid not in self.terminateds and aid not in self.truncateds

"""Training script"""

import argparse
import gymnasium as gym
import os

import ray
from ray import air, tune
from multi_ag_env import SometimesZeroAgentsMultiAgent
from ray.tune.registry import get_trainable_cls
from ray.tune.registry import register_env

def get_cli_args():
    """Create CLI parser and return parsed arguments"""
    parser = argparse.ArgumentParser()

    # general args
        "--run", type=str, default="PPO", help="The RLlib-registered algorithm to use."
    parser.add_argument("--num-cpus", type=int, default=0)
        choices=["tf", "tf2", "torch"],
        help="The DL framework specifier.",
    parser.add_argument("--eager-tracing", action="store_true")
        "--stop-iters", type=int, default=100, help="Number of iterations to train."
        help="Number of timesteps to train.",
        help="Reward at which we stop training.",
        help="Init Ray in local mode for easier debugging.",

    args = parser.parse_args()
    print(f"Running with following CLI args: {args}")
    return args

if __name__ == "__main__":
    args = get_cli_args()

    ray.init(num_cpus=args.num_cpus or None, local_mode=args.local_mode, log_to_driver=args.local_mode)

    stop = {
        "training_iteration": args.stop_iters,
        "timesteps_total": args.stop_timesteps,
        "episode_reward_mean": args.stop_reward,
            "sometimes_zero_agents", lambda _: SometimesZeroAgentsMultiAgent(num=2)

    # TODO (Artur): in PPORLModule vf_share_layers = True is broken in tf2. fix it.
    vf_share_layers = not bool(os.environ.get("RLLIB_ENABLE_RL_MODULE", False))
    config = (
            # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
            num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0")),
        .training(train_batch_size=1024, model={"vf_share_layers": vf_share_layers})
        .rollouts(num_rollout_workers=5, rollout_fragment_length="auto")
        .framework(args.framework, eager_tracing=args.eager_tracing)
            # Use a simple set of policy IDs. Spaces for the individual policies
            # will be inferred automatically using reverse lookup via the
            # `policy_mapping_fn` and the env provided spaces for the different
            # agents. Alternatively, you could use:
            # policies: {main0: PolicySpec(...), main1: PolicySpec}
            policies={"main0", "main1", "main2", "main3", "main4"},
            # Simple mapping fn, mapping agent0 to main0 and agent1 to main1.
            policy_mapping_fn=(lambda aid, episode, worker, **kw: f"main{aid}"),
            # Only train main0.

    results = tune.Tuner(,

    if not results:
        raise ValueError(
            "No results returned from Something must have gone wrong."


ray.exceptions.RayTaskError(ValueError): ray::RolloutWorker.apply() (pid=278561, ip=, repr=<ray.rllib.evaluation.rollout_worker._modify_class.<locals>.Class object at 0x7f2af52e4340>)
  File "/home/malintha/anaconda3/envs/marl/lib/python3.9/site-packages/ray/rllib/utils/", line 185, in apply
    raise e
  File "/home/malintha/anaconda3/envs/marl/lib/python3.9/site-packages/ray/rllib/utils/", line 176, in apply
    return func(self, *args, **kwargs)
  File "/home/malintha/anaconda3/envs/marl/lib/python3.9/site-packages/ray/rllib/execution/", line 86, in <lambda>
    lambda w: w.sample(), local_worker=False, healthy_only=True
  File "/home/malintha/anaconda3/envs/marl/lib/python3.9/site-packages/ray/rllib/evaluation/", line 915, in sample
    batches = []
  File "/home/malintha/anaconda3/envs/marl/lib/python3.9/site-packages/ray/rllib/evaluation/", line 92, in next
    batches = [self.get_data()]
  File "/home/malintha/anaconda3/envs/marl/lib/python3.9/site-packages/ray/rllib/evaluation/", line 277, in get_data
    item = next(self._env_runner)
  File "/home/malintha/anaconda3/envs/marl/lib/python3.9/site-packages/ray/rllib/evaluation/", line 323, in run
    outputs = self.step()
  File "/home/malintha/anaconda3/envs/marl/lib/python3.9/site-packages/ray/rllib/evaluation/", line 349, in step
    active_envs, to_eval, outputs = self._process_observations(
  File "/home/malintha/anaconda3/envs/marl/lib/python3.9/site-packages/ray/rllib/evaluation/", line 667, in _process_observations
  File "/home/malintha/anaconda3/envs/marl/lib/python3.9/site-packages/ray/rllib/evaluation/", line 820, in _handle_done_episode
    self.end_episode(env_id, episode_or_exception)
  File "/home/malintha/anaconda3/envs/marl/lib/python3.9/site-packages/ray/rllib/evaluation/", line 945, in end_episode
    raise ValueError(msg)
ValueError: Data from episode 27926846350290456 does not show any agent interactions. Hint: Make sure for at least one timestep in the episode, env.step() returns non-empty values.

Thanks in advance!