Which parameters are required for minimal multi-agent training?

How severely does this issue affect your experience of using Ray?

  • High: It blocks me from completing my task.

I have written a custom MultiAgentEnv that I want to train with RLlib. The environment is called “Hexapawn”.

import logging
from copy import copy
from typing import Optional, Any, Callable

from gymnasium.spaces import MultiDiscrete, Discrete
from numpy import ndarray, zeros, ones, int8
from pettingzoo.utils.env import AgentID
from ray.rllib.env.multi_agent_env import MultiAgentEnv

from .game import Player, Board, Direction, Move, Square, FILES

logger = logging.getLogger(__name__)


class Hexapawn(MultiAgentEnv):
    """
    A Hexapawn game environment for multi-agent reinforcement learning.

    This class represents a Hexapawn game environment, inheriting from MultiAgentEnv. It supports rendering modes and
    provides methods for managing game state, actions, and observations for agents.
    """

    metadata = {"render_modes": ["ansi"], "frames_per_second": 1}

    def __init__(self, configuration: Optional[dict[str, Any]] = None):
        super().__init__()
        self.board = Board()
        configuration = dict(configuration or {})
        self.render_mode = configuration.get("render_mode", "human")
        self.frames_per_second = configuration.get(
            "frames_per_second", self.metadata["frames_per_second"]
        )
        self.possible_agents = [Player.WHITE, Player.BLACK]
        # 3x3 grid with 3 values (empty, self, and opponent).
        self.observation_spaces = self.agent_dictionary(
            MultiDiscrete((len(Player) + 1) * ones((self.rows, self.cols), dtype=int))
        )
        # 3x3 grid with three moves at each square (capture left, capture right, and forward).
        self.action_spaces = self.agent_dictionary(Discrete(self.actions))
        self.gui = None

    @property
    def rows(self) -> int:
        return self.board.ranks

    @property
    def cols(self) -> int:
        return self.board.files

    @property
    def moves(self) -> int:
        return len(Direction)

    @property
    def actions(self) -> int:
        return self.rows * self.cols * self.moves

    def agent_dictionary(self, value) -> dict[AgentID, Any]:
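        # Build a per-agent dict: each agent maps to value(agent) if value is callable, otherwise to the constant value.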
        def constant_value(_):
            return value

        if isinstance(value, Callable):
            f = value
        else:
            f = constant_value
        return {agent: f(agent) for agent in self.possible_agents}

    def reset(
        self, *, seed: Optional[int] = None, options: Optional[dict] = None
    ) -> tuple[dict[Player, ndarray], dict[Player, dict]]:  # Observation, Info
        self.board = Board()
        self.agents = self.possible_agents[:]
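        # White moves first, so only White receives an initial observation.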
        observations = {Player.WHITE: self.observation(Player.WHITE)}
        info = self.agent_dictionary(self.info)
        return observations, info

    def step(self, actions: dict[Player, int]) -> tuple[
        dict[Player, ndarray],  # Observations
        dict[Player, float],  # Rewards
        dict[Player, bool],  # Terminated
        dict[Player, bool],  # Truncated
        dict[Player, dict],  # Info
    ]:
        observations, rewards, terminated = {}, {}, {}
        for player, action in actions.items():
            self.board += self.move_from_action(action)
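            # Applying the move advances the turn, so board.player is now the mover's opponent.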
            opponent = self.board.player
            observations |= {opponent: self.observation(opponent)}
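            # has_lost() is falsy while the game continues; otherwise the loser's opponent gets 1 and the loser gets 0.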
            if losing_player := self.board.has_lost():
                observations = {}
                rewards[-losing_player] = 1
                rewards[losing_player] = 0
                terminated = self.agent_dictionary(True)
        info = self.agent_dictionary(self.info)
        return observations, rewards, terminated, {}, info

    def observation(self, player: Player) -> ndarray:
        return self.players_board(player).board

    def info(self, player: Player) -> dict[str, Any]:
        return {"action_mask": self.mask(player)}

    def mask(self, player: Player) -> ndarray:
        mask = zeros(self.actions, dtype=int8)
        board = self.players_board(player)
        for move in board.legal_moves():
            action = self.action_from_move(move)
            mask[action] = 1
        return mask

    def action_from_move(self, move: Move) -> int:
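        # Mixed-radix (base-3) encoding: action = rank + 3 * file + 9 * direction.
        # This relies on rows == cols == len(Direction) == 3.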
        rank = move.origin.rank
        file = move.origin.file
        direction = int(move.direction)
        return sum(self.rows**i * x for i, x in enumerate([rank, file, direction]))

    def move_from_action(self, action: int) -> Move:
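        # Inverse of action_from_move: peel off the base-3 digits (rank, file, direction) in the same order.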
        rank, file, direction = [(action // self.rows**i) % self.rows for i in range(3)]
        return Move(origin=Square(rank=rank, file=file), direction=Direction(direction))

    def render(self) -> Optional[str]:
        return str(self)

    def players_board(self, player: Player) -> Board:
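        # Present the board from the given player's perspective; negating the board swaps sides.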
        board = copy(self.board)
        if board.player is not player:
            board = -board
        return board

    @staticmethod
    def players_move(player: Player, move: Move) -> Move:
        if player is Player.BLACK:
            move = copy(move)
            move.direction = -move.direction
            move.origin = Square(
                rank=move.origin.rank, file=FILES - move.origin.file - 1
            )
        return move

    def __str__(self):
        board = self.players_board(Player.WHITE)
        if player := self.board.has_lost():
            s = f"{repr(-player)} wins."
        else:
            s = f"{repr(self.board.player)}'s move"
        return f"{board}\n{s}"

Here is my training code so far.

import ray
from ray.rllib.algorithms import DQNConfig
from ray.tune.registry import register_env

from hexapawn.environment import Hexapawn


def train_hexapawn():
    def hexapawn_factory(configuration: dict) -> Hexapawn:
        return Hexapawn(configuration)

    ray.init()
    register_env("hexapawn", hexapawn_factory)
    config = DQNConfig().environment("hexapawn").multi_agent().rl_module()
    algo = config.build_algo()
    algo.train()


if __name__ == "__main__":
    train_hexapawn()

It is very minimal. I am trying to get training to complete without errors before adding more configuration parameters. You’ll need a few other files if you want to run this yourself; the easiest way is to check the project out from GitLab.

Whenever I try to run it, I get cryptic error messages and end up spending a lot of time debugging the RLlib code. I’ve tried various configurations and nothing works.

Can someone provide me with a minimal training configuration so that I have something to start from?

When I run the above, I get the following, but I don’t think this particular error message tells us much. I’m pretty sure I’m not providing the correct configuration parameters, but I don’t know how to figure out what they are.

/Users/mcneill/miniforge3/envs/Hexapawn/bin/python /Users/mcneill/src/Hexapawn/src/hexapawn/train.py 
2025-02-20 09:59:57,166	INFO worker.py:1832 -- Started a local Ray instance. View the dashboard at http://127.0.0.1:8265 
2025-02-20 09:59:58,371	WARNING algorithm_config.py:4726 -- You are running DQN on the new API stack! This is the new default behavior for this algorithm. If you don't want to use the new API stack, set `config.api_stack(enable_rl_module_and_learner=False,enable_env_runner_and_connector_v2=False)`. For a detailed migration guide, see here: https://docs.ray.io/en/master/rllib/new-api-stack-migration-guide.html
/Users/mcneill/miniforge3/envs/Hexapawn/lib/python3.12/site-packages/ray/rllib/algorithms/algorithm.py:574: RayDeprecationWarning: This API is deprecated and may be removed in future Ray releases. You could suppress this warning by setting env variable PYTHONWARNINGS="ignore::DeprecationWarning"
`UnifiedLogger` will be removed in Ray 2.7.
  return UnifiedLogger(config, logdir, loggers=None)
/Users/mcneill/miniforge3/envs/Hexapawn/lib/python3.12/site-packages/ray/tune/logger/unified.py:53: RayDeprecationWarning: This API is deprecated and may be removed in future Ray releases. You could suppress this warning by setting env variable PYTHONWARNINGS="ignore::DeprecationWarning"
The `JsonLogger interface is deprecated in favor of the `ray.tune.json.JsonLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
/Users/mcneill/miniforge3/envs/Hexapawn/lib/python3.12/site-packages/ray/tune/logger/unified.py:53: RayDeprecationWarning: This API is deprecated and may be removed in future Ray releases. You could suppress this warning by setting env variable PYTHONWARNINGS="ignore::DeprecationWarning"
The `CSVLogger interface is deprecated in favor of the `ray.tune.csv.CSVLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
/Users/mcneill/miniforge3/envs/Hexapawn/lib/python3.12/site-packages/ray/tune/logger/unified.py:53: RayDeprecationWarning: This API is deprecated and may be removed in future Ray releases. You could suppress this warning by setting env variable PYTHONWARNINGS="ignore::DeprecationWarning"
The `TBXLogger interface is deprecated in favor of the `ray.tune.tensorboardx.TBXLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
Traceback (most recent call last):
  File "/Users/mcneill/src/Hexapawn/src/hexapawn/train.py", line 20, in <module>
    train_hexapawn()
  File "/Users/mcneill/src/Hexapawn/src/hexapawn/train.py", line 15, in train_hexapawn
    algo = config.build_algo()
           ^^^^^^^^^^^^^^^^^^^
  File "/Users/mcneill/miniforge3/envs/Hexapawn/lib/python3.12/site-packages/ray/rllib/algorithms/algorithm_config.py", line 957, in build_algo
    return algo_class(
           ^^^^^^^^^^^
  File "/Users/mcneill/miniforge3/envs/Hexapawn/lib/python3.12/site-packages/ray/rllib/algorithms/algorithm.py", line 590, in __init__
    super().__init__(
  File "/Users/mcneill/miniforge3/envs/Hexapawn/lib/python3.12/site-packages/ray/tune/trainable/trainable.py", line 158, in __init__
    self.setup(copy.deepcopy(self.config))
  File "/Users/mcneill/miniforge3/envs/Hexapawn/lib/python3.12/site-packages/ray/rllib/algorithms/algorithm.py", line 693, in setup
    self.env_runner_group = EnvRunnerGroup(
                            ^^^^^^^^^^^^^^^
  File "/Users/mcneill/miniforge3/envs/Hexapawn/lib/python3.12/site-packages/ray/rllib/env/env_runner_group.py", line 196, in __init__
    self._setup(
  File "/Users/mcneill/miniforge3/envs/Hexapawn/lib/python3.12/site-packages/ray/rllib/env/env_runner_group.py", line 291, in _setup
    self._local_env_runner = self._make_worker(
                             ^^^^^^^^^^^^^^^^^^
  File "/Users/mcneill/miniforge3/envs/Hexapawn/lib/python3.12/site-packages/ray/rllib/env/env_runner_group.py", line 1187, in _make_worker
    worker = cls(
             ^^^^
  File "/Users/mcneill/miniforge3/envs/Hexapawn/lib/python3.12/site-packages/ray/rllib/env/single_agent_env_runner.py", line 98, in __init__
    self.make_env()
  File "/Users/mcneill/miniforge3/envs/Hexapawn/lib/python3.12/site-packages/ray/rllib/env/single_agent_env_runner.py", line 658, in make_env
    gym.make_vec(
  File "/Users/mcneill/miniforge3/envs/Hexapawn/lib/python3.12/site-packages/gymnasium/envs/registration.py", line 918, in make_vec
    env = gym.vector.SyncVectorEnv(
          ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/mcneill/miniforge3/envs/Hexapawn/lib/python3.12/site-packages/gymnasium/vector/sync_vector_env.py", line 86, in __init__
    self.envs = [env_fn() for env_fn in env_fns]
                 ^^^^^^^^
  File "/Users/mcneill/miniforge3/envs/Hexapawn/lib/python3.12/site-packages/gymnasium/envs/registration.py", line 903, in create_single_env
    single_env = make(env_spec, **env_spec_kwargs.copy())
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/mcneill/miniforge3/envs/Hexapawn/lib/python3.12/site-packages/gymnasium/envs/registration.py", line 803, in make
    env = gym.wrappers.PassiveEnvChecker(env)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/mcneill/miniforge3/envs/Hexapawn/lib/python3.12/site-packages/gymnasium/wrappers/common.py", line 264, in __init__
    check_action_space(env.action_space)
  File "/Users/mcneill/miniforge3/envs/Hexapawn/lib/python3.12/site-packages/gymnasium/utils/passive_env_checker.py", line 67, in check_space
    raise TypeError(
TypeError: action space does not inherit from `gymnasium.spaces.Space`, actual type: <class 'NoneType'>

Process finished with exit code 1
The following adds an explicit policy and an RL module spec (imports and POLICY_ID shown for completeness; the policy id itself can be any string).

from pathlib import Path

import ray
from ray.rllib.algorithms import DQNConfig
from ray.rllib.core.rl_module.multi_rl_module import MultiRLModuleSpec
from ray.rllib.core.rl_module.rl_module import RLModuleSpec
from ray.tune.registry import register_env

from hexapawn.environment import Hexapawn

POLICY_ID = "p0"  # Shared policy id; the actual name is arbitrary.


def train_hexapawn(checkpoint_path: Path):
    def hexapawn_factory(configuration: dict) -> Hexapawn:
        return Hexapawn(configuration)

    ray.init()
    register_env("hexapawn", hexapawn_factory)
    config = (
        DQNConfig()
        .environment("hexapawn")
        .multi_agent(policies={POLICY_ID}, policy_mapping_fn=lambda _, __: POLICY_ID)
        .rl_module(
            rl_module_spec=MultiRLModuleSpec(
                rl_module_specs={POLICY_ID: RLModuleSpec()}
            )
        )
    )
    algo = config.build_algo()
    algo.train()

This gets me a little farther, but it still fails in check_multiagent_environments, because that pre-check samples actions randomly from the action space and ignores the “action_mask” I provide in the info dictionary, which selects only the legal moves.
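One workaround I still want to try (I haven’t verified that the new env runners honor this flag): AlgorithmConfig.environment() accepts a disable_env_checking argument that is documented to skip the environment pre-checks, so the same configuration could be built as

config = (
    DQNConfig()
    # Assumption: disable_env_checking skips check_multiagent_environments on the env runners.
    .environment("hexapawn", disable_env_checking=True)
    .multi_agent(policies={POLICY_ID}, policy_mapping_fn=lambda _, __: POLICY_ID)
    .rl_module(
        rl_module_spec=MultiRLModuleSpec(rl_module_specs={POLICY_ID: RLModuleSpec()})
    )
)

Another option might be to override action_space_sample() on the environment so that random sampling only draws legal moves, but I haven’t tried that either.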

It took a lot of work but I think I finally understand what is going on here.