Playing the QMIX Two-step game on Ray

We are trying to extend the code of the two-step game (an example from the QMIX paper) using the Ray framework. The changes we want to make should extract the best checkpoint from one of the trials of a tune.run(), restore it into a new QMixTrainer, and then use that trainer on a new environment to compute subsequent actions.

The code we tried to use is the following:

"""The two-step game from QMIX: https://arxiv.org/pdf/1803.11485.pdf

Configurations you can try:
    - normal policy gradients (PG)
    - contrib/MADDPG
    - QMIX

See also: centralized_critic.py for centralized critic PPO on this game.
"""

import argparse
from gym.spaces import Tuple, MultiDiscrete, Dict, Discrete
import os

import ray
from ray import tune
from ray.rllib.agents.qmix import QMixTrainer
from ray.tune import register_env, grid_search
from ray.rllib.env.multi_agent_env import ENV_STATE
from ray.rllib.examples.env.two_step_game import TwoStepGame
from ray.rllib.utils.test_utils import check_learning_achieved

import numpy as np

parser = argparse.ArgumentParser()
parser.add_argument("--run", type=str, default="QMIX")
parser.add_argument("--num-cpus", type=int, default=0)
parser.add_argument("--as-test", action="store_true")
parser.add_argument("--torch", action="store_true")
parser.add_argument("--stop-reward", type=float, default=7.0)
parser.add_argument("--stop-timesteps", type=int, default=50000)

if __name__ == "__main__":
    args = parser.parse_args()

    grouping = {
        "group_1": [0, 1],
    }
    obs_space = Tuple([
        Dict({
            "obs": MultiDiscrete([2, 2, 2, 3]),
            ENV_STATE: MultiDiscrete([2, 2, 2])
        }),
        Dict({
            "obs": MultiDiscrete([2, 2, 2, 3]),
            ENV_STATE: MultiDiscrete([2, 2, 2])
        }),
    ])
    act_space = Tuple([
        TwoStepGame.action_space,
        TwoStepGame.action_space,
    ])
    register_env(
        "grouped_twostep",
        lambda config: TwoStepGame(config).with_agent_groups(
            grouping, obs_space=obs_space, act_space=act_space))

    if args.run == "contrib/MADDPG":
        obs_space_dict = {
            "agent_1": Discrete(6),
            "agent_2": Discrete(6),
        }
        act_space_dict = {
            "agent_1": TwoStepGame.action_space,
            "agent_2": TwoStepGame.action_space,
        }
        config = {
            "learning_starts": 100,
            "env_config": {
                "actions_are_logits": True,
            },
            "multiagent": {
                "policies": {
                    "pol1": (None, Discrete(6), TwoStepGame.action_space, {
                        "agent_id": 0,
                    }),
                    "pol2": (None, Discrete(6), TwoStepGame.action_space, {
                        "agent_id": 1,
                    }),
                },
                "policy_mapping_fn": lambda x: "pol1" if x == 0 else "pol2",
            },
            "framework": "torch" if args.torch else "tf",
            # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
            "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
        }
        group = False
    elif args.run == "QMIX":
        config = {
            "rollout_fragment_length": 4,
            "train_batch_size": 32,
            "exploration_config": {
                "epsilon_timesteps": 5000,
                "final_epsilon": 0.05,
            },
            "num_workers": 0,
            "mixer": grid_search([None, "qmix", "vdn"]),
            "env_config": {
                "separate_state_space": True,
                "one_hot_state_encoding": True
            },
            # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
            "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
            "framework": "torch" if args.torch else "tf",
        }
        group = True
    else:
        config = {
            # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
            "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
            "framework": "torch" if args.torch else "tf",
        }
        group = False

    ray.init(num_cpus=args.num_cpus or None)

    stop = {
        "episode_reward_mean": args.stop_reward,
        "timesteps_total": args.stop_timesteps,
    }

    config = dict(config, **{
        "env": "grouped_twostep" if group else TwoStepGame,
    })

    results = tune.run(args.run, stop=stop, config=config, verbose=1, checkpoint_freq=1, checkpoint_at_end=True)

    if args.as_test:
        check_learning_achieved(results, args.stop_reward)

    best_checkpoint = results.get_best_checkpoint(results.trials[0], mode="max")
    print(f".. best checkpoint was: {best_checkpoint}")

    env = TwoStepGame(config).with_agent_groups(grouping, obs_space=obs_space, act_space=act_space)
    obs = env.reset()

    rllib_config = config.copy()
    rllib_config["mixer"] = "qmix"
    new_trainer = QMixTrainer(config=rllib_config)
    new_trainer.restore(best_checkpoint)

    a1 = new_trainer.compute_action(observation=obs['group_1'])
    a2 = new_trainer.compute_action(observation=np.concatenate([obs['group_1'], [1]]))

    ray.shutdown()

To make the changes from the original easier to see, here is the patch:

Index: main.py

===================================================================
diff --git a/main.py b/main.py
--- a/main.py	(revision 80b3473ef3eede5f94e4805797556940bee91bc8)
+++ b/main.py	(date 1637485442837)
@@ -14,13 +14,16 @@
 
 import ray
 from ray import tune
+from ray.rllib.agents.qmix import QMixTrainer
 from ray.tune import register_env, grid_search
 from ray.rllib.env.multi_agent_env import ENV_STATE
 from ray.rllib.examples.env.two_step_game import TwoStepGame
 from ray.rllib.utils.test_utils import check_learning_achieved
 
+import numpy as np
+
 parser = argparse.ArgumentParser()
-parser.add_argument("--run", type=str, default="PG")
+parser.add_argument("--run", type=str, default="QMIX")
 parser.add_argument("--num-cpus", type=int, default=0)
 parser.add_argument("--as-test", action="store_true")
 parser.add_argument("--torch", action="store_true")
@@ -120,9 +123,23 @@
         "env": "grouped_twostep" if group else TwoStepGame,
     })
 
-    results = tune.run(args.run, stop=stop, config=config, verbose=1)
+    results = tune.run(args.run, stop=stop, config=config, verbose=1, checkpoint_freq=1, checkpoint_at_end=True)
 
     if args.as_test:
         check_learning_achieved(results, args.stop_reward)
 
+    best_checkpoint = results.get_best_checkpoint(results.trials[0], mode="max")
+    print(f".. best checkpoint was: {best_checkpoint}")
+
+    env = TwoStepGame(config).with_agent_groups(grouping, obs_space=obs_space, act_space=act_space)
+    obs = env.reset()
+
+    rllib_config = config.copy()
+    rllib_config["mixer"] = "qmix"
+    new_trainer = QMixTrainer(config=rllib_config)
+    new_trainer.restore(best_checkpoint)
+
+    a1 = new_trainer.compute_action(observation=obs['group_1'])
+    a2 = new_trainer.compute_action(observation=np.concatenate([obs['group_1'], [1]]))
+
     ray.shutdown()

When we execute, we get the following errors:

a1 = new_trainer.compute_action(observation=obs['group_1'])

Produces:

ValueError: ('Observation ({}) outside given space ({})!', [0, 3], Tuple(Dict(obs:MultiDiscrete([2 2 2 3]), state:MultiDiscrete([2 2 2])), Dict(obs:MultiDiscrete([2 2 2 3]), state:MultiDiscrete([2 2 2]))))

a2 = new_trainer.compute_action(observation=np.concatenate([obs['group_1'], [1]]))

Produces:

ValueError: ('Observation ({}) outside given space ({})!', array([0, 3, 1]), Tuple(Dict(obs:MultiDiscrete([2 2 2 3]), state:MultiDiscrete([2 2 2])), Dict(obs:MultiDiscrete([2 2 2 3]), state:MultiDiscrete([2 2 2]))))

We are currently trying to figure out how we should change the observation so that it is accepted by the preprocessor's check_shape() function:

def check_shape(self, observation: Any) -> None:
    """Checks the shape of the given observation."""
    if self._i % VALIDATION_INTERVAL == 0:
        if type(observation) is list and isinstance(
                self._obs_space, gym.spaces.Box):
            observation = np.array(observation)
        try:
            if not self._obs_space.contains(observation):
                raise ValueError(
                    "Observation ({}) outside given space ({})!",
                    observation, self._obs_space)
        except AttributeError:
            raise ValueError(
                "Observation for a Box/MultiBinary/MultiDiscrete space "
                "should be an np.array, not a Python list.", observation)
    self._i += 1

When check_shape() is called, these are the values being processed:

observation:
value = [0, 3]
type = <class 'list'>

self._obs_space:
value = Tuple(Dict(obs:MultiDiscrete([2 2 2 3]), state:MultiDiscrete([2 2 2])), Dict(obs:MultiDiscrete([2 2 2 3]), state:MultiDiscrete([2 2 2])))
type = <class 'gym.spaces.tuple.Tuple'>

and this line fails:

if not self._obs_space.contains(observation)
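
For reference, a structure that this Tuple space would accept looks roughly like the following (a sketch with made-up values, just to illustrate the expected shape: one Dict per grouped agent, each with an "obs" and a "state" entry):

import numpy as np

# Hypothetical values; only the structure matters here.
# The grouped space is a 2-tuple of Dicts, one per agent in "group_1".
sample_obs = (
    {"obs": np.array([0, 0, 0, 1]), "state": np.array([1, 0, 0])},
    {"obs": np.array([0, 0, 0, 2]), "state": np.array([0, 1, 0])},
)
assert obs_space.contains(sample_obs)  # accepted, unlike the flat [0, 3]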

Any feedback is welcome!

Hi @xeirwn,

The first thing I would try is to run the snippet below and then ensure that the second environment produces observations with the exact same structure.

env = TwoStepGame(config).with_agent_groups(grouping, obs_space=obs_space, act_space=act_space)
obs = env.reset()
print(obs)

Thank you for the feedback.

Printing the obs produces this result:

{'group_1': [0, 3]}

Since we initialize both tune and TwoStepGame with the same settings (regarding the observation space), shouldn’t it produce the same results?


QMIX is (unfortunately) a little tricky with respect to the observation and action spaces it requires: both must be n-tuples (n = number of agents) of the respective single-agent spaces.

E.g., a single-agent space Dict(a=…, b=…) with 2 agents becomes obs_space=Tuple([Dict(a=…, b=…), Dict(a=…, b=…)]).
Same for the action space.

Manny is right: you need to make sure the env is created with the [your env].with_agent_groups(grouping, tuple_obs_space, tuple_action) tool. From the error message you provided, it seems like your env is not grouped yet: both agents are moving separately in the env, requiring separate actions, instead of the env treating both agents as a single one, as QMIX requires.
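
As a quick check, you could recreate the evaluation env exactly like the registered "grouped_twostep" env, including the QMIX-related env_config flags from your config above (a rough, untested sketch, not necessarily the full fix):

# Rebuild the eval env with the same env_config flags used for training,
# so that reset() returns grouped Tuple(Dict, Dict)-style observations
# instead of the flat, ungrouped ones.
eval_env = TwoStepGame({
    "separate_state_space": True,
    "one_hot_state_encoding": True,
}).with_agent_groups(grouping, obs_space=obs_space, act_space=act_space)

obs = eval_env.reset()
print(obs)  # should contain one {"obs": ..., "state": ...} dict per agent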


I’m trying to do the same thing. The difference is that my environment is a custom one, but I run into the following error. Can someone help me?

Traceback (most recent call last):
  File "rllib_test2_danger_v1.py", line 189, in <module>
    action=QMixagent.compute_single_action(obs['group_1'])
  File "/usr/local/lib/python3.7/dist-packages/ray/rllib/algorithms/algorithm.py", line 1148, in compute_single_action
    episode=episode,
  File "/usr/local/lib/python3.7/dist-packages/ray/rllib/policy/policy.py", line 331, in compute_single_action
    timestep=timestep,
  File "/usr/local/lib/python3.7/dist-packages/ray/rllib/algorithms/qmix/qmix_policy.py", line 305, in compute_actions_from_input_dict
    for s in state_batches
  File "/usr/local/lib/python3.7/dist-packages/ray/rllib/algorithms/qmix/qmix_policy.py", line 620, in _mac
    q_flat, h_flat = model(obs_agents_as_batches, h_flat, None)
  File "/usr/local/lib/python3.7/dist-packages/ray/rllib/models/modelv2.py", line 259, in __call__
    res = self.forward(restored, state or [], seq_lens)
  File "/usr/local/lib/python3.7/dist-packages/ray/rllib/algorithms/qmix/model.py", line 35, in forward
    h_in = hidden_state[0].reshape(-1, self.rnn_hidden_dim)
IndexError: list index out of range

Hi @3160105430,

I think you need to pass in a state to compute_single_action.
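
Something along these lines might work (a minimal sketch; QMixagent and obs are the names from your script, and get_initial_state() gives the zeroed recurrent state the QMIX policy expects):

# QMIX's agent network is recurrent (a GRU), so compute_single_action
# needs an RNN state in addition to the observation.
state = QMixagent.get_policy().get_initial_state()

action, state, _ = QMixagent.compute_single_action(obs["group_1"], state)

The returned state should then be fed back in on the next call.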


This is my obs

{'group_1': [{'obs': array([0.15008491, 0.07347695, 0.3602731 , 0.20984058, 0.71743214,
       0.28644857, 0.7862787 , 0.18181819, 0.        ], dtype=float32), 'state': array([[[0.3, 0.3, 0.3, ..., 0.3, 0.3, 0.3],
        [0.3, 0.3, 0.3, ..., 0.3, 0.3, 0.3],
        [0.3, 0.3, 0.3, ..., 0.3, 0.3, 0.3],
        ...,
        [0.3, 0.3, 0.3, ..., 0.3, 0.3, 0.3],
        [0.3, 0.3, 0.3, ..., 0.3, 0.3, 0.3],
        [0.3, 0.3, 0.3, ..., 0.3, 0.3, 0.3]]], dtype=float32)}, {'obs': array([0.57718784, 0.6537958 , 0.3602731 , 0.7901594 , 0.22652306,
       0.71355146, 0.35917583, 0.4674175 , 0.        ], dtype=float32), 'state': array([[[0.3, 0.3, 0.3, ..., 0.3, 0.3, 0.3],
        [0.3, 0.3, 0.3, ..., 0.3, 0.3, 0.3],
        [0.3, 0.3, 0.3, ..., 0.3, 0.3, 0.3],
        ...,
        [0.3, 0.3, 0.3, ..., 0.3, 0.3, 0.3],
        [0.3, 0.3, 0.3, ..., 0.3, 0.3, 0.3],
        [0.3, 0.3, 0.3, ..., 0.3, 0.3, 0.3]]], dtype=float32)}]}

and these are my space definitions:

grouping = {
    "group_1": [0, 1],
}
obs_space = Tuple(
    [
        Dict(
            {
                "obs":env.obs_space,
                ENV_STATE: spaces.Box(low=0.0, high=1.0, shape=(1,42, 42), dtype=np.float32),
            }
        ),
        Dict(
            {
                "obs": env.obs_space,
                ENV_STATE: spaces.Box(low=0.0, high=1.0, shape=(1,42, 42), dtype=np.float32),
            }
        ),
    ]
)
act_space = Tuple(
    [
        Discrete(5),
        Discrete(5),
    ]
)

Any idea?

env.obs_space = spaces.Box(low=0.0, high=1.0, shape=(9,), dtype=np.float32)

Hi @3160105430,

QMIX is a bit confusing because there are two concepts of state. The state you shared is part of the observation and represents global state information about the environment.

RLlib also has a second state concept that represents the internal state of a network, used for models with memory. QMIX uses a GRU cell in its agent policy, so it needs this state information.

Here is an example you could adapt for your purposes.

Thank you very much indeed!

Hello, I have encountered a new problem. What should I do?

ray_results/DangerUAV2_v1/QMIX/QMIX_DangerUAV2_v1_83686_00000_0_2022-10-05_13-49-20/checkpoint_001500/checkpoint-1500
2022-10-13 13:00:31,396	INFO trainable.py:669 -- Restored on 172.28.0.2 from checkpoint: ray_results/DangerUAV2_v1/QMIX/QMIX_DangerUAV2_v1_83686_00000_0_2022-10-05_13-49-20/checkpoint_001500
2022-10-13 13:00:31,397	INFO trainable.py:677 -- Current state after restoring: {'_iteration': 1500, '_timesteps_total': None, '_time_total': 18878.440465450287, '_episodes_total': 73197}
Traceback (most recent call last):
  File "rllib_test2_danger_v1.py", line 193, in <module>
    action,state_out,_=QMixagent.compute_single_action(obs['group_1'], state)
  File "/usr/local/lib/python3.7/dist-packages/ray/rllib/algorithms/algorithm.py", line 1148, in compute_single_action
    episode=episode,
  File "/usr/local/lib/python3.7/dist-packages/ray/rllib/policy/policy.py", line 331, in compute_single_action
    timestep=timestep,
  File "/usr/local/lib/python3.7/dist-packages/ray/rllib/algorithms/qmix/qmix_policy.py", line 317, in compute_actions_from_input_dict
    explore=explore,
  File "/usr/local/lib/python3.7/dist-packages/ray/rllib/utils/exploration/epsilon_greedy.py", line 101, in get_exploration_action
    action_distribution, explore, timestep
  File "/usr/local/lib/python3.7/dist-packages/ray/rllib/utils/exploration/epsilon_greedy.py", line 188, in _get_torch_exploration_action
    epsilon = self.epsilon_schedule(self.last_timestep)
  File "/usr/local/lib/python3.7/dist-packages/ray/rllib/utils/schedules/schedule.py", line 46, in __call__
    return self.value(t)
  File "/usr/local/lib/python3.7/dist-packages/ray/rllib/utils/schedules/schedule.py", line 42, in value
    return self._value(t)
  File "/usr/local/lib/python3.7/dist-packages/ray/rllib/utils/schedules/piecewise_schedule.py", line 62, in _value
    if l_t <= t < r_t:
TypeError: '<=' not supported between instances of 'int' and 'NoneType'

Hello, do you have any idea about this problem?