PPO centralized critic example with more than two agents

Hi!

I am trying to solve the following environment with the MAPPO (PPO with a centralized critic)
image

Reward

  • For each time step a agents is not in its final position, it receives a reward of -1
  • For each time step a agents is in its final position, it receives a reward of 0

Actions

  1. “DOWN”
  2. “LEFT”
  3. “UP”
  4. “RIGHT”
  5. “NOOP”

Observation
For each agent an obs consists of:

  1. ID of the agent
  2. Y coordinate of the agent
  3. X coordinate of the agent
  4. a current step count (for time reference)

Resulting in the following observation and action spaces of the environment:

  • action_space = spaces.Discrete(5)
  • observation_space = spaces.Box(np.array([0., 0., 0., 0.]), np.array([1., 1., 1., 1.]))

One episode lasts for 50 time steps. The goal for all agents is to get into their final position (cell wich has the same colour as the corresponding agent) as fast as possibe and stay in there until the episode ends.

I was able to solve this environment with 2 agents, following rllibs’s centralized critic example.

In order to handle a increased number of agents, I made following changes to the example code (I am only using the TF versions):

from Multi_Agent.RLlib.MAPPO.switch_v4.centralized_critic_model_tf_switch_v4 import 
CentralizedCriticModel
from Multi_Agent.RLlib.MAPPO.envs.switch_v4 import Switch

tf1, tf, tfv = try_import_tf()

class CentralizedCriticModel(TFModelV2):
"""Multi-agent model that implements a centralized value function."""

def __init__(self, obs_space, action_space, num_outputs, model_config,name):
    super(CentralizedCriticModel, self).__init__(obs_space, action_space, num_outputs, model_config, name)

    # Base of the model
    self.model = FullyConnectedNetwork(obs_space, action_space, num_outputs, model_config, name)
    self.register_variables(self.model.variables())

    n_agents = 4  # ---> opp_obs and opp_acts now consist of 3 (4 - 1) different agent information
    obs = 4
    act = 5
    opp_obs_accum = obs * (n_agents - 1)
    opp_acts_accum = act * (n_agents - 1)

    # Central VF maps (obs, opp_obs, opp_act) -> vf_pred
    obs = tf.keras.layers.Input(shape=(obs, ), name="obs")
    opp_obs = tf.keras.layers.Input(shape=(opp_obs_accum, ), name="opp_obs")
    opp_act = tf.keras.layers.Input(shape=(opp_acts_accum, ), name="opp_act")
    concat_obs = tf.keras.layers.Concatenate(axis=1)([obs, opp_obs, opp_act])
    central_vf_dense = tf.keras.layers.Dense(16, activation=tf.nn.tanh, name="c_vf_dense")(concat_obs)
    central_vf_out = tf.keras.layers.Dense(1, activation=None, name="c_vf_out")(central_vf_dense)

    self.central_vf = tf.keras.Model(inputs=[obs, opp_obs, opp_act], outputs=central_vf_out)
    print(f"Centralized Critic Model: \n{self.central_vf.summary()}")
    self.register_variables(self.central_vf.variables)

@override(ModelV2)
def forward(self, input_dict, state, seq_lens):
    return self.model.forward(input_dict, state, seq_lens)

def central_value_function(self, obs, opponent_obs, opponent_actions):
    return tf.reshape(self.central_vf([obs, opponent_obs, tf.one_hot(opponent_actions, 5)]), [-1])  # ---> changed the depth of one_hot encoding to 5 (5 actions)

@override(ModelV2)
def value_function(self):
    return self.model.value_function()  # not used 


register_env("my_switch", lambda _: Switch({}))
ModelCatalog.register_custom_model("cc_model", CentralizedCriticModel)

OPPONENT_OBS = "opponent_obs"
OPPONENT_ACTION = "opponent_action"

parser = argparse.ArgumentParser()
parser.add_argument("--stop-iters", type=int, default=1000)
parser.add_argument("--stop-timesteps", type=int, default=1_000_000)


class CentralizedValueMixin:
    """Add method to evaluate the central value function from the model."""

def __init__(self):
    if self.config["framework"] != "torch":
        self.compute_central_vf = make_tf_callable(self.get_session())(
            self.model.central_value_function)


# Grabs the opponent obs/act and includes it in the experience train_batch,
# and computes GAE using the central vf predictions.
def centralized_critic_postprocessing(policy,
                                      sample_batch,
                                      other_agent_batches=None,
                                      episode=None):
    if policy.loss_initialized():
        assert other_agent_batches is not None
        #[(_, opponent_batch)] = list(other_agent_batches.values())

        # ---> opponent batch now consists of 3 SampleBatches, so I concatenate them
        concat_opponent_batch = SampleBatch.concat_samples([opponent_n_batch
                                                            for _, opponent_n_batch in other_agent_batches.values()])

    opponent_batch = concat_opponent_batch

    # also record the opponent obs and actions in the trajectory
    sample_batch[OPPONENT_OBS] = opponent_batch[SampleBatch.CUR_OBS]
    sample_batch[OPPONENT_ACTION] = opponent_batch[SampleBatch.ACTIONS]

    sample_batch[SampleBatch.VF_PREDS] = policy.compute_central_vf(
        sample_batch[SampleBatch.CUR_OBS], sample_batch[OPPONENT_OBS],
        sample_batch[OPPONENT_ACTION])
else:
    # Policy hasn't been initialized yet, use zeros.
    sample_batch[OPPONENT_OBS] = np.zeros_like(
        sample_batch[SampleBatch.CUR_OBS])
    sample_batch[OPPONENT_ACTION] = np.zeros_like(
        sample_batch[SampleBatch.ACTIONS])
    sample_batch[SampleBatch.VF_PREDS] = np.zeros_like(
        sample_batch[SampleBatch.REWARDS], dtype=np.float32)

completed = sample_batch["dones"][-1]
if completed:
    last_r = 0.0
else:
    last_r = sample_batch[SampleBatch.VF_PREDS][-1]

train_batch = compute_advantages(
    sample_batch,
    last_r,
    policy.config["gamma"],
    policy.config["lambda"],
    use_gae=policy.config["use_gae"])
return train_batch


# Copied from PPO but optimizing the central value function.
def loss_with_central_critic(policy, model, dist_class, train_batch):
    CentralizedValueMixin.__init__(policy)
    func = tf_loss

    vf_saved = model.value_function

    model.value_function = lambda: policy.model.central_value_function(
        train_batch[SampleBatch.CUR_OBS], train_batch[OPPONENT_OBS],
        train_batch[OPPONENT_ACTION])

    policy._central_value_out = model.value_function()
    loss = func(policy, model, dist_class, train_batch)

    model.value_function = vf_saved

    return loss


def setup_tf_mixins(policy, obs_space, action_space, config):
    # Copied from PPOTFPolicy (w/o ValueNetworkMixin).
    KLCoeffMixin.__init__(policy, config)
    EntropyCoeffSchedule.__init__(policy, config["entropy_coeff"],
                                  config["entropy_coeff_schedule"])
    LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"])

def central_vf_stats(policy, train_batch, grads):
    # Report the explained variance of the central value function.
    return {
        "vf_explained_var": explained_variance(
            train_batch[Postprocessing.VALUE_TARGETS],
            policy._central_value_out),
    }


CCPPOTFPolicy = PPOTFPolicy.with_updates(
    name="CCPPOTFPolicy",
    postprocess_fn=centralized_critic_postprocessing,
    loss_fn=loss_with_central_critic,
    before_loss_init=setup_tf_mixins,
    grad_stats_fn=central_vf_stats,
    mixins=[
        LearningRateSchedule, EntropyCoeffSchedule, KLCoeffMixin,
        CentralizedValueMixin
    ])

""" ---> Original, didn't return CCPPOTFPolicy
def get_policy_class(config):
    if config["framework"] == "torch":
        return CCPPOTorchPolicy"""

def get_policy_class(config):
    if config["framework"] == "tf":
        return CCPPOTFPolicy

CCTrainer = PPOTrainer.with_updates(
    name="CCPPOTrainer",
    default_policy=CCPPOTFPolicy,
    get_policy_class=get_policy_class,
)


if __name__ == "__main__":
    ray.init(local_mode=True, include_dashboard=False)
    args = parser.parse_args()

    config = {
        "env": "my_switch",
        "batch_mode": "complete_episodes",
        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
        "num_workers": 0,
        "multiagent": {
            "policies": {
                "pol0": (None, Switch.observation_space, Switch.action_space, {
                    "framework": "tf",
                }),
                "pol1": (None, Switch.observation_space, Switch.action_space, {
                    "framework": "tf",
                }),
                "pol2": (None, Switch.observation_space, Switch.action_space, {
                    "framework": "tf",
                }),
                "pol3": (None, Switch.observation_space, Switch.action_space, {
                    "framework": "tf",
                }),
            },
            "policy_mapping_fn": lambda x: "pol0" if x == 0 else ("pol1" if x == 1 else ("pol2" if x == 2 else "pol3")),
        },
        "model": {
            "custom_model": "cc_model",
        },
        "framework": "tf",
    }

    stop = {
        "training_iteration": args.stop_iters,
        "timesteps_total": args.stop_timesteps,
    }

    results = tune.run(CCTrainer,
                       name="switch_v4",
                       config=config,
                       stop=stop,
                       verbose=1,
                       checkpoint_freq=10,
                       checkpoint_at_end=True,
                       mode="max",
                       metric="episode_reward_mean"
    )

    print(f"Best checkpoint at: {results.best_checkpoint}")

This results in the following error message (partially shortened to meet the max character limit):

C:\Users\z004757h\Anaconda3\envs\marl-env-v3\python.exe D:/Git/example-codes/Multi_Agent/RLlib/MAPPO/switch_v4/mappo_train_switch_v4.py

CREATED ENVIRONMENT
OBSERVATION SPACE:	 Box(0.0, 1.0, (4,), float32)
ACTION SPACE:		 Discrete(5)

WARNING:tensorflow:Model was constructed with shape (?, 12) for input Tensor("pol0/opp_obs:0", shape=(?, 12), dtype=float32), but it was called on an input with incompatible shape (?, 4).
WARNING:tensorflow:Model was constructed with shape (?, 15) for input Tensor("pol0/opp_act:0", shape=(?, 15), dtype=float32), but it was called on an input with incompatible shape (?, 5).
E1214 16:34:36.863097  2936 17592 core_worker.cc:1128] Pushed Error with JobID: 01000000 of type: task with message: ray::CCPPOTrainer.__init__() (pid=2936, ip=192.168.2.119)
  File "python\ray\_raylet.pyx", line 484, in ray._raylet.execute_task
  File "python\ray\_raylet.pyx", line 438, in ray._raylet.execute_task.function_executor
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\function_manager.py", line 553, in actor_method_executor
    return method(actor, *args, **kwargs)
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\agents\trainer_template.py", line 101, in __init__
    Trainer.__init__(self, config, env, logger_creator)
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\agents\trainer.py", line 476, in __init__
    super().__init__(config, logger_creator)
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\tune\trainable.py", line 249, in __init__
    self.setup(copy.deepcopy(self.config))
  [...]
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\evaluation\rollout_worker.py", line 1008, in _build_policy_map
    policy_map[name] = cls(obs_space, act_space, merged_conf)
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\policy\tf_policy_template.py", line 221, in __init__
    obs_include_prev_action_reward=obs_include_prev_action_reward)
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\policy\dynamic_tf_policy.py", line 299, in __init__
    self._initialize_loss_dynamically()
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\policy\dynamic_tf_policy.py", line 437, in _initialize_loss_dynamically
    loss = self._do_loss_init(train_batch)
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\policy\dynamic_tf_policy.py", line 449, in _do_loss_init
    loss = self._loss_fn(self, self.model, self.dist_class, train_batch)
  File "D:/Git/example-codes/Multi_Agent/RLlib/MAPPO/switch_v4/mappo_train_switch_v4.py", line 113, in loss_with_central_critic
    policy._central_value_out = model.value_function()
  File "D:/Git/example-codes/Multi_Agent/RLlib/MAPPO/switch_v4/mappo_train_switch_v4.py", line 111, in <lambda>
    train_batch[OPPONENT_ACTION])
  File "D:\Git\example-codes\Multi_Agent\RLlib\MAPPO\switch_v4\centralized_critic_model_tf_switch_v4.py", line 46, in central_value_function
    return tf.reshape(self.central_vf([obs, opponent_obs, tf.one_hot(opponent_actions, 5)]), [-1])  # ---> changed the depth of one_hot encoding to 5 (5 actions)
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\tensorflow\python\keras\engine\base_layer_v1.py", line 776, in __call__
    outputs = call_fn(cast_inputs, *args, **kwargs)
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\tensorflow\python\keras\engine\functional.py", line 386, in call
    inputs, training=training, mask=mask)
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\tensorflow\python\keras\engine\functional.py", line 508, in _run_internal_graph
    outputs = node.layer(*args, **kwargs)
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\tensorflow\python\keras\engine\base_layer_v1.py", line 752, in __call__
    self.name)
[...]
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\policy\dynamic_tf_policy.py", line 299, in __init__
    self._initialize_loss_dynamically()
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\policy\dynamic_tf_policy.py", line 437, in _initialize_loss_dynamically
    loss = self._do_loss_init(train_batch)
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\policy\dynamic_tf_policy.py", line 449, in _do_loss_init
    loss = self._loss_fn(self, self.model, self.dist_class, train_batch)
  File "D:/Git/example-codes/Multi_Agent/RLlib/MAPPO/switch_v4/mappo_train_switch_v4.py", line 113, in loss_with_central_critic
    policy._central_value_out = model.value_function()
  File "D:/Git/example-codes/Multi_Agent/RLlib/MAPPO/switch_v4/mappo_train_switch_v4.py", line 111, in <lambda>
    train_batch[OPPONENT_ACTION])
  File "D:\Git\example-codes\Multi_Agent\RLlib\MAPPO\switch_v4\centralized_critic_model_tf_switch_v4.py", line 46, in central_value_function
    return tf.reshape(self.central_vf([obs, opponent_obs, tf.one_hot(opponent_actions, 5)]), [-1])
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\tensorflow\python\keras\engine\base_layer_v1.py", line 776, in __call__
    outputs = call_fn(cast_inputs, *args, **kwargs)
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\tensorflow\python\keras\engine\functional.py", line 386, in call
    inputs, training=training, mask=mask)
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\tensorflow\python\keras\engine\functional.py", line 508, in _run_internal_graph
    outputs = node.layer(*args, **kwargs)
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\tensorflow\python\keras\engine\base_layer_v1.py", line 752, in __call__
    self.name)
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\tensorflow\python\keras\engine\input_spec.py", line 216, in assert_input_compatibility
    ' but received input with shape ' + str(shape))
ValueError: Input 0 of layer c_vf_dense is incompatible with the layer: expected axis -1 of input shape to have value 31 but received input with shape [None, 13] at time: 1.60796e+09
== Status ==
Memory usage on this node: 11.3/15.7 GiB
Using FIFO scheduling algorithm.
Resources requested: 1/12 CPUs, 0/1 GPUs, 0.0/2.83 GiB heap, 0.0/0.98 GiB objects
Result logdir: C:\Users\z004757h\ray_results\switch_v4
Number of trials: 1 (1 RUNNING)
+------------------------------------+----------+-------+
| Trial name                         | status   | loc   |
|------------------------------------+----------+-------|
| CCPPOTrainer_my_switch_de4af_00000 | RUNNING  |       |
+------------------------------------+----------+-------+


2020-12-14 16:34:36,974	ERROR trial_runner.py:567 -- Trial CCPPOTrainer_my_switch_de4af_00000: Error processing event.
Traceback (most recent call last):
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\tune\trial_runner.py", line 515, in _process_trial
    result = self.trial_executor.fetch_result(trial)
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\tune\ray_trial_executor.py", line 488, in fetch_result
    result = ray.get(trial_future[0], timeout=DEFAULT_GET_TIMEOUT)
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\worker.py", line 1428, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(ValueError): ray::CCPPOTrainer.train() (pid=2936, ip=192.168.2.119)
  File "python\ray\_raylet.pyx", line 445, in ray._raylet.execute_task
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\worker.py", line 174, in reraise_actor_init_error
    raise self.actor_init_error
  File "python\ray\_raylet.pyx", line 479, in ray._raylet.execute_task
  File "python\ray\_raylet.pyx", line 483, in ray._raylet.execute_task
  File "python\ray\_raylet.pyx", line 484, in ray._raylet.execute_task
  File "python\ray\_raylet.pyx", line 438, in ray._raylet.execute_task.function_executor
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\function_manager.py", line 553, in actor_method_executor
    return method(actor, *args, **kwargs)
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\agents\trainer_template.py", line 101, in __init__
    Trainer.__init__(self, config, env, logger_creator)
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\agents\trainer.py", line 476, in __init__
    super().__init__(config, logger_creator)
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\tune\trainable.py", line 249, in __init__
    self.setup(copy.deepcopy(self.config))
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\agents\trainer.py", line 629, in setup
    self._init(self.config, self.env_creator)
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\agents\trainer_template.py", line 125, in _init
    self.config["num_workers"])
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\agents\trainer.py", line 699, in _make_workers
    logdir=self.logdir)
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\evaluation\worker_set.py", line 74, in __init__
    self._local_config)
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\evaluation\worker_set.py", line 305, in _make_worker
    extra_python_environs=extra_python_environs)
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\evaluation\rollout_worker.py", line 416, in __init__
    self._build_policy_map(policy_dict, policy_config)
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\evaluation\rollout_worker.py", line 1008, in _build_policy_map
    policy_map[name] = cls(obs_space, act_space, merged_conf)
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\policy\tf_policy_template.py", line 221, in __init__
    obs_include_prev_action_reward=obs_include_prev_action_reward)
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\policy\dynamic_tf_policy.py", line 299, in __init__
    self._initialize_loss_dynamically()
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\policy\dynamic_tf_policy.py", line 437, in _initialize_loss_dynamically
    loss = self._do_loss_init(train_batch)
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\policy\dynamic_tf_policy.py", line 449, in _do_loss_init
    loss = self._loss_fn(self, self.model, self.dist_class, train_batch)
  File "D:/Git/example-codes/Multi_Agent/RLlib/MAPPO/switch_v4/mappo_train_switch_v4.py", line 113, in loss_with_central_critic
    policy._central_value_out = model.value_function()
  File "D:/Git/example-codes/Multi_Agent/RLlib/MAPPO/switch_v4/mappo_train_switch_v4.py", line 111, in <lambda>
    train_batch[OPPONENT_ACTION])
  File "D:\Git\example-codes\Multi_Agent\RLlib\MAPPO\switch_v4\centralized_critic_model_tf_switch_v4.py", line 46, in central_value_function
    return tf.reshape(self.central_vf([obs, opponent_obs, tf.one_hot(opponent_actions, 5)]), [-1])  # ---> changed the depth of one_hot encoding to 5 (5 actions)
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\tensorflow\python\keras\engine\base_layer_v1.py", line 776, in __call__
    outputs = call_fn(cast_inputs, *args, **kwargs)
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\tensorflow\python\keras\engine\functional.py", line 386, in call
    inputs, training=training, mask=mask)
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\tensorflow\python\keras\engine\functional.py", line 508, in _run_internal_graph
    outputs = node.layer(*args, **kwargs)
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\tensorflow\python\keras\engine\base_layer_v1.py", line 752, in __call__
    self.name)
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\tensorflow\python\keras\engine\input_spec.py", line 216, in assert_input_compatibility
    ' but received input with shape ' + str(shape))
ValueError: Input 0 of layer c_vf_dense is incompatible with the layer: expected axis -1 of input shape to have value 31 but received input with shape [None, 13]
E1214 16:34:37.070096  2936 17592 core_worker.cc:1128] Pushed Error with JobID: 01000000 of type: task with message: ray::CCPPOTrainer.stop() (pid=2936, ip=192.168.2.119)
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\tune\trial_runner.py", line 515, in _process_trial
    result = self.trial_executor.fetch_result(trial)
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\tune\ray_trial_executor.py", line 488, in fetch_result
    result = ray.get(trial_future[0], timeout=DEFAULT_GET_TIMEOUT)
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\worker.py", line 1428, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(ValueError): ray::CCPPOTrainer.train() (pid=2936, ip=192.168.2.119)
  File "python\ray\_raylet.pyx", line 445, in ray._raylet.execute_task
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\worker.py", line 174, in reraise_actor_init_error
    raise self.actor_init_error
  File "python\ray\_raylet.pyx", line 479, in ray._raylet.execute_task
  File "python\ray\_raylet.pyx", line 483, in ray._raylet.execute_task
  File "python\ray\_raylet.pyx", line 484, in ray._raylet.execute_task
  File "python\ray\_raylet.pyx", line 438, in ray._raylet.execute_task.function_executor
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\function_manager.py", line 553, in actor_method_executor
    return method(actor, *args, **kwargs)
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\agents\trainer_template.py", line 101, in __init__
    Trainer.__init__(self, config, env, logger_creator)
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\agents\trainer.py", line 476, in __init__
    super().__init__(config, logger_creator)
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\tune\trainable.py", line 249, in __init__
    self.setup(copy.deepcopy(self.config))
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\agents\trainer.py", line 629, in setup
    self._init(self.config, self.env_creator)
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\agents\trainer_template.py", line 125, in _init
    self.config["num_workers"])
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\agents\trainer.py", line 699, in _make_workers
    logdir=self.logdir)
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\evaluation\worker_set.py", line 74, in __init__
    self._local_config)
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\evaluation\worker_set.py", line 305, in _make_worker
    extra_python_environs=extra_python_environs)
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\evaluation\rollout_worker.py", line 416, in __init__
    self._build_policy_map(policy_dict, policy_config)
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\evaluation\rollout_worker.py", line 1008, in _build_policy_map
    policy_map[name] = cls(obs_space, act_space, merged_conf)
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\policy\tf_policy_template.py", line 221, in __init__
    obs_include_prev_action_reward=obs_include_prev_action_reward)
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\policy\dynamic_tf_policy.py", line 299, in __init__
    self._initialize_loss_dynamically()
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\policy\dynamic_tf_policy.py", line 437, in _initialize_loss_dynamically
    loss = self._do_loss_init(train_batch)
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\policy\dynamic_tf_policy.py", line 449, in _do_loss_init
    loss = self._loss_fn(self, self.model, self.dist_class, train_batch)
  File "D:/Git/example-codes/Multi_Agent/RLlib/MAPPO/switch_v4/mappo_train_switch_v4.py", line 113, in loss_with_central_critic
    policy._central_value_out = model.value_function()
  File "D:/Git/example-codes/Multi_Agent/RLlib/MAPPO/switch_v4/mappo_train_switch_v4.py", line 111, in <lambda>
    train_batch[OPPONENT_ACTION])
  File "D:\Git\example-codes\Multi_Agent\RLlib\MAPPO\switch_v4\centralized_critic_model_tf_switch_v4.py", line 46, in central_value_function
    return tf.reshape(self.central_vf([obs, opponent_obs, tf.one_hot(opponent_actions, 5)]), [-1])  # ---> changed the depth of one_hot encoding to 5 (5 actions)
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\tensorflow\python\keras\engine\base_layer_v1.py", line 776, in __call__
    outputs = call_fn(cast_inputs, *args, **kwargs)
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\tensorflow\python\keras\engine\functional.py", line 386, in call
    inputs, training=training, mask=mask)
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\tensorflow\python\keras\engine\functional.py", line 508, in _run_internal_graph
    outputs = node.layer(*args, **kwargs)
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\tensorflow\python\keras\engine\base_layer_v1.py", line 752, in __call__
    self.name)
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\tensorflow\python\keras\engine\input_spec.py", line 216, in assert_input_compatibility
    ' but received input with shape ' + str(shape))
ValueError: Input 0 of layer c_vf_dense is incompatible with the layer: expected axis -1 of input shape to have value 31 but received input with shape [None, 13]
== Status ==
Memory usage on this node: 11.3/15.7 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/12 CPUs, 0/1 GPUs, 0.0/2.83 GiB heap, 0.0/0.98 GiB objects
Result logdir: C:\Users\z004757h\ray_results\switch_v4
Number of trials: 1 (1 ERROR)
+------------------------------------+----------+-------+
| Trial name                         | status   | loc   |
|------------------------------------+----------+-------|
| CCPPOTrainer_my_switch_de4af_00000 | ERROR    |       |
+------------------------------------+----------+-------+
Number of errored trials: 1
+------------------------------------+--------------+------------------------------------------------------------------------------------------------------------+
| Trial name                         |   # failures | error file                                                                                                 |
|------------------------------------+--------------+------------------------------------------------------------------------------------------------------------|
| CCPPOTrainer_my_switch_de4af_00000 |            1 | C:\Users\z004757h\ray_results\switch_v4\CCPPOTrainer_my_switch_de4af_00000_0_2020-12-14_16-34-34\error.txt |
+------------------------------------+--------------+------------------------------------------------------------------------------------------------------------+

Traceback (most recent call last):
  File "D:/Git/example-codes/Multi_Agent/RLlib/MAPPO/switch_v4/mappo_train_switch_v4.py", line 211, in <module>
    metric="episode_reward_mean"
  File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\tune\tune.py", line 427, in run
    raise TuneError("Trials did not complete", incomplete_trials)
ray.tune.error.TuneError: ('Trials did not complete', [CCPPOTrainer_my_switch_de4af_00000])


Process finished with exit code 1

Did anyone manage to adjust the number of agents in the centralized_critic.py example or has an idea what else I have to change?

Thank you in advance!

Cheers, Korbi

1 Like

Hey @korbinian-hoermann, thanks for asking this. Could you file a github issue, but with a script that runs out-of-the-box? I cannot get yours to run due to the imports at the beginning.
Thanks!

1 Like

Hey @sven1977, thanks for the response! :slight_smile:
I filed an github issue including the script.
Filed it in as a bug report though (not sure if thats the way to do it, as feature request/questions should be asked here, right ?)

Thanks for filing this! I answered here: https://github.com/ray-project/ray/issues/12851
I don’t think it’s an RLlib bug, though, more an error in the postprocessing fn, but see for yourself and let me know.
Thx! :slight_smile:

2 Likes

@sven1977 @korbinian-hoermann Hi thanks a lot for this discussion on multi-agent central critic approach example. But could you guys give a suggestion on how was the issue of
sample_batch[OPPONENT_OBS] = np.zeros_like(

        sample_batch[SampleBatch.CUR_OBS])

was resolved, how were the current observation sizes picked up from sample batch for rest of the three agents.