Hi!
I am trying to solve the following environment with the MAPPO (PPO with a centralized critic)
Reward
- For each time step a agents is not in its final position, it receives a reward of -1
- For each time step a agents is in its final position, it receives a reward of 0
Actions
- “DOWN”
- “LEFT”
- “UP”
- “RIGHT”
- “NOOP”
Observation
For each agent an obs consists of:
- ID of the agent
- Y coordinate of the agent
- X coordinate of the agent
- a current step count (for time reference)
Resulting in the following observation and action spaces of the environment:
- action_space = spaces.Discrete(5)
- observation_space = spaces.Box(np.array([0., 0., 0., 0.]), np.array([1., 1., 1., 1.]))
One episode lasts for 50 time steps. The goal for all agents is to get into their final position (cell wich has the same colour as the corresponding agent) as fast as possibe and stay in there until the episode ends.
I was able to solve this environment with 2 agents, following rllibs’s centralized critic example.
In order to handle a increased number of agents, I made following changes to the example code (I am only using the TF versions):
from Multi_Agent.RLlib.MAPPO.switch_v4.centralized_critic_model_tf_switch_v4 import
CentralizedCriticModel
from Multi_Agent.RLlib.MAPPO.envs.switch_v4 import Switch
tf1, tf, tfv = try_import_tf()
class CentralizedCriticModel(TFModelV2):
"""Multi-agent model that implements a centralized value function."""
def __init__(self, obs_space, action_space, num_outputs, model_config,name):
super(CentralizedCriticModel, self).__init__(obs_space, action_space, num_outputs, model_config, name)
# Base of the model
self.model = FullyConnectedNetwork(obs_space, action_space, num_outputs, model_config, name)
self.register_variables(self.model.variables())
n_agents = 4 # ---> opp_obs and opp_acts now consist of 3 (4 - 1) different agent information
obs = 4
act = 5
opp_obs_accum = obs * (n_agents - 1)
opp_acts_accum = act * (n_agents - 1)
# Central VF maps (obs, opp_obs, opp_act) -> vf_pred
obs = tf.keras.layers.Input(shape=(obs, ), name="obs")
opp_obs = tf.keras.layers.Input(shape=(opp_obs_accum, ), name="opp_obs")
opp_act = tf.keras.layers.Input(shape=(opp_acts_accum, ), name="opp_act")
concat_obs = tf.keras.layers.Concatenate(axis=1)([obs, opp_obs, opp_act])
central_vf_dense = tf.keras.layers.Dense(16, activation=tf.nn.tanh, name="c_vf_dense")(concat_obs)
central_vf_out = tf.keras.layers.Dense(1, activation=None, name="c_vf_out")(central_vf_dense)
self.central_vf = tf.keras.Model(inputs=[obs, opp_obs, opp_act], outputs=central_vf_out)
print(f"Centralized Critic Model: \n{self.central_vf.summary()}")
self.register_variables(self.central_vf.variables)
@override(ModelV2)
def forward(self, input_dict, state, seq_lens):
return self.model.forward(input_dict, state, seq_lens)
def central_value_function(self, obs, opponent_obs, opponent_actions):
return tf.reshape(self.central_vf([obs, opponent_obs, tf.one_hot(opponent_actions, 5)]), [-1]) # ---> changed the depth of one_hot encoding to 5 (5 actions)
@override(ModelV2)
def value_function(self):
return self.model.value_function() # not used
register_env("my_switch", lambda _: Switch({}))
ModelCatalog.register_custom_model("cc_model", CentralizedCriticModel)
OPPONENT_OBS = "opponent_obs"
OPPONENT_ACTION = "opponent_action"
parser = argparse.ArgumentParser()
parser.add_argument("--stop-iters", type=int, default=1000)
parser.add_argument("--stop-timesteps", type=int, default=1_000_000)
class CentralizedValueMixin:
"""Add method to evaluate the central value function from the model."""
def __init__(self):
if self.config["framework"] != "torch":
self.compute_central_vf = make_tf_callable(self.get_session())(
self.model.central_value_function)
# Grabs the opponent obs/act and includes it in the experience train_batch,
# and computes GAE using the central vf predictions.
def centralized_critic_postprocessing(policy,
sample_batch,
other_agent_batches=None,
episode=None):
if policy.loss_initialized():
assert other_agent_batches is not None
#[(_, opponent_batch)] = list(other_agent_batches.values())
# ---> opponent batch now consists of 3 SampleBatches, so I concatenate them
concat_opponent_batch = SampleBatch.concat_samples([opponent_n_batch
for _, opponent_n_batch in other_agent_batches.values()])
opponent_batch = concat_opponent_batch
# also record the opponent obs and actions in the trajectory
sample_batch[OPPONENT_OBS] = opponent_batch[SampleBatch.CUR_OBS]
sample_batch[OPPONENT_ACTION] = opponent_batch[SampleBatch.ACTIONS]
sample_batch[SampleBatch.VF_PREDS] = policy.compute_central_vf(
sample_batch[SampleBatch.CUR_OBS], sample_batch[OPPONENT_OBS],
sample_batch[OPPONENT_ACTION])
else:
# Policy hasn't been initialized yet, use zeros.
sample_batch[OPPONENT_OBS] = np.zeros_like(
sample_batch[SampleBatch.CUR_OBS])
sample_batch[OPPONENT_ACTION] = np.zeros_like(
sample_batch[SampleBatch.ACTIONS])
sample_batch[SampleBatch.VF_PREDS] = np.zeros_like(
sample_batch[SampleBatch.REWARDS], dtype=np.float32)
completed = sample_batch["dones"][-1]
if completed:
last_r = 0.0
else:
last_r = sample_batch[SampleBatch.VF_PREDS][-1]
train_batch = compute_advantages(
sample_batch,
last_r,
policy.config["gamma"],
policy.config["lambda"],
use_gae=policy.config["use_gae"])
return train_batch
# Copied from PPO but optimizing the central value function.
def loss_with_central_critic(policy, model, dist_class, train_batch):
CentralizedValueMixin.__init__(policy)
func = tf_loss
vf_saved = model.value_function
model.value_function = lambda: policy.model.central_value_function(
train_batch[SampleBatch.CUR_OBS], train_batch[OPPONENT_OBS],
train_batch[OPPONENT_ACTION])
policy._central_value_out = model.value_function()
loss = func(policy, model, dist_class, train_batch)
model.value_function = vf_saved
return loss
def setup_tf_mixins(policy, obs_space, action_space, config):
# Copied from PPOTFPolicy (w/o ValueNetworkMixin).
KLCoeffMixin.__init__(policy, config)
EntropyCoeffSchedule.__init__(policy, config["entropy_coeff"],
config["entropy_coeff_schedule"])
LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"])
def central_vf_stats(policy, train_batch, grads):
# Report the explained variance of the central value function.
return {
"vf_explained_var": explained_variance(
train_batch[Postprocessing.VALUE_TARGETS],
policy._central_value_out),
}
CCPPOTFPolicy = PPOTFPolicy.with_updates(
name="CCPPOTFPolicy",
postprocess_fn=centralized_critic_postprocessing,
loss_fn=loss_with_central_critic,
before_loss_init=setup_tf_mixins,
grad_stats_fn=central_vf_stats,
mixins=[
LearningRateSchedule, EntropyCoeffSchedule, KLCoeffMixin,
CentralizedValueMixin
])
""" ---> Original, didn't return CCPPOTFPolicy
def get_policy_class(config):
if config["framework"] == "torch":
return CCPPOTorchPolicy"""
def get_policy_class(config):
if config["framework"] == "tf":
return CCPPOTFPolicy
CCTrainer = PPOTrainer.with_updates(
name="CCPPOTrainer",
default_policy=CCPPOTFPolicy,
get_policy_class=get_policy_class,
)
if __name__ == "__main__":
ray.init(local_mode=True, include_dashboard=False)
args = parser.parse_args()
config = {
"env": "my_switch",
"batch_mode": "complete_episodes",
# Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
"num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
"num_workers": 0,
"multiagent": {
"policies": {
"pol0": (None, Switch.observation_space, Switch.action_space, {
"framework": "tf",
}),
"pol1": (None, Switch.observation_space, Switch.action_space, {
"framework": "tf",
}),
"pol2": (None, Switch.observation_space, Switch.action_space, {
"framework": "tf",
}),
"pol3": (None, Switch.observation_space, Switch.action_space, {
"framework": "tf",
}),
},
"policy_mapping_fn": lambda x: "pol0" if x == 0 else ("pol1" if x == 1 else ("pol2" if x == 2 else "pol3")),
},
"model": {
"custom_model": "cc_model",
},
"framework": "tf",
}
stop = {
"training_iteration": args.stop_iters,
"timesteps_total": args.stop_timesteps,
}
results = tune.run(CCTrainer,
name="switch_v4",
config=config,
stop=stop,
verbose=1,
checkpoint_freq=10,
checkpoint_at_end=True,
mode="max",
metric="episode_reward_mean"
)
print(f"Best checkpoint at: {results.best_checkpoint}")
This results in the following error message (partially shortened to meet the max character limit):
C:\Users\z004757h\Anaconda3\envs\marl-env-v3\python.exe D:/Git/example-codes/Multi_Agent/RLlib/MAPPO/switch_v4/mappo_train_switch_v4.py
CREATED ENVIRONMENT
OBSERVATION SPACE: Box(0.0, 1.0, (4,), float32)
ACTION SPACE: Discrete(5)
WARNING:tensorflow:Model was constructed with shape (?, 12) for input Tensor("pol0/opp_obs:0", shape=(?, 12), dtype=float32), but it was called on an input with incompatible shape (?, 4).
WARNING:tensorflow:Model was constructed with shape (?, 15) for input Tensor("pol0/opp_act:0", shape=(?, 15), dtype=float32), but it was called on an input with incompatible shape (?, 5).
E1214 16:34:36.863097 2936 17592 core_worker.cc:1128] Pushed Error with JobID: 01000000 of type: task with message: ray::CCPPOTrainer.__init__() (pid=2936, ip=192.168.2.119)
File "python\ray\_raylet.pyx", line 484, in ray._raylet.execute_task
File "python\ray\_raylet.pyx", line 438, in ray._raylet.execute_task.function_executor
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\function_manager.py", line 553, in actor_method_executor
return method(actor, *args, **kwargs)
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\agents\trainer_template.py", line 101, in __init__
Trainer.__init__(self, config, env, logger_creator)
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\agents\trainer.py", line 476, in __init__
super().__init__(config, logger_creator)
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\tune\trainable.py", line 249, in __init__
self.setup(copy.deepcopy(self.config))
[...]
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\evaluation\rollout_worker.py", line 1008, in _build_policy_map
policy_map[name] = cls(obs_space, act_space, merged_conf)
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\policy\tf_policy_template.py", line 221, in __init__
obs_include_prev_action_reward=obs_include_prev_action_reward)
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\policy\dynamic_tf_policy.py", line 299, in __init__
self._initialize_loss_dynamically()
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\policy\dynamic_tf_policy.py", line 437, in _initialize_loss_dynamically
loss = self._do_loss_init(train_batch)
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\policy\dynamic_tf_policy.py", line 449, in _do_loss_init
loss = self._loss_fn(self, self.model, self.dist_class, train_batch)
File "D:/Git/example-codes/Multi_Agent/RLlib/MAPPO/switch_v4/mappo_train_switch_v4.py", line 113, in loss_with_central_critic
policy._central_value_out = model.value_function()
File "D:/Git/example-codes/Multi_Agent/RLlib/MAPPO/switch_v4/mappo_train_switch_v4.py", line 111, in <lambda>
train_batch[OPPONENT_ACTION])
File "D:\Git\example-codes\Multi_Agent\RLlib\MAPPO\switch_v4\centralized_critic_model_tf_switch_v4.py", line 46, in central_value_function
return tf.reshape(self.central_vf([obs, opponent_obs, tf.one_hot(opponent_actions, 5)]), [-1]) # ---> changed the depth of one_hot encoding to 5 (5 actions)
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\tensorflow\python\keras\engine\base_layer_v1.py", line 776, in __call__
outputs = call_fn(cast_inputs, *args, **kwargs)
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\tensorflow\python\keras\engine\functional.py", line 386, in call
inputs, training=training, mask=mask)
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\tensorflow\python\keras\engine\functional.py", line 508, in _run_internal_graph
outputs = node.layer(*args, **kwargs)
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\tensorflow\python\keras\engine\base_layer_v1.py", line 752, in __call__
self.name)
[...]
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\policy\dynamic_tf_policy.py", line 299, in __init__
self._initialize_loss_dynamically()
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\policy\dynamic_tf_policy.py", line 437, in _initialize_loss_dynamically
loss = self._do_loss_init(train_batch)
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\policy\dynamic_tf_policy.py", line 449, in _do_loss_init
loss = self._loss_fn(self, self.model, self.dist_class, train_batch)
File "D:/Git/example-codes/Multi_Agent/RLlib/MAPPO/switch_v4/mappo_train_switch_v4.py", line 113, in loss_with_central_critic
policy._central_value_out = model.value_function()
File "D:/Git/example-codes/Multi_Agent/RLlib/MAPPO/switch_v4/mappo_train_switch_v4.py", line 111, in <lambda>
train_batch[OPPONENT_ACTION])
File "D:\Git\example-codes\Multi_Agent\RLlib\MAPPO\switch_v4\centralized_critic_model_tf_switch_v4.py", line 46, in central_value_function
return tf.reshape(self.central_vf([obs, opponent_obs, tf.one_hot(opponent_actions, 5)]), [-1])
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\tensorflow\python\keras\engine\base_layer_v1.py", line 776, in __call__
outputs = call_fn(cast_inputs, *args, **kwargs)
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\tensorflow\python\keras\engine\functional.py", line 386, in call
inputs, training=training, mask=mask)
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\tensorflow\python\keras\engine\functional.py", line 508, in _run_internal_graph
outputs = node.layer(*args, **kwargs)
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\tensorflow\python\keras\engine\base_layer_v1.py", line 752, in __call__
self.name)
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\tensorflow\python\keras\engine\input_spec.py", line 216, in assert_input_compatibility
' but received input with shape ' + str(shape))
ValueError: Input 0 of layer c_vf_dense is incompatible with the layer: expected axis -1 of input shape to have value 31 but received input with shape [None, 13] at time: 1.60796e+09
== Status ==
Memory usage on this node: 11.3/15.7 GiB
Using FIFO scheduling algorithm.
Resources requested: 1/12 CPUs, 0/1 GPUs, 0.0/2.83 GiB heap, 0.0/0.98 GiB objects
Result logdir: C:\Users\z004757h\ray_results\switch_v4
Number of trials: 1 (1 RUNNING)
+------------------------------------+----------+-------+
| Trial name | status | loc |
|------------------------------------+----------+-------|
| CCPPOTrainer_my_switch_de4af_00000 | RUNNING | |
+------------------------------------+----------+-------+
2020-12-14 16:34:36,974 ERROR trial_runner.py:567 -- Trial CCPPOTrainer_my_switch_de4af_00000: Error processing event.
Traceback (most recent call last):
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\tune\trial_runner.py", line 515, in _process_trial
result = self.trial_executor.fetch_result(trial)
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\tune\ray_trial_executor.py", line 488, in fetch_result
result = ray.get(trial_future[0], timeout=DEFAULT_GET_TIMEOUT)
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\worker.py", line 1428, in get
raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(ValueError): ray::CCPPOTrainer.train() (pid=2936, ip=192.168.2.119)
File "python\ray\_raylet.pyx", line 445, in ray._raylet.execute_task
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\worker.py", line 174, in reraise_actor_init_error
raise self.actor_init_error
File "python\ray\_raylet.pyx", line 479, in ray._raylet.execute_task
File "python\ray\_raylet.pyx", line 483, in ray._raylet.execute_task
File "python\ray\_raylet.pyx", line 484, in ray._raylet.execute_task
File "python\ray\_raylet.pyx", line 438, in ray._raylet.execute_task.function_executor
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\function_manager.py", line 553, in actor_method_executor
return method(actor, *args, **kwargs)
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\agents\trainer_template.py", line 101, in __init__
Trainer.__init__(self, config, env, logger_creator)
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\agents\trainer.py", line 476, in __init__
super().__init__(config, logger_creator)
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\tune\trainable.py", line 249, in __init__
self.setup(copy.deepcopy(self.config))
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\agents\trainer.py", line 629, in setup
self._init(self.config, self.env_creator)
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\agents\trainer_template.py", line 125, in _init
self.config["num_workers"])
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\agents\trainer.py", line 699, in _make_workers
logdir=self.logdir)
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\evaluation\worker_set.py", line 74, in __init__
self._local_config)
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\evaluation\worker_set.py", line 305, in _make_worker
extra_python_environs=extra_python_environs)
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\evaluation\rollout_worker.py", line 416, in __init__
self._build_policy_map(policy_dict, policy_config)
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\evaluation\rollout_worker.py", line 1008, in _build_policy_map
policy_map[name] = cls(obs_space, act_space, merged_conf)
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\policy\tf_policy_template.py", line 221, in __init__
obs_include_prev_action_reward=obs_include_prev_action_reward)
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\policy\dynamic_tf_policy.py", line 299, in __init__
self._initialize_loss_dynamically()
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\policy\dynamic_tf_policy.py", line 437, in _initialize_loss_dynamically
loss = self._do_loss_init(train_batch)
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\policy\dynamic_tf_policy.py", line 449, in _do_loss_init
loss = self._loss_fn(self, self.model, self.dist_class, train_batch)
File "D:/Git/example-codes/Multi_Agent/RLlib/MAPPO/switch_v4/mappo_train_switch_v4.py", line 113, in loss_with_central_critic
policy._central_value_out = model.value_function()
File "D:/Git/example-codes/Multi_Agent/RLlib/MAPPO/switch_v4/mappo_train_switch_v4.py", line 111, in <lambda>
train_batch[OPPONENT_ACTION])
File "D:\Git\example-codes\Multi_Agent\RLlib\MAPPO\switch_v4\centralized_critic_model_tf_switch_v4.py", line 46, in central_value_function
return tf.reshape(self.central_vf([obs, opponent_obs, tf.one_hot(opponent_actions, 5)]), [-1]) # ---> changed the depth of one_hot encoding to 5 (5 actions)
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\tensorflow\python\keras\engine\base_layer_v1.py", line 776, in __call__
outputs = call_fn(cast_inputs, *args, **kwargs)
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\tensorflow\python\keras\engine\functional.py", line 386, in call
inputs, training=training, mask=mask)
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\tensorflow\python\keras\engine\functional.py", line 508, in _run_internal_graph
outputs = node.layer(*args, **kwargs)
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\tensorflow\python\keras\engine\base_layer_v1.py", line 752, in __call__
self.name)
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\tensorflow\python\keras\engine\input_spec.py", line 216, in assert_input_compatibility
' but received input with shape ' + str(shape))
ValueError: Input 0 of layer c_vf_dense is incompatible with the layer: expected axis -1 of input shape to have value 31 but received input with shape [None, 13]
E1214 16:34:37.070096 2936 17592 core_worker.cc:1128] Pushed Error with JobID: 01000000 of type: task with message: ray::CCPPOTrainer.stop() (pid=2936, ip=192.168.2.119)
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\tune\trial_runner.py", line 515, in _process_trial
result = self.trial_executor.fetch_result(trial)
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\tune\ray_trial_executor.py", line 488, in fetch_result
result = ray.get(trial_future[0], timeout=DEFAULT_GET_TIMEOUT)
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\worker.py", line 1428, in get
raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(ValueError): ray::CCPPOTrainer.train() (pid=2936, ip=192.168.2.119)
File "python\ray\_raylet.pyx", line 445, in ray._raylet.execute_task
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\worker.py", line 174, in reraise_actor_init_error
raise self.actor_init_error
File "python\ray\_raylet.pyx", line 479, in ray._raylet.execute_task
File "python\ray\_raylet.pyx", line 483, in ray._raylet.execute_task
File "python\ray\_raylet.pyx", line 484, in ray._raylet.execute_task
File "python\ray\_raylet.pyx", line 438, in ray._raylet.execute_task.function_executor
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\function_manager.py", line 553, in actor_method_executor
return method(actor, *args, **kwargs)
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\agents\trainer_template.py", line 101, in __init__
Trainer.__init__(self, config, env, logger_creator)
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\agents\trainer.py", line 476, in __init__
super().__init__(config, logger_creator)
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\tune\trainable.py", line 249, in __init__
self.setup(copy.deepcopy(self.config))
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\agents\trainer.py", line 629, in setup
self._init(self.config, self.env_creator)
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\agents\trainer_template.py", line 125, in _init
self.config["num_workers"])
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\agents\trainer.py", line 699, in _make_workers
logdir=self.logdir)
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\evaluation\worker_set.py", line 74, in __init__
self._local_config)
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\evaluation\worker_set.py", line 305, in _make_worker
extra_python_environs=extra_python_environs)
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\evaluation\rollout_worker.py", line 416, in __init__
self._build_policy_map(policy_dict, policy_config)
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\evaluation\rollout_worker.py", line 1008, in _build_policy_map
policy_map[name] = cls(obs_space, act_space, merged_conf)
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\policy\tf_policy_template.py", line 221, in __init__
obs_include_prev_action_reward=obs_include_prev_action_reward)
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\policy\dynamic_tf_policy.py", line 299, in __init__
self._initialize_loss_dynamically()
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\policy\dynamic_tf_policy.py", line 437, in _initialize_loss_dynamically
loss = self._do_loss_init(train_batch)
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\rllib\policy\dynamic_tf_policy.py", line 449, in _do_loss_init
loss = self._loss_fn(self, self.model, self.dist_class, train_batch)
File "D:/Git/example-codes/Multi_Agent/RLlib/MAPPO/switch_v4/mappo_train_switch_v4.py", line 113, in loss_with_central_critic
policy._central_value_out = model.value_function()
File "D:/Git/example-codes/Multi_Agent/RLlib/MAPPO/switch_v4/mappo_train_switch_v4.py", line 111, in <lambda>
train_batch[OPPONENT_ACTION])
File "D:\Git\example-codes\Multi_Agent\RLlib\MAPPO\switch_v4\centralized_critic_model_tf_switch_v4.py", line 46, in central_value_function
return tf.reshape(self.central_vf([obs, opponent_obs, tf.one_hot(opponent_actions, 5)]), [-1]) # ---> changed the depth of one_hot encoding to 5 (5 actions)
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\tensorflow\python\keras\engine\base_layer_v1.py", line 776, in __call__
outputs = call_fn(cast_inputs, *args, **kwargs)
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\tensorflow\python\keras\engine\functional.py", line 386, in call
inputs, training=training, mask=mask)
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\tensorflow\python\keras\engine\functional.py", line 508, in _run_internal_graph
outputs = node.layer(*args, **kwargs)
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\tensorflow\python\keras\engine\base_layer_v1.py", line 752, in __call__
self.name)
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\tensorflow\python\keras\engine\input_spec.py", line 216, in assert_input_compatibility
' but received input with shape ' + str(shape))
ValueError: Input 0 of layer c_vf_dense is incompatible with the layer: expected axis -1 of input shape to have value 31 but received input with shape [None, 13]
== Status ==
Memory usage on this node: 11.3/15.7 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/12 CPUs, 0/1 GPUs, 0.0/2.83 GiB heap, 0.0/0.98 GiB objects
Result logdir: C:\Users\z004757h\ray_results\switch_v4
Number of trials: 1 (1 ERROR)
+------------------------------------+----------+-------+
| Trial name | status | loc |
|------------------------------------+----------+-------|
| CCPPOTrainer_my_switch_de4af_00000 | ERROR | |
+------------------------------------+----------+-------+
Number of errored trials: 1
+------------------------------------+--------------+------------------------------------------------------------------------------------------------------------+
| Trial name | # failures | error file |
|------------------------------------+--------------+------------------------------------------------------------------------------------------------------------|
| CCPPOTrainer_my_switch_de4af_00000 | 1 | C:\Users\z004757h\ray_results\switch_v4\CCPPOTrainer_my_switch_de4af_00000_0_2020-12-14_16-34-34\error.txt |
+------------------------------------+--------------+------------------------------------------------------------------------------------------------------------+
Traceback (most recent call last):
File "D:/Git/example-codes/Multi_Agent/RLlib/MAPPO/switch_v4/mappo_train_switch_v4.py", line 211, in <module>
metric="episode_reward_mean"
File "C:\Users\z004757h\Anaconda3\envs\marl-env-v3\lib\site-packages\ray\tune\tune.py", line 427, in run
raise TuneError("Trials did not complete", incomplete_trials)
ray.tune.error.TuneError: ('Trials did not complete', [CCPPOTrainer_my_switch_de4af_00000])
Process finished with exit code 1
Did anyone manage to adjust the number of agents in the centralized_critic.py example or has an idea what else I have to change?
Thank you in advance!
Cheers, Korbi