import os
import torch

from ray import tune
from ray.rllib.algorithms.ppo import PPO, PPOConfig
from ray.rllib.algorithms.ppo.ppo_torch_policy import PPOTorchPolicy
from ray.rllib.core.rl_module.marl_module import MultiAgentRLModuleSpec
from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec
from ray.rllib.policy.policy import PolicySpec
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.policy.torch_policy_v2 import TorchPolicyV2

# MetaPolicyNetwork, LudoMultiAgentEnv, RandomBotPosCallback, RandomAction,
# policy_mapping_fn, and env are defined elsewhere in my project.


class MetaLearningPolicy(TorchPolicyV2):
    def __init__(self, observation_space, action_space, config):
        # Initialize the meta-policy network first
        self.meta_policy_net = MetaPolicyNetwork(input_size=observation_space.shape[0])
        # Initialize the underlying PPO policy
        self.ppo_policy = PPOTorchPolicy(observation_space, action_space, config)
        # Combine parameters from both the PPO policy and the meta-policy network
        all_params = list(self.ppo_policy.model.parameters()) + list(self.meta_policy_net.parameters())
        # Set up a dummy model that exposes all parameters
        self.model = torch.nn.Module()
        self.model.parameters = lambda: iter(all_params)
        # Now call the parent constructor to initialize TorchPolicyV2
        super().__init__(observation_space, action_space, config)
        # Initialize the optimizers after the model is fully set up
        self._optimizers = self._setup_optimizers()
    def _setup_optimizers(self):
        # Combine optimizers from PPO and the meta-policy network
        ppo_optimizers = self.ppo_policy.optimizer()
        meta_policy_optimizer = torch.optim.Adam(self.meta_policy_net.parameters(), lr=1e-4)
        return ppo_optimizers + [meta_policy_optimizer]

    def optimizer(self):
        # Override optimizer() to return the combined optimizer list
        if not hasattr(self, '_optimizers') or self._optimizers is None:
            self._optimizers = self._setup_optimizers()
        return self._optimizers
    def postprocess_trajectory(self, sample_batch, other_agent_batches=None, episode=None):
        # Get the dynamic gamma from the meta-policy network
        gammas = self.meta_policy_net(sample_batch[SampleBatch.OBS]).squeeze()
        gammas = torch.clamp(gammas, min=0.0, max=1.0)
        rewards = sample_batch[SampleBatch.REWARDS]
        dones = sample_batch[SampleBatch.DONES]
        values = sample_batch[SampleBatch.VF_PREDS]
        lambdas = torch.full_like(gammas, self.config["lambda"])
        # Compute discounted rewards and advantages
        discounted_rewards = self.compute_value_targets(rewards, gammas, dones)
        advantages = self.compute_gae(rewards, values, gammas, lambdas, dones)
        sample_batch[SampleBatch.REWARDS] = discounted_rewards
        sample_batch[SampleBatch.ADVANTAGES] = advantages
        sample_batch[SampleBatch.VALUE_TARGETS] = discounted_rewards
        # Pass the processed batch to the underlying PPO policy
        return self.ppo_policy.postprocess_trajectory(sample_batch, other_agent_batches, episode)
    # Delegate other methods to the underlying PPO policy
    def compute_actions(self, *args, **kwargs):
        return self.ppo_policy.compute_actions(*args, **kwargs)

    def learn_on_batch(self, samples):
        return self.ppo_policy.learn_on_batch(samples)

    def get_weights(self):
        return self.ppo_policy.get_weights()

    def set_weights(self, weights):
        self.ppo_policy.set_weights(weights)
    def compute_value_targets(self, rewards, gammas, dones):
        # Discounted return for each timestep, using the per-step gammas
        value_targets = torch.zeros_like(rewards)
        cumulative_reward = 0
        for t in reversed(range(len(rewards))):
            if dones[t]:
                cumulative_reward = 0
            cumulative_reward = rewards[t] + gammas[t] * cumulative_reward
            value_targets[t] = cumulative_reward
        return value_targets

    def compute_gae(self, rewards, values, gammas, lambdas, dones):
        # Generalized Advantage Estimation with per-step gammas and lambdas
        advantages = torch.zeros_like(rewards)
        gae = 0
        next_value = 0
        for t in reversed(range(len(rewards))):
            if dones[t]:
                gae = 0
                next_value = 0
            delta = rewards[t] + gammas[t] * next_value - values[t]
            gae = delta + gammas[t] * lambdas[t] * gae
            advantages[t] = gae
            next_value = values[t]
        return advantages
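
For reference, the two helpers above can be exercised in isolation with toy tensors (this is just an illustrative sketch with made-up values; the helpers don't use self, so passing None works here):

import torch

rewards = torch.tensor([1.0, 0.0, 2.0, 1.0])
values = torch.tensor([0.5, 0.4, 0.8, 0.3])
gammas = torch.tensor([0.99, 0.95, 0.99, 0.90])  # per-step discount, as the meta net would output
lambdas = torch.full_like(gammas, 0.95)
dones = torch.tensor([False, False, False, True])

# The helpers only use their arguments, so we can call them unbound.
targets = MetaLearningPolicy.compute_value_targets(None, rewards, gammas, dones)
advantages = MetaLearningPolicy.compute_gae(None, rewards, values, gammas, lambdas, dones)
print(targets)
print(advantages)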
config = (
    PPOConfig()
    .environment(env=LudoMultiAgentEnv)
    .framework("torch")  # Use PyTorch
    .callbacks(RandomBotPosCallback)
    .multi_agent(
        policies={
            "ppo_policy": PolicySpec(
                policy_class=MetaLearningPolicy,  # Use the custom meta-learning policy
                observation_space=env.observation_space,
                action_space=env.action_space,
            ),
            "random_policy": (None, env.observation_space, env.action_space, {}),
        },
        policy_mapping_fn=policy_mapping_fn,
    )
    .training(
        # gamma=0.99,  # Static discount factor (replaced by the dynamic gamma above)
        lr=1e-4,  # Learning rate
        train_batch_size=4000,  # Batch size for training
        model={
            "fcnet_hiddens": [1024, 512, 256, 64, 32],
            "fcnet_activation": "relu",
        },
    )
    .rl_module(
        rl_module_spec=MultiAgentRLModuleSpec(
            module_specs={
                "random_policy": SingleAgentRLModuleSpec(
                    module_class=RandomAction,
                    observation_space=env.observation_space,
                    action_space=env.action_space,
                ),
                "ppo_policy": SingleAgentRLModuleSpec(
                    module_class=None,  # Use the default PPO RLModule
                    observation_space=env.observation_space,
                    action_space=env.action_space,
                ),
            }
        )
    )
    .resources(num_gpus=1)  # Use one GPU (set to 0 to run on CPU only)
    .debugging(log_level="INFO")
)
# Use env_runners instead of the old rollouts API
config.env_runners(
    num_env_runners=4,  # Replaces num_rollout_workers
    num_envs_per_env_runner=8,  # Replaces num_envs_per_worker
)

# Set up the TensorBoard log directory
storage_path = os.path.abspath("./ludo_rllib_logs_agt_vs_random_dynamic_gamma/")
os.makedirs(storage_path, exist_ok=True)

# Run the training
results = tune.run(
    PPO,
    config=config.to_dict(),
    stop={"training_iteration": 2000},  # Stop after 2000 iterations
    storage_path=storage_path,  # Directory for logs
    checkpoint_freq=200,  # Save a checkpoint every 200 iterations
    checkpoint_at_end=True,  # Save a checkpoint at the end
    verbose=1,  # Print progress
)
I'm getting the error below:
File "/nfs_storage/fs-mnt6/tarunsai/anaconda/lib/python3.9/site-packages/ray/rllib/env/env_runner_group.py", line 195, in __init__ [repeated 27x across cluster]
(PPO pid=5774) self._update_policy_map(policy_dict=self.policy_dict) [repeated 8x across cluster]
(PPO pid=5774) File "/nfs_storage/fs-mnt6/tarunsai/anaconda/lib/python3.9/site-packages/ray/rllib/evaluation/rollout_worker.py", line 1726, in _update_policy_map [repeated 8x across cluster]
(PPO pid=5774) self._build_policy_map( [repeated 8x across cluster]
(PPO pid=5774) File "/nfs_storage/fs-mnt6/tarunsai/anaconda/lib/python3.9/site-packages/ray/rllib/evaluation/rollout_worker.py", line 1837, in _build_policy_map [repeated 8x across cluster]
(PPO pid=5774) new_policy = create_policy_for_framework( [repeated 8x across cluster]
(PPO pid=5774) File "/nfs_storage/fs-mnt6/tarunsai/anaconda/lib/python3.9/site-packages/ray/rllib/utils/policy.py", line 140, in create_policy_for_framework [repeated 8x across cluster]
(PPO pid=5774) return policy_class(observation_space, action_space, merged_config) [repeated 8x across cluster]
(PPO pid=5774) super().__init__(observation_space, action_space, config) [repeated 8x across cluster]
(PPO pid=5774) param_indices.append(main_params[p]) [repeated 8x across cluster]
(PPO pid=5774) KeyError: Parameter containing: [repeated 9x across cluster]
(PPO pid=5774) [ 1.4564e-03, -3.2454e-03, -1.8770e-03, -1.8261e-03, 1.6383e-05, [repeated 19x across cluster]
(PPO pid=5774) -1.4045e-03, -1.3660e-03]], requires_grad=True) [repeated 9x across cluster]
(PPO pid=5774) tensor([[ 5.9189e-04, -9.2856e-04, -9.7775e-04, 2.2166e-03, -2.3867e-03, [repeated 3x across cluster]
Can someone help?
@Tarun_Sai thanks for posting this. Can you give us a simple reproducible example so we can debug this? The code above is not executable as it stands.
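
For example, something stripped down along these lines would already help. The traceback points at the super().__init__ call, i.e. TorchPolicyV2's constructor, so directly constructing the policy on a toy space should be enough to trigger the KeyError (this is only a sketch, assuming CartPole-v1 as a stand-in for the Ludo observation/action spaces):

import gymnasium as gym
from ray.rllib.algorithms.ppo import PPOConfig

env = gym.make("CartPole-v1")
config = PPOConfig().framework("torch").to_dict()

# Constructing the policy directly goes through TorchPolicyV2.__init__,
# which is where the traceback above points.
policy = MetaLearningPolicy(env.observation_space, env.action_space, config)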