I tried creating a new custom policy (PPO with a dynamic, per-step gamma predicted by a meta-policy network), but I am getting an error when training starts. Here is my code:
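
For context, the snippet relies on roughly the following imports (LudoMultiAgentEnv, MetaPolicyNetwork, RandomBotPosCallback, RandomAction, env, and policy_mapping_fn are defined elsewhere in my script and are not shown here):

import os

import torch
from ray import tune
from ray.rllib.algorithms.ppo import PPO, PPOConfig
from ray.rllib.algorithms.ppo.ppo_torch_policy import PPOTorchPolicy
from ray.rllib.core.rl_module.marl_module import MultiAgentRLModuleSpec
from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec
from ray.rllib.policy.policy import PolicySpec
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.policy.torch_policy_v2 import TorchPolicyV2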

class MetaLearningPolicy(TorchPolicyV2):
    def __init__(self, observation_space, action_space, config):    
        # Initialize the meta-policy network first
        self.meta_policy_net = MetaPolicyNetwork(input_size=observation_space.shape[0])

        # Initialize the underlying PPO policy
        self.ppo_policy = PPOTorchPolicy(observation_space, action_space, config)

        # Combine parameters from both PPO policy and Meta-policy network
        all_params = list(self.ppo_policy.model.parameters()) + list(self.meta_policy_net.parameters())

        # Set up a dummy model that includes all parameters
        self.model = torch.nn.Module()
        self.model.parameters = lambda: iter(all_params)

        # Now call the parent constructor to initialize TorchPolicyV2
        super().__init__(observation_space, action_space, config)

        # Initialize the optimizers after the model is fully set up
        self._optimizers = self._setup_optimizers()

    def _setup_optimizers(self):
        # Combine optimizers from PPO and the meta-policy network
        ppo_optimizers = self.ppo_policy.optimizer()
        meta_policy_optimizer = torch.optim.Adam(self.meta_policy_net.parameters(), lr=1e-4)
        return ppo_optimizers + [meta_policy_optimizer]

    def optimizer(self):
        # Override optimizer method to return the combined optimizer list
        if not hasattr(self, '_optimizers') or self._optimizers is None:
            self._optimizers = self._setup_optimizers()
        return self._optimizers


    def postprocess_trajectory(self, sample_batch, other_agent_batches=None, episode=None):
        # Get the dynamic (per-step) gamma from the meta-policy network
        obs = torch.as_tensor(sample_batch[SampleBatch.OBS], dtype=torch.float32)
        gammas = self.meta_policy_net(obs).squeeze()
        gammas = torch.clamp(gammas, min=0.0, max=1.0)

        rewards = torch.as_tensor(sample_batch[SampleBatch.REWARDS], dtype=torch.float32)
        dones = torch.as_tensor(sample_batch[SampleBatch.DONES])
        values = torch.as_tensor(sample_batch[SampleBatch.VF_PREDS], dtype=torch.float32)
        lambdas = torch.full_like(gammas, self.config["lambda"])

        # Compute discounted returns and GAE advantages using the per-step gammas
        discounted_rewards = self.compute_value_targets(rewards, gammas, dones)
        advantages = self.compute_gae(rewards, values, gammas, lambdas, dones)

        # SampleBatch stores numpy arrays, so convert back before writing
        sample_batch[SampleBatch.REWARDS] = discounted_rewards.detach().numpy()
        sample_batch[SampleBatch.ADVANTAGES] = advantages.detach().numpy()
        sample_batch[SampleBatch.VALUE_TARGETS] = discounted_rewards.detach().numpy()

        # Pass the processed batch to the underlying PPO policy
        return self.ppo_policy.postprocess_trajectory(sample_batch, other_agent_batches, episode)


    # Delegate other methods to the underlying PPO policy
    def compute_actions(self, *args, **kwargs):
        return self.ppo_policy.compute_actions(*args, **kwargs)
    
    def learn_on_batch(self, samples):
        return self.ppo_policy.learn_on_batch(samples)
    
    def get_weights(self):
        return self.ppo_policy.get_weights()
    
    def set_weights(self, weights):
        self.ppo_policy.set_weights(weights)

    def compute_value_targets(self, rewards, gammas, dones):
        # Discounted return computed backwards using the per-step gamma from the meta-policy net
        value_targets = torch.zeros_like(rewards)
        cumulative_reward = 0
        for t in reversed(range(len(rewards))):
            if dones[t]:
                cumulative_reward = 0
            cumulative_reward = rewards[t] + gammas[t] * cumulative_reward
            value_targets[t] = cumulative_reward
        return value_targets

    def compute_gae(self, rewards, values, gammas, lambdas, dones):
        # Generalized Advantage Estimation with per-step gamma and lambda
        advantages = torch.zeros_like(rewards)
        gae = 0
        next_value = 0
        for t in reversed(range(len(rewards))):
            if dones[t]:
                gae = 0
                next_value = 0
            delta = rewards[t] + gammas[t] * next_value - values[t]
            gae = delta + gammas[t] * lambdas[t] * gae
            advantages[t] = gae
            next_value = values[t]
        return advantages
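
In case it matters: MetaPolicyNetwork is just a torch module that takes an observation and outputs a per-step discount factor in [0, 1]. A simplified sketch of it looks like this (the exact layer sizes shouldn't matter for the error):

class MetaPolicyNetwork(torch.nn.Module):
    # Small MLP mapping an observation to a per-step discount factor
    def __init__(self, input_size, hidden_size=64):
        super().__init__()
        self.net = torch.nn.Sequential(
            torch.nn.Linear(input_size, hidden_size),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_size, 1),
            torch.nn.Sigmoid(),  # keep the predicted gamma in (0, 1)
        )

    def forward(self, obs):
        return self.net(obs)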

config = (
    PPOConfig()
    .environment(env=LudoMultiAgentEnv)
    .framework("torch")  # Use PyTorch
    .callbacks(RandomBotPosCallback)
    .multi_agent(
        policies={
            "ppo_policy":PolicySpec(
                policy_class=MetaLearningPolicy,  # Use your custom policy
                observation_space=env.observation_space,
                action_space=env.action_space,
            ),
            "random_policy": (None, env.observation_space, env.action_space, {}),
        },
        policy_mapping_fn=policy_mapping_fn,
    )
    .training(
        # gamma=0.99,  # Fixed discount factor (gamma now comes from the meta-policy network)
        lr=1e-4,  # Learning rate
        train_batch_size=4000,  # Batch size for training
        model={
            "fcnet_hiddens": [1024, 512, 256, 64, 32],
            "fcnet_activation": "relu",
        }
    )
    .rl_module(
        rl_module_spec=MultiAgentRLModuleSpec(
            module_specs={
                "random_policy": SingleAgentRLModuleSpec(
                    module_class=RandomAction,
                    observation_space=env.observation_space,
                    action_space=env.action_space,
                ),
                "ppo_policy": SingleAgentRLModuleSpec(
                    module_class=None,  # Use the default PPO policy
                    observation_space=env.observation_space,
                    action_space=env.action_space,
                ),
            }
        )
    )
    .resources(num_gpus=1)  # Train on a single GPU
    .debugging(log_level="INFO")
)

# Replace rollouts with env_runners
config.env_runners(
    num_env_runners=4,  # This replaces num_rollout_workers
    num_envs_per_env_runner=8  # This replaces num_envs_per_worker
)

# Set up TensorBoard log directory
storage_path = os.path.abspath("./ludo_rllib_logs_agt_vs_random_dynamic_gamma/")
os.makedirs(storage_path, exist_ok=True)

# Run the training
results = tune.run(
    PPO,
    config=config.to_dict(),
    stop={"training_iteration": 2000},  # Stop after 100 iterations
    storage_path=storage_path,  # Directory for logs
    checkpoint_freq=200,  # Save a checkpoint every 200 iterations
    checkpoint_at_end=True,  # Save checkpoint at the end
    verbose=1,  # Print progress    
)

When I run this, training fails while the policies are being built on the rollout workers, and I get the traceback below:

File "/nfs_storage/fs-mnt6/tarunsai/anaconda/lib/python3.9/site-packages/ray/rllib/env/env_runner_group.py", line 195, in __init__ [repeated 27x across cluster]
(PPO pid=5774)     self._update_policy_map(policy_dict=self.policy_dict) [repeated 8x across cluster]
(PPO pid=5774)   File "/nfs_storage/fs-mnt6/tarunsai/anaconda/lib/python3.9/site-packages/ray/rllib/evaluation/rollout_worker.py", line 1726, in _update_policy_map [repeated 8x across cluster]
(PPO pid=5774)     self._build_policy_map( [repeated 8x across cluster]
(PPO pid=5774)   File "/nfs_storage/fs-mnt6/tarunsai/anaconda/lib/python3.9/site-packages/ray/rllib/evaluation/rollout_worker.py", line 1837, in _build_policy_map [repeated 8x across cluster]
(PPO pid=5774)     new_policy = create_policy_for_framework( [repeated 8x across cluster]
(PPO pid=5774)   File "/nfs_storage/fs-mnt6/tarunsai/anaconda/lib/python3.9/site-packages/ray/rllib/utils/policy.py", line 140, in create_policy_for_framework [repeated 8x across cluster]
(PPO pid=5774)     return policy_class(observation_space, action_space, merged_config) [repeated 8x across cluster]
(PPO pid=5774)     super().__init__(observation_space, action_space, config) [repeated 8x across cluster]
(PPO pid=5774)     param_indices.append(main_params[p]) [repeated 8x across cluster]
(PPO pid=5774) KeyError: Parameter containing: [repeated 9x across cluster]
(PPO pid=5774)         [ 1.4564e-03, -3.2454e-03, -1.8770e-03, -1.8261e-03,  1.6383e-05, [repeated 19x across cluster]
(PPO pid=5774)          -1.4045e-03, -1.3660e-03]], requires_grad=True) [repeated 9x across cluster]
(PPO pid=5774) tensor([[ 5.9189e-04, -9.2856e-04, -9.7775e-04,  2.2166e-03, -2.3867e-03, [repeated 3x across cluster]

Can someone help?

@Tarun_Sai thanks for posting this. Can you give us a simple, reproducible example so we can debug this? The code above is not executable as posted.