How to separate rewards between agents in an adversarial multi-agent env

How severely does this issue affect your experience of using Ray?

  • High: It blocks me from completing my task.

I created a custom adversarial multi-agent environment using PettingZoo. Looking at the rewards reported in the info dict and the TensorBoard logs, it seems that the agents' rewards are summed together.

I want to separate the summed reward into per-agent rewards.

Here’s my training code:

import pettingzoo_env

import os
from copy import deepcopy

from ray.tune.registry import register_env
from ray.rllib.env import PettingZooEnv
from ray.rllib.agents.registry import get_trainer_class
from ray import tune

if __name__ == '__main__':
    os.environ["TUNE_ORIG_WORKING_DIR"] = os.getcwd()
    
    algorithm_version = 'DQN'
    comment_suffix = "a(3w2s)-d(2w1s)_default"

    config = deepcopy(get_trainer_class(algorithm_version)._default_config)

    def env_creator(max_turn=20, render_mode="show"):
        # Note: pettingzoo_env.PettingZooEnv is the custom environment class,
        # not RLlib's wrapper of the same name imported above.
        env = pettingzoo_env.PettingZooEnv(max_turn, render_mode)
        return env

    test_env = PettingZooEnv(env_creator())
    obs_space = test_env.observation_space
    act_space = test_env.action_space
    
    register_env("my_env",lambda config: PettingZooEnv(env_creator()))

    config["multiagent"] = {
        "policies": {
            "attacker": (None, obs_space, act_space, {}),
            "defender": (None, obs_space, act_space, {}),
        },
        "policy_mapping_fn": lambda agent_id: agent_id,
    }   

    config["num_gpus"] = int(os.environ.get("RLLIB_NUM_GPUS", "0"))
    config["log_level"] = "INFO"
    config["num_workers"] = 1
    config["env"] = "my_env"
     
    register_env("env", lambda config: PettingZooEnv(env_creator()))
    test_env = PettingZooEnv(env_creator())
    obs_space = test_env.observation_space
    act_space = test_env.action_space

    tune.run(algorithm_version,
             name=algorithm_version,
             checkpoint_freq=1000,
             stop={"episodes_total": 2},
             config=config,
             local_dir="./logs")

The environment code is here:

I’m not sure whether there is a way to print the separated rewards in the training output that goes to the terminal. However, the rewards are stored per policy in the local_dir output. For example, if you launch TensorBoard and scroll through the scalar pages (around page 4 or 5), you should see a separate reward curve for each policy.
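
As a concrete illustration, here is a minimal sketch that reuses the config dict and the "my_env" registration from the training script above (so the old ray.rllib.agents Trainer API applies; the 3-iteration loop is arbitrary). RLlib already reports per-policy episode rewards in every training result under policy_reward_mean, next to the summed episode_reward_mean:

from ray.rllib.agents.registry import get_trainer_class

# Build a trainer from the same config dict used with tune.run() above
# (assumes "my_env" has been registered in this process).
trainer = get_trainer_class("DQN")(config=config)

for _ in range(3):
    result = trainer.train()
    # Per-policy reward means, e.g. {"attacker": 1.2, "defender": -1.2}
    print(result["policy_reward_mean"])
    # The reward summed over all agents (what the terminal output shows).
    print(result["episode_reward_mean"])

In TensorBoard, the same values should appear as one scalar curve per policy under policy_reward_mean.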

This will be covered in tomorrow’s office hours :slight_smile:


Example code from today’s office hours:

import numpy as np

from pettingzoo.sisl import waterworld_v3

import ray
from ray.tune import CLIReporter
from ray import air, tune
from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.env.wrappers.pettingzoo_env import PettingZooEnv
from ray.tune.registry import register_env
from ray.rllib.algorithms.callbacks import DefaultCallbacks


class MyCallbacks(DefaultCallbacks):
    def on_train_result(self, *, algorithm, result: dict, **kwargs):
        # Copy the per-policy reward means into custom_metrics so they can be
        # used as stopping criteria and displayed as CLIReporter columns below.
        result["custom_metrics"]["policy_reward_mean"] = {
            "pursuer_0": result["policy_reward_mean"].get("pursuer_0", np.nan),
            "pursuer_1": result["policy_reward_mean"].get("pursuer_1", np.nan),
            "pursuer_2": result["policy_reward_mean"].get("pursuer_2", np.nan),
            "pursuer_3": result["policy_reward_mean"].get("pursuer_3", np.nan),
        }


if __name__ == '__main__':
    ray.init(local_mode=True)

    def env_creator(args):
        return PettingZooEnv(waterworld_v3.env())

    dummy_env = env_creator({})
    register_env("waterworld", env_creator)

    obs_space = dummy_env.observation_space
    act_space = dummy_env.action_space

    config = PPOConfig()
    config.multi_agent(
        policies={pid: (None, obs_space, act_space, {})
                  for pid in dummy_env.env.agents},
        policy_mapping_fn=(lambda agent_id, episode, **kwargs: agent_id),
    )
    config.rollouts(num_rollout_workers=4)
    config.environment(env="waterworld")
    config.callbacks(MyCallbacks)
    config = config.to_dict()

    tune.Tuner(
        "PPO",
        run_config=air.RunConfig(
            stop={
                "episodes_total": 1,
                "custom_metrics/policy_reward_mean/pursuer_0": 0,
                "custom_metrics/policy_reward_mean/pursuer_1": 0,
                "custom_metrics/policy_reward_mean/pursuer_2": 0,
                "custom_metrics/policy_reward_mean/pursuer_3": 0,
            },
            checkpoint_config=air.CheckpointConfig(
                checkpoint_frequency=1000,
            ),
            progress_reporter=CLIReporter(
                metric_columns={
                    "training_iteration": "training_iteration",
                    "time_total_s": "time_total_s",
                    "timesteps_total": "timesteps",
                    "episodes_this_iter": "episodes_trained",
                    "custom_metrics/policy_reward_mean/pursuer_0": "m_reward_p_0",
                    "custom_metrics/policy_reward_mean/pursuer_1": "m_reward_p_1",
                    "custom_metrics/policy_reward_mean/pursuer_2": "m_reward_p_2",
                    "custom_metrics/policy_reward_mean/pursuer_3": "m_reward_p_3",
                    "episode_reward_mean": "mean_reward_sum",
                },
                sort_by_metric=True,
            ),
        ),
        param_space=config,
    ).fit()
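
For the attacker/defender setup from the original question, the same callback pattern should carry over. Here is a hedged sketch: the policy IDs "attacker" and "defender" come from the question's config, the class name is only illustrative, and the import path matches the new-style API used in the office-hours example:

import numpy as np

from ray.rllib.algorithms.callbacks import DefaultCallbacks


class AttackerDefenderCallbacks(DefaultCallbacks):
    def on_train_result(self, *, algorithm, result: dict, **kwargs):
        # Mirror the per-policy reward means into custom_metrics, exactly like
        # the waterworld example above, so they can be used as stop criteria
        # and shown as CLIReporter columns.
        result["custom_metrics"]["policy_reward_mean"] = {
            "attacker": result["policy_reward_mean"].get("attacker", np.nan),
            "defender": result["policy_reward_mean"].get("defender", np.nan),
        }

The corresponding CLIReporter columns would then be "custom_metrics/policy_reward_mean/attacker" and "custom_metrics/policy_reward_mean/defender". If you stay on the older ray.rllib.agents API from the question, the equivalent import is ray.rllib.agents.callbacks.DefaultCallbacks and the class is passed via config["callbacks"].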