Multi-agent rock-paper-scissors training gets weird results

Hello,

I am implementing a two-player (Alice and Bob) rock-paper-scissors environment using the MultiAgentEnv class.
The observation space contains both players’ utilities (rewards) and both players’ actions.
The action space contains 0 (rock), 1 (paper), 2 (scissors).
Reward for each player: winner +20, loser -20.
If it’s a tie, in one case (a) both players get +10; in the other case (b) both players get 0 or -10.
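
For reference, the intended payoff rule can be sketched as a small helper (this is just an illustration of the rewards described above, not the actual environment code; TIE_REWARD is a placeholder for whichever tie rule, (a) or (b), is being tested):

TIE_REWARD = 10  # case (a); use 0 (or -10) for case (b)

def payoff(alice_action, bob_action):
    # Actions: 0 = rock, 1 = paper, 2 = scissors.
    # Returns (alice_reward, bob_reward) for one round.
    if alice_action == bob_action:
        return TIE_REWARD, TIE_REWARD
    # (winner - loser) % 3 == 1 in rock-paper-scissors, e.g. paper (1) beats rock (0).
    if (alice_action - bob_action) % 3 == 1:
        return 20, -20
    return -20, 20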

After 500 iterations:
Case a) converges to Alice and Bob always playing the same action,
which maximizes the total reward.
Case b) converges to Alice always showing scissors and Bob always showing rock, which maximizes only Alice’s reward while ignoring Bob’s.

My understanding is that the result should not converge to one specific action; each action should be played with 1/3 probability. I am wondering where this error comes from?
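
As a quick sanity check of that expectation (a rough sketch using the reward values above, not part of the training code), the expected per-round payoff under uniform 1/3 play can be computed directly; with +10 ties (case a) it is about 3.33 while always matching the opponent yields +10 every round, whereas with 0-reward ties (case b) uniform play gives an expected 0:

import itertools

def expected_uniform_payoff(tie_reward):
    # Average Alice payoff when both players pick 0/1/2 uniformly at random.
    total = 0.0
    for a, b in itertools.product(range(3), repeat=2):
        if a == b:
            total += tie_reward
        elif (a - b) % 3 == 1:
            total += 20
        else:
            total -= 20
    return total / 9

print(expected_uniform_payoff(10))  # case (a): 3.33...
print(expected_uniform_payoff(0))   # case (b): 0.0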

Thanks!

import numpy as np
import networkx as nx
import gym
import random

from ray.rllib.env.multi_agent_env import MultiAgentEnv
import ray
from ray import tune
from ray.rllib.agents.registry import get_agent_class
from ray.rllib.models import ModelCatalog
from ray.tune import run_experiments
from ray.tune.registry import register_env
from ray.rllib.agents.ppo import PPOTrainer
from IPython import embed

# 0 Alice;1 Bob


class rpsEnv(MultiAgentEnv):
    def __init__(self, return_agent_actions=False, part=False):
        self.num_agents = 2
        self.player_list = ['Alice', 'Bob']
        self.action_space = gym.spaces.Discrete(3)
        self.action_space_dict = {}
        self.obs_space_dict = {}
        for i in self.player_list:
            self.obs_space_dict[i] = gym.spaces.Dict({
                'Alice_utilities': gym.spaces.Box(low=np.array([-50.]), high=np.array([50.])),
                'Bob_utilities': gym.spaces.Box(low=np.array([-50.]), high=np.array([50.])),
                'Alice_action': gym.spaces.Discrete(3),
                'Bob_action': gym.spaces.Discrete(3)
            })
        self.action_space_dict = {
            i: gym.spaces.Discrete(3)
            for i in self.player_list
        }
        self.obs = {i: {
            'Alice_utilities': np.array([0]),
            'Bob_utilities': np.array([0]),
            'Alice_action': 0,
            'Bob_action': 0
        } for i in self.player_list}  # Observations.
        self.rew = {i: 0. for i in self.player_list}  # Rewards.
        self.done = {i: False for i in self.player_list}
        self.done['__all__'] = False

    def reset(self):
        self.obs = {i: {
            'Alice_utilities': np.array([0]),
            'Bob_utilities': np.array([0]),
            'Alice_action': 0,
            'Bob_action': 0
        } for i in self.player_list}
        return self.obs

    def cal_rewards(self, action_dict):
        # Start both players at 0 each round; the original snippet used
        # `reward` without defining it first, which raises a NameError.
        reward = {i: 0. for i in self.player_list}
        if action_dict['Alice'] == 0:
            if action_dict['Bob'] == 2:
                reward['Alice'] += 20
                reward['Bob'] -= 20
            elif action_dict['Bob'] == 1:
                reward['Alice'] -= 20
                reward['Bob'] += 20
            else:
                # Tie: in case (b) the rewards stay at 0; case (a) would add +10 to each.
                reward['Alice'] -= 0
                reward['Bob'] -= 0
        elif action_dict['Alice'] == 1:
            if action_dict['Bob'] == 0:
                reward['Alice'] += 20
                reward['Bob'] -= 20
            elif action_dict['Bob'] == 2:
                reward['Alice'] -= 20
                reward['Bob'] += 20
            else:
                reward['Alice'] -= 0
                reward['Bob'] -= 0
        else:
            if action_dict['Bob'] == 1:
                reward['Alice'] += 20
                reward['Bob'] -= 20
            elif action_dict['Bob'] == 0:
                reward['Alice'] -= 20
                reward['Bob'] += 20
            else:
                reward['Alice'] -= 0
                reward['Bob'] -= 0
        return reward

    def step(self, action_dict):
        self.obs = {i: {
            'Alice_utilities': np.array([0]),
            'Bob_utilities': np.array([0]),
            'Alice_action': 0,
            'Bob_action': 0
        } for i in self.player_list}
        self.rew, self.done, info = {}, {}, {}
        reward = self.cal_rewards(action_dict)
        for i in self.player_list:
            self.obs[i]['Alice_action'] = action_dict['Alice']
            self.obs[i]['Bob_action'] = action_dict['Bob']
            # Wrap the scalar rewards in arrays so they match the Box observation space.
            self.obs[i]['Alice_utilities'] = np.array([reward['Alice']])
            self.obs[i]['Bob_utilities'] = np.array([reward['Bob']])
            self.done[i], info[i] = True, {}

        self.rew = reward
        self.done["__all__"] = True
        # print("Alice: ", action_dict['Alice'])
        # print("Bob: ", action_dict['Bob'])
        return self.obs, self.rew, self.done, info


def env_creator(_):
    return rpsEnv()


single_env = rpsEnv()
env_name = "rpsEnv"
register_env(env_name, env_creator)
policy_graphs = {}
for i in single_env.player_list:
    policy_graphs[i] = (None, single_env.obs_space_dict[i],
                        single_env.action_space_dict[i], {})


def policy_mapping_fn(agent_id):
    return agent_id


config = {
    "log_level": "WARN",
    "num_workers": 1,
    "num_cpus_for_driver": 1,
    "num_cpus_per_worker": 1,
    "lr": 5e-3,
    "model": {"fcnet_hiddens": [8, 8]},
    "multiagent": {
        "policies": policy_graphs,
        "policy_mapping_fn": policy_mapping_fn,
    },
    "env": "rpsEnv"
}

exp_name = 'double_win'
exp_dict = {
    'name': exp_name,
    'run_or_experiment': 'PG',
    "stop": {
        "training_iteration": 500
    },
    'checkpoint_freq': 20,
    "config": config,
}

ray.init()
tune.run(**exp_dict)

  1. Try different weight initializations for the policy networks.

  2. Correct me if I’m wrong, but it looks like both Alice and Bob have access to each other’s state. I think they should not be able to see each other’s state.
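
For instance, one way to apply suggestion 2 would be to give each player an observation containing only its own utility and its own previous action; a rough, untested sketch of what the per-agent observation space might look like (the key names here are just illustrative):

import numpy as np
import gym

def make_private_obs_space():
    # Each player only observes its own reward and its own last action,
    # not the opponent's.
    return gym.spaces.Dict({
        'my_utility': gym.spaces.Box(low=np.array([-50.]), high=np.array([50.])),
        'my_last_action': gym.spaces.Discrete(3),
    })

obs_space_dict = {player: make_private_obs_space() for player in ['Alice', 'Bob']}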