Multi-agent rock-paper-scissors training gets weird results

Hello,

I am implementing a two-player (Alice and Bob) rock-paper-scissors environment using the MultiAgentEnv class.
The observation space contains both players’ utilities (rewards) and both players’ actions.
The action space contains 0 (rock), 1 (paper), 2 (scissors).
Reward for each player: winner +20, loser -20.
If it’s a tie, in one case (a) both players get +10; in the other case (b) both players get 0 or -10.
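
For reference, the intended payoff rule can be sketched as a small helper (this is just an illustration of the rewards described above, not the actual environment code; TIE_REWARD is a placeholder for whichever tie rule, (a) or (b), is being tested):

TIE_REWARD = 10  # case (a); use 0 (or -10) for case (b)

def payoff(alice_action, bob_action):
    # Actions: 0 = rock, 1 = paper, 2 = scissors.
    # Returns (alice_reward, bob_reward) for one round.
    if alice_action == bob_action:
        return TIE_REWARD, TIE_REWARD
    # (winner - loser) % 3 == 1 in rock-paper-scissors, e.g. paper (1) beats rock (0).
    if (alice_action - bob_action) % 3 == 1:
        return 20, -20
    return -20, 20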

After 500 iterations:
Case a) converges to Alice and Bob always playing the same action,
which maximizes the total reward.
Case b) converges to Alice always showing scissors and Bob always showing rock, which maximizes only Alice’s reward while ignoring Bob’s.

My understanding is that the result should not converge to one specific action; each action should be played with 1/3 probability. I am wondering where this error comes from?
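
As a quick sanity check of that expectation (a rough sketch using the reward values above, not part of the training code), the expected per-round payoff under uniform 1/3 play can be computed directly; with +10 ties (case a) it is about 3.33 while always matching the opponent yields +10 every round, whereas with 0-reward ties (case b) uniform play gives an expected 0:

import itertools

def expected_uniform_payoff(tie_reward):
    # Average Alice payoff when both players pick 0/1/2 uniformly at random.
    total = 0.0
    for a, b in itertools.product(range(3), repeat=2):
        if a == b:
            total += tie_reward
        elif (a - b) % 3 == 1:
            total += 20
        else:
            total -= 20
    return total / 9

print(expected_uniform_payoff(10))  # case (a): 3.33...
print(expected_uniform_payoff(0))   # case (b): 0.0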

Thanks!

import numpy as np
import networkx as nx
import gym
import random

from ray.rllib.env.multi_agent_env import MultiAgentEnv
import ray
from ray import tune
from ray.rllib.agents.registry import get_agent_class
from ray.rllib.models import ModelCatalog
from ray.tune import run_experiments
from ray.tune.registry import register_env
from ray.rllib.agents.ppo import PPOTrainer
from IPython import embed

# 0 Alice;1 Bob


class rpsEnv(MultiAgentEnv):
    def __init__(self, return_agent_actions=False, part=False):
        self.num_agents = 2
        self.player_list = ['Alice', 'Bob']
        self.action_space = gym.spaces.Discrete(3)
        self.action_space_dict = {}
        self.obs_space_dict = {}
        for i in self.player_list:
            self.obs_space_dict[i] = gym.spaces.Dict({
                'Alice_utilities': gym.spaces.Box(low=np.array([-50.]), high=np.array([50.])),
                'Bob_utilities': gym.spaces.Box(low=np.array([-50.]), high=np.array([50.])),
                'Alice_action': gym.spaces.Discrete(3),
                'Bob_action': gym.spaces.Discrete(3)
            })
        self.action_space_dict = {
            i: gym.spaces.Discrete(3)
            for i in self.player_list
        }
        self.obs = {i: {
            'Alice_utilities': np.array([0]),
            'Bob_utilities': np.array([0]),
            'Alice_action': 0,
            'Bob_action': 0
        } for i in self.player_list}  # Observations.
        self.rew = {i: 0. for i in self.player_list}  # Rewards.
        self.done = {i: False for i in self.player_list}
        self.done['__all__'] = False

    def reset(self):
        self.obs = {i: {
            'Alice_utilities': np.array([0]),
            'Bob_utilities': np.array([0]),
            'Alice_action': 0,
            'Bob_action': 0
        } for i in self.player_list}
        return self.obs

    def cal_rewards(self, action_dict):
        # Start both players at 0 each round; the original snippet used
        # `reward` without defining it first, which raises a NameError.
        reward = {i: 0. for i in self.player_list}
        if action_dict['Alice'] == 0:
            if action_dict['Bob'] == 2:
                reward['Alice'] += 20
                reward['Bob'] -= 20
            elif action_dict['Bob'] == 1:
                reward['Alice'] -= 20
                reward['Bob'] += 20
            else:
                # Tie: in case (b) the rewards stay at 0; case (a) would add +10 to each.
                reward['Alice'] -= 0
                reward['Bob'] -= 0
        elif action_dict['Alice'] == 1:
            if action_dict['Bob'] == 0:
                reward['Alice'] += 20
                reward['Bob'] -= 20
            elif action_dict['Bob'] == 2:
                reward['Alice'] -= 20
                reward['Bob'] += 20
            else:
                reward['Alice'] -= 0
                reward['Bob'] -= 0
        else:
            if action_dict['Bob'] == 1:
                reward['Alice'] += 20
                reward['Bob'] -= 20
            elif action_dict['Bob'] == 0:
                reward['Alice'] -= 20
                reward['Bob'] += 20
            else:
                reward['Alice'] -= 0
                reward['Bob'] -= 0
        return reward

    def step(self, action_dict):
        self.obs = {i: {
            'Alice_utilities': np.array([0]),
            'Bob_utilities': np.array([0]),
            'Alice_action': 0,
            'Bob_action': 0
        } for i in self.player_list}
        self.rew, self.done, info = {}, {}, {}
        reward = self.cal_rewards(action_dict)
        for i in self.player_list:
            self.obs[i]['Alice_action'] = action_dict['Alice']
            self.obs[i]['Bob_action'] = action_dict['Bob']
            # Wrap the scalar rewards in arrays so they match the Box observation space.
            self.obs[i]['Alice_utilities'] = np.array([reward['Alice']])
            self.obs[i]['Bob_utilities'] = np.array([reward['Bob']])
            self.done[i], info[i] = True, {}

        self.rew = reward
        self.done["__all__"] = True
        # print("Alice: ", action_dict['Alice'])
        # print("Bob: ", action_dict['Bob'])
        return self.obs, self.rew, self.done, info


def env_creator(_):
    return rpsEnv()


single_env = rpsEnv()
env_name = "rpsEnv"
register_env(env_name, env_creator)
policy_graphs = {}
for i in single_env.player_list:
    policy_graphs[i] = (None, single_env.obs_space_dict[i],
                        single_env.action_space_dict[i], {})


def policy_mapping_fn(agent_id):
    return agent_id


config = {
    "log_level": "WARN",
    "num_workers": 1,
    "num_cpus_for_driver": 1,
    "num_cpus_per_worker": 1,
    "lr": 5e-3,
    "model": {"fcnet_hiddens": [8, 8]},
    "multiagent": {
        "policies": policy_graphs,
        "policy_mapping_fn": policy_mapping_fn,
    },
    "env": "rpsEnv"
}

exp_name = 'double_win'
exp_dict = {
    'name': exp_name,
    'run_or_experiment': 'PG',
    "stop": {
        "training_iteration": 500
    },
    'checkpoint_freq': 20,
    "config": config,
}

ray.init()
tune.run(**exp_dict)

  1. Try different weight initializations for the policy networks.

  2. Correct me if I’m wrong, but it looks like both Alice and Bob have access to each other’s state. I think they should not be able to see each other’s state.
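
For instance, one way to apply suggestion 2 would be to give each player an observation containing only its own utility and its own previous action; a rough, untested sketch of what the per-agent observation space might look like (the key names here are just illustrative):

import numpy as np
import gym

def make_private_obs_space():
    # Each player only observes its own reward and its own last action,
    # not the opponent's.
    return gym.spaces.Dict({
        'my_utility': gym.spaces.Box(low=np.array([-50.]), high=np.array([50.])),
        'my_last_action': gym.spaces.Discrete(3),
    })

obs_space_dict = {player: make_private_obs_space() for player in ['Alice', 'Bob']}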