Am I performing Independent Multi-Agent Learning? (Ray Tune)

How severe does this issue affect your experience of using Ray?

  • Medium: It contributes to significant difficulty to complete my task, but I can work around it.

Do I need to create a custom logger to see both RL agents that I am training, or am I not setting up the training correctly? I am under the impression that I am training two independent-learning PPO agents. Would someone please let me know? (Further down I sketch the kind of custom logging I have in mind.)

Here is my training script:

import argparse
import logging
import os
from datetime import date

import numpy as np
import ray
from gymnasium.spaces import Box
from ray import air, tune
from ray.rllib.algorithms.ppo import PPO, PPOConfig
from ray.rllib.core.rl_module.marl_module import MultiAgentRLModuleSpec
from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec
from ray.rllib.utils.test_utils import check_learning_achieved
from ray.tune.registry import register_env

# CustomMultiAgentEnv is my own MultiAgentEnv subclass, defined in a separate module.

logger = logging.getLogger(__name__)

parser = argparse.ArgumentParser()
parser.add_argument(
    "--framework",
    choices=["tf", "tf2", "torch"],
    default="torch",
    help="The DL framework specifier.",
)
parser.add_argument(
    "--run-as-test",
    action="store_true",
    help="Whether this script should be run as a test: --stop-reward must "
    "be achieved within --stop-timesteps.",
)

parser.add_argument(
    "--stop-timesteps", type=int, default=20000, help="Number of timesteps to train."
)
parser.add_argument(
    "--stop-reward", type=float, default=0, help="Reward at which we stop training."
)
parser.add_argument(
    "--enable-new-api-stack",
    action="store_true",
    help="Use RLlib's new API stack.",
)
parser.add_argument(
    "--num-agents", type=int, default=2, help="Number of agents in the environment."
)
parser.add_argument(
    "--num-policies", type=int, default=2, help="Number of independent policies to train."
)

if __name__ == "__main__":
    args = parser.parse_args()

    ray.init(log_to_driver=True)

    num_agents = args.num_agents
    observation_size = 3 + 4 * num_agents
    action_space = Box(low=np.array([0, -1]), high=np.array([1, 1]))
    observation_space = Box(low=-np.inf, high=np.inf, shape=(observation_size,))

    # The env-creator lambda must return an env *instance*, not the class itself.
    # (The config below passes the class directly to .environment(), so this
    # registered name is not actually used.)
    register_env("REMUS RAY", lambda config: CustomMultiAgentEnv(config))

    obs_space = observation_space
    act_space = action_space
    # Policy IDs. The policy_mapping_fn has to map the env's agent IDs onto these,
    # so they must line up with the agent IDs my env emits (see the env sketch
    # after this script).
    policies = {f"agent_{i}" for i in range(num_agents)}

    config = (
       PPOConfig()
            .framework("torch")
            .rollouts(num_rollout_workers=6, create_env_on_local_worker=True)
            .debugging(seed=0, log_level="ERROR")
            .training(model={"fcnet_hiddens" : [64, 64, 64, 64, 64, 64]})
            .environment(env=CustomMultiAgentEnv)
            .multi_agent(
                policies=policies,
                policy_mapping_fn=(lambda aid, *args, **kwargs: aid),
                policies_to_train=policies,
                #policy_mapping_fn=lambda agent_id, episode, worker, **kwargs: "policy1" if agent_id == "agent_0" else "policy2"
            )
            .rl_module(
                rl_module_spec=MultiAgentRLModuleSpec(
                    module_specs={p: SingleAgentRLModuleSpec() for p in policies},
                ),
            )
            .experimental(_enable_new_api_stack=True)
            .resources()
            
    )

    stop = {
        "episode_reward_mean": args.stop_reward,
        "timesteps_total": args.stop_timesteps,
    }
    # Get the path to the desktop
    desktop_path = os.path.join(os.environ['USERPROFILE'], 'Desktop')

    # Create a path to the RayResults folder on the desktop
    ray_results_path = os.path.join(desktop_path, 'RayResults')
    results = tune.Tuner(
        PPO,
        run_config=air.RunConfig(
            storage_path=ray_results_path,
            name=f"Ray_{date.today()}",
            stop=stop,
            verbose=1,
        ),
        param_space=config,
    ).fit()

    if args.run_as_test:
        check_learning_achieved(results, args.stop_reward)

    ray.shutdown()
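
For reference, here is a stripped-down sketch of the shape my CustomMultiAgentEnv follows (the real env is more involved, so treat this as an illustration only). The point is that the per-agent dicts are keyed by "agent_0" / "agent_1", which are exactly the agent IDs the policy_mapping_fn receives:

import numpy as np
from gymnasium.spaces import Box
from ray.rllib.env.multi_agent_env import MultiAgentEnv


class CustomMultiAgentEnv(MultiAgentEnv):
    """Illustrative stand-in for my real env: two agents, dicts keyed by agent ID."""

    def __init__(self, config=None):
        super().__init__()
        self._agent_ids = {"agent_0", "agent_1"}
        # 3 + 4 * num_agents = 11 observation features per agent.
        self.observation_space = Box(low=-np.inf, high=np.inf, shape=(11,))
        self.action_space = Box(low=np.array([0.0, -1.0]), high=np.array([1.0, 1.0]))

    def reset(self, *, seed=None, options=None):
        obs = {aid: self.observation_space.sample() for aid in self._agent_ids}
        return obs, {}

    def step(self, action_dict):
        obs = {aid: self.observation_space.sample() for aid in self._agent_ids}
        rewards = {aid: 0.0 for aid in self._agent_ids}
        terminateds = {"__all__": False}
        truncateds = {"__all__": False}
        return obs, rewards, terminateds, truncateds, {}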

My output looks as follows (only a snippet; I can provide the full output if needed):


Trial status: 1 RUNNING
Current time: 2024-04-12 14:41:32. Total running time: 1min 0s
Logical resource usage: 7.0/16 CPUs, 0/1 GPUs (0.0/1.0 accelerator_type:T2000)
╭────────────────────────────────────────────────╮
│ Trial name                            status   │
├────────────────────────────────────────────────┤
│ PPO_CustomMultiAgentEnv_24306_00000   RUNNING  │
╰────────────────────────────────────────────────╯
Trial status: 1 RUNNING
Current time: 2024-04-12 14:42:02. Total running time: 1min 30s
Logical resource usage: 7.0/16 CPUs, 0/1 GPUs (0.0/1.0 accelerator_type:T2000)
╭────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
│ Trial name                            status       iter     total time (s)     ts     reward     episode_reward_max     episode_reward_min     episode_len_mean     episodes_this_iter │
├────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤
│ PPO_CustomMultiAgentEnv_24306_00000   RUNNING         1            41.0765   4000   -2941.78               -764.081               -3530.44              447.714                      7 │
╰────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯

Trial PPO_CustomMultiAgentEnv_24306_00000 completed after 2 iterations at 2024-04-12 14:42:30. Total running time: 1min 58s
╭───────────────────────────────────────────────────────────────╮
│ Trial PPO_CustomMultiAgentEnv_24306_00000 result              │
├───────────────────────────────────────────────────────────────┤
│ episodes_total                                             13 │
│ num_env_steps_sampled                                    8000 │
│ num_env_steps_trained                                       0 │
│ sampler_results/episode_len_mean                      471.846 │
│ sampler_results/episode_reward_mean                  -3090.05 │
╰───────────────────────────────────────────────────────────────╯
2024-04-12 14:42:30,777 INFO tune.py:1016 -- Wrote the latest version of all result files and experiment state to 'C:/Users/ljc5616/Desktop/RayResults/Ray_2024-04-12' in 0.0419s.

Trial status: 1 TERMINATED
Current time: 2024-04-12 14:42:30. Total running time: 1min 58s
Logical resource usage: 7.0/16 CPUs, 0/1 GPUs (0.0/1.0 accelerator_type:T2000)
╭──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
│ Trial name                            status         iter     total time (s)     ts     reward     episode_reward_max     episode_reward_min     episode_len_mean     episodes_this_iter │
├──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤
│ PPO_CustomMultiAgentEnv_24306_00000   TERMINATED        2            78.6452   8000   -3090.05               -764.081               -3537.11              471.846                      6 │
╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯

The policy_mapping_fn must return the policy name based on the agent ID. Rather than using a lambda, it may be easier to debug by creating a regular function:

def get_policy_name(agent: str) -> str:
    return f"agent_{agent}"

I added that, and it still doesn't seem to be outputting episode information for each agent.

Inside the Trial PPO_CustomMultiAgentEnv_2d233_00000 config output, it does list both policies:

│ policies/agent_0                                     ...None, None, None) │
│ policies/agent_1                                     ...None, None, None) │
│ policies_to_train                                    ...nt_0', 'agent_1'] │
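
What I am hoping to find is per-policy episode information somewhere in the results, not just the combined reward. Here is a sketch of where I would expect it to show up, based on the old API stack's result layout (the policy_reward_mean and info/learner keys are my assumption; I have not confirmed they are populated with _enable_new_api_stack=True):

best = results.get_best_result(metric="episode_reward_mean", mode="max")
metrics = best.metrics

# Per-policy episode rewards (old-stack result key; may not be populated here).
print(metrics.get("policy_reward_mean"))  # hoping for {"agent_0": ..., "agent_1": ...}

# Per-policy learner stats, keyed by policy ID.
print(metrics.get("info", {}).get("learner", {}).keys())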

The code with the new function is below (imports are the same as above, plus PolicySpec from ray.rllib.policy.policy):

def get_policy_name(agent_id: str, *args, **kwargs) -> str:
    """Map an agent ID to a policy name, ignoring any additional arguments.

    Args:
        agent_id: The ID of the agent.
        *args: Additional positional arguments; accepted but unused.
        **kwargs: Additional keyword arguments; accepted but unused.

    Returns:
        The policy name corresponding to the given agent ID.
    """
    if not agent_id.startswith("agent_"):
        agent_id = f"agent_{agent_id}"

    return agent_id


logger = logging.getLogger(__name__)

parser = argparse.ArgumentParser()
parser.add_argument(
    "--framework",
    choices=["tf", "tf2", "torch"],
    default="torch",
    help="The DL framework specifier.",
)
parser.add_argument(
    "--run-as-test",
    action="store_true",
    help="Whether this script should be run as a test: --stop-reward must "
    "be achieved within --stop-timesteps.",
)

parser.add_argument(
    "--stop-timesteps", type=int, default=5000, help="Number of timesteps to train."
)
parser.add_argument(
    "--stop-reward", type=float, default=0, help="Reward at which we stop training."
)
parser.add_argument(
    "--enable-new-api-stack",
    action="store_true",
    help="Use RLlib's new API stack.",
)
parser.add_argument(
    "--num-agents", type=int, default=2, help="Number of agents in the environment."
)
parser.add_argument(
    "--num-policies", type=int, default=2, help="Number of independent policies to train."
)

if __name__ == "__main__":
    args = parser.parse_args()

    ray.init()

    num_agents = args.num_agents
    observation_size = 3 + 4 * num_agents
    action_space = Box(low=np.array([0, -1]), high=np.array([1, 1]))
    observation_space = Box(low=-np.inf, high=np.inf, shape=(observation_size,))

    # As above, the env-creator lambda must return an env *instance*.
    register_env("RAY", lambda config: CustomMultiAgentEnv(config))

    obs_space = observation_space
    act_space = action_space
    # Map each policy name to its (default) PolicySpec.
    policies = {f"agent_{i}": PolicySpec() for i in range(num_agents)}
    config = (
       PPOConfig()
            .framework("torch")
            .rollouts(num_rollout_workers=6, create_env_on_local_worker=True)
            .debugging(seed=0, log_level="ERROR")
            .training(model={"fcnet_hiddens" : [64, 64, 64, 64, 64, 64]})
            .environment(env=CustomMultiAgentEnv)
            .multi_agent(
                policies=policies,
                policy_mapping_fn=get_policy_name,
                policies_to_train=list(policies.keys()),
                #policy_mapping_fn=lambda agent_id, episode, worker, **kwargs: "policy1" if agent_id == "agent_0" else "policy2"
            )
            .rl_module(
                rl_module_spec=MultiAgentRLModuleSpec(
                    module_specs={p: SingleAgentRLModuleSpec() for p in policies},
                ),
            )
            .experimental(_enable_new_api_stack=True)
            .resources()
            
    )

    stop = {
        "episode_reward_mean": args.stop_reward,
        "timesteps_total": args.stop_timesteps,
    }
    # Get the path to the desktop
    desktop_path = os.path.join(os.environ['USERPROFILE'], 'Desktop')

    # Create a path to the RayResults folder on the desktop
    ray_results_path = os.path.join(desktop_path, 'RayResults')
    results = tune.Tuner(
        PPO,
        run_config=air.RunConfig(
            storage_path=ray_results_path,
            name=f"Ray_{date.today()}",
            stop=stop,
            verbose=1,
        ),
        param_space=config,
    ).fit()

    if args.run_as_test:
        check_learning_achieved(results, args.stop_reward)

    ray.shutdown()
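
To be concrete about what I meant by a "custom logger": something like the callback below, which would push each agent's episode reward into custom_metrics. This is only a sketch built on the old-stack DefaultCallbacks API; I have not verified that it is invoked when _enable_new_api_stack=True, which is partly why I am asking whether my setup is correct in the first place.

from ray.rllib.algorithms.callbacks import DefaultCallbacks


class PerAgentRewardCallback(DefaultCallbacks):
    """Sketch: record each agent's episode reward as a custom metric."""

    def on_episode_end(self, *, worker, base_env, policies, episode, env_index, **kwargs):
        # episode.agent_rewards maps (agent_id, policy_id) -> accumulated episode reward.
        for (agent_id, policy_id), rew in episode.agent_rewards.items():
            episode.custom_metrics[f"{agent_id}_episode_reward"] = rew


# Would be wired in via the config, e.g.:
# config = config.callbacks(PerAgentRewardCallback)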