How severely does this issue affect your experience of using Ray?
- Medium: It contributes to significant difficulty in completing my task, but I can work around it.
Do I need to create a custom logger to see both of the RL agents I am training, or am I not setting up the training correctly? I am under the impression that I am training two independent PPO agents… Could someone please let me know?
Here is my training script:
import argparse
import logging
import os
from datetime import date

import numpy as np
import ray
from gymnasium.spaces import Box
from ray import air, tune
from ray.rllib.algorithms.ppo import PPO, PPOConfig
from ray.rllib.core.rl_module.marl_module import MultiAgentRLModuleSpec
from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec
from ray.rllib.utils.test_utils import check_learning_achieved
from ray.tune.registry import register_env

# (import of my own CustomMultiAgentEnv omitted)

logger = logging.getLogger(__name__)

parser = argparse.ArgumentParser()
parser.add_argument(
    "--framework",
    choices=["tf", "tf2", "torch"],
    default="torch",
    help="The DL framework specifier.",
)
parser.add_argument(
    "--run-as-test",
    action="store_true",
    help="Whether this script should be run as a test: --stop-reward must "
    "be achieved within --stop-timesteps.",
)
parser.add_argument(
    "--stop-timesteps", type=int, default=20000, help="Number of timesteps to train."
)
parser.add_argument(
    "--stop-reward", type=float, default=0, help="Reward at which we stop training."
)
parser.add_argument(
    "--enable-new-api-stack", action="store_true", help="Use the new RLlib API stack."
)
parser.add_argument(
    "--num-agents", type=int, default=2, help="Number of agents in the env."
)
parser.add_argument(
    "--num-policies", type=int, default=2, help="Number of policies to train."
)
if __name__ == "__main__":
    args = parser.parse_args()
    ray.init(log_to_driver=True)
    num_agents = args.num_agents  # respect --num-agents instead of hard-coding 2
    observation_size = 3 + 4 * num_agents
    action_space = Box(low=np.array([0, -1]), high=np.array([1, 1]))
    observation_space = Box(low=-np.inf, high=np.inf, shape=(observation_size,))
    # The registered lambda must return an env instance, not the class itself.
    register_env("REMUS RAY", lambda config: CustomMultiAgentEnv(config))
    obs_space = observation_space
    act_space = action_space
    policies = {f"agent_{i}" for i in range(num_agents)}  # Must these policy IDs match the env's agent IDs? I think so...
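    # Note: the policy IDs themselves do not have to equal the env's agent IDs;
    # they only need to be linked through policy_mapping_fn below. Naming them
    # identically is what lets the identity lambda work.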
    config = (
        PPOConfig()
        .framework("torch")
        .rollouts(num_rollout_workers=6, create_env_on_local_worker=True)
        .debugging(seed=0, log_level="ERROR")
        .training(model={"fcnet_hiddens": [64, 64, 64, 64, 64, 64]})
        .environment(env=CustomMultiAgentEnv)
        .multi_agent(
            policies=policies,
            # Identity mapping: env agent ID "agent_i" maps to policy ID "agent_i".
            policy_mapping_fn=(lambda aid, *args, **kwargs: aid),
            policies_to_train=policies,
            # policy_mapping_fn=lambda agent_id, episode, worker, **kwargs: "policy1" if agent_id == "agent_0" else "policy2"
        )
        .rl_module(
            rl_module_spec=MultiAgentRLModuleSpec(
                module_specs={p: SingleAgentRLModuleSpec() for p in policies},
            ),
        )
        .experimental(_enable_new_api_stack=True)
        .resources()
    )
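    # Since `policies` is just a set of IDs, RLlib infers each policy's
    # observation/action space from the env, so the obs_space/act_space
    # aliases above are never actually passed anywhere.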
    stop = {
        "episode_reward_mean": args.stop_reward,
        "timesteps_total": args.stop_timesteps,
    }
    # Get the path to the Desktop.
    desktop_path = os.path.join(os.environ["USERPROFILE"], "Desktop")
    # Create a path to the RayResults folder on the Desktop.
    ray_results_path = os.path.join(desktop_path, "RayResults")
    results = tune.Tuner(
        PPO,
        run_config=air.RunConfig(
            storage_path=ray_results_path,
            name=f"Ray_{date.today()}",
            stop=stop,
            verbose=1,
        ),
        param_space=config,
    ).fit()
    if args.run_as_test:
        check_learning_achieved(results, args.stop_reward)
    ray.shutdown()
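For what it's worth, this is how I expected to read the per-policy rewards back afterwards. A minimal sketch, assuming the policy IDs agent_0/agent_1 from above and that the run reports the classic "policy_reward_mean" entry (keyed by policy ID) in its result dict:

# Sketch: inspect per-policy rewards from the ResultGrid returned by fit().
result = results[0]  # single trial, so take the first Result
per_policy = result.metrics.get("policy_reward_mean", {})
for policy_id, reward in per_policy.items():
    print(f"{policy_id}: mean episode reward {reward:.2f}")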
My output looks as follows (only a snippet; I can provide the full output if needed):
Trial status: 1 RUNNING
Current time: 2024-04-12 14:41:32. Total running time: 1min 0s
Logical resource usage: 7.0/16 CPUs, 0/1 GPUs (0.0/1.0 accelerator_type:T2000)
╭────────────────────────────────────────────────╮
│ Trial name status │
├────────────────────────────────────────────────┤
│ PPO_CustomMultiAgentEnv_24306_00000 RUNNING │
╰────────────────────────────────────────────────╯
Trial status: 1 RUNNING
Current time: 2024-04-12 14:42:02. Total running time: 1min 30s
Logical resource usage: 7.0/16 CPUs, 0/1 GPUs (0.0/1.0 accelerator_type:T2000)
╭────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
│ Trial name status iter total time (s) ts reward episode_reward_max episode_reward_min episode_len_mean episodes_this_iter │
├────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤
│ PPO_CustomMultiAgentEnv_24306_00000 RUNNING 1 41.0765 4000 -2941.78 -764.081 -3530.44 447.714 7 │
╰────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
Trial PPO_CustomMultiAgentEnv_24306_00000 completed after 2 iterations at 2024-04-12 14:42:30. Total running time: 1min 58s
╭───────────────────────────────────────────────────────────────╮
│ Trial PPO_CustomMultiAgentEnv_24306_00000 result │
├───────────────────────────────────────────────────────────────┤
│ episodes_total 13 │
│ num_env_steps_sampled 8000 │
│ num_env_steps_trained 0 │
│ sampler_results/episode_len_mean 471.846 │
│ sampler_results/episode_reward_mean -3090.05 │
╰───────────────────────────────────────────────────────────────╯
2024-04-12 14:42:30,777 INFO tune.py:1016 -- Wrote the latest version of all result files and experiment state to 'C:/Users/ljc5616/Desktop/RayResults/Ray_2024-04-12' in 0.0419s.
Trial status: 1 TERMINATED
Current time: 2024-04-12 14:42:30. Total running time: 1min 58s
Logical resource usage: 7.0/16 CPUs, 0/1 GPUs (0.0/1.0 accelerator_type:T2000)
╭──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
│ Trial name status iter total time (s) ts reward episode_reward_max episode_reward_min episode_len_mean episodes_this_iter │
├──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤
│ PPO_CustomMultiAgentEnv_24306_00000 TERMINATED 2 78.6452 8000 -3090.05 -764.081 -3537.11 471.846 6 │
╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
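And in case it is relevant: a sketch of how I believe both agents' rewards could be surfaced as extra columns in this trial table via a custom CLIReporter. The flattened "policy_reward_mean/<policy_id>" key names are my assumption about how Tune flattens nested metrics:

from ray.tune import CLIReporter

# Sketch (assumption): Tune flattens nested result dicts with "/" separators,
# so per-policy rewards should be addressable as "policy_reward_mean/<id>".
reporter = CLIReporter(
    metric_columns={
        "training_iteration": "iter",
        "episode_reward_mean": "reward",
        "policy_reward_mean/agent_0": "agent_0 reward",
        "policy_reward_mean/agent_1": "agent_1 reward",
    }
)
# Then pass it to the run config above:
# air.RunConfig(..., progress_reporter=reporter)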