# Ray 2.x RLlib/Tune imports required by the snippet below.
from ray import air, tune
from ray.rllib.algorithms.r2d2 import R2D2Config

# Policy selection method: maps each deputy agent to its own policy.
# RLlib 2.x calls the mapping fn as (agent_id, episode, worker, **kwargs).
def select_policy(agent_id, episode=None, worker=None, **kwargs):
    if agent_id == "deputy_0":
        return "policy_0"
    elif agent_id == "deputy_1":
        return "policy_1"
    elif agent_id == "deputy_2":
        return "policy_2"
    raise ValueError(f"No policy mapped for agent_id: {agent_id}")
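# Quick sanity check of the mapping (hypothetical call; RLlib supplies the agent id at runtime):
assert select_policy("deputy_1") == "policy_1"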
# Training configs
exper_params = {"bufferCap": 100000, "burn_in": 20,
                "learn_rate": tune.grid_search([0.00005, 0.0001, 0.0005, 0.001]),
                "batch_size": tune.grid_search([128, 256, 512, 1028]),
                "discount_rate": tune.grid_search([0.75, 0.85, 0.95, 0.99]),
                "num_workers": 15, "policy_fcnet_hiddens": [64, 64],
                "lstm_cell_size": tune.grid_search([32, 64, 128, 256]),
                "policy_max_seq_length": 20, "timesteps_trained": 75000}
# Chief object and viewpoint params
chief_params = {"Point cloud": infoEnv.chief_object.ptCldName, "Number of points": infoEnv.chief_object.numPoints,
                "Diam of bounding box": infoEnv.chief_object.diam, "Projection diam": infoEnv.chief_object.sphereRadius,
                "Viewpoint dist from origin": infoEnv.chief_object.viewScale}
# Environment params
env_params = {"Rotation Mode": infoEnv.env_config["RotationMode"], "Number of viewpoints": infoEnv.num_inspection_points,
              "Agent Field of View": infoEnv.FOV, "Angular velocity scaling": infoEnv.SF, "POI reward": infoEnv.POI_reward,
              "Fuel penalty": infoEnv.fuel_penalty, "Inspection threshold": infoEnv.info_thresh,
              "Reward translation": infoEnv.reward_translation}
# Configs
config = R2D2Config()
config.environment(env="HLInfoInspEnv", env_config=env_config)
# Update the replay buffer settings in place (dict.update() mutates and returns None, so its result is not assigned).
config.replay_buffer_config.update({"capacity": exper_params["bufferCap"], "replay_burn_in": exper_params["burn_in"]})
# Core training hyperparameters, consolidated into a single training() call.
config.training(lr=exper_params["learn_rate"], train_batch_size=exper_params["batch_size"], gamma=exper_params["discount_rate"])
config.rollouts(num_rollout_workers=exper_params["num_workers"])
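# Alternative sketch (assumes the Ray 2.x DQN/R2D2 config API, where training() also
# accepts a partial replay_buffer_config that is merged into the defaults):
# config.training(replay_buffer_config={"capacity": exper_params["bufferCap"],
#                                       "replay_burn_in": exper_params["burn_in"]})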
policy_map = {
    "policy_0": (
        None, obs_space_high, act_space_high,
        {"model": {"fcnet_hiddens": exper_params["policy_fcnet_hiddens"], "fcnet_activation": "tanh",
                   "use_lstm": True,
                   "lstm_cell_size": exper_params["lstm_cell_size"],
                   "max_seq_len": exper_params["policy_max_seq_length"]}}),
    "policy_1": (
        None, obs_space_high, act_space_high,
        {"model": {"fcnet_hiddens": exper_params["policy_fcnet_hiddens"], "fcnet_activation": "tanh",
                   "use_lstm": True,
                   "lstm_cell_size": exper_params["lstm_cell_size"],
                   "max_seq_len": exper_params["policy_max_seq_length"]}}),
    "policy_2": (
        None, obs_space_high, act_space_high,
        {"model": {"fcnet_hiddens": exper_params["policy_fcnet_hiddens"], "fcnet_activation": "tanh",
                   "use_lstm": True,
                   "lstm_cell_size": exper_params["lstm_cell_size"],
                   "max_seq_len": exper_params["policy_max_seq_length"]}}),
}
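# The three specs above are identical; a more compact, equivalent construction is sketched
# below (commented out so the explicit version above stays in effect). The shared_model
# name is illustrative only; dict() copies avoid sharing one mutable model config.
# shared_model = {"fcnet_hiddens": exper_params["policy_fcnet_hiddens"], "fcnet_activation": "tanh",
#                 "use_lstm": True, "lstm_cell_size": exper_params["lstm_cell_size"],
#                 "max_seq_len": exper_params["policy_max_seq_length"]}
# policy_map = {f"policy_{i}": (None, obs_space_high, act_space_high, {"model": dict(shared_model)})
#               for i in range(3)}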
config.multi_agent(policies=policy_map, policy_mapping_fn=select_policy)
# Run configs
stop_dict = {'timesteps_total': exper_params["timesteps_trained"]}
# Train - saves experiment to an output folder.
tuner = tune.Tuner(
    "R2D2",
    run_config=air.RunConfig(
        name="experiment_output",
        stop=stop_dict,
        local_dir=output_dir,
        sync_config=sync_config,
        checkpoint_config=air.CheckpointConfig(
            checkpoint_score_attribute="episode_reward_mean",
            checkpoint_frequency=1,
            num_to_keep=2,
        ),
    ),
    param_space=config.to_dict(),
)
results = tuner.fit()
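# The returned ResultGrid can be queried for the best trial; a short sketch using the same
# metric as the CheckpointConfig above:
best_result = results.get_best_result(metric="episode_reward_mean", mode="max")
print(best_result.config)      # hyperparameter combination of the best-performing trial
print(best_result.checkpoint)  # checkpoint associated with that trial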