Hi again!
Here is a brief description of the relevant parts of my project.
I am running a multi-agent reinforcement learning environment using the Ray/RLlib library (version 0.8.6) for an extension of the AI-Economist framework. Here is the relevant code:
import numpy as np

import ray
from ray.rllib.agents.ppo import PPOTrainer
from ray.rllib.models.catalog import ModelCatalog

from rllib.env_wrapper import RLlibEnvWrapper
from rllib.tf_models import KerasConvLSTM

# Build the environment object and register the custom model.
env_obj = RLlibEnvWrapper({"env_config_dict": env_config_dict}, verbose=True)
ModelCatalog.register_custom_model(KerasConvLSTM.custom_name, KerasConvLSTM)
policies = {
    "a": (
        None,  # uses the default policy class
        env_obj.observation_space,
        env_obj.action_space,
        # Custom agent policy configuration:
        {
            "clip_param": 0.3,
            "entropy_coeff": 0.025,
            "entropy_coeff_schedule": None,
            "gamma": 0.998,
            "grad_clip": 10.0,
            "kl_coeff": 0.0,
            "kl_target": 0.01,
            "lambda": 0.98,
            "lr": 0.0003,
            "lr_schedule": None,
            "model": {
                "custom_model": "keras_conv_lstm",
                "custom_model_config": {
                    "fc_dim": 128,
                    "idx_emb_dim": 4,
                    "input_emb_vocab": 100,
                    "lstm_cell_size": 128,
                    "num_conv": 2,
                    "num_fc": 2,
                },
                "max_seq_len": 25,
            },
            "use_gae": True,
            "vf_clip_param": 50.0,
            "vf_loss_coeff": 0.05,
            "vf_share_layers": False,
        },
    ),
    "p": (
        None,  # uses the default policy class
        env_obj.observation_space_pl,
        env_obj.action_space_pl,
        # Custom planner policy configuration:
        {
            "clip_param": 0.3,
            "entropy_coeff": 0.125,
            "entropy_coeff_schedule": [[0, 2.0], [50000000, 0.125]],
            "gamma": 0.998,
            "grad_clip": 10.0,
            "kl_coeff": 0.0,
            "kl_target": 0.01,
            "lambda": 0.98,
            "lr": 0.0001,
            "lr_schedule": None,
            "model": {
                "custom_model": "keras_conv_lstm",
                "custom_model_config": {
                    "fc_dim": 256,
                    "idx_emb_dim": 4,
                    "input_emb_vocab": 100,
                    "lstm_cell_size": 256,
                    "num_conv": 2,
                    "num_fc": 2,
                },
                "max_seq_len": 25,
            },
            "use_gae": True,
            "vf_clip_param": 50.0,
            "vf_loss_coeff": 0.05,
            "vf_share_layers": False,
        },
    ),
}
# Numeric agent ids map to the agent policy "a"; the planner id maps to the planner policy "p".
policy_mapping_fun = lambda i: "a" if str(i).isdigit() else "p"
policies_to_train = ["a", "p"]
trainer_config = {
    "multiagent": {
        "policies": policies,
        "policies_to_train": policies_to_train,
        "policy_mapping_fn": policy_mapping_fun,
    }
}
trainer_config.update(
    {
        "num_workers": 6,
        "num_envs_per_worker": 1,
        # Other training parameters
        "train_batch_size": 4000,
        "sgd_minibatch_size": 4000,
        "num_sgd_iter": 1,
        # Write the collected sample batches to JSON files in this directory.
        "output": "D:\\ENI Projects\\Aslan\\AutocurriculaLab\\Githubs\\modified-ai-economist-main\\Results",
    }
)
# We also pass "num_envs_per_worker" to the environment wrapper so it can index its environments.
env_config = {
    "env_config_dict": env_config_dict,
    "num_envs_per_worker": trainer_config.get("num_envs_per_worker"),
}
trainer_config.update({"env_config": env_config})
ray.init()
trainer = PPOTrainer(env=RLlibEnvWrapper, config=trainer_config)

NUM_ITERS = 20
episode_reward_mean = np.zeros(NUM_ITERS)
for iteration in range(NUM_ITERS):
    print(f"********** Iter : {iteration} **********")
    result = trainer.train()
    episode_reward_mean[iteration] = result.get("episode_reward_mean")
    print(f"episode_reward_mean: {episode_reward_mean[iteration]}")
After running this code with 5 + 1 agents and an episode length of 2000, 240 output files are generated with names of the following form:
output-date_time_worker_i_j
The index "i" runs from 1 to 6, and the index "j" runs from 0 to 39.
I assume the 6 corresponds to the number of workers, and the 40 comes from the number of iterations, 20, multiplied by 2.
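As a quick sanity check of that assumption, I group the output files by worker index. This is only a rough sketch: the regex is just my guess at the file-name pattern, and the directory is the "output" path from my config.

import glob
import os
import re
from collections import Counter

output_dir = "D:\\ENI Projects\\Aslan\\AutocurriculaLab\\Githubs\\modified-ai-economist-main\\Results"

# Count how many files each rollout worker has written.
worker_counts = Counter()
for path in glob.glob(os.path.join(output_dir, "output-*")):
    match = re.search(r"worker[-_](\d+)[-_](\d+)", os.path.basename(path))
    if match:
        worker_counts[int(match.group(1))] += 1

print(worker_counts)  # I expect 6 workers with 40 files each.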
In each of these files there are 8 sets of state_out_a values, where "a" runs from 0 to 3. I think these 4 state values correspond to the idx_emb_dim defined in my LSTM. Moreover, each of these values has shape (1000, 128) or (1000, 256); again, I think the 128 and 256 correspond to the lstm_cell_size defined in my LSTM. However, I have no clue where the 1000 comes from. Basically, my question (maybe a naïve one!) is whether it is possible for an agent to define one state, or a set of states, at each time step of an episode.
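For reference, this is roughly how I inspect those shapes, reading one of the files back with RLlib's JsonReader. It is only a sketch: the directory is the "output" path from my config, and I am assuming the files deserialize into MultiAgentBatch objects with one SampleBatch per policy.

import glob
import os

from ray.rllib.offline.json_reader import JsonReader
from ray.rllib.policy.sample_batch import MultiAgentBatch

output_dir = "D:\\ENI Projects\\Aslan\\AutocurriculaLab\\Githubs\\modified-ai-economist-main\\Results"
sample_file = sorted(glob.glob(os.path.join(output_dir, "output-*")))[0]

reader = JsonReader(sample_file)
batch = reader.next()

# Print the shape of every recurrent-state output column in the batch.
if isinstance(batch, MultiAgentBatch):
    for policy_id, policy_batch in batch.policy_batches.items():
        for key in policy_batch.keys():
            if key.startswith("state_out"):
                print(policy_id, key, policy_batch[key].shape)
else:
    for key in batch.keys():
        if key.startswith("state_out"):
            print(key, batch[key].shape)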
Many many thanks in advance and sorry for so many questions!