For sure! I can provide additional code if desired. One thing to note is that I have several ppoTrainers (or, depending on a flag, other trainers such as impalaTrainer; see the sketch below).
ray version: 1.2.0
So, I am also using Chris Bamford’s Griddly, just FYI.
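Roughly, the trainer-selection flag works like this. This is a simplified sketch, not my exact code; the flag name and wiring are stand-ins, but the trainer classes are the standard ray 1.2.0 ones:

from ray.rllib.agents import impala, ppo

def select_trainer_constructor(algo_flag: str):
    # 'algo_flag' is a stand-in name for the flag mentioned above.
    if algo_flag == 'ppo':
        return ppo.PPOTrainer
    if algo_flag == 'impala':
        return impala.ImpalaTrainer
    raise ValueError(f'Unknown trainer flag: {algo_flag}')

ppoTrainer = select_trainer_constructor('ppo')  # the constructor passed to optimize() below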
Config: this is almost entirely the defaults:
{'_fake_gpus': False,
'_use_trajectory_view_api': True,
'batch_mode': 'truncate_episodes',
'callbacks': <class 'ray.rllib.agents.callbacks.DefaultCallbacks'>,
'clip_actions': True,
'clip_param': 0.3,
'clip_rewards': None,
'collect_metrics_timeout': 180,
'compress_observations': False,
'create_env_on_driver': False,
'custom_eval_function': None,
'custom_resources_per_worker': {},
'eager_tracing': False,
'entropy_coeff': 0.0,
'entropy_coeff_schedule': None,
'env': 'limited_zelda_custom',
'env_config': {'environment_name': 'limited_zelda_custom',
               'global_observer_type': <ObserverType.VECTOR: 4>,
               'level_string': DYNAMICALLY_ADDED_IN,
               'max_steps': 500,
               'player_observer_type': <ObserverType.VECTOR: 4>,
               'random_level_on_reset': False,
               'record_video_config': {'directory': '.\\videos',
                                       'frequency': 10000000},
               'yaml_file': 'levels\\limited_zelda.yaml'},
'evaluation_config': {},
'evaluation_interval': None,
'evaluation_num_episodes': 10,
'evaluation_num_workers': 0,
'exploration_config': {'type': 'StochasticSampling'},
'explore': True,
'extra_python_environs_for_driver': {},
'extra_python_environs_for_worker': {},
'fake_sampler': False,
'framework': 'torch',
'gamma': 0.99,
'grad_clip': None,
'horizon': None,
'ignore_worker_failures': False,
'in_evaluation': False,
'input': 'sampler',
'input_evaluation': ['is', 'wis'],
'kl_coeff': 0.2,
'kl_target': 0.01,
'lambda': 1.0,
'local_tf_session_args': {'inter_op_parallelism_threads': 8,
                          'intra_op_parallelism_threads': 8},
'log_level': 'WARN',
'log_sys_usage': True,
'logger_config': None,
'lr': 5e-05,
'lr_schedule': None,
'memory': 0,
'memory_per_worker': 0,
'metrics_smoothing_episodes': 100,
'min_iter_time_s': 0,
'model': {'custom_model': 'AIIDE_PINSKY_MODEL', 'custom_model_config': {}},
'monitor': False,
'multiagent': {'count_steps_by': 'env_steps',
               'observation_fn': None,
               'policies': {},
               'policies_to_train': None,
               'policy_mapping_fn': None,
               'replay_mode': 'independent'},
'no_done_at_end': False,
'normalize_actions': False,
'num_cpus_for_driver': 1,
'num_cpus_per_worker': 1,
'num_envs_per_worker': 1,
'num_gpus': 0,
'num_gpus_per_worker': 0,
'num_sgd_iter': 30,
'num_workers': 1,
'object_store_memory': 0,
'object_store_memory_per_worker': 0,
'observation_filter': 'NoFilter',
'optimizer': {},
'output': None,
'output_compress_columns': ['obs', 'new_obs'],
'output_max_file_size': 67108864,
'postprocess_inputs': False,
'preprocessor_pref': 'deepmind',
'remote_env_batch_wait_ms': 0,
'remote_worker_envs': False,
'replay_sequence_length': 1,
'rollout_fragment_length': 200,
'sample_async': False,
'sample_collector': <class 'ray.rllib.evaluation.collectors.simple_list_collector.SimpleListCollector'>,
'seed': None,
'sgd_minibatch_size': 128,
'shuffle_buffer_size': 0,
'shuffle_sequences': True,
'simple_optimizer': False,
'soft_horizon': False,
'synchronize_filters': True,
'tf_session_args': {'allow_soft_placement': True,
                    'device_count': {'CPU': 1},
                    'gpu_options': {'allow_growth': True},
                    'inter_op_parallelism_threads': 2,
                    'intra_op_parallelism_threads': 2,
                    'log_device_placement': False},
'timesteps_per_iteration': 0,
'train_batch_size': 4000,
'use_critic': True,
'use_gae': True,
'vf_clip_param': 10.0,
'vf_loss_coeff': 1.0,
'vf_share_layers': -1}
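For reference, the env name and custom model name in that config come from registrations that happen before training starts. Roughly like this; the Griddly RLlib wrapper import path is an assumption and can vary between Griddly versions:

from ray.tune.registry import register_env
from ray.rllib.models import ModelCatalog
# Griddly's RLlib-compatible env wrapper (import path may differ by Griddly version).
from griddly.util.rllib.environment.core import RLlibEnv

# 'limited_zelda_custom' and 'AIIDE_PINSKY_MODEL' match the config keys above;
# AIIDEActor is the model class shown further down.
register_env('limited_zelda_custom', lambda env_config: RLlibEnv(env_config))
ModelCatalog.register_custom_model('AIIDE_PINSKY_MODEL', AIIDEActor)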
For completeness, here’s a copy of the remote call to optimize:
@ray.remote
def optimize(trainer_constructor, trainer_config, registered_gym_name, level_string_monad, network_weights,
             **kwargs):
    """Run one step of optimization!!

    :param trainer_constructor: constructor for the algo to optimize with, e.g. ppo.PPOTrainer, for rllib to run optimization.
    :param trainer_config: config dict for e.g. PPO.
    :param registered_gym_name: name of the env registered with ray via `register_env`
    :param level_string_monad: callback that dynamically creates level strings
    :param network_weights: torch state_dict
    :return: dict of {optimized weights, result_dict}
    """
    # todo: same as rollout.py
    # todo: will probably have to change this to first instantiate a generator model
    #  and then query it for the levels. That would allow something like PAIRED to function.
    trainer_config['env_config']['level_string'], _ = level_string_monad()
    trainer = trainer_constructor(config=trainer_config, env=registered_gym_name)
    trainer.get_policy('default_policy').model.load_state_dict(network_weights)
    result = trainer.train()

    return {0: {'weights': trainer.get_policy('default_policy').model.state_dict(),  # key of 0 because this will be adapted to multi-agent in the future
                'result_dict': result,
                'pair_id': kwargs.get('pair_id', 0)}}
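For context, level_string_monad is just a zero-argument callable that hands back the level; optimize() only uses the first element of what it returns. A minimal stand-in (the level layout here is made up) would be:

def fixed_level_monad():
    # Returns (level_string, extra_info); optimize() unpacks and keeps only the level string.
    level = 'w w w w w\nw A . g w\nw w w w w'  # made-up Griddly-style level string
    return level, None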
The call looks like this:
list_of_agent_env_pairs = [...]
for p in list_of_agent_env_pairs:
    opt_refs = optimize.remote(ppoTrainer, config, gym_name, p.generator_fn_wrapper(), p.solver.get_state_dict())
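The results then come back via ray.get. This is a simplified sketch rather than the exact bookkeeping in my code, but it matches the return structure of optimize() above:

opt_refs = [optimize.remote(ppoTrainer, config, gym_name,
                            p.generator_fn_wrapper(), p.solver.get_state_dict(),
                            pair_id=i)
            for i, p in enumerate(list_of_agent_env_pairs)]

# ray.get blocks until every remote optimize() call has finished.
for result in ray.get(opt_refs):
    new_weights = result[0]['weights']         # key 0 matches the return dict of optimize()
    training_stats = result[0]['result_dict']
    pair_id = result[0]['pair_id']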
For good measure, here’s a copy of the NN code:
import gym
import torch.nn as nn

from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.rllib.utils.typing import ModelConfigDict

# layer_init is a small helper defined elsewhere in the project.


class AIIDEActor(TorchModelV2, nn.Module):
    def __init__(self, obs_space: gym.spaces.Space,
                 action_space: gym.spaces.Space, num_outputs: int,
                 model_config: ModelConfigDict, name: str):
        TorchModelV2.__init__(self, obs_space, action_space, num_outputs,
                              model_config, name)
        nn.Module.__init__(self)

        self._num_objects = obs_space.shape[2]  # channels-last observations
        self._num_actions = num_outputs

        self.embedding = nn.Sequential(
            layer_init(nn.Conv2d(in_channels=self._num_objects, out_channels=8, kernel_size=1)),
            nn.ReLU(),
            layer_init(nn.Conv2d(in_channels=8, out_channels=32, kernel_size=2)),
            nn.ReLU(),
            nn.Flatten(),
            layer_init(nn.Linear(512, 128)),
            nn.ReLU()
        )
        self.policy_head = nn.Sequential(
            layer_init(nn.Linear(128, num_outputs))
        )
        self.value_head = nn.Sequential(
            layer_init(nn.Linear(128, 1))
        )

    def forward(self, input_dict, state, seq_lens):
        # print(input_dict['obs'].shape)
        x = input_dict['obs'].permute(0, 3, 1, 2)  # NHWC -> NCHW
        self._last_batch_size = x.shape[0]
        embed = self.embedding(x)
        logits = self.policy_head(embed)
        value = self.value_head(embed)
        self._value = value.reshape(-1)
        return logits, state

    def value_function(self):
        return self._value
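As a quick sanity check on the shapes (my own sketch, with layer_init dropped on the assumption that it only tweaks weight initialization): the Linear(512, 128) implies a 5x5 grid observation, because the kernel-size-2 conv shrinks the spatial dims to 4x4 and 32 * 4 * 4 = 512.

import torch
import torch.nn as nn

num_objects = 10  # channel count from the VECTOR observer; the real value depends on the game's objects
trunk = nn.Sequential(
    nn.Conv2d(num_objects, 8, kernel_size=1),
    nn.ReLU(),
    nn.Conv2d(8, 32, kernel_size=2),
    nn.ReLU(),
    nn.Flatten(),
)
x = torch.zeros(1, num_objects, 5, 5)  # (batch, channels, H, W), i.e. after the permute in forward()
print(trunk(x).shape)                  # torch.Size([1, 512]) -> feeds the Linear(512, 128)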