When I run my environment from a test script that launches a PPOTrainer, RLlib works as expected.

Working code:
```python
import os
import sys

import ray
from ray import tune
from ray.rllib.agents import ppo
from ray.rllib.models import ModelCatalog
from ray.tune.registry import register_env

# BipedalWalkerCustom, OverrideWalker, DEFAULT_ENV, and TwoLayerFC are
# project-specific and imported from my own modules.

sep = os.pathsep
os.environ['PYTHONPATH'] = sep.join(sys.path)

ray.init(num_gpus=1)

env_name = "walker"

def _custom(env_config):
    env = BipedalWalkerCustom(env_config.get('named_tuple', DEFAULT_ENV))
    return OverrideWalker(env)

register_env(env_name, _custom)
ModelCatalog.register_custom_model('2LFC', TwoLayerFC)

max_training_steps = 10000

config = ppo.DEFAULT_CONFIG.copy()
config["model"] = {
    'custom_model': '2LFC',
    'custom_model_config': {'hidden_size': 40}
}
config['env'] = env_name
config['env_config'] = {'named_tuple': DEFAULT_ENV}
config["num_workers"] = 1
config["num_envs_per_worker"] = 1
config["framework"] = "torch"
config['num_gpus'] = 1

stop = {
    "episode_reward_mean": 2,
    "timesteps_total": max_training_steps,
}

print(config)

try:
    result = tune.run(ppo.PPOTrainer, config=config, stop=stop, checkpoint_at_end=True,
                      local_dir=os.path.join('..', 'rllib_results'))
except (KeyboardInterrupt, Exception) as e:
    print(e)
    ray.shutdown()
ray.shutdown()
```
Here’s the NN code:
```python
import gym
import torch.nn as nn

from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.rllib.utils.typing import ModelConfigDict

# layer_init is a small project-specific weight-initialization helper.

class TwoLayerFC(TorchModelV2, nn.Module):
    def __init__(self, obs_space: gym.spaces.Space,
                 action_space: gym.spaces.Space, num_outputs: int,
                 model_config: ModelConfigDict, name: str):
        TorchModelV2.__init__(self, obs_space, action_space, num_outputs,
                              model_config, name)
        nn.Module.__init__(self)

        self._num_objects = obs_space.shape[0]
        self._num_actions = num_outputs
        self.hidden_dim = model_config.get('custom_model_config', {}).get('hidden_size', 40)

        # Shared two-layer trunk.
        self.embedding = nn.Sequential(
            layer_init(nn.Linear(self._num_objects, self.hidden_dim)),
            nn.Tanh(),
            layer_init(nn.Linear(self.hidden_dim, self.hidden_dim)),
            nn.Tanh(),
        )
        # Policy head: one logit per expected output.
        self.policy_head = nn.Sequential(
            layer_init(nn.Linear(self.hidden_dim, self.num_outputs)),
            nn.Tanh()
        )
        # Value head: a single scalar per observation.
        self.value_head = nn.Sequential(
            layer_init(nn.Linear(self.hidden_dim, 1))
        )

    def forward(self, input_dict, state, seq_lens):
        # print(input_dict['obs'].shape)
        x = input_dict['obs']
        self._last_batch_size = x.shape[0]
        embed = self.embedding(x)
        logits = self.policy_head(embed)
        value = self.value_head(embed)
        self._value = value.reshape(-1)
        return logits, state

    def value_function(self):
        return self._value
```
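As a quick sanity check of what I expect, here's a standalone forward pass (my own sketch, separate from the failing run) with `num_outputs=4`:

```python
import gym
import numpy as np
import torch

# Hypothetical smoke test for the class above: with num_outputs=4 the
# policy head should emit [batch, 4] logits.
obs_space = gym.spaces.Box(-np.inf, np.inf, (24,))
action_space = gym.spaces.Box(-1.0, 1.0, (4,))
model = TwoLayerFC(obs_space, action_space, num_outputs=4,
                   model_config={'custom_model_config': {'hidden_size': 40}},
                   name='poet_fc')
logits, _ = model.forward({'obs': torch.zeros(32, 24)}, [], None)
print(logits.shape)  # torch.Size([32, 4])
```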
When I build a PPOTrainer object directly in my code, the resulting network is the wrong size. According to RLlib, the number of logits my network should put out is 8, even though the action space is

`Box([-1. -1. -1. -1.], [1. 1. 1. 1.], (4,), float64)`

which should give an output logit size of [batch, 4]. The `get_action_dist` function in `ray/rllib/models/catalog.py` returns 8 for the `num_outputs` variable rather than the 4 implied by the action space.

When I run my code with a (Multi-)Discrete action space, everything works fine (e.g. an action space of Discrete(4) will have 4 outputs and an action space of MultiDiscrete([12, 12, 6, 4]) will have 34 outputs, i.e. 12 + 12 + 6 + 4).
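The mismatch is easy to isolate without a full training run. A minimal sketch (assuming the pre-2.0 `ray.rllib.agents` API used above):

```python
import gym
import numpy as np
from ray.rllib.models import ModelCatalog

model_config = {}  # empty -> RLlib falls back to its model defaults

box = gym.spaces.Box(np.array([-1., -1., -1., -1.]),
                     np.array([1., 1., 1., 1.]), dtype=np.float64)
_, n = ModelCatalog.get_action_dist(box, model_config, framework="torch")
print(n)  # 8, not the 4 I expected

_, n = ModelCatalog.get_action_dist(gym.spaces.Discrete(4), model_config,
                                    framework="torch")
print(n)  # 4, as expected

_, n = ModelCatalog.get_action_dist(gym.spaces.MultiDiscrete([12, 12, 6, 4]),
                                    model_config, framework="torch")
print(n)  # 34 = 12 + 12 + 6 + 4
```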
Here's how I build the trainer directly:

```python
@ray.remote
class Trainer_wrapper:
    def __init__(self, trainer_constructor, trainer_config, registered_gym_name,
                 network_factory, gym_factory, weights={}, log_id='foo_bar'):
        ...
        self.trainer = trainer_constructor(config=trainer_config, env=registered_gym_name,
                                           logger_creator=custom_log_creator(
                                               os.path.join('.', 'watts_logs', self.exp),
                                               f'SAS_{log_id}.'))
        ...
```
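For completeness, the actor gets created along these lines (a sketch; the factory arguments and `log_id` value here are placeholders for project-specific objects):

```python
# Placeholder call site for the wrapper above.
trainer_actor = Trainer_wrapper.remote(
    trainer_constructor=ppo.PPOTrainer,
    trainer_config=config,
    registered_gym_name=env_name,
    network_factory=network_factory,  # project-specific
    gym_factory=gym_factory,          # project-specific
    log_id='walker_0',
)
```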
What I expect to be passed to the TwoLayerFC class is:
```python
build_info = {
    'obs_space': gym.spaces.Box([-np.inf for _ in range(24)],
                                [np.inf for _ in range(24)], (24,)),
    'action_space': gym.spaces.Box([-1, -1, -1, -1], [1, 1, 1, 1], (4,)),
    'num_outputs': 4,
    'model_config': {'custom_model_config': {'hidden_size': 40}},
    'name': 'poet_fc'
}
```
What actually gets passed in has the correct spaces but the wrong num_outputs:

```python
build_info = {
    'obs_space': gym.spaces.Box([-np.inf for _ in range(24)],
                                [np.inf for _ in range(24)], (24,)),
    'action_space': gym.spaces.Box([-1, -1, -1, -1], [1, 1, 1, 1], (4,)),
    'num_outputs': 8,
    'model_config': {...},  # a whole bunch of stuff
    'name': 'default_model'
}
```