Hello,
I am using a custom PPO model for training. I report the implementation here for completeness.
import torch.nn as nn

from ray.rllib.core.columns import Columns
from ray.rllib.core.rl_module.apis import ValueFunctionAPI
from ray.rllib.core.rl_module.torch import TorchRLModule
from ray.rllib.models.torch.misc import SlimFC, normc_initializer


class CustomModel(TorchRLModule, ValueFunctionAPI):
    """
    Custom network model.
    """

    def setup(self):
        """
        Override setup method.
        """
        # Process configuration.
        n_obs = self.observation_space.shape[0]
        n_out_actor = self.action_space.shape[0] * 2  # Mean and std of actions
        n_hid_lay_actor = self.model_config["custom_model_config"]["n_hid_lay_actor"]
        n_neur_hid_actor = self.model_config["custom_model_config"]["n_neur_hid_actor"]
        act_fun_hid_actor = self.model_config["custom_model_config"]["act_fun_hid_actor"]
        act_fun_out_actor = self.model_config["custom_model_config"]["act_fun_out_actor"]
        n_hid_lay_critic = self.model_config["custom_model_config"]["n_hid_lay_critic"]
        n_neur_hid_critic = self.model_config["custom_model_config"]["n_neur_hid_critic"]
        act_fun_hid_critic = self.model_config["custom_model_config"]["act_fun_hid_critic"]
        act_fun_out_critic = self.model_config["custom_model_config"]["act_fun_out_critic"]

        # Construct actor network.
        layers_actor = []
        prev_layer_size = n_obs
        for _ in range(n_hid_lay_actor):
            layers_actor.append(SlimFC(in_size=prev_layer_size,
                                       out_size=n_neur_hid_actor,
                                       initializer=normc_initializer(1.0),
                                       activation_fn=act_fun_hid_actor))
            prev_layer_size = n_neur_hid_actor
        layers_actor.append(SlimFC(in_size=prev_layer_size,
                                   out_size=n_out_actor,
                                   initializer=normc_initializer(1.0),
                                   activation_fn=act_fun_out_actor))
        self._policy_net = nn.Sequential(*layers_actor)

        # Construct critic network.
        layers_critic = []
        prev_layer_size = n_obs
        for _ in range(n_hid_lay_critic):
            layers_critic.append(SlimFC(in_size=prev_layer_size,
                                        out_size=n_neur_hid_critic,
                                        initializer=normc_initializer(1.0),
                                        activation_fn=act_fun_hid_critic))
            prev_layer_size = n_neur_hid_critic
        layers_critic.append(SlimFC(in_size=prev_layer_size,
                                    out_size=1,
                                    initializer=normc_initializer(0.01),
                                    activation_fn=act_fun_out_critic))
        self._critic_net = nn.Sequential(*layers_critic)

    def _forward(self, batch, **kwargs):
        """
        Override actor forward method.
        """
        action_logits = self._policy_net(batch[Columns.OBS])
        return {Columns.ACTION_DIST_INPUTS: action_logits}

    def compute_values(self, batch, embeddings=None):
        """
        Override value function forward method.
        """
        critic_logit = self._critic_net(batch[Columns.OBS]).squeeze(1)
        return critic_logit
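For context, the module is plugged into PPO roughly along these lines. This is a sketch rather than my exact script: the environment and all hyperparameter values under custom_model_config are placeholders, and the exact spec/argument names can differ slightly between RLlib versions.

from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.core.rl_module.rl_module import RLModuleSpec

# Sketch only: placeholder env and placeholder hyperparameters.
config = (
    PPOConfig()
    .environment("Pendulum-v1")  # placeholder environment
    .rl_module(
        rl_module_spec=RLModuleSpec(
            module_class=CustomModel,
            model_config={
                "custom_model_config": {
                    "n_hid_lay_actor": 2,
                    "n_neur_hid_actor": 64,
                    "act_fun_hid_actor": "relu",
                    "act_fun_out_actor": "tanh",
                    "n_hid_lay_critic": 2,
                    "n_neur_hid_critic": 64,
                    "act_fun_hid_critic": "relu",
                    "act_fun_out_critic": None,
                },
            },
        ),
    )
)
algo = config.build()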
As you can see, the output layer of the policy net applies an activation function (act_fun_out_actor is set to “tanh”) to bound actions between ±1. When I inspect the actions inside the environment's step method, they are indeed bounded correctly.
However, when I extract the actions in the on_episode_end callback via episode.actions.data, only the actions that were already between ±1 before the tanh are reported correctly. All actions that reached the environment as exactly ±1 appear here with their original pre-tanh values (for example, where the environment receives [ 0.0403372 , -1. , -0.55389929, 0.97667265, 1. ], in the callback I get [ 0.04033718, -2.1049635 , -0.5538993 , 0.9766726 , 2.0102165 ]). I am not sure why this is happening. I worked around it in the callback by simply clipping the actions. However, I am now wondering whether there are other places in the training routine (for example in the learner, with obviously more serious consequences) where this issue happens silently without me knowing.
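For reference, the workaround in the callback looks roughly like this (a sketch assuming the DefaultCallbacks-style API; the class name is illustrative and the ±1 bounds are those of my action space):

import numpy as np
from ray.rllib.algorithms.callbacks import DefaultCallbacks

class ClipLoggedActions(DefaultCallbacks):  # illustrative name
    def on_episode_end(self, *, episode, **kwargs):
        # Workaround: clip the logged actions back into the action-space bounds
        # so they match what the environment actually received in step().
        raw_actions = np.asarray(episode.actions.data)
        clipped_actions = np.clip(raw_actions, -1.0, 1.0)
        # ... use `clipped_actions` for custom logging / metrics ...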
Thanks