Hi @tlaurie99,
Welcome to the forum.
If I had to venture a guess as to where the NaNs originate, it would be here:

self.dist = torch.distributions.normal.Normal(mean, torch.exp(log_std))
It has been my experience that with a continuous action space, at some point during training the log_std outputs that parameterize the action distribution can become very negative. That drives the std toward zero, and the backward pass of the normal distribution then divides by ~zero and produces NaNs.
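You can reproduce that failure mode in isolation with a small, purely illustrative snippet (the -90 is just an arbitrarily extreme value, not something from your model):

import torch

# Pretend the policy head has drifted to a very negative log_std.
mean = torch.tensor([0.0], requires_grad=True)
log_std = torch.tensor([-90.0], requires_grad=True)

# exp(-90) is effectively zero in float32, so the variance underflows.
dist = torch.distributions.normal.Normal(mean, torch.exp(log_std))
loss = -dist.log_prob(torch.tensor([0.1])).sum()
loss.backward()

print(loss, mean.grad, log_std.grad)  # inf loss and inf/nan gradients once the std underflows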
RLlib is unique among the popular frameworks in that it uses the policy network itself to generate the log_std values:
@override(ActionDistribution)
def __init__(
    self,
    inputs: List[TensorType],
    model: TorchModelV2,
    *,
    action_space: Optional[gym.spaces.Space] = None
):
    super().__init__(inputs, model)
    mean, log_std = torch.chunk(self.inputs, 2, dim=1)
    self.log_std = log_std
    self.dist = torch.distributions.normal.Normal(mean, torch.exp(log_std))
    # Remember to squeeze action samples in case action space is Box(shape)
    self.zero_action_dim = action_space and action_space.shape == ()

@override(TorchDistributionWrapper)
def sample(self) -> TensorType:
    sample = super().sample()
    if self.zero_action_dim:
        return torch.squeeze(sample, dim=-1)
    return sample
If you look at cleanrl's or sb3's implementation, you will see that they register log_std as a parameter of the model, so it can still be learned, but it is not produced by the nn layers.

Since you are already using a custom model, you might try implementing this alternative to see if it helps (there is a rough sketch of what that could look like at the end of this post).
cleanrl:

class Agent(nn.Module):
    def __init__(self, envs):
        super().__init__()
        self.critic = nn.Sequential(
            layer_init(nn.Linear(np.array(envs.single_observation_space.shape).prod(), 64)),
            nn.Tanh(),
            layer_init(nn.Linear(64, 64)),
            nn.Tanh(),
            layer_init(nn.Linear(64, 1), std=1.0),
        )
        self.actor_mean = nn.Sequential(
            layer_init(nn.Linear(np.array(envs.single_observation_space.shape).prod(), 64)),
            nn.Tanh(),
            layer_init(nn.Linear(64, 64)),
            nn.Tanh(),
            layer_init(nn.Linear(64, np.prod(envs.single_action_space.shape)), std=0.01),
        )
        self.actor_logstd = nn.Parameter(torch.zeros(1, np.prod(envs.single_action_space.shape)))

    def get_value(self, x):
        return self.critic(x)

    def get_action_and_value(self, x, action=None):
        action_mean = self.actor_mean(x)
        action_logstd = self.actor_logstd.expand_as(action_mean)
        action_std = torch.exp(action_logstd)
        probs = Normal(action_mean, action_std)
        if action is None:
            action = probs.sample()
        return action, probs.log_prob(action).sum(1), probs.entropy().sum(1), self.critic(x)
sb3:

def proba_distribution_net(
    self, latent_dim: int, log_std_init: float = 0.0
) -> Tuple[nn.Module, nn.Parameter]:
    """
    Create the layers and parameter that represent the distribution:
    one output will be the mean of the Gaussian, the other parameter will be the
    standard deviation (log std in fact to allow negative values)

    :param latent_dim: Dimension of the last layer of the policy (before the action layer)
    :param log_std_init: Initial value for the log standard deviation
    :return:
    """
    mean_actions = nn.Linear(latent_dim, self.action_dim)
    # TODO: allow action dependent std
    log_std = nn.Parameter(th.ones(self.action_dim) * log_std_init, requires_grad=True)
    return mean_actions, log_std

def proba_distribution(
    self: SelfDiagGaussianDistribution, mean_actions: th.Tensor, log_std: th.Tensor
) -> SelfDiagGaussianDistribution:
    """
    Create the distribution given its parameters (mean, std)

    :param mean_actions:
    :param log_std:
    :return:
    """
    action_std = th.ones_like(mean_actions) * log_std.exp()
    self.distribution = Normal(mean_actions, action_std)
    return self
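Putting that together, here is a rough, untested sketch (class and attribute names are mine, and you would adapt the layer sizes) of what the same idea could look like in an RLlib custom TorchModelV2: the network only outputs the action means, while log_std is a separate learnable nn.Parameter that gets broadcast and concatenated onto the output, so the TorchDiagGaussian shown above still receives [mean, log_std] and nothing downstream has to change:

import numpy as np
import torch
import torch.nn as nn
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2


class FreeLogStdModel(TorchModelV2, nn.Module):
    """Outputs [mean, log_std] where log_std is a free parameter, not a network output."""

    def __init__(self, obs_space, action_space, num_outputs, model_config, name):
        TorchModelV2.__init__(self, obs_space, action_space, num_outputs, model_config, name)
        nn.Module.__init__(self)

        obs_dim = int(np.prod(obs_space.shape))
        act_dim = num_outputs // 2  # DiagGaussian expects a mean and a log_std per action dim

        self.policy_net = nn.Sequential(
            nn.Linear(obs_dim, 64), nn.Tanh(),
            nn.Linear(64, 64), nn.Tanh(),
            nn.Linear(64, act_dim),  # only the means come from the network
        )
        # log_std is learned directly; initialized at 0 -> std = 1.
        self.log_std = nn.Parameter(torch.zeros(act_dim))

        self.value_net = nn.Sequential(
            nn.Linear(obs_dim, 64), nn.Tanh(),
            nn.Linear(64, 64), nn.Tanh(),
            nn.Linear(64, 1),
        )
        self._value_out = None

    def forward(self, input_dict, state, seq_lens):
        obs = input_dict["obs_flat"].float()
        mean = self.policy_net(obs)
        # Broadcast the single log_std vector across the batch and append it to the means.
        log_std = self.log_std.unsqueeze(0).expand_as(mean)
        self._value_out = self.value_net(obs).squeeze(-1)
        return torch.cat([mean, log_std], dim=-1), state

    def value_function(self):
        return self._value_out

You would register it with ModelCatalog.register_custom_model and point your config's custom_model at it, the same way you wired up your current custom model.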