Continuous action space

Hi there!

I was using action_space = gym.Discrete(6) in my custom environment and everything was working well. Now I'm trying to experiment with a continuous action space (the rest of the configuration remains as before). To do this I wrote action_space = gym.Box(low=0.0, high=1.0, shape=(1,), dtype=np.float32) to define my action space.
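
For context, here is a minimal sketch of the change (assuming gymnasium as the gym import; the environment class and the rest of the setup are unchanged):

    import gymnasium as gym
    import numpy as np

    # Previous, working setup: six discrete actions per agent
    # action_space = gym.spaces.Discrete(6)

    # New setup: one continuous action per agent in [0.0, 1.0]
    action_space = gym.spaces.Box(low=0.0, high=1.0, shape=(1,), dtype=np.float32)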

The simulation starts without problems, but after a while (and only sometimes) the following error happens.

Failure # 1 (occurred at 2024-07-29_21-07-56)
ray::PPO.train() (pid=13792, ip=127.0.0.1, actor_id=bc323549a84daed0a1b3c69101000000, repr=PPO)
  File "c:\Users\grhen\anaconda3\envs\eprllib1-2-5\lib\site-packages\ray\rllib\algorithms\ppo\ppo_torch_policy.py", line 85, in loss
    curr_action_dist = dist_class(logits, model)
  File "c:\Users\grhen\anaconda3\envs\eprllib1-2-5\lib\site-packages\ray\rllib\models\torch\torch_action_dist.py", line 250, in __init__
    self.dist = torch.distributions.normal.Normal(mean, torch.exp(log_std))
  File "c:\Users\grhen\anaconda3\envs\eprllib1-2-5\lib\site-packages\torch\distributions\normal.py", line 56, in __init__
    super().__init__(batch_shape, validate_args=validate_args)
  File "c:\Users\grhen\anaconda3\envs\eprllib1-2-5\lib\site-packages\torch\distributions\distribution.py", line 68, in __init__
    raise ValueError(
ValueError: Expected parameter loc (Tensor of shape (816, 1)) of distribution Normal(loc: torch.Size([816, 1]), scale: torch.Size([816, 1])) to satisfy the constraint Real(), but found invalid values:
tensor([[nan],
        [nan],
        [nan],
        [nan],
        ...
        [nan],
        [nan]], grad_fn=<SplitBackward0>)

The above exception was the direct cause of the following exception:

ray::PPO.train() (pid=13792, ip=127.0.0.1, actor_id=bc323549a84daed0a1b3c69101000000, repr=PPO)
  File "python\ray\_raylet.pyx", line 1895, in ray._raylet.execute_task
  File "python\ray\_raylet.pyx", line 1996, in ray._raylet.execute_task
  File "python\ray\_raylet.pyx", line 1901, in ray._raylet.execute_task
  File "python\ray\_raylet.pyx", line 1842, in ray._raylet.execute_task.function_executor
  File "c:\Users\grhen\anaconda3\envs\eprllib1-2-5\lib\site-packages\ray\_private\function_manager.py", line 691, in actor_method_executor
    return method(__ray_actor, *args, **kwargs)
  File "c:\Users\grhen\anaconda3\envs\eprllib1-2-5\lib\site-packages\ray\util\tracing\tracing_helper.py", line 467, in _resume_span
    return method(self, *_args, **_kwargs)
  File "c:\Users\grhen\anaconda3\envs\eprllib1-2-5\lib\site-packages\ray\tune\trainable\trainable.py", line 331, in train
    raise skipped from exception_cause(skipped)
  File "c:\Users\grhen\anaconda3\envs\eprllib1-2-5\lib\site-packages\ray\tune\trainable\trainable.py", line 328, in train
    result = self.step()
  File "c:\Users\grhen\anaconda3\envs\eprllib1-2-5\lib\site-packages\ray\util\tracing\tracing_helper.py", line 467, in _resume_span
    return method(self, *_args, **_kwargs)
  File "c:\Users\grhen\anaconda3\envs\eprllib1-2-5\lib\site-packages\ray\rllib\algorithms\algorithm.py", line 895, in step
    train_results, train_iter_ctx = self._run_one_training_iteration()
  File "c:\Users\grhen\anaconda3\envs\eprllib1-2-5\lib\site-packages\ray\util\tracing\tracing_helper.py", line 467, in _resume_span
    return method(self, *_args, **_kwargs)
  File "c:\Users\grhen\anaconda3\envs\eprllib1-2-5\lib\site-packages\ray\rllib\algorithms\algorithm.py", line 3437, in _run_one_training_iteration
    training_step_results = self.training_step()
  File "c:\Users\grhen\anaconda3\envs\eprllib1-2-5\lib\site-packages\ray\util\tracing\tracing_helper.py", line 467, in _resume_span
    return method(self, *_args, **_kwargs)
  File "c:\Users\grhen\anaconda3\envs\eprllib1-2-5\lib\site-packages\ray\rllib\algorithms\ppo\ppo.py", line 426, in training_step
    return self._training_step_old_and_hybrid_api_stacks()
  File "c:\Users\grhen\anaconda3\envs\eprllib1-2-5\lib\site-packages\ray\util\tracing\tracing_helper.py", line 467, in _resume_span
    return method(self, *_args, **_kwargs)
  File "c:\Users\grhen\anaconda3\envs\eprllib1-2-5\lib\site-packages\ray\rllib\algorithms\ppo\ppo.py", line 566, in _training_step_old_and_hybrid_api_stacks
    train_results = train_one_step(self, train_batch)
  File "c:\Users\grhen\anaconda3\envs\eprllib1-2-5\lib\site-packages\ray\rllib\execution\train_ops.py", line 56, in train_one_step
    info = do_minibatch_sgd(
  File "c:\Users\grhen\anaconda3\envs\eprllib1-2-5\lib\site-packages\ray\rllib\utils\sgd.py", line 129, in do_minibatch_sgd
    local_worker.learn_on_batch(
  File "c:\Users\grhen\anaconda3\envs\eprllib1-2-5\lib\site-packages\ray\rllib\evaluation\rollout_worker.py", line 797, in learn_on_batch
    info_out[pid] = policy.learn_on_batch(batch)
  File "c:\Users\grhen\anaconda3\envs\eprllib1-2-5\lib\site-packages\ray\rllib\utils\threading.py", line 24, in wrapper
    return func(self, *a, **k)
  File "c:\Users\grhen\anaconda3\envs\eprllib1-2-5\lib\site-packages\ray\rllib\policy\torch_policy_v2.py", line 715, in learn_on_batch
    grads, fetches = self.compute_gradients(postprocessed_batch)
  File "c:\Users\grhen\anaconda3\envs\eprllib1-2-5\lib\site-packages\ray\rllib\utils\threading.py", line 24, in wrapper
    return func(self, *a, **k)
  File "c:\Users\grhen\anaconda3\envs\eprllib1-2-5\lib\site-packages\ray\rllib\policy\torch_policy_v2.py", line 933, in compute_gradients
    tower_outputs = self._multi_gpu_parallel_grad_calc([postprocessed_batch])
  File "c:\Users\grhen\anaconda3\envs\eprllib1-2-5\lib\site-packages\ray\rllib\policy\torch_policy_v2.py", line 1433, in _multi_gpu_parallel_grad_calc
    raise last_result[0] from last_result[1]
ValueError: Expected parameter loc (Tensor of shape (816, 1)) of distribution Normal(loc: torch.Size([816, 1]), scale: torch.Size([816, 1])) to satisfy the constraint Real(), but found invalid values:
tensor([[nan],
        [nan],
        [nan],
        ...
        [nan],
        [nan],
        [nan]], grad_fn=<SplitBackward0>)
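
If I read the traceback correctly, the policy network is producing NaN means ("loc") for the Normal action distribution. Just to illustrate what the error means, here is a minimal sketch outside RLlib (assuming the default argument validation in torch.distributions) that raises the same ValueError:

    import torch
    from torch.distributions import Normal

    # A NaN mean violates the Real() constraint of the Normal distribution
    loc = torch.full((4, 1), float("nan"))
    scale = torch.ones(4, 1)
    dist = Normal(loc, scale)  # ValueError: Expected parameter loc ... to satisfy the constraint Real()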

Another thing I noticed, which I don't know whether it is related to the issue above, is that the action dict (I'm using a multi-agent environment) has the following format (for 9 agents):

{
     'agent_1': array([float, float, float, float, float, float, float, float, float]),
     'agent_2': array([float, float, float, float, float, float, float, float, float]),
     'agent_3': array([float, float, float, float, float, float, float, float, float]),
     'agent_4': array([float, float, float, float, float, float, float, float, float]),
     'agent_5': array([float, float, float, float, float, float, float, float, float]),
     'agent_6': array([float, float, float, float, float, float, float, float, float]),
     'agent_7': array([float, float, float, float, float, float, float, float, float]),
     'agent_8': array([float, float, float, float, float, float, float, float, float]),
     'agent_9': array([float, float, float, float, float, float, float, float, float])
}

I was expecting the action dictionary to have the following format instead:

{
    'agent_1': float,
    'agent_2': float,
    'agent_3': float,
    'agent_4': float,
    'agent_5': float,
    'agent_6': float,
    'agent_7': float,
    'agent_8': float,
    'agent_9': float,
}
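
As a sanity check (a sketch, assuming gymnasium), sampling the Box space directly gives a single-element array per agent, which is what I understood each agent's action should look like:

    import gymnasium as gym
    import numpy as np

    space = gym.spaces.Box(low=0.0, high=1.0, shape=(1,), dtype=np.float32)

    print(space.sample())  # e.g. [0.42] -- a single value per agent
    print(space.contains(np.array([0.1] * 9, dtype=np.float32)))  # False: a length-9 array does not fit this space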

Is the format of the actions I'm getting correct, i.e. each agent receiving an array with one value for every agent, instead of a single value of its own?

Thanks @mannyv, I will take a look at the solution!