Hi there!
I was using action_space = gym.Discrete(6) in my custom environment and everything worked well. Now I'm trying to experiment with a continuous action space (the rest of the configuration remains the same as before). To do this I wrote action_space = gym.Box(low=0.0, high=1.0, shape=(1,), dtype=np.float32)
to define my action space.
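For context, this is roughly how the spaces are defined (just a minimal sketch; NUM_AGENTS and OBS_SIZE are placeholders, not my real values):

```python
import numpy as np
from gymnasium import spaces

NUM_AGENTS = 9   # placeholder: my env has 9 agents
OBS_SIZE = 10    # placeholder: not my real observation length

# Before: one discrete action with 6 choices per agent.
discrete_action_space = spaces.Discrete(6)

# Now: one continuous scalar action in [0, 1] per agent.
continuous_action_space = spaces.Box(
    low=0.0, high=1.0, shape=(1,), dtype=np.float32
)

# Observations are unchanged from the discrete setup.
observation_space = spaces.Box(
    low=-np.inf, high=np.inf, shape=(OBS_SIZE,), dtype=np.float32
)
```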
The simulation starts without problems, but after a while (and only sometimes) the following error happens:
Failure # 1 (occurred at 2024-07-29_21-07-56)
ray::PPO.train() (pid=13792, ip=127.0.0.1, actor_id=bc323549a84daed0a1b3c69101000000, repr=PPO)
File "c:\Users\grhen\anaconda3\envs\eprllib1-2-5\lib\site-packages\ray\rllib\algorithms\ppo\ppo_torch_policy.py", line 85, in loss
curr_action_dist = dist_class(logits, model)
File "c:\Users\grhen\anaconda3\envs\eprllib1-2-5\lib\site-packages\ray\rllib\models\torch\torch_action_dist.py", line 250, in __init__
self.dist = torch.distributions.normal.Normal(mean, torch.exp(log_std))
File "c:\Users\grhen\anaconda3\envs\eprllib1-2-5\lib\site-packages\torch\distributions\normal.py", line 56, in __init__
super().__init__(batch_shape, validate_args=validate_args)
File "c:\Users\grhen\anaconda3\envs\eprllib1-2-5\lib\site-packages\torch\distributions\distribution.py", line 68, in __init__
raise ValueError(
ValueError: Expected parameter loc (Tensor of shape (816, 1)) of distribution Normal(loc: torch.Size([816, 1]), scale: torch.Size([816, 1])) to satisfy the constraint Real(), but found invalid values:
tensor([[nan],
[nan],
[nan],
[nan],
...
[nan],
[nan]], grad_fn=<SplitBackward0>)
The above exception was the direct cause of the following exception:
ray::PPO.train() (pid=13792, ip=127.0.0.1, actor_id=bc323549a84daed0a1b3c69101000000, repr=PPO)
File "python\ray\_raylet.pyx", line 1895, in ray._raylet.execute_task
File "python\ray\_raylet.pyx", line 1996, in ray._raylet.execute_task
File "python\ray\_raylet.pyx", line 1901, in ray._raylet.execute_task
File "python\ray\_raylet.pyx", line 1842, in ray._raylet.execute_task.function_executor
File "c:\Users\grhen\anaconda3\envs\eprllib1-2-5\lib\site-packages\ray\_private\function_manager.py", line 691, in actor_method_executor
return method(__ray_actor, *args, **kwargs)
File "c:\Users\grhen\anaconda3\envs\eprllib1-2-5\lib\site-packages\ray\util\tracing\tracing_helper.py", line 467, in _resume_span
return method(self, *_args, **_kwargs)
File "c:\Users\grhen\anaconda3\envs\eprllib1-2-5\lib\site-packages\ray\tune\trainable\trainable.py", line 331, in train
raise skipped from exception_cause(skipped)
File "c:\Users\grhen\anaconda3\envs\eprllib1-2-5\lib\site-packages\ray\tune\trainable\trainable.py", line 328, in train
result = self.step()
File "c:\Users\grhen\anaconda3\envs\eprllib1-2-5\lib\site-packages\ray\util\tracing\tracing_helper.py", line 467, in _resume_span
return method(self, *_args, **_kwargs)
File "c:\Users\grhen\anaconda3\envs\eprllib1-2-5\lib\site-packages\ray\rllib\algorithms\algorithm.py", line 895, in step
train_results, train_iter_ctx = self._run_one_training_iteration()
File "c:\Users\grhen\anaconda3\envs\eprllib1-2-5\lib\site-packages\ray\util\tracing\tracing_helper.py", line 467, in _resume_span
return method(self, *_args, **_kwargs)
File "c:\Users\grhen\anaconda3\envs\eprllib1-2-5\lib\site-packages\ray\rllib\algorithms\algorithm.py", line 3437, in _run_one_training_iteration
training_step_results = self.training_step()
File "c:\Users\grhen\anaconda3\envs\eprllib1-2-5\lib\site-packages\ray\util\tracing\tracing_helper.py", line 467, in _resume_span
return method(self, *_args, **_kwargs)
File "c:\Users\grhen\anaconda3\envs\eprllib1-2-5\lib\site-packages\ray\rllib\algorithms\ppo\ppo.py", line 426, in training_step
return self._training_step_old_and_hybrid_api_stacks()
File "c:\Users\grhen\anaconda3\envs\eprllib1-2-5\lib\site-packages\ray\util\tracing\tracing_helper.py", line 467, in _resume_span
return method(self, *_args, **_kwargs)
File "c:\Users\grhen\anaconda3\envs\eprllib1-2-5\lib\site-packages\ray\rllib\algorithms\ppo\ppo.py", line 566, in _training_step_old_and_hybrid_api_stacks
train_results = train_one_step(self, train_batch)
File "c:\Users\grhen\anaconda3\envs\eprllib1-2-5\lib\site-packages\ray\rllib\execution\train_ops.py", line 56, in train_one_step
info = do_minibatch_sgd(
File "c:\Users\grhen\anaconda3\envs\eprllib1-2-5\lib\site-packages\ray\rllib\utils\sgd.py", line 129, in do_minibatch_sgd
local_worker.learn_on_batch(
File "c:\Users\grhen\anaconda3\envs\eprllib1-2-5\lib\site-packages\ray\rllib\evaluation\rollout_worker.py", line 797, in learn_on_batch
info_out[pid] = policy.learn_on_batch(batch)
File "c:\Users\grhen\anaconda3\envs\eprllib1-2-5\lib\site-packages\ray\rllib\utils\threading.py", line 24, in wrapper
return func(self, *a, **k)
File "c:\Users\grhen\anaconda3\envs\eprllib1-2-5\lib\site-packages\ray\rllib\policy\torch_policy_v2.py", line 715, in learn_on_batch
grads, fetches = self.compute_gradients(postprocessed_batch)
File "c:\Users\grhen\anaconda3\envs\eprllib1-2-5\lib\site-packages\ray\rllib\utils\threading.py", line 24, in wrapper
return func(self, *a, **k)
File "c:\Users\grhen\anaconda3\envs\eprllib1-2-5\lib\site-packages\ray\rllib\policy\torch_policy_v2.py", line 933, in compute_gradients
tower_outputs = self._multi_gpu_parallel_grad_calc([postprocessed_batch])
File "c:\Users\grhen\anaconda3\envs\eprllib1-2-5\lib\site-packages\ray\rllib\policy\torch_policy_v2.py", line 1433, in _multi_gpu_parallel_grad_calc
raise last_result[0] from last_result[1]
ValueError: Expected parameter loc (Tensor of shape (816, 1)) of distribution Normal(loc: torch.Size([816, 1]), scale: torch.Size([816, 1])) to satisfy the constraint Real(), but found invalid values:
tensor([[nan],
[nan],
[nan],
...
[nan],
[nan],
[nan]], grad_fn=<SplitBackward0>)
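In case it helps with debugging, this is the kind of sanity check I can add to my env's step() to rule out non-finite observations or rewards being fed to the policy (just a sketch with placeholder names, not my real code):

```python
import numpy as np

def assert_finite(obs_dict, reward_dict):
    """Sketch of a NaN/inf check on what my env returns from step()."""
    for agent_id, obs in obs_dict.items():
        if not np.all(np.isfinite(obs)):
            raise ValueError(f"Non-finite observation for {agent_id}: {obs}")
    for agent_id, rew in reward_dict.items():
        if not np.isfinite(rew):
            raise ValueError(f"Non-finite reward for {agent_id}: {rew}")
```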
Another thing I noticed, which I don't know whether it is related to the issue above, is that the action dict (I'm using a multi-agent environment) has the following format (for 9 agents):
{
'agent_1': array([float, float, float, float, float, float, float, float, float]),
'agent_2': array([float, float, float, float, float, float, float, float, float]),
'agent_3': array([float, float, float, float, float, float, float, float, float]),
'agent_4': array([float, float, float, float, float, float, float, float, float]),
'agent_5': array([float, float, float, float, float, float, float, float, float]),
'agent_6': array([float, float, float, float, float, float, float, float, float]),
'agent_7': array([float, float, float, float, float, float, float, float, float]),
'agent_8': array([float, float, float, float, float, float, float, float, float]),
'agent_9': array([float, float, float, float, float, float, float, float, float])
}
I was expecting the action dictionary to have the following format instead:
{
'agent_1': float,
'agent_2': float,
'agent_3': float,
'agent_4': float,
'agent_5': float,
'agent_6': float,
'agent_7': float,
'agent_8': float,
'agent_9': float,
}
Is the action format I got okay, i.e. one array per agent with an entry for every agent, instead of a single value per agent?
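For reference, this is roughly how I planned to consume the actions in step(), which is why I expected one value per agent rather than a 9-element array (sketch with placeholder names, not my exact code):

```python
import numpy as np

def unpack_actions(action_dict):
    """Sketch: with Box(low=0.0, high=1.0, shape=(1,), dtype=np.float32)
    per agent, I expected each entry to be a single float (or a
    shape-(1,) array)."""
    scalar_actions = {}
    for agent_id, action in action_dict.items():
        # .item() raises if more than one value arrives, which is how
        # the 9-element arrays shown above become a problem.
        scalar_actions[agent_id] = float(np.asarray(action).item())
    return scalar_actions
```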