Hey,
I currently have RLlib running a DQN algorithm with the following observation space and DQN config. I also have a discrete action space of 29 possible values. I sometimes get the error stated below, saying that the algorithm failed in a training or evaluation step. This seems to happen at around 150K steps, which is equal to the replay buffer size. Both GPU and memory usage are below maximum capacity.
Any ideas of what might be the issue or how to solve the problem?
Thanks in advance.
Observation Space
def get_observation_space(self):
    """Build the observation space.

    Returns:
        A ``Dict`` space with three entries:

        * ``"values"``: a ``Box`` of dtype float32 whose per-element bounds
          are given by two 22-element arrays.
        * ``"occupancyMap_now"`` and ``"occupancyMap_05"``: two ``Box``
          spaces of dtype float64 with values in [0, 1] and shape
          ``(self.occupancy_map_y, self.occupancy_map_x, 1)``.
    """
    pi = math.pi

    # Per-element lower/upper bounds of the "values" vector (22 entries each).
    lower_bounds = np.array([
        0, 0,
        -pi, -pi, -pi, -pi, -pi, -pi, -pi,
        0, -1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    ])
    upper_bounds = np.array([
        100, 100,
        pi, pi, pi, pi, pi, pi, pi,
        1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    ])
    values_space = Box(low=lower_bounds, high=upper_bounds, dtype=np.float32)

    # Both occupancy maps share the same single-channel grid layout.
    map_shape = (self.occupancy_map_y, self.occupancy_map_x, 1)
    map_now_space = Box(low=0, high=1, shape=map_shape, dtype=np.float64)
    map_05_space = Box(low=0, high=1, shape=map_shape, dtype=np.float64)

    return Dict({
        "values": values_space,
        "occupancyMap_now": map_now_space,
        "occupancyMap_05": map_05_space,
    })
DQN Config
framework: "torch"
num_workers: 1
num_gpus: 1
#num_gpus_per_worker: 1
num_cpus_per_worker: 11
rollout_fragment_length: 4
#timesteps_per_iteration: 20000
train_batch_size: 32
num_steps_sampled_before_learning_starts: 10000
recreate_failed_workers: True
horizon: 6500
n_step: 1
num_atoms: 51
noisy: False
# "batch_mode": "complete_episodes"
gamma: 0.99
"exploration_config": {
"type": "EpsilonGreedy",
"initial_epsilon": 1.0,
"final_epsilon": 0.01,
"epsilon_timesteps": 500000
}
target_network_update_freq: 8000
double_q: true
dueling: true
replay_buffer_config:
type: MultiAgentPrioritizedReplayBuffer
capacity: 150000
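# NOTE: the number of steps to sample before learning starts is set above via num_steps_sampled_before_learning_starts.
# Prioritized replay is selected by the buffer type above; the alpha, beta and eps values below tune it.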
prioritized_replay_alpha: 0.6
prioritized_replay_beta: 0.4
prioritized_replay_eps: 0.000001
lr: .0000625
adam_epsilon: .00015
min_sample_timesteps_per_iteration: 10000
model: {
# use_lstm: True,
grayscale: True,
dim: 128,
conv_filters: [
[16, [5, 5], 2],
[32, [5, 5], 2],
[64, [5, 5], 2],
[128, [5, 5], 2],
[256, [5, 5], 2],
[516, [4,4], 1],
]
}
Error
(CustomDQNTrainer pid=6443) 2023-03-20 21:56:46,363 ERROR algorithm.py:2604 -- Error in training or evaluation attempt! Trying to recover.
(CustomDQNTrainer pid=6443) Traceback (most recent call last):
(CustomDQNTrainer pid=6443) File "/home/daniel/anaconda3/envs/NewCarlaRlib/lib/python3.8/site-packages/ray/rllib/algorithms/algorithm.py", line 2947, in _run_one_training_iteration
(CustomDQNTrainer pid=6443) results = self.training_step()
(CustomDQNTrainer pid=6443) File "/home/daniel/anaconda3/envs/NewCarlaRlib/lib/python3.8/site-packages/ray/util/tracing/tracing_helper.py", line 466, in _resume_span
(CustomDQNTrainer pid=6443) return method(self, *_args, **_kwargs)
(CustomDQNTrainer pid=6443) File "/home/daniel/anaconda3/envs/NewCarlaRlib/lib/python3.8/site-packages/ray/rllib/algorithms/dqn/dqn.py", line 379, in training_step
(CustomDQNTrainer pid=6443) new_sample_batch = synchronous_parallel_sample(
(CustomDQNTrainer pid=6443) File "/home/daniel/anaconda3/envs/NewCarlaRlib/lib/python3.8/site-packages/ray/rllib/execution/rollout_ops.py", line 100, in synchronous_parallel_sample
(CustomDQNTrainer pid=6443) sample_batches = ray.get(
(CustomDQNTrainer pid=6443) File "/home/daniel/anaconda3/envs/NewCarlaRlib/lib/python3.8/site-packages/ray/_private/client_mode_hook.py", line 105, in wrapper
(CustomDQNTrainer pid=6443) return func(*args, **kwargs)
(CustomDQNTrainer pid=6443) File "/home/daniel/anaconda3/envs/NewCarlaRlib/lib/python3.8/site-packages/ray/_private/worker.py", line 2289, in get
(CustomDQNTrainer pid=6443) raise value.as_instanceof_cause()
(CustomDQNTrainer pid=6443) ray.exceptions.RayTaskError(RuntimeError): ray::RolloutWorker.sample() (pid=6501, ip=192.168.1.175, repr=<ray.rllib.evaluation.rollout_worker.RolloutWorker object at 0x7fdfbf731430>)
(CustomDQNTrainer pid=6443) File "/home/daniel/anaconda3/envs/NewCarlaRlib/lib/python3.8/site-packages/ray/rllib/evaluation/rollout_worker.py", line 828, in sample
(CustomDQNTrainer pid=6443) batches = [self.input_reader.next()]
(CustomDQNTrainer pid=6443) File "/home/daniel/anaconda3/envs/NewCarlaRlib/lib/python3.8/site-packages/ray/rllib/evaluation/sampler.py", line 92, in next
(CustomDQNTrainer pid=6443) batches = [self.get_data()]
(CustomDQNTrainer pid=6443) File "/home/daniel/anaconda3/envs/NewCarlaRlib/lib/python3.8/site-packages/ray/rllib/evaluation/sampler.py", line 285, in get_data
(CustomDQNTrainer pid=6443) item = next(self._env_runner)
(CustomDQNTrainer pid=6443) File "/home/daniel/anaconda3/envs/NewCarlaRlib/lib/python3.8/site-packages/ray/rllib/evaluation/sampler.py", line 721, in _env_runner
(CustomDQNTrainer pid=6443) base_env.send_actions(actions_to_send)
(CustomDQNTrainer pid=6443) File "/home/daniel/anaconda3/envs/NewCarlaRlib/lib/python3.8/site-packages/ray/rllib/env/vector_env.py", line 396, in send_actions
(CustomDQNTrainer pid=6443) ) = self.vector_env.vector_step(action_vector)
(CustomDQNTrainer pid=6443) File "/home/daniel/anaconda3/envs/NewCarlaRlib/lib/python3.8/site-packages/ray/rllib/env/vector_env.py", line 309, in vector_step
(CustomDQNTrainer pid=6443) raise e
(CustomDQNTrainer pid=6443) File "/home/daniel/anaconda3/envs/NewCarlaRlib/lib/python3.8/site-packages/ray/rllib/env/vector_env.py", line 302, in vector_step
(CustomDQNTrainer pid=6443) obs, r, done, info = self.envs[i].step(actions[i])
(CustomDQNTrainer pid=6443) File "/home/daniel/rllib-integration/rllib_integration/carla_env.py", line 96, in step
(CustomDQNTrainer pid=6443) sensor_data = self.core.tick(control)
(CustomDQNTrainer pid=6443) File "/home/daniel/rllib-integration/rllib_integration/carla_core.py", line 655, in tick
(CustomDQNTrainer pid=6443) self.world.tick()
(CustomDQNTrainer pid=6443) RuntimeError: time-out of 30000ms while waiting for the simulator, make sure the simulator is ready and connected to 192.168.1.113:2000
(CustomDQNTrainer pid=6443) 2023-03-20 21:56:46,403 ERROR worker_set.py:958 -- Worker 1 is faulty.
(CustomDQNTrainer pid=6443) Traceback (most recent call last):
(CustomDQNTrainer pid=6443) File "/home/daniel/anaconda3/envs/NewCarlaRlib/lib/python3.8/site-packages/ray/rllib/algorithms/algorithm.py", line 2947, in _run_one_training_iteration
(CustomDQNTrainer pid=6443) results = self.training_step()
(CustomDQNTrainer pid=6443) File "/home/daniel/anaconda3/envs/NewCarlaRlib/lib/python3.8/site-packages/ray/util/tracing/tracing_helper.py", line 466, in _resume_span
(CustomDQNTrainer pid=6443) return method(self, *_args, **_kwargs)
(CustomDQNTrainer pid=6443) File "/home/daniel/anaconda3/envs/NewCarlaRlib/lib/python3.8/site-packages/ray/rllib/algorithms/dqn/dqn.py", line 379, in training_step
(CustomDQNTrainer pid=6443) new_sample_batch = synchronous_parallel_sample(
(CustomDQNTrainer pid=6443) File "/home/daniel/anaconda3/envs/NewCarlaRlib/lib/python3.8/site-packages/ray/rllib/execution/rollout_ops.py", line 100, in synchronous_parallel_sample
(CustomDQNTrainer pid=6443) sample_batches = ray.get(
(CustomDQNTrainer pid=6443) File "/home/daniel/anaconda3/envs/NewCarlaRlib/lib/python3.8/site-packages/ray/_private/client_mode_hook.py", line 105, in wrapper
(CustomDQNTrainer pid=6443) return func(*args, **kwargs)
(CustomDQNTrainer pid=6443) File "/home/daniel/anaconda3/envs/NewCarlaRlib/lib/python3.8/site-packages/ray/_private/worker.py", line 2289, in get
(CustomDQNTrainer pid=6443) raise value.as_instanceof_cause()
(CustomDQNTrainer pid=6443) ray.exceptions.RayTaskError(RuntimeError): ray::RolloutWorker.sample() (pid=6501, ip=192.168.1.175, repr=<ray.rllib.evaluation.rollout_worker.RolloutWorker object at 0x7fdfbf731430>)
(CustomDQNTrainer pid=6443) File "/home/daniel/anaconda3/envs/NewCarlaRlib/lib/python3.8/site-packages/ray/rllib/evaluation/rollout_worker.py", line 828, in sample
(CustomDQNTrainer pid=6443) batches = [self.input_reader.next()]
(CustomDQNTrainer pid=6443) File "/home/daniel/anaconda3/envs/NewCarlaRlib/lib/python3.8/site-packages/ray/rllib/evaluation/sampler.py", line 92, in next
(CustomDQNTrainer pid=6443) batches = [self.get_data()]
(CustomDQNTrainer pid=6443) File "/home/daniel/anaconda3/envs/NewCarlaRlib/lib/python3.8/site-packages/ray/rllib/evaluation/sampler.py", line 285, in get_data
(CustomDQNTrainer pid=6443) item = next(self._env_runner)
(CustomDQNTrainer pid=6443) File "/home/daniel/anaconda3/envs/NewCarlaRlib/lib/python3.8/site-packages/ray/rllib/evaluation/sampler.py", line 721, in _env_runner
(CustomDQNTrainer pid=6443) base_env.send_actions(actions_to_send)
(CustomDQNTrainer pid=6443) File "/home/daniel/anaconda3/envs/NewCarlaRlib/lib/python3.8/site-packages/ray/rllib/env/vector_env.py", line 396, in send_actions
(CustomDQNTrainer pid=6443) ) = self.vector_env.vector_step(action_vector)
(CustomDQNTrainer pid=6443) File "/home/daniel/anaconda3/envs/NewCarlaRlib/lib/python3.8/site-packages/ray/rllib/env/vector_env.py", line 309, in vector_step
(CustomDQNTrainer pid=6443) raise e
(CustomDQNTrainer pid=6443) File "/home/daniel/anaconda3/envs/NewCarlaRlib/lib/python3.8/site-packages/ray/rllib/env/vector_env.py", line 302, in vector_step
(CustomDQNTrainer pid=6443) obs, r, done, info = self.envs[i].step(actions[i])
(CustomDQNTrainer pid=6443) File "/home/daniel/rllib-integration/rllib_integration/carla_env.py", line 96, in step
(CustomDQNTrainer pid=6443) sensor_data = self.core.tick(control)
(CustomDQNTrainer pid=6443) File "/home/daniel/rllib-integration/rllib_integration/carla_core.py", line 655, in tick
(CustomDQNTrainer pid=6443) self.world.tick()
(CustomDQNTrainer pid=6443) RuntimeError: time-out of 30000ms while waiting for the simulator, make sure the simulator is ready and connected to 192.168.1.113:2000
(CustomDQNTrainer pid=6443)
(CustomDQNTrainer pid=6443) During handling of the above exception, another exception occurred:
(CustomDQNTrainer pid=6443)
(CustomDQNTrainer pid=6443) Traceback (most recent call last):
(CustomDQNTrainer pid=6443) File "/home/daniel/anaconda3/envs/NewCarlaRlib/lib/python3.8/site-packages/ray/rllib/evaluation/worker_set.py", line 955, in _worker_health_check
(CustomDQNTrainer pid=6443) ray.get(obj_ref)
(CustomDQNTrainer pid=6443) File "/home/daniel/anaconda3/envs/NewCarlaRlib/lib/python3.8/site-packages/ray/_private/client_mode_hook.py", line 105, in wrapper
(CustomDQNTrainer pid=6443) return func(*args, **kwargs)
(CustomDQNTrainer pid=6443) File "/home/daniel/anaconda3/envs/NewCarlaRlib/lib/python3.8/site-packages/ray/_private/worker.py", line 2289, in get
(CustomDQNTrainer pid=6443) raise value.as_instanceof_cause()
(CustomDQNTrainer pid=6443) ray.exceptions.RayTaskError(StopIteration): ray::RolloutWorker.sample_with_count() (pid=6501, ip=192.168.1.175, repr=<ray.rllib.evaluation.rollout_worker.RolloutWorker object at 0x7fdfbf731430>)
(CustomDQNTrainer pid=6443) File "/home/daniel/anaconda3/envs/NewCarlaRlib/lib/python3.8/site-packages/ray/rllib/evaluation/rollout_worker.py", line 888, in sample_with_count
(CustomDQNTrainer pid=6443) batch = self.sample()
(CustomDQNTrainer pid=6443) File "/home/daniel/anaconda3/envs/NewCarlaRlib/lib/python3.8/site-packages/ray/rllib/evaluation/rollout_worker.py", line 828, in sample
(CustomDQNTrainer pid=6443) batches = [self.input_reader.next()]
(CustomDQNTrainer pid=6443) File "/home/daniel/anaconda3/envs/NewCarlaRlib/lib/python3.8/site-packages/ray/rllib/evaluation/sampler.py", line 92, in next
(CustomDQNTrainer pid=6443) batches = [self.get_data()]
(CustomDQNTrainer pid=6443) File "/home/daniel/anaconda3/envs/NewCarlaRlib/lib/python3.8/site-packages/ray/rllib/evaluation/sampler.py", line 285, in get_data
(CustomDQNTrainer pid=6443) item = next(self._env_runner)
(CustomDQNTrainer pid=6443) StopIteration
(CustomDQNTrainer pid=6443) 2023-03-20 21:56:46,404 ERROR worker.py:400 -- Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): ray::RolloutWorker.sample_with_count() (pid=6501, ip=192.168.1.175, repr=<ray.rllib.evaluation.rollout_worker.RolloutWorker object at 0x7fdfbf731430>)
(CustomDQNTrainer pid=6443) File "/home/daniel/anaconda3/envs/NewCarlaRlib/lib/python3.8/site-packages/ray/rllib/evaluation/rollout_worker.py", line 888, in sample_with_count
(CustomDQNTrainer pid=6443) batch = self.sample()
(CustomDQNTrainer pid=6443) File "/home/daniel/anaconda3/envs/NewCarlaRlib/lib/python3.8/site-packages/ray/rllib/evaluation/rollout_worker.py", line 828, in sample
(CustomDQNTrainer pid=6443) batches = [self.input_reader.next()]
(CustomDQNTrainer pid=6443) File "/home/daniel/anaconda3/envs/NewCarlaRlib/lib/python3.8/site-packages/ray/rllib/evaluation/sampler.py", line 92, in next
(CustomDQNTrainer pid=6443) batches = [self.get_data()]
(CustomDQNTrainer pid=6443) File "/home/daniel/anaconda3/envs/NewCarlaRlib/lib/python3.8/site-packages/ray/rllib/evaluation/sampler.py", line 285, in get_data
(CustomDQNTrainer pid=6443) item = next(self._env_runner)
(CustomDQNTrainer pid=6443) StopIteration