I’m using Ray/RLlib 1.2.0 and I’m hitting the following error after 100 training iterations:
Failure # 1 (occurred at 2021-03-31_11-10-22)
Traceback (most recent call last):
File "/opt/miniconda/lib/python3.7/site-packages/ray/tune/trial_runner.py", line 586, in _process_trial
results = self.trial_executor.fetch_result(trial)
File "/opt/miniconda/lib/python3.7/site-packages/ray/tune/ray_trial_executor.py", line 609, in fetch_result
result = ray.get(trial_future[0], timeout=DEFAULT_GET_TIMEOUT)
File "/opt/miniconda/lib/python3.7/site-packages/ray/_private/client_mode_hook.py", line 47, in wrapper
return func(*args, **kwargs)
File "/opt/miniconda/lib/python3.7/site-packages/ray/worker.py", line 1456, in get
raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(AssertionError): ray::PPO.train_buffered() (pid=264, ip=10.1.0.8)
File "python/ray/_raylet.pyx", line 480, in ray._raylet.execute_task
File "python/ray/_raylet.pyx", line 432, in ray._raylet.execute_task.function_executor
File "/opt/miniconda/lib/python3.7/site-packages/ray/tune/trainable.py", line 167, in train_buffered
result = self.train()
File "/opt/miniconda/lib/python3.7/site-packages/ray/rllib/agents/trainer.py", line 526, in train
raise e
File "/opt/miniconda/lib/python3.7/site-packages/ray/rllib/agents/trainer.py", line 515, in train
result = Trainable.train(self)
File "/opt/miniconda/lib/python3.7/site-packages/ray/tune/trainable.py", line 226, in train
result = self.step()
File "/opt/miniconda/lib/python3.7/site-packages/ray/rllib/agents/trainer_template.py", line 157, in step
evaluation_metrics = self._evaluate()
File "/opt/miniconda/lib/python3.7/site-packages/ray/rllib/agents/trainer.py", line 778, in _evaluate
for w in self.evaluation_workers.remote_workers()
File "/opt/miniconda/lib/python3.7/site-packages/ray/_private/client_mode_hook.py", line 47, in wrapper
return func(*args, **kwargs)
ray.exceptions.RayTaskError(AssertionError): ray::RolloutWorker.sample() (pid=375, ip=10.1.0.8)
File "python/ray/_raylet.pyx", line 480, in ray._raylet.execute_task
File "python/ray/_raylet.pyx", line 432, in ray._raylet.execute_task.function_executor
File "/opt/miniconda/lib/python3.7/site-packages/ray/rllib/evaluation/rollout_worker.py", line 662, in sample
batches = [self.input_reader.next()]
File "/opt/miniconda/lib/python3.7/site-packages/ray/rllib/evaluation/sampler.py", line 95, in next
batches = [self.get_data()]
File "/opt/miniconda/lib/python3.7/site-packages/ray/rllib/evaluation/sampler.py", line 224, in get_data
item = next(self.rollout_provider)
File "/opt/miniconda/lib/python3.7/site-packages/ray/rllib/evaluation/sampler.py", line 620, in _env_runner
sample_collector=sample_collector,
File "/opt/miniconda/lib/python3.7/site-packages/ray/rllib/evaluation/sampler.py", line 1198, in _process_observations_w_trajectory_view_api
new_episode.length - 1, filtered_obs)
File "/opt/miniconda/lib/python3.7/site-packages/ray/rllib/evaluation/collectors/simple_list_collector.py", line 487, in add_init_obs
assert agent_key not in self.agent_collectors
AssertionError
There is nothing very unusual going on in my code, except for a few customizations (model, env, action distribution). I’ve checked memory usage and it is mostly below 20%. Does anyone know what could be causing this? Should I open a GitHub issue?