Hi, I got a problem with ray.get.
I tried to do my best but don’t know how to solve it.
This is my first question so there may be some insufficient explanation for my situation.
import ray
import gym
import time
from queue import Queue
@ray.remote
class SharedStorage:
def __init__(self):
self.storage = Queue()
def append(self, item):
for i in item:
self.storage.put(i)
def get(self):
return [self.storage.get() for _ in range(self.storage.qsize())]
@ray.remote
class Actor:
def __init__(self, shared_storage):
self.shared_storage = shared_storage
def actor(self):
env = gym.make('MsPacman-v0')
state = env.reset()
print(env.observation_space.shape)
for i in range(1000000):
action = env.action_space.sample()
state, reward, done, _ = env.step(action)
self.shared_storage.append.remote([[state, reward, done, state]])
if done:
state = env.reset()
class Runner:
def __init__(self):
ray.init()
self.num = 0
self.buffer = []
self.shared_storage = SharedStorage.remote()
self.actors = [Actor.remote(self.shared_storage) for _ in range(1)]
def learner(self):
[actor.actor.remote() for actor in self.actors]
for i in range(100000000):
storage = ray.get(self.shared_storage.get.remote())
for t in storage:
self.buffer.append(t)
print(len(self.buffer))
ray.shutdown()
runner = Runner()
runner.learner()
My project code has the same structure with this code, with the same error when using visual observation.
It works well in low-dimension vector observation, but in visual observation, error like below occurs.
Traceback (most recent call last):
File "C:/Users/iv112/PycharmProjects/rlrobot/test.py", line 64, in <module>
runner.learner()
File "C:/Users/iv112/PycharmProjects/rlrobot/test.py", line 52, in learner
storage = ray.get(self.shared_storage.get.remote())
File "C:\Users\iv112\Anaconda3\envs\RL\lib\site-packages\ray\_private\client_mode_hook.py", line 89, in wrapper
return func(*args, **kwargs)
File "C:\Users\iv112\Anaconda3\envs\RL\lib\site-packages\ray\worker.py", line 1621, in get
raise value.as_instanceof_cause()
ray.exceptions.RayTaskError: ray::SharedStorage.get() (pid=13648, ip=172.30.1.27, repr=<test.SharedStorage object at 0x0000026C5DB64EE0>)
File "python\ray\_raylet.pyx", line 614, in ray._raylet.execute_task
File "python\ray\_raylet.pyx", line 615, in ray._raylet.execute_task
File "python\ray\_raylet.pyx", line 1785, in ray._raylet.CoreWorker.store_task_outputs
File "python\ray\_raylet.pyx", line 162, in ray._raylet.check_status
ray.exceptions.RaySystemError: System error: Unknown error
(pid=None) [2021-10-09 00:29:55,194 C 5756 4052] dlmalloc.cc:121: Check failed: *handle != nullptr CreateFileMapping() failed. GetLastError() = 1450
(pid=None) *** StackTrace Information ***
(pid=None)
(pid=13648) Traceback (most recent call last):
(pid=13648) File "python\ray\_raylet.pyx", line 614, in ray._raylet.execute_task
(pid=13648) File "python\ray\_raylet.pyx", line 615, in ray._raylet.execute_task
(pid=13648) File "python\ray\_raylet.pyx", line 1785, in ray._raylet.CoreWorker.store_task_outputs
(pid=13648) File "python\ray\_raylet.pyx", line 162, in ray._raylet.check_status
(pid=13648) ray.exceptions.RaySystemError: System error: Unknown error
(pid=13648)
(pid=13648) During handling of the above exception, another exception occurred:
(pid=13648)
(pid=13648) Traceback (most recent call last):
(pid=13648) File "python\ray\_raylet.pyx", line 684, in ray._raylet.task_execution_handler
(pid=13648) File "python\ray\_raylet.pyx", line 524, in ray._raylet.execute_task
(pid=13648) File "python\ray\_raylet.pyx", line 642, in ray._raylet.execute_task
(pid=13648) File "C:\Users\iv112\Anaconda3\envs\RL\lib\site-packages\ray\_private\utils.py", line 110, in push_error_to_driver
(pid=13648) worker.core_worker.push_error(job_id, error_type, message, time.time())
(pid=13648) File "python\ray\_raylet.pyx", line 1902, in ray._raylet.CoreWorker.push_error
(pid=13648) File "python\ray\_raylet.pyx", line 162, in ray._raylet.check_status
(pid=13648) ray.exceptions.RaySystemError: System error: Unknown error
(pid=13648)
(pid=13648) During handling of the above exception, another exception occurred:
(pid=13648)
(pid=13648) Traceback (most recent call last):
(pid=13648) File "python\ray\_raylet.pyx", line 706, in ray._raylet.task_execution_handler
(pid=13648) File "C:\Users\iv112\Anaconda3\envs\RL\lib\site-packages\ray\_private\utils.py", line 110, in push_error_to_driver
(pid=13648) worker.core_worker.push_error(job_id, error_type, message, time.time())
(pid=13648) File "python\ray\_raylet.pyx", line 1902, in ray._raylet.CoreWorker.push_error
(pid=13648) File "python\ray\_raylet.pyx", line 162, in ray._raylet.check_status
(pid=13648) ray.exceptions.RaySystemError: System error: Unknown error
(pid=13648) Exception ignored in: 'ray._raylet.task_execution_handler'
(pid=13648) Traceback (most recent call last):
(pid=13648) File "python\ray\_raylet.pyx", line 706, in ray._raylet.task_execution_handler
(pid=13648) File "C:\Users\iv112\Anaconda3\envs\RL\lib\site-packages\ray\_private\utils.py", line 110, in push_error_to_driver
(pid=13648) worker.core_worker.push_error(job_id, error_type, message, time.time())
(pid=13648) File "python\ray\_raylet.pyx", line 1902, in ray._raylet.CoreWorker.push_error
(pid=13648) File "python\ray\_raylet.pyx", line 162, in ray._raylet.check_status
(pid=13648) ray.exceptions.RaySystemError: System error: Unknown error
(pid=13648) [2021-10-09 00:29:55,216 C 13648 12736] core_worker.cc:2170: Check failed: _s.ok() Bad status: IOError: Unknown error
(pid=13648) *** StackTrace Information ***
(pid=13648)
(pid=13648) Windows fatal exception: access violation
(pid=13648)
(pid=13648) Stack (most recent call first):
(pid=13648) File "C:\Users\iv112\Anaconda3\envs\RL\lib\site-packages\ray\worker.py", line 428 in main_loop
(pid=13648) File "C:\Users\iv112\Anaconda3\envs\RL\lib\site-packages\ray\workers/default_worker.py", line 212 in <module>
[2021-10-09 00:29:55,713 E 4016 14232] raylet_client.cc:159: IOError: Unknown error [RayletClient] Failed to disconnect from raylet.
Process finished with exit code 1
I found this error occurs when i get data from ray.get and append to the list at the learner function.
Program works well for a few minutes, but when it reaches to some point, program crashes.
The error does not happen when i use ray.put before adding to the shared memory at the actor function and make it to ray reference object.
But when i use ray.get to the reference object i got, and append it to the list, that error occurs again.
I thought ray.get just returns ray reference object’s content, but it seems not.
I want to know how this happens and how to solve it.
Thank you for reading this question.