Hi all, Ray does not seem to store the Dask graph nodes in the object store
under the /tmp folder.
I get this error:
ray.exceptions.RayTaskError(GraphComponentException): ray::dask:train_MemoizationPolicy0 (pid=3186, ip=10.2.0.5)
File "/home/azureuser/bot/Raysa-Rasa/rasa/core/policies/memoization.py", line 184, in train
self.persist()
File "/home/azureuser/bot/Raysa-Rasa/rasa/core/policies/memoization.py", line 269, in persist
with self._model_storage.write_to(self._resource) as path:
File "/home/azureuser/miniconda3/envs/raysa_env/lib/python3.7/contextlib.py", line 112, in __enter__
return next(self.gen)
File "/home/azureuser/bot/Raysa-Rasa/rasa/engine/storage/local_model_storage.py", line 121, in write_to
directory.mkdir()
File "/home/azureuser/miniconda3/envs/raysa_env/lib/python3.7/pathlib.py", line 1273, in mkdir
self._accessor.mkdir(self, mode)
FileNotFoundError: [Errno 2] No such file or directory: '/tmp/tmphz8rz95c/train_MemoizationPolicy0'
This is the try/except block in pathlib's mkdir that raises the exception:
try:
self._accessor.mkdir(self, mode)
except FileNotFoundError:
if not parents or self.parent == self:
raise
self.parent.mkdir(parents=True, exist_ok=True)
self.mkdir(mode, parents=False, exist_ok=exist_ok)
except OSError:
# Cannot rely on checking for EEXIST, since the operating system
# could give priority to other errors like EACCES or EROFS
if not exist_ok or not self.is_dir():
raise
Here is some information about the nodes in the cluster:
2022-02-01 23:45:17,037 INFO worker.py:843 -- Connecting to existing Ray cluster at address: 10.2.0.4:6379
[{'Alive': True,
'MetricsExportPort': 58445,
'NodeID': '95b0266e9fd012b5234ab9e53dc9d6091e4a349bdf90dc14fdc8294f',
'NodeManagerAddress': '10.2.0.4',
'NodeManagerHostname': 'masterVM',
'NodeManagerPort': 43793,
'ObjectManagerPort': 38789,
'ObjectStoreSocketName': '/tmp/ray/session_2022-02-01_23-23-35_116706_6075/sockets/plasma_store',
'RayletSocketName': '/tmp/ray/session_2022-02-01_23-23-35_116706_6075/sockets/raylet',
'Resources': {'CPU': 2.0,
'memory': 2335265588.0,
'node:10.2.0.4': 1.0,
'object_store_memory': 1167632793.0},
'alive': True},
{'Alive': True,
'MetricsExportPort': 62246,
'NodeID': 'd9a732ff8be4f26aa014a97c67eb5caeb2f2970c219a0ffdad43750c',
'NodeManagerAddress': '10.2.0.6',
'NodeManagerHostname': 'worker2VM',
'NodeManagerPort': 41731,
'ObjectManagerPort': 35951,
'ObjectStoreSocketName': '/tmp/ray/session_2022-02-01_23-23-35_116706_6075/sockets/plasma_store',
'RayletSocketName': '/tmp/ray/session_2022-02-01_23-23-35_116706_6075/sockets/raylet',
'Resources': {'CPU': 2.0,
'memory': 5665873920.0,
'node:10.2.0.6': 1.0,
'object_store_memory': 2428231680.0},
'alive': True},
{'Alive': True,
'MetricsExportPort': 41000,
'NodeID': '15ffd200525394763500c3ae9448a8c4afdea178b7b81387259b7844',
'NodeManagerAddress': '10.2.0.5',
'NodeManagerHostname': 'worker1VM',
'NodeManagerPort': 35321,
'ObjectManagerPort': 39995,
'ObjectStoreSocketName': '/tmp/ray/session_2022-02-01_23-23-35_116706_6075/sockets/plasma_store',
'RayletSocketName': '/tmp/ray/session_2022-02-01_23-23-35_116706_6075/sockets/raylet',
'Resources': {'CPU': 2.0,
'memory': 2723662234.0,
'node:10.2.0.5': 1.0,
'object_store_memory': 1167283814.0},
'alive': True}]
What am I missing here?