How severely does this issue affect your experience of using Ray?
- High: It blocks me from completing my task.
I’ve finally managed to get Ray running on our Slurm cluster, and started about 90 experiments using `tune.run()`. They ran fine for about 22 hours before the entire Ray cluster crashed. The only error message I can see is in the console output of the head node; there is nothing in the logs on either the head node or the worker that seems to have caused this. Any idea what’s going on?
The only thing I could find that might be related is this GitHub issue, but even that looks like it might not be the same error: [2.0rc1][nightly-test] long_running_distributed_pytorch_pbt_failure failed · Issue #27709 · ray-project/ray · GitHub
Traceback (most recent call last):
File ".../main.py", line 499, in <module>
main(args, args.num_cpus, group=args.experiment_group, name=args.experiment_name, ray_local_mode=args.ray_local_mode)
File ".../main.py", line 475, in main
tune.run(experiments, callbacks=callbacks, raise_on_failed_trial=False)
File ".../lib/python3.9/site-packages/ray/tune/tune.py", line 427, in run
return ray.get(remote_future)
File ".../lib/python3.9/site-packages/ray/_private/client_mode_hook.py", line 104, in wrapper
return getattr(ray, func.__name__)(*args, **kwargs)
File ".../lib/python3.9/site-packages/ray/util/client/api.py", line 42, in get
return self.worker.get(vals, timeout=timeout)
File ".../lib/python3.9/site-packages/ray/util/client/worker.py", line 434, in get
res = self._get(to_get, op_timeout)
File ".../lib/python3.9/site-packages/ray/util/client/worker.py", line 462, in _get
raise err
types.RayTaskError(TuneError): ray::run() (pid=42004, ip=10.31.143.135)
File ".../lib/python3.9/site-packages/ray/tune/execution/trial_runner.py", line 964, in _on_training_result
self._process_trial_results(trial, result)
File ".../lib/python3.9/site-packages/ray/tune/execution/trial_runner.py", line 1048, in _process_trial_results
decision = self._process_trial_result(trial, result)
File ".../lib/python3.9/site-packages/ray/tune/execution/trial_runner.py", line 1103, in _process_trial_result
self._callbacks.on_trial_result(
File ".../lib/python3.9/site-packages/ray/tune/callback.py", line 329, in on_trial_result
callback.on_trial_result(**info)
File ".../lib/python3.9/site-packages/ray/tune/syncer.py", line 529, in on_trial_result
self._sync_trial_dir(trial, force=False, wait=False)
File ".../lib/python3.9/site-packages/ray/tune/syncer.py", line 494, in _sync_trial_dir
sync_process.wait()
File ".../lib/python3.9/site-packages/ray/tune/syncer.py", line 127, in wait
raise exception
File ".../lib/python3.9/site-packages/ray/tune/syncer.py", line 108, in entrypoint
result = self._fn(*args, **kwargs)
File ".../lib/python3.9/site-packages/ray/tune/utils/file_transfer.py", line 64, in sync_dir_between_nodes
return _sync_dir_between_different_nodes(
File ".../lib/python3.9/site-packages/ray/tune/utils/file_transfer.py", line 176, in _sync_dir_between_different_nodes
return ray.get(unpack_future)
ray.exceptions.RayTaskError: ray::_unpack_from_actor() (pid=256724, ip=10.31.143.135)
File ".../lib/python3.9/site-packages/ray/tune/utils/file_transfer.py", line 393, in _unpack_from_actor
for buffer in _iter_remote(pack_actor):
File ".../lib/python3.9/site-packages/ray/tune/utils/file_transfer.py", line 354, in _iter_remote
buffer = ray.get(actor.next.remote())
ray.exceptions.RayActorError: The actor died because of an error raised in its creation task, ray::_PackActor.__init__() (pid=243457, ip=10.31.141.53, repr=<ray.tune.utils.file_transfer._PackActor object at 0x2bb728e38c70>)
File ".../lib/python3.9/site-packages/ray/tune/utils/file_transfer.py", line 314, in __init__
self.stream = _pack_dir(source_dir=source_dir, files_stats=files_stats)
File ".../lib/python3.9/site-packages/ray/tune/utils/file_transfer.py", line 278, in _pack_dir
tar.add(os.path.join(source_dir, key), arcname=key)
File ".../lib/python3.9/tarfile.py", line 1988, in add
self.addfile(tarinfo, f)
File ".../lib/python3.9/tarfile.py", line 2016, in addfile
copyfileobj(fileobj, self.fileobj, tarinfo.size, bufsize=bufsize)
File ".../lib/python3.9/tarfile.py", line 249, in copyfileobj
raise exception("unexpected end of data")
OSError: unexpected end of data
During handling of the above exception, another exception occurred:
ray::run() (pid=42004, ip=10.31.143.135)
File ".../lib/python3.9/site-packages/ray/tune/tune.py", line 722, in run
runner.step()
File ".../lib/python3.9/site-packages/ray/tune/execution/trial_runner.py", line 872, in step
self._wait_and_handle_event(next_trial)
File ".../lib/python3.9/site-packages/ray/tune/execution/trial_runner.py", line 851, in _wait_and_handle_event
raise TuneError(traceback.format_exc())
ray.tune.error.TuneError: Traceback (most recent call last):
File ".../lib/python3.9/site-packages/ray/tune/execution/trial_runner.py", line 839, in _wait_and_handle_event
self._on_training_result(
File ".../lib/python3.9/site-packages/ray/tune/execution/trial_runner.py", line 964, in _on_training_result
self._process_trial_results(trial, result)
File ".../lib/python3.9/site-packages/ray/tune/execution/trial_runner.py", line 1048, in _process_trial_results
decision = self._process_trial_result(trial, result)
File ".../lib/python3.9/site-packages/ray/tune/execution/trial_runner.py", line 1103, in _process_trial_result
self._callbacks.on_trial_result(
File ".../lib/python3.9/site-packages/ray/tune/callback.py", line 329, in on_trial_result
callback.on_trial_result(**info)
File ".../lib/python3.9/site-packages/ray/tune/syncer.py", line 529, in on_trial_result
self._sync_trial_dir(trial, force=False, wait=False)
File ".../lib/python3.9/site-packages/ray/tune/syncer.py", line 494, in _sync_trial_dir
sync_process.wait()
File ".../lib/python3.9/site-packages/ray/tune/syncer.py", line 127, in wait
raise exception
File ".../lib/python3.9/site-packages/ray/tune/syncer.py", line 108, in entrypoint
result = self._fn(*args, **kwargs)
File ".../lib/python3.9/site-packages/ray/tune/utils/file_transfer.py", line 64, in sync_dir_between_nodes
return _sync_dir_between_different_nodes(
File ".../lib/python3.9/site-packages/ray/tune/utils/file_transfer.py", line 176, in _sync_dir_between_different_nodes
return ray.get(unpack_future)
ray.exceptions.RayTaskError: ray::_unpack_from_actor() (pid=256724, ip=10.31.143.135)
File ".../lib/python3.9/site-packages/ray/tune/utils/file_transfer.py", line 393, in _unpack_from_actor
for buffer in _iter_remote(pack_actor):
File ".../lib/python3.9/site-packages/ray/tune/utils/file_transfer.py", line 354, in _iter_remote
buffer = ray.get(actor.next.remote())
ray.exceptions.RayActorError: The actor died because of an error raised in its creation task, ray::_PackActor.__init__() (pid=243457, ip=10.31.141.53, repr=<ray.tune.utils.file_transfer._PackActor object at 0x2bb728e38c70>)
File ".../lib/python3.9/site-packages/ray/tune/utils/file_transfer.py", line 314, in __init__
self.stream = _pack_dir(source_dir=source_dir, files_stats=files_stats)
File ".../lib/python3.9/site-packages/ray/tune/utils/file_transfer.py", line 278, in _pack_dir
tar.add(os.path.join(source_dir, key), arcname=key)
File ".../lib/python3.9/tarfile.py", line 1988, in add
self.addfile(tarinfo, f)
File ".../lib/python3.9/tarfile.py", line 2016, in addfile
copyfileobj(fileobj, self.fileobj, tarinfo.size, bufsize=bufsize)
File ".../lib/python3.9/tarfile.py", line 249, in copyfileobj
raise exception("unexpected end of data")
OSError: unexpected end of data