I’m seeing this new error with ray==1.12. The same code worked fine with ray==1.11. Full traceback:
---------------------------------------------------------------------------
RayTaskError Traceback (most recent call last)
/tmp/ipykernel_191/224503594.py in run_trainer(trainer, train_fn, train_config, hdfs_config, **run_kwargs)
16
---> 18 results = trainer.run(my_fn, train_config, **run_kwargs)
19
~/.local/lib/python3.7/site-packages/ray/train/trainer.py in run(self, train_func, config, callbacks, dataset, checkpoint, checkpoint_strategy)
330 checkpoint=checkpoint,
331 checkpoint_strategy=checkpoint_strategy,
--> 332 run_dir=self.latest_run_dir,
333 )
334 for intermediate_result in iterator:
~/.local/lib/python3.7/site-packages/ray/train/trainer.py in __init__(self, backend_executor, backend_config, train_func, dataset, checkpoint_manager, checkpoint, checkpoint_strategy, run_dir)
653 dataset=dataset,
654 checkpoint=checkpoint,
--> 655 checkpoint_strategy=checkpoint_strategy,
656 )
657
~/.local/lib/python3.7/site-packages/ray/train/trainer.py in _start_training(self, train_func, run_dir, dataset, checkpoint, checkpoint_strategy, latest_checkpoint_id)
678 checkpoint_dict = self._checkpoint_manager._load_checkpoint(checkpoint)
679 self._run_with_error_handling(
--> 680 lambda: self._backend_executor.start_training(
681 train_func=train_func, dataset=dataset, checkpoint=checkpoint_dict
682 )
~/.local/lib/python3.7/site-packages/ray/train/trainer.py in _run_with_error_handling(self, func)
685 def _run_with_error_handling(self, func: Callable):
686 try:
--> 687 return func()
688 except TrainingWorkerError:
689 # Workers have already been restarted.
~/.local/lib/python3.7/site-packages/ray/train/trainer.py in <lambda>()
679 self._run_with_error_handling(
680 lambda: self._backend_executor.start_training(
--> 681 train_func=train_func, dataset=dataset, checkpoint=checkpoint_dict
682 )
683 )
~/.local/lib/python3.7/site-packages/ray/train/utils.py in <lambda>(*args, **kwargs)
171 # actor.
172 actor_method = getattr(self.actor, item)
--> 173 return lambda *args, **kwargs: ray.get(actor_method.remote(*args, **kwargs))
~/.local/lib/python3.7/site-packages/ray/_private/client_mode_hook.py in wrapper(*args, **kwargs)
103 if func.__name__ != "init" or is_client_mode_enabled_by_default:
104 return getattr(ray, func.__name__)(*args, **kwargs)
--> 105 return func(*args, **kwargs)
106
107 return wrapper
~/.local/lib/python3.7/site-packages/ray/worker.py in get(object_refs, timeout)
1807 worker.core_worker.dump_object_store_memory_usage()
1808 if isinstance(value, RayTaskError):
-> 1809 raise value.as_instanceof_cause()
1810 else:
1811 raise value
RayTaskError: ray::BackendExecutor.start_training() (pid=1040, ip=100.96.160.118, repr=<ray.train.backend.BackendExecutor object at 0x7fd0f8158110>)
At least one of the input arguments for this task could not be computed:
ray.exceptions.RaySystemError: System error: maximum recursion depth exceeded while calling a Python object
traceback: Traceback (most recent call last):
File "/home/jobuser/.local/lib/python3.7/site-packages/ray/serialization.py", line 332, in deserialize_objects
obj = self._deserialize_object(data, metadata, object_ref)
File "/home/jobuser/.local/lib/python3.7/site-packages/ray/serialization.py", line 235, in _deserialize_object
return self._deserialize_msgpack_data(data, metadata_fields)
File "/home/jobuser/.local/lib/python3.7/site-packages/ray/serialization.py", line 190, in _deserialize_msgpack_data
python_objects = self._deserialize_pickle5_data(pickle5_data)
File "/home/jobuser/.local/lib/python3.7/site-packages/ray/serialization.py", line 180, in _deserialize_pickle5_data
obj = pickle.loads(in_band)
File "/home/jobuser/.local/lib/python3.7/site-packages/ray/train/utils.py", line 172, in __getattr__
actor_method = getattr(self.actor, item)
File "/home/jobuser/.local/lib/python3.7/site-packages/ray/train/utils.py", line 172, in __getattr__
actor_method = getattr(self.actor, item)
File "/home/jobuser/.local/lib/python3.7/site-packages/ray/train/utils.py", line 172, in __getattr__
actor_method = getattr(self.actor, item)
[Previous line repeated 492 more times]
RecursionError: maximum recursion depth exceeded while calling a Python object