Hi, I have developed a distributed NLP framework using RaySGD. It worked well, but after several commits the code started to crash with the error below (a simplified sketch of my trainer setup follows the traceback):
Traceback (most recent call last):
  File "rayrtc.py", line 760, in <module>
    ModelTrainer(spec_manma2021)()
  File "/opt/tiger/runner/runner_lite/runner_lite/op/base.py", line 292, in __call__
    res = self.run(*args, **kwargs)
  File "rayrtc.py", line 626, in run
    'eval_batch_size': self.getc('evaluate.batch_size'),
  File "rayrtc.py", line 463, in run_train_purely
    model_train_result = ModelTrainerRunner()
  File "/opt/tiger/runner/runner_lite/runner_lite/op/base.py", line 292, in __call__
    res = self.run(*args, **kwargs)
  File "/opt/tiger/runner/runner_lite/runner_lite/op/train_model_simply.py", line 209, in run
    model_train_result = ModelTrainerRunner()
  File "/opt/tiger/runner/runner_lite/runner_lite/op/base.py", line 292, in __call__
    res = self.run(*args, **kwargs)
  File "/opt/tiger/runner/runner_lite/runner_lite/op/ptx_v1/train.py", line 60, in run
    return fit(option)
  File "/opt/tiger/runner/rtc/rtc/fit.py", line 152, in fit
    metric = learner.fit()
  File "/opt/tiger/runner/ptx/ptx/fit.py", line 1992, in fit
    train_metrics = trainer.train() # this returns
  File "/data00/jialin.liu/.local/lib/python3.7/site-packages/ray/util/sgd/torch/torch_trainer.py", line 415, in train
    num_steps=num_steps, profile=profile, info=info, dataset=dataset)
  File "/data00/jialin.liu/.local/lib/python3.7/site-packages/ray/util/sgd/torch/worker_group.py", line 325, in train
    success = check_for_failure(remote_worker_stats)
  File "/data00/jialin.liu/.local/lib/python3.7/site-packages/ray/util/sgd/utils.py", line 244, in check_for_failure
    finished = ray.get(finished)
  File "/data00/jialin.liu/.local/lib/python3.7/site-packages/ray/_private/client_mode_hook.py", line 47, in wrapper
    return func(*args, **kwargs)
  File "/data00/jialin.liu/.local/lib/python3.7/site-packages/ray/worker.py", line 1456, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(RuntimeError): ray::DistributedTorchRunner.train_epoch() (pid=303, ip=10.206.77.137)
  File "python/ray/_raylet.pyx", line 480, in ray._raylet.execute_task
  File "python/ray/_raylet.pyx", line 432, in ray._raylet.execute_task.function_executor
  File "/data00/jialin.liu/.local/lib/python3.7/site-packages/ray/util/sgd/torch/distributed_torch_runner.py", line 112, in train_epoch
    num_steps=num_steps, profile=profile, info=info, iterator=iterator)
  File "/data00/jialin.liu/.local/lib/python3.7/site-packages/ray/util/sgd/torch/torch_runner.py", line 140, in train_epoch
    train_stats = self.training_operator.train_epoch(iterator, info)
  File "/opt/tiger/runner/ptx/ptx/fit.py", line 2430, in train_epoch
    metrics = self.train_batch(batch, batch_info=batch_info)
  File "/opt/tiger/runner/ptx/ptx/fit.py", line 2321, in train_batch
    scaled_loss.backward()
  File "/usr/local/lib/python3.7/dist-packages/torch/tensor.py", line 166, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph)
  File "/usr/local/lib/python3.7/dist-packages/torch/autograd/__init__.py", line 99, in backward
    allow_unreachable=True) # allow_unreachable flag
RuntimeError: Operation timed out!
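
For context, the trainer is driven roughly like the sketch below. This is not the actual framework code: MyOperator, the creator functions, the worker count, and the config values are all simplified placeholders, but the overall shape (a TrainingOperator wrapped by a TorchTrainer on Ray 1.x's ray.util.sgd API, NCCL backend) matches what the framework does.

import torch
import torch.nn as nn
import ray
from ray.util.sgd.torch import TorchTrainer, TrainingOperator


def model_creator(config):
    # Placeholder model; the real framework builds an NLP model here.
    return nn.Linear(config["in_dim"], 2)


def optimizer_creator(model, config):
    return torch.optim.SGD(model.parameters(), lr=config["lr"])


def data_creator(config):
    # Placeholder data; the real framework returns its own data loaders.
    features = torch.randn(256, config["in_dim"])
    labels = torch.randint(0, 2, (256,))
    dataset = torch.utils.data.TensorDataset(features, labels)
    return torch.utils.data.DataLoader(dataset, batch_size=32)


# Build a TrainingOperator from creator functions (Ray 1.x util.sgd API).
MyOperator = TrainingOperator.from_creators(
    model_creator=model_creator,
    optimizer_creator=optimizer_creator,
    data_creator=data_creator,
    loss_creator=nn.CrossEntropyLoss,
)

ray.init(address="auto")
trainer = TorchTrainer(
    training_operator_cls=MyOperator,
    num_workers=8,  # placeholder worker count
    use_gpu=True,
    backend="nccl",
    config={"in_dim": 16, "lr": 0.01},
)

for epoch in range(10):
    stats = trainer.train()  # the call that eventually raises the timeout
    print(stats)
trainer.shutdown()

Each trainer.train() call runs one epoch across all workers; the timeout above is raised from inside one of those calls.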
Any suggestions on how to debug this? For what it's worth, on the next run I can enable NCCL debug logging on every worker via TorchTrainer's initialization_hook, along the lines of the sketch below (these are standard NCCL/PyTorch environment variables, nothing framework-specific):
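
import os

def initialization_hook():
    # Runs on every training worker before the torch process group is set up;
    # this is the documented way to set per-worker environment variables.
    os.environ["NCCL_DEBUG"] = "INFO"       # verbose NCCL logging on each rank
    os.environ["NCCL_BLOCKING_WAIT"] = "1"  # surface collective timeouts as errors
                                            # (may already be active, given the error)

trainer = TorchTrainer(
    training_operator_cls=MyOperator,  # same placeholder operator as above
    initialization_hook=initialization_hook,
    num_workers=8,
    use_gpu=True,
    backend="nccl",
)

Hoping the per-rank NCCL logs will show which worker stalls before the timeout fires.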