Hey guys! While trying to use checkpointing in Ray Train, I came across an error. Here is my code:
for epoch in range(start_epoch,epochs):
forward,backward,step,timedur=train_epoch(train_dataloader, model, loss_fn, optimizer, device,epoch)
sgd.save_checkpoint(epoch=epoch, model=model)
sgd.report(forward=forward, backward=backward,step=step,time=timedur)
loss_results.append([forward,backward,step,timedur])
Before I added the third line (`sgd.save_checkpoint`), it worked and I could finish the training. But after I added it, it gives this error:
2021-12-08 15:45:16,987 ERROR serialization.py:270 -- Default process group has not been initialized, please make sure to call init_process_group.
Traceback (most recent call last):
File "/home/dixiyao/allreduce/lib/python3.7/site-packages/ray/serialization.py", line 268, in deserialize_objects
obj = self._deserialize_object(data, metadata, object_ref)
File "/home/dixiyao/allreduce/lib/python3.7/site-packages/ray/serialization.py", line 191, in _deserialize_object
return self._deserialize_msgpack_data(data, metadata_fields)
File "/home/dixiyao/allreduce/lib/python3.7/site-packages/ray/serialization.py", line 169, in _deserialize_msgpack_data
python_objects = self._deserialize_pickle5_data(pickle5_data)
File "/home/dixiyao/allreduce/lib/python3.7/site-packages/ray/serialization.py", line 159, in _deserialize_pickle5_data
obj = pickle.loads(in_band)
File "/home/dixiyao/allreduce/lib/python3.7/site-packages/torch/nn/parallel/distributed.py", line 576, in __setstate__
self.process_group = _get_default_group()
File "/home/dixiyao/allreduce/lib/python3.7/site-packages/torch/distributed/distributed_c10d.py", line 358, in _get_default_group
raise RuntimeError("Default process group has not been initialized, "
RuntimeError: Default process group has not been initialized, please make sure to call init_process_group.
Files already downloaded and verified
Files already downloaded and verified
trainer start
Traceback (most recent call last):
File "train_example-checkpoint.py", line 274, in <module>
train_wrapper(args,pid,num_workers=args.num_workers, use_gpu=True)
File "train_example-checkpoint.py", line 212, in train_wrapper
callbacks=[JsonLoggerCallback()])
File "/home/dixiyao/allreduce/lib/python3.7/site-packages/ray/train/trainer.py", line 253, in run
for intermediate_result in iterator:
File "/home/dixiyao/allreduce/lib/python3.7/site-packages/ray/train/trainer.py", line 600, in __next__
self._executor.fetch_next_result)
File "/home/dixiyao/allreduce/lib/python3.7/site-packages/ray/train/trainer.py", line 573, in _run_with_error_handling
return func()
File "/home/dixiyao/allreduce/lib/python3.7/site-packages/ray/train/backends/backend.py", line 562, in fetch_next_result
results = self._get_next_results()
File "/home/dixiyao/allreduce/lib/python3.7/site-packages/ray/train/backends/backend.py", line 524, in _get_next_results
results = self.get_with_failure_handling(futures)
File "/home/dixiyao/allreduce/lib/python3.7/site-packages/ray/train/backends/backend.py", line 662, in get_with_failure_handling
success, failed_worker_indexes = check_for_failure(remote_values)
File "/home/dixiyao/allreduce/lib/python3.7/site-packages/ray/train/utils.py", line 41, in check_for_failure
ray.get(object_ref)
File "/home/dixiyao/allreduce/lib/python3.7/site-packages/ray/_private/client_mode_hook.py", line 105, in wrapper
return func(*args, **kwargs)
File "/home/dixiyao/allreduce/lib/python3.7/site-packages/ray/worker.py", line 1627, in get
raise value
ray.exceptions.RaySystemError: System error: Default process group has not been initialized, please make sure to call init_process_group.
traceback: Traceback (most recent call last):
File "/home/dixiyao/allreduce/lib/python3.7/site-packages/ray/serialization.py", line 268, in deserialize_objects
obj = self._deserialize_object(data, metadata, object_ref)
File "/home/dixiyao/allreduce/lib/python3.7/site-packages/ray/serialization.py", line 191, in _deserialize_object
return self._deserialize_msgpack_data(data, metadata_fields)
File "/home/dixiyao/allreduce/lib/python3.7/site-packages/ray/serialization.py", line 169, in _deserialize_msgpack_data
python_objects = self._deserialize_pickle5_data(pickle5_data)
File "/home/dixiyao/allreduce/lib/python3.7/site-packages/ray/serialization.py", line 159, in _deserialize_pickle5_data
obj = pickle.loads(in_band)
File "/home/dixiyao/allreduce/lib/python3.7/site-packages/torch/nn/parallel/distributed.py", line 576, in __setstate__
self.process_group = _get_default_group()
File "/home/dixiyao/allreduce/lib/python3.7/site-packages/torch/distributed/distributed_c10d.py", line 358, in _get_default_group
raise RuntimeError("Default process group has not been initialized, "
RuntimeError: Default process group has not been initialized, please make sure to call init_process_group.
Does anyone have any idea what the problem is?