@matthewdeng I also tried to replace
inputs = batch[0]
masks = batch[1]
with
inputs, masks = batch
(I use this dataset class here, which returns two objects: inputs and masks.)
But then I get a device type mismatch error:
Traceback (most recent call last):
File "a2d2_code/train-ray.py", line 213, in <module>
results = trainer.run(train_func)
File "/home/ec2-user/anaconda3/envs/pytorch_latest_p37/lib/python3.7/site-packages/ray/train/trainer.py", line 281, in run
for intermediate_result in iterator:
File "/home/ec2-user/anaconda3/envs/pytorch_latest_p37/lib/python3.7/site-packages/ray/train/trainer.py", line 651, in __next__
self._finish_training)
File "/home/ec2-user/anaconda3/envs/pytorch_latest_p37/lib/python3.7/site-packages/ray/train/trainer.py", line 620, in _run_with_error_handling
return func()
File "/home/ec2-user/anaconda3/envs/pytorch_latest_p37/lib/python3.7/site-packages/ray/train/trainer.py", line 721, in _finish_training
return ray.get(self._backend_executor_actor.finish_training.remote())
File "/home/ec2-user/anaconda3/envs/pytorch_latest_p37/lib/python3.7/site-packages/ray/_private/client_mode_hook.py", line 105, in wrapper
return func(*args, **kwargs)
File "/home/ec2-user/anaconda3/envs/pytorch_latest_p37/lib/python3.7/site-packages/ray/worker.py", line 1713, in get
raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(RuntimeError): ray::BackendExecutor.finish_training() (pid=14314, ip=172.16.59.122, repr=<ray.train.backend.BackendExecutor object at 0x7ff17e0ab110>)
File "/home/ec2-user/anaconda3/envs/pytorch_latest_p37/lib/python3.7/site-packages/ray/train/backend.py", line 507, in finish_training
results = self.get_with_failure_handling(futures)
File "/home/ec2-user/anaconda3/envs/pytorch_latest_p37/lib/python3.7/site-packages/ray/train/backend.py", line 526, in get_with_failure_handling
success, failed_worker_indexes = check_for_failure(remote_values)
File "/home/ec2-user/anaconda3/envs/pytorch_latest_p37/lib/python3.7/site-packages/ray/train/utils.py", line 42, in check_for_failure
ray.get(object_ref)
ray.exceptions.RayTaskError(RuntimeError): ray::BaseWorkerMixin._BaseWorkerMixin__execute() (pid=14251, ip=172.16.59.122, repr=<ray.train.worker_group.BaseWorkerMixin object at 0x7fd478e49890>)
File "/home/ec2-user/anaconda3/envs/pytorch_latest_p37/lib/python3.7/site-packages/ray/train/worker_group.py", line 26, in __execute
return func(*args, **kwargs)
File "/home/ec2-user/anaconda3/envs/pytorch_latest_p37/lib/python3.7/site-packages/ray/train/backend.py", line 498, in end_training
output = session.finish()
File "/home/ec2-user/anaconda3/envs/pytorch_latest_p37/lib/python3.7/site-packages/ray/train/session.py", line 102, in finish
func_output = self.training_thread.join()
File "/home/ec2-user/anaconda3/envs/pytorch_latest_p37/lib/python3.7/site-packages/ray/train/utils.py", line 94, in join
raise self.exc
File "/home/ec2-user/anaconda3/envs/pytorch_latest_p37/lib/python3.7/site-packages/ray/train/utils.py", line 87, in run
self.ret = self._target(*self._args, **self._kwargs)
File "a2d2_code/train-ray.py", line 187, in train_func
val_loss = CE(outputs["out"], masks)
File "/home/ec2-user/anaconda3/envs/pytorch_latest_p37/lib/python3.7/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/ec2-user/anaconda3/envs/pytorch_latest_p37/lib/python3.7/site-packages/torch/nn/modules/loss.py", line 1048, in forward
ignore_index=self.ignore_index, reduction=self.reduction)
File "/home/ec2-user/anaconda3/envs/pytorch_latest_p37/lib/python3.7/site-packages/torch/nn/functional.py", line 2693, in cross_entropy
return nll_loss(log_softmax(input, 1), target, weight, None, ignore_index, None, reduction)
File "/home/ec2-user/anaconda3/envs/pytorch_latest_p37/lib/python3.7/site-packages/torch/nn/functional.py", line 2390, in nll_loss
ret = torch._C._nn.nll_loss2d(input, target, weight, _Reduction.get_enum(reduction), ignore_index)
RuntimeError: Expected object of device type cuda but got device type cpu for argument #2 'target' in call to _thnn_nll_loss2d_forward
Are you sure we need to delete the code that copies tensors to the device when using Ray Train, as done here?
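For context, this is roughly the kind of device-copying code I mean (a minimal, hypothetical sketch, not my actual train-ray.py; the model, loss, and loader below are placeholder stand-ins, and only the inputs/masks unpacking and the loss call mirror my script):

```python
import torch
import torch.nn as nn

# Placeholder stand-ins: in my script these are a segmentation model,
# a CrossEntropyLoss instance, and a DataLoader over the A2D2 dataset.
model = nn.Sequential(nn.Conv2d(3, 5, 1))
CE = nn.CrossEntropyLoss()
val_loader = [(torch.randn(2, 3, 8, 8), torch.zeros(2, 8, 8, dtype=torch.long))]

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

for batch in val_loader:
    inputs, masks = batch
    # These explicit copies are "the code copying tensors to device";
    # the RuntimeError above suggests `masks` never received such a copy
    # and stayed on the CPU while the model output was on CUDA.
    inputs = inputs.to(device)
    masks = masks.to(device)

    outputs = model(inputs)
    val_loss = CE(outputs, masks)
```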