I am using TorchTrainer.as_trainable() to tune my neural net on a 2-GPU machine.
I checked CUDA usage with nvidia-smi during the first trial and it shows about 6 GB out of 26 GB in use across the GPUs, yet the error below claims that torch.cuda.is_available() is False. After the first trial I get the following error:
Traceback (most recent call last):
  File "/…/anaconda3/envs/ox/lib/python3.8/site-packages/ray/tune/trial_runner.py", line 726, in _process_trial
    result = self.trial_executor.fetch_result(trial)
  File "/…/anaconda3/envs/ox/lib/python3.8/site-packages/ray/tune/ray_trial_executor.py", line 489, in fetch_result
    result = ray.get(trial_future[0], timeout=DEFAULT_GET_TIMEOUT)
  File "/…/anaconda3/envs/ox/lib/python3.8/site-packages/ray/worker.py", line 1452, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(RuntimeError): ray::TorchTrainable.train() (pid=163211, ip=…)
  File "python/ray/_raylet.pyx", line 482, in ray._raylet.execute_task
  File "python/ray/_raylet.pyx", line 436, in ray._raylet.execute_task.function_executor
  File "/…/anaconda3/envs/ox/lib/python3.8/site-packages/ray/tune/trainable.py", line 336, in train
    result = self.step()
  File "/…/anaconda3/envs/ox/lib/python3.8/site-packages/ray/util/sgd/torch/torch_trainer.py", line 657, in step
    output = override_tune_step(
  File "/…/temp/src/SentiNet.py", line 941, in step
    validation_stats = trainer.validate(reduce_results=False)[0]
  File "/…/anaconda3/envs/ox/lib/python3.8/site-packages/ray/util/sgd/torch/torch_trainer.py", line 498, in validate
    worker_stats = self.worker_group.validate(
  File "/…/anaconda3/envs/ox/lib/python3.8/site-packages/ray/util/sgd/torch/worker_group.py", line 340, in validate
    return ray.get(remote_worker_stats)
  File "/…/anaconda3/envs/ox/lib/python3.8/site-packages/ray/serialization.py", line 308, in deserialize_objects
    self._deserialize_object(data, metadata, object_ref))
  File "/…/anaconda3/envs/ox/lib/python3.8/site-packages/ray/serialization.py", line 247, in _deserialize_object
    return self._deserialize_msgpack_data(data, metadata)
  File "/…/anaconda3/envs/ox/lib/python3.8/site-packages/ray/serialization.py", line 226, in _deserialize_msgpack_data
    python_objects = self._deserialize_pickle5_data(pickle5_data)
  File "/…/anaconda3/envs/ox/lib/python3.8/site-packages/ray/serialization.py", line 216, in _deserialize_pickle5_data
    obj = pickle.loads(in_band)
  File "/…/anaconda3/envs/ox/lib/python3.8/site-packages/torch/storage.py", line 141, in _load_from_bytes
    return torch.load(io.BytesIO(b))
  File "/…/anaconda3/envs/ox/lib/python3.8/site-packages/torch/serialization.py", line 595, in load
    return _legacy_load(opened_file, map_location, pickle_module, **pickle_load_args)
  File "/…/anaconda3/envs/ox/lib/python3.8/site-packages/torch/serialization.py", line 774, in _legacy_load
    result = unpickler.load()
  File "/…/anaconda3/envs/ox/lib/python3.8/site-packages/torch/serialization.py", line 730, in persistent_load
    deserialized_objects[root_key] = restore_location(obj, location)
  File "/…/anaconda3/envs/ox/lib/python3.8/site-packages/torch/serialization.py", line 175, in default_restore_location
    result = fn(storage, location)
  File "/…/anaconda3/envs/ox/lib/python3.8/site-packages/torch/serialization.py", line 151, in _cuda_deserialize
    device = validate_cuda_device(location)
  File "/…/anaconda3/envs/ox/lib/python3.8/site-packages/torch/serialization.py", line 135, in validate_cuda_device
    raise RuntimeError('Attempting to deserialize object on a CUDA '
RuntimeError: Attempting to deserialize object on a CUDA device but torch.cuda.is_available() is False. If you are running on a CPU-only machine, please use torch.load with map_location=torch.device('cpu') to map your storages to the CPU.
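For context, the last frames are plain PyTorch, so I think this is the same failure you get whenever a pickled CUDA tensor is loaded in a process where CUDA is unavailable. The snippet below is only my minimal understanding of it, not code from my project, and the map_location workaround from the message only applies when you control the torch.load call yourself:

import io
import torch

# This part needs a CUDA machine: pickle a CUDA-backed tensor.
buf = io.BytesIO()
torch.save(torch.ones(2, device="cuda"), buf)
buf.seek(0)

# In a process where torch.cuda.is_available() is False, this raises the
# exact RuntimeError from the traceback above:
# torch.load(buf)

# The workaround suggested by the error message, if you own the load call:
tensor = torch.load(buf, map_location=torch.device("cpu"))
print(tensor.device)  # cpu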
My guess is that Ray or Torch preallocates all of the CUDA memory, but I have no clue how to avoid it.
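To narrow down whether the GPU is actually visible inside a Ray worker process (and not just in nvidia-smi), I was planning to run a small check like this; num_gpus=1 and the function name are just illustrative and not part of my training code:

import os
import ray
import torch

ray.init(ignore_reinit_error=True)

@ray.remote(num_gpus=1)
def gpu_visibility():
    # Ray sets CUDA_VISIBLE_DEVICES for tasks that request GPUs.
    return {
        "CUDA_VISIBLE_DEVICES": os.environ.get("CUDA_VISIBLE_DEVICES"),
        "cuda_available": torch.cuda.is_available(),
    }

print(ray.get(gpu_visibility.remote()))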
I would appreciate any help. Here is the code I use:
def step(trainer, info: dict):
    """
    Define a custom training loop for Tune.
    This is needed because we want to manually update our scheduler.
    """
    train_stats = trainer.train()
    validation_stats = trainer.validate(reduce_results=False)[0]
    # Manually update our scheduler with the given metric.
    # trainer.update_scheduler(metric=validation_stats["val_loss"])
    # validation_stats is already the first worker's dict (indexed above),
    # so merge it directly instead of indexing it again.
    all_stats = ray.tune.utils.merge_dicts(train_stats, validation_stats)
    return all_stats
TorchTrainable = TorchTrainer.as_trainable(
    override_tune_step=step,
    training_operator_cls=sn_operator_cls,
    num_workers=num_workers,
    num_cpus_per_worker=4,
    use_gpu=use_gpu,
    use_tqdm=True,
    scheduler_step_freq="epoch",
    config=operator_config,
)

analysis = ray.tune.run(
    TorchTrainable,
    num_samples=3,
    config=tune_config,
    stop={"epoch": 4},
    verbose=1,
)

return analysis.get_best_config(metric="val_loss", mode="min")
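In case the real problem is that my per-worker validation stats still contain CUDA tensors when they are shipped back (the traceback does go through torch.load while deserializing the validate results), one workaround I'm considering is converting them to plain Python numbers inside my training operator before returning them. This is only a sketch; SNOperator and _to_python are placeholders standing in for my actual sn_operator_cls:

import torch
from ray.util.sgd.torch import TrainingOperator

def _to_python(value):
    # Move any tensor to CPU and unwrap it so no CUDA storage gets pickled.
    if isinstance(value, torch.Tensor):
        value = value.detach().cpu()
        return value.item() if value.numel() == 1 else value.tolist()
    return value

class SNOperator(TrainingOperator):
    def validate_batch(self, batch, batch_info):
        stats = super().validate_batch(batch, batch_info)
        return {key: _to_python(val) for key, val in stats.items()}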