import torch
import os
import ray
from ray import tune
import ctypes
library = ctypes.CDLL("/root/library.so")
def my_trainable_function(config, reporter, library):
visible_gpus = ray.get_gpu_ids() # Get visible GPUs
trial_name = reporter.trial_name
print("Trial name:", trial_name)
print("Visible GPUs:", visible_gpus)
# Manually set visible GPUs for PyTorch
gpu_index = visible_gpus[0] # Select the first GPU
torch.cuda.set_device(0)
device_name = torch.cuda.get_device_name(0)
print("GPU", gpu_index, ":", device_name)
print(f'CUDA device used: {torch.cuda.current_device()}')
# Rest of your PyTorch training code
# Define the configuration for the hyperparameter search
config = {
"lr": tune.uniform(0.001, 0.1),
"batch_size": tune.choice([32, 64, 128])
}
# Set visible GPUs
visible_gpus = "0,1,2" # Specify the visible GPUs here
tune.run(tune.with_parameters(my_trainable_function, library=library), config=config, resources_per_trial={"gpu": 0.5}, verbose=2,
local_dir="./tune_results", num_samples=100)
I ran this code on ubuntu20.04, python3.8.10, torch.1.13.1, and encountered the errors below, how to solve it, how to use ctype object in objective function?
(my_trainable_function pid=48217) type object 'PyCFuncPtrType' has no attribute '__mul__'
(my_trainable_function pid=48217) Traceback (most recent call last):
(my_trainable_function pid=48217) File "/root/miniconda3/lib/python3.8/site-packages/ray/_private/serialization.py", line 387, in deserialize_objects
(my_trainable_function pid=48217) obj = self._deserialize_object(data, metadata, object_ref)
(my_trainable_function pid=48217) File "/root/miniconda3/lib/python3.8/site-packages/ray/_private/serialization.py", line 268, in _deserialize_object
(my_trainable_function pid=48217) return self._deserialize_msgpack_data(data, metadata_fields)
(my_trainable_function pid=48217) File "/root/miniconda3/lib/python3.8/site-packages/ray/_private/serialization.py", line 223, in _deserialize_msgpack_data
(my_trainable_function pid=48217) python_objects = self._deserialize_pickle5_data(pickle5_data)
(my_trainable_function pid=48217) File "/root/miniconda3/lib/python3.8/site-packages/ray/_private/serialization.py", line 213, in _deserialize_pickle5_data
(my_trainable_function pid=48217) obj = pickle.loads(in_band)
(my_trainable_function pid=48217) AttributeError: type object 'PyCFuncPtrType' has no attribute '__mul__'
2023-08-09 17:18:06,490 ERROR tune_controller.py:911 -- Trial task failed for trial my_trainable_function_a46d9_00004
Traceback (most recent call last):
File "/root/miniconda3/lib/python3.8/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
result = ray.get(future)
File "/root/miniconda3/lib/python3.8/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
return fn(*args, **kwargs)
File "/root/miniconda3/lib/python3.8/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
return func(*args, **kwargs)
File "/root/miniconda3/lib/python3.8/site-packages/ray/_private/worker.py", line 2520, in get
raise value.as_instanceof_cause()
ray.exceptions.RayTaskError: ray::ImplicitFunc.train() (pid=48217, ip=172.17.0.2, actor_id=3f06df4fd4d3b808031bfe7c01000000, repr=my_trainable_function)
File "/root/miniconda3/lib/python3.8/site-packages/ray/tune/trainable/trainable.py", line 375, in train
raise skipped from exception_cause(skipped)
File "/root/miniconda3/lib/python3.8/site-packages/ray/tune/trainable/function_trainable.py", line 349, in entrypoint
return self._trainable_func(
File "/root/miniconda3/lib/python3.8/site-packages/ray/tune/trainable/function_trainable.py", line 666, in _trainable_func
output = fn()
File "/root/miniconda3/lib/python3.8/site-packages/ray/tune/trainable/util.py", line 332, in _inner
return inner(config, checkpoint_dir=None)
File "/root/miniconda3/lib/python3.8/site-packages/ray/tune/trainable/util.py", line 323, in inner
fn_kwargs[k] = parameter_registry.get(prefix + k)
File "/root/miniconda3/lib/python3.8/site-packages/ray/tune/registry.py", line 301, in get
return ray.get(self.references[k])
ray.exceptions.RaySystemError: System error: type object 'PyCFuncPtrType' has no attribute '__mul__'
traceback: Traceback (most recent call last):
AttributeError: type object 'PyCFuncPtrType' has no attribute '__mul__'
The library.so is a helloworld code, build with cmd gcc -shared -o library.so helloworld.c
:
#include <stdio.h>
__attribute__((visibility("default")))
void hello() {
printf("Hello, World!\n");
}