How severely does this issue affect your experience of using Ray?
- High: It blocks me from completing my task.
I am trying to integrate GPU logical partitioning (TensorFlow logical devices) with Ray Core actors for parallel GPU computation on a single GPU with 32 GB of VRAM:
```python
import math

import ray
import tensorflow as tf

PARALLEL_CALLS = 10


# Each actor reserves 1 / PARALLEL_CALLS of the GPU from Ray's scheduler.
@ray.remote(num_gpus=1 / PARALLEL_CALLS)
class AsyncClients:
    def __init__(self, cid):
        gpus = tf.config.list_physical_devices("GPU")
        logical_gpus = []
        if gpus:
            try:
                # Split the single physical GPU into PARALLEL_CALLS logical
                # devices, each capped at 512 MB of VRAM.
                tf.config.set_logical_device_configuration(
                    gpus[0],
                    [
                        tf.config.LogicalDeviceConfiguration(memory_limit=512)
                        for _ in range(PARALLEL_CALLS)
                    ],
                )
                logical_gpus = tf.config.list_logical_devices("GPU")
            except RuntimeError as e:
                # Virtual devices must be set before GPUs have been initialized
                print(e)

        self.cid = cid
        # Pin this client to its own logical GPU device.
        self.device = logical_gpus[self.cid].name
        self.model = client_model(self.cid)

        x_train, y_train = client_data(self.cid, NUM_CLIENTS)
        split_idx = math.floor(len(x_train) * 0.9)  # use the last 10% of x_train for validation
        self.x_train, self.y_train = x_train[:split_idx], y_train[:split_idx]
        self.x_val, self.y_val = x_train[split_idx:], y_train[split_idx:]

    def get_parameters(self, config):
        return self.model.get_weights()

    def fit_and_evaluate(self, parameters, config):
        with tf.device(self.device):
            # train
            self.model.set_weights(parameters)
            self.model.fit(
                self.x_train, self.y_train, epochs=config["epochs"], verbose=2
            )
            # evaluate
            loss, acc = self.model.evaluate(self.x_val, self.y_val, verbose=2)
            return loss, acc
```
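For reference, this is roughly how I launch the actors on the driver side (a minimal sketch; the `config` values and the number of actors shown here are illustrative, and `client_model` / `client_data` / `NUM_CLIENTS` come from my own code):

```python
# Minimal driver-side sketch: one actor per logical GPU partition,
# with all fit/evaluate calls running in parallel.
ray.init()

clients = [AsyncClients.remote(cid) for cid in range(PARALLEL_CALLS)]

# Take the initial weights from the first client and fan them out
# to every client for one training round.
initial_weights = ray.get(clients[0].get_parameters.remote(config={}))
results = ray.get(
    [
        c.fit_and_evaluate.remote(initial_weights, config={"epochs": 1})
        for c in clients
    ]
)
```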
Here, I split my GPU's memory into logical devices so that Ray actors can share the single GPU, but unfortunately I ran into memory-related errors.
Could you please help me resolve this issue so I can run parallel training processes on a single GPU?
Thanks a lot