I have a cluster with 8 GPUs and I am trying to deploy several different models on it; each model should only take a fraction of a GPU.
When I start the service using any of the following methods:
- serve.run(my_class.options().bind(), name=name, route_prefix=route)
- driver = DAGDriver.bind(dag); serve.run(driver, port=8000)
- serve run -p 8000 my_file:my_bind
I get the error: RuntimeError: CUDA error: invalid device ordinal
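(In the real deployments each model would request only a fraction of a GPU through ray_actor_options, roughly as in the sketch below; num_gpus=0.5 and the class name are just illustrative. The repro script below requests a full GPU per replica to keep it simple.)

```python
# Sketch only: fractional GPU request per replica (0.5 is an illustrative value).
@serve.deployment(ray_actor_options={"num_gpus": 0.5})
class SomeModel:
    ...
```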
Demo code that reproduces the error:
```python
import logging
import os

import numpy as np
import torch
import torch.nn as nn

import ray
from ray import serve
from ray.serve.drivers import DAGDriver
from starlette.requests import Request

# Logging
logging.basicConfig(level=logging.DEBUG)
# Simple two-layer MLP used as a stand-in model
class MyModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(MyModel, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu1(out)
        out = self.fc2(out)
        return out
RAY_ACTOR_OPTIONS = {"num_gpus": 1}
@serve.deployment(ray_actor_options=RAY_ACTOR_OPTIONS)
class APP1:
    def __init__(self):
        print("@@@@@@@@@@@@@@@@ APP1 @@@@@@@@@@@@@@@@@@@@@@")
        print("ray.get_gpu_ids(): {}".format(ray.get_gpu_ids()))
        print("CUDA_VISIBLE_DEVICES: {}".format(os.environ["CUDA_VISIBLE_DEVICES"]))
        # Build the device string from the GPU id Ray assigned to this replica
        device = f"cuda:{ray.get_gpu_ids()[0]}"
        print(f"Initializing APP1 to {device}")
        print("@@@@@@@@@@@@@@@@@ APP1 @@@@@@@@@@@@@@@@@@@@@")
        # device = torch.device(f"cuda:{ray.get_gpu_ids()[0]}")
        self.device = device
        self.model = MyModel(10, 10, 3)
        self.model.to(device)

    async def __call__(self, starlette_request: Request):
        input = (await starlette_request.json())["input"]
        input = np.array(input)
        output = self.model(torch.from_numpy(input).to(self.device))
        output = output.cpu().detach().numpy().tolist()
        return {"status": "success", "image": output}
RAY_ACTOR_OPTIONS = {"num_gpus": 1}
@serve.deployment(ray_actor_options=RAY_ACTOR_OPTIONS)
class APP2:
    def __init__(self):
        print("@@@@@@@@@@@@@@@@ APP2 @@@@@@@@@@@@@@@@@@@@@@")
        print("ray.get_gpu_ids(): {}".format(ray.get_gpu_ids()))
        print("CUDA_VISIBLE_DEVICES: {}".format(os.environ["CUDA_VISIBLE_DEVICES"]))
        # Build the device string from the GPU id Ray assigned to this replica
        device = f"cuda:{ray.get_gpu_ids()[0]}"
        print(f"Initializing APP2 to {device}")
        print("@@@@@@@@@@@@@@@@@ APP2 @@@@@@@@@@@@@@@@@@@@@")
        # device = torch.device(f"cuda:{ray.get_gpu_ids()[0]}")
        self.device = device
        self.model = MyModel(10, 10, 3)
        self.model.to(device)

    async def __call__(self, starlette_request: Request):
        input = (await starlette_request.json())["input"]
        input = np.array(input)
        output = self.model(torch.from_numpy(input).to(self.device))
        output = output.cpu().detach().numpy().tolist()
        return {"status": "success", "image": output}
ray.init(num_gpus=8)

# Method 1: one Serve application per deployment
serve.run(APP1.options().bind(), name="app1", route_prefix="/app1")
serve.run(APP2.options().bind(), name="app2", route_prefix="/app2")

# Method 2: a single DAGDriver routing to both deployments
dag = {
    "/app1": APP1.bind(),
    # "/image_editor": ImageEditor.bind(),
    # "/text2image": Text2Image.bind(),
    "/app2": APP2.bind(),
}
driver = DAGDriver.bind(dag)
handle = serve.run(driver, port=8000)
```
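For reference, this is roughly how I query the deployments once they are up (a sketch; host, port, and the input values are placeholders):

```python
import requests

# Each deployment expects a JSON body with an "input" field (here a batch of 10 features).
resp = requests.post("http://localhost:8000/app1", json={"input": [[0.1] * 10]})
print(resp.json())
```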