RuntimeError: CUDA error: invalid device ordinal when running multiple Ray Serve deployments on the same GPU

I have a cluster with 8 GPUs and am trying to deploy multiple different models on it; each model should take only a fraction of a GPU.

When I start the service using any of the following methods:

  1. `serve.run(my_class.options().bind(), name=name, route_prefix=route)`
  2. `driver = DAGDriver.bind(dag); serve.run(driver, port=8000)`
  3. `serve run -p 8000 my_file:my_bind`

I get the error: `RuntimeError: CUDA error: invalid device ordinal`
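
For context, here is a minimal sketch of one way this error can arise (an assumption about the cause, not a confirmed diagnosis): Ray exports `CUDA_VISIBLE_DEVICES` for each GPU actor, so PyTorch renumbers the visible devices starting from 0, while `ray.get_gpu_ids()` still returns the global index. Assuming a machine with at least 4 GPUs:

```python
# Sketch only: simulate the environment Ray sets up for an actor pinned to
# global GPU 3 (assumes a machine with at least 4 GPUs).
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "3"  # must be set before CUDA initializes

import torch
print(torch.cuda.device_count())  # 1 -- only the pinned GPU is visible
torch.zeros(1, device="cuda:0")   # works: the visible GPU is always cuda:0
torch.zeros(1, device="cuda:3")   # RuntimeError: CUDA error: invalid device ordinal
```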

Here is demo code that reproduces the error:
```python
import logging
import os

import numpy as np
import torch
import torch.nn as nn

import ray
from ray import serve
from ray.serve.drivers import DAGDriver
from starlette.requests import Request

# Logging
logging.basicConfig(level=logging.DEBUG)


class MyModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(MyModel, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu1(out)
        out = self.fc2(out)
        return out


RAY_ACTOR_OPTIONS = {"num_gpus": 1}


@serve.deployment(ray_actor_options=RAY_ACTOR_OPTIONS)
class APP1:
    def __init__(self):
        print("@@@@@@@@@@@@@@@@ APP1 @@@@@@@@@@@@@@@@@@@@@@")
        print("ray.get_gpu_ids(): {}".format(ray.get_gpu_ids()))
        print("CUDA_VISIBLE_DEVICES: {}".format(os.environ["CUDA_VISIBLE_DEVICES"]))

        device = f"cuda:{ray.get_gpu_ids()[0]}"
        print(f"Initializing APP1 to {device}")
        print("@@@@@@@@@@@@@@@@@ APP1 @@@@@@@@@@@@@@@@@@@@@")
        # device = torch.device(f"cuda:{ray.get_gpu_ids()[0]}")
        self.device = device
        self.model = MyModel(10, 10, 3)
        self.model.to(device)

    async def __call__(self, starlette_request: Request):
        input = (await starlette_request.json())["input"]
        # Cast to float32 so the input dtype matches the model weights.
        input = np.array(input, dtype=np.float32)

        output = self.model(torch.from_numpy(input).to(self.device))
        output = output.cpu().detach().numpy().tolist()

        return {"status": "success", "image": output}


@serve.deployment(ray_actor_options=RAY_ACTOR_OPTIONS)
class APP2:
    def __init__(self):
        print("@@@@@@@@@@@@@@@@ APP2 @@@@@@@@@@@@@@@@@@@@@@")
        print("ray.get_gpu_ids(): {}".format(ray.get_gpu_ids()))
        print("CUDA_VISIBLE_DEVICES: {}".format(os.environ["CUDA_VISIBLE_DEVICES"]))

        device = f"cuda:{ray.get_gpu_ids()[0]}"
        print(f"Initializing APP2 to {device}")
        print("@@@@@@@@@@@@@@@@@ APP2 @@@@@@@@@@@@@@@@@@@@@")
        # device = torch.device(f"cuda:{ray.get_gpu_ids()[0]}")
        self.device = device
        self.model = MyModel(10, 10, 3)
        self.model.to(device)

    async def __call__(self, starlette_request: Request):
        input = (await starlette_request.json())["input"]
        # Cast to float32 so the input dtype matches the model weights.
        input = np.array(input, dtype=np.float32)

        output = self.model(torch.from_numpy(input).to(self.device))
        output = output.cpu().detach().numpy().tolist()

        return {"status": "success", "image": output}


ray.init(num_gpus=8)

# Method 1: one application per deployment.
serve.run(APP1.options().bind(), name="app1", route_prefix="/app1")
serve.run(APP2.options().bind(), name="app2", route_prefix="/app2")

# Method 2: a single DAGDriver multiplexing both deployments.
dag = {
    "/app1": APP1.bind(),
    # "/image_editor": ImageEditor.bind(),
    # "/text2image": Text2Image.bind(),
    "/app2": APP2.bind(),
}
driver = DAGDriver.bind(dag)
handle = serve.run(driver, port=8000)
```
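
For reference, here is a minimal sketch of the fractional-GPU variant described above, reusing `MyModel` from the demo code; the `0.5` value and addressing the GPU as `cuda:0` inside the replica are assumptions, not a confirmed fix:

```python
# Sketch only: fractional GPUs plus the in-actor device index. The 0.5 value
# and the cuda:0 choice are assumptions, not a confirmed fix.
RAY_ACTOR_OPTIONS = {"num_gpus": 0.5}  # two replicas may share one physical GPU

@serve.deployment(ray_actor_options=RAY_ACTOR_OPTIONS)
class APP1:
    def __init__(self):
        # ray.get_gpu_ids() reports the global id (e.g. [3]), but Ray sets
        # CUDA_VISIBLE_DEVICES, so inside this process that GPU is cuda:0.
        self.device = torch.device("cuda:0")
        self.model = MyModel(10, 10, 3).to(self.device)
```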

Moving this to the Ray Serve category.

cc: @Sihan_Wang

For some reason I can't edit or delete this one any more, so I created a new one: "when serve multiple models on a multi-gpu cluster, got error RuntimeError: CUDA error: invalid device ordinal"

Actually, I already moved the post to the right category for you. Let's keep just one of them; how about I delete this one?

Sure, please delete this one. Thanks!