Serving multiple models on a multi-GPU cluster fails with RuntimeError: CUDA error: invalid device ordinal

Trying to deploy multiple models on an 8-GPU cluster, I get this error:
(ServeController pid=550735) RuntimeError: CUDA error: invalid device ordinal
(ServeController pid=550735) CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
(ServeController pid=550735) For debugging consider passing CUDA_LAUNCH_BLOCKING=1
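
As the message suggests, CUDA_LAUNCH_BLOCKING=1 can give a synchronous stacktrace; for anyone debugging this, here is a sketch of propagating it into the Serve actor processes via ray.init's runtime_env:

ray.init(
    num_gpus=8,
    # Force synchronous CUDA kernel launches in every worker process
    # so the stacktrace points at the failing call.
    runtime_env={"env_vars": {"CUDA_LAUNCH_BLOCKING": "1"}},
)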

I tried the DAGDriver, serve.run, and the serve CLI; all of them produce the same error.
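
The serve.run variant was essentially the two commented-out calls in the reproduction script below:

serve.run(APP1.options().bind(), name="app1", route_prefix="/app1")
serve.run(APP2.options().bind(), name="app2", route_prefix="/app2")

and the CLI attempt was along these lines (the module name is hypothetical, assuming the script below is saved as repro.py):

serve run repro:driver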

Versions / Dependencies

Ray 2.4.0

Reproduction script

import logging
import os

import numpy as np
import torch
import torch.nn as nn

import ray
from ray import serve
from ray.serve.drivers import DAGDriver
from starlette.requests import Request

# Logging
logging.basicConfig(level=logging.DEBUG)

# Minimal stand-in model: a small two-layer MLP.
class MyModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu1(out)
        out = self.fc2(out)
        return out


RAY_ACTOR_OPTIONS = {"num_gpus": 1}


@serve.deployment(ray_actor_options=RAY_ACTOR_OPTIONS)
class APP1:
    def __init__(self):
        print("@@@@@@@@@@@@@@@@ APP1 @@@@@@@@@@@@@@@@@@@@@@")
        print(f"ray.get_gpu_ids(): {ray.get_gpu_ids()}")
        print(f"CUDA_VISIBLE_DEVICES: {os.environ.get('CUDA_VISIBLE_DEVICES')}")

        # Build the device string from the GPU id Ray assigned to this replica.
        device = f"cuda:{ray.get_gpu_ids()[0]}"
        print(f"Initializing APP1 to {device}")
        print("@@@@@@@@@@@@@@@@@ APP1 @@@@@@@@@@@@@@@@@@@@@")
        # device = torch.device(f"cuda:{ray.get_gpu_ids()[0]}")
        self.device = device
        self.model = MyModel(10, 10, 3)
        self.model.to(device)

    async def __call__(self, starlette_request: Request):
        data = np.array((await starlette_request.json())["input"], dtype=np.float32)
        output = self.model(torch.from_numpy(data).to(self.device))
        output = output.detach().cpu().numpy().tolist()
        return {"status": "success", "output": output}


@serve.deployment(ray_actor_options=RAY_ACTOR_OPTIONS)
class APP2:
    def __init__(self):
        print("@@@@@@@@@@@@@@@@ APP2 @@@@@@@@@@@@@@@@@@@@@@")
        print(f"ray.get_gpu_ids(): {ray.get_gpu_ids()}")
        print(f"CUDA_VISIBLE_DEVICES: {os.environ.get('CUDA_VISIBLE_DEVICES')}")

        # Same pattern as APP1: use the Ray-assigned GPU id for the device string.
        device = f"cuda:{ray.get_gpu_ids()[0]}"
        print(f"Initializing APP2 to {device}")
        print("@@@@@@@@@@@@@@@@@ APP2 @@@@@@@@@@@@@@@@@@@@@")
        # device = torch.device(f"cuda:{ray.get_gpu_ids()[0]}")
        self.device = device
        self.model = MyModel(10, 10, 3)
        self.model.to(device)

    async def __call__(self, starlette_request: Request):
        data = np.array((await starlette_request.json())["input"], dtype=np.float32)
        output = self.model(torch.from_numpy(data).to(self.device))
        output = output.detach().cpu().numpy().tolist()
        return {"status": "success", "output": output}

ray.init(num_gpus=8)
# serve.run(APP1.options().bind(), name="app1", route_prefix="/app1")
# serve.run(APP2.options().bind(), name="app2", route_prefix="/app2")

# Map each route prefix to its own deployment graph.
dag = {
    '/app1': APP1.bind(),
    # '/image_editor': ImageEditor.bind(),
    # '/text2image': Text2Image.bind(),
    '/app2': APP2.bind()
}
driver = DAGDriver.bind(dag)
handle = serve.run(driver, port=8000)
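
The apps never become healthy because of the error above, but for completeness this is how the endpoints would be queried once running (the payload shape is an assumption based on the model's input_size of 10):

import requests

resp = requests.post("http://localhost:8000/app1", json={"input": [[0.0] * 10]})
print(resp.json())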

@Huaiwei_Sun please keep this one open, and give some hints. Thanks :pray:

cc: @Sihan_Wang to take a look