When app2 has a backlog of requests and app1 is idle, the GPU previously occupied by app1 cannot be used by app2

I set app1's min_replicas to 0, but even when app1 has no requests to process it still holds the GPU, so app2 cannot use it.
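For reference, this is roughly how I check whether the GPU is actually released (a minimal sketch using ray.cluster_resources() and ray.available_resources(); with app1 scaled to zero, only app2's two replicas should hold 2 × 0.5 = 1 GPU):

import ray

ray.init(address="auto")  # connect to the running cluster
# Compare total GPUs with GPUs still schedulable; the gap is what
# live replicas are currently holding.
print("total GPU:", ray.cluster_resources().get("GPU"))
print("available GPU:", ray.available_resources().get("GPU"))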

This is the config file:


# This file was generated using the `serve build` command on Ray v2.42.1.

proxy_location: EveryNode

http_options:
  host: 0.0.0.0
  port: 8000

grpc_options:
  port: 9000
  grpc_servicer_functions: []

logging_config:
  encoding: TEXT
  log_level: INFO
  logs_dir: null
  enable_access_log: true

applications:
- name: app1
  route_prefix: /app1
  import_path: sd1:app
  runtime_env: {}
  deployments:
  - name: StableDiffusionV21
    autoscaling_config:
      min_replicas: 0
      initial_replicas: null
      max_replicas: 10
      target_ongoing_requests: 2
      metrics_interval_s: 10.0
      look_back_period_s: 30.0
      smoothing_factor: 1.0
      upscale_smoothing_factor: null
      downscale_smoothing_factor: null
      upscaling_factor: null
      downscaling_factor: null
      downscale_delay_s: 600.0
      upscale_delay_s: 30.0
    ray_actor_options:
      num_cpus: 1.0
      num_gpus: 0.5

- name: app2
  route_prefix: /app2
  import_path: sd2:app
  runtime_env: {}
  deployments:
  - name: StableDiffusionV22
    autoscaling_config:
      min_replicas: 2
      initial_replicas: null
      max_replicas: 10
      target_ongoing_requests: 2
      metrics_interval_s: 10.0
      look_back_period_s: 30.0
      smoothing_factor: 1.0
      upscale_smoothing_factor: null
      downscale_smoothing_factor: null
      upscaling_factor: null
      downscaling_factor: null
      downscale_delay_s: 600.0
      upscale_delay_s: 30.0
    ray_actor_options:
      num_cpus: 1.0
      num_gpus: 0.5
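If I read the config right, downscale_delay_s: 600.0 means Serve waits 10 minutes of zero traffic before removing the last app1 replica, so the GPU stays occupied for at least that long after the last request. Below is a sketch of the same scale-to-zero settings expressed in the decorator, with a shorter delay for testing (60.0 is an illustrative value, not what I run):

from ray import serve

# Sketch only: the same autoscaling keys as the YAML above, with a shorter
# downscale_delay_s so idle replicas are removed after 60 s instead of 600 s.
@serve.deployment(
    ray_actor_options={"num_cpus": 1.0, "num_gpus": 0.5},
    autoscaling_config={
        "min_replicas": 0,
        "max_replicas": 10,
        "target_ongoing_requests": 2,
        "downscale_delay_s": 60.0,
    },
)
class StableDiffusionV21:
    ...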

This is the app1 code; the model is a Stable Diffusion model:

import ray
from ray import serve
from fastapi import FastAPI
import torch
from diffusers import StableDiffusionPipeline, UniPCMultistepScheduler
from io import BytesIO
from fastapi.responses import Response

# Initialize the FastAPI app
app = FastAPI()

@serve.deployment(
    ray_actor_options={"num_gpus": 0.5},
    autoscaling_config={"min_replicas": 0, "max_replicas": 10},
)
@serve.ingress(app)  # register the FastAPI routes with Ray Serve
class StableDiffusionV21:
    def __init__(self):
        # Load the model at startup
        self.pipe = StableDiffusionPipeline.from_pretrained(
            pretrained_model_name_or_path="/home/zhouhang/miniconda3/envs/fluxenv2/flux/ray/sd",
            torch_dtype=torch.float16,
            use_safetensors=True,
        )
        self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
        self.pipe = self.pipe.to("cuda")  # 将模型加载到 GPU

    @app.post("/generate1")
    async def generate(self, prompt: str, img_size: int = 1024):
        # Validate that the prompt is not empty
        if not prompt:
            return {"error": "Prompt cannot be empty"}

        # Use torch.autocast for better performance
        with torch.autocast("cuda"):
            image = self.pipe(prompt, height=img_size, width=img_size).images[0]

        # Clear the CUDA cache to free GPU memory
        torch.cuda.empty_cache()

        # Convert the generated image to a BytesIO object so it can be
        # returned in the FastAPI response
        img_byte_arr = BytesIO()
        image.save(img_byte_arr, format="PNG")
        img_byte_arr.seek(0)  # reset the file pointer to the beginning

        # Return the image data
        return Response(content=img_byte_arr.read(), media_type="image/png")


# Create and expose the Serve application (imported as sd1:app in the config)
app = StableDiffusionV21.bind()
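And this is roughly how I drive load against app2 while app1 is idle (a minimal client sketch; the /app2/generate2 route is an assumption, since app2's code mirrors app1's):

import requests

# Assumed route: app2 exposes /generate2 analogous to app1's /generate1.
resp = requests.post(
    "http://localhost:8000/app2/generate2",
    params={"prompt": "a photo of an astronaut riding a horse", "img_size": 512},
)
with open("out.png", "wb") as f:
    f.write(resp.content)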