When I send multiple concurrent requests, the program reports an error, but a single request on its own works without any problem.

This is my definition code:

import ray
from ray import serve
from fastapi import FastAPI
import torch
from diffusers import StableDiffusionPipeline, UniPCMultistepScheduler
from io import BytesIO
from fastapi.responses import Response

app = FastAPI()

# Ray Serve deployment: asks for 0.5 GPU per replica actor and autoscales
# between 0 and 2 replicas. HTTP routing is delegated to the FastAPI `app`.
@serve.deployment(
    ray_actor_options={"num_gpus": 0.5},
    autoscaling_config={"min_replicas": 0, "max_replicas": 2},
)
@serve.ingress(app)  
class StableDiffusionV2:
    """Serve a Stable Diffusion pipeline behind a POST /generate endpoint."""

    def __init__(self):
        """Load the pipeline in float16, swap in a UniPC scheduler, move it to CUDA."""
        self.pipe = StableDiffusionPipeline.from_pretrained(
            pretrained_model_name_or_path="/home/zhouhang/miniconda3/envs/fluxenv2/flux/ray/sd",
            torch_dtype=torch.float16,
            use_safetensors=True,
        )
        # Replace the default scheduler with UniPCMultistepScheduler, built from
        # the existing scheduler's config.
        self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
        self.pipe = self.pipe.to("cuda")

    @app.post("/generate")
    def generate(self, prompt: str, img_size: int = 512):
        """Generate one square image for `prompt` and return it as a PNG response.

        Returns a dict with an "error" key when `prompt` is empty; otherwise a
        `Response` whose body is the PNG-encoded image.
        """
        if not prompt:
            return {"error": "Prompt cannot be empty"}
        
        # NOTE(review): self.pipe (and its scheduler) is shared by every request
        # this replica handles. This sync endpoint is executed in a worker thread
        # (run_in_threadpool), so overlapping requests step the same scheduler
        # object; the reported failure is `assert self.this_order > 0` inside
        # UniPCMultistepScheduler.step.
        # Use torch.autocast to improve performance
        with torch.autocast("cuda"):
            image = self.pipe(prompt, height=img_size, width=img_size).images[0]
        
        # Serialize the generated image into a BytesIO buffer so it can be
        # returned through a FastAPI response
        img_byte_arr = BytesIO()
        image.save(img_byte_arr, format="PNG")
        img_byte_arr.seek(0)  # rewind to the start of the buffer
        
        # Return the image data
        return Response(content=img_byte_arr.read(), media_type="image/png")


# Create and expose the model's backend and endpoint
sd_app = StableDiffusionV2.bind()
import requests
import os
import time
from concurrent.futures import ThreadPoolExecutor

# Directory where the generated images are stored.
output_dir = "generated_images"
# exist_ok=True creates the directory only when it is missing and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(output_dir, exist_ok=True)

# Prompts to send, one request per prompt (sample values; add more as needed).
prompts = [
    "hello",
    "A beautiful sunset over the ocean.",
    # "A futuristic cityscape with flying cars.",
    # "A cup of coffee on a wooden table with sunlight streaming through the window.",
    # "A cup of coffee on a wooden table with sunlight streaming through the window.",
]

# Send one request and save the returned image
def generate_and_save_image(prompt, start_index):
    """POST a single prompt to the local /generate endpoint and store the result.

    On HTTP 200 the response body is written to
    ``{output_dir}/image_{start_index}.png``; any other status code only prints
    a failure message and nothing is written.
    """
    response = requests.post("http://127.0.0.1:8000/generate", params={"prompt": prompt})

    # Guard clause: report the failure and stop, leaving the disk untouched.
    if response.status_code != 200:
        print(f"Failed to generate image for prompt {start_index}")
        return

    target_path = f"{output_dir}/image_{start_index}.png"
    with open(target_path, 'wb') as image_file:
        image_file.write(response.content)
    print(f"Image {start_index} saved as {target_path}")

# Record the start time. perf_counter() is monotonic and high-resolution, so
# the measured duration is not affected by system clock adjustments.
start_time = time.perf_counter()

# Send the requests in parallel with a thread pool, one task per prompt.
with ThreadPoolExecutor() as executor:
    futures = [
        executor.submit(generate_and_save_image, prompt, i)
        for i, prompt in enumerate(prompts)
    ]

    # Wait for every task; result() re-raises any exception from the worker.
    for future in futures:
        future.result()

# Record the end time
end_time = time.perf_counter()

# Compute and print the total elapsed time
total_time = end_time - start_time
print(f"Total time taken: {total_time:.2f} seconds")

This is my client code that sends the concurrent requests:
import requests
import os
import time
from concurrent.futures import ThreadPoolExecutor

# Directory where the generated images are stored.
output_dir = "generated_images"
# exist_ok=True creates the directory only when it is missing and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(output_dir, exist_ok=True)

# Prompts to send, one request per prompt (sample values; add more as needed).
prompts = [
    "hello",
    "A beautiful sunset over the ocean.",
    # "A futuristic cityscape with flying cars.",
    # "A cup of coffee on a wooden table with sunlight streaming through the window.",
    # "A cup of coffee on a wooden table with sunlight streaming through the window.",
]

# Send one request and save the returned image
def generate_and_save_image(prompt, start_index):
    """POST a single prompt to the local /generate endpoint and store the result.

    On HTTP 200 the response body is written to
    ``{output_dir}/image_{start_index}.png``; any other status code only prints
    a failure message and nothing is written.
    """
    response = requests.post("http://127.0.0.1:8000/generate", params={"prompt": prompt})

    # Guard clause: report the failure and stop, leaving the disk untouched.
    if response.status_code != 200:
        print(f"Failed to generate image for prompt {start_index}")
        return

    target_path = f"{output_dir}/image_{start_index}.png"
    with open(target_path, 'wb') as image_file:
        image_file.write(response.content)
    print(f"Image {start_index} saved as {target_path}")

# Record the start time. perf_counter() is monotonic and high-resolution, so
# the measured duration is not affected by system clock adjustments.
start_time = time.perf_counter()

# Send the requests in parallel with a thread pool, one task per prompt.
with ThreadPoolExecutor() as executor:
    futures = [
        executor.submit(generate_and_save_image, prompt, i)
        for i, prompt in enumerate(prompts)
    ]

    # Wait for every task; result() re-raises any exception from the worker.
    for future in futures:
        future.result()

# Record the end time
end_time = time.perf_counter()

# Compute and print the total elapsed time
total_time = end_time - start_time
print(f"Total time taken: {total_time:.2f} seconds")

This is the error:
ERROR 2025-02-13 07:15:01,323 default_StableDiffusionV2 9qg6nbx0 d92fb15e-6ce8-45da-b65b-905c69206753 – Request failed.
(ServeReplica:default:StableDiffusionV2 pid=1965516) Traceback (most recent call last):
(ServeReplica:default:StableDiffusionV2 pid=1965516) File “/home/zhouhang/miniconda3/envs/ray_env2/lib/python3.10/site-packages/ray/serve/_private/replica.py”, line 465, in _handle_errors_and_metrics
(ServeReplica:default:StableDiffusionV2 pid=1965516) yield _status_code_callback
(ServeReplica:default:StableDiffusionV2 pid=1965516) File “/home/zhouhang/miniconda3/envs/ray_env2/lib/python3.10/site-packages/ray/serve/_private/replica.py”, line 867, in _wrap_user_method_call
(ServeReplica:default:StableDiffusionV2 pid=1965516) yield status_code_callback
(ServeReplica:default:StableDiffusionV2 pid=1965516) File “/home/zhouhang/miniconda3/envs/ray_env2/lib/python3.10/site-packages/ray/serve/_private/replica.py”, line 639, in handle_request_with_rejection
(ServeReplica:default:StableDiffusionV2 pid=1965516) async for result in self._call_user_generator(
(ServeReplica:default:StableDiffusionV2 pid=1965516) File “/home/zhouhang/miniconda3/envs/ray_env2/lib/python3.10/site-packages/ray/serve/_private/replica.py”, line 576, in _call_user_generator
(ServeReplica:default:StableDiffusionV2 pid=1965516) raise e from None
(ServeReplica:default:StableDiffusionV2 pid=1965516) File “/home/zhouhang/miniconda3/envs/ray_env2/lib/python3.10/site-packages/ray/serve/_private/replica.py”, line 1595, in call_user_method
(ServeReplica:default:StableDiffusionV2 pid=1965516) result, sync_gen_consumed = await self._call_func_or_gen(
(ServeReplica:default:StableDiffusionV2 pid=1965516) File “/home/zhouhang/miniconda3/envs/ray_env2/lib/python3.10/site-packages/ray/serve/_private/replica.py”, line 1313, in _call_func_or_gen
(ServeReplica:default:StableDiffusionV2 pid=1965516) result = await result
(ServeReplica:default:StableDiffusionV2 pid=1965516) File “/home/zhouhang/miniconda3/envs/ray_env2/lib/python3.10/site-packages/ray/serve/_private/http_util.py”, line 497, in call
(ServeReplica:default:StableDiffusionV2 pid=1965516) await self._asgi_app(
(ServeReplica:default:StableDiffusionV2 pid=1965516) File “/home/zhouhang/.local/lib/python3.10/site-packages/fastapi/applications.py”, line 1054, in call
(ServeReplica:default:StableDiffusionV2 pid=1965516) await super().call(scope, receive, send)
(ServeReplica:default:StableDiffusionV2 pid=1965516) File “/home/zhouhang/.local/lib/python3.10/site-packages/starlette/applications.py”, line 112, in call
(ServeReplica:default:StableDiffusionV2 pid=1965516) await self.middleware_stack(scope, receive, send)
(ServeReplica:default:StableDiffusionV2 pid=1965516) File “/home/zhouhang/.local/lib/python3.10/site-packages/starlette/middleware/errors.py”, line 187, in call
(ServeReplica:default:StableDiffusionV2 pid=1965516) raise exc
(ServeReplica:default:StableDiffusionV2 pid=1965516) File “/home/zhouhang/.local/lib/python3.10/site-packages/starlette/middleware/errors.py”, line 165, in call
(ServeReplica:default:StableDiffusionV2 pid=1965516) await self.app(scope, receive, _send)
(ServeReplica:default:StableDiffusionV2 pid=1965516) File “/home/zhouhang/.local/lib/python3.10/site-packages/starlette/middleware/exceptions.py”, line 62, in call
(ServeReplica:default:StableDiffusionV2 pid=1965516) await wrap_app_handling_exceptions(self.app, conn)(scope, receive, send)
(ServeReplica:default:StableDiffusionV2 pid=1965516) File “/home/zhouhang/.local/lib/python3.10/site-packages/starlette/_exception_handler.py”, line 53, in wrapped_app
(ServeReplica:default:StableDiffusionV2 pid=1965516) raise exc
(ServeReplica:default:StableDiffusionV2 pid=1965516) File “/home/zhouhang/.local/lib/python3.10/site-packages/starlette/_exception_handler.py”, line 42, in wrapped_app
(ServeReplica:default:StableDiffusionV2 pid=1965516) await app(scope, receive, sender)
(ServeReplica:default:StableDiffusionV2 pid=1965516) File “/home/zhouhang/.local/lib/python3.10/site-packages/starlette/routing.py”, line 715, in call
(ServeReplica:default:StableDiffusionV2 pid=1965516) await self.middleware_stack(scope, receive, send)
(ServeReplica:default:StableDiffusionV2 pid=1965516) File “/home/zhouhang/.local/lib/python3.10/site-packages/starlette/routing.py”, line 735, in app
(ServeReplica:default:StableDiffusionV2 pid=1965516) await route.handle(scope, receive, send)
(ServeReplica:default:StableDiffusionV2 pid=1965516) File “/home/zhouhang/.local/lib/python3.10/site-packages/starlette/routing.py”, line 288, in handle
(ServeReplica:default:StableDiffusionV2 pid=1965516) await self.app(scope, receive, send)
(ServeReplica:default:StableDiffusionV2 pid=1965516) File “/home/zhouhang/.local/lib/python3.10/site-packages/starlette/routing.py”, line 76, in app
(ServeReplica:default:StableDiffusionV2 pid=1965516) await wrap_app_handling_exceptions(app, request)(scope, receive, send)
(ServeReplica:default:StableDiffusionV2 pid=1965516) File “/home/zhouhang/.local/lib/python3.10/site-packages/starlette/_exception_handler.py”, line 53, in wrapped_app
(ServeReplica:default:StableDiffusionV2 pid=1965516) raise exc
(ServeReplica:default:StableDiffusionV2 pid=1965516) File “/home/zhouhang/.local/lib/python3.10/site-packages/starlette/_exception_handler.py”, line 42, in wrapped_app
(ServeReplica:default:StableDiffusionV2 pid=1965516) await app(scope, receive, sender)
(ServeReplica:default:StableDiffusionV2 pid=1965516) File “/home/zhouhang/.local/lib/python3.10/site-packages/starlette/routing.py”, line 73, in app
(ServeReplica:default:StableDiffusionV2 pid=1965516) response = await f(request)
(ServeReplica:default:StableDiffusionV2 pid=1965516) File “/home/zhouhang/.local/lib/python3.10/site-packages/fastapi/routing.py”, line 301, in app
(ServeReplica:default:StableDiffusionV2 pid=1965516) raw_response = await run_endpoint_function(
(ServeReplica:default:StableDiffusionV2 pid=1965516) File “/home/zhouhang/.local/lib/python3.10/site-packages/fastapi/routing.py”, line 214, in run_endpoint_function
(ServeReplica:default:StableDiffusionV2 pid=1965516) return await run_in_threadpool(dependant.call, **values)
(ServeReplica:default:StableDiffusionV2 pid=1965516) File “/home/zhouhang/.local/lib/python3.10/site-packages/starlette/concurrency.py”, line 37, in run_in_threadpool
(ServeReplica:default:StableDiffusionV2 pid=1965516) return await anyio.to_thread.run_sync(func)
(ServeReplica:default:StableDiffusionV2 pid=1965516) File “/home/zhouhang/.local/lib/python3.10/site-packages/anyio/to_thread.py”, line 56, in run_sync
(ServeReplica:default:StableDiffusionV2 pid=1965516) return await get_async_backend().run_sync_in_worker_thread(
(ServeReplica:default:StableDiffusionV2 pid=1965516) File “/home/zhouhang/.local/lib/python3.10/site-packages/anyio/_backends/_asyncio.py”, line 2461, in run_sync_in_worker_thread
(ServeReplica:default:StableDiffusionV2 pid=1965516) return await future
(ServeReplica:default:StableDiffusionV2 pid=1965516) File “/home/zhouhang/.local/lib/python3.10/site-packages/anyio/_backends/_asyncio.py”, line 962, in run
(ServeReplica:default:StableDiffusionV2 pid=1965516) result = context.run(func, *args)
(ServeReplica:default:StableDiffusionV2 pid=1965516) File “/home/zhouhang/miniconda3/envs/fluxenv2/flux/ray/singleserve/./sd.py”, line 36, in generate
(ServeReplica:default:StableDiffusionV2 pid=1965516) image = self.pipe(prompt, height=img_size, width=img_size).images[0]
(ServeReplica:default:StableDiffusionV2 pid=1965516) File “/home/zhouhang/.local/lib/python3.10/site-packages/torch/utils/_contextlib.py”, line 116, in decorate_context
(ServeReplica:default:StableDiffusionV2 pid=1965516) return func(*args, **kwargs)
(ServeReplica:default:StableDiffusionV2 pid=1965516) File “/home/zhouhang/miniconda3/envs/ray_env2/lib/python3.10/site-packages/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py”, line 1055, in call
(ServeReplica:default:StableDiffusionV2 pid=1965516) latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
(ServeReplica:default:StableDiffusionV2 pid=1965516) File “/home/zhouhang/miniconda3/envs/ray_env2/lib/python3.10/site-packages/diffusers/schedulers/scheduling_unipc_multistep.py”, line 1005, in step
(ServeReplica:default:StableDiffusionV2 pid=1965516) assert self.this_order > 0
(ServeReplica:default:StableDiffusionV2 pid=1965516) AssertionError
(ServeReplica:default:StableDiffusionV2 pid=1965516) INFO 2025-02-13 07:15:01,325 default_StableDiffusionV2 9qg6nbx0 d92fb15e-6ce8-45da-b65b-905c69206753 – POST /generate 500 1651.5ms
(ServeReplica:default:StableDiffusionV2 pid=1965516) ERROR 2025-02-13 07:15:01,330 default_StableDiffusionV2 9qg6nbx0 33a6d25e-e3f5-4cc5-badb-cfce73609d62 – Request failed.
(ServeReplica:default:StableDiffusionV2 pid=1965516) Traceback (most recent call last):
(ServeReplica:default:StableDiffusionV2 pid=1965516) File “/home/zhouhang/miniconda3/envs/ray_env2/lib/python3.10/site-packages/ray/serve/_private/replica.py”, line 465, in _handle_errors_and_metrics
(ServeReplica:default:StableDiffusionV2 pid=1965516) yield _status_code_callback
(ServeReplica:default:StableDiffusionV2 pid=1965516) File “/home/zhouhang/miniconda3/envs/ray_env2/lib/python3.10/site-packages/ray/serve/_private/replica.py”, line 867, in _wrap_user_method_call
(ServeReplica:default:StableDiffusionV2 pid=1965516) yield status_code_callback
(ServeReplica:default:StableDiffusionV2 pid=1965516) File “/home/zhouhang/miniconda3/envs/ray_env2/lib/python3.10/site-packages/ray/serve/_private/replica.py”, line 639, in handle_request_with_rejection
(ServeReplica:default:StableDiffusionV2 pid=1965516) async for result in self._call_user_generator(
(ServeReplica:default:StableDiffusionV2 pid=1965516) File “/home/zhouhang/miniconda3/envs/ray_env2/lib/python3.10/site-packages/ray/serve/_private/replica.py”, line 576, in _call_user_generator
(ServeReplica:default:StableDiffusionV2 pid=1965516) raise e from None
(ServeReplica:default:StableDiffusionV2 pid=1965516) File “/home/zhouhang/miniconda3/envs/ray_env2/lib/python3.10/site-packages/ray/serve/_private/replica.py”, line 1595, in call_user_method
(ServeReplica:default:StableDiffusionV2 pid=1965516) result, sync_gen_consumed = await self._call_func_or_gen(
(ServeReplica:default:StableDiffusionV2 pid=1965516) File “/home/zhouhang/miniconda3/envs/ray_env2/lib/python3.10/site-packages/ray/serve/_private/replica.py”, line 1313, in _call_func_or_gen
(ServeReplica:default:StableDiffusionV2 pid=1965516) result = await result
(ServeReplica:default:StableDiffusionV2 pid=1965516) File “/home/zhouhang/miniconda3/envs/ray_env2/lib/python3.10/site-packages/ray/serve/_private/http_util.py”, line 497, in call
(ServeReplica:default:StableDiffusionV2 pid=1965516) await self._asgi_app(
(ServeReplica:default:StableDiffusionV2 pid=1965516) File “/home/zhouhang/.local/lib/python3.10/site-packages/fastapi/applications.py”, line 1054, in call
(ServeReplica:default:StableDiffusionV2 pid=1965516) await super().call(scope, receive, send)
(ServeReplica:default:StableDiffusionV2 pid=1965516) File “/home/zhouhang/.local/lib/python3.10/site-packages/starlette/applications.py”, line 112, in call
(ServeReplica:default:StableDiffusionV2 pid=1965516) await self.middleware_stack(scope, receive, send)
(ServeReplica:default:StableDiffusionV2 pid=1965516) File “/home/zhouhang/.local/lib/python3.10/site-packages/starlette/middleware/errors.py”, line 187, in call
(ServeReplica:default:StableDiffusionV2 pid=1965516) raise exc
(ServeReplica:default:StableDiffusionV2 pid=1965516) File “/home/zhouhang/.local/lib/python3.10/site-packages/starlette/middleware/errors.py”, line 165, in call
(ServeReplica:default:StableDiffusionV2 pid=1965516) await self.app(scope, receive, _send)
(ServeReplica:default:StableDiffusionV2 pid=1965516) File “/home/zhouhang/.local/lib/python3.10/site-packages/starlette/middleware/exceptions.py”, line 62, in call
(ServeReplica:default:StableDiffusionV2 pid=1965516) await wrap_app_handling_exceptions(self.app, conn)(scope, receive, send)
(ServeReplica:default:StableDiffusionV2 pid=1965516) File “/home/zhouhang/.local/lib/python3.10/site-packages/starlette/_exception_handler.py”, line 53, in wrapped_app
(ServeReplica:default:StableDiffusionV2 pid=1965516) raise exc
(ServeReplica:default:StableDiffusionV2 pid=1965516) File “/home/zhouhang/.local/lib/python3.10/site-packages/starlette/_exception_handler.py”, line 42, in wrapped_app
(ServeReplica:default:StableDiffusionV2 pid=1965516) await app(scope, receive, sender)
(ServeReplica:default:StableDiffusionV2 pid=1965516) File “/home/zhouhang/.local/lib/python3.10/site-packages/starlette/routing.py”, line 715, in call
(ServeReplica:default:StableDiffusionV2 pid=1965516) await self.middleware_stack(scope, receive, send)
(ServeReplica:default:StableDiffusionV2 pid=1965516) File “/home/zhouhang/.local/lib/python3.10/site-packages/starlette/routing.py”, line 735, in app
(ServeReplica:default:StableDiffusionV2 pid=1965516) await route.handle(scope, receive, send)
(ServeReplica:default:StableDiffusionV2 pid=1965516) File “/home/zhouhang/.local/lib/python3.10/site-packages/starlette/routing.py”, line 288, in handle
(ServeReplica:default:StableDiffusionV2 pid=1965516) await self.app(scope, receive, send)
(ServeReplica:default:StableDiffusionV2 pid=1965516) File “/home/zhouhang/.local/lib/python3.10/site-packages/starlette/routing.py”, line 76, in app
(ServeReplica:default:StableDiffusionV2 pid=1965516) await wrap_app_handling_exceptions(app, request)(scope, receive, send)
(ServeReplica:default:StableDiffusionV2 pid=1965516) File “/home/zhouhang/.local/lib/python3.10/site-packages/starlette/_exception_handler.py”, line 53, in wrapped_app
(ServeReplica:default:StableDiffusionV2 pid=1965516) raise exc
(ServeReplica:default:StableDiffusionV2 pid=1965516) File “/home/zhouhang/.local/lib/python3.10/site-packages/starlette/_exception_handler.py”, line 42, in wrapped_app
(ServeReplica:default:StableDiffusionV2 pid=1965516) await app(scope, receive, sender)
(ServeReplica:default:StableDiffusionV2 pid=1965516) File “/home/zhouhang/.local/lib/python3.10/site-packages/starlette/routing.py”, line 73, in app
(ServeReplica:default:StableDiffusionV2 pid=1965516) response = await f(request)
(ServeReplica:default:StableDiffusionV2 pid=1965516) File “/home/zhouhang/.local/lib/python3.10/site-packages/fastapi/routing.py”, line 301, in app
(ServeReplica:default:StableDiffusionV2 pid=1965516) raw_response = await run_endpoint_function(
(ServeReplica:default:StableDiffusionV2 pid=1965516) File “/home/zhouhang/.local/lib/python3.10/site-packages/fastapi/routing.py”, line 214, in run_endpoint_function
(ServeReplica:default:StableDiffusionV2 pid=1965516) return await run_in_threadpool(dependant.call, **values)
(ServeReplica:default:StableDiffusionV2 pid=1965516) File “/home/zhouhang/.local/lib/python3.10/site-packages/starlette/concurrency.py”, line 37, in run_in_threadpool
(ServeReplica:default:StableDiffusionV2 pid=1965516) return await anyio.to_thread.run_sync(func)
(ServeReplica:default:StableDiffusionV2 pid=1965516) File “/home/zhouhang/.local/lib/python3.10/site-packages/anyio/to_thread.py”, line 56, in run_sync
(ServeReplica:default:StableDiffusionV2 pid=1965516) return await get_async_backend().run_sync_in_worker_thread(
(ServeReplica:default:StableDiffusionV2 pid=1965516) File “/home/zhouhang/.local/lib/python3.10/site-packages/anyio/_backends/_asyncio.py”, line 2461, in run_sync_in_worker_thread
(ServeReplica:default:StableDiffusionV2 pid=1965516) return await future
(ServeReplica:default:StableDiffusionV2 pid=1965516) File “/home/zhouhang/.local/lib/python3.10/site-packages/anyio/_backends/_asyncio.py”, line 962, in run
(ServeReplica:default:StableDiffusionV2 pid=1965516) result = context.run(func, *args)
(ServeReplica:default:StableDiffusionV2 pid=1965516) File “/home/zhouhang/miniconda3/envs/fluxenv2/flux/ray/singleserve/./sd.py”, line 36, in generate
(ServeReplica:default:StableDiffusionV2 pid=1965516) image = self.pipe(prompt, height=img_size, width=img_size).images[0]
(ServeReplica:default:StableDiffusionV2 pid=1965516) File “/home/zhouhang/.local/lib/python3.10/site-packages/torch/utils/_contextlib.py”, line 116, in decorate_context
(ServeReplica:default:StableDiffusionV2 pid=1965516) return func(*args, **kwargs)
(ServeReplica:default:StableDiffusionV2 pid=1965516) File “/home/zhouhang/miniconda3/envs/ray_env2/lib/python3.10/site-packages/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py”, line 1055, in call
(ServeReplica:default:StableDiffusionV2 pid=1965516) latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
(ServeReplica:default:StableDiffusionV2 pid=1965516) File “/home/zhouhang/miniconda3/envs/ray_env2/lib/python3.10/site-packages/diffusers/schedulers/scheduling_unipc_multistep.py”, line 1005, in step
(ServeReplica:default:StableDiffusionV2 pid=1965516) assert self.this_order > 0
(ServeReplica:default:StableDiffusionV2 pid=1965516) AssertionError
(ServeReplica:default:StableDiffusionV2 pid=1965516) INFO 2025-02-13 07:15:01,331 default_StableDiffusionV2 9qg6nbx0 33a6d25e-e3f5-4cc5-badb-cfce73609d62 – POST /generate 500 1655.4ms

Hi @hang_zhou, welcome to the community! Concurrency / multi-threading doesn’t play nicely with DiffusionPipelines mainly because the pipeline scheduler isn’t thread safe, hence the step error you’re seeing in scheduling_unipc_multistep.py. Here are some possible workarounds you can try:

  1. Set max_ongoing_requests=1 in your deployment options (Configure Ray Serve deployments — Ray 2.42.1). This will ensure each replica only handles one request at a time.
  2. Batch the requests into one call to the model (Dynamic Request Batching — Ray 2.42.1).
  3. On every request, create a new pipeline and scheduler from a shared pipeline so that the concurrent requests won’t interfere with each other. Since they’re created from a shared pipeline, you’re still only loading the model once into the GPU. Here’s an example application code:
import logging
from io import BytesIO

import ray
import torch
from diffusers import StableDiffusionPipeline, UniPCMultistepScheduler
from fastapi import FastAPI
from fastapi.responses import Response
from ray import serve

logger = logging.getLogger("ray.serve")
app = FastAPI()


@serve.deployment(
    autoscaling_config={"min_replicas": 0, "max_replicas": 2},
    ray_actor_options={"num_gpus": 0.5},
)
@serve.ingress(app)
class StableDiffusionV2:
    """Stable Diffusion deployment that builds a fresh pipeline per request.

    One pipeline is loaded in ``__init__`` and kept as ``shared_pipe``. Each
    call to ``generate`` derives its own scheduler and pipeline from it, so
    concurrent requests do not step the same scheduler object.
    """

    def __init__(self):
        """Load the shared float16 pipeline with a UniPC scheduler onto CUDA."""
        base_pipe = StableDiffusionPipeline.from_pretrained(
            pretrained_model_name_or_path="your-model-here",
            torch_dtype=torch.float16,
        )
        base_pipe.scheduler = UniPCMultistepScheduler.from_config(base_pipe.scheduler.config)
        self.shared_pipe = base_pipe.to("cuda")

        # Initialize linalg up front: https://github.com/pytorch/pytorch/issues/90613
        torch.inverse(torch.ones((1, 1), device="cuda:0"))

    @app.post("/generate")
    def generate(self, prompt: str, img_size: int = 512):
        """Generate one ``img_size`` x ``img_size`` image and return it as PNG.

        Returns a dict with an "error" key when ``prompt`` is empty; otherwise
        a ``Response`` carrying the PNG bytes.
        """
        if not prompt:
            return {"error": "Prompt cannot be empty"}

        # Per-request scheduler and pipeline derived from the shared pipeline.
        shared = self.shared_pipe
        request_scheduler = shared.scheduler.from_config(shared.scheduler.config)
        request_pipe = StableDiffusionPipeline.from_pipe(shared, scheduler=request_scheduler)

        # Run inference under torch.autocast for performance.
        with torch.autocast("cuda"):
            result = request_pipe(prompt, height=img_size, width=img_size)
        first_image = result.images[0]

        # Encode the image as PNG in memory so it can be sent in the response.
        png_buffer = BytesIO()
        first_image.save(png_buffer, format="PNG")

        # Return the image data
        return Response(content=png_buffer.getvalue(), media_type="image/png")

# Create and expose the model's backend and endpoint
sd_app = StableDiffusionV2.bind()