Hi. I’m trying to run multiple Diffusers models for inference on a single GPU without hitting CUDA out-of-memory (OOM) errors.
I modified the Stable Diffusion and FastAPI tutorial (Serving a Stable Diffusion Model — Ray 2.8.0) to serve both Stable Diffusion XL 1.0 and Kandinsky 2.2 behind one ingress. The code is below.
Unfortunately, after I start Ray Serve (serve run app_name:entrypoint &), inference completes fine for whichever model is requested first (either one), but a subsequent request to the other model crashes with an OOM: neither the self.stable_diffusion_xl_1_0 deployment nor the kandinsky one releases its GPU memory.
Could anyone point me at the correct way to release memory from @serve.deployment replicas? I also tried building plain actors with the @ray.remote syntax instead, but every variant I came up with crashed, so I’m not sure what to do.
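For illustration, this is the kind of per-replica cleanup I was expecting to be able to write somewhere (just a sketch, not code from my app; I don’t know whether Serve actually calls __del__ when a replica is scaled down, and the torch.cuda.empty_cache() call is a guess):

import torch
from ray import serve

@serve.deployment(ray_actor_options={"num_gpus": 1})
class CleanupSketch:  # hypothetical name, not part of my app
    def __init__(self):
        self.pipe = None  # stand-in for a loaded Diffusers pipeline

    def __del__(self):
        # hoped-for teardown: drop the pipeline and hand cached
        # CUDA blocks back so the next replica can allocate them
        del self.pipe
        torch.cuda.empty_cache()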
from typing import Any

from fastapi import FastAPI, Response
from ray import serve
from ray.serve.handle import DeploymentHandle

app = FastAPI()


@serve.deployment(
    ray_actor_options={"num_cpus": 0, "num_gpus": 0},
    autoscaling_config={"min_replicas": 0, "max_replicas": 1},
)
class TextToImage:
    def __init__(
        self,
        stable_diffusion_xl_1_0_model_handle,
        kandinsky_2_2_model_handle,
    ):
        self.stable_diffusion_xl_1_0: DeploymentHandle = (
            stable_diffusion_xl_1_0_model_handle.options(
                use_new_handle_api=True,
            )
        )
        self.kandinsky_2_2: DeploymentHandle = kandinsky_2_2_model_handle.options(
            use_new_handle_api=True,
        )

    async def generate_for_task(self, task: Any):
        inputs = task.get("inputs")
        ai_model = inputs.get("ai_model", "")
        output = None  # was previously unbound if neither branch matched
        if ai_model == "stabilityai/stable-diffusion-xl-base-1.0":
            output = await self.stable_diffusion_xl_1_0.generate_for_task.remote(task)
        elif ai_model == "kandinsky-2-2":
            output = await self.kandinsky_2_2.generate_for_task.remote(task)
        print("TextToImage.generate_for_task output", output)
        return output
@serve.deployment(
    ray_actor_options={"num_gpus": 1, "num_cpus": 0},
    autoscaling_config={"min_replicas": 0, "max_replicas": 1},
)
class StableDiffusionXL_1_0:
    def __init__(self):
        ...  # pipeline loading elided; see the note below Kandinsky_2_2
@serve.deployment(
    ray_actor_options={"num_gpus": 1, "num_cpus": 0},
    autoscaling_config={"min_replicas": 0, "max_replicas": 1},
)
class Kandinsky_2_2:
    def __init__(self):
        ...  # pipeline loading elided; see the note below
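# NOTE: both elided __init__ bodies follow the loading pattern from the
# tutorial I adapted; roughly (the dtype argument reflects my setup, so
# treat it as an assumption):
#
#     from diffusers import DiffusionPipeline
#     self.pipe = DiffusionPipeline.from_pretrained(
#         "stabilityai/stable-diffusion-xl-base-1.0",  # Kandinsky uses its repo id
#         torch_dtype=torch.float16,
#     )
#     self.pipe.to("cuda")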
@serve.deployment(num_replicas=1)
@serve.ingress(app)
class APIIngress:
    def __init__(self, text_to_image_model_handle: DeploymentHandle):
        self.text_to_image: DeploymentHandle = text_to_image_model_handle.options(
            use_new_handle_api=True,
        )

    @app.get(
        "/run_task",
        responses={200: {"content": {"text/plain": {}}}},
        response_class=Response,
    )
    async def generate(self, task_id: str, attempts: int = 0):
        task = get_task(task_id)  # get_task is our own task-store lookup (omitted)
        output = await self.text_to_image.generate_for_task.remote(task)
        return Response(content="test success", media_type="text/plain")
sdxl_responder = StableDiffusionXL_1_0.bind()
kandinsky_2_2 = Kandinsky_2_2.bind()
t2i_responder = TextToImage.bind(
sdxl_responder, kandinsky_2_2
)
entrypoint = APIIngress.bind(
t2i_responder,
)
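For reference, this is how I trigger the crash once the app is up (the task ids are placeholders; in my setup they come from our task store, and each task's inputs["ai_model"] selects the model):

import requests

sdxl_task_id = "..."       # a task whose inputs["ai_model"] is the SDXL id
kandinsky_task_id = "..."  # a task whose inputs["ai_model"] is "kandinsky-2-2"

# The first request works: the SDXL replica spins up and fills most of the GPU.
requests.get("http://127.0.0.1:8000/run_task", params={"task_id": sdxl_task_id})

# The next request OOMs: the Kandinsky replica tries to load its pipeline
# while the SDXL replica is still holding its memory.
requests.get("http://127.0.0.1:8000/run_task", params={"task_id": kandinsky_task_id})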
Thanks for any tips on how to correctly release memory for this use case.