I just tried deploying 2 models on 2 nodes, then ran load tests against them.
Individually, one of the models was handling about 178 RPS, but when I test both models at the same time, the RPS drops noticeably.
model1.py
from io import BytesIO

import numpy as np
import ray
import torch
from fastapi import FastAPI
from fastapi.responses import Response
from PIL import Image
from ray import serve
from ray.runtime_env import RuntimeEnv
from ray.serve.handle import DeploymentHandle

app = FastAPI()


@serve.deployment(num_replicas=1)
@serve.ingress(app)
class APIIngressOD:
    def __init__(self, object_detection_handle) -> None:
        self.handle: DeploymentHandle = object_detection_handle.options(
            use_new_handle_api=True,
        )

    @app.get(
        "/",
        responses={200: {"content": {"image/jpeg": {}}}},
        response_class=Response,
    )
    async def detect(self, image_url: str):
        image = await self.handle.detect.remote(image_url)
        file_stream = BytesIO()
        image.save(file_stream, "jpeg")
        return Response(content=file_stream.getvalue(), media_type="image/jpeg")


@serve.deployment(
    ray_actor_options={"num_gpus": 0.25},
    autoscaling_config={"min_replicas": 2, "max_replicas": 4, "downscale_delay_s": 60},
)
class ObjectDetection:
    def __init__(self):
        self.model = torch.hub.load("ultralytics/yolov5", "yolov5s")
        self.model.cuda()

    # Placeholder for batched inference: @serve.batch expects an async method,
    # and this is not wired into detect() yet.
    @serve.batch(max_batch_size=3, batch_wait_timeout_s=0.1)
    async def _infer_batch(self, requests):
        pass

    def detect(self, image_url: str):
        result_im = self.model(image_url)
        return Image.fromarray(result_im.render()[0].astype(np.uint8))


app = APIIngressOD.bind(ObjectDetection.bind())
serve.run(app, name="object_detection", route_prefix="/detect")
model2.py
import ray
import torch
from fastapi import FastAPI
from ray import serve
from ray.runtime_env import RuntimeEnv
from ray.serve.handle import DeploymentHandle
from transformers import pipeline

app = FastAPI()


@serve.deployment(num_replicas=1)
@serve.ingress(app)
class APIIngress:
    def __init__(self, distilbert_model_handle) -> None:
        self.handle: DeploymentHandle = distilbert_model_handle.options(
            use_new_handle_api=True,
        )

    @app.get("/")
    async def classify(self, sentence: str):
        return await self.handle.classify.remote(sentence)


@serve.deployment(
    ray_actor_options={"num_gpus": 0.70},
    autoscaling_config={"min_replicas": 1, "max_replicas": 2},
)
class DistilBertModel:
    def __init__(self):
        self.classifier = pipeline(
            "sentiment-analysis",
            model="distilbert-base-uncased",
            framework="pt",
            device=torch.device("cuda:0"),
        )

    def classify(self, sentence: str):
        return self.classifier(sentence)


app = APIIngress.bind(DistilBertModel.bind())
serve.run(app, name="classification", route_prefix="/classify")
load_test.py
from locust import HttpUser, task


class WebsiteUser(HttpUser):
    @task
    def detect(self):
        image_url = "https://ultralytics.com/images/zidane.jpg"
        self.client.get(f"http://127.0.0.1:8000/detect?image_url={image_url}")

    @task
    def classify(self):
        prompt = "This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three."
        sentence = "%20".join(prompt.split(" "))
        self.client.get(f"http://127.0.0.1:8000/classify?sentence={sentence}")
Running test #1
RAY_ADDRESS='http://127.0.0.1:8265' ray job submit --working-dir . -- python model1.py
RAY_ADDRESS='http://127.0.0.1:8265' ray job submit --working-dir . -- python model2.py
python -m locust -f load_test.py
Running test #2
Same settings as test #1.
Can you tell me whether it is possible to increase the number of workers for the Serve controller or the HTTP proxy? Or is there another way to solve this problem?
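For context, this is the kind of change I've been considering on my side, in case the single-replica ingress deployments (rather than the controller/proxy) are the bottleneck. It's only a minimal sketch, assuming num_replicas and max_concurrent_queries are the right knobs on the Ray version I'm running; I'm not sure this is the correct fix, hence the question:

from fastapi import FastAPI
from ray import serve

app = FastAPI()

# Sketch only: scale out the thin HTTP ingress and raise the per-replica
# limit on in-flight requests. max_concurrent_queries is the option name
# in the Ray version I'm on; this may not be the right knob at all.
@serve.deployment(num_replicas=2, max_concurrent_queries=200)
@serve.ingress(app)
class APIIngressOD:
    def __init__(self, object_detection_handle) -> None:
        self.handle = object_detection_handle.options(use_new_handle_api=True)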