Hello, I’m trying to autoscale my application with Ray. The problem is that Ray launches multiple replicas of the application to handle the traffic, but this exceeds the amount of RAM that my machine has.
For example, I serve a translation model, and to handle the current traffic, Ray launches 11 translation models at the same time. But I also serve other models (like TTS), so it will also add more TTS models… and that is more than my RAM can hold.
I have tried using ray.remote as a decorator to limit Ray, but it doesn’t seem to take it into account. In the end, Ray ends up killing applications, and sometimes it crashes my computer.
Here is my code:
from fastapi import FastAPI, UploadFile
from engine.engine import Engine
from ray import serve
import pipeline
import yaml
import ray
import os
import io
# Initialise Ray before any Serve deployment is declared or run.
ray.init()

# Load the pipeline settings once at import time. The context manager closes
# the file handle deterministically instead of leaving it to the garbage
# collector.
with open('config.yaml', 'r') as config_file:
    config = yaml.safe_load(config_file)

# FastAPI application handed to serve.ingress() by the deployments in this file.
# NOTE(review): every deployment in this file registers GET "/" on this one
# shared app -- confirm each route prefix reaches the intended handler, or give
# each deployment its own FastAPI instance.
app = FastAPI()
@serve.deployment(
    name="Translator",
    # Reserve 1500 MiB of Ray's logical "memory" resource per replica. The
    # scheduler stops placing new replicas once the node's memory resource is
    # used up, which bounds autoscaling by RAM and not only by CPU count.
    # This is a scheduling reservation, not a hard limit on the process.
    ray_actor_options={"memory": 1500 * 1024 * 1024},
    autoscaling_config={
        "target_num_ongoing_requests_per_replica": 1,
        "min_replicas": 0,
        "initial_replicas": 1,
        # os.cpu_count() may return None; fall back to a single replica.
        "max_replicas": os.cpu_count() or 1,
    },
)
@serve.ingress(app)
class TranslatorApi:
    """Serve deployment exposing the translation pipeline over HTTP.

    Each replica builds its own ``Engine`` and ``TranslationPipeline`` in
    ``__init__``, so every replica holds a full copy of the model in memory.
    """

    def __init__(self) -> None:
        engine = Engine(config)
        self.translation = pipeline.TranslationPipeline(engine)

    # The former @ray.remote(memory=...) decorator on this method was removed:
    # it only rebound the class attribute to a remote function that was never
    # invoked, so no memory was reserved for the replica. The reservation now
    # lives in ray_actor_options on the deployment.
    @app.get("/")
    def translatation_route(self, source: str, target: str, text: str):
        """Translate ``text`` from ``source`` to ``target`` and return the result."""
        return self.translation.translate(source, target, text)
@serve.deployment(
    name="TextToSpeech",
    # Reserve 1500 MiB of Ray's logical "memory" resource per replica so the
    # scheduler stops adding replicas once the node's memory resource is used
    # up. This is a scheduling reservation, not a hard limit on the process.
    ray_actor_options={"memory": 1500 * 1024 * 1024},
    autoscaling_config={
        "target_num_ongoing_requests_per_replica": 1,
        "min_replicas": 0,
        "initial_replicas": 1,
        # os.cpu_count() may return None; fall back to a single replica.
        "max_replicas": os.cpu_count() or 1,
    },
)
@serve.ingress(app)
class TTSApi:
    """Serve deployment exposing the text-to-speech pipeline over HTTP.

    Each replica builds its own ``Engine`` and ``TextToSpeechPipeline`` in
    ``__init__``, so every replica holds a full copy of the model in memory.
    """

    def __init__(self) -> None:
        engine = Engine(config)
        self.tts = pipeline.TextToSpeechPipeline(engine)

    # The former @ray.remote(memory=...) decorator on this method was removed:
    # it only rebound the class attribute to a remote function that was never
    # invoked, so no memory was reserved for the replica. The reservation now
    # lives in ray_actor_options on the deployment.
    @app.get("/")
    def tts_route(self, text: str, language: str = "en"):
        """Generate speech for ``text`` in ``language`` and return the pipeline output."""
        bs64_audio = self.tts.generate(text, language)
        return bs64_audio
@serve.deployment(
    name="SpeechToText",
    # Reserve 1500 MiB of Ray's logical "memory" resource per replica so the
    # scheduler stops adding replicas once the node's memory resource is used
    # up. This is a scheduling reservation, not a hard limit on the process.
    ray_actor_options={"memory": 1500 * 1024 * 1024},
    autoscaling_config={
        "target_num_ongoing_requests_per_replica": 1,
        "min_replicas": 0,
        "initial_replicas": 1,
        # os.cpu_count() may return None; fall back to a single replica.
        "max_replicas": os.cpu_count() or 1,
    },
)
@serve.ingress(app)
class STTApi:
    """Serve deployment exposing the speech-to-text pipeline over HTTP.

    Each replica builds its own ``Engine`` and ``SpeechToTextPipeline`` in
    ``__init__``, so every replica holds a full copy of the model in memory.
    """

    def __init__(self) -> None:
        engine = Engine(config)
        self.stt = pipeline.SpeechToTextPipeline(engine)

    # The former @ray.remote(memory=...) decorator on this method was removed:
    # it only rebound the class attribute to a remote function that was never
    # invoked, so no memory was reserved for the replica. The reservation now
    # lives in ray_actor_options on the deployment.
    # NOTE(review): this route accepts an uploaded file but is registered with
    # GET -- confirm clients can send the file body this way, or consider POST.
    @app.get("/")
    def stt_route(self, audio: UploadFile):
        """Read the uploaded audio fully into memory and run it through the pipeline."""
        return self.stt.generate(
            io.BytesIO(audio.file.read())
        )
# Deploy each API as its own named Serve application under a dedicated route
# prefix, in the order translation, text-to-speech, speech-to-text.
for deployment, prefix, app_name in (
    (TranslatorApi, "/translate", "Translator"),
    (TTSApi, "/tts", "TextToSpeech"),
    (STTApi, "/stt", "SpeechToText"),
):
    serve.run(deployment.bind(), route_prefix=prefix, name=app_name)
I’m a beginner with Ray, so maybe I just did something wrong; any help is much appreciated.