Test:
import base64
import concurrent.futures
import logging
import time
import numpy as np
import requests
import torch
from ray import serve
from torchaudio.transforms import Resample as TResample
from transformers import pipeline
# Logger registered under the "ray.serve" name.
logger = logging.getLogger("ray.serve")
# Limit PyTorch to a single thread for intra-op CPU parallelism.
torch.set_num_threads(1)
@serve.deployment(ray_actor_options={"num_gpus": 0.1, "num_cpus": 1}, num_replicas=2)
class Preprocessor:
    """Serve deployment that decodes base64 audio and resamples it on the GPU.

    The decoded bytes are interpreted directly as 16-bit integer samples; no
    container header is parsed here.
    """

    def __init__(self):
        """Create the 8000 -> 16000 resampler and move it to the CUDA device."""
        resampler = TResample(
            orig_freq=8000,
            new_freq=16000,
            dtype=torch.float32,
            resampling_method="sinc_interp_kaiser",
        )
        self.default_resampler = resampler.to("cuda")

    def __call__(self, audio):
        """Turn a base64 payload into resampled float32 samples.

        Args:
            audio: Base64-encoded bytes, read as int16 samples after decoding.

        Returns:
            A NumPy array holding the resampled signal, copied back to the CPU.
        """
        pcm = np.frombuffer(base64.b64decode(audio), dtype=np.int16)
        # Scale by the largest int16 value so the floats are roughly in [-1, 1].
        full_scale = np.iinfo(pcm.dtype).max
        normalized = pcm.astype(np.float32) / full_scale
        with torch.inference_mode():
            on_gpu = torch.from_numpy(normalized).to("cuda")
            resampled = self.default_resampler(on_gpu).to("cpu")
            return resampled.numpy()
@serve.deployment(
    ray_actor_options={"num_gpus": 0.33, "num_cpus": 2},
    autoscaling_config={"min_replicas": 2, "max_replicas": 3},
)
class Translator:
    """Serve deployment that transcribes base64-encoded audio.

    Each request is first sent to the bound preprocessing deployment; the
    samples it returns are then passed to a Whisper speech-recognition
    pipeline running on the CUDA device.
    """

    def __init__(self, process):
        """Store the preprocessing handle and load the recognition pipeline.

        Args:
            process: Handle to the bound preprocessing deployment. It is
                switched to the new handle API so that ``.remote()`` results
                can be awaited.
        """
        self.process = process.options(use_new_handle_api=True)
        self.pipe = pipeline(
            "automatic-speech-recognition",
            model="openai/whisper-medium",
            chunk_length_s=30,
            device="cuda",
        )

    async def translate(self, request) -> list:
        """Preprocess the audio of one request and transcribe it.

        Args:
            request: Mapping with an ``"audio"`` entry, forwarded unchanged to
                the preprocessing deployment.

        Returns:
            A one-element list of the form ``[{"text": <transcription>}]``.
        """
        samples = await self.process.remote(request["audio"])
        # NOTE(review): the pipeline call below is synchronous (not awaited);
        # presumably it occupies this replica's event loop for the whole
        # inference. Confirm that is acceptable under concurrent load.
        with torch.inference_mode():
            result = self.pipe(samples, generate_kwargs={"language": "russian"})
        return [{"text": result["text"]}]

    async def __call__(self, request):
        """HTTP entry point: parse the JSON body and delegate to ``translate``."""
        payload = await request.json()
        return await self.translate(payload)
# Build the application graph: Translator is constructed with a handle to
# the bound Preprocessor deployment.
app = Translator.bind(Preprocessor.bind())
# The route prefix is given to serve.run because setting it through the
# deployment options is deprecated (Serve emits a DeprecationWarning for it).
serve.run(app, route_prefix="/translate")
def send_request(blob):
    """POST one audio payload to the local /translate endpoint and time it.

    Args:
        blob: Value sent under the "audio" key of the JSON request body.

    Returns:
        Elapsed seconds (float) from just before the request is sent until
        the decoded JSON response has been printed.
    """
    # perf_counter is meant for measuring intervals; unlike time.time() it is
    # not affected by system clock adjustments.
    started = time.perf_counter()
    resp = requests.post("http://127.0.0.1:8000/translate", json={"audio": blob})
    print(resp.json())
    return time.perf_counter() - started
# Read the reference recording and base64-encode it for the JSON body.
# NOTE(review): the file is read whole, so any WAV container header is sent
# together with the samples; confirm the service expects that.
with open(
    "/home/max/projects/models_deployment/tests/resources/long_phrase_etalon.wav", "rb"
) as wav_file:
    blob = base64.b64encode(wav_file.read()).decode("ascii")

# Four sequential requests; each round-trip time is printed.
for _ in range(4):
    print(send_request(blob))

time.sleep(3)
print("TEST")

# Two requests issued in parallel from a thread pool; timings are printed in
# completion order.
with concurrent.futures.ThreadPoolExecutor() as executor:
    results = [executor.submit(send_request, blob) for _ in range(2)]
    print("RESULTS")
    for finished in concurrent.futures.as_completed(results):
        print(finished.result())
output:
2023-11-30 13:00:33,350 WARNING deployment.py:404 -- DeprecationWarning: `route_prefix` in `@serve.deployment` has been deprecated. To specify a route prefix for an application, pass it into `serve.run` instead.
2023-11-30 13:00:34,964 INFO worker.py:1664 -- Started a local Ray instance. View the dashboard at 127.0.0.1:8265
(ProxyActor pid=3197416) INFO 2023-11-30 13:00:36,631 proxy 10.80.0.21 proxy.py:1072 - Proxy actor 7f31fcd30dd1ad253d5c382c01000000 starting on node 6bb61fbb55ab6769c9a44e80afde55c19a5a660a435d26bfe02aa3a9.
(ProxyActor pid=3197416) INFO 2023-11-30 13:00:36,635 proxy 10.80.0.21 proxy.py:1257 - Starting HTTP server on node: 6bb61fbb55ab6769c9a44e80afde55c19a5a660a435d26bfe02aa3a9 listening on port 8000
(ProxyActor pid=3197416) INFO: Started server process [3197416]
(ServeController pid=3197339) INFO 2023-11-30 13:00:36,755 controller 3197339 deployment_state.py:1379 - Deploying new version of deployment Preprocessor in application 'default'.
(ServeController pid=3197339) INFO 2023-11-30 13:00:36,759 controller 3197339 deployment_state.py:1379 - Deploying new version of deployment Translator in application 'default'.
(ServeController pid=3197339) INFO 2023-11-30 13:00:36,862 controller 3197339 deployment_state.py:1668 - Adding 2 replicas to deployment Preprocessor in application 'default'.
(ServeController pid=3197339) INFO 2023-11-30 13:00:36,871 controller 3197339 deployment_state.py:1668 - Adding 2 replicas to deployment Translator in application 'default'.
(ServeReplica:default:Translator pid=3197448) Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
(ServeReplica:default:Preprocessor pid=3197446) INFO 2023-11-30 13:00:45,819 Preprocessor default#Preprocessor#AQQHpC 39ed7197-32fd-4a07-9968-bdb7291cea5f /translate default replica.py:726 - __CALL__ OK 73.2ms
[{'text': ' test'}]
0.9717648029327393
(ServeReplica:default:Translator pid=3197449) INFO 2023-11-30 13:00:46,664 Translator default#Translator#XTFRyI 39ed7197-32fd-4a07-9968-bdb7291cea5f /translate default replica.py:726 - __CALL__ OK 924.5ms
(ServeReplica:default:Preprocessor pid=3197446) INFO 2023-11-30 13:00:46,682 Preprocessor default#Preprocessor#AQQHpC 92b3ae19-ab35-4031-82d9-a3e4c662f743 /translate default replica.py:726 - __CALL__ OK 0.7ms
[{'text': ' test'}]
0.8590915203094482
(ServeReplica:default:Preprocessor pid=3197446) INFO 2023-11-30 13:00:47,537 Preprocessor default#Preprocessor#AQQHpC 4b6625d2-1820-4230-be75-d8c67aa84373 /translate default replica.py:726 - __CALL__ OK 0.7ms
(ServeReplica:default:Translator pid=3197448) INFO 2023-11-30 13:00:47,526 Translator default#Translator#YRwxid 92b3ae19-ab35-4031-82d9-a3e4c662f743 /translate default replica.py:726 - __CALL__ OK 851.7ms
[{'text': ' test'}]
0.7534689903259277
(ServeReplica:default:Translator pid=3197449) INFO 2023-11-30 13:00:48,280 Translator default#Translator#XTFRyI 4b6625d2-1820-4230-be75-d8c67aa84373 /translate default replica.py:726 - __CALL__ OK 747.7ms
(ServeReplica:default:Preprocessor pid=3197447) INFO 2023-11-30 13:00:48,364 Preprocessor default#Preprocessor#SSdZWA b73ebbd5-bee3-426e-9b82-a748fcdb0c1d /translate default replica.py:726 - __CALL__ OK 73.3ms
[{'text': ' test'}]
0.8116359710693359
(ServeReplica:default:Translator pid=3197448) INFO 2023-11-30 13:00:49,092 Translator default#Translator#YRwxid b73ebbd5-bee3-426e-9b82-a748fcdb0c1d /translate default replica.py:726 - __CALL__ OK 805.6ms
TEST
RESULTS
(ServeReplica:default:Preprocessor pid=3197447) INFO 2023-11-30 13:00:52,110 Preprocessor default#Preprocessor#SSdZWA 98dd517a-c5f7-4ff6-a37c-1c61d1f94db9 /translate default replica.py:726 - __CALL__ OK 0.8ms
(ServeReplica:default:Preprocessor pid=3197447) INFO 2023-11-30 13:00:52,111 Preprocessor default#Preprocessor#SSdZWA 461e56eb-6b0b-4799-9af3-30ea17eae4da /translate default replica.py:726 - __CALL__ OK 0.7ms
(ServeReplica:default:Translator pid=3197449) Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
[{'text': ' test'}]
[{'text': ' test'}]
1.2893760204315186
1.2903039455413818
(ServeReplica:default:Translator pid=3197449) INFO 2023-11-30 13:00:53,385 Translator default#Translator#XTFRyI 98dd517a-c5f7-4ff6-a37c-1c61d1f94db9 /translate default replica.py:726 - __CALL__ OK 1280.6ms
(ServeReplica:default:Translator pid=3197448) INFO 2023-11-30 13:00:53,385 Translator default#Translator#YRwxid 461e56eb-6b0b-4799-9af3-30ea17eae4da /translate default replica.py:726 - __CALL__ OK 1280.1ms
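If I remove `Preprocessor`, the service responds under load at the expected speed (about 0.5-0.8 sec for each parallel request). With `Preprocessor` in place, each of the two parallel requests takes about 1.3 sec (see the log above).
`Preprocessor` itself works fast (under 1 ms per call in the log). Can you tell me how to fix it?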