I have a very simple identity service hosted by Ray Serve:
from ray import serve

@serve.deployment(num_replicas=1, ray_actor_options={"num_cpus": 1, "num_gpus": 0})
class IdentityService:
    def __init__(self):
        pass

    @serve.batch(max_batch_size=64, batch_wait_timeout_s=0.01)
    async def handle_batch(self, inputs):
        # @serve.batch collects concurrent calls into a single list of inputs.
        print("Our input array has length:", len(inputs))
        return inputs

    async def __call__(self, request):
        return await self.handle_batch(request)

generator = IdentityService.bind()
handle = serve.run(generator)
Then I emulate as many concurrent requests as possible:
import asyncio

import torch

async def send_request():
    # INPUT_SIZE is defined elsewhere in my script.
    return await handle.remote(torch.randint(low=0, high=3, size=(INPUT_SIZE,)).float())

async def main():
    tasks = []
    for _ in range(10000):
        task = asyncio.create_task(send_request())
        tasks.append(task)
    return await asyncio.gather(*tasks)

# Run inside an already-running event loop (e.g. a notebook).
await main()
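The ~35 s figure below comes from timing the whole run, roughly like this (a minimal sketch; the exact timing wrapper is an assumption):

import time

start = time.perf_counter()
results = await main()
elapsed = time.perf_counter() - start
print(f"Processed {len(results)} requests in {elapsed:.1f} s")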
No matter which configuration options I change (num_replicas, num_cpus, max_concurrent_queries), I always get the same performance: the 10k requests are processed in ~35 s and the reported batch size stays between 5 and 8.
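For example, one variant I tried looks like this (the specific values are illustrative of the sweep, not a single fixed config):

# Illustrative variant; I swept these knobs without seeing any difference.
@serve.deployment(
    num_replicas=4,                 # tried several values
    max_concurrent_queries=256,     # tried several values
    ray_actor_options={"num_cpus": 1, "num_gpus": 0},
)
class IdentityService:
    ...  # same body as above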
Can you please explain how to increase performance?