Hi,
I’m learning Ray and trying to follow the “Serve a Large Language Model with vLLM” example from the Ray 2.34.0 docs, but I can’t get it to work.
My setup is just Docker on a DLAMI (AWS Deep Learning AMI, Ubuntu 22.04).
aws-config.yaml
cluster_name: llm-cluster
min_workers: 1
max_workers: 2
upscaling_speed: 1.0
docker:
container_name: "ray"
head_image: "123456789.dkr.ecr.us-east-1.amazonaws.com/rbl-worker:ray-nightly-py311-cpu"
worker_image: "123456789.dkr.ecr.us-east-1.amazonaws.com/rbl-worker:ray-latest-py311-gpu"
pull_before_run: True
run_options:
- --ulimit nofile=65536:65536
worker_run_options:
- --runtime=nvidia
idle_timeout_minutes: 10
provider:
type: aws
region: us-east-1
availability_zone: us-east-1a
cache_stopped_nodes: True
auth:
ssh_user: ubuntu
available_node_types:
ray.head.default:
resources: { "CPU": 0 }
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
node_config:
# m7i.xlarge 16.0 GiB 4 vCPUs 0 None 0 GiB EBS only Up to 12.5 Gigabit $0.2016 hourly
InstanceType: m7i.xlarge
IamInstanceProfile:
Arn: arn:aws:iam::123456789:instance-profile/ray-cluster-head-node-instance-profile-f66eafd
BlockDeviceMappings:
- DeviceName: /dev/sda1
Ebs:
VolumeSize: 160
VolumeType: gp3
SubnetId: subnet-123456789
ray.worker.default:
max_workers: 1
resources: { }
node_config:
# LLaMA 3 8B requires around 16GB of disk space and 20GB of VRAM (GPU memory) in FP16.
# g6.xlarge 16.0 GiB 4 vCPUs 1 NVIDIA L4 24 GiB 250 GB NVMe SSD Up to 10 Gigabit $0.8048 hourly
InstanceType: g6.xlarge
ImageId: ami-053912f3a44543f8c
SecurityGroupIds:
- sg-123456789
IamInstanceProfile:
Arn: arn:aws:iam::123456789:instance-profile/ray-cluster-worker-node-instance-profile-c1e39f4
BlockDeviceMappings:
- DeviceName: /dev/sda1
Ebs:
VolumeSize: 160
VolumeType: gp3
SubnetId: subnet-123456789
# Specify the node type of the head node (as configured above).
head_node_type: ray.head.default
file_mounts: {
"/rbl/app": "./app",
}
cluster_synced_files: [ ]
file_mounts_sync_continuously: False
rsync_exclude:
- "**/.git"
- "**/.git/**"
rsync_filter:
- ".gitignore"
initialization_commands:
- aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 123456789.dkr.ecr.us-east-1.amazonaws.com
setup_commands: [ ]
head_setup_commands: [ ]
worker_setup_commands: [ ]
head_start_ray_commands:
- ray stop
- ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host=0.0.0.0
worker_start_ray_commands:
- ray stop
- ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
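Once the cluster is up (commands are at the end of this post), I sanity-check that both nodes registered their resources; these are just my own checks, not part of the example:
ray attach aws-config.yaml   # SSH into the Ray container on the head node
ray status                   # the worker's CPU/GPU resources should show up here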
app/requirements.txt
pyarrow
ray[serve]
requests
vllm
urllib3<2
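To confirm both images end up with compatible Ray/vLLM versions, I check inside the running container (the container name `ray` comes from the docker section of the cluster config):
docker exec -it ray python -c "import ray, vllm; print(ray.__version__, vllm.__version__)"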
app/llm.py
from typing import Dict, Optional, List
import logging
from fastapi import FastAPI
from starlette.requests import Request
from starlette.responses import StreamingResponse, JSONResponse
import ray
from ray import serve
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.entrypoints.openai.cli_args import make_arg_parser
from vllm.entrypoints.openai.protocol import (
ChatCompletionRequest,
ChatCompletionResponse,
ErrorResponse,
)
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_engine import LoRAModulePath
from vllm.utils import FlexibleArgumentParser
logger = logging.getLogger("ray.serve")
app = FastAPI()
@serve.deployment(
autoscaling_config={
"min_replicas": 1,
"max_replicas": 10,
"target_ongoing_requests": 5,
},
max_ongoing_requests=10,
)
@serve.ingress(app)
class VLLMDeployment:
def __init__(
self,
engine_args: AsyncEngineArgs,
response_role: str,
lora_modules: Optional[List[LoRAModulePath]] = None,
chat_template: Optional[str] = None,
):
logger.info(f"Starting with engine args: {engine_args}")
self.openai_serving_chat = None
self.engine_args = engine_args
self.response_role = response_role
self.lora_modules = lora_modules
self.chat_template = chat_template
self.engine = AsyncLLMEngine.from_engine_args(engine_args)
@app.post("/v1/chat/completions")
async def create_chat_completion(
self, request: ChatCompletionRequest, raw_request: Request
):
"""OpenAI-compatible HTTP endpoint.
API reference:
- https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html
"""
if not self.openai_serving_chat:
model_config = await self.engine.get_model_config()
# Determine the name of the served model for the OpenAI client.
if self.engine_args.served_model_name is not None:
served_model_names = self.engine_args.served_model_name
else:
served_model_names = [self.engine_args.model]
self.openai_serving_chat = OpenAIServingChat(
self.engine,
model_config,
served_model_names,
self.response_role,
self.lora_modules,
self.chat_template,
)
logger.info(f"Request: {request}")
generator = await self.openai_serving_chat.create_chat_completion(
request, raw_request
)
if isinstance(generator, ErrorResponse):
return JSONResponse(
content=generator.model_dump(), status_code=generator.code
)
if request.stream:
return StreamingResponse(content=generator, media_type="text/event-stream")
else:
assert isinstance(generator, ChatCompletionResponse)
return JSONResponse(content=generator.model_dump())
def parse_vllm_args(cli_args: Dict[str, str]):
"""Parses vLLM args based on CLI inputs.
Currently uses argparse because vLLM doesn't expose Python models for all of the
config options we want to support.
"""
arg_parser = FlexibleArgumentParser(
description="vLLM OpenAI-Compatible RESTful API server."
)
parser = make_arg_parser(arg_parser)
arg_strings = []
for key, value in cli_args.items():
arg_strings.extend([f"--{key}", str(value)])
logger.info(arg_strings)
parsed_args = parser.parse_args(args=arg_strings)
return parsed_args
def build_app(cli_args: Dict[str, str]) -> serve.Application:
"""Builds the Serve app based on CLI arguments.
See https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#command-line-arguments-for-the-server
for the complete set of arguments.
Supported engine arguments: https://docs.vllm.ai/en/latest/models/engine_args.html.
""" # noqa: E501
parsed_args = parse_vllm_args(cli_args)
engine_args = AsyncEngineArgs.from_cli_args(parsed_args)
engine_args.worker_use_ray = True
tp = engine_args.tensor_parallel_size
logger.info(f"Tensor parallelism = {tp}")
pg_resources = []
pg_resources.append({"CPU": 1}) # for the deployment replica
for i in range(tp):
pg_resources.append({"CPU": 1, "GPU": 1}) # for the vLLM actors
# We use the "STRICT_PACK" strategy below to ensure all vLLM actors are placed on
# the same Ray node.
return VLLMDeployment.options(
placement_group_bundles=pg_resources, placement_group_strategy="STRICT_PACK"
).bind(
engine_args,
parsed_args.response_role,
parsed_args.lora_modules,
parsed_args.chat_template,
)
entrypoint = build_app({
"model": "NousResearch/Meta-Llama-3-8B-Instruct",
"tensor_parallel_size": 1
})
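Before going to the cluster, I smoke-test the app on a single GPU machine with the Serve CLI (my own test, not from the docs example):
serve run llm:entrypoint
# from another shell, `serve status` should eventually report the application as RUNNING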
deployment-config.yaml
# This file was generated using the `serve build` command on Ray v2.34.0.
proxy_location: EveryNode
http_options:
host: 0.0.0.0
port: 8000
grpc_options:
port: 9000
grpc_servicer_functions: []
logging_config:
encoding: TEXT
log_level: INFO
logs_dir: null
enable_access_log: true
applications:
- name: app1
route_prefix: /
import_path: llm:entrypoint
runtime_env: {}
deployments:
- name: VLLMDeployment
max_ongoing_requests: 10
autoscaling_config:
min_replicas: 1
initial_replicas: null
max_replicas: 10
target_ongoing_requests: 5.0
metrics_interval_s: 10.0
look_back_period_s: 30.0
smoothing_factor: 1.0
upscale_smoothing_factor: null
downscale_smoothing_factor: null
upscaling_factor: null
downscaling_factor: null
downscale_delay_s: 600.0
upscale_delay_s: 30.0
placement_group_bundles:
- CPU: 1.0
- CPU: 1.0
GPU: 1.0
placement_group_strategy: STRICT_PACK
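The config above was generated from the app with `serve build`, roughly like this (the output filename is my choice):
serve build llm:entrypoint -o deployment-config.yaml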
head.Dockerfile
FROM rayproject/ray:nightly-py311-cpu
WORKDIR /rbl/app
COPY app/requirements.txt requirements.txt
RUN pip install -r requirements.txt
COPY app/llm.py llm.py
worker.Dockerfile
FROM rayproject/ray:latest-py311-gpu
WORKDIR /rbl/app
COPY app/requirements.txt requirements.txt
RUN pip install -r requirements.txt
COPY app/llm.py llm.py
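Both images are built and pushed to ECR before running `ray up`, roughly like this (after the same `docker login` shown in initialization_commands; the repo and tags match the cluster config above):
docker build -f head.Dockerfile -t 123456789.dkr.ecr.us-east-1.amazonaws.com/rbl-worker:ray-nightly-py311-cpu .
docker build -f worker.Dockerfile -t 123456789.dkr.ecr.us-east-1.amazonaws.com/rbl-worker:ray-latest-py311-gpu .
docker push 123456789.dkr.ecr.us-east-1.amazonaws.com/rbl-worker:ray-nightly-py311-cpu
docker push 123456789.dkr.ecr.us-east-1.amazonaws.com/rbl-worker:ray-latest-py311-gpu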
Create the cluster in AWS:
ray up -y aws-config.yaml
Start the dashboard:
ray dashboard aws-config.yaml
Deploy the LLM example:
serve deploy deployment-config.yaml
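Once the application is RUNNING, I plan to query the OpenAI-compatible endpoint like this (through the dashboard port-forward or the head node's address; the model name matches the one configured in llm.py):
curl http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "user", "content": "Hello!"}]}'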
However, while the worker node starts, pulls the Docker image, and creates the container, the deployment never becomes healthy; its status is stuck at “Updating”:
Deployment 'VLLMDeployment' in application 'app1' has 1 replicas that have taken more than 30s to be scheduled. This may be due to waiting for the cluster to auto-scale or for a runtime environment to be installed. Resources required for each replica: [{"CPU": 1.0}, {"CPU": 1.0, "GPU": 1.0}], total resources available: {}. Use `ray status` for more details.
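For reference, these are the commands I use to inspect the scheduling state:
ray exec aws-config.yaml "ray status"
serve status   # I assume this reaches the cluster through the `ray dashboard` port-forward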
Any thoughts, or pointers to working examples?