# A unique identifier for the head node and workers of this cluster.
cluster_name: primary-cluster
# Worker-count bounds; both are 3 here.
# NOTE(review): the same two values are repeated for ray.worker.default
# further down; confirm which level the autoscaler actually honors.
min_workers: 3
max_workers: 3
# NOTE(review): presumably controls how fast the autoscaler adds nodes;
# confirm the exact semantics against the Ray documentation.
upscaling_speed: 1.0
# Container settings: image, container name and extra "docker run" options.
# Every key below must be indented under `docker:`; at column 0 they would be
# read as unrelated top-level keys and `docker:` itself would be null.
docker:
  # You can change this to latest-cpu if you don't need GPU support and want
  # a faster startup.
  image: "rayproject/ray-ml:latest-gpu"
  # image: "rayproject/ray:latest-gpu"
  container_name: "ray_container"
  pull_before_run: true
  # Extra options to pass into "docker run".
  run_options:
    - --ulimit nofile=65536:65536
# If a node is idle for this many minutes, it will be removed.
# NOTE(review): min_workers equals max_workers in this file, so presumably no
# worker is ever removed for idleness; confirm.
idle_timeout_minutes: 10
# Cloud provider settings. The keys are nested under `provider:` so they are
# read as provider options rather than as separate top-level keys.
provider:
  type: aws
  region: us-east-1
  # Two zones, written as one comma-separated string.
  availability_zone: us-east-1a,us-east-1b
  cache_stopped_nodes: true
# Login settings for the nodes; `ssh_user` belongs under `auth:`.
auth:
  ssh_user: ubuntu
# Node types the cluster can launch. Each entry maps a type name to its
# resources and to the EC2 launch parameters in `node_config`. The nesting
# matters: without it the two entries collapse into duplicate top-level keys.
available_node_types:
  ray.head.default:
    resources: {}
    node_config:
      InstanceType: r5.xlarge
      SubnetIds:
        - subnet-0xxxxx
      IamInstanceProfile:
        Arn: arn:aws:iam::xxxxxxxx:instance-profile/ray-cluster
      ImageId: ami-0dd6adfad4ad37eec
      BlockDeviceMappings:
        - DeviceName: /dev/sda1
          Ebs:
            VolumeSize: 300
  ray.worker.default:
    min_workers: 3
    max_workers: 3
    resources: {}
    node_config:
      InstanceType: r5.xlarge
      SubnetIds:
        - subnet-0xxxxxx
      # NOTE(review): launching an instance with an instance profile requires
      # the launching identity to be allowed iam:PassRole on the profile's
      # role; verify the role used by the head node grants it.
      IamInstanceProfile:
        Arn: arn:aws:iam::xxxxxxx:instance-profile/ray-cluster
      ImageId: ami-0dd6adfad4ad37eec
      # Workers are requested as spot instances.
      InstanceMarketOptions:
        MarketType: spot
# Names the available_node_types entry that is used for the head node.
head_node_type: ray.head.default

# Remote-path to local-path pairs; none are configured. Example entries:
#   "/path1/on/remote/machine": "/path1/on/local/machine",
#   "/path2/on/remote/machine": "/path2/on/local/machine",
file_mounts: {}

cluster_synced_files: []
file_mounts_sync_continuously: false
# File patterns skipped by both rsync up and rsync down.
rsync_exclude:
  - '**/.git'
  - '**/.git/**'
rsync_filter:
  - '.gitignore'
# Command lists; only setup_commands is populated, the others are empty.
initialization_commands: []
setup_commands:
  # System packages first, then the Python dependencies.
  - sudo apt update
  - sudo apt install -y python3-pip python-is-python3
  - pip install -U "ray[default]"
  - pip install boto3==1.4.8
  - pip install torch s3fs dask polars pymongo pymongoarrow simplejson opensearch-py
head_setup_commands: []
worker_setup_commands: []
# Commands that (re)start Ray on the head node: stop any running instance,
# then start as head on port 6379.
head_start_ray_commands:
  - ray stop
  # Folded scalar: the lines below join with single spaces into one command.
  - >-
    ray start --head --port=6379 --object-manager-port=8076
    --autoscaling-config=~/ray_bootstrap_config.yaml
    --dashboard-host=0.0.0.0
# Commands that (re)start Ray on a worker and join it to the head at
# $RAY_HEAD_IP:6379.
worker_start_ray_commands:
  - ray stop
  - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
Policy IAM Role: ray-cluster:
{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Action": "ec2:*",
      "Effect": "Allow",
      "Resource": "*"
    },
    {
      "Effect": "Allow",
      "Action": "elasticloadbalancing:*",
      "Resource": "*"
    },
    {
      "Effect": "Allow",
      "Action": "cloudwatch:*",
      "Resource": "*"
    },
    {
      "Effect": "Allow",
      "Action": "autoscaling:*",
      "Resource": "*"
    },
    {
      "Effect": "Allow",
      "Action": "iam:PassRole",
      "Resource": "arn:aws:iam::xxxxxxx:role/ray-cluster"
    },
    {
      "Effect": "Allow",
      "Action": "iam:CreateServiceLinkedRole",
      "Resource": "*",
      "Condition": {
        "StringEquals": {
          "iam:AWSServiceName": [
            "autoscaling.amazonaws.com",
            "ec2scheduled.amazonaws.com",
            "elasticloadbalancing.amazonaws.com",
            "spot.amazonaws.com",
            "spotfleet.amazonaws.com",
            "transitgateway.amazonaws.com"
          ]
        }
      }
    }
  ]
}
source: AUTOSCALER Failed to launch 3 node(s) of type ray.worker.default. (UnauthorizedOperation): You are not authorized to perform this operation. Encoded authorization failure message