Hello Jules! Yes, we do use different types of worker nodes, and we need a separate setup per worker node type. Here is one of our example cluster YAML files:
available_node_types:
  # Configurations for the head node.
  head:
    # The node type's CPU and GPU resources are auto-detected based on the AWS
    # instance type. If desired, you can override the autodetected CPU and GPU
    # resources advertised to the autoscaler, and you can also set custom
    # resources. For example, to mark a node type as having 1 CPU, 1 GPU, and
    # 5 units of a resource called "custom", set:
    #   resources: {"CPU": 1, "GPU": 1, "custom": 5}
    resources: {"head_workers": 64}
    # Keys come from https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
    node_config:
      #InstanceType: m6i.8xlarge
      InstanceType: r6id.16xlarge
      #ImageId: ami-0dc2e3e2f9cca7c15
      #ImageId: ami-0aa7d40eeae50c9a9
      ImageId: ami-006dcf34c09e50022
      # First-boot script: installs htop and starts the CloudWatch agent.
      # (The leading blank line before the shebang is preserved from the
      # original value — NOTE(review): confirm cloud-init accepts it.)
      UserData: |

        #!/bin/bash
        yum install -y htop
        yum install -y amazon-cloudwatch-agent
        /opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a start -m ec2 -c default
      IamInstanceProfile:
        Arn: arn:aws:iam::146036223160:instance-profile/ray-autoscaler-v1
      KeyName: gilvikra_146036223160
      SubnetIds:
        - subnet-004c4ce44e904a937
        - subnet-012696da88467548c
        - subnet-094edaca2285f2d5f
        - subnet-0c4cdee1bb4356ffb
        - subnet-0d9b2559c86202f90
        - subnet-0b6aa8113286f3b69
      EbsOptimized: true
      BlockDeviceMappings:
        # Root device is xvda for AL2; Ubuntu cannot mount more than 2TB by
        # default as the root volume, see:
        # https://aws.amazon.com/premiumsupport/knowledge-center/ec2-ubuntu-convert-mbr-to-gpt/
        # https://www.dolthub.com/blog/2022-05-02-use-more-than-2TB-ubuntu-ec2/
        #- DeviceName: /dev/sdb
        - DeviceName: /dev/xvda
          Ebs:
            VolumeSize: 5000
            VolumeType: io2
            # Can go up to 64000 for io2.
            Iops: 15000
# Configurations for the worker nodes.
# Relevant info: https://github.com/aws-neuron/aws-neuron-eks-samples/blob/master/dp_bert_hf_pretrain/cfn/eks_ng_stack.yaml
worker_trn:
# To experiment with autoscaling, set min_workers to 0.
min_workers: 1
#min_workers: 6
max_workers: 16
# The node type's CPU and GPU resources are auto-detected based on AWS instance type.
# If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
# You can also set custom resources.
# For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
# resources: {"CPU": 1, "GPU": 1, "custom": 5}
resources: {"custom_trn_vcpu": 128}
node_config:
InstanceType: trn1.32xlarge
ImageId: ami-005f9685cb30f234b
UserData: "\n#!/bin/bash \n\ntouch /home/ec2-user/TRN1_MC\nprintf '[neuron]\nname=Neuron YUM Repository\nbaseurl=https://yum.repos.neuron.amazonaws.com\nenabled=1\nmetadata_expire=0\n' > /etc/yum.repos.d/neuron.repo\nrpm --import https://yum.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB\n\nyum install kernel-devel-$(uname -r) kernel-headers-$(uname -r) -y\n\nyum update -y\n\nyum install git -y\n\nyum remove aws-neuron-dkms -y\nyum remove aws-neuronx-dkms -y\nyum remove aws-neuronx-oci-hook -y\nyum remove aws-neuronx-runtime-lib -y\nyum remove aws-neuronx-collectives -y\nyum install aws-neuronx-dkms-2.* -y\nyum install aws-neuronx-oci-hook-2.* -y\nyum install aws-neuronx-runtime-lib-2.* -y\nyum install aws-neuronx-collectives-2.* -y\n\ncurl -O https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz\nwget https://efa-installer.amazonaws.com/aws-efa-installer.key && gpg --import aws-efa-installer.key\ncat aws-efa-installer.key | gpg --fingerprint\nwget https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz.sig && gpg --verify ./aws-efa-installer-latest.tar.gz.sig\ntar -xvf aws-efa-installer-latest.tar.gz\ncd aws-efa-installer && bash efa_installer.sh --yes\ncd\nrm -rf aws-efa-installer-latest.tar.gz aws-efa-installer\n\nyum remove aws-neuron-tools -y\nyum remove aws-neuronx-tools -y\nyum install aws-neuronx-tools-2.* -y\n\nexport PATH=/opt/aws/neuron/bin:$PATH\n\npython3 -m pip config set global.extra-index-url 'https://pip.repos.neuron.amazonaws.com'\n\npython3 -m pip install neuronx-cc==2.* torch-neuronx torchvision torchmetrics\n\nyum install -y htop\nyum install -y amazon-cloudwatch-agent\n/opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a start -m ec2 -c default\n\ntouch /home/ec2-user/TRN1_SETUP_DONE\n"
#UserData: "\n#!/bin/bash\n\nyum install -y htop\nyum install -y amazon-cloudwatch-agent\n/opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a start -m ec2 -c default\n"
IamInstanceProfile:
Arn: arn:aws:iam::146036223160:instance-profile/ray-autoscaler-v1
KeyName: gilvikra_146036223160
SubnetIds:
- subnet-004c4ce44e904a937
- subnet-012696da88467548c
- subnet-094edaca2285f2d5f
- subnet-0c4cdee1bb4356ffb
- subnet-0d9b2559c86202f90
- subnet-0b6aa8113286f3b69
EbsOptimized: True
#Placement:
#GroupName: A9VSPhotonUsEast1b
#NetworkInterfaces:
#- AssociatePublicIpAddress: True
# DeleteOnTermination: True
# InterfaceType: efa
# SubnetId: subnet-094edaca2285f2d5f
# Groups: [sg-0b7b434da6b0c24c2]
# DeviceIndex: 0
# NetworkCardIndex: 0
BlockDeviceMappings:
# root device is xvda for al2, ubunto cannot mount more than 2TB by default as root volume, https://aws.amazon.com/premiumsupport/knowledge-center/ec2-ubuntu-convert-mbr-to-gpt/, https://www.dolthub.com/blog/2022-05-02-use-more-than-2TB-ubuntu-ec2/
#- DeviceName: /dev/sdb
- DeviceName: /dev/xvda
Ebs:
VolumeSize: 1000
VolumeType: io2
# can go up to 64000
Iops: 15000
# Configurations for the worker nodes.
worker_gpu:
# To experiment with autoscaling, set min_workers to 0.
min_workers: 0
#min_workers: 6
max_workers: 16
# The node type's CPU and GPU resources are auto-detected based on AWS instance type.
# If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
# You can also set custom resources.
# For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
# resources: {"CPU": 1, "GPU": 1, "custom": 5}
resources: {"custom_gpu_vcpu": 198}
node_config:
#InstanceType: p3.16xlarge
InstanceType: g5.48xlarge
#ImageId: ami-0c86e4eaf4fdd2e76
#ImageId: ami-0dc2e3e2f9cca7c15
ImageId: ami-01e65c3550dee3f5b
UserData: "\n#!/bin/bash\nyum install -y htop\nyum install -y amazon-cloudwatch-agent\n/opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a start -m ec2 -c default\n"
IamInstanceProfile:
Arn: arn:aws:iam::146036223160:instance-profile/ray-autoscaler-v1
KeyName: gilvikra_146036223160
SubnetIds:
- subnet-004c4ce44e904a937
- subnet-012696da88467548c
- subnet-094edaca2285f2d5f
- subnet-0c4cdee1bb4356ffb
- subnet-0d9b2559c86202f90
- subnet-0b6aa8113286f3b69
EbsOptimized: True
BlockDeviceMappings:
# root device is xvda for al2, ubunto cannot mount more than 2TB by default as root volume, https://aws.amazon.com/premiumsupport/knowledge-center/ec2-ubuntu-convert-mbr-to-gpt/, https://www.dolthub.com/blog/2022-05-02-use-more-than-2TB-ubuntu-ec2/
#- DeviceName: /dev/sdb
- DeviceName: /dev/xvda
Ebs:
VolumeSize: 1000
VolumeType: io2
# can go up to 64000
Iops: 15000
worker_cpu:
# To experiment with autoscaling, set min_workers to 0.
min_workers: 0
#min_workers: 6
max_workers: 20
# The node type's CPU and GPU resources are auto-detected based on AWS instance type.
# If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
# You can also set custom resources.
# For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
# resources: {"CPU": 1, "GPU": 1, "custom": 5}
resources: {"custom_cpu_vcpu": 32}
node_config:
#InstanceType: p3.16xlarge
#InstanceType: r6id.16xlarge
#InstanceType: i4i.8xlarge
#InstanceType: i3.8xlarge
#InstanceType: r5d.8xlarge
InstanceType: i3.8xlarge
#IimageId: latest_dlami
#ImageId: ami-0aa7d40eeae50c9a9
#ImageId: ami-0c86e4eaf4fdd2e76
#ImageId: ami-0dc2e3e2f9cca7c15
ImageId: ami-006dcf34c09e50022
UserData: "\n#!/bin/bash\nyum install -y htop\nyum install -y amazon-cloudwatch-agent\n/opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a start -m ec2 -c default\n"
IamInstanceProfile:
Arn: arn:aws:iam::146036223160:instance-profile/ray-autoscaler-v1
KeyName: gilvikra_146036223160
SubnetIds:
- subnet-004c4ce44e904a937
- subnet-012696da88467548c
- subnet-094edaca2285f2d5f
- subnet-0c4cdee1bb4356ffb
- subnet-0d9b2559c86202f90
- subnet-0b6aa8113286f3b69
EbsOptimized: True
BlockDeviceMappings:
# root device is xvda for al2, ubunto cannot mount more than 2TB by default as root volume, https://aws.amazon.com/premiumsupport/knowledge-center/ec2-ubuntu-convert-mbr-to-gpt/, https://www.dolthub.com/blog/2022-05-02-use-more-than-2TB-ubuntu-ec2/
#- DeviceName: /dev/sdb
- DeviceName: /dev/xvda
Ebs:
VolumeSize: 1000
VolumeType: io2
# can go up to 64000
Iops: 30000
# Which entry of available_node_types to launch as the cluster head node.
head_node_type: head