This is the YAML output of the RayCluster resource:
# RayCluster custom resource (cluster.ray.io/v1): identity and metadata.
apiVersion: cluster.ray.io/v1
kind: RayCluster
metadata:
  annotations:
    # JSON snapshot holding a "spec" and a "status" object. It must stay on a
    # single line inside the literal block: a line break inside one of its
    # string values would make the JSON invalid.
    # NOTE(review): presumably written by the kopf framework as its record of
    # the last configuration it handled — confirm.
    kopf.zalando.org/last-handled-configuration: |
      {"spec":{"headPodType":"head-node","headServicePorts":[{"name":"client","port":10001,"targetPort":10001},{"name":"dashboard","port":8265,"targetPort":8265},{"name":"ray-serve","port":8000,"targetPort":8000}],"headStartRayCommands":["ray stop","ulimit -n 65536; ray start --head --no-monitor --dashboard-host 0.0.0.0 &> /tmp/raylogs"],"idleTimeoutMinutes":5,"maxWorkers":2,"podTypes":[{"name":"head-node","podConfig":{"apiVersion":"v1","kind":"Pod","metadata":{"generateName":"head-"},"spec":{"containers":[{"args":["trap : TERM INT; touch /tmp/raylogs; tail -f /tmp/raylogs; sleep infinity & wait;"],"command":["/bin/bash","-c","--"],"image":"ci-artifacts.c3.ai/c3:custom-ray","imagePullPolicy":"Always","name":"ray-node","ports":[{"containerPort":6379,"protocol":"TCP"},{"containerPort":10001,"protocol":"TCP"},{"containerPort":8265,"protocol":"TCP"},{"containerPort":8000,"protocol":"TCP"}],"resources":{"limits":{"cpu":"1200m","memory":"4Gi","nvidia.com/gpu":"0"},"requests":{"cpu":"1200m","ephemeral-storage":"1Gi","memory":"4Gi","nvidia.com/gpu":"0"}},"volumeMounts":[{"mountPath":"/dev/shm","name":"dshm"},{"mountPath":"/usr/local/share","name":"environment-shared"}]}],"restartPolicy":"Always","volumes":[{"emptyDir":{"medium":"Memory"},"name":"dshm"},{"name":"environment-shared","persistentVolumeClaim":{"claimName":"test"}}]}},"rayResources":{"CPU":0}},{"maxWorkers":2,"minWorkers":2,"name":"worker-node","podConfig":{"apiVersion":"v1","kind":"Pod","metadata":{"generateName":"worker-"},"spec":{"containers":[{"args":["trap : TERM INT; touch /tmp/raylogs; tail -f /tmp/raylogs; sleep infinity & wait;"],"command":["/bin/bash","-c","--"],"image":"ci-artifacts.c3.ai/c3:custom-ray","imagePullPolicy":"Always","name":"ray-node","resources":{"limits":{"cpu":"1200m","memory":"2Gi","nvidia.com/gpu":"0"},"requests":{"cpu":"1200m","ephemeral-storage":"1Gi","memory":"2Gi","nvidia.com/gpu":"0"}},"volumeMounts":[{"mountPath":"/dev/shm","name":"dshm"},{"mountPath":"/usr/local/share","name":"environment-shared"}]}],"restartPolicy":"Always","volumes":[{"emptyDir":{"medium":"Memory"},"name":"dshm"},{"name":"environment-shared","persistentVolumeClaim":{"claimName":"test"}}]}}}],"upscalingSpeed":1,"workerStartRayCommands":["ray stop","ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 &> /tmp/raylogs"]},"status":{"autoscalerRetries":0}}
  creationTimestamp: "2022-02-11T23:37:45Z"
  finalizers:
  - kopf.zalando.org/KopfFinalizerMarker
  generation: 1
  name: blue-k8sray-cloud
  namespace: khk
  resourceVersion: "79671714"
  uid: e6e5226f-1ab5-4de6-ab5b-233f1dd37568
# RayCluster spec: head service ports, Ray start commands, scaling settings,
# and one pod template per entry in podTypes.
spec:
  # Names the podTypes entry used for the head pod.
  headPodType: head-node
  headServicePorts:
  - name: client
    port: 10001
    targetPort: 10001
  - name: dashboard
    port: 8265
    targetPort: 8265
  - name: ray-serve
    port: 8000
    targetPort: 8000
  headStartRayCommands:
  - ray stop
  - ulimit -n 65536; ray start --head --no-monitor --dashboard-host 0.0.0.0 &> /tmp/raylogs
  idleTimeoutMinutes: 5
  maxWorkers: 2
  podTypes:
  - name: head-node
    podConfig:
      apiVersion: v1
      kind: Pod
      metadata:
        generateName: head-
      spec:
        containers:
        - args:
          # Single shell string handed to `/bin/bash -c --`; it tails
          # /tmp/raylogs, the file the Ray start commands redirect into.
          - 'trap : TERM INT; touch /tmp/raylogs; tail -f /tmp/raylogs; sleep infinity & wait;'
          command:
          - /bin/bash
          - -c
          - --
          image: ci-artifacts.c3.ai/c3:custom-ray
          imagePullPolicy: Always
          name: ray-node
          ports:
          - containerPort: 6379
            protocol: TCP
          - containerPort: 10001
            protocol: TCP
          - containerPort: 8265
            protocol: TCP
          - containerPort: 8000
            protocol: TCP
          resources:
            limits:
              cpu: 1200m
              memory: 4Gi
              nvidia.com/gpu: "0"
            requests:
              cpu: 1200m
              ephemeral-storage: 1Gi
              memory: 4Gi
              nvidia.com/gpu: "0"
          volumeMounts:
          - mountPath: /dev/shm
            name: dshm
          - mountPath: /usr/local/share
            name: environment-shared
        restartPolicy: Always
        volumes:
        - emptyDir:
            medium: Memory
          name: dshm
        - name: environment-shared
          persistentVolumeClaim:
            # NOTE(review): the worker-node pod type and the
            # last-handled-configuration annotation both use claimName
            # "test" — confirm which claim name is intended here.
            claimName: ctptest
    rayResources:
      CPU: 0
  - maxWorkers: 2
    minWorkers: 2
    name: worker-node
    podConfig:
      apiVersion: v1
      kind: Pod
      metadata:
        generateName: worker-
      spec:
        containers:
        - args:
          - 'trap : TERM INT; touch /tmp/raylogs; tail -f /tmp/raylogs; sleep infinity & wait;'
          command:
          - /bin/bash
          - -c
          - --
          image: ci-artifacts.c3.ai/c3:custom-ray
          imagePullPolicy: Always
          name: ray-node
          resources:
            limits:
              cpu: 1200m
              memory: 2Gi
              nvidia.com/gpu: "0"
            requests:
              cpu: 1200m
              ephemeral-storage: 1Gi
              memory: 2Gi
              nvidia.com/gpu: "0"
          volumeMounts:
          - mountPath: /dev/shm
            name: dshm
          # NOTE(review): the head-node pod type mounts this same volume at
          # /usr/local/share — confirm the differing path is intentional.
          - mountPath: /usr/local/share/c3/server
            name: environment-shared
        restartPolicy: Always
        volumes:
        - emptyDir:
            medium: Memory
          name: dshm
        - name: environment-shared
          persistentVolumeClaim:
            claimName: test
  upscalingSpeed: 1
  workerStartRayCommands:
  - ray stop
  - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 &> /tmp/raylogs
# RayCluster status as reported at the time of the dump.
status:
  autoscalerRetries: 1
  kopf:
    progress: {}
  phase: AutoscalingExceptionRecovery
This is the YAML output for the head pod:
# Head pod: identity, labels and owner reference.
apiVersion: v1
kind: Pod
metadata:
  creationTimestamp: "2022-02-11T23:37:45Z"
  generateName: blue-k8sray-cloud-head-
  labels:
    cluster.ray.io/component: blue-k8sray-cloud-ray-head
    ray-cluster-name: blue-k8sray-cloud
    ray-file-mounts-contents: da39a3ee5e6b4b0d3255bfef95601890afd80709
    ray-launch-config: 4f78c0890df358fb3fed7aa1a7c10e51a68c9dd1
    ray-node-name: ray-blue-k8sray-cloud-head
    ray-node-status: up-to-date
    ray-node-type: head
    ray-node-uuid: 93d8c86a-3382-45ca-9f88-98533af4f9d5
    ray-runtime-config: 6828672a6e8cc7a7ae88f76f458d2a8124f6784f
    ray-user-node-type: head-node
  name: blue-k8sray-cloud-head-kwqb9
  namespace: khk
  # The owner is the RayCluster named blue-k8sray-cloud (kind, name and uid
  # are given below).
  ownerReferences:
  - apiVersion: cluster.ray.io/v1
    blockOwnerDeletion: true
    controller: true
    kind: RayCluster
    name: blue-k8sray-cloud
    uid: e6e5226f-1ab5-4de6-ab5b-233f1dd37568
  resourceVersion: "78839290"
  uid: 737e8d15-5e4b-4f74-8eb5-fb1ddbcfeebb
# Head pod spec as returned by the API server.
spec:
  containers:
  - args:
    # Single shell string handed to `/bin/bash -c --`.
    - 'trap : TERM INT; touch /tmp/raylogs; tail -f /tmp/raylogs; sleep infinity & wait;'
    command:
    - /bin/bash
    - -c
    - --
    image: ci-artifacts.c3.ai/c3:custom-ray
    imagePullPolicy: Always
    name: ray-node
    ports:
    - containerPort: 6379
      protocol: TCP
    - containerPort: 10001
      protocol: TCP
    - containerPort: 8265
      protocol: TCP
    - containerPort: 8000
      protocol: TCP
    resources:
      limits:
        cpu: 1200m
        memory: 4Gi
        nvidia.com/gpu: "0"
      requests:
        cpu: 1200m
        ephemeral-storage: 1Gi
        memory: 4Gi
        nvidia.com/gpu: "0"
    terminationMessagePath: /dev/termination-log
    terminationMessagePolicy: File
    volumeMounts:
    - mountPath: /dev/shm
      name: dshm
    # NOTE(review): the head-node podConfig in the RayCluster dump mounts
    # this volume at /usr/local/share — confirm which path is the real one.
    - mountPath: /usr/local/share/c3/server
      name: environment-shared
    - mountPath: /var/run/secrets/kubernetes.io/serviceaccount
      name: kube-api-access-ltflp
      readOnly: true
  dnsPolicy: ClusterFirst
  enableServiceLinks: true
  nodeName: aks-agentpool-67207258-vmss000008
  preemptionPolicy: PreemptLowerPriority
  priority: 0
  restartPolicy: Always
  schedulerName: default-scheduler
  securityContext: {}
  serviceAccount: default
  serviceAccountName: default
  terminationGracePeriodSeconds: 30
  tolerations:
  - effect: NoExecute
    key: node.kubernetes.io/not-ready
    operator: Exists
    tolerationSeconds: 300
  - effect: NoExecute
    key: node.kubernetes.io/unreachable
    operator: Exists
    tolerationSeconds: 300
  - effect: NoSchedule
    key: node.kubernetes.io/memory-pressure
    operator: Exists
  - effect: NoSchedule
    key: nvidia.com/gpu
    operator: Exists
  volumes:
  - emptyDir:
      medium: Memory
    name: dshm
  - name: environment-shared
    persistentVolumeClaim:
      claimName: test
  # Projected volume combining a service-account token, the kube-root-ca.crt
  # ConfigMap entry and the pod namespace (downward API).
  - name: kube-api-access-ltflp
    projected:
      defaultMode: 420
      sources:
      - serviceAccountToken:
          expirationSeconds: 3607
          path: token
      - configMap:
          items:
          - key: ca.crt
            path: ca.crt
          name: kube-root-ca.crt
      - downwardAPI:
          items:
          - fieldRef:
              apiVersion: v1
              fieldPath: metadata.namespace
            path: namespace
# Head pod status: all four conditions are "True" and the single container
# is running with zero restarts.
status:
  conditions:
  - lastProbeTime: null
    lastTransitionTime: "2022-02-11T23:37:45Z"
    status: "True"
    type: Initialized
  - lastProbeTime: null
    lastTransitionTime: "2022-02-11T23:37:47Z"
    status: "True"
    type: Ready
  - lastProbeTime: null
    lastTransitionTime: "2022-02-11T23:37:47Z"
    status: "True"
    type: ContainersReady
  - lastProbeTime: null
    lastTransitionTime: "2022-02-11T23:37:45Z"
    status: "True"
    type: PodScheduled
  containerStatuses:
  - containerID: containerd://e468980660c376910e92169894857baca2a520a963ccd6fb04a8b71176293fc7
    image: ci-artifacts.c3.ai/c3:custom-ray
    # Digest elided in the original paste; quoted so the placeholder stays a
    # plain string.
    imageID: 'ci-artifacts.c3.ai/c3@sha256:...'
    lastState: {}
    name: ray-node
    ready: true
    restartCount: 0
    started: true
    state:
      running:
        startedAt: "2022-02-11T23:37:47Z"
  # IP addresses were redacted in the original paste; the placeholders are
  # quoted so they read unambiguously as strings.
  hostIP: '<hidden>'
  phase: Running
  podIP: '<hidden>'
  podIPs:
  - ip: '<hidden>'
  qosClass: Guaranteed
  startTime: "2022-02-11T23:37:45Z"