When trying to run a Tune job, I'm getting the ZeroDivisionError below from the autoscaler.
Ray version: 1.2.0
Python version: 3.8
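For context, the job is an ordinary Tune run of roughly this shape (a simplified sketch, not my exact script; the real trainable and search space are elided):

import ray
from ray import tune

ray.init(address="auto")  # connect to the running cluster

# Placeholder objective; the real trainable trains a model.
def trainable(config):
    tune.report(score=config["x"] ** 2)

tune.run(
    trainable,
    config={"x": tune.uniform(0, 1)},
    num_samples=20,
    resources_per_trial={"cpu": 1},  # each trial asks the scheduler for 1 CPU
)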
Stacktrace:
2021-02-25 16:57:18,262 ERROR autoscaler.py:139 -- StandardAutoscaler: Error during autoscaling.
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/site-packages/ray/autoscaler/_private/autoscaler.py", line 137, in update
    self._update()
  File "/usr/local/lib/python3.8/site-packages/ray/autoscaler/_private/autoscaler.py", line 238, in _update
    to_launch = self.resource_demand_scheduler.get_nodes_to_launch(
  File "/usr/local/lib/python3.8/site-packages/ray/autoscaler/_private/resource_demand_scheduler.py", line 212, in get_nodes_to_launch
    nodes_to_add_based_on_demand = get_nodes_for(
  File "/usr/local/lib/python3.8/site-packages/ray/autoscaler/_private/resource_demand_scheduler.py", line 631, in get_nodes_for
    score = _utilization_score(node_resources, resources)
  File "/usr/local/lib/python3.8/site-packages/ray/autoscaler/_private/resource_demand_scheduler.py", line 680, in _utilization_score
    util = (v - remaining[k]) / v
ZeroDivisionError: division by zero
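For reference, the failing line scores how well a node type fits a resource demand by dividing by each resource's capacity v, so the scheduler is apparently considering a node type that advertises some resource with capacity 0. A minimal sketch of the failing arithmetic (my paraphrase of _utilization_score, with hypothetical numbers, not Ray's exact code):

# Hypothetical inputs: a node type whose CPU capacity comes out as 0.
node_resources = {"CPU": 0.0, "memory": 2 * 1024 ** 3}  # what the node type offers
remaining = {"CPU": 0.0, "memory": 1 * 1024 ** 3}       # left over after fitting the demand

for k, v in node_resources.items():
    util = (v - remaining[k]) / v  # ZeroDivisionError as soon as v == 0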
cluster.yaml
cluster_name: my-cluster
min_workers: 0
max_workers: 20
upscaling_speed: 1.0
idle_timeout_minutes: 5
provider:
  type: kubernetes
  use_internal_ips: true
  namespace: default
  autoscaler_service_account:
    apiVersion: v1
    kind: ServiceAccount
    metadata:
      name: autoscaler
  autoscaler_role:
    kind: Role
    apiVersion: rbac.authorization.k8s.io/v1
    metadata:
      name: autoscaler
    rules:
      - apiGroups: [""]
        resources: ["pods", "pods/status", "pods/exec"]
        verbs: ["get", "watch", "list", "create", "delete", "patch"]
  autoscaler_role_binding:
    apiVersion: rbac.authorization.k8s.io/v1
    kind: RoleBinding
    metadata:
      name: autoscaler
    subjects:
      - kind: ServiceAccount
        name: autoscaler
    roleRef:
      kind: Role
      name: autoscaler
      apiGroup: rbac.authorization.k8s.io
  services:
    - apiVersion: v1
      kind: Service
      metadata:
        name: ray-head
      spec:
        selector:
          component: ray-head
        ports:
          - protocol: TCP
            port: 8000
            targetPort: 8000
    - apiVersion: v1
      kind: Service
      metadata:
        name: ray-workers
      spec:
        selector:
          component: ray-worker
        ports:
          - protocol: TCP
            port: 8000
            targetPort: 8000
head_node:
  apiVersion: v1
  kind: Pod
  metadata:
    generateName: ray-head-
    labels:
      component: ray-head
    annotations:
      iam.amazonaws.com/role: my-role
  spec:
    serviceAccountName: autoscaler
    restartPolicy: Never
    volumes:
      - name: dshm
        emptyDir:
          medium: Memory
    affinity:
      nodeAffinity:
        requiredDuringSchedulingIgnoredDuringExecution:
          nodeSelectorTerms:
            - matchExpressions:
                - key: eks.amazonaws.com/capacityType
                  operator: In
                  values:
                    - ON_DEMAND
                - key: kubernetes.io/arch
                  operator: In
                  values:
                    - amd64
    containers:
      - name: ray-node
        imagePullPolicy: Always
        image: my-image
        command: ["/bin/bash", "-c", "--"]
        args: ["trap : TERM INT; sleep infinity & wait;"]
        ports:
          - containerPort: 6379   # Redis port.
          - containerPort: 6380   # Redis port.
          - containerPort: 6381   # Redis port.
          - containerPort: 12345  # Ray internal communication.
          - containerPort: 12346  # Ray internal communication.
        volumeMounts:
          - name: dshm
            mountPath: /dev/shm
        resources:
          requests:
            cpu: 1000m
            memory: 2Gi
          limits:
            memory: 6Gi
        env:
          - name: MY_CPU_REQUEST
            valueFrom:
              resourceFieldRef:
                resource: requests.cpu
worker_nodes:
  apiVersion: v1
  kind: Pod
  metadata:
    generateName: ray-worker-
    labels:
      component: ray-worker
    annotations:
      iam.amazonaws.com/role: my-role
  spec:
    serviceAccountName: default
    restartPolicy: Never
    volumes:
      - name: dshm
        emptyDir:
          medium: Memory
    affinity:
      nodeAffinity:
        requiredDuringSchedulingIgnoredDuringExecution:
          nodeSelectorTerms:
            - matchExpressions:
                - key: eks.amazonaws.com/capacityType
                  operator: In
                  values:
                    - SPOT
                - key: kubernetes.io/arch
                  operator: In
                  values:
                    - amd64
    containers:
      - name: ray-node
        imagePullPolicy: Always
        image: my-image
        command: ["/bin/bash", "-c", "--"]
        args: ["trap : TERM INT; sleep infinity & wait;"]
        ports:
          - containerPort: 12345  # Ray internal communication.
          - containerPort: 12346  # Ray internal communication.
        volumeMounts:
          - name: dshm
            mountPath: /dev/shm
        resources:
          requests:
            cpu: 4000m
            memory: 2Gi
          limits:
            memory: 6Gi
        env:
          - name: MY_CPU_REQUEST
            valueFrom:
              resourceFieldRef:
                resource: requests.cpu
file_mounts: {}
cluster_synced_files: []
file_mounts_sync_continuously: False
initialization_commands: []
setup_commands: []
head_setup_commands: []
worker_setup_commands: []
head_start_ray_commands:
  - ray stop
  - ulimit -n 65536; ray start --head --num-cpus=$MY_CPU_REQUEST --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host 0.0.0.0
worker_start_ray_commands:
  - ray stop
  - ulimit -n 65536; ray start --num-cpus=$MY_CPU_REQUEST --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
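To narrow down which resource comes out as zero, this is what I'd run from the head pod (a diagnostic sketch; ray.cluster_resources() and ray.nodes() are the standard APIs for inspecting what resources Ray believes each node has):

import ray

ray.init(address="auto")

# Aggregate totals the cluster currently advertises; a key with value 0.0
# here would line up with the division by zero in _utilization_score.
print(ray.cluster_resources())

# Per-node breakdown, to find which pod reports the zero-valued resource.
for node in ray.nodes():
    print(node["NodeManagerAddress"], node["Resources"])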