Hi, and thanks for developing this awesome library!
I’d like to ask for some guidance on how to debug this problem:
I currently can’t get a Tune job to autoscale workers on GCP Kubernetes. The submitted job runs only on the head node.
When I submit a basic remote job (not a Tune job), the autoscaler works fine and starts the worker nodes.
I am monitoring the autoscaler and it doesn’t display any errors.
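For reference, I am watching the autoscaler output roughly like this (the head pod name placeholder below is just whatever kubectl get pods shows in my namespace):

ray monitor example-full.yaml
# or directly inside the head pod:
kubectl -n my-namespace exec -it <head-pod-name> -- bash -c 'tail -f /tmp/ray/session_latest/logs/monitor*'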
I use the rayproject/ray-ml:nightly image and submit the jobs with ray submit example-full.yaml dummy.py.
My local Ray version is 2.0.0.dev0, installed from the nightly wheel with:
pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-macosx_10_13_intel.whl
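In case it matters, the cluster itself is brought up beforehand in the standard way, i.e. something like:

ray up example-full.yaml
kubectl -n my-namespace get pods  # to check that the head pod is up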
The dummy remote job that autoscales ok is:
import ray

ray.init(address='auto')


@ray.remote
def f(start):
    """ Dummy load """
    primes = []
    for candidate in range(2, start + 1):
        prime = True
        for i in range(2, candidate):
            if candidate % i == 0:
                prime = False
                break
        if prime:
            primes.append(candidate)
    return primes


futures = [f.remote(100000) for i in range(50)]
print(len(ray.get(futures)))
The Tune job that doesn’t autoscale is:
import ray
from ray import tune
from ray.tune.schedulers import AsyncHyperBandScheduler
from ray.tune.integration.kubernetes import NamespacedKubernetesSyncer


def objective(config):
    """ Dummy load """
    for step in range(config["steps"]):
        start = 100000
        primes = []
        for candidate in range(2, start + 1):
            prime = True
            for i in range(2, candidate):
                if candidate % i == 0:
                    prime = False
                    break
            if prime:
                primes.append(candidate)
        tune.report(loss=1.0)


if __name__ == "__main__":
    steps = 5
    num_samples = 10
    scheduler = AsyncHyperBandScheduler(grace_period=steps - 2, max_t=steps)
    sync_config = tune.SyncConfig(
        sync_to_driver=NamespacedKubernetesSyncer("my-namespace")
    )
    ray.init(address="auto")
    analysis = tune.run(
        objective,
        scheduler=scheduler,
        metric="loss",
        mode="min",
        num_samples=num_samples,
        config={
            "steps": steps
        },
        resources_per_trial={"cpu": 1},
        sync_config=sync_config
    )
    print("Best hyperparameters found were: ", analysis.best_config)
My config is from here; I only changed the namespace and switched the images to rayproject/ray-ml:nightly:
# A unique identifier for the head node and workers of this cluster.
cluster_name: example-cluster

# The maximum number of workers nodes to launch in addition to the head
# node.
max_workers: 2

# The autoscaler will scale up the cluster faster with higher upscaling speed.
# E.g., if the task requires adding more nodes then autoscaler will gradually
# scale up the cluster in chunks of upscaling_speed*currently_running_nodes.
# This number should be > 0.
upscaling_speed: 1.0

# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5

# Kubernetes resources that need to be configured for the autoscaler to be
# able to manage the Ray cluster. If any of the provided resources don't
# exist, the autoscaler will attempt to create them. If this fails, you may
# not have the required permissions and will have to request them to be
# created by your cluster administrator.
provider:
  type: kubernetes

  # Exposing external IP addresses for ray pods isn't currently supported.
  use_internal_ips: true

  # Namespace to use for all resources created.
  namespace: my-namespace

  # ServiceAccount created by the autoscaler for the head node pod that it
  # runs in. If this field isn't provided, the head pod config below must
  # contain a user-created service account with the proper permissions.
  autoscaler_service_account:
    apiVersion: v1
    kind: ServiceAccount
    metadata:
      name: autoscaler

  # Role created by the autoscaler for the head node pod that it runs in.
  # If this field isn't provided, the role referenced in
  # autoscaler_role_binding must exist and have at least these permissions.
  autoscaler_role:
    kind: Role
    apiVersion: rbac.authorization.k8s.io/v1
    metadata:
      name: autoscaler
    rules:
      - apiGroups: [""]
        resources: ["pods", "pods/status", "pods/exec"]
        verbs: ["get", "watch", "list", "create", "delete", "patch"]

  # RoleBinding created by the autoscaler for the head node pod that it runs
  # in. If this field isn't provided, the head pod config below must contain
  # a user-created service account with the proper permissions.
  autoscaler_role_binding:
    apiVersion: rbac.authorization.k8s.io/v1
    kind: RoleBinding
    metadata:
      name: autoscaler
    subjects:
      - kind: ServiceAccount
        name: autoscaler
    roleRef:
      kind: Role
      name: autoscaler
      apiGroup: rbac.authorization.k8s.io

  services:
    # Service that maps to the head node of the Ray cluster.
    - apiVersion: v1
      kind: Service
      metadata:
        # NOTE: If you're running multiple Ray clusters with services
        # on one Kubernetes cluster, they must have unique service
        # names.
        name: example-cluster-ray-head
      spec:
        # This selector must match the head node pod's selector below.
        selector:
          component: example-cluster-ray-head
        ports:
          - name: client
            protocol: TCP
            port: 10001
            targetPort: 10001
          - name: dashboard
            protocol: TCP
            port: 8265
            targetPort: 8265

# Specify the pod type for the ray head node (as configured below).
head_node_type: head_node

# Specify the allowed pod types for this ray cluster and the resources they provide.
available_node_types:
  worker_node:
    # Minimum number of Ray workers of this Pod type.
    min_workers: 0
    # Maximum number of Ray workers of this Pod type. Takes precedence over min_workers.
    max_workers: 2
    # User-specified custom resources for use by Ray. Object with string keys and integer values.
    # (Ray detects CPU and GPU from pod spec resource requests and limits, so no need to fill those here.)
    resources: {"foo": 1, "bar": 2}
    node_config:
      apiVersion: v1
      kind: Pod
      metadata:
        # Automatically generates a name for the pod with this prefix.
        generateName: example-cluster-ray-worker-
      spec:
        restartPolicy: Never
        volumes:
          - name: dshm
            emptyDir:
              medium: Memory
        containers:
          - name: ray-node
            imagePullPolicy: Always
            image: rayproject/ray-ml:nightly
            command: ["/bin/bash", "-c", "--"]
            args: ["trap : TERM INT; sleep infinity & wait;"]
            # This volume allocates shared memory for Ray to use for its plasma
            # object store. If you do not provide this, Ray will fall back to
            # /tmp which cause slowdowns if is not a shared memory volume.
            volumeMounts:
              - mountPath: /dev/shm
                name: dshm
            resources:
              requests:
                cpu: 1000m
                memory: 512Mi
              limits:
                # The maximum memory that this pod is allowed to use. The
                # limit will be detected by ray and split to use 10% for
                # redis, 30% for the shared memory object store, and the
                # rest for application memory. If this limit is not set and
                # the object store size is not set manually, ray will
                # allocate a very large object store in each pod that may
                # cause problems for other pods.
                memory: 512Mi
  head_node:
    node_config:
      apiVersion: v1
      kind: Pod
      metadata:
        # Automatically generates a name for the pod with this prefix.
        generateName: example-cluster-ray-head-
        # Must match the head node service selector above if a head node
        # service is required.
        labels:
          component: example-cluster-ray-head
      spec:
        # Change this if you altered the autoscaler_service_account above
        # or want to provide your own.
        serviceAccountName: autoscaler

        restartPolicy: Never

        # This volume allocates shared memory for Ray to use for its plasma
        # object store. If you do not provide this, Ray will fall back to
        # /tmp which cause slowdowns if is not a shared memory volume.
        volumes:
          - name: dshm
            emptyDir:
              medium: Memory
        containers:
          - name: ray-node
            imagePullPolicy: Always
            image: rayproject/ray-ml:nightly
            # Do not change this command - it keeps the pod alive until it is
            # explicitly killed.
            command: ["/bin/bash", "-c", "--"]
            args: ['trap : TERM INT; sleep infinity & wait;']
            ports:
              - containerPort: 6379  # Redis port
              - containerPort: 10001  # Used by Ray Client
              - containerPort: 8265  # Used by Ray Dashboard
            # This volume allocates shared memory for Ray to use for its plasma
            # object store. If you do not provide this, Ray will fall back to
            # /tmp which cause slowdowns if is not a shared memory volume.
            volumeMounts:
              - mountPath: /dev/shm
                name: dshm
            resources:
              requests:
                cpu: 1000m
                memory: 512Mi
              limits:
                # The maximum memory that this pod is allowed to use. The
                # limit will be detected by ray and split to use 10% for
                # redis, 30% for the shared memory object store, and the
                # rest for application memory. If this limit is not set and
                # the object store size is not set manually, ray will
                # allocate a very large object store in each pod that may
                # cause problems for other pods.
                memory: 512Mi

# Command to start ray on the head node. You don't need to change this.
# Note dashboard-host is set to 0.0.0.0 so that kubernetes can port forward.
head_start_ray_commands:
  - ray stop
  - ulimit -n 65536; ray start --head --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host 0.0.0.0

# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
  - ray stop
  - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379