Hi, I'm new to Ray and I need help with an issue that appeared after I upgraded the KubeRay operator (deployed with Helm) from version 1.0.0 to 1.1.0.
When I deploy a Ray cluster from a YAML manifest that uses a custom image, the head pod is created but the worker pods are not.
If I then run a Python script that submits tasks to the 3 workers, nothing executes: the script just hangs indefinitely, presumably because it is waiting for resources that the missing workers would provide.
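For reference, here is a minimal sketch of the kind of script I'm running (the address, task body, and resource request are illustrative, not my exact code):

```python
import ray

# Connect through the head service's Ray client port (10001), which the
# Service below exposes as NodePort 32001; the address is illustrative.
ray.init(address="ray://192.168.0.30:32001")

@ray.remote(num_gpus=1)  # each task should land on one of the 3 GPU workers
def work(x):
    return x * x  # placeholder for my real per-worker GPU work

# One task per worker; with no workers joining the cluster, ray.get() blocks forever.
print(ray.get([work.remote(i) for i in range(3)]))
```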
Attached are the YAML manifest I use to deploy on Kubernetes and the Dockerfile for the custom image.
This is my manifest.yaml:
```yaml
# Source: ray-cluster/templates/raycluster-cluster.yaml
apiVersion: ray.io/v1
kind: RayCluster
metadata:
  labels:
    helm.sh/chart: ray-cluster-1.1.0
    app.kubernetes.io/instance: raycluster
    app.kubernetes.io/managed-by: Helm
  name: raycluster-kuberay
  namespace: trading-app
spec:
  rayVersion: "2.10.0"
  enableInTreeAutoscaling: true
  autoscalerOptions:
    upscalingMode: Default
    idleTimeoutSeconds: 10
    resources:
      limits:
        cpu: "5"
        memory: 5Gi
      requests:
        cpu: "5"
        memory: 5Gi
  headGroupSpec:
    serviceType: NodePort
    rayStartParams:
      dashboard-host: "0.0.0.0"
      num-cpus: "0"
      num-gpus: "0"
    template:
      spec:
        imagePullSecrets: []
        containers:
          - volumeMounts:
              - mountPath: /tmp/ray
                name: log-volume
            name: ray-head
            image: 192.168.0.30:5000/ray
            imagePullPolicy: IfNotPresent
            resources:
              limits:
                cpu: "1"
                memory: 5Gi
              requests:
                cpu: "1"
                memory: 5Gi
                #nvidia.com/gpu: 1 # requesting 1 GPU
            securityContext: {}
            env: []
            ports:
              - containerPort: 6379
                name: gcs
              - containerPort: 8265
                name: dashboard
              - containerPort: 10001
                name: client
        volumes:
          - emptyDir: {}
            name: log-volume
        affinity: {}
        tolerations: []
        nodeSelector:
          kubernetes.io/hostname: "master"
      metadata:
        annotations: {}
        labels:
          helm.sh/chart: ray-cluster-1.1.0
          app.kubernetes.io/instance: raycluster
          app.kubernetes.io/managed-by: Helm
  workerGroupSpecs:
    - rayStartParams: {}
      replicas: 1
      minReplicas: 1
      maxReplicas: 1
      groupName: gpu6.1-worker-group
      template:
        spec:
          restartPolicy: OnFailure
          imagePullSecrets: []
          containers:
            - volumeMounts:
                - mountPath: /tmp/ray
                  name: log-volume
              name: ray-cpu-worker
              image: 192.168.0.30:5000/ray
              imagePullPolicy: IfNotPresent
              resources:
                limits:
                  cpu: "10"
                  memory: 28Gi
                  nvidia.com/gpu: 1
                requests:
                  cpu: "10"
                  memory: 28Gi
                  nvidia.com/gpu: 1
              securityContext: {}
              env: []
              ports: null
              lifecycle:
                preStop:
                  exec:
                    command: ["/bin/sh", "-c", "ray stop"]
          volumes:
            - emptyDir: {}
              name: log-volume
          affinity: {}
          tolerations: []
          nodeSelector:
            nvidia.com/gpu.product: "NVIDIA-GeForce-GTX-1080-Ti"
        metadata:
          annotations: {}
          labels:
            helm.sh/chart: ray-cluster-1.1.0
            app.kubernetes.io/instance: raycluster
            app.kubernetes.io/managed-by: Helm
    - rayStartParams: {}
      replicas: 1
      minReplicas: 1
      maxReplicas: 1
      groupName: gpui8.9-worker-group
      template:
        spec:
          restartPolicy: OnFailure
          imagePullSecrets: []
          containers:
            - volumeMounts:
                - mountPath: /tmp/ray
                  name: log-volume
              name: ray-gpu-worker
              image: 192.168.0.30:5000/ray
              imagePullPolicy: IfNotPresent
              resources:
                limits:
                  cpu: "18"
                  memory: 165Gi
                  nvidia.com/gpu: 1
                requests:
                  cpu: "18"
                  memory: 165Gi
                  nvidia.com/gpu: 1
              securityContext: {}
              env: []
              ports: null
              lifecycle:
                preStop:
                  exec:
                    command: ["/bin/sh", "-c", "ray stop"]
          volumes:
            - emptyDir: {}
              name: log-volume
          affinity: {}
          tolerations: []
          nodeSelector:
            nvidia.com/gpu.product: "NVIDIA-GeForce-RTX-4090"
        metadata:
          annotations: {}
          labels:
            helm.sh/chart: ray-cluster-1.1.0
            app.kubernetes.io/instance: raycluster
            app.kubernetes.io/managed-by: Helm
    - rayStartParams: {}
      replicas: 1
      minReplicas: 1
      maxReplicas: 1
      groupName: gpu8.6-worker-group
      template:
        spec:
          restartPolicy: OnFailure
          imagePullSecrets: []
          containers:
            - volumeMounts:
                - mountPath: /tmp/ray
                  name: log-volume
              name: ray-gpu-worker
              image: 192.168.0.30:5000/ray
              imagePullPolicy: IfNotPresent
              resources:
                limits:
                  cpu: "10"
                  memory: 28Gi
                  nvidia.com/gpu: 1
                requests:
                  cpu: "10"
                  memory: 28Gi
                  nvidia.com/gpu: 1
              securityContext: {}
              env: []
              ports: null
              lifecycle:
                preStop:
                  exec:
                    command: ["/bin/sh", "-c", "ray stop"]
          volumes:
            - emptyDir: {}
              name: log-volume
          affinity: {}
          tolerations: []
          nodeSelector:
            nvidia.com/gpu.product: "NVIDIA-GeForce-RTX-3080-Ti"
        metadata:
          annotations: {}
          labels:
            helm.sh/chart: ray-cluster-1.1.0
            app.kubernetes.io/instance: raycluster
            app.kubernetes.io/managed-by: Helm
---
apiVersion: v1
kind: Service
metadata:
  name: kuberay-head-svc
  namespace: trading-app
spec:
  type: NodePort
  ports:
    - name: client
      port: 10001
      targetPort: 10001
      nodePort: 32001
      protocol: TCP
    - name: dashboard
      port: 8265
      targetPort: 8265
      nodePort: 32002
      protocol: TCP
    - name: gcs
      port: 6379
      targetPort: 6379
      #nodePort: 32003
      protocol: TCP
    - name: serve
      port: 8000
      targetPort: 8000
      #nodePort: 32004
      protocol: TCP
  selector:
    app.kubernetes.io/name: kuberay
```
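When I apply this manifest, only the head pod ever appears. These are the commands I use to check the state (standard kubectl; the operator deployment name and namespace may differ in your install):

```bash
# List the cluster's pods -- only the head pod is ever created:
kubectl -n trading-app get pods -l ray.io/cluster=raycluster-kuberay

# RayCluster status and events as reported by the operator:
kubectl -n trading-app describe raycluster raycluster-kuberay

# Operator logs, in case they say why the worker groups are not reconciled:
kubectl logs deploy/kuberay-operator | tail -n 50
```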
And this is the Dockerfile for the custom image:
```dockerfile
FROM rayproject/ray:latest-py311

ARG NB_USER="ray"
ARG NB_UID="1000"
ARG NB_GID="100"

# Fix: https://github.com/hadolint/hadolint/wiki/DL4006
# Fix: https://github.com/koalaman/shellcheck/wiki/SC3014
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

USER root

# Install all OS dependencies for the Server that starts
# but lacks all features (e.g., download as all possible file formats)
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update --yes && \
    # - `apt-get upgrade` is run to patch known vulnerabilities in apt-get packages as
    #   the Ubuntu base image is rebuilt too seldom sometimes (less than once a month)
    apt-get upgrade --yes && \
    apt-get install --yes --no-install-recommends \
    # - bzip2 is necessary to extract the micromamba executable.
    bzip2 \
    ca-certificates \
    locales \
    sudo \
    # - tini is installed as a helpful container entrypoint that reaps zombie
    #   processes and such of the actual executable we want to start, see
    #   https://github.com/krallin/tini#why-tini for details.
    tini \
    wget && \
    apt-get clean && rm -rf /var/lib/apt/lists/* && \
    echo "en_US.UTF-8 UTF-8" > /etc/locale.gen && \
    locale-gen

#CONDA_DIR = /opt/conda \
# Configure environment
ENV CONDA_DIR="/home/${NB_USER}/anaconda3" \
    SHELL=/bin/bash \
    NB_USER="${NB_USER}" \
    NB_UID=${NB_UID} \
    NB_GID=${NB_GID} \
    LC_ALL=en_US.UTF-8 \
    LANG=en_US.UTF-8 \
    LANGUAGE=en_US.UTF-8
ENV PATH="${CONDA_DIR}/bin:${PATH}" \
    HOME="/home/${NB_USER}"
#ENV PATH="${CONDA_DIR}/bin:${PATH}" \

# Copy a script that we will use to correct permissions after running certain commands
COPY fix-permissions /usr/local/bin/fix-permissions
RUN chmod a+rx /usr/local/bin/fix-permissions

# Enable prompt color in the skeleton .bashrc before creating the default NB_USER
# hadolint ignore=SC2016
RUN sed -i 's/^#force_color_prompt=yes/force_color_prompt=yes/' /etc/skel/.bashrc && \
    # More information in: https://github.com/jupyter/docker-stacks/pull/2047
    # and docs: https://docs.conda.io/projects/conda/en/latest/dev-guide/deep-dives/activation.html
    echo 'eval "$(conda shell.bash hook)"' >> /etc/skel/.bashrc

# Create NB_USER with name jovyan user with UID=1000 and in the 'users' group
# and make sure these dirs are writable by the `users` group.
RUN echo "auth requisite pam_deny.so" >> /etc/pam.d/su && \
    sed -i.bak -e 's/^%admin/#%admin/' /etc/sudoers && \
    sed -i.bak -e 's/^%sudo/#%sudo/' /etc/sudoers && \
    mkdir -p "${CONDA_DIR}" && \
    chown "${NB_USER}:${NB_GID}" "${CONDA_DIR}" && \
    chmod g+w /etc/passwd && \
    fix-permissions "${CONDA_DIR}" && \
    fix-permissions "/home/${NB_USER}"

USER ${NB_UID}

# Pin the Python version here, or set it to "default"
ARG PYTHON_VERSION=3.11.6

# Setup work directory for backward-compatibility
RUN mkdir "/home/${NB_USER}/work" && \
    fix-permissions "/home/${NB_USER}"

# Download and install Micromamba, and initialize the Conda prefix.
#   <https://github.com/mamba-org/mamba#micromamba>
#   Similar projects using Micromamba:
#     - Micromamba-Docker: <https://github.com/mamba-org/micromamba-docker>
#     - repo2docker: <https://github.com/jupyterhub/repo2docker>
# Install Python, Mamba, and jupyter_core
# Cleanup temporary files and remove Micromamba
# Correct permissions
# Do all this in a single RUN command to avoid duplicating all of the
# files across image layers when the permissions change
COPY --chown="${NB_UID}:${NB_GID}" initial-condarc "${CONDA_DIR}/.condarc"
WORKDIR /tmp
RUN set -x && \
    arch=$(uname -m) && \
    if [ "${arch}" = "x86_64" ]; then \
        # Should be simpler, see <https://github.com/mamba-org/mamba/issues/1437>
        arch="64"; \
    fi && \
    wget --progress=dot:giga -O /tmp/micromamba.tar.bz2 \
        "https://micromamba.snakepit.net/api/micromamba/linux-${arch}/latest" && \
    tar -xvjf /tmp/micromamba.tar.bz2 --strip-components=1 bin/micromamba && \
    rm /tmp/micromamba.tar.bz2 && \
    PYTHON_SPECIFIER="python=${PYTHON_VERSION}" && \
    if [[ "${PYTHON_VERSION}" == "default" ]]; then PYTHON_SPECIFIER="python"; fi && \
    # Install the packages
    ./micromamba install \
        --root-prefix="${CONDA_DIR}" \
        --prefix="${CONDA_DIR}" \
        --yes \
        "${PYTHON_SPECIFIER}" \
        'mamba' && \
    rm micromamba && \
    # Pin major.minor version of python
    mamba list python | grep '^python ' | tr -s ' ' | cut -d ' ' -f 1,2 >> "${CONDA_DIR}/conda-meta/pinned" && \
    mamba clean --all -f -y && \
    fix-permissions "${CONDA_DIR}" && \
    fix-permissions "/home/${NB_USER}"

# Configure container startup
ENTRYPOINT ["tini", "-g", "--"]
CMD ["start.sh"]

# Copy local files as late as possible to avoid cache busting
COPY run-hooks.sh start.sh /usr/local/bin/

USER root

# Create dirs for startup hooks
RUN mkdir /usr/local/bin/start-notebook.d && \
    mkdir /usr/local/bin/before-notebook.d
COPY 10activate-conda-env.sh /usr/local/bin/before-notebook.d/

RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        gcc python-dev-is-python3 zlib1g-dev libgl1-mesa-dev libgtk2.0-dev \
    && apt-get autoremove -yqq --purge \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

ARG DEBIAN_FRONTEND=noninteractive
RUN apt-get update && \
    apt-get -y install sudo dialog apt-utils
RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections

RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.1-1_all.deb \
    && sudo dpkg -i cuda-keyring_1.1-1_all.deb \
    && sudo apt-get update \
    && sudo apt-get -y install cuda-toolkit-12-4

USER ${NB_UID}

# Name your environment and choose the Python version
ARG env_name=python3116
ARG py_ver=3.11.6

# You can add additional libraries here
RUN mamba create --yes -p "${CONDA_DIR}/envs/${env_name}" \
        python=${py_ver} \
        "ipykernel" \
        "jupyterlab" && \
    mamba install --yes -p "${CONDA_DIR}/envs/${env_name}" "cuda" -c nvidia && \
    mamba install --yes -p "${CONDA_DIR}/envs/${env_name}" "numba" && \
    mamba install --yes -p "${CONDA_DIR}/envs/${env_name}" "cupy" && \
    mamba clean --all -f -y
#conda update -n base -c conda-forge conda && \
RUN conda install --yes -p "${CONDA_DIR}/envs/${env_name}" --channel=conda-forge "nb_conda_kernels"

# Create Python kernel and link it to jupyter
RUN "${CONDA_DIR}/envs/${env_name}/bin/python" -m ipykernel install --user --name="${env_name}" && \
    fix-permissions "${CONDA_DIR}" && \
    fix-permissions "/home/${NB_USER}"

################################## TA-Lib #############################################
RUN sudo apt-get update
RUN sudo apt install -y make
COPY --chown=${NB_UID}:${NB_GID} ta-lib-0.4.0-src.tar.gz /tmp/
RUN tar -xzf /tmp/ta-lib-0.4.0-src.tar.gz -C ${CONDA_DIR}/envs/${env_name} && \
    cd ${CONDA_DIR}/envs/${env_name}/ta-lib/ && \
    ./configure --prefix="${CONDA_DIR}/envs/${env_name}" && \
    make && \
    make install
#RUN export TA_LIBRARY_PATH="${CONDA_DIR}/envs/${env_name}/lib" && \
#    export TA_INCLUDE_PATH="${CONDA_DIR}/envs/${env_name}/include"
RUN rm -R ${CONDA_DIR}/envs/${env_name}/ta-lib /tmp/ta-lib-0.4.0-src.tar.gz
#########################################################################################

# Any additional `pip` installs can be added by using the following line
# Using `mamba` is highly recommended though
RUN "${CONDA_DIR}/envs/${env_name}/bin/python" -m pip install --upgrade pip
RUN export CPATH=$CPATH:/usr/local/cuda/include \
    && export LIBRARY_PATH=$LIBRARY_PATH:/usr/local/cuda/lib64 \
    && export PATH=/usr/local/cuda/bin:$PATH \
    && sudo apt-get install python-dev -y \
    && export CUDA_ROOT=/usr/local/cuda \
    && "${CONDA_DIR}/envs/${env_name}/bin/pip" install --no-cache-dir -U pycuda
COPY --chown=${NB_UID}:${NB_GID} requirements.txt /tmp/
RUN "${CONDA_DIR}/envs/${env_name}/bin/pip" install --no-cache-dir -U \
        "ray[default]" \
        'flake8' \
        "apache-airflow==2.8.1" --requirement /tmp/requirements.txt

# This changes the custom Python kernel so that the custom environment will
# be activated for the respective Jupyter Notebook and Jupyter Console
# hadolint ignore=DL3059
#RUN /opt/setup-scripts/activate_notebook_custom_env.py "${env_name}"
# Comment the line above and uncomment the section below instead to activate the custom environment by default
# Note: uncommenting this section makes "${env_name}" default both for Jupyter Notebook and Terminals
# More information here: https://github.com/jupyter/docker-stacks/pull/2047
USER root
RUN \
    # This changes a startup hook, which will activate the custom environment for the process
    echo conda activate "${env_name}" >> /usr/local/bin/before-notebook.d/10activate-conda-env.sh && \
    # This makes the custom environment default in Jupyter Terminals for all users which might be created later
    echo conda activate "${env_name}" >> /etc/skel/.bashrc && \
    # This makes the custom environment default in Jupyter Terminals for already existing NB_USER
    echo conda activate "${env_name}" >> "/home/${NB_USER}/.bashrc"
USER ${NB_UID}
```
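For completeness, this is roughly how I build and push the image to the local registry the manifest pulls from (the exact invocation is reconstructed from memory; only the tag is certain):

```bash
# Build the custom image and push it to the insecure local registry
# referenced by the manifest's image field (192.168.0.30:5000/ray).
docker build -t 192.168.0.30:5000/ray .
docker push 192.168.0.30:5000/ray
```

Any pointers on what changed between operator 1.0.0 and 1.1.0 that could prevent the worker pods from being created would be much appreciated.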