How severely does this issue affect your experience of using Ray?
- High: It blocks me from completing my task.
I have the following docker-compose.yml configuration:
# Compose stack: a PostgreSQL database plus a two-node Ray cluster
# (one CPU head node, one GPU worker), all attached to `mynetwork`.
version: '3.9'

services:
  # PostgreSQL 15.1, published on the host at 5432.
  postgres:
    image: postgres:15.1
    ports:
      - '5432:5432'
    environment:
      - POSTGRES_USER=postgres
      - POSTGRES_DB=stylqr
      - POSTGRES_PASSWORD=12345
    networks:
      - mynetwork

  # Ray head node: GCS listens on 6023, dashboard on 8265.
  # NOTE(review): only 6023 and 8265 are published to the host. A driver
  # running outside this compose network normally connects through Ray Client
  # (ray://<host>:10001 by default), and that port is not published here.
  # Confirm whether that is the cause of the connection failure.
  ray-head:
    image: ray_head
    build:
      context: ./
      dockerfile: tools/ray_head/Dockerfile
    # Folded scalar: the lines below are joined with single spaces into one
    # command string. `--block` keeps `ray start` in the foreground.
    command: >-
      ray start --head
      --node-ip-address=0.0.0.0
      --disable-usage-stats
      --port=6023
      --include-dashboard=True
      --dashboard-host=0.0.0.0
      --dashboard-port=8265
      --block
    deploy:
      resources:
        limits:
          cpus: '2'
          memory: 1500MB
        reservations:
          cpus: '2'
          memory: 1G
    shm_size: '16gb'
    ports:
      - '6023:6023'
      - '8265:8265'
    networks:
      - mynetwork

  # Ray worker node: joins the head via its compose service name and
  # reserves one NVIDIA GPU.
  ray-worker-1:
    image: ray_worker
    build:
      context: ./
      dockerfile: tools/ray_worker/Dockerfile
    command: >-
      ray start -v
      --log-style=auto
      --address=ray-head:6023
      --block
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    shm_size: '16gb'
    depends_on:
      - ray-head
    networks:
      - mynetwork

networks:
  # Empty mapping: the network is created with default settings.
  mynetwork: {}
With the following Dockerfile:
# Image definition: Ubuntu 22.04 with Python 3.11 (deadsnakes PPA) and Ray 2.5.1.
FROM ubuntu:22.04
WORKDIR /app

# Locale, Python runtime flags, and non-interactive apt.
ENV LANG=C.UTF-8
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1
ENV DEBIAN_FRONTEND=noninteractive
ENV HOME=/root

# Base tooling. The apt lists are removed in the same layer to keep it small.
# NOTE(review): --allow-unauthenticated disables package signature checks;
# confirm it is really needed.
RUN apt-get update && \
    apt-get install -y --no-install-recommends --allow-unauthenticated \
        nano \
        software-properties-common \
        gnupg-agent \
        ca-certificates \
        apt-transport-https \
        curl \
        git \
        wget \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Python 3.11 from the deadsnakes PPA, plus pip.
RUN apt-get update && \
    add-apt-repository ppa:deadsnakes/ppa && \
    apt-get install -y --no-install-recommends \
        python3.11 \
        pip \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Make `python`, `python3` and `pip` resolve to the 3.11 toolchain.
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1 && \
    update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1 && \
    update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1

# Pinned to the 1.x line; presumably for Ray 2.5.1 compatibility (confirm).
RUN pip install pydantic==1.10.10
# gpustat 1.1 crashes, so stay on 1.0.0.
RUN pip install gpustat==1.0.0
RUN pip install "ray[default]==2.5.1"
Now when I start the following script that lives on my local machine, outside docker:
import ray

# Attach to an existing Ray cluster; 'auto' leaves address discovery to Ray.
ray.init(address='auto')

# Query the node list first, then the aggregated resources (same order as the
# original single-line print).
node_count = len(ray.nodes())
cpu_total = ray.cluster_resources()["CPU"]
print(f'This cluster consists of {node_count} nodes in total {cpu_total} CPU resources in total.')
I get the following:
❯ python script.py
2023-07-10 14:12:12,446 INFO worker.py:1452 -- Connecting to existing Ray cluster at address: 0.0.0.0:6023...
[2023-07-10 14:12:12,449 I 99920 99920] global_state_accessor.cc:356: This node has an IP address of 192.168.8.130, but we cannot find a local Raylet with the same address. This can happen when you connect to the Ray cluster with a different IP address or when connecting to a container.
2023-07-10 14:12:12,463 INFO worker.py:1627 -- Connected to Ray cluster. View the dashboard at http://0.0.0.0:8265
[2023-07-10 14:12:21,467 E 99920 99928] core_worker_process.cc:217: Failed to get the system config from raylet because it is dead. Worker will terminate. Status: GrpcUnavailable: RPC Error message: failed to connect to all addresses; RPC Error details: .Please see `raylet.out` for more details.
Finally:
❯ cat /tmp/ray/session_latest/logs/python-core-driver-08000000ffffffffffffffffffffffffffffffffffffffffffffffff_99920.log
[2023-07-10 14:12:12,464 I 99920 99920] core_worker_process.cc:107: Constructing CoreWorkerProcess. pid: 99920
[2023-07-10 14:12:21,467 E 99920 99928] core_worker_process.cc:217: Failed to get the system config from raylet because it is dead. Worker will terminate. Status: GrpcUnavailable: RPC Error message: failed to connect to all addresses; RPC Error details: .Please see `raylet.out` for more details.
Any idea what is happening? If I use the IP address of the Docker container instead of `auto` in `ray.init()`, it still does not work.
Thanks!