Hi, I'm using the image horovod/horovod-ray:0.24.3 on 3 machines, deployed with this Compose file:
# Ray cluster built from horovod/horovod-ray:0.24.3: one head service plus
# two worker replicas that join it over a shared network.
#
# Nesting is written out explicitly with 2-space indentation. With every key
# at column 0 the per-service keys (image, ports, command, deploy, ...)
# collide as duplicate top-level keys and `services` has no children.
version: "3.8"

services:
  # Head node: started with `ray start --head`; serves the cluster port and
  # the dashboard.
  ray-head:
    image: horovod/horovod-ray:0.24.3
    ports:
      - "6379:6379"    # matches --port=6379 in the command below
      - "8265:8265"    # matches --dashboard-port=8265 in the command below
      - "10001:10001"  # presumably the Ray client port - TODO confirm
    # NOTE(review): the password is hard-coded on the command line; consider
    # supplying it through an environment variable or a secret instead.
    # NOTE(review): no --num-cpus is passed here, so Ray chooses the head's
    # CPU count itself - verify it agrees with the 1-CPU limit below.
    command: >-
      bash -c "ray start --head --dashboard-port=8265 --port=6379
      --dashboard-host=0.0.0.0 --redis-password=passwd --block"
    # NOTE(review): confirm shm_size is honored by the deploy method in use.
    shm_size: 2g
    deploy:
      placement:
        constraints:
          - "node.role==manager"
      resources:
        limits:
          cpus: '1'
          memory: '2g'
    networks:
      - ray_net

  # Worker nodes: two replicas, each joining the head at ray-head:6379 and
  # advertising 2 CPUs to Ray.
  ray-worker:
    image: horovod/horovod-ray:0.24.3
    ports:
      - "9500:9500"  # purpose not visible in this file - TODO confirm
    # NOTE(review): the Compose v3 reference documents depends_on as ignored
    # by `docker stack deploy`; workers may start before the head - confirm.
    depends_on:
      - ray-head
    # --num-cpus=2 mirrors the cpus limit under deploy.resources below.
    command: >-
      bash -c "ray start --address=ray-head:6379 --redis-password=passwd
      --num-cpus=2 --block"
    # NOTE(review): confirm shm_size is honored by the deploy method in use.
    shm_size: 2g
    deploy:
      replicas: 2
      placement:
        constraints:
          - "node.role==worker"
      resources:
        limits:
          cpus: '2'
          memory: '2g'
    networks:
      - ray_net

# Network shared by the head and the workers; default settings.
networks:
  ray_net: {}
It works properly until I call executor.start().
Sometimes I get an error saying that the cluster has 5 CPUs but that {CPU: 1, CPU: 1} was required.
Any ideas? I had the same problem with the KubeRay example.