Hi, I am a new Ray user trying to run Ray on a Slurm cluster. My Slurm batch script is posted below; it requests 5 nodes, 1 task per node, 1 CPU per task, and 1 task per core. So after running the trainer.py attached below, I would expect 4 worker nodes (5 minus the head node) to be scheduled, and the print output to show tasks running on different nodes. However, the IP-address counter keeps showing that only one node, the one with IP address 10.1.0.32, is used by the remote tasks. Any help would be appreciated!
SLURM BATCH SCRIPT
#!/bin/bash
#SBATCH --job-name=test
#SBATCH --cpus-per-task=1
#SBATCH --mem-per-cpu=1GB
#SBATCH --nodes=5
#SBATCH --ntasks-per-node=1
#SBATCH --ntasks-per-core=1
#SBATCH --time=12:00:00
#SBATCH -C centos7 # Request only CentOS 7 nodes
#SBATCH -p sched_mit_hill # Partition to run on
#SBATCH -o output_%j.txt # Redirect output to output_JOBID.txt
#SBATCH -e error_%j.txt # Redirect errors to error_JOBID.txt
#SBATCH --mail-type=BEGIN,END # Mail when the job starts and ends
#SBATCH --mail-user=gstepan@mit.edu # Email recipient
let "worker_num=(${SLURM_NTASKS} - 1)"
# Define the total number of CPU cores available to ray
let "total_cores=${worker_num} * ${SLURM_CPUS_PER_TASK}"
module add engaging/anaconda/2.3.0
module add engaging/Ray/2.3.1
source activate py36
nodes=$(scontrol show hostnames $SLURM_JOB_NODELIST) # Getting the node names
nodes_array=( $nodes )
node1=${nodes_array[0]}
ip_prefix=$(srun --nodes=1 --ntasks=1 -w $node1 hostname --ip-address) # Get the head node's IP address
suffix=':6379'
ip_head=$ip_prefix$suffix
redis_password=$(uuidgen)
export ip_head # Exporting for later access by trainer.py
srun --nodes=1 --ntasks=1 -w $node1 ray start --block --head --port=6379 --redis-password=$redis_password & # Starting the head
sleep 5
# Make sure the head node starts before any worker, otherwise the workers
# will not be able to connect to Redis. If the head takes longer to come up,
# increase the sleep time above to preserve the startup order.
for (( i=1; i<=$worker_num; i++ ))
do
node2=${nodes_array[$i]}
srun --nodes=1 --ntasks=1 -w $node2 ray start --block --address=$ip_head --redis-password=$redis_password & # Starting the workers
# Flag --block will keep ray process alive on each compute node.
sleep 5
done
python -u trainer.py $redis_password ${total_cores} # Pass the number of worker CPUs computed above
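One detail I noticed in the cluster output below: Ray autodetects all physical cores on each node (16-20 CPUs), not the 1 CPU per task that Slurm actually allocated. For reference, here is a sketch of how the two ray start lines would look with the CPU count pinned to the Slurm allocation via the standard --num-cpus flag. This is not what produced the output below, just a variant I am considering:

# Head node: advertise only the CPUs Slurm allocated to this task
srun --nodes=1 --ntasks=1 -w $node1 ray start --block --head --port=6379 \
    --redis-password=$redis_password --num-cpus=${SLURM_CPUS_PER_TASK} &
# Worker nodes (inside the loop): pinned the same way
srun --nodes=1 --ntasks=1 -w $node2 ray start --block --address=$ip_head \
    --redis-password=$redis_password --num-cpus=${SLURM_CPUS_PER_TASK} &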
trainer.py CODE FILE
from collections import Counter
import os
import sys
import time
import ray
redis_password = sys.argv[1]
num_cpus = int(sys.argv[2])
ray.init(address=os.environ["ip_head"], _redis_password=redis_password)
print("Nodes in the Ray cluster:")
print(ray.nodes())
print(ray.cluster_resources())
@ray.remote(num_cpus=1)
def f():
    print('hello')
    time.sleep(60)
    return ray._private.services.get_node_ip_address()
# Each pass through this loop takes about 60 seconds: the tasks run in
# parallel (assuming Ray can use all of the allocated nodes), and each
# task sleeps for 60 seconds.
for i in range(60):
    start = time.time()
    ip_addresses = ray.get([f.remote() for _ in range(num_cpus)])
    print(Counter(ip_addresses))
    end = time.time()
    print(end - start)
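For what it's worth, here is a small sketch of how trainer.py could wait until every allocated node has registered before submitting tasks, instead of relying on the fixed sleep 5 calls in the batch script. This is not part of the run shown below; wait_for_nodes and its expected_nodes/timeout parameters are hypothetical helpers of my own, built only on ray.nodes(), which returns the per-node dicts shown in the output:

import time
import ray

def wait_for_nodes(expected_nodes, timeout=120):
    # Hypothetical helper: block until `expected_nodes` entries in
    # ray.nodes() report 'Alive': True, or give up after `timeout` seconds.
    deadline = time.time() + timeout
    alive = []
    while time.time() < deadline:
        alive = [n for n in ray.nodes() if n["Alive"]]
        if len(alive) >= expected_nodes:
            return
        time.sleep(1)
    raise TimeoutError(
        f"only {len(alive)} of {expected_nodes} Ray nodes joined the cluster")

# Usage after ray.init(), for the 5-node allocation above:
# wait_for_nodes(5)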
OUTPUT OF RUN
Nodes in the Ray cluster:
[{'NodeID': '0f318fe40c5c5e0360c67fe35c7708d7b031312310d42751b343ab7e', 'Alive': True, 'NodeManagerAddress': '10.1.0.32', 'NodeManagerHostname': 'node032', 'NodeManagerPort': 53181, 'ObjectManagerPort': 41626, 'ObjectStoreSocketName': '/tmp/ray/session_2021-01-28_15-52-00_234710_13057/sockets/plasma_store', 'RayletSocketName': '/tmp/ray/session_2021-01-28_15-52-00_234710_13057/sockets/raylet', 'MetricsExportPort': 60340, 'alive': True, 'Resources': {'memory': 739.0, 'node:10.1.0.32': 1.0, 'CPU': 16.0, 'object_store_memory': 255.0, 'GPU': 1.0}},
 {'NodeID': '836df0105c707f63d0a315fc98769a5fd11b61c4907c17dbb59082df', 'Alive': True, 'NodeManagerAddress': '10.1.3.82', 'NodeManagerHostname': 'node382', 'NodeManagerPort': 47231, 'ObjectManagerPort': 39475, 'ObjectStoreSocketName': '/tmp/ray/session_2021-01-28_15-52-00_234710_13057/sockets/plasma_store', 'RayletSocketName': '/tmp/ray/session_2021-01-28_15-52-00_234710_13057/sockets/raylet', 'MetricsExportPort': 45048, 'alive': True, 'Resources': {'object_store_memory': 238.0, 'memory': 806.0, 'node:10.1.3.82': 1.0, 'CPU': 20.0}},
 {'NodeID': '54d41a42d6d23cf3c5d90dc14e525ca698574036a89eeba472bfad47', 'Alive': True, 'NodeManagerAddress': '10.1.3.70', 'NodeManagerHostname': 'node370', 'NodeManagerPort': 54569, 'ObjectManagerPort': 43297, 'ObjectStoreSocketName': '/tmp/ray/session_2021-01-28_15-52-00_234710_13057/sockets/plasma_store', 'RayletSocketName': '/tmp/ray/session_2021-01-28_15-52-00_234710_13057/sockets/raylet', 'MetricsExportPort': 65395, 'alive': True, 'Resources': {'memory': 825.0, 'CPU': 20.0, 'node:10.1.3.70': 1.0, 'object_store_memory': 244.0}},
 {'NodeID': '11a1f14ac4d5d998846f1e0e59491fad3464cbfb9df8b7a92d489e87', 'Alive': True, 'NodeManagerAddress': '10.1.3.83', 'NodeManagerHostname': 'node383', 'NodeManagerPort': 53786, 'ObjectManagerPort': 46172, 'ObjectStoreSocketName': '/tmp/ray/session_2021-01-28_15-52-00_234710_13057/sockets/plasma_store', 'RayletSocketName': '/tmp/ray/session_2021-01-28_15-52-00_234710_13057/sockets/raylet', 'MetricsExportPort': 57737, 'alive': True, 'Resources': {'memory': 854.0, 'object_store_memory': 252.0, 'node:10.1.3.83': 1.0, 'CPU': 20.0}},
 {'NodeID': '72eaf797f9ca333ece9f1ae138022be20967d18a8f00bae513409fb2', 'Alive': True, 'NodeManagerAddress': '10.1.3.69', 'NodeManagerHostname': 'node369', 'NodeManagerPort': 62447, 'ObjectManagerPort': 43913, 'ObjectStoreSocketName': '/tmp/ray/session_2021-01-28_15-52-00_234710_13057/sockets/plasma_store', 'RayletSocketName': '/tmp/ray/session_2021-01-28_15-52-00_234710_13057/sockets/raylet', 'MetricsExportPort': 63544, 'alive': True, 'Resources': {'CPU': 20.0, 'object_store_memory': 248.0, 'memory': 840.0, 'node:10.1.3.69': 1.0}}]
{'CPU': 96.0, 'object_store_memory': 1237.0, 'node:10.1.0.32': 1.0, 'memory': 4064.0, 'GPU': 1.0, 'node:10.1.3.82': 1.0, 'node:10.1.3.70': 1.0, 'node:10.1.3.83': 1.0, 'node:10.1.3.69': 1.0}
(pid=13246) hello
(pid=13248) hello
(pid=13247) hello
(pid=13249) hello
Counter({'10.1.0.32': 4})
67.03512859344482
(pid=13246) hello
(pid=13248) hello
(pid=13247) hello
(pid=13249) hello
Counter({'10.1.0.32': 4})
60.06458353996277
[... the same block repeats identically for every remaining iteration: four "hello" lines from pids 13246-13249, Counter({'10.1.0.32': 4}), and an elapsed time of ~60 seconds, until the run was cut off ...]