I am using Ray on a cluster managed by SLURM, and the Ray actor uses only one core even though I have allocated more cores to it (`num_cpus=12`).
Here is the Python code:
import numpy as np
import ray
import time
import os
@ray.remote(num_cpus=12)
class CPUActor:
    """Ray actor that holds two random square matrices and multiplies them.

    The actor is declared with ``num_cpus=12``.

    NOTE(review): Ray may set ``OMP_NUM_THREADS`` for its worker processes,
    which would cap the BLAS threads used by the matrix product regardless
    of ``num_cpus`` -- confirm against the Ray version in use.
    """

    def __init__(self, n):
        """Draw two ``n`` x ``n`` matrices of standard-normal samples."""
        self.A = np.random.randn(n, n)
        self.B = np.random.randn(n, n)

    def mul(self):
        """Return the mean of the entries of the matrix product ``A @ B``."""
        product = self.A @ self.B
        return product.mean()
# Problem size (matrices are n x n) and number of timed multiplications.
n = 5000
K = 10

# Attach to an already-running Ray cluster rather than starting a new one.
ray.init(address="auto")
actor = CPUActor.remote(n)

# Time K sequential round trips: each iteration blocks on the actor's result
# before submitting the next call.
start_time = time.time()
for _ in range(K):
    temp = ray.get(actor.mul.remote())
used_time = time.time() - start_time

print(f"used time in ray: {used_time:.4f}", flush=True)
This is the Slurm batch script I used to run the Python code:
#!/bin/bash
# Slurm batch script: start a single-node Ray head in the background, then
# run use_ray.py against it from the batch shell.
#SBATCH --job-name=test_ray
#SBATCH --partition=xeon-p8
#SBATCH --time=00:10:00
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=36
#SBATCH --open-mode=truncate
#SBATCH --output=./use_ray.txt

## get node names
nodelist=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
echo "node list: $nodelist"
# Intentionally unquoted: rely on word splitting to build the array.
nodes_array=($nodelist)

## write the first node's IP address to a per-job file
srun --nodes=1 --ntasks=1 --nodelist="${nodes_array[0]}" \
     --output="./ip_address_${SLURM_JOB_ID}.txt" \
     --open-mode=truncate \
     --error=/dev/null \
     hostname --ip-address

ip_prefix=$(cat "./ip_address_${SLURM_JOB_ID}.txt") # making redis-address
suffix=':6379'
ip_head=$ip_prefix$suffix
export ip_head
echo "ip_head: ${ip_head}"

echo "STARTING HEAD at ${nodes_array[0]}"
echo "num of cpus:", $SLURM_CPUS_PER_TASK

## start the Ray head as a background job step
# Pass --cpus-per-task explicitly: since Slurm 22.05, srun no longer inherits
# the batch job's --cpus-per-task, so without it this step (and every Ray
# worker it spawns) may be confined to a single CPU even though --num-cpus
# tells Ray that all of the job's CPUs are available.
srun --nodes=1 --ntasks=1 --cpus-per-task="$SLURM_CPUS_PER_TASK" \
     --nodelist="${nodes_array[0]}" \
     ray start --head --block \
     --port 6379 --temp-dir=/home/gridsan/dingxq/tmp/ray \
     --num-cpus="$SLURM_CPUS_PER_TASK" &
# Give the head node time to come up before connecting to it.
sleep 10

## run the driver against the head started above
export RAY_ADDRESS=$ip_head
python ./use_ray.py
exit