Hello, I am trying to use 2 nodes, each with 4 GPUs, for distributed training: a single training run that uses all 8 GPUs.
Below is the sbatch file.
#!/bin/bash
#SBATCH --job-name=RAYTUNE
#SBATCH --nodes=2
#SBATCH --gpus-per-task=4
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=8
#SBATCH --mem=50gb
#SBATCH --time=1:00:00
#SBATCH --output=slurm_%j.out
#SBATCH --error=slurm_%j.err
#SBATCH --wait-all-nodes=1
set -x
export NCCL_IB_DISABLE=1
export NCCL_DEBUG=INFO
export CUDA_LAUNCH_BLOCKING=1
redis_password=$(uuidgen)
export redis_password
nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST") # Getting the node names
nodes_array=($nodes)
node_1=${nodes_array[0]}
ip=$(srun --nodes=1 --ntasks=1 -w "$node_1" hostname --ip-address) # get the head node's IP for the Ray address
# if we detect a space character in the head node IP,
# we take the IPv4 address. This step is optional.
if [[ "$ip" == *" "* ]]; then
  IFS=' ' read -ra ADDR <<< "$ip"
  if [[ ${#ADDR[0]} -gt 16 ]]; then
    ip=${ADDR[1]}
  else
    ip=${ADDR[0]}
  fi
  echo "IPv6 address detected. Using the IPv4 address $ip instead."
fi
port=6379
head_node_ip=$ip
ip_head=$ip:$port
export head_node_ip
export ip_head
echo "IP Head: $ip_head"
echo "STARTING HEAD at $node_1"
srun --nodes=1 --ntasks=1 --mem=50G --gres=gpu:4 -c $SLURM_CPUS_ON_NODE -w "$node_1" \
ray start --head --node-ip-address="$ip" --port=$port --redis-password="$redis_password" --block &
sleep 30
worker_num=$((SLURM_JOB_NUM_NODES - 1)) #number of nodes other than the head node
for ((i = 1; i <= worker_num; i++)); do
  node_i=${nodes_array[$i]}
  echo "STARTING WORKER $i at $node_i"
  srun --nodes=1 --ntasks=1 --mem=50G --gres=gpu:4 -c $SLURM_CPUS_ON_NODE -w "$node_i" \
    ray start --address "$ip_head" --redis-password="$redis_password" --block &
  sleep 5
done
##############################################################################################
#### call your code below
python3 -u main.py ${ARGS} &
wait
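For reference, here is a minimal way to confirm that both nodes joined the cluster before the tuner starts (a sketch using the standard ray.cluster_resources() API):

import os
import ray

# Attach to the cluster started by the sbatch script above.
ray.init(address=os.environ["ip_head"])
# With both nodes joined, this should report 80 CPUs and 8 GPUs in total.
print(ray.cluster_resources())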
Below is the tuner code.
import os

import ray
from ray import tune
from ray.train import ScalingConfig
from ray.tune.schedulers import AsyncHyperBandScheduler

ray.init(address=os.environ["ip_head"], _node_ip_address=os.environ["head_node_ip"])
print_config(ray_tune_param_space)

async_hyper_band_scheduler = AsyncHyperBandScheduler(
    grace_period=5, max_t=config["last_epoch"]
)
resources_per_trial = {"cpu": 8, "gpu": 8}
scaling_config = ScalingConfig(
    num_workers=1, use_gpu=True, resources_per_worker={"CPU": 2, "GPU": 8}
)
ray_tune_param_space["scaling_config"] = scaling_config

tuner = tune.Tuner(
    tune.with_resources(my_trainer, resources_per_trial),
    tune_config=tune.TuneConfig(
        metric="train_loss",
        mode="min",
        max_concurrent_trials=1,
        scheduler=async_hyper_band_scheduler,
    ),
    run_config=ray.train.RunConfig(
        name=config["rt_name"],
        storage_path=config["rt_storage_path"],
        stop=model_wrapper.get_stop_conditions(),
        checkpoint_config=ray.train.CheckpointConfig(num_to_keep=1),
        verbose=2,
    ),
    param_space=ray_tune_param_space,
)
results = tuner.fit()
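If a single {'GPU': 8} bundle cannot span nodes, my guess is that the trial would have to be expressed as two 4-GPU workers instead, roughly like the sketch below (train_loop_per_worker is a hypothetical stand-in for my actual training function; I have not verified that this works):

from ray import tune
from ray.train import ScalingConfig
from ray.train.torch import TorchTrainer

# Hypothetical per-worker training loop standing in for my real training code.
def train_loop_per_worker(train_config):
    ...

# Two workers, one per node, 4 GPUs each, so no single bundle exceeds a node.
trainer = TorchTrainer(
    train_loop_per_worker,
    scaling_config=ScalingConfig(
        num_workers=2,
        use_gpu=True,
        resources_per_worker={"CPU": 8, "GPU": 4},
    ),
)
tuner = tune.Tuner(trainer, param_space=ray_tune_param_space)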
But I get the following error:
(autoscaler +2m26s) Error: No available node types can fulfill resource request {'GPU': 8.0, 'CPU': 8.0}. Add suitable node types to this cluster to resolve this issue.
Trial status: 50 PENDING
Current time: 2024-07-13 18:05:40. Total running time: 2min 30s
Logical resource usage: 0/80 CPUs, 0/8 GPUs (0.0/2.0 accelerator_type:T4)
This is the output from ray status:
$ ray status -v --address ####:6379
======== Autoscaler status: 2024-07-13 12:11:09.148586 ========
GCS request time: 0.001648s
Node Provider non_terminated_nodes time: 0.000018s
Node status
---------------------------------------------------------------
Active:
1 node_0c94c6e82406a1d10416e813d9701c8f951c9d0d9c2cb2378008bff2
1 node_04e75ba416ba00dd4dd0df77a886be7cd1d30c794419877f12e71d7a
Pending:
(no pending nodes)
Recent failures:
(no failures)
Resources
---------------------------------------------------------------
Total Usage:
0.0/80.0 CPU
0.0/8.0 GPU
0.0/2.0 accelerator_type:RTX
0B/317.22GiB memory
0B/38.00GiB object_store_memory
Total Demands:
{'CPU': 8.0, 'GPU': 8.0} * 1 (PACK): 1+ pending placement groups
Node: 0c94c6e82406a1d10416e813d9701c8f951c9d0d9c2cb2378008bff2 (node_0c94c6e82406a1d10416e813d9701c8f951c9d0d9c2cb2378008bff2)
Usage:
0.0/40.0 CPU
0.0/4.0 GPU
0.0/1.0 accelerator_type:RTX
0B/153.87GiB memory
0B/19.00GiB object_store_memory
Node: 04e75ba416ba00dd4dd0df77a886be7cd1d30c794419877f12e71d7a (node_04e75ba416ba00dd4dd0df77a886be7cd1d30c794419877f12e71d7a)
Usage:
0.0/40.0 CPU
0.0/4.0 GPU
0.0/1.0 accelerator_type:RTX
0B/163.34GiB memory
0B/19.00GiB object_store_memory
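To rule out the scheduler itself, I believe one can request two 4-GPU bundles directly as a placement group (a sketch using ray.util.placement_group; with only 4 GPUs per node, the two bundles would have to land on different nodes):

import ray
from ray.util.placement_group import placement_group

ray.init(address="auto")
# Two bundles of 4 GPUs each; SPREAD tries to place bundles on different
# nodes, and with 4 GPUs per node they cannot fit on one node anyway.
pg = placement_group([{"CPU": 8, "GPU": 4}, {"CPU": 8, "GPU": 4}], strategy="SPREAD")
ray.get(pg.ready())  # blocks until both bundles are scheduled
print("both 4-GPU bundles placed")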
From the status output, both nodes have joined the cluster (40 CPUs and 4 GPUs each), but the pending placement group demands {'CPU': 8.0, 'GPU': 8.0} as a single PACK bundle, which no single node can satisfy. Could someone please tell me whether a single trial can span two nodes like this, or whether I am misunderstanding how resources are allocated?
Thank you