I tried to run the XGBoost training example as described here on a Ray Cluster created using ray-ml:2.7.0-py310-cpu
on GKE.
When I submit the job by running the script in the link, I get PermissionError: [Errno 13] Cannot create directory '/mnt/cluster_storage'. Detail: [errno 13] Permission denied
during the training process. Attaching the Traceback:
Traceback (most recent call last):
File "/home/ray/ray/release/air_tests/air_benchmarks/workloads/xgboost_benchmark.py", line 57, in run
super(MyProcess, self).run()
File "/home/ray/anaconda3/lib/python3.10/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/home/ray/ray/release/air_tests/air_benchmarks/workloads/xgboost_benchmark.py", line 102, in run_xgboost_training
result = trainer.fit()
File "/home/ray/anaconda3/lib/python3.10/site-packages/ray/train/base_trainer.py", line 653, in fit
result_grid = tuner.fit()
File "/home/ray/anaconda3/lib/python3.10/site-packages/ray/tune/tuner.py", line 372, in fit
return self._local_tuner.fit()
File "/home/ray/anaconda3/lib/python3.10/site-packages/ray/tune/impl/tuner_internal.py", line 579, in fit
analysis = self._fit_internal(trainable, param_space)
File "/home/ray/anaconda3/lib/python3.10/site-packages/ray/tune/impl/tuner_internal.py", line 699, in _fit_internal
analysis = run(
File "/home/ray/anaconda3/lib/python3.10/site-packages/ray/tune/tune.py", line 851, in run
experiments[i] = Experiment(
File "/home/ray/anaconda3/lib/python3.10/site-packages/ray/tune/experiment/experiment.py", line 204, in __init__
self.storage = StorageContext(
File "/home/ray/anaconda3/lib/python3.10/site-packages/ray/train/_internal/storage.py", line 474, in __init__
self._create_validation_file()
File "/home/ray/anaconda3/lib/python3.10/site-packages/ray/train/_internal/storage.py", line 498, in _create_validation_file
self.storage_filesystem.create_dir(self.experiment_fs_path)
File "pyarrow/_fs.pyx", line 593, in pyarrow._fs.FileSystem.create_dir
File "pyarrow/error.pxi", line 113, in pyarrow.lib.check_status
PermissionError: [Errno 13] Cannot create directory '/mnt/cluster_storage'. Detail: [errno 13] Permission denied
Ray Cluster yaml
apiVersion: ray.io/v1alpha1
kind: RayCluster
metadata:
labels:
controller-tools.k8s.io: "1.0"
name: raycluster-autoscaler
spec:
rayVersion: '2.7.0'
enableInTreeAutoscaling: true
autoscalerOptions:
upscalingMode: Default
imagePullPolicy: IfNotPresent
securityContext: {}
env: []
envFrom: []
resources:
limits:
cpu: "500m"
memory: "512Mi"
requests:
cpu: "500m"
memory: "512Mi"
headGroupSpec:
rayStartParams:
dashboard-host: '0.0.0.0'
#pod template
template:
spec:
nodeSelector:
cloud.google.com/gke-nodepool: default-pool
containers:
# The Ray head container
- name: ray-head
image: rayproject/ray-ml:2.7.0-py310-cpu
ports:
- containerPort: 6379
name: gcs
- containerPort: 8265
name: dashboard
- containerPort: 10001
name: client
lifecycle:
preStop:
exec:
command: ["/bin/sh","-c","ray stop"]
resources:
limits:
cpu: "3"
memory: "12G"
requests:
cpu: "2"
memory: "11G"
workerGroupSpecs:
- replicas: 1
minReplicas: 0
maxReplicas: 10
groupName: small-group
rayStartParams:
resources: '"{\"small_jobs\": 1}"'
#pod template
template:
spec:
nodeSelector:
cloud.google.com/gke-nodepool: small-pool
containers:
- name: ray-worker
image: rayproject/ray-ml:2.7.0-py310-cpu
lifecycle:
preStop:
exec:
command: ["/bin/sh","-c","ray stop"]
resources:
limits:
cpu: "3"
memory: "12G"
requests:
cpu: "3"
memory: "12G"
- replicas: 1
minReplicas: 0
maxReplicas: 10
groupName: large-group
rayStartParams:
resources: '"{\"large_jobs\": 1}"'
#pod template
template:
spec:
nodeSelector:
cloud.google.com/gke-nodepool: large-pool
containers:
- name: ray-worker
image: rayproject/ray-ml:2.7.0-py310-cpu
lifecycle:
preStop:
exec:
command: ["/bin/sh","-c","ray stop"]
resources:
limits:
cpu: "6"
memory: "25G"
requests:
cpu: "6"
memory: "25G"