How severely does this issue affect your experience of using Ray?
- High: It blocks me from completing my task.
Hello everyone, I am still trying to create a usable cluster. Since the cluster launcher did not work for me on GCP, I am now using three Google Cloud instances, each with 2 cores, 8 GB of RAM, and a 100 GB disk. I set up a Docker swarm with all three, and I’m using this compose file:
# Swarm stack for a small Ray cluster: one head on the manager node and one
# worker on every node whose swarm role is "worker".
services:
  ray-head:
    image: camazaqui/ray-horovod:latest
    ports:
      - "6379:6379"    # matches --port in the head command below
      - "8265:8265"    # matches --dashboard-port in the head command below
      - "10001:10001"  # port the driver script dials via ray://<head-address>:10001
    env_file:
      - .env
    # --num-cpus=0 advertises no CPUs on the head; --block keeps `ray start`
    # in the foreground so the container stays alive.
    command: bash -c "ray start --head --dashboard-port=8265 --port=6379 --dashboard-host=0.0.0.0 --redis-password=passwd --num-cpus=0 --block"
    # NOTE(review): swarm services may not honor shm_size - verify the size of
    # /dev/shm inside the running container.
    shm_size: 2g
    deploy:
      mode: global
      placement:
        constraints:
          - "node.role==manager"
      resources:
        limits:
          cpus: '1'
          memory: '2g'
    networks:
      - ray_net
  ray-worker:
    image: camazaqui/ray-horovod:latest
    ports:
      - "9500:9500"
    # NOTE(review): the Compose v3 reference says depends_on is ignored by
    # `docker stack deploy`, so workers may start before the head is ready -
    # confirm the workers actually join.
    depends_on:
      - ray-head
    env_file:
      - .env
    # Joins the head through the service name "ray-head" on the shared network.
    command: bash -c "ray start --address=ray-head:6379 --redis-password=passwd --num-cpus=2 --block"
    # NOTE(review): see the shm_size caveat on ray-head.
    shm_size: 2g
    deploy:
      mode: global
      placement:
        constraints:
          - "node.role==worker"
      resources:
        limits:
          cpus: '2'
          memory: '2g'
    networks:
      - ray_net
networks:
  ray_net:
The container image that I’m using is built from the following Dockerfile:
# Ray ML CPU image extended with a C/C++ build toolchain, dockerize, and
# Horovod built with Gloo and TensorFlow support (no MPI).
FROM rayproject/ray-ml:latest-cpu

# Update the package index and install in the SAME layer so a cached, stale
# index is never reused by a later install; drop the lists to keep the layer small.
RUN sudo apt-get update \
    && sudo apt-get install -y wget build-essential cmake gcc g++ \
    && sudo rm -rf /var/lib/apt/lists/*

ENV DOCKERIZE_VERSION=v0.6.1
# Download dockerize, unpack it into /usr/local/bin, and delete the tarball.
RUN sudo wget https://github.com/jwilder/dockerize/releases/download/$DOCKERIZE_VERSION/dockerize-linux-amd64-$DOCKERIZE_VERSION.tar.gz \
    && sudo tar -C /usr/local/bin -xzvf dockerize-linux-amd64-$DOCKERIZE_VERSION.tar.gz \
    && sudo rm dockerize-linux-amd64-$DOCKERIZE_VERSION.tar.gz

# The HOROVOD_* variables select the build features; --no-cache-dir keeps
# pip's download cache out of the image.
RUN HOROVOD_WITH_GLOO=1 HOROVOD_WITHOUT_MPI=1 HOROVOD_WITH_TENSORFLOW=1 \
    pip install --no-cache-dir 'horovod[ray,tensorflow,keras]'
Even though it takes some time for the cluster to set up, it looks like I can access it.
I have a script that runs this:
# Cluster address is "<NODE_IP_ADDRESS>:<HOST_PORT>", both read from the environment.
head_host = os.environ["NODE_IP_ADDRESS"]
head_port = os.environ["HOST_PORT"]
address = f'{head_host}:{head_port}'

# Runtime environment for the job: the local `training_function` entry is
# listed under py_modules, and pip requirements come from ./requirements.txt.
runtime_env = {
    "py_modules": [training_function],
    "pip": "./requirements.txt",
}

ray.init(address=address, runtime_env=runtime_env)
and it works properly, printing these messages:
INFO packaging.py:414 -- Creating a file package for local directory '/usr/src/binary_executor/training_function'.
INFO packaging.py:258 -- Pushing file package 'gcs://_ray_pkg_94e3d405b1033ff1.zip' (0.01MiB) to Ray cluster...
INFO packaging.py:267 -- Successfully pushed file package 'gcs://_ray_pkg_94e3d405b1033ff1.zip'.
When I try to connect with this script:
import ray
from horovod.ray import RayExecutor
def train(num_epochs):
    """Train a ResNet50 on CIFAR-10 using Horovod's Keras integration.

    The function is handed to ``RayExecutor.run`` by the script entry point.
    It initializes Horovod, builds and compiles the model with a
    Horovod-wrapped SGD optimizer, fits it, and prints the all-reduced test
    loss and accuracy on rank 0.

    Args:
        num_epochs: Number of epochs passed to ``model.fit``.
    """
    # Function-scope imports are resolved where the function executes rather
    # than where the module is first imported.
    import tensorflow as tf
    import horovod.tensorflow.keras as hvd

    hvd.init()

    (x_train, y_train), (x_test, y_test) = tf.keras.datasets.cifar10.load_data()
    y_train = tf.keras.utils.to_categorical(y_train, 10)
    y_test = tf.keras.utils.to_categorical(y_test, 10)

    # Training data is augmented (shift / zoom / flip); test data only gets a
    # fixed zoom. Both use the ResNet50 preprocessing function.
    train_dat = tf.keras.preprocessing.image.ImageDataGenerator(
        width_shift_range=0.33, height_shift_range=0.33, zoom_range=0.5, horizontal_flip=True,
        preprocessing_function=tf.keras.applications.resnet50.preprocess_input
    )
    test_dat = tf.keras.preprocessing.image.ImageDataGenerator(
        zoom_range=(0.875, 0.875),
        preprocessing_function=tf.keras.applications.resnet50.preprocess_input
    )

    # NOTE(review): with include_top=False the model has no classification
    # head, so `classes=10` is presumably unused and the output shape may not
    # match the 10-way one-hot labels - verify before relying on the loss.
    model = tf.keras.applications.ResNet50(
        include_top=False,
        weights=None,
        input_shape=[32, 32, 3],
        classes=10,
    )

    # Learning rate is scaled by the number of Horovod workers.
    scaled_lr = 0.0125 * hvd.size()
    opt = tf.optimizers.SGD(learning_rate=scaled_lr, momentum=0.9)
    opt = hvd.DistributedOptimizer(
        opt
    )

    # Patch the layer configs (L2 kernel regularizer, BatchNorm momentum and
    # epsilon) and rebuild the model from the edited config.
    model_config = model.get_config()
    for layer, layer_config in zip(model.layers, model_config['layers']):
        if hasattr(layer, 'kernel_regularizer'):
            regularizer = tf.keras.regularizers.l2(0.00005)
            layer_config['config']['kernel_regularizer'] = \
                {'class_name': regularizer.__class__.__name__,
                 'config': regularizer.get_config()}
        # isinstance instead of an exact type comparison, so subclasses of
        # BatchNormalization are handled as well.
        if isinstance(layer, tf.keras.layers.BatchNormalization):
            layer_config['config']['momentum'] = 0.9
            layer_config['config']['epsilon'] = 1e-5
    model = tf.keras.models.Model.from_config(model_config)

    model.compile(
        loss=tf.losses.categorical_crossentropy,
        optimizer=opt,
        metrics=['accuracy', 'top_k_categorical_accuracy'],
    )

    callbacks = [
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),
        hvd.callbacks.MetricAverageCallback(),
        hvd.callbacks.LearningRateWarmupCallback(
            initial_lr=scaled_lr, warmup_epochs=5, verbose=1
        )
    ]
    # Only rank 0 writes checkpoints and TensorBoard logs.
    if hvd.rank() == 0:
        callbacks.append(tf.keras.callbacks.ModelCheckpoint("./checkpoint-{epoch}.h5"))
        callbacks.append(tf.keras.callbacks.TensorBoard('./logs'))

    # Only rank 0 prints progress.
    verbose = 1 if hvd.rank() == 0 else 0

    # Model.fit accepts generators directly; Model.fit_generator is deprecated.
    # NOTE(review): the step counts are derived from sample counts while each
    # step consumes a batch of 32, so one "epoch" covers more than one pass
    # over the data - confirm this is intended.
    model.fit(
        train_dat.flow(x_train, y_train, batch_size=32),
        steps_per_epoch=len(x_train) // hvd.size(),
        callbacks=callbacks,
        epochs=num_epochs,
        verbose=verbose,
        validation_data=test_dat.flow(x_test, y_test, batch_size=32),
        validation_steps=3 * len(x_test) // hvd.size(),
    )

    score = hvd.allreduce(model.evaluate(test_dat.flow(x_test, y_test)))
    if verbose:
        print('Test loss:', score[0])
        print('Test accuracy:', score[1])
if __name__ == '__main__':
    # Connect to the cluster at the ray:// address; replace <head-address>
    # with the head node's reachable host name or IP.
    ray.init(address='ray://<head-address>:10001')

    settings = RayExecutor.create_settings(timeout_s=300)
    executor = RayExecutor(settings, num_workers=2, use_gpu=False)
    executor.start()
    try:
        executor.run(train, kwargs=dict(num_epochs=10))
    finally:
        # Always shut the executor down, even when training raises.
        executor.shutdown()
it hangs on the call to executor.start().
head-log:
worker-log:
Are there any red flags, or anything that I am missing? I just tried again, and the dashboard is accessible but very slow.