Reproducibility of ray.tune with seeds

Peter_Pirog · July 24, 2022, 2:40pm

@philmax @mannyv I have solved the problem with reproducibillity by making function:

def set_reproducibillity(seed=None):
    if seed is None:
        seed = 42
    tf.random.set_seed(seed)
    tf.keras.utils.set_random_seed(seed)
    tf.config.experimental.enable_op_determinism() #tested with tensorflow=2.9.1
    np.random.seed(seed)
    random.seed(seed)

This function must be placed in two places:

main body
trainable function
the example code is below:

import random
import numpy as np
import ray
import tensorflow as tf
from ray import tune
from ray.tune.integration.keras import TuneReportCallback
from ray.tune.schedulers import ASHAScheduler
from tensorflow.keras.datasets import mnist


def set_reproducibillity(seed=None):
    if seed is None:
        seed = 42
    tf.random.set_seed(seed)
    tf.keras.utils.set_random_seed(seed)
    tf.config.experimental.enable_op_determinism()
    np.random.seed(seed)
    random.seed(seed)


def train_mnist(config):

    if config["reproducibility_active"]:
        set_reproducibillity()
    batch_size = config["batch"]
    num_classes = 10
    epochs = 200

    (x_train, y_train), (x_test, y_test) = mnist.load_data()
    x_train, x_test = x_train / 255.0, x_test / 255.0

    # define model
    inputs = tf.keras.layers.Input(shape=(28, 28))
    x = tf.keras.layers.Flatten()(inputs)
    x = tf.keras.layers.LayerNormalization()(x)
    for i in range(config["layers"]):
        x = tf.keras.layers.Dense(units=config["hidden"], activation=config["activation"])(x)
        x = tf.keras.layers.Dropout(config["dropout"])(x)
    outputs = tf.keras.layers.Dense(units=num_classes, activation="softmax")(x)

    model = tf.keras.Model(inputs=inputs, outputs=outputs, name="mnist_model")
    model.compile(
        loss="sparse_categorical_crossentropy",
        optimizer=tf.keras.optimizers.Adam(learning_rate=config["learning_rate"]),
        metrics=["accuracy"])

    model.fit(
        x_train,
        y_train,
        batch_size=batch_size,
        epochs=epochs,
        verbose=0,
        validation_data=(x_test, y_test),
        callbacks=[TuneReportCallback({
            "mean_accuracy": "val_accuracy"  ##optional values ['loss', 'accuracy', 'val_loss', 'val_accuracy']
        })])


if __name__ == "__main__":

    print('Is cuda available for container:', tf.config.list_physical_devices('GPU'))
    ray.init()
    config = {
        "reproducibility_active": True,
        "learning_rate": tune.choice([1e-5, 1e-4, 1e-3, 1e-2]),
        "hidden": tune.choice([16, 32, 64, 128]),
        "dropout": tune.choice([0.01, 0.02, 0.05, 0.1, 0.2]),  # tune.uniform(0.01, 0.2)
        "activation": tune.choice(["relu", "elu"]),
        "layers": tune.choice([1, 2, 3]),
        "batch": tune.choice([4, 8, 16, 32, 64, 128]),
    }

    if config["reproducibility_active"]:
        set_reproducibillity()

    sched_asha = ASHAScheduler(time_attr="training_iteration",
                               max_t=100,
                               grace_period=10,
                               # mode='max', #find maximum, do not define here if you define in tune.run
                               reduction_factor=3,
                               # brackets=1
                               )

    analysis = tune.run(
        train_mnist,
        name="exp",
        scheduler=sched_asha,
        # Checkpoint settings
        keep_checkpoints_num=3,
        checkpoint_freq=3,
        checkpoint_at_end=True,
        # Optimalization
        metric="mean_accuracy",
        mode="max",
        stop={  # trial is finished if this value is reached
            "mean_accuracy": 0.96,
            "training_iteration": 10,
            'time_this_iter_s': 50,
            # 'timesteps_total': 1000,
            # 'episodes_total': 1000,
            # 'time_total_s': 1000,
        },
        time_budget_s=200,  # Global time budget in seconds after which all trials are stopped.
        num_samples=10,  # number of tested configurations from hyperspace
        reuse_actors=True,

        local_dir='../ray_results',  # default value is ~/ray_results
        resources_per_trial={
            "cpu": 1,
            "gpu": 0
        },
        config=config,
        verbose=3,  # values 0 to 3
    )
    print("Best hyperparameters found were: ", analysis.best_config)

Topic		Replies	Views
Confusing behavior in PPO training loop (train_batch_size, sgd_minibatch_size, num_sgd_iter) RLlib	1	336	July 27, 2022
PPO configuration parameters: num_rollout_workers & train_batch_size Configure Algorithm, Training, Evaluation, Scaling	1	348	November 2, 2023
Cannot reproduce training results in evaluation even on same dataset RLlib	1	403	November 20, 2022
Bad inference after perfect training. What am I missing? RLlib	3	601	June 8, 2022
PPO order of actions/obs/rewards scrambled RLlib	1	408	January 15, 2022

Reproducibility of ray.tune with seeds

Related Topics