`tune.run` not executing actual trials


from ray import tune
import ray
import os
os.add_dll_directory(r'e:\cuda\bin')
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers


def numpy_circle(x_center, y_center, radius=5, nparr: np.ndarray = None, dim=None):
    """Rasterise a circle outline onto a 2-D grid and return it flattened.

    The outline is first drawn around the middle of the grid and then moved
    with ``np.roll`` so that, for even grid sizes, its centre lands on row
    ``y_center`` and column ``x_center``.  Because ``np.roll`` is cyclic, an
    outline pushed past an edge wraps around to the opposite side.

    Args:
        x_center: Column index of the circle centre.
        y_center: Row index of the circle centre.
        radius: Circle radius in grid cells.
        nparr: Optional existing array to draw into.  When given, ``dim`` is
            ignored and the grid is taken to be ``(2 * radius, 2 * radius)``;
            the outline cells are written into ``nparr`` in place before the
            (copying) roll.
        dim: ``(rows, cols)`` of a fresh all-zero grid.  Required when
            ``nparr`` is ``None``.

    Returns:
        A 1-D array holding the flattened grid, with 1 on the outline cells.

    Raises:
        TypeError: If both ``nparr`` and ``dim`` are ``None``.
    """
    if nparr is None:
        if dim is None:
            raise TypeError("numpy_circle() needs either `nparr` or `dim`")
        circle = np.zeros(dim)
    else:
        dim = (radius * 2, radius * 2)
        circle = nparr

    rows, cols = dim[0], dim[1]

    # indexing="ij" keeps the distance grid in the same (rows, cols)
    # orientation as `circle`; the default "xy" indexing would transpose it,
    # which only happens to work for square grids.
    row_idx, col_idx = np.meshgrid(np.arange(rows), np.arange(cols), indexing="ij")

    # Absolute deviation of each cell from the ideal circle equation; the
    # cells closest to the circle have the smallest value.
    r = np.abs((row_idx - rows / 2) ** 2 + (col_idx - cols / 2) ** 2 - radius ** 2)

    # A cell belongs to the outline if it is the closest cell to the circle
    # within its row or within its column.
    m1 = r.min(axis=1, keepdims=True)
    m2 = r.min(axis=0, keepdims=True)
    rr = np.logical_or(r == m1, r == m2)

    # Bounding box of the circle.  Each axis uses its own extent (the second
    # axis previously reused dim[0]).
    l_x_lim = int(rows / 2 - radius)
    u_x_lim = int(rows / 2 + radius + 1)
    l_y_lim = int(cols / 2 - radius)
    u_y_lim = int(cols / 2 + radius + 1)
    window = (slice(l_x_lim, u_x_lim), slice(l_y_lim, u_y_lim))

    # Basic slicing returns a view, so the masked assignment writes straight
    # into `circle` (and therefore into `nparr` when one was supplied).
    circle[window][rr[window]] = 1

    # Shift the outline from the grid middle to the requested centre.
    circle = np.roll(circle, int(rows / 2) + y_center, axis=0)
    circle = np.roll(circle, int(cols / 2) + x_center, axis=1)
    return circle.flatten()


def get_circle_X_y():
    """Build the complete (parameters, image) dataset of circles on an 8x8 grid.

    Every ``(x, y, r)`` combination with ``x`` and ``y`` in ``range(8)`` and
    ``r`` in ``range(1, 4)`` is enumerated, keeping only those for which
    ``min(x, y) - r >= 0`` and ``max(x, y) + r < 7``.

    Returns:
        A pair ``(X, y)``.  ``X`` stacks the kept ``[x, y, r]`` triples in
        enumeration order; ``y`` stacks the matching flattened outlines
        produced by ``numpy_circle`` for an 8x8 grid.
    """
    grid = 8
    coords = range(grid)
    radii = range(1, int(grid / 2))

    triples = []
    for cx in coords:
        for cy in coords:
            for rad in radii:
                # Skip circles that would poke past the low edge ...
                if min(cx, cy) - rad < 0:
                    continue
                # ... or reach the last row/column at the high edge.
                if max(cx, cy) + rad >= grid - 1:
                    continue
                triples.append([cx, cy, rad])
    params = np.array(triples)

    images = [
        numpy_circle(row[0], row[1], row[2], dim=[grid, grid])
        for row in params
    ]
    return np.array(list(params)), np.array(images)


def get_p_to_c_model(config):
    """Build the fully connected Keras model: 3 inputs -> 64 outputs.

    The network is five ``tanh`` hidden layers of 64 units followed by a
    64-unit ``hard_sigmoid`` output layer.  The model is returned
    uncompiled.

    Args:
        config: Trial configuration.  Currently unused by this function; it
            is accepted so the builder can be called with the same argument
            as the trainable.

    Returns:
        A ``keras.Model`` named ``"FC_Model"``.
    """
    # Imported lazily so the heavy / optional dependencies are only loaded in
    # the process that actually builds the model.
    import tensorflow as tf
    from filelock import FileLock

    # Model construction runs under a file lock shared via ~/.data.lock, so
    # concurrent processes using the same lock file build one at a time.
    with FileLock(os.path.expanduser("~/.data.lock")):
        # Keras documents `shape` as a tuple; a bare int is not accepted by
        # every Keras version.
        img_inputs = keras.Input(shape=(3,))
        x = layers.Flatten()(img_inputs)

        # Five identical hidden layers.
        for _ in range(5):
            x = layers.Dense(64, activation="tanh", dtype="float32")(x)

        outputs = layers.Dense(64, activation=tf.keras.activations.hard_sigmoid, dtype="float32")(x)
        model_p_to_c = keras.Model(inputs=img_inputs, outputs=outputs, name="FC_Model")
    return model_p_to_c


def train(config):
    """Ray Tune trainable: fit the circle model and report ``val_loss`` to Tune.

    Keras' ``val_loss`` is forwarded to Tune under the name ``mean_loss`` by
    ``TuneReportCallback`` while the model trains, so no separate final
    report is issued.  (The previous final ``tune.report`` passed the return
    value of a helper that returned nothing, i.e. ``mean_loss=None``.)

    Args:
        config: Trial hyperparameters.  Reads ``config['lr']`` (Adam learning
            rate) and ``config['batch_size']``.
    """
    import tensorflow as tf

    # os.add_dll_directory exists only on Windows; guard it so the trainable
    # can also run on other platforms.
    if hasattr(os, "add_dll_directory"):
        os.add_dll_directory(r'e:\cuda\bin')

    train_X, train_y = get_circle_X_y()
    from ray.tune.integration.keras import TuneReportCallback

    batch_size = config['batch_size']

    # Create FCN model
    model = get_p_to_c_model(config)

    # Compile model with losses and metrics.  `learning_rate` is the
    # supported keyword; `lr` is a deprecated alias.
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=config['lr']), loss='MAE')

    # NOTE(review): validation_split evaluates to 0.75, so only a quarter of
    # the samples are used for fitting - confirm this is intended.
    model.fit(train_X, train_y, epochs=100, shuffle=False,
              verbose=1,
              validation_split=(1 / 8) * 6,
              batch_size=batch_size,
              callbacks=[TuneReportCallback({"mean_loss": "val_loss"})])


# %%
if __name__ == "__main__":
    from ray.tune.schedulers import AsyncHyperBandScheduler

    # Validation loss at which a trial is considered good enough to stop.
    TARGET_LOSS = .04

    def reached_target_loss(trial_id, result):
        """Return True once a trial's reported ``mean_loss`` is at or below the target.

        A dict stop criterion such as ``{"mean_loss": .04}`` triggers when
        the metric is greater than or equal to the threshold, which is the
        wrong direction for a loss that is being minimised, so a callable
        is used instead.
        """
        loss = result.get("mean_loss")
        return loss is not None and loss <= TARGET_LOSS

    search_space = {
        "lr": tune.choice([0.00001, 0.0001, 0.001, 0.01, 0.1]),
        "batch_size": tune.choice([4, 8, 16, 32, 64, 128]),
    }

    ray.init(configure_logging=False, num_cpus=12, num_gpus=1)

    sched = AsyncHyperBandScheduler(time_attr="training_iteration", max_t=400, grace_period=20)

    # Each trial requests the single GPU given to ray.init, so trials run
    # one at a time within the 120 s time budget.
    analysis = tune.run(train, scheduler=sched, config=search_space, time_budget_s=120, num_samples=25,
                        metric="mean_loss", mode="min",
                        resources_per_trial={"cpu": 1, "gpu": 1},
                        stop=reached_target_loss)

    print("Best hyperparameters found were: ", analysis.best_config)

E:\lambda\labs\ds-test-2\venv2\Scripts\python.exe E:/lambda/labs/ds-test-2/structured_experiments/scratch1.py
Function checkpointing is disabled. This may result in unexpected behavior when using checkpointing features or certain schedulers. To enable, set the train function arguments to be `func(config, checkpoint_dir=None)`.
== Status ==
Current time: 2021-12-09 07:43:01 (running for 00:00:00.21)
Memory usage on this node: 20.7/32.0 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 320.000: None | Iter 80.000: None | Iter 20.000: None
Resources requested: 0/36 CPUs, 0/2 GPUs, 0.0/13.15 GiB heap, 0.0/6.57 GiB objects
Result logdir: C:\Users\Tasha\ray_results\train_2021-12-09_07-43-00
Number of trials: 25/25 (25 PENDING)
+-------------------+----------+-------+--------------+--------+
| Trial name        | status   | loc   |   batch_size |     lr |
|-------------------+----------+-------+--------------+--------|
| train_8b409_00000 | PENDING  |       |           32 | 0.0001 |
| train_8b409_00001 | PENDING  |       |           16 | 0.01   |
| train_8b409_00002 | PENDING  |       |            4 | 0.1    |
| train_8b409_00003 | PENDING  |       |           32 | 0.001  |
| train_8b409_00004 | PENDING  |       |            8 | 0.1    |
| train_8b409_00005 | PENDING  |       |          128 | 0.01   |
| train_8b409_00006 | PENDING  |       |           64 | 0.1    |
| train_8b409_00007 | PENDING  |       |            4 | 0.0001 |
| train_8b409_00008 | PENDING  |       |            8 | 1e-05  |
| train_8b409_00009 | PENDING  |       |           64 | 0.0001 |
| train_8b409_00010 | PENDING  |       |            8 | 1e-05  |
| train_8b409_00011 | PENDING  |       |            4 | 0.1    |
| train_8b409_00012 | PENDING  |       |            4 | 0.001  |
| train_8b409_00013 | PENDING  |       |          128 | 0.1    |
| train_8b409_00014 | PENDING  |       |           16 | 1e-05  |
| train_8b409_00015 | PENDING  |       |           16 | 0.01   |
| train_8b409_00016 | PENDING  |       |           16 | 0.1    |
| train_8b409_00017 | PENDING  |       |            8 | 0.001  |
| train_8b409_00018 | PENDING  |       |            8 | 0.1    |
| train_8b409_00019 | PENDING  |       |           64 | 0.1    |
+-------------------+----------+-------+--------------+--------+
... 5 more trials not shown (5 PENDING)


...
 
== Status ==
Current time: 2021-12-09 07:45:01 (running for 00:02:00.21)
Memory usage on this node: 20.7/32.0 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 320.000: None | Iter 80.000: None | Iter 20.000: None
Resources requested: 0/36 CPUs, 0/2 GPUs, 0.0/13.15 GiB heap, 0.0/6.57 GiB objects
Result logdir: C:\Users\Tasha\ray_results\train_2021-12-09_07-43-00
Number of trials: 25/25 (25 TERMINATED)
+-------------------+------------+-------+--------------+--------+
| Trial name        | status     | loc   |   batch_size |     lr |
|-------------------+------------+-------+--------------+--------|
| train_8b409_00000 | TERMINATED |       |           32 | 0.0001 |
| train_8b409_00001 | TERMINATED |       |           16 | 0.01   |
| train_8b409_00002 | TERMINATED |       |            4 | 0.1    |
| train_8b409_00003 | TERMINATED |       |           32 | 0.001  |
| train_8b409_00004 | TERMINATED |       |            8 | 0.1    |
| train_8b409_00005 | TERMINATED |       |          128 | 0.01   |
| train_8b409_00006 | TERMINATED |       |           64 | 0.1    |
| train_8b409_00007 | TERMINATED |       |            4 | 0.0001 |
| train_8b409_00008 | TERMINATED |       |            8 | 1e-05  |
| train_8b409_00009 | TERMINATED |       |           64 | 0.0001 |
| train_8b409_00010 | TERMINATED |       |            8 | 1e-05  |
| train_8b409_00011 | TERMINATED |       |            4 | 0.1    |
| train_8b409_00012 | TERMINATED |       |            4 | 0.001  |
| train_8b409_00013 | TERMINATED |       |          128 | 0.1    |
| train_8b409_00014 | TERMINATED |       |           16 | 1e-05  |
| train_8b409_00015 | TERMINATED |       |           16 | 0.01   |
| train_8b409_00016 | TERMINATED |       |           16 | 0.1    |
| train_8b409_00017 | TERMINATED |       |            8 | 0.001  |
| train_8b409_00018 | TERMINATED |       |            8 | 0.1    |
| train_8b409_00019 | TERMINATED |       |           64 | 0.1    |
| train_8b409_00020 | TERMINATED |       |            8 | 0.0001 |
| train_8b409_00021 | TERMINATED |       |          128 | 0.1    |
| train_8b409_00022 | TERMINATED |       |            4 | 0.01   |
| train_8b409_00023 | TERMINATED |       |            4 | 1e-05  |
| train_8b409_00024 | TERMINATED |       |          128 | 0.01   |
+-------------------+------------+-------+--------------+--------+


Best hyperparameters found were:  None
Could not find best trial. Did you pass the correct `metric` parameter?

Process finished with exit code 0

I’m not sure why it’s not executing the actual experiments.

Hi @Natasha_Upchurch, your code looks good - and when I run it on my machine (Mac, without the dll calls) it trains and returns results as expected.

My guess is that there are problems with the Windows setup. Does your code run when you just call train(), i.e. execute it without Ray Tune?

Thank you for taking the time. Once I reset my computer the code started working.