Question about Ray Cluster / Ray on-prem

Hi, I tried a simple Ray Tune script locally, on HPC, and on k8s, and it worked well. However, I am now getting a k8s-related error again and again in the HPC env with the same code. This is strange because I didn’t set up any cluster in the code; I just used ray.init()… It seems like Ray has some integration with k8s. How can I get rid of it and just run on-prem? Thanks for any advice.

Here’s my error:

File "/usr/local/lib/python3.8/dist-packages/ray/tune/integration/kubernetes.py", line 71, in __init__
    self.local_node = self._get_kubernetes_node_by_ip(self.local_ip)
  File "/usr/local/lib/python3.8/dist-packages/ray/tune/integration/kubernetes.py", line 86, in _get_kubernetes_node_by_ip
    kubernetes.config.load_incluster_config()
  File "/usr/local/lib/python3.8/dist-packages/kubernetes/config/incluster_config.py", line 118, in load_incluster_config
    InClusterConfigLoader(
  File "/usr/local/lib/python3.8/dist-packages/kubernetes/config/incluster_config.py", line 54, in load_and_set
    self._load_config()
  File "/usr/local/lib/python3.8/dist-packages/kubernetes/config/incluster_config.py", line 62, in _load_config
    raise ConfigException("Service host/port is not set.")

Here’s my code:

import tensorflow as tf

# Import standard library.
import os

# Import Ray library.
import ray
from ray import tune
from ray.tune.integration.keras import TuneReportCallback

import pickle
import numpy as np

# Set up global variables.
BATCH_SIZE = 32
EPOCHS = 2

# Change here.
OUTPUT_PATH = "~/ray_hpc_results"
EXPERIMENT_NAME = "test"

# Change here.
DATA_DIR = "~/Data/cifar10"

# Build model.
class TargetModel(tf.keras.Model):
    def __init__(self, dropout_rate):
        super(TargetModel, self).__init__()

        self.dense1 = tf.keras.layers.Dense(
            units=64,
            activation="relu",
            name="dense1",
        )

        self.dense2 = tf.keras.layers.Dense(
            units=64,
            activation="relu",
            name="dense2",
        )

        self.dropout = tf.keras.layers.Dropout(
            rate=dropout_rate,
            name="dropout",
        )

        self.flatten = tf.keras.layers.Flatten()

        self.out = tf.keras.layers.Dense(
            units=10,
            activation="softmax",
            name="predictions",
        )

    def call(self, inputs):
        dense1_out = self.dense1(inputs)
        dense2_out = self.dense2(dense1_out)
        dropout_out = self.dropout(dense2_out)
        flatten_out = self.flatten(dropout_out)

        return self.out(flatten_out)

# Build training function.
def train_cifar10(config):       
    # Load CIFAR10 data.
    train_data, test_data = load_cifar_10_data(DATA_DIR, BATCH_SIZE)

    # Get model.
    model = TargetModel(config["dropout_rate"])

    # Compile the model.
    model.compile(
        loss="sparse_categorical_crossentropy",
        optimizer=tf.keras.optimizers.Adam(learning_rate=config["lr"]),
        metrics=["accuracy"]
    )

    # Train the model.
    model.fit(
        train_data,
        batch_size=BATCH_SIZE,
        epochs=EPOCHS,
        verbose=0,
        validation_data=test_data,
        callbacks=[
            TuneReportCallback(
                {
                    "mean_accuracy": "accuracy"
                }
            )
        ]
    )

def unpickle(file):
    """Load CIFAR10 data from files.
    :param file: The CIFAR10 data file path.
    :return data: A dictionary containing the CIFAR10 data and labels.
    """

    with open(file, 'rb') as fo:
        data = pickle.load(fo, encoding='bytes')
    return data

def load_cifar_10_data(data_dir, batch_size):
    """Load CIFAR10 data and prepare it for model training.
    :param data_dir: The name of MinIO project folder whee data is stored in the minIO
        bucket.
    :param batch_size: The number of inputs for each batch.
    """
    # get the meta_data_dict
    # num_cases_per_batch: 1000
    # label_names: ['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']
    # num_vis: 3072
    meta_data_dict = unpickle(os.path.join(data_dir, "batches.meta"))
    cifar_label_names = meta_data_dict[b'label_names']
    cifar_label_names = np.array(cifar_label_names)

    # training data
    x_train = None
    y_train = []

    # cifar_train_data_dict
    # 'batch_label': 'training batch 5 of 5'
    # 'data': ndarray
    # 'filenames': list
    # 'labels': list
    # Load train data from files.
    for i in range(1, 6):
        cifar_train_data_dict = unpickle(data_dir + "/data_batch_{}".format(i))
        if i == 1:
            x_train = cifar_train_data_dict[b'data']
        else:
            x_train = np.vstack((x_train, cifar_train_data_dict[b'data']))
        y_train += cifar_train_data_dict[b'labels']

    # Reshape data.
    x_train = x_train.reshape((len(x_train), 32, 32, 3)).astype('float32') / 255
    y_train = np.array(y_train)

    # Prepare the training dataset.
    train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
    train_dataset = train_dataset.shuffle(buffer_size=10000).batch(batch_size)

    # test data
    # cifar_test_data_dict
    # 'batch_label': 'testing batch 1 of 1'
    # 'data': ndarray
    # 'filenames': list
    # 'labels': list
    # Load test data from files.
    cifar_test_data_dict = unpickle(data_dir + "/test_batch")
    x_test = cifar_test_data_dict[b'data']
    y_test = cifar_test_data_dict[b'labels']
    # Reshape data.
    x_test = x_test.reshape((len(x_test), 32, 32, 3)).astype('float32') / 255
    y_test = np.array(y_test)

    # Prepare the validation dataset.
    test_dataset = tf.data.Dataset.from_tensor_slices((x_test, y_test))
    test_dataset = test_dataset.batch(batch_size)

    return train_dataset, test_dataset

if __name__ == "__main__":
    # Set up scheduler. 
    config = {
        "time_attr":"training_iteration",
        "max_t":10,
        "grace_period":5
    }

    scheduler = tune.create_scheduler("async_hyperband", **config)

    # Set up parameters config.
    param_config = {
        "dropout_rate": tune.uniform(0.0, 1.0),
        "lr": tune.uniform(0.00001, 0.01),
    }

    # Init ray.
    ray.init(num_cpus=1, num_gpus=1, local_mode=True)
    analysis = tune.run(
        train_cifar10,
        local_dir=OUTPUT_PATH,
        name=EXPERIMENT_NAME,
        scheduler=scheduler,
        metric="mean_accuracy",
        mode="max",
        stop={
            "mean_accuracy": 0.99,
            "training_iteration": 10
        },
        num_samples=2,
        config=param_config,
    )
    print("Best hyperparameters found were: ", analysis.best_config)

cc @Dmitri

Could you clarify how you are running this code? Did you SSH into a container? Did you use the Ray Operator to create a cluster? It seems like you didn’t, but it’s not clear to me whether you want the workload to be distributed or whether you just want to run it in a container on k8s.

Thanks for pointing out these issues.

Looks like Tune incorrectly assumed that the code was running inside a k8s cluster and then failed to load a k8s config.

Cc @rliaw

As @bill-anyscale mentioned, we could use more context on how the code was executed.
If there’s a more complete stack trace for the error, that could help too.
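
As background, the ConfigException at the bottom of the traceback comes from the Kubernetes Python client’s in-cluster config loader, which reads the service host/port from environment variables that Kubernetes injects into every pod. A minimal check along those lines (an illustrative sketch, not Ray’s exact detection logic):

import os

# load_incluster_config() raises "Service host/port is not set." when these
# variables are absent, which is expected outside a pod (e.g. in an HPC batch job).
in_cluster = (
    "KUBERNETES_SERVICE_HOST" in os.environ
    and "KUBERNETES_SERVICE_PORT" in os.environ
)
print("Kubernetes in-cluster environment detected:", in_cluster)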

Hi Bill and Dmitri,

@Dmitri @bill-anyscale
Let me clarify my setup. I am simply benchmarking Ray; most of our use cases involve Ray Tune. For the benchmark, I want to run a dummy model in three different environments: local, our HPC batch environment, and k8s. Everything worked well on the first try, and I saw correct results from all three environments. For k8s, I used the Ray Cluster Launcher to create a cluster. However, when I tried to run the code in the HPC batch environment again, I got the error. So I suspect something is happening behind the scenes in Ray; maybe Ray doesn’t have the correct ClusterRoleBinding or something similar. I say this because the issue started occurring after I created the k8s Ray cluster. Also, one of my colleagues is facing the same issue.

Back to my code: I am not using a cluster. I ran it in the HPC batch environment with my custom image, in the normal way, not as distributed training. I also tested a distributed training job and hit the same issue.

I’ve posted the full error below; I hope it’s helpful. Thanks for helping with this.

2021-06-11 15:47:57.802258: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
2021-06-11 15:48:02,119	INFO services.py:1267 -- View the Ray dashboard at http://127.0.0.1:8265
2021-06-11 15:48:02,123	WARNING services.py:1716 -- WARNING: The object store is using /tmp instead of /dev/shm because /dev/shm has only 67108864 bytes available. This will harm performance! You may be able to free up space by deleting files in /dev/shm. If you are inside a Docker container, you can increase /dev/shm size by passing '--shm-size=10.24gb' to 'docker run' (or add it to the run_options list in a Ray cluster config). Make sure to set this to more than 30% of available RAM.
2021-06-11 15:48:03,555	WARNING function_runner.py:544 -- Function checkpointing is disabled. This may result in unexpected behavior when using checkpointing features or certain schedulers. To enable, set the train function arguments to be `func(config, checkpoint_dir=None)`.
2021-06-11 15:48:04,307	WARNING tune.py:494 -- Tune detects GPUs, but no trials are using GPUs. To enable trials to use GPUs, set tune.run(resources_per_trial={'gpu': 1}...) which allows Tune to expose 1 GPU to each trial. You can also override `Trainable.default_resource_request` if using the Trainable API.
2021-06-11 15:48:05.643545: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-06-11 15:48:05.645264: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2021-06-11 15:48:05.700922: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1746] Found device 0 with properties: 
pciBusID: 0000:b3:00.0 name: Tesla V100-SXM2-32GB computeCapability: 7.0
coreClock: 1.53GHz coreCount: 80 deviceMemorySize: 31.75GiB deviceMemoryBandwidth: 836.37GiB/s
2021-06-11 15:48:05.700967: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
2021-06-11 15:48:05.710983: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublas.so.11
2021-06-11 15:48:05.711073: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublasLt.so.11
2021-06-11 15:48:05.712466: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcufft.so.10
2021-06-11 15:48:05.715400: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcurand.so.10
2021-06-11 15:48:05.726683: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcusolver.so.11
2021-06-11 15:48:05.729156: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcusparse.so.11
2021-06-11 15:48:05.729381: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudnn.so.8
2021-06-11 15:48:05.732575: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1888] Adding visible gpu devices: 0
2021-06-11 15:48:05.735253: I tensorflow/compiler/jit/xla_gpu_device.cc:99] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-06-11 15:48:05.736912: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1746] Found device 0 with properties: 
pciBusID: 0000:b3:00.0 name: Tesla V100-SXM2-32GB computeCapability: 7.0
coreClock: 1.53GHz coreCount: 80 deviceMemorySize: 31.75GiB deviceMemoryBandwidth: 836.37GiB/s
2021-06-11 15:48:05.736940: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
2021-06-11 15:48:05.736965: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublas.so.11
2021-06-11 15:48:05.736976: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublasLt.so.11
2021-06-11 15:48:05.736988: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcufft.so.10
2021-06-11 15:48:05.737002: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcurand.so.10
2021-06-11 15:48:05.737015: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcusolver.so.11
2021-06-11 15:48:05.737028: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcusparse.so.11
2021-06-11 15:48:05.737040: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudnn.so.8
2021-06-11 15:48:05.740108: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1888] Adding visible gpu devices: 0
2021-06-11 15:48:05.740139: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
2021-06-11 15:48:06.583424: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1287] Device interconnect StreamExecutor with strength 1 edge matrix:
2021-06-11 15:48:06.583479: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1293]      0 
2021-06-11 15:48:06.583490: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1306] 0:   N 
2021-06-11 15:48:06.588326: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1432] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 31019 MB memory) -> physical GPU (device: 0, name: Tesla V100-SXM2-32GB, pci bus id: 0000:b3:00.0, compute capability: 7.0)
2021-06-11 15:48:08.824852: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
2021-06-11 15:48:08.828834: I tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 2600000000 Hz
2021-06-11 15:48:08.911467: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublas.so.11
2021-06-11 15:48:09.428751: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublasLt.so.11
Traceback (most recent call last):
  File "./demo.py", line 195, in <module>
    analysis = tune.run(
  File "/usr/local/lib/python3.8/dist-packages/ray/tune/tune.py", line 520, in run
    runner.step()
  File "/usr/local/lib/python3.8/dist-packages/ray/tune/trial_runner.py", line 509, in step
    if _start_trial(next_trial):
  File "/usr/local/lib/python3.8/dist-packages/ray/tune/trial_runner.py", line 500, in _start_trial
    self._callbacks.on_trial_start(
  File "/usr/local/lib/python3.8/dist-packages/ray/tune/callback.py", line 192, in on_trial_start
    callback.on_trial_start(**info)
  File "/usr/local/lib/python3.8/dist-packages/ray/tune/syncer.py", line 434, in on_trial_start
    self._get_trial_syncer(trial)
  File "/usr/local/lib/python3.8/dist-packages/ray/tune/syncer.py", line 379, in _get_trial_syncer
    self._syncers[trial] = self._create_trial_syncer(trial)
  File "/usr/local/lib/python3.8/dist-packages/ray/tune/syncer.py", line 383, in _create_trial_syncer
    return get_node_syncer(
  File "/usr/local/lib/python3.8/dist-packages/ray/tune/syncer.py", line 354, in get_node_syncer
    _syncers[key] = sync_function(local_dir, remote_dir, None)
  File "/usr/local/lib/python3.8/dist-packages/ray/tune/integration/kubernetes.py", line 71, in __init__
    self.local_node = self._get_kubernetes_node_by_ip(self.local_ip)
  File "/usr/local/lib/python3.8/dist-packages/ray/tune/integration/kubernetes.py", line 86, in _get_kubernetes_node_by_ip
    kubernetes.config.load_incluster_config()
  File "/usr/local/lib/python3.8/dist-packages/kubernetes/config/incluster_config.py", line 118, in load_incluster_config
    InClusterConfigLoader(
  File "/usr/local/lib/python3.8/dist-packages/kubernetes/config/incluster_config.py", line 54, in load_and_set
    self._load_config()
  File "/usr/local/lib/python3.8/dist-packages/kubernetes/config/incluster_config.py", line 62, in _load_config
    raise ConfigException("Service host/port is not set.")
kubernetes.config.config_exception.ConfigException: Service host/port is not set.

And here is the correct output from my first try:

2021-06-04 17:34:27.089827: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
2021-06-04 17:34:38,461	INFO services.py:1267 -- View the Ray dashboard at http://127.0.0.1:8265
2021-06-04 17:34:38,462	WARNING services.py:1716 -- WARNING: The object store is using /tmp instead of /dev/shm because /dev/shm has only 67108864 bytes available. This will harm performance! You may be able to free up space by deleting files in /dev/shm. If you are inside a Docker container, you can increase /dev/shm size by passing '--shm-size=10.24gb' to 'docker run' (or add it to the run_options list in a Ray cluster config). Make sure to set this to more than 30% of available RAM.
2021-06-04 17:34:39,808	WARNING function_runner.py:544 -- Function checkpointing is disabled. This may result in unexpected behavior when using checkpointing features or certain schedulers. To enable, set the train function arguments to be `func(config, checkpoint_dir=None)`.
(pid=384) 2021-06-04 17:34:40.754853: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
(pid=383) 2021-06-04 17:34:40.754547: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
(pid=384) 2021-06-04 17:34:43.154259: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
(pid=384) 2021-06-04 17:34:43.155650: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
(pid=384) 2021-06-04 17:34:43.167126: E tensorflow/stream_executor/cuda/cuda_driver.cc:328] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
(pid=384) 2021-06-04 17:34:43.167170: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:163] no NVIDIA GPU device is present: /dev/nvidia0 does not exist
(pid=384) 2021-06-04 17:34:43.170551: I tensorflow/compiler/jit/xla_gpu_device.cc:99] Not creating XLA devices, tf_xla_enable_xla_devices not set
(pid=383) 2021-06-04 17:34:43.156428: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
(pid=383) 2021-06-04 17:34:43.157328: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
(pid=383) 2021-06-04 17:34:43.168038: E tensorflow/stream_executor/cuda/cuda_driver.cc:328] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
(pid=383) 2021-06-04 17:34:43.168075: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:163] no NVIDIA GPU device is present: /dev/nvidia0 does not exist
(pid=383) 2021-06-04 17:34:43.171136: I tensorflow/compiler/jit/xla_gpu_device.cc:99] Not creating XLA devices, tf_xla_enable_xla_devices not set
(pid=384) WARNING:tensorflow:AutoGraph could not transform <bound method TargetModel.call of <__main__.TargetModel object at 0x7f0893204ac0>> and will run it as-is.
(pid=384) Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
(pid=384) Cause: Unknown node type <gast.gast.Import object at 0x7f08983f8790>
(pid=384) To silence this warning, decorate the function with @tf.autograph.experimental.do_not_convert
(pid=383) WARNING:tensorflow:AutoGraph could not transform <bound method TargetModel.call of <__main__.TargetModel object at 0x7f0bf4e64df0>> and will run it as-is.
(pid=383) Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
(pid=383) Cause: Unknown node type <gast.gast.Import object at 0x7f0bfa04fd30>
(pid=383) To silence this warning, decorate the function with @tf.autograph.experimental.do_not_convert
(pid=384) 2021-06-04 17:34:44.764081: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
(pid=384) 2021-06-04 17:34:44.765873: I tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 2800000000 Hz
(pid=383) 2021-06-04 17:34:44.769646: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
(pid=383) 2021-06-04 17:34:44.771455: I tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 2800000000 Hz
2021-06-04 17:35:19,296	INFO tune.py:549 -- Total run time: 39.49 seconds (38.91 seconds for the tuning loop).

== Status ==
Memory usage on this node: 40.4/755.4 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 5.000: None
Resources requested: 1.0/2 CPUs, 0/1 GPUs, 0.0/48.03 GiB heap, 0.0/9.31 GiB objects (0.0/1.0 accelerator_type:V100)
Result logdir: /s/hjalnko/test1/test
Number of trials: 2/2 (1 PENDING, 1 RUNNING)
+---------------------------+----------+-------+----------------+------------+
| Trial name                | status   | loc   |   dropout_rate |         lr |
|---------------------------+----------+-------+----------------+------------|
| train_cifar10_23cc0_00000 | RUNNING  |       |       0.362538 | 0.00609616 |
| train_cifar10_23cc0_00001 | PENDING  |       |       0.277606 | 0.00791673 |
+---------------------------+----------+-------+----------------+------------+
Result for train_cifar10_23cc0_00001:
  date: 2021-06-04_17-35-01
  done: false
  experiment_id: 4feb6facd2ef4201b657628be6dedb26
  hostname: 55ae3abd60a1
  iterations_since_restore: 1
  mean_accuracy: 0.3625600039958954
  node_ip: 172.17.0.7
  pid: 384
  time_since_restore: 20.0603506565094
  time_this_iter_s: 20.0603506565094
  time_total_s: 20.0603506565094
  timestamp: 1622828101
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 23cc0_00001
== Status ==
Memory usage on this node: 43.9/755.4 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 5.000: None
Resources requested: 2.0/2 CPUs, 0/1 GPUs, 0.0/48.03 GiB heap, 0.0/9.31 GiB objects (0.0/1.0 accelerator_type:V100)
Current best trial: 23cc0_00001 with mean_accuracy=0.3625600039958954 and parameters={'dropout_rate': 0.2776055652252952, 'lr': 0.007916730475299056}
Result logdir: /s/hjalnko/test1/test
Number of trials: 2/2 (2 RUNNING)
+---------------------------+----------+----------------+----------------+------------+---------+--------+------------------+
| Trial name                | status   | loc            |   dropout_rate |         lr |     acc |   iter |   total time (s) |
|---------------------------+----------+----------------+----------------+------------+---------+--------+------------------|
| train_cifar10_23cc0_00000 | RUNNING  |                |       0.362538 | 0.00609616 |         |        |                  |
| train_cifar10_23cc0_00001 | RUNNING  | 172.17.0.7:384 |       0.277606 | 0.00791673 | 0.36256 |      1 |          20.0604 |
+---------------------------+----------+----------------+----------------+------------+---------+--------+------------------+
Result for train_cifar10_23cc0_00000:
  date: 2021-06-04_17-35-02
  done: false
  experiment_id: fe6a82913d5b48ecbf9de505a2120669
  hostname: 55ae3abd60a1
  iterations_since_restore: 1
  mean_accuracy: 0.4079200029373169
  node_ip: 172.17.0.7
  pid: 383
  time_since_restore: 20.157799005508423
  time_this_iter_s: 20.157799005508423
  time_total_s: 20.157799005508423
  timestamp: 1622828102
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 23cc0_00000
Result for train_cifar10_23cc0_00001:
  date: 2021-06-04_17-35-18
  done: false
  experiment_id: 4feb6facd2ef4201b657628be6dedb26
  hostname: 55ae3abd60a1
  iterations_since_restore: 2
  mean_accuracy: 0.44005998969078064
  node_ip: 172.17.0.7
  pid: 384
  time_since_restore: 36.387415170669556
  time_this_iter_s: 16.327064514160156
  time_total_s: 36.387415170669556
  timestamp: 1622828118
  timesteps_since_restore: 0
  training_iteration: 2
  trial_id: 23cc0_00001
== Status ==
Memory usage on this node: 43.2/755.4 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 5.000: None
Resources requested: 2.0/2 CPUs, 0/1 GPUs, 0.0/48.03 GiB heap, 0.0/9.31 GiB objects (0.0/1.0 accelerator_type:V100)
Current best trial: 23cc0_00001 with mean_accuracy=0.44005998969078064 and parameters={'dropout_rate': 0.2776055652252952, 'lr': 0.007916730475299056}
Result logdir: /s/hjalnko/test1/test
Number of trials: 2/2 (2 RUNNING)
+---------------------------+----------+----------------+----------------+------------+---------+--------+------------------+
| Trial name                | status   | loc            |   dropout_rate |         lr |     acc |   iter |   total time (s) |
|---------------------------+----------+----------------+----------------+------------+---------+--------+------------------|
| train_cifar10_23cc0_00000 | RUNNING  | 172.17.0.7:383 |       0.362538 | 0.00609616 | 0.40792 |      1 |          20.1578 |
| train_cifar10_23cc0_00001 | RUNNING  | 172.17.0.7:384 |       0.277606 | 0.00791673 | 0.44006 |      2 |          36.3874 |
+---------------------------+----------+----------------+----------------+------------+---------+--------+------------------+
Result for train_cifar10_23cc0_00001:
  date: 2021-06-04_17-35-18
  done: true
  experiment_id: 4feb6facd2ef4201b657628be6dedb26
  experiment_tag: 1_dropout_rate=0.27761,lr=0.0079167
  hostname: 55ae3abd60a1
  iterations_since_restore: 2
  mean_accuracy: 0.44005998969078064
  node_ip: 172.17.0.7
  pid: 384
  time_since_restore: 36.387415170669556
  time_this_iter_s: 16.327064514160156
  time_total_s: 36.387415170669556
  timestamp: 1622828118
  timesteps_since_restore: 0
  training_iteration: 2
  trial_id: 23cc0_00001
Result for train_cifar10_23cc0_00000:
  date: 2021-06-04_17-35-19
  done: false
  experiment_id: fe6a82913d5b48ecbf9de505a2120669
  hostname: 55ae3abd60a1
  iterations_since_restore: 2
  mean_accuracy: 0.4736599922180176
  node_ip: 172.17.0.7
  pid: 383
  time_since_restore: 37.23727893829346
  time_this_iter_s: 17.079479932785034
  time_total_s: 37.23727893829346
  timestamp: 1622828119
  timesteps_since_restore: 0
  training_iteration: 2
  trial_id: 23cc0_00000
Result for train_cifar10_23cc0_00000:
  date: 2021-06-04_17-35-19
  done: true
  experiment_id: fe6a82913d5b48ecbf9de505a2120669
  experiment_tag: 0_dropout_rate=0.36254,lr=0.0060962
  hostname: 55ae3abd60a1
  iterations_since_restore: 2
  mean_accuracy: 0.4736599922180176
  node_ip: 172.17.0.7
  pid: 383
  time_since_restore: 37.23727893829346
  time_this_iter_s: 17.079479932785034
  time_total_s: 37.23727893829346
  timestamp: 1622828119
  timesteps_since_restore: 0
  training_iteration: 2
  trial_id: 23cc0_00000
== Status ==
Memory usage on this node: 41.4/755.4 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 5.000: None
Resources requested: 0/2 CPUs, 0/1 GPUs, 0.0/48.03 GiB heap, 0.0/9.31 GiB objects (0.0/1.0 accelerator_type:V100)
Current best trial: 23cc0_00000 with mean_accuracy=0.4736599922180176 and parameters={'dropout_rate': 0.36253802326481943, 'lr': 0.00609616464974135}
Result logdir: /s/hjalnko/test1/test
Number of trials: 2/2 (2 TERMINATED)
+---------------------------+------------+-------+----------------+------------+---------+--------+------------------+
| Trial name                | status     | loc   |   dropout_rate |         lr |     acc |   iter |   total time (s) |
|---------------------------+------------+-------+----------------+------------+---------+--------+------------------|
| train_cifar10_23cc0_00000 | TERMINATED |       |       0.362538 | 0.00609616 | 0.47366 |      2 |          37.2373 |
| train_cifar10_23cc0_00001 | TERMINATED |       |       0.277606 | 0.00791673 | 0.44006 |      2 |          36.3874 |
+---------------------------+------------+-------+----------------+------------+---------+--------+------------------+
Best hyperparameters found were:  {'dropout_rate': 0.36253802326481943, 'lr': 0.00609616464974135}

Hi @Dmitri @bill-anyscale

I think I’ve figured out why I got the tune.integration.kubernetes error. My HPC env and k8s share the same drive: I mounted the same drive, with the same code folder, onto the Ray cluster head pod. So when I went back to run the same code in the HPC batch env, Ray apparently detected something and assumed the program was running in a k8s environment. I deleted all my folders on that drive and tried again, and the program worked well.

Are there any hidden files created while a Ray program runs on k8s? Is there a way to automatically clean those files up after the job finishes? Thank you.

Hmm, it might be an autoscaler artifact (like /home/ubuntu/ray_bootstrap_config.yaml). Try clearing those folders.

Also, before running the HPC experiment, I’d recommend stopping and restarting Ray with something like ray stop; ray start --num-cpus=<…>.
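
A rough sketch of that cleanup (the paths below are placeholders; a leftover ray_bootstrap_config.yaml could live wherever the cluster launcher wrote it or wherever the shared drive is mounted):

from pathlib import Path

# Placeholder locations for leftover cluster-launcher artifacts; adjust them to
# wherever the shared drive is mounted in the HPC environment.
candidates = [
    Path("~/ray_bootstrap_config.yaml").expanduser(),
    Path("/shared/ray_project/ray_bootstrap_config.yaml"),  # placeholder path
]

for path in candidates:
    if path.exists():
        print(f"Removing leftover autoscaler artifact: {path}")
        path.unlink()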