I keep hitting the error below, and I cannot find in the documentation how to programmatically increase the timeout period. The docs describe the per-experiment time budget via the TuneConfig parameter time_budget_s, but say nothing about the 60-second GCS connection timeout in the error, or what I can do about it. Any help appreciated.
The system is Ubuntu Linux 22.04, Python 3.10.8, Ray 2.2.0, and Optuna 3.1.0; the scheduler is ASHAScheduler.
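For what it is worth, the 60 seconds in the error appears to come from Ray's internal gcs_rpc_server_reconnect_timeout_s setting (the failed check below names it directly), not from time_budget_s. My best guess, assuming the RAY_-prefixed environment-variable convention for overriding internal RayConfig values applies here, is the sketch below, but I have not found this documented and do not know whether it is the intended mechanism:

import os

# Assumption: internal RayConfig values can be overridden with RAY_-prefixed
# environment variables, set before ray.init() (and presumably before the
# cluster/head node starts). The 300 s value is arbitrary.
os.environ["RAY_gcs_rpc_server_reconnect_timeout_s"] = "300"

import ray

ray.init()

Is this the right approach, or is there a proper API for it?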
Below are:
1: the Python method that builds the Ray tuner
2: the error message, which appears after 500-1600 successfully terminated trials

1: PYTHON METHOD
from optuna.samplers import TPESampler
from ray import air, tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler
from ray.tune.search.optuna import OptunaSearch


def _set_ray_tuner(self, grid_search=False):
    """Set Ray tuner."""
    # Collect the parameters from the self.search_space dictionary that should
    # be reported: only those with more than one candidate value listed.
    # "num_models" is handled separately below.
    parameters_to_report = []
    for key, value in self.search_space.items():
        if key == "num_models":
            continue
        if len(value) > 1:
            parameters_to_report.append(key)
    print(f"parameters_to_report: {parameters_to_report}")
    reporter = CLIReporter(
        metric_columns=[
            "time_total_s",
            "iteration",
            "train_loss",
            "val_loss",
            "mse",
            "ssim",
            "kid_mean",
            "kid_std",
        ],
        parameter_columns=parameters_to_report,
    )
    trainable = tune.with_resources(TrainableVAE, {"gpu": self.gpu_fraction})
    trainable_with_parameters = tune.with_parameters(
        trainable,
        data_dict={
            "train_data": self.train_data,
            "train_labels": self.train_labels,
            "val_data": self.val_data,
            "val_labels": self.val_labels,
        },
        device=self.device,
        methods={
            "_train_epoch": self._train_epoch,
            "_validate_epoch": self._validate_epoch,
            "_augment_and_get_dataloader": self._augment_and_get_dataloader,
        },
    )
    if grid_search:
        param_space = {
            "lr": tune.grid_search(self.search_space["lr"]),
            "latent_dim": tune.grid_search(self.search_space["latent_dim"]),
            "ksp": tune.grid_search(self.search_space["ksp"]),
            "channels": tune.grid_search(self.search_space["channels"]),
            "batch_size": tune.grid_search(self.search_space["batch_size"]),
            "conv_layers": tune.grid_search(self.search_space["conv_layers"]),
            "batch_norm": tune.grid_search(self.search_space["batch_norm"]),
            "rotation": tune.grid_search(self.search_space["rotation"]),
            "translation": tune.grid_search(self.search_space["translation"]),
            "noise": tune.grid_search(self.search_space["noise"]),
            "model_id": tune.grid_search(
                ["model_{}".format(i) for i in range(self.search_space["num_models"])]
            ),
        }
        # Efficient hyperparameter selection: search algorithms are wrappers
        # around open-source optimization libraries, and each library has its
        # own way of defining the search space.
        # https://docs.ray.io/en/latest/ray-air/package-ref.html#ray.tune.tune_config.TuneConfig
        tune_config = tune.TuneConfig(
            search_alg=tune.search.basic_variant.BasicVariantGenerator(
                constant_grid_search=True,
            ),
        )
    else:
        initial_params = [
            {
                "lr": 0.0003,
                "latent_dim": 2,
                "ksp": "k7s1",
                "channels": 16,
                "batch_size": 64,
                "conv_layers": 3,
                "batch_norm": False,
                "rotation": 0,
                "translation": 0,
                "noise": 0.02,
                "model_id": "model_0",
            }
        ]
        # tune.loguniform and tune.uniform take lower and upper bounds, so
        # pass the first and last items of each search-space list.
        param_space = {
            "lr": tune.loguniform(
                self.search_space["lr"][0], self.search_space["lr"][-1]
            ),
            "latent_dim": tune.choice(self.search_space["latent_dim"]),
            "ksp": tune.choice(self.search_space["ksp"]),
            "channels": tune.choice(self.search_space["channels"]),
            "batch_size": tune.choice(self.search_space["batch_size"]),
            "conv_layers": tune.choice(self.search_space["conv_layers"]),
            "batch_norm": tune.choice(self.search_space["batch_norm"]),
            "rotation": tune.uniform(
                self.search_space["rotation"][0], self.search_space["rotation"][-1]
            ),
            "translation": tune.uniform(
                self.search_space["translation"][0],
                self.search_space["translation"][-1],
            ),
            "noise": tune.uniform(
                self.search_space["noise"][0], self.search_space["noise"][-1]
            ),
            "model_id": tune.choice(
                ["model_{}".format(i) for i in range(self.search_space["num_models"])]
            ),
        }
        # Efficient hyperparameter selection: search algorithms are wrappers
        # around open-source optimization libraries, and each library has its
        # own way of defining the search space.
        # https://docs.ray.io/en/latest/ray-air/package-ref.html#ray.tune.tune_config.TuneConfig
        tune_config = tune.TuneConfig(
            # A local Optuna search generates the study name "optuna",
            # indicating in-memory storage.
            search_alg=OptunaSearch(
                sampler=TPESampler(),
                metric=self.multi_objective["metric"],
                mode=self.multi_objective["mode"],
                points_to_evaluate=initial_params,
            ),
            scheduler=ASHAScheduler(
                time_attr="training_iteration",
                metric=self.multi_objective["metric"][0],  # Only the 1st metric is used for pruning
                mode=self.multi_objective["mode"][0],
                max_t=self.epochs,
                grace_period=50,
                reduction_factor=2,
            ),
            time_budget_s=self.time_budget,
            num_samples=-1,  # Keep starting trials until the time budget runs out
        )
    # Runtime configuration specific to individual trials; overrides the run
    # config passed to the Trainer. For the API, see
    # https://docs.ray.io/en/latest/ray-air/package-ref.html#ray.air.config.RunConfig
    run_config = air.RunConfig(
        stop={"training_iteration": self.epochs},
        progress_reporter=reporter,
        local_dir=self.ray_dir,
        # callbacks=[MyCallback()],
        checkpoint_config=air.CheckpointConfig(
            checkpoint_score_attribute=self.multi_objective["metric"][0],
            checkpoint_score_order=self.multi_objective["mode"][0],
            num_to_keep=1,
            checkpoint_at_end=False,
            checkpoint_frequency=0,
        ),
        verbose=1,
    )
    tuner = tune.Tuner(
        trainable_with_parameters,
        param_space=param_space,
        run_config=run_config,
        tune_config=tune_config,
    )
    return tuner
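For context, the tuner is built and launched like this (trimmed; self refers to my own experiment class):

tuner = self._set_ray_tuner(grid_search=False)
results = tuner.fit()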
2: ERROR MESSAGE
[2023-03-29 13:21:11,915 C 2107118 2107754] gcs_rpc_client.h:537: Check failed: absl::ToInt64Seconds(absl::Now() - gcs_last_alive_time_) < ::RayConfig::instance().gcs_rpc_server_reconnect_timeout_s() Failed to connect to GCS within 60 seconds
*** StackTrace Information ***
/opt2/software/miniconda3/envs/ret_pt/lib/python3.10/site-packages/ray/_raylet.so(+0xce4b2a) [0x7fd2c41a8b2a] ray::operator<<()
/opt2/software/miniconda3/envs/ret_pt/lib/python3.10/site-packages/ray/_raylet.so(+0xce6612) [0x7fd2c41aa612] ray::SpdLogMessage::Flush()
/opt2/software/miniconda3/envs/ret_pt/lib/python3.10/site-packages/ray/_raylet.so(_ZN3ray6RayLogD1Ev+0x37) [0x7fd2c41aa927] ray::RayLog::~RayLog()
/opt2/software/miniconda3/envs/ret_pt/lib/python3.10/site-packages/ray/_raylet.so(+0x73b46d) [0x7fd2c3bff46d] ray::rpc::GcsRpcClient::CheckChannelStatus()
/opt2/software/miniconda3/envs/ret_pt/lib/python3.10/site-packages/ray/_raylet.so(_ZN5boost4asio6detail12wait_handlerIZN3ray3rpc12GcsRpcClient15SetupCheckTimerEvEUlNS_6system10error_codeEE_NS0_9execution12any_executorIJNS9_12context_as_tIRNS0_17execution_contextEEENS9_6detail8blocking7never_tILi0EEENS9_11prefer_onlyINSG_10possibly_tILi0EEEEENSJ_INSF_16outstanding_work9tracked_tILi0EEEEENSJ_INSN_11untracked_tILi0EEEEENSJ_INSF_12relationship6fork_tILi0EEEEENSJ_INSU_14continuation_tILi0EEEEEEEEE11do_completeEPvPNS1_19scheduler_operationERKS7_m+0x303) [0x7fd2c3bff913] boost::asio::detail::wait_handler<>::do_complete()
/opt2/software/miniconda3/envs/ret_pt/lib/python3.10/site-packages/ray/_raylet.so(+0xcf57bb) [0x7fd2c41b97bb] boost::asio::detail::scheduler::do_run_one()
/opt2/software/miniconda3/envs/ret_pt/lib/python3.10/site-packages/ray/_raylet.so(+0xcf69f1) [0x7fd2c41ba9f1] boost::asio::detail::scheduler::run()
/opt2/software/miniconda3/envs/ret_pt/lib/python3.10/site-packages/ray/_raylet.so(+0xcf6c60) [0x7fd2c41bac60] boost::asio::io_context::run()
/opt2/software/miniconda3/envs/ret_pt/lib/python3.10/site-packages/ray/_raylet.so(_ZN3ray4core10CoreWorker12RunIOServiceEv+0xcd) [0x7fd2c3ae0fed] ray::core::CoreWorker::RunIOService()
/opt2/software/miniconda3/envs/ret_pt/lib/python3.10/site-packages/ray/_raylet.so(+0xe2aa10) [0x7fd2c42eea10] execute_native_thread_routine
/lib/x86_64-linux-gnu/libc.so.6(+0x94b43) [0x7fd4b7267b43]
/lib/x86_64-linux-gnu/libc.so.6(+0x126a00) [0x7fd4b72f9a00]
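Besides the environment-variable guess above, the only other mechanism I have come across is the underscore-prefixed _system_config argument of ray.init. A sketch of what I mean; I do not know whether this particular key is honored there, or whether it has to be set on the head node before startup, which is exactly the part I cannot find documented:

import ray

# Assumption: _system_config is an internal, underscore-prefixed option; it is
# unclear to me whether gcs_rpc_server_reconnect_timeout_s is respected here
# or only at cluster start. The 300 s value is arbitrary.
ray.init(_system_config={"gcs_rpc_server_reconnect_timeout_s": 300})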