How severely does this issue affect your experience of using Ray?
- High: It blocks me from completing my task.
Hello,
I ran the code at the bottom of this issue, but the following error occurred:
2022-04-21 14:06:21,043 ERROR trial_runner.py:1372 -- Trial tune_model_95909902: Error stopping trial.
Traceback (most recent call last):
  File "~/python3.6/site-packages/ray/tune/trial_runner.py", line 1354, in stop_trial
    self._scheduler_alg.on_trial_remove(self, trial)
  File "~/python3.6/site-packages/ray/tune/schedulers/hyperband.py", line 286, in on_trial_remove
    self._process_bracket(trial_runner, bracket)
  File "~/python3.6/site-packages/ray/tune/schedulers/hyperband.py", line 262, in _process_bracket
    f"Trial with unexpected bad status " f"encountered: {t.status}"
ray.tune.error.TuneError: Trial with unexpected bad status encountered: TERMINATED
2022-04-21 14:06:21,046 WARNING util.py:171 -- The `choose_trial_to_run` operation took 0.818 s, which may be a performance bottleneck.
Traceback (most recent call last):
  File "~/bohb_application.py", line 403, in <module>
  File "~/bohb_application.py", line 352, in main_041922
    local_dir="~/log/bohb"
  File "~/python3.6/site-packages/ray/tune/tune.py", line 672, in run
    runner.step()
  File "~/python3.6/site-packages/ray/tune/trial_runner.py", line 765, in step
    next_trial = self._update_trial_queue_and_get_next_trial()
  File "~/python3.6/site-packages/ray/tune/trial_runner.py", line 698, in _update_trial_queue_and_get_next_trial
    next_trial = self._get_next_trial()  # blocking
  File "~/python3.6/site-packages/ray/tune/trial_runner.py", line 955, in _get_next_trial
    trial = self._scheduler_alg.choose_trial_to_run(self)
  File "~/python3.6/site-packages/ray/tune/schedulers/hb_bohb.py", line 132, in choose_trial_to_run
    self._process_bracket(trial_runner, bracket)
  File "~/python3.6/site-packages/ray/tune/schedulers/hyperband.py", line 262, in _process_bracket
    f"Trial with unexpected bad status " f"encountered: {t.status}"
ray.tune.error.TuneError: Trial with unexpected bad status encountered: TERMINATED
The error appeared after the status message below and the 'Result for tune_model' output had already been displayed:
== Status ==
Current time: 2022-04-21 14:06:16 (running for 02:27:46.22)
Memory usage on this node: 21.1/187.3 GiB
Using HyperBand: num_stopped=0 total_brackets=1
Round #0: Bracket(Max Size (n)=729, Milestone (r)=1, completed=100.0%): {PAUSED: 99, RUNNING: 1}
Resources requested: 12.0/24 CPUs, 2.0/2 GPUs, 0.0/119.78 GiB heap, 0.0/55.33 GiB objects (0.0/1.0 accelerator_type:V100)
Result logdir: ~/log/bohb/tune_model_2022-04-21_11-38-30
Number of trials: 100/100 (99 PAUSED, 1 RUNNING)
I don't know what I'm doing wrong or why successive halving isn't working. Any help would be really appreciated. Here is the code I ran:
class TuneReporterCallback(tf.keras.callbacks.Callback):
    """Tune Callback for Keras.

    The callback is invoked every epoch.
    """

    def __init__(self, logs=None):
        if logs is None:
            logs = {}
        self.iteration = 0
        super(TuneReporterCallback, self).__init__()

    def on_epoch_end(self, batch, logs=None):
        if logs is None:
            logs = {}
        self.iteration += 1
        tune.report(keras_info=logs, mean_accuracy=logs.get("val_my_r2_score_scipy"),
                    mean_loss=logs.get("loss"), val_loss=logs.get("val_loss"))
# end class
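# --- Side note, not part of my original run: Ray also ships a built-in Keras
# --- reporting callback that could replace the class above. This is an untested
# --- sketch; I'm assuming ray.tune.integration.keras.TuneReportCallback is
# --- available in this Ray version, and that the dict maps the Tune metric
# --- names reported above to the corresponding Keras log keys.
from ray.tune.integration.keras import TuneReportCallback

builtin_report_cb = TuneReportCallback({
    "mean_accuracy": "val_my_r2_score_scipy",
    "mean_loss": "loss",
    "val_loss": "val_loss",
})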
def tune_model(config, checkpoint_dir=None):
    global BATCH_SIZE
    global LR
    global L2_RATE
    global DROPRATE
    global MOMENTUM
    global ACT
    global BP_KER_1
    global BP_KER_2
    global BP_RES_NUM
    global UTR_KER_1
    global UTR_KER_2
    global UTR_RES_NUM
    global CONCAT_KER_1
    global CONCAT_KER_2
    global CONCAT_RES_NUM
    global MIR_KER_1
    global MIR_KER_2
    global MIR_RES_NUM
    global FINAL_NUM
    global FINAL_CHANNEL
    BATCH_SIZE = config['BATCH_SIZE']
    LR = config['LR']
    L2_RATE = config['L2_RATE']
    DROPRATE = config['DROPRATE']
    MOMENTUM = config['MOMENTUM']
    ACT = config['ACT']
    BP_KER_1 = config['BP_KER_1']
    BP_KER_2 = config['BP_KER_2']
    BP_RES_NUM = config['BP_RES_NUM']
    UTR_KER_1 = config['UTR_KER_1']
    UTR_KER_2 = config['UTR_KER_2']
    UTR_RES_NUM = config['UTR_RES_NUM']
    CONCAT_KER_1 = config['CONCAT_KER_1']
    CONCAT_KER_2 = config['CONCAT_KER_2']
    CONCAT_RES_NUM = config['CONCAT_RES_NUM']
    MIR_KER_1 = config['MIR_KER_1']
    MIR_KER_2 = config['MIR_KER_2']
    MIR_RES_NUM = config['MIR_RES_NUM']
    FINAL_NUM = config['FINAL_NUM']
    FINAL_CHANNEL = config['FINAL_CHANNEL']
    df_train = pd.read_csv(DATALISTDIR.format('train'), sep='\t', header=0)
    df_train = df_train.sample(frac=1)
    df_valid = pd.read_csv(DATALISTDIR.format('valid'), sep='\t', header=0)
    df_valid = df_valid.sample(frac=1)
    df_test = pd.read_csv(DATALISTDIR.format('test'), sep='\t', header=0)
    train_id = df_train['datID'].tolist()
    valid_id = df_valid['datID'].tolist()
    test_id = df_test['datID'].tolist()
    train_labels = df_train['lfc'].tolist()
    valid_labels = df_valid['lfc'].tolist()
    test_labels = df_test['lfc'].tolist()
    batched_train_dataset = gen_dataset(train_id, batch_size=BATCH_SIZE, shuffle=False)
    batched_valid_dataset = gen_dataset(valid_id, batch_size=BATCH_SIZE, shuffle=False)
    batched_test_dataset = gen_dataset(test_id, batch_size=BATCH_SIZE, shuffle=False)
    steps_per_epoch = len(df_train) / BATCH_SIZE
    valid_steps = len(df_valid) / BATCH_SIZE
    now = time.strftime('%m%d%y_%H%M%S')
    strategy = tf.distribute.MirroredStrategy()
    with strategy.scope():
        model, optimizer = build_MTpred_v041722(verbose=False)
        # plot_model(model, to_file=FIGDIR.format(f'{VERSION_NAME}_{now}.png'), show_shapes=True)
        model.compile(loss='mean_squared_error', optimizer=optimizer,
                      metrics=[tf.keras.losses.MSE, my_r2_score_scipy])
        print_policy(model)
    model_ckpt = keras.callbacks.ModelCheckpoint(MODELDIR.format(version=VERSION_NAME, modelname=f'{now}'),
                                                 monitor='val_loss', save_best_only=True, mode='min')
    earlystopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=PATIENCE, mode='min')
    callbacks = [model_ckpt, earlystopping, TuneReporterCallback()]
    model.fit(batched_train_dataset, epochs=MAX_EPOCH, steps_per_epoch=steps_per_epoch,
              callbacks=callbacks, validation_data=batched_valid_dataset,
              validation_steps=valid_steps, verbose=1)
    return None
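# --- Not part of my run: tune_model above never uses its checkpoint_dir argument.
# --- Since HyperBand-style schedulers pause and resume trials, per-epoch
# --- checkpointing would probably look something like this untested sketch
# --- (assumes `import os`; uses tune.checkpoint_dir from the function API;
# --- the callback name is my own placeholder).
class TuneCheckpointCallback(tf.keras.callbacks.Callback):
    """Save the model weights into a Tune-managed checkpoint directory each epoch."""

    def on_epoch_end(self, epoch, logs=None):
        with tune.checkpoint_dir(step=epoch) as ckpt_dir:
            # Tune tracks this directory and can hand it back through
            # checkpoint_dir when the trial is resumed.
            self.model.save_weights(os.path.join(ckpt_dir, "model_weights"))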
def main_041922():
    hyperparameter_space = {
        'LR': tune.loguniform(1e-5, 0.05),
        'L2_RATE': tune.loguniform(1e-5, 0.01),
        'DROPRATE': tune.uniform(0.0, 0.5),
        'MOMENTUM': tune.choice([0.9, 0.98, 0.99]),
        'BATCH_SIZE': tune.choice([64, 128, 256]),
        'ACT': tune.choice(['relu', 'elu']),
        'BP_KER_1': tune.choice([3, 5, 7]),
        'BP_KER_2': tune.choice([3, 5, 7]),
        'BP_RES_NUM': tune.choice([1, 2, 3, 4]),
        'UTR_KER_1': tune.choice([3, 5, 7]),
        'UTR_KER_2': tune.choice([3, 5, 7]),
        'UTR_RES_NUM': tune.choice([1, 2, 3]),
        'CONCAT_KER_1': tune.choice([3, 5, 7]),
        'CONCAT_KER_2': tune.choice([3, 5, 7]),
        'CONCAT_RES_NUM': tune.choice([1, 2, 3]),
        'MIR_KER_1': tune.choice([3, 5, 7]),
        'MIR_KER_2': tune.choice([3, 5, 7]),
        'MIR_RES_NUM': tune.choice([1, 2, 3]),
        'FINAL_NUM': tune.choice([1, 2, 3]),
        'FINAL_CHANNEL': tune.choice([12, 16, 20, 24]),
        'num_gpus': 2
    }
    ray.shutdown()
    ray.init()
    algo = TuneBOHB(metric='val_loss', mode='min')
    bohb = HyperBandForBOHB(
        time_attr='time_total_s',
        metric='val_loss',
        mode='min',
        max_t=1000
    )
    analysis = ray.tune.run(
        tune_model,
        verbose=3,
        config=hyperparameter_space,
        scheduler=bohb,
        search_alg=algo,
        num_samples=100,
        resources_per_trial={'cpu': 12, 'gpu': 2},
        local_dir="~/log/bohb"
    )
    best_config = analysis.get_best_config(metric="val_loss", mode='min')
    print('val_loss_min config:', best_config)
    best_config = analysis.get_best_config(metric='val_my_r2_score_scipy', mode='max')
    print('val r2 score max config', best_config)
    print(analysis.dataframe())
    best_trial = analysis.get_best_trial(metric="val_my_r2_score_scipy", mode="max", scope="all")
    best_checkpoint = analysis.get_best_checkpoint(best_trial, metric="val_my_r2_score_scipy")
    return None
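For completeness, the script is launched from a plain module-level entry point (reconstructed here; the line numbers in the traceback refer to my full file):

if __name__ == '__main__':
    main_041922()

In case a smaller reproduction helps, below is a stripped-down sketch of the same TuneBOHB/HyperBandForBOHB wiring with a dummy trainable. I have not run this exact snippet; it only mirrors the setup above with placeholder values, and the imports are repeated so it stands alone.

import ray
from ray import tune
from ray.tune.schedulers import HyperBandForBOHB
from ray.tune.suggest.bohb import TuneBOHB

def dummy_trainable(config):
    # Report a decreasing fake val_loss so the scheduler has something to halve on.
    for step in range(50):
        tune.report(val_loss=(1.0 + config['LR']) / (step + 1))

def minimal_bohb_repro():
    ray.init()
    algo = TuneBOHB(metric='val_loss', mode='min')
    scheduler = HyperBandForBOHB(time_attr='training_iteration', metric='val_loss',
                                 mode='min', max_t=50)
    return tune.run(dummy_trainable,
                    config={'LR': tune.loguniform(1e-5, 0.05)},
                    search_alg=algo, scheduler=scheduler, num_samples=20)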