TuneError: Trial with unexpected bad status encountered: TERMINATED

How severe does this issue affect your experience of using Ray?

  • High: It blocks me to complete my task.

Hello,
I ran the code at the bottom,
but the following error message occurred

2022-04-21 14:06:21,043 ERROR trial_runner.py:1372 -- Trial tune_model_95909902: Error stopping trial.
Traceback (most recent call last):
  File "~/python3.6/site-packages/ray/tune/trial_runner.py", line 1354, in stop_trial
    self._scheduler_alg.on_trial_remove(self, trial)
  File "~/python3.6/site-packages/ray/tune/schedulers/hyperband.py", line 286, in on_trial_remove
    self._process_bracket(trial_runner, bracket)
  File "~/python3.6/site-packages/ray/tune/schedulers/hyperband.py", line 262, in _process_bracket
    f"Trial with unexpected bad status " f"encountered: {t.status}"
ray.tune.error.TuneError: Trial with unexpected bad status encountered: TERMINATED
2022-04-21 14:06:21,046 WARNING util.py:171 -- The `choose_trial_to_run` operation took 0.818 s, which may be a performance bottleneck.
Traceback (most recent call last):
  File "~/bohb_application.py", line 403, in <module>

  File "~/bohb_application.py", line 352, in main_041922
    local_dir="~/log/bohb"
  File "~/python3.6/site-packages/ray/tune/tune.py", line 672, in run
    runner.step()
  File "~/python3.6/site-packages/ray/tune/trial_runner.py", line 765, in step
    next_trial = self._update_trial_queue_and_get_next_trial()
  File "~/python3.6/site-packages/ray/tune/trial_runner.py", line 698, in _update_trial_queue_and_get_next_trial
    next_trial = self._get_next_trial()  # blocking
  File "~/python3.6/site-packages/ray/tune/trial_runner.py", line 955, in _get_next_trial
    trial = self._scheduler_alg.choose_trial_to_run(self)
  File "~/python3.6/site-packages/ray/tune/schedulers/hb_bohb.py", line 132, in choose_trial_to_run
    self._process_bracket(trial_runner, bracket)
  File "~/python3.6/site-packages/ray/tune/schedulers/hyperband.py", line 262, in _process_bracket
    f"Trial with unexpected bad status " f"encountered: {t.status}"
ray.tune.error.TuneError: Trial with unexpected bad status encountered: TERMINATED

after this status message and ‘Result for tune model’ were displayed.
== Status ==pid=114114) Current time: 2022-04-21 14:06:16 (running for 02:27:46.22) Memory usage on this node: 21.1/187.3 GiB Using HyperBand: num_stopped=0 total_brackets=1 Round #0: Bracket(Max Size (n)=729, Milestone (r)=1, completed=100.0%): {PAUSED: 99, RUNNING: 1} Resources requested: 12.0/24 CPUs, 2.0/2 GPUs, 0.0/119.78 GiB heap, 0.0/55.33 GiB objects (0.0/1.0 accelerator_type:V100) Result logdir: ~/log/bohb/tune_model_2022-04-21_11-38-30 Number of trials: 100/100 (99 PAUSED, 1 RUNNING)
I don’t know what I’m doing wrong and why successive halving wasn’t working. Any help would be really appreciated.

class TuneReporterCallback(tf.keras.callbacks.Callback):
    """Tune Callback for Keras.

    The callback is invoked every epoch.
    """

    def __init__(self, logs=None):
        if logs is None:
            logs = {}
        self.iteration = 0
        super(TuneReporterCallback, self).__init__()

    def on_epoch_end(self, batch, logs=None):
        if logs is None:
            logs = {}
        self.iteration += 1
        tune.report(keras_info=logs, mean_accuracy=logs.get("val_my_r2_score_scipy"), mean_loss=logs.get("loss"),
                    val_loss=logs.get("val_loss"))


# end class


def tune_model(config, checkpoint_dir=None):
    global BATCH_SIZE
    global LR
    global L2_RATE
    global DROPRATE
    global MOMENTUM
    global ACT
    global BP_KER_1
    global BP_KER_2
    global BP_RES_NUM
    global UTR_KER_1
    global UTR_KER_2
    global UTR_RES_NUM
    global CONCAT_KER_1
    global CONCAT_KER_2
    global CONCAT_RES_NUM
    global MIR_KER_1
    global MIR_KER_2
    global MIR_RES_NUM
    global FINAL_NUM
    global FINAL_CHANNEL
    BATCH_SIZE = config['BATCH_SIZE']
    LR = config['LR']
    L2_RATE = config['L2_RATE']
    DROPRATE = config['DROPRATE']
    MOMENTUM = config['MOMENTUM']
    ACT = config['ACT']
    BP_KER_1 = config['BP_KER_1']
    BP_KER_2 = config['BP_KER_2']
    BP_RES_NUM = config['BP_RES_NUM']
    UTR_KER_1 = config['UTR_KER_1']
    UTR_KER_2 = config['UTR_KER_2']
    UTR_RES_NUM = config['UTR_RES_NUM']
    CONCAT_KER_1 = config['CONCAT_KER_1']
    CONCAT_KER_2 = config['CONCAT_KER_2']
    CONCAT_RES_NUM = config['CONCAT_RES_NUM']
    MIR_KER_1 = config['MIR_KER_1']
    MIR_KER_2 = config['MIR_KER_2']
    MIR_RES_NUM = config['MIR_RES_NUM']
    FINAL_NUM = config['FINAL_NUM']
    FINAL_CHANNEL = config['FINAL_CHANNEL']

    df_train = pd.read_csv(DATALISTDIR.format('train'), sep='\t', header=0)
    df_train = df_train.sample(frac=1)
    df_valid = pd.read_csv(DATALISTDIR.format('valid'), sep='\t', header=0)
    df_valid = df_valid.sample(frac=1)
    df_test = pd.read_csv(DATALISTDIR.format('test'), sep='\t', header=0)

    train_id = df_train['datID'].tolist()
    valid_id = df_valid['datID'].tolist()
    test_id = df_test['datID'].tolist()

    train_labels = df_train['lfc'].tolist()
    valid_labels = df_valid['lfc'].tolist()
    test_labels = df_test['lfc'].tolist()

    batched_train_dataset = gen_dataset(train_id, batch_size=BATCH_SIZE, shuffle=False)
    batched_valid_dataset = gen_dataset(valid_id, batch_size=BATCH_SIZE, shuffle=False)
    batched_test_dataset = gen_dataset(test_id, batch_size=BATCH_SIZE, shuffle=False)
    steps_per_epoch = len(df_train) / BATCH_SIZE
    valid_steps = len(df_valid) / BATCH_SIZE
    now = time.strftime('%m%d%y_%H%M%S')
    strategy = tf.distribute.MirroredStrategy()
    with strategy.scope():
        model, optimizer = build_MTpred_v041722(verbose=False)
    # plot_model(model, to_file=FIGDIR.format(f'{VERSION_NAME}_{now}.png'), show_shapes=True)
    model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=[tf.keras.losses.MSE, my_r2_score_scipy])
    print_policy(model)
    model_ckpt = keras.callbacks.ModelCheckpoint(MODELDIR.format(version=VERSION_NAME, modelname=f'{now}'),
                                                 monitor='val_loss', save_best_only=True, mode='min')
    earlystopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=PATIENCE, mode='min')
    callbacks = [model_ckpt, earlystopping, TuneReporterCallback()]
    model.fit(batched_train_dataset, epochs=MAX_EPOCH, steps_per_epoch=steps_per_epoch,
              callbacks=callbacks, validation_data=batched_valid_dataset,
              validation_steps=valid_steps, verbose=1)
    return None


def main_041922():
    hyperparameter_space = {
        'LR': tune.loguniform(1e-5, 0.05),
        'L2_RATE': tune.loguniform(1e-5, 0.01),
        'DROPRATE': tune.uniform(0.0, 0.5),
        'MOMENTUM': tune.choice([0.9, 0.98, 0.99]),
        'BATCH_SIZE': tune.choice([64, 128, 256]),
        'ACT': tune.choice(['relu', 'elu']),
        'BP_KER_1': tune.choice([3, 5, 7]),
        'BP_KER_2': tune.choice([3, 5, 7]),
        'BP_RES_NUM': tune.choice([1, 2, 3, 4]),
        'UTR_KER_1': tune.choice([3, 5, 7]),
        'UTR_KER_2': tune.choice([3, 5, 7]),
        'UTR_RES_NUM': tune.choice([1, 2, 3]),
        'CONCAT_KER_1': tune.choice([3, 5, 7]),
        'CONCAT_KER_2': tune.choice([3, 5, 7]),
        'CONCAT_RES_NUM': tune.choice([1, 2, 3]),
        'MIR_KER_1': tune.choice([3, 5, 7]),
        'MIR_KER_2': tune.choice([3, 5, 7]),
        'MIR_RES_NUM': tune.choice([1, 2, 3]),
        'FINAL_NUM': tune.choice([1, 2, 3]),
        'FINAL_CHANNEL': tune.choice([12, 16, 20, 24]),
        'num_gpus': 2
    }
    ray.shutdown()
    ray.init()
    algo = TuneBOHB(metric='val_loss', mode='min')
    bohb = HyperBandForBOHB(
        time_attr='time_total_s',
        metric='val_loss',
        mode='min',
        max_t=1000
    )
    analysis = ray.tune.run(
        tune_model,
        verbose=3,
        config=hyperparameter_space,
        scheduler=bohb,
        search_alg=algo,
        num_samples=100,
        resources_per_trial={'cpu': 12, 'gpu': 2},
        local_dir="~/log/bohb"
    )
    best_config = analysis.get_best_config(metric="val_loss", mode='min')
    print('val_loss_min config:', best_config)
    best_config = analysis.get_best_config(metric='val_my_r2_score_scipy', mode='max')
    print('val r2 score max config', best_config)
    print(analysis.dataframe())
    best_trial = analysis.get_best_trial(metric="val_my_r2_score_scipy", mode="max", scope="all")
    best_checkpoint = analysis.get_best_checkpoint(best_trial, metric="val_my_r2_score_scipy")
    return None

Hello!
The error message seems to suggest that the scheduler encounters an unexpected trial status - it thinks the trial should still be alive while it is actually already terminated. It sounds like a bug in Tune side.
I cannot think of a scenario that this could happen right now. Usually the scheduler keeps track of what trials are alive via TuneCallback - on_trial_remove() and on_trial_complete().
Do you consistently see this? Do you notice any pattern when it happens?
Let me know how it goes. I cannot run your script (it has some custom methods) but I am happy to debug if needed.
Thanks!