How to deal with TuneError: ('Trials did not complete',...)

I have the following code, which runs into a TuneError.

I tried changing num_samples=6, time_budget_s=600 to larger values, say num_samples=6, time_budget_s=1800, or num_samples=600, time_budget_s=1800, but the same error occurred.

What causes it? How do you fix it?

%%time
import time
import ray
from ray import tune
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from fastai.tabular import * 
from scipy.stats import loguniform

# Fetch the ADULT_SAMPLE dataset via untar_data and read its CSV into a DataFrame.
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path / 'adult.csv')
df.head()

# Use the final 2000 rows of the frame as the validation indices.
n_rows = len(df)
valid_idx = range(n_rows - 2000, n_rows)

# Target column, and the columns handed to the data bunch as `cat_names`.
dep_var = 'salary'
cat_names = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country',
]

def train_clf_config2(config, path=path, df=df, dep_var=dep_var, valid_idx=valid_idx, cat_names=cat_names, return_learner=False):
    """Train a fastai tabular classifier for one Tune trial and report validation metrics.

    Args:
        config: Trial hyperparameters. Reads the keys 'layers' (sequence of
            hidden-layer widths), 'ps', 'emb_drop' and 'n_epochs'.
        path: Forwarded to ``TabularDataBunch.from_df``.
        df: DataFrame forwarded to ``TabularDataBunch.from_df``.
        dep_var: Name of the dependent (target) column.
        valid_idx: Row indices used as the validation set.
        cat_names: Column names passed as ``cat_names`` to the data bunch.
        return_learner: When true, also return the learner and the metrics.

    Returns:
        ``(learn, valid_metrics)`` when ``return_learner`` is true, else ``None``.
        The metrics are always sent to Tune through ``tune.report``.
    """
    # data preprocessing
    procs = [FillMissing, Categorify, Normalize]

    # prep data for tabular_learner()
    data = TabularDataBunch.from_df(path, df, dep_var=dep_var, valid_idx=valid_idx, procs=procs, cat_names=cat_names)

    # metrics
    f1 = FBeta()
    precision = Precision()
    recall = Recall()

    print('================ training classifier================')

    # choose hyperparameters: layers, ps, emb_drop
    # NOTE: the previous `layers = config['layers'],` (trailing comma) wrapped the
    # list in a 1-tuple, so the learner received ([..],) instead of [..].
    # Widths are also coerced to plain ints because the search space may hand
    # over numpy integers.
    layers = [int(width) for width in config['layers']]
    ps = config['ps']
    emb_drop = config['emb_drop']

    # define a tabular classifier
    learn = tabular_learner(data, layers=layers, ps=ps, emb_drop=emb_drop, emb_szs={'native-country': 10}, metrics=[accuracy, precision, recall, f1])

    # auto find learning rate; fall back to a fixed value if the finder fails.
    # `except Exception` (not a bare except) so Ctrl-C / SystemExit still propagate.
    try:
        lr = find_appropriate_lr(model=learn, plot=True)
        print(f'clf uses estimated lr={lr}')
    except Exception:
        lr = 1e-2
        print(f'clf uses pre-defined lr={lr}')

    # train n_epochs; the learning rate goes in `max_lr` -- it was previously
    # passed as `moms`, which sets the momentum schedule, not the learning rate.
    n_epochs = int(config['n_epochs'])
    learn.fit_one_cycle(n_epochs, max_lr=lr)

    # build validation performance metrics from the LAST recorded epoch
    # (index 0 would report the first epoch regardless of n_epochs)
    metric_names = ['accuracy', 'precision', 'recall', 'f1']
    final_epoch = learn.recorder.metrics[-1]
    valid_metrics = dict(zip(metric_names, [x.item() for x in final_epoch]))

    # send metrics to tune
    tune.report(**valid_metrics)

    if return_learner:
        return learn, valid_metrics

################# tuning fastai classifier ################### 

# create HyperBand scheduler to have more efficient training
from ray.tune.schedulers import HyperBandScheduler

# Create HyperBand scheduler and maximize f1
# NOTE(review): the trainable calls tune.report() only once, after training ends,
# so the scheduler presumably has no intermediate results to act on -- confirm.
hyperband = HyperBandScheduler(metric="f1", mode="max")

# candidate hidden-layer widths
_LAYER_WIDTHS = [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200]


def _sample_n_layers(_):
    """Pick the network depth as a plain Python int (not a numpy integer)."""
    return int(np.random.choice([2, 3, 4, 5, 6]))


def _sample_layers(spec):
    """Draw n_layers - 1 hidden-layer widths as plain Python ints."""
    return [int(np.random.choice(_LAYER_WIDTHS)) for _ in range(spec.config.n_layers - 1)]


def _sample_ps(spec):
    """Draw n_layers - 1 dropout values, log-uniform in [1e-4, 1].

    `rvs` is called without `size=1` so it yields a scalar; converting a
    size-1 array with float() is deprecated in recent NumPy releases.
    """
    return [float(loguniform.rvs(1e-4, 1e0)) for _ in range(spec.config.n_layers - 1)]


# tune hyperparameters defined by conditional search space
analysis = tune.run(
    train_clf_config2,
    config={
        "n_layers": tune.sample_from(_sample_n_layers),
        "layers": tune.sample_from(_sample_layers),
        "ps": tune.sample_from(_sample_ps),
        "n_epochs": tune.lograndint(lower=1, upper=100),
        "emb_drop": tune.loguniform(lower=1e-4, upper=8e-1),
    },
    num_samples=6,
    scheduler=hyperband,
    time_budget_s=600,  # time budget in seconds
)

The error message is:

TuneError: ('Trials did not complete', [train_clf_config2_ca838_00000, train_clf_config2_ca838_00001, train_clf_config2_ca838_00002, train_clf_config2_ca838_00003, train_clf_config2_ca838_00004, train_clf_config2_ca838_00005])

Hi, are there any other outputs? What does the result table look like?

If errors occur, you'll usually find more information in an error.txt file in the trial directory (take a look at ~/ray_results - the link to this file is usually printed in the output).

1 Like