I am trying to tune a fastai tabular model with Ray Tune, but I get an error. Here is the code (it can be run on Colab):
!pip install ray
# import packages
from ray import tune
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from fastai.tabular import *
# define path
path = untar_data(URLs.ADULT_SAMPLE)
# load data
df = pd.read_csv(path/'adult.csv')
# simple split data into train & valid
valid_idx = range(len(df)-2000, len(df))
# define local variables
dep_var = 'salary'
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']
def train_clf_config(config):
"""this function is inherited from train_clf() with 'config' added """
    # data preprocessing
    procs = [FillMissing, Categorify, Normalize]
    # prep data for tabular_learner()
    data = TabularDataBunch.from_df(path, df, dep_var=dep_var, valid_idx=valid_idx, procs=procs, cat_names=cat_names)
    # metrics
    f1 = FBeta()
    precision = Precision()
    recall = Recall()
    print('================ training classifier ================')
    # choose hyperparameters: layers, ps, emb_drop
    n_layers = config['n_layers']
    layers, ps = [], []
    for i in range(n_layers - 1):
        # number of units for each layer
        n_units = config['n_units']
        layers.append(n_units)
        # dropout ratio for each layer
        p = config['n_ps']
        ps.append(p)
    # emb_drop = config['emb_drop']
    # define a tabular classifier
    clf = tabular_learner(data, layers=layers, ps=ps, emb_szs={'native-country': 10}, metrics=[accuracy, precision, recall, f1])
    # auto find learning rate
    try:
        lr = find_appropriate_lr(model=clf, plot=True)
        print(f'clf uses estimated lr={lr}')
    except Exception:
        lr = 1e-2
        print(f'clf uses pre-defined lr={lr}')
    # train n_epochs
    n_epochs = 5  # config['n_epochs']
    clf.fit_one_cycle(n_epochs, moms=(lr*0.01, lr))
    # validation performance metrics
    valid_metrics = dict(zip(['accuracy', 'precision', 'recall', 'f1'], [x.item() for x in clf.recorder.metrics[0]]))
    tune.track.log(mean_accuracy=valid_metrics['accuracy'])
analysis = tune.run(
    train_clf_config,
    config={
        "n_layers": tune.choice([2, 3, 4, 5]),
        "n_units": tune.choice([200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200]),  # tune.randint(lower=200, upper=1200)
        "n_ps": tune.loguniform(lower=1e-4, upper=1e-1)
    }
)
print("Best config: ", analysis.get_best_config(metric="mean_accuracy"))
# Get a dataframe for analyzing trial results.
df = analysis.dataframe()
The error was:
Trial name # failures error file
train_clf_config_824e0_00000 1 /root/ray_results/train_clf_config_2021-10-01_21-21-14/train_clf_config_824e0_00000_0_n_layers=4,n_ps=0.00086754,n_units=600_2021-10-01_21-21-15/error.txt
---------------------------------------------------------------------------
TuneError Traceback (most recent call last)
<ipython-input-12-a82093f27fe0> in <module>()
4 "n_layers": tune.choice([2,3,4,5]),
5 "n_units": tune.choice([200,300,400,500,600,700,800,900,1000,1100,1200]), #tune.randint(lower=200, upper=1200),
----> 6 "n_ps": tune.loguniform(lower=1e-4, upper=1e-1)
7 }
8 )
/usr/local/lib/python3.7/dist-packages/ray/tune/tune.py in run(run_or_experiment, name, metric, mode, stop, time_budget_s, config, resources_per_trial, num_samples, local_dir, search_alg, scheduler, keep_checkpoints_num, checkpoint_score_attr, checkpoint_freq, checkpoint_at_end, verbose, progress_reporter, log_to_file, trial_name_creator, trial_dirname_creator, sync_config, export_formats, max_failures, fail_fast, restore, server_port, resume, queue_trials, reuse_actors, trial_executor, raise_on_failed_trial, callbacks, loggers, ray_auto_init, run_errored_only, global_checkpoint_period, with_server, upload_dir, sync_to_cloud, sync_to_driver, sync_on_checkpoint, _remote)
553 if incomplete_trials:
554 if raise_on_failed_trial and not state[signal.SIGINT]:
--> 555 raise TuneError("Trials did not complete", incomplete_trials)
556 else:
557 logger.error("Trials did not complete: %s", incomplete_trials)
TuneError: ('Trials did not complete', [train_clf_config_824e0_00000])
Any advice would be much appreciated!
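In case it helps with debugging: since the TuneError above only says the trial did not complete (the real traceback goes to the error.txt file listed in the failure table), I assume the trainable can also be called directly with a fixed config, outside of tune.run, so the underlying exception shows up in the notebook. A rough sketch; the values below are just copied from the failed trial's name, nothing special about them:
# call the trainable by hand so the real exception surfaces in the notebook
# instead of being redirected to Ray's error.txt
single_config = {
    "n_layers": 4,        # values taken from the failed trial's name above
    "n_units": 600,
    "n_ps": 0.00086754,
}
train_clf_config(single_config)  # note: the final tune.track.log call may itself complain outside of a Tune session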