How do I tune fastai tabular model with Ray?

I am trying to tune a fastai tabular model with Ray Tune, but I get an error. Here is the code (it can be run on Colab):

!pip install ray

# import packages
from ray import tune
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from fastai.tabular import * 

# define path
path = untar_data(URLs.ADULT_SAMPLE)

# load data
df = pd.read_csv(path/'adult.csv')

# simple split data into train & valid
valid_idx = range(len(df)-2000, len(df))

# define local variables
dep_var = 'salary'
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']


def train_clf_config(config):
    """this function is inherited from train_clf() with 'config' added """
    # data preprocessing
    procs = [FillMissing, Categorify, Normalize]

    # prep data for tabular_learner()
    data = TabularDataBunch.from_df(path, df, dep_var=dep_var, valid_idx=valid_idx, procs=procs, cat_names=cat_names)

    #metrics
    f1=FBeta()
    precision = Precision()
    recall = Recall()

    print('================ training classifier================')

    # choose hyperparameters: layers, ps, emb_drop
    n_layers = config['n_layers']
    layers, ps = [], []
    for i in range(n_layers - 1):
        # number of units for each layer
        n_units = config['n_units']
        layers.append(n_units)

        # dropout ratio for each layer
        p = config['n_ps']
        ps.append(p)
    #emb_drop = config['emb_drop']

    # define a tabular classifier
    learn = tabular_learner(data, layers=layers, ps=ps, emb_szs={'native-country': 10}, metrics=[accuracy, precision, recall, f1])
    
    # auto find learning rate
    try:
        lr = find_appropriate_lr(model=clf, plot=True)
        print(f'clf uses estimated lr={lr}')
    except:
        lr = 1e-2
        print(f'clf uses pre-defined lr={lr}')
    
    # train n_epoch
    n_epochs = 5 # config['n_epochs']
    clf.fit_one_cycle(n_epochs, moms=(lr*0.01,lr))
    
    # validation performance metrics
    valid_metrics = dict(zip(['accuracy', 'precision', 'recall', 'f1'], [x.item() for x in clf.recorder.metrics[0]]))
    tune.track.log(mean_accuracy=valid_metrics['accuracy'])

analysis = tune.run(
    train_clf_config,
    config = {
              "n_layers": tune.choice([2,3,4,5]), 
              "n_units": tune.choice([200,300,400,500,600,700,800,900,1000,1100,1200]), #tune.randint(lower=200, upper=1200),
              "n_ps": tune.loguniform(lower=1e-4, upper=1e-1)
              }
)

print("Best config: ", analysis.get_best_config(metric="mean_accuracy"))

# Get a dataframe for analyzing trial results.
df = analysis.dataframe()

The error was:


Trial name                    # failures  error file
train_clf_config_824e0_00000  1           /root/ray_results/train_clf_config_2021-10-01_21-21-14/train_clf_config_824e0_00000_0_n_layers=4,n_ps=0.00086754,n_units=600_2021-10-01_21-21-15/error.txt

---------------------------------------------------------------------------
TuneError                                 Traceback (most recent call last)
<ipython-input-12-a82093f27fe0> in <module>()
      4               "n_layers": tune.choice([2,3,4,5]),
      5               "n_units": tune.choice([200,300,400,500,600,700,800,900,1000,1100,1200]), #tune.randint(lower=200, upper=1200),
----> 6               "n_ps": tune.loguniform(lower=1e-4, upper=1e-1)
      7               }
      8 )

/usr/local/lib/python3.7/dist-packages/ray/tune/tune.py in run(run_or_experiment, name, metric, mode, stop, time_budget_s, config, resources_per_trial, num_samples, local_dir, search_alg, scheduler, keep_checkpoints_num, checkpoint_score_attr, checkpoint_freq, checkpoint_at_end, verbose, progress_reporter, log_to_file, trial_name_creator, trial_dirname_creator, sync_config, export_formats, max_failures, fail_fast, restore, server_port, resume, queue_trials, reuse_actors, trial_executor, raise_on_failed_trial, callbacks, loggers, ray_auto_init, run_errored_only, global_checkpoint_period, with_server, upload_dir, sync_to_cloud, sync_to_driver, sync_on_checkpoint, _remote)
    553     if incomplete_trials:
    554         if raise_on_failed_trial and not state[signal.SIGINT]:
--> 555             raise TuneError("Trials did not complete", incomplete_trials)
    556         else:
    557             logger.error("Trials did not complete: %s", incomplete_trials)

TuneError: ('Trials did not complete', [train_clf_config_824e0_00000])

Any advice would be much appreciated!

I ran the exact script and got the same error message, plus the following before it:

(pid=473) 2021-10-01 21:41:23,362	ERROR function_runner.py:266 -- Runner Thread raised error.
(pid=473) Traceback (most recent call last):
(pid=473)   File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 260, in run
(pid=473)     self._entrypoint()
(pid=473)   File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 329, in entrypoint
(pid=473)     self._status_reporter.get_checkpoint())
(pid=473)   File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 594, in _trainable_func
(pid=473)     output = fn()
(pid=473)   File "<ipython-input-1-a9884adaa850>", line 65, in train_clf_config
(pid=473) NameError: name 'clf' is not defined
(pid=473) Exception in thread Thread-2:
(pid=473) Traceback (most recent call last):
(pid=473)   File "/usr/lib/python3.7/threading.py", line 926, in _bootstrap_inner
(pid=473)     self.run()
(pid=473)   File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 279, in run
(pid=473)     raise e
(pid=473)   File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 260, in run
(pid=473)     self._entrypoint()
(pid=473)   File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 329, in entrypoint
(pid=473)     self._status_reporter.get_checkpoint())
(pid=473)   File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 594, in _trainable_func
(pid=473)     output = fn()
(pid=473)   File "<ipython-input-1-a9884adaa850>", line 65, in train_clf_config
(pid=473) NameError: name 'clf' is not defined
(pid=473) 
2021-10-01 21:41:23,450	ERROR trial_runner.py:773 -- Trial train_clf_config_516a5_00000: Error processing event.
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/ray/tune/trial_runner.py", line 739, in _process_trial
    results = self.trial_executor.fetch_result(trial)
  File "/usr/local/lib/python3.7/dist-packages/ray/tune/ray_trial_executor.py", line 746, in fetch_result
    result = ray.get(trial_future[0], timeout=DEFAULT_GET_TIMEOUT)
  File "/usr/local/lib/python3.7/dist-packages/ray/_private/client_mode_hook.py", line 82, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.7/dist-packages/ray/worker.py", line 1621, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(TuneError): ray::ImplicitFunc.train_buffered() (pid=473, ip=172.28.0.2, repr=<types.ImplicitFunc object at 0x7f76e1cf4490>)
  File "/usr/local/lib/python3.7/dist-packages/ray/tune/trainable.py", line 178, in train_buffered
    result = self.train()
  File "/usr/local/lib/python3.7/dist-packages/ray/tune/trainable.py", line 237, in train
    result = self.step()
  File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 379, in step
    self._report_thread_runner_error(block=True)
  File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 527, in _report_thread_runner_error
    ("Trial raised an exception. Traceback:\n{}".format(err_tb_str)
ray.tune.error.TuneError: Trial raised an exception. Traceback:
ray::ImplicitFunc.train_buffered() (pid=473, ip=172.28.0.2, repr=<types.ImplicitFunc object at 0x7f76e1cf4490>)
  File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 260, in run
    self._entrypoint()
  File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 329, in entrypoint
    self._status_reporter.get_checkpoint())
  File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 594, in _trainable_func
    output = fn()
  File "<ipython-input-1-a9884adaa850>", line 65, in train_clf_config
NameError: name 'clf' is not defined
(pid=473) ================ training classifier================
(pid=473) clf uses pre-defined lr=0.01
Result for train_clf_config_516a5_00000:
  {}

I changed the following line and was able to get the script to continue executing:

-    clf.fit_one_cycle(n_epochs, moms=(lr*0.01,lr))
+    learn.fit_one_cycle(n_epochs, moms=(lr*0.01,lr))

I'm not sure whether this is expected from the training logic, but it just repeatedly prints:

(pid=472) <IPython.core.display.HTML object> 
(pid=472) <IPython.core.display.HTML object> 
(pid=472) <IPython.core.display.HTML object> 
(pid=472) <IPython.core.display.HTML object> 
(pid=472) <IPython.core.display.HTML object> 

Edit:

The script can be run with the following changes:

  1. Same as above: use learn in place of the undefined clf.
-    valid_metrics = dict(zip(['accuracy', 'precision', 'recall', 'f1'], [x.item() for x in clf.recorder.metrics[0]]))
+    valid_metrics = dict(zip(['accuracy', 'precision', 'recall', 'f1'], [x.item() for x in learn.recorder.metrics[0]]))
  2. tune.track is deprecated; tune.report is the correct API to use to complete an iteration.
-    tune.track.log(mean_accuracy=valid_metrics['accuracy'])
+    tune.report(mean_accuracy=valid_metrics['accuracy'])
  3. get_best_config requires a mode argument to know whether you're maximizing or minimizing your metric.
- print("Best config: ", analysis.get_best_config(metric="mean_accuracy"))
+ print("Best config: ", analysis.get_best_config(metric="mean_accuracy", mode="max"))

Output:

2021-10-01 22:02:09,156	INFO tune.py:561 -- Total run time: 178.84 seconds (178.67 seconds for the tuning loop).
Best config:  {'n_layers': 4, 'n_units': 1000, 'n_ps': 0.0017481654625269758}

Note that this only runs one trial; you can run more by setting num_samples in tune.run!
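
For example, a minimal sketch of the same tune.run call with more samples:

analysis = tune.run(
    train_clf_config,
    num_samples=20,  # run 20 sampled configurations instead of the default 1
    config = {
              "n_layers": tune.choice([2,3,4,5]),
              "n_units": tune.choice([200,300,400,500,600,700,800,900,1000,1100,1200]),
              "n_ps": tune.loguniform(lower=1e-4, upper=1e-1)
              }
)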


As a general practice, I would advise running the training function directly with a single hard-coded config (without ray.tune). That way, it's a lot more obvious whether or not the error is coming from ray.tune!
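
For example, calling it with one arbitrary config (the values here are just placeholders) surfaces any Python error immediately, without Tune's trial machinery in between:

# sanity check: run the trainable once, outside of tune.run
# (note: tune.report called outside a Tune session may warn or no-op,
# depending on your Ray version)
train_clf_config({"n_layers": 3, "n_units": 500, "n_ps": 0.01})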


@matthewdeng Great job, and thank you!

I have another question: I want to randomly decide n_layers (say, tune.choice([2,3,4,5])) and then randomly choose the number of units for each layer, so the result looks like:

  • when n_layers = 2, units [500, 300]

  • when n_layers = 2, units [800, 200]

  • when n_layers = 3, units [700, 500, 300]

  • when n_layers = 3, units [800, 200, 400]

The implementation in Optuna is as follows; what is the corresponding implementation in Ray?

import optuna

# `data`, `y_range`, and `exp_rmspe` come from the surrounding fastai setup
def objective(trial: optuna.Trial):
    num_layers = trial.suggest_int('n_layers', 1, 5)  # `num_layers` is 1, 2, 3, 4, or 5.
    layers, ps = [], []  # the number of units / the dropout ratio for each layer
    for i in range(num_layers - 1):  # `TabularModel` automatically adds the last layer.
        num_units = trial.suggest_categorical(f'num_units_layer_{i}', [800, 900, 1000, 1100, 1200])
        p = trial.suggest_discrete_uniform(f'dropout_p_layer_{i}', 0, 1, 0.05)
        layers.append(num_units); ps.append(p)

    emb_drop = trial.suggest_discrete_uniform('emb_drop', 0, 1, 0.05)
    learn = tabular_learner(data, layers=layers, ps=ps, emb_drop=emb_drop, y_range=y_range, metrics=exp_rmspe)

    learn.fit_one_cycle(5, 1e-3, wd=0.2)
    return learn.validate()[-1].item()  # Of course you can use the last record of `learn.recorder`.

study = optuna.create_study()
study.optimize(objective)
best_trial = study.best_trial
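
For what it's worth, could something like tune.sample_from express this in Ray? A rough, untested sketch (it assumes the default variant generator, which can resolve spec.config dependencies; dedicated search algorithms may not support this):

import numpy as np
from ray import tune

config = {
    "n_layers": tune.choice([2, 3, 4, 5]),
    # one units value per hidden layer; the list length depends on the sampled n_layers
    "n_units": tune.sample_from(
        lambda spec: [int(np.random.choice([800, 900, 1000, 1100, 1200]))
                      for _ in range(spec.config.n_layers - 1)]),
    # one dropout ratio per hidden layer
    "n_ps": tune.sample_from(
        lambda spec: [round(float(np.random.uniform(0, 1)), 2)
                      for _ in range(spec.config.n_layers - 1)]),
}

The training function would then read config['n_units'] and config['n_ps'] as ready-made lists instead of building them in a loop.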