I’m trying to tune an XGBoost model with the PBT scheduler in Ray. For the most part it appears to be working well, though I have some questions:
xgb.train(
config,
train_set,
evals=[(test_set, "eval")],
verbose_eval=False,
num_boost_round=n_estimators,
callbacks=[TuneReportCheckpointCallback(filename="model.xgb")])
It appears from the implementation of TuneReportCheckpointCallback()
that this does the whole job, including saving ‘step’ information. Restoring also seems to work “automagically” like a charm — if one has remembered to set ‘num_boost_round’. This could be stated more clearly in the documentation — if I have understood it correctly.
- Have I understood correctly?
However, I still have a few issues:
-
As the perturbation multiplies the value by 1.2 and the resulting new_config is not checked against the limits given in hyperparam_mutations, it is possible for a given hyperparameter to exceed its bounds, resulting in:
(pid=1445879) xgboost.core.XGBoostError: value 1.11478 for Parameter subsample exceed bound [0,1] (pid=1445879) subsample: Row subsample ratio of training instance.
Is there a way to avoid that except excluding it?
-
From time to time, I get
2021-04-10 21:43:51,982 WARNING worker.py:1107 -- A worker died or was killed while executing task ffffffffffffffff76e3e21ca358acff9fa4f6a601000000. Result for train_af2db_00015:
Why? Can I avoid that too?
Any insight is much appreciated. I have enclosed the whole code below - there might be other issues as well:
import sklearn.datasets
import sklearn.metrics
import numpy as np
import xgboost as xgb
import os
import random
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from ray import tune
from ray.tune.schedulers import PopulationBasedTraining
from ray.tune.integration.xgboost import TuneReportCheckpointCallback
# Total boosting rounds per trial; PBT checkpoint/restore only works while
# rounds remain, so this must be set (and large enough) — see the callback.
n_estimators = 500
# Load dataset
# NOTE(review): usecols=range(0, 8) selects 8 columns, but only 6 field
# names/dtypes are declared below — confirm against trial.csv's real layout.
data = np.genfromtxt("trial.csv", usecols=range(0,8), delimiter=",",
                     names=['truecl','cep50', 'fiel', 'v0', 'acc', 'rcs'],
                     dtype=('S2', int, float, float, int, float))
# Encode the string class labels ('truecl') as integer codes for XGBoost.
le = preprocessing.LabelEncoder()
y = le.fit_transform(data['truecl'])
# Feature matrix: drops the first two fields of every record — the label
# and, presumably, 'cep50'; TODO confirm that dropping 'cep50' is intended.
X = np.array([list(r)[2:] for r in data])
def train(config, checkpoint_dir=None):
    """Ray Tune trainable: fit an XGBoost classifier on a fresh 75/25 split.

    Args:
        config: XGBoost parameter dict sampled (and later perturbed) by Tune.
        checkpoint_dir: Supplied by Tune when restoring; the actual restore of
            the booster is handled by ``TuneReportCheckpointCallback``.

    Metrics ("eval-mlogloss", "eval-merror") and checkpoints are reported
    every boosting round by the callback, so no explicit ``tune.report()``
    call is needed here.
    """
    # PBT's perturbation multiplies continuous values by 0.8/1.2 without
    # re-checking bounds, which can push ratio parameters past XGBoost's
    # legal range (e.g. "value 1.11478 for Parameter subsample exceed
    # bound [0,1]").  Clamp them back before training.
    for key in ("subsample", "colsample_bytree"):
        if key in config:
            config[key] = min(max(config[key], 0.0), 1.0)
    # A multiplicative perturbation turns max_depth into a float; XGBoost
    # requires an integer.
    if "max_depth" in config:
        config["max_depth"] = int(round(config["max_depth"]))

    # Split into train and test set.
    train_x, test_x, train_y, test_y = train_test_split(
        X, y, test_size=0.25)
    # Build input matrices for XGBoost.
    train_set = xgb.DMatrix(train_x, label=train_y)
    test_set = xgb.DMatrix(test_x, label=test_y)
    # Train the classifier; the callback reports metrics and writes a
    # checkpoint (including the boosting-round "step") each round, and
    # resumes from it on restore as long as num_boost_round is set.
    xgb.train(
        config,
        train_set,
        evals=[(test_set, "eval")],
        verbose_eval=False,
        num_boost_round=n_estimators,
        callbacks=[TuneReportCheckpointCallback(filename="model.xgb")])
if __name__ == "__main__":
    # Initial search space; PBT later mutates these via hyperparam_mutations.
    config = {
        "objective": "multi:softmax",
        "num_class": 6,
        "eval_metric": ["mlogloss", "merror"],
        "max_depth": tune.randint(1, 9),
        "min_child_weight": tune.choice([1, 2, 3]),
        "gamma": tune.uniform(0.5, 5.0),
        "subsample": tune.uniform(0.5, 1.0),
        "colsample_bytree": tune.uniform(0.4, 1.0),
        "eta": tune.loguniform(1e-4, 1e-1),
        "learning_rate": tune.choice([1e-3, 1e-4, 1e-5]),
        "lambda": tune.uniform(0.1, 5.0),
        "alpha": tune.uniform(0.1, 5.0)
    }

    # Ranges that PBT perturbations must stay inside (matching the
    # hyperparam_mutations below); the last tuple element marks integers.
    bounds = {
        "max_depth": (1, 9, True),
        "gamma": (0.5, 5.0, False),
        "subsample": (0.5, 1.0, False),
        "colsample_bytree": (0.4, 1.0, False),
        "eta": (1e-4, 1e-1, False),
        "lambda": (0.1, 5.0, False),
        "alpha": (0.1, 5.0, False),
    }

    def clamp_config(new_config):
        """custom_explore_fn for PBT: clip perturbed values back into bounds.

        PBT multiplies continuous values by 0.8/1.2 without re-checking the
        ranges given in hyperparam_mutations, so e.g. subsample can exceed
        1.0 and crash XGBoost.  This post-processing hook restores validity.
        """
        for key, (lo, hi, is_int) in bounds.items():
            if key in new_config:
                value = min(max(new_config[key], lo), hi)
                new_config[key] = int(round(value)) if is_int else value
        return new_config

    scheduler = PopulationBasedTraining(
        time_attr='time_total_s',
        metric='eval-merror',
        mode='min',
        perturbation_interval=10,
        hyperparam_mutations={
            "max_depth": lambda: random.randint(1, 9),
            "min_child_weight": [1, 2, 3],
            "gamma": lambda: random.uniform(0.5, 5.0),
            "subsample": lambda: random.uniform(0.5, 1.0),
            "colsample_bytree": lambda: random.uniform(0.4, 1.0),
            "eta": lambda: random.uniform(1e-4, 1e-1),
            "learning_rate": [1e-3, 1e-4, 1e-5],
            "lambda": lambda: random.uniform(0.1, 5.0),
            "alpha": lambda: random.uniform(0.1, 5.0)
        },
        # Keeps every perturbed config inside its legal range.
        custom_explore_fn=clamp_config)

    analysis = tune.run(
        train,
        scheduler=scheduler,
        resources_per_trial={"cpu": 1},
        config=config,
        num_samples=25)

    # Best trial by minimum eval-merror across all training iterations.
    best_trial = analysis.get_best_trial(metric="eval-merror", mode="min", scope="all")
    # Best checkpoint for that trial by the same metric.
    best_checkpoint = analysis.get_best_checkpoint(best_trial,
                                                   metric='eval-merror',
                                                   mode='min')
    # Load the best model checkpoint.
    # NOTE(review): assumes get_best_checkpoint returns a directory path
    # (older Ray); newer versions return a Checkpoint object — verify
    # against the installed Ray version.
    best_bst = xgb.Booster()
    best_bst.load_model(os.path.join(best_checkpoint, "model.xgb"))
    accuracy = 1. - best_trial.last_result["eval-merror"]
    print(f"Best model parameters: {best_trial.config}")
    print(f"Best model total accuracy: {accuracy:.4f}")