1. Severity of the issue: (select one)
[x] High: Completely blocks me.
2. Environment:
- Ray version: 2.37.0
- Python version: 3.12
- OS: Windows
- Other libs/tools (if relevant): LightGBM
3. What happened vs. what you expected:
- Expected: A hyperparameter search for a gradient-boosted regression tree, using Ray Tune and LightGBM, runs through all trials.
- Actual: Sometimes I start the training and the first trial starts, but then nothing happens: the trial status table is printed every 30 s with status RUNNING, yet my target function never seems to be called at all, since the first line of that function is a print which never appears on screen. I need to run multiple hyperparameter optimization tasks on slightly different data (different months, but the data structure and distribution are the same). Oddly, on some of the datasets the problem never arises, and on the datasets where it does arise it happens in roughly 90% of the runs. My code is below:
```python
import os
import tempfile
import traceback

import lightgbm as lgb
from sklearn.metrics import mean_squared_error

from ray.air import session  # deprecated alias; ray.train.report is the newer API
from ray.train import Checkpoint
from ray.tune.integration.lightgbm import TuneReportCheckpointCallback


def trainGBRT(config, X_train, X_val, y_train, y_val):
    try:
        # Build the LightGBM datasets
        print("StartTrain", flush=True)
        train_data = lgb.Dataset(X_train, label=y_train, params={'max_bin': 63})
        val_data = lgb.Dataset(X_val, label=y_val, params={'max_bin': 63}, reference=train_data)
        print("Dataset", flush=True)

        # Train the model; the callback reports the eval metric to Tune every iteration
        model = lgb.train(
            config,
            train_data,
            valid_sets=[val_data],
            valid_names=['eval'],
            callbacks=[
                lgb.early_stopping(stopping_rounds=25),
                TuneReportCheckpointCallback(
                    {"l2": "eval-l2"}, frequency=1, checkpoint_at_end=True
                ),
            ],
        )
        print("model", flush=True)

        # Make predictions
        y_pred = model.predict(X_val)
        print("pred", flush=True)

        # Evaluate the model
        mse = mean_squared_error(y_val, y_pred)
        print("mse", flush=True)

        # Save the final model and report it as a checkpoint
        with tempfile.TemporaryDirectory() as temp_checkpoint_dir:
            path = os.path.join(temp_checkpoint_dir, "model.txt")
            model.save_model(path)
            checkpoint = Checkpoint.from_directory(temp_checkpoint_dir)
            session.report({"l2": mse, 'done': True}, checkpoint=checkpoint)
    except Exception as e:
        print("Exception occurred:", e)
        traceback.print_exc()
        raise
```
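To help isolate the hang, the sketch below shows a minimal LightGBM run with no Tune involved; the synthetic data and fixed parameter values are placeholders for illustration, not my real setup. If this also stalls on the affected machine, the problem would be in the LightGBM GPU build rather than in Ray Tune.

```python
# Minimal Tune-free sanity check (placeholder data/values, assumes a
# GPU-enabled LightGBM build): if this also hangs, Tune is not the culprit.
import numpy as np
import lightgbm as lgb

rng = np.random.default_rng(0)
X = rng.normal(size=(1000, 20))
y = rng.normal(size=1000)

train_data = lgb.Dataset(X, label=y, params={'max_bin': 63})
model = lgb.train(
    {'objective': 'regression', 'metric': 'l2', 'num_iterations': 50,
     'device_type': 'gpu'},  # switch to 'cpu' to take the GPU out of the picture
    train_data,
)
print("direct LightGBM training finished", flush=True)
```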
The search space and tuner setup:
```python
import ray
from ray import tune
from ray.tune.schedulers import ASHAScheduler
from ray.tune.search.optuna import OptunaSearch

config = {
    'objective': 'regression',
    'metric': 'l2',
    'boosting_type': 'gbdt',
    'num_iterations': 500,
    'num_leaves': tune.randint(10, 250),
    'max_depth': tune.randint(3, 10),
    'learning_rate': tune.choice([0.01, 0.1, 1]),
    'feature_fraction': tune.uniform(0.25, 1),
    'bagging_fraction': tune.uniform(0.25, 1),
    'bagging_freq': tune.choice([1, 5, 10]),
    'feature_penalty': list(feature_penalties),  # feature_penalties is defined elsewhere in my script
    'lambda_l1': tune.uniform(0, 0.1),
    'lambda_l2': tune.uniform(0, 0.1),
    'num_threads': 8,
    'device_type': 'gpu',
    'verbose': 2,
}

print("Tune")
tuner = tune.Tuner(
    tune.with_resources(
        tune.with_parameters(trainGBRT, X_train=X_train, X_val=X_val, y_train=y_train, y_val=y_val),
        resources={"cpu": 8, "gpu": 1},
    ),
    tune_config=tune.TuneConfig(
        metric="l2",
        mode="min",
        scheduler=ASHAScheduler(max_t=config['num_iterations']),
        search_alg=OptunaSearch(),
        num_samples=50,
        max_concurrent_trials=1,
    ),
    param_space=config,
)
print(ray.cluster_resources())
print("Tune Fit")
results = tuner.fit()
```
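In case it helps with triage, a stripped-down trainable like the hypothetical one below is how I would check whether Tune starts trial functions at all on the affected data/machine; if even its first print never shows up, the hang would be in trial scheduling rather than in the LightGBM training code.

```python
# Hypothetical minimal trainable for debugging: requests the same resources
# as the real trial but does no work. If "trial entered" never prints,
# the trial function is not being started at all.
from ray import tune
from ray.air import session

def dummy_trainable(config):
    print("trial entered", flush=True)
    session.report({"l2": 0.0})

tune.Tuner(
    tune.with_resources(dummy_trainable, resources={"cpu": 8, "gpu": 1}),
    tune_config=tune.TuneConfig(metric="l2", mode="min", num_samples=1),
).fit()
```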