RayTune gets stuck after completing all trials

Hey,

TLDR: Ray Tune is stuck when increasing the number of trials.

Details:
I’m using Ray Tune (ray[tune]==1.9.1) to find optimal hyperparameters for an XGBoost model.

When I run it on my local machine, everything works well (even for a large number of trials, e.g. 100). But when I run it in a container, it only works when configured with 10 trials or fewer; with more than 10 trials it gets stuck after completing all trial evaluations (I confirmed this by adding logs that report the completion of each trial).

I’d appreciate any help!


from ray import tune
import numpy as np
from typing import Callable, Dict
from xgboost import XGBClassifier
import logging
from ray.tune.logger import LoggerCallback


# Install the default root handler so log records have somewhere to go.
logging.basicConfig()
# Module-wide logger used by the tuning callback below.
# NOTE(review): "luigi-interface" is presumably the logger name Luigi uses — confirm.
logger = logging.getLogger("luigi-interface")


def create_xgboost_classifier_config():
    """Build the Ray Tune search space for the XGBoost classifier.

    Returns
    -------
    dict mapping XGBoost parameter names to either a fixed value
    (``objective``) or a ``tune.uniform`` sampling domain. ``n_estimators``
    and ``max_depth`` are sampled from continuous ranges here, so they must
    be converted to integers before being handed to XGBoost.
    """
    return {
        # XGBoost objective identifiers contain no whitespace; the previous
        # value "multi: softprob" is not a registered objective name.
        "objective": "multi:softprob",
        "eta": tune.uniform(0.1, 0.5),
        "n_estimators": tune.uniform(20, 300),
        "max_depth": tune.uniform(1, 5),
        "gamma": tune.uniform(1e-10, 0.3),
        "lambda": tune.uniform(0.3, 5.0),
        "subsample": tune.uniform(0.3, 1.0),
    }

def extract_classifier_params_from_config(config):
    """Translate a sampled Tune config into XGBClassifier keyword arguments.

    ``n_estimators`` and ``max_depth`` are truncated to ``int``; the other
    tuned values are passed through unchanged. ``n_jobs`` is pinned to 1 and
    ``use_label_encoder`` to ``False`` regardless of the config.
    """
    integer_params = ("n_estimators", "max_depth")
    tuned_params = (
        "objective",
        "n_estimators",
        "max_depth",
        "eta",
        "gamma",
        "lambda",
        "subsample",
    )

    classifier_kwargs = {}
    for name in tuned_params:
        value = config[name]
        classifier_kwargs[name] = int(value) if name in integer_params else value

    # Fixed settings that are not part of the search space.
    classifier_kwargs.update(n_jobs=1, use_label_encoder=False)
    return classifier_kwargs


def trial_str_creator(trial):
    """Return the display name for ``trial``, derived from its ``trial_id``."""
    return f"lead_scoring_xgboost_tuning_trial_{trial.trial_id}"


class TestLoggerCallback(LoggerCallback):
    """Tune callback that writes trial lifecycle events to the module logger."""

    def on_trial_result(self, iteration, trials, trial, result, **info):
        """Log the result reported by ``trial`` at INFO level."""
        message = f"XGBOOST tuning ({trial}): {result}"
        logger.info(message)

    def on_trial_complete(self, iteration, trials, trial, **info):
        """Log at INFO level that ``trial`` has completed."""
        message = f"XGBOOST tuning ({trial}): completed."
        logger.info(message)

    def on_trial_error(self, iteration, trials, trial, **info):
        """Log at ERROR level that ``trial`` failed."""
        message = f"XGBOOST tuning ({trial}): failed!"
        # logger.exception also attaches the currently handled exception's
        # traceback, when there is one.
        logger.exception(message)


def tune_xgboost_params(
    x: np.ndarray,
    y: np.ndarray,
    num_iterations: int,
    cv_callback: Callable[[XGBClassifier, np.ndarray, np.ndarray], Dict[str, Dict[str, float]]],
) -> XGBClassifier:
    """Search XGBoost hyper-parameters with Ray Tune and return the best model.

    Parameters
    ----------
    x: features data to use as training/validation set. The expected format is: ndarray (n_samples, n_features)
    y: labels data to use as training/validation set. The expected format is: ndarray (n_samples, )
    num_iterations: number of configurations Tune samples from the search space (one trial each)
    cv_callback: callback function to compute the cross-validation model performance (e.g. K-Fold
        cross validation). It must return a dict with "val" and "train" entries, each holding the
        "weighted_log_loss", "accuracy", "precision" and "recall" metrics.

    Returns
    -------
    XGBOOST model configured with the "best" hyper-parameters, i.e. those of the trial with the
    lowest "val_weighted_log_loss". The returned model is constructed but not fitted here.
    """
    splits = ("val", "train")
    metrics = ("weighted_log_loss", "accuracy", "precision", "recall")

    def trainable(config, features, labels):
        # One trial: cross-validate a classifier built from the sampled config
        # and report every metric as "<split>_<metric>".
        clf = XGBClassifier(**extract_classifier_params_from_config(config))
        cv_res = cv_callback(clf, features, labels)
        report = {
            f"{split}_{metric}": cv_res[split][metric]
            for split in splits
            for metric in metrics
        }
        tune.report(**report)

    analysis = tune.run(
        # Hand the datasets over through tune.with_parameters rather than
        # capturing them in the closure: the arrays are then stored once in
        # the Ray object store instead of being pickled into the trainable
        # definition that is shipped for every trial.
        tune.with_parameters(trainable, features=x, labels=y),
        metric="val_weighted_log_loss",
        mode="min",
        num_samples=num_iterations,
        config=create_xgboost_classifier_config(),
        callbacks=[TestLoggerCallback()],
        trial_name_creator=trial_str_creator,
    )

    return XGBClassifier(**extract_classifier_params_from_config(analysis.best_config))


Also cc @Yard1 @kai

@clubmed could you share the stdout when the program gets stuck? Looking at your code everything seems correct.

Also when you run it again and it hangs, can you use py-spy to see where your script is hanging? You can just pip install py-spy and then while your python script is running, you can do py-spy dump --pid PID_OF_MY_PYTHON_SCRIPT.