Best model not saved using ray tune for xgboost training

How severely does this issue affect your experience of using Ray?

  • High: It blocks me from completing my task.

When running the example code from the xgboost-ray repo,

import argparse
import os

import xgboost_ray
from sklearn import datasets
from sklearn.model_selection import train_test_split

import ray
from ray import tune

from xgboost_ray import train, RayDMatrix, RayParams


def train_breast_cancer(config, ray_params):
    """Train a distributed XGBoost model on the breast cancer dataset.

    Args:
        config: XGBoost parameters, forwarded to ``train`` as ``params``.
        ray_params: Settings forwarded to ``train`` as ``ray_params``.

    The trained booster is written to ``tuned.xgb``. That is a relative
    path, so it is resolved against the working directory of whichever
    process runs this function. The last recorded ``eval`` error is printed.
    """
    features, targets = datasets.load_breast_cancer(return_X_y=True)

    # Hold out a quarter of the samples for evaluation.
    x_train, x_eval, y_train, y_eval = train_test_split(
        features, targets, test_size=0.25)

    # Read back after training for the per-metric history of "eval".
    history = {}

    booster = train(
        params=config,
        dtrain=RayDMatrix(x_train, y_train),
        evals=[(RayDMatrix(x_eval, y_eval), "eval")],
        evals_result=history,
        ray_params=ray_params,
        verbose_eval=False,
        num_boost_round=10)

    booster.save_model("tuned.xgb")

    final_error = history["eval"]["error"][-1]
    print(f"Final validation error: {final_error:.4f}")


def main(cpus_per_actor, num_actors, num_samples):
    """Run the Tune search and export the best booster.

    Args:
        cpus_per_actor: Forwarded to ``RayParams`` as ``cpus_per_actor``.
        num_actors: Forwarded to ``RayParams`` as ``num_actors``.
        num_samples: Forwarded to ``tune.run`` as ``num_samples``.

    The model file of the best trial is read from that trial's log
    directory and re-saved as ``best_model.xgb``; the best configuration
    and its accuracy (``1 - eval-error``) are printed.
    """
    actor_settings = RayParams(
        max_actor_restarts=1,
        gpus_per_actor=0,
        cpus_per_actor=cpus_per_actor,
        num_actors=num_actors)

    # Fixed XGBoost settings plus the three searched hyperparameters.
    search_space = {
        "tree_method": "approx",
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "error"],
        "eta": tune.loguniform(1e-4, 1e-1),
        "subsample": tune.uniform(0.5, 1.0),
        "max_depth": tune.randint(1, 9)
    }

    trainable = tune.with_parameters(
        train_breast_cancer, ray_params=actor_settings)

    analysis = tune.run(
        trainable,
        # Resources per trial come from the `get_tune_resources` helper.
        resources_per_trial=actor_settings.get_tune_resources(),
        config=search_space,
        num_samples=num_samples,
        metric="eval-error",
        mode="min")

    # Pick up the model file written by the best trial and re-export it.
    best_model_file = os.path.join(analysis.best_logdir, "tuned.xgb")
    best_bst = xgboost_ray.tune.load_model(best_model_file)
    best_bst.save_model("best_model.xgb")

    best_error = analysis.best_result["eval-error"]
    print(f"Best model parameters: {analysis.best_config}")
    print(f"Best model total accuracy: {1. - best_error:.4f}")


if __name__ == "__main__":
    # Presumably here to drop a Ray connection left over from an earlier run
    # in the same process before a new one is set up below.
    ray.shutdown()

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--address",
        required=False,
        type=str,
        help="the address to use for Ray")
    parser.add_argument(
        "--server-address",
        required=False,
        type=str,
        help="Address of the remote server if using Ray Client.")
    parser.add_argument(
        "--cpus-per-actor",
        type=int,
        default=15,
        help="Sets number of CPUs per XGBoost training worker.")
    parser.add_argument(
        "--num-actors",
        type=int,
        default=20,
        help="Sets number of XGBoost workers to use.")
    parser.add_argument(
        "--num-samples",
        type=int,
        default=4,
        help="Number of samples to use for Tune.")
    parser.add_argument("--smoke-test", action="store_true", default=False)

    # Unknown arguments are tolerated rather than rejected.
    args, _ = parser.parse_known_args()

    if args.smoke_test:
        # Start a local Ray instance instead of joining a cluster.
        ray.init(num_cpus=args.num_actors * args.num_samples)
    elif args.server_address:
        ray.util.connect(args.server_address)
    else:
        # Honor --address when given; otherwise attach to the running
        # cluster via "auto", as before.
        ray.init(address=args.address or "auto")

    # Run the tuning job exactly once (the duplicated call repeated the
    # whole search a second time).
    main(args.cpus_per_actor, args.num_actors, args.num_samples)

I got the following error:

(train_breast_cancer pid=236691, ip=192.168.0.33) Final validation error: 0.0629
2022-08-21 10:19:45,356 INFO tune.py:748 -- Total run time: 27.83 seconds (27.08 seconds for the tuning loop).
Traceback (most recent call last):
  File "ray-example.py", line 117, in <module>
    main(args.cpus_per_actor, args.num_actors, args.num_samples)
  File "ray-example.py", line 69, in main
    os.path.join(analysis.best_logdir, "tuned.xgb"))
  File "/usr/local/lib/python3.7/site-packages/xgboost_ray/tune.py", line 157, in load_model
    bst = load_model_fn(model_path)
  File "/usr/local/lib/python3.7/site-packages/xgboost_ray/tune.py", line 146, in load_model_fn
    best_bst.load_model(model_path)
  File "/usr/local/lib/python3.7/site-packages/xgboost/core.py", line 2250, in load_model
    self.handle, c_str(fname)))
  File "/usr/local/lib/python3.7/site-packages/xgboost/core.py", line 203, in _check_call
    raise XGBoostError(py_str(_LIB.XGBGetLastError()))
xgboost.core.XGBoostError: [10:19:45] …/dmlc-core/src/io/local_filesys.cc:209: Check failed: allow_null: LocalFileSystem::Open "/root/ray_results/train_breast_cancer_2022-08-21_10-19-17/train_breast_cancer_b6b2d_00001_1_eta=0.0672,max_depth=6,subsample=0.6811_2022-08-21_10-19-27/tuned.xgb": No such file or directory
Stack trace:
[bt] (0) /usr/local/lib/python3.7/site-packages/xgboost/lib/libxgboost.so(+0x729ffd) [0x7f11dcd13ffd]
[bt] (1) /usr/local/lib/python3.7/site-packages/xgboost/lib/libxgboost.so(+0x72a7d8) [0x7f11dcd147d8]
[bt] (2) /usr/local/lib/python3.7/site-packages/xgboost/lib/libxgboost.so(+0x6ff719) [0x7f11dcce9719]
[bt] (3) /usr/local/lib/python3.7/site-packages/xgboost/lib/libxgboost.so(XGBoosterLoadModel+0x1eb) [0x7f11dc697c7b]
[bt] (4) /usr/lib/x86_64-linux-gnu/libffi.so.6(ffi_call_unix64+0x4c) [0x7f12907838ee]
[bt] (5) /usr/lib/x86_64-linux-gnu/libffi.so.6(ffi_call+0x22f) [0x7f12907832bf]
[bt] (6) /usr/local/lib/python3.7/lib-dynload/_ctypes.cpython-37m-x86_64-linux-gnu.so(+0xd563) [0x7f1290794563]
[bt] (7) /usr/local/lib/python3.7/lib-dynload/_ctypes.cpython-37m-x86_64-linux-gnu.so(+0x12ac3) [0x7f1290799ac3]
[bt] (8) /usr/local/lib/libpython3.7m.so.1.0(_PyObject_FastCallKeywords+0x97) [0x7f1293f96377]

I checked the directory shown in the error log and couldn’t find tuned.xgb there either. The tuning actually finished, but the best model does not seem to have been saved.

What is especially strange to me is that it did run through successfully several times, but then it stopped working…

Other info:

  • ray-cluster is manually set up with 2 nodes
  • all packages are installed in a standard manner with pip

I think you need to do something similar to the following so that Tune takes care of the checkpoint you save in a distributed setup.

# Attach Tune's checkpoint callback to the training call; "model.xgb" is the
# filename handed to the callback.
xgb.train(
    config,
    train_set,
    evals=[(test_set, "eval")],
    verbose_eval=False,
    callbacks=[TuneReportCheckpointCallback(filename="model.xgb")],
)

Notice the callbacks section here.

See Tuning XGBoost parameters — Ray 1.13.0

As a side note, we are migrating away from this API in favor of Ray AIR. Please also take a look here: Hyperparameter tuning with XGBoostTrainer — Ray 3.0.0.dev0

1 Like