Best model not saved using ray tune for xgboost training

Faaany · August 21, 2022, 10:31am

How severely does this issue affect your experience of using Ray?

High: It blocks me from completing my task.

When running the following example code from the xgboost-ray repo:

import argparse
import os

import xgboost_ray
from sklearn import datasets
from sklearn.model_selection import train_test_split

import ray
from ray import tune

from xgboost_ray import train, RayDMatrix, RayParams


def train_breast_cancer(config, ray_params):
    """Train one XGBoost model on the breast-cancer dataset and save it.

    Args:
        config: XGBoost parameters, forwarded to ``train`` as ``params``.
        ray_params: ``RayParams`` instance forwarded to ``train``.
    """
    features, targets = datasets.load_breast_cancer(return_X_y=True)

    # Hold out 25% of the samples for evaluation.
    x_train, x_eval, y_train, y_eval = train_test_split(
        features, targets, test_size=0.25)

    dtrain = RayDMatrix(x_train, y_train)
    deval = RayDMatrix(x_eval, y_eval)

    # Populated by ``train``; the last "error" value of the "eval" set is
    # read back below.
    history = {}

    booster = train(
        params=config,
        dtrain=dtrain,
        evals=[(deval, "eval")],
        evals_result=history,
        ray_params=ray_params,
        verbose_eval=False,
        num_boost_round=10)

    # NOTE: relative path, so the file lands in the working directory of
    # whichever process executes this function.
    booster.save_model("tuned.xgb")

    final_error = history["eval"]["error"][-1]
    print("Final validation error: {:.4f}".format(final_error))


def main(cpus_per_actor, num_actors, num_samples):
    """Run the Tune search, then reload and re-save the best trial's model.

    Args:
        cpus_per_actor: Value for ``RayParams.cpus_per_actor``.
        num_actors: Value for ``RayParams.num_actors``.
        num_samples: Forwarded to ``tune.run`` as ``num_samples``.
    """
    # Fixed XGBoost settings plus the three hyperparameters Tune samples.
    search_space = {
        "tree_method": "approx",
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "error"],
        "eta": tune.loguniform(1e-4, 1e-1),
        "subsample": tune.uniform(0.5, 1.0),
        "max_depth": tune.randint(1, 9)
    }

    actor_params = RayParams(
        max_actor_restarts=1,
        gpus_per_actor=0,
        cpus_per_actor=cpus_per_actor,
        num_actors=num_actors)

    trainable = tune.with_parameters(
        train_breast_cancer, ray_params=actor_params)
    # `get_tune_resources` derives the per-trial resource request from the
    # actor settings above.
    trial_resources = actor_params.get_tune_resources()

    results = tune.run(
        trainable,
        resources_per_trial=trial_resources,
        config=search_space,
        num_samples=num_samples,
        metric="eval-error",
        mode="min")

    # The training function saves its model as "tuned.xgb"; look for it in
    # the best trial's log directory.
    best_model_file = os.path.join(results.best_logdir, "tuned.xgb")
    best_bst = xgboost_ray.tune.load_model(best_model_file)

    best_bst.save_model("best_model.xgb")

    best_error = results.best_result["eval-error"]
    accuracy = 1. - best_error
    print(f"Best model parameters: {results.best_config}")
    print(f"Best model total accuracy: {accuracy:.4f}")


if __name__ == "__main__":
    # Start from a clean state in case this process already holds a Ray
    # connection (e.g. when re-running in an interactive session).
    ray.shutdown()

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--address",
        required=False,
        type=str,
        help="the address to use for Ray")
    parser.add_argument(
        "--server-address",
        required=False,
        type=str,
        help="Address of the remote server if using Ray Client.")
    parser.add_argument(
        "--cpus-per-actor",
        type=int,
        default=15,
        help="Sets number of CPUs per XGBoost training worker.")
    parser.add_argument(
        "--num-actors",
        type=int,
        default=20,
        help="Sets number of XGBoost workers to use.")
    parser.add_argument(
        "--num-samples",
        type=int,
        default=4,
        help="Number of samples to use for Tune.")
    parser.add_argument("--smoke-test", action="store_true", default=False)

    # Unknown arguments are tolerated and ignored.
    args, _ = parser.parse_known_args()

    if args.smoke_test:
        # Local run: one CPU per actor for every sampled trial.
        ray.init(num_cpus=args.num_actors * args.num_samples)
    elif args.server_address:
        ray.util.connect(args.server_address)
    else:
        # Honor --address when it is given; otherwise keep the previous
        # behavior of attaching to an already running cluster ("auto").
        ray.init(address=args.address or "auto")

    # Run the search exactly once; a second back-to-back call would repeat
    # the whole tuning run.
    main(args.cpus_per_actor, args.num_actors, args.num_samples)

I got the following error:

(train_breast_cancer pid=236691, ip=192.168.0.33) Final validation error: 0.0629
2022-08-21 10:19:45,356 INFO tune.py:748 -- Total run time: 27.83 seconds (27.08 seconds for the tuning loop).
Traceback (most recent call last):
  File "ray-example.py", line 117, in <module>
    main(args.cpus_per_actor, args.num_actors, args.num_samples)
  File "ray-example.py", line 69, in main
    os.path.join(analysis.best_logdir, "tuned.xgb"))
  File "/usr/local/lib/python3.7/site-packages/xgboost_ray/tune.py", line 157, in load_model
    bst = load_model_fn(model_path)
  File "/usr/local/lib/python3.7/site-packages/xgboost_ray/tune.py", line 146, in load_model_fn
    best_bst.load_model(model_path)
  File "/usr/local/lib/python3.7/site-packages/xgboost/core.py", line 2250, in load_model
    self.handle, c_str(fname)))
  File "/usr/local/lib/python3.7/site-packages/xgboost/core.py", line 203, in _check_call
    raise XGBoostError(py_str(_LIB.XGBGetLastError()))
xgboost.core.XGBoostError: [10:19:45] ../dmlc-core/src/io/local_filesys.cc:209: Check failed: allow_null: LocalFileSystem::Open "/root/ray_results/train_breast_cancer_2022-08-21_10-19-17/train_breast_cancer_b6b2d_00001_1_eta=0.0672,max_depth=6,subsample=0.6811_2022-08-21_10-19-27/tuned.xgb": No such file or directory
Stack trace:
[bt] (0) /usr/local/lib/python3.7/site-packages/xgboost/lib/libxgboost.so(+0x729ffd) [0x7f11dcd13ffd]
[bt] (1) /usr/local/lib/python3.7/site-packages/xgboost/lib/libxgboost.so(+0x72a7d8) [0x7f11dcd147d8]
[bt] (2) /usr/local/lib/python3.7/site-packages/xgboost/lib/libxgboost.so(+0x6ff719) [0x7f11dcce9719]
[bt] (3) /usr/local/lib/python3.7/site-packages/xgboost/lib/libxgboost.so(XGBoosterLoadModel+0x1eb) [0x7f11dc697c7b]
[bt] (4) /usr/lib/x86_64-linux-gnu/libffi.so.6(ffi_call_unix64+0x4c) [0x7f12907838ee]
[bt] (5) /usr/lib/x86_64-linux-gnu/libffi.so.6(ffi_call+0x22f) [0x7f12907832bf]
[bt] (6) /usr/local/lib/python3.7/lib-dynload/_ctypes.cpython-37m-x86_64-linux-gnu.so(+0xd563) [0x7f1290794563]
[bt] (7) /usr/local/lib/python3.7/lib-dynload/_ctypes.cpython-37m-x86_64-linux-gnu.so(+0x12ac3) [0x7f1290799ac3]
[bt] (8) /usr/local/lib/libpython3.7m.so.1.0(_PyObject_FastCallKeywords+0x97) [0x7f1293f96377]

I checked the directory shown in the error log and couldn't find tuned.xgb there either. The tuning actually finished, but the best model does not seem to have been saved.

What is especially strange to me is that it did run through successfully several times, but then it no longer could…

Other info:

ray-cluster is manually set up with 2 nodes
all packages are installed in a standard manner with pip

xwjiang2010 · August 22, 2022, 6:16pm

I think you need to do something similar to this to have Tune take care of your checkpoint saved in a distributed setup.

xgb.train(
         config,
         train_set,
         evals=[(test_set, "eval")],
         verbose_eval=False,
         # The checkpoint callback is the relevant part: it lets Tune take
         # care of the saved model file ("model.xgb").
         callbacks=[TuneReportCheckpointCallback(filename="model.xgb")],
     )

Notice the callbacks section here.

See Tuning XGBoost parameters — Ray 1.13.0

As a side note, we are migrating away from this API in favor of Ray AIR. Please also take a look here: Hyperparameter tuning with XGBoostTrainer — Ray 3.0.0.dev0

Topic		Replies	Views
Model training is slower in Ray Tune Ray Tune	8	949	June 30, 2023
Tuning XGBoost with PBT Ray Tune	8	1211	April 22, 2021
Tune xgboost with cross-validation? Ray Tune	2	811	September 22, 2021
[Tune] Will `pip install xgboost-ray==0.1.0` work with ray 2.0.dev? Ray Tune	4	404	May 25, 2021
Distributed data loading using Ray Data with XGBoost official (or XGBoost Sklearn) model	1	313	August 26, 2022

Best model not saved using ray tune for xgboost training

Related topics