How severely does this issue affect your experience of using Ray?
- High: It blocks me from completing my task.
When running the following example code from the xgboost-ray repo:
import argparse
import os
import xgboost_ray
from sklearn import datasets
from sklearn.model_selection import train_test_split
import ray
from ray import tune
from xgboost_ray import train, RayDMatrix, RayParams
def train_breast_cancer(config, ray_params):
    """Train an XGBoost model on the breast-cancer dataset via xgboost_ray.

    Args:
        config: XGBoost parameters, forwarded as ``params`` to ``train``.
        ray_params: ``RayParams`` instance forwarded to ``train``.

    Side effects:
        Saves the trained booster to ``tuned.xgb`` and prints the last
        recorded ``error`` value of the ``eval`` set.
    """
    features, targets = datasets.load_breast_cancer(return_X_y=True)

    # Hold out 25% of the samples for evaluation.
    x_train, x_test, y_train, y_test = train_test_split(
        features, targets, test_size=0.25)

    dtrain = RayDMatrix(x_train, y_train)
    dtest = RayDMatrix(x_test, y_test)

    # ``train`` fills this dict with the per-round evaluation metrics.
    history = {}
    booster = train(
        params=config,
        dtrain=dtrain,
        evals=[(dtest, "eval")],
        evals_result=history,
        ray_params=ray_params,
        verbose_eval=False,
        num_boost_round=10)

    # NOTE(review): this is a relative path, so the file is written to the
    # current working directory of whichever process runs this function --
    # presumably the trial directory on the node executing the trial; confirm.
    booster.save_model("tuned.xgb")

    final_error = history["eval"]["error"][-1]
    print("Final validation error: {:.4f}".format(final_error))
def main(cpus_per_actor, num_actors, num_samples):
    """Run a Tune search over XGBoost parameters and export the best model.

    Args:
        cpus_per_actor: CPUs assigned to each XGBoost training actor.
        num_actors: Number of XGBoost training actors per trial.
        num_samples: Number of samples passed to ``tune.run``.

    Side effects:
        Writes ``best_model.xgb`` and prints the best configuration and the
        accuracy derived from the best trial's ``eval-error``.
    """
    # Fixed XGBoost settings plus the three sampled hyperparameters.
    search_space = {
        "tree_method": "approx",
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "error"],
        "eta": tune.loguniform(1e-4, 1e-1),
        "subsample": tune.uniform(0.5, 1.0),
        "max_depth": tune.randint(1, 9),
    }

    ray_params = RayParams(
        max_actor_restarts=1,
        gpus_per_actor=0,
        cpus_per_actor=cpus_per_actor,
        num_actors=num_actors)

    # Bind ray_params so Tune only has to supply ``config`` to the trainable.
    trainable = tune.with_parameters(
        train_breast_cancer, ray_params=ray_params)

    analysis = tune.run(
        trainable,
        # Trial resources are derived from the actor settings above.
        resources_per_trial=ray_params.get_tune_resources(),
        config=search_space,
        num_samples=num_samples,
        metric="eval-error",
        mode="min")

    # NOTE(review): this requires ``tuned.xgb`` to exist under the best
    # trial's logdir on the machine running ``main``. Presumably that does not
    # hold when the trial ran on a different node of the cluster -- confirm.
    best_model_file = os.path.join(analysis.best_logdir, "tuned.xgb")
    best_bst = xgboost_ray.tune.load_model(best_model_file)
    best_bst.save_model("best_model.xgb")

    accuracy = 1. - analysis.best_result["eval-error"]
    print(f"Best model parameters: {analysis.best_config}")
    print(f"Best model total accuracy: {accuracy:.4f}")
if __name__ == "__main__":
    # Drop any Ray connection left over from a previous run in this process.
    ray.shutdown()

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--address",
        required=False,
        type=str,
        help="the address to use for Ray")
    parser.add_argument(
        "--server-address",
        required=False,
        type=str,
        help="Address of the remote server if using Ray Client.")
    parser.add_argument(
        "--cpus-per-actor",
        type=int,
        default=15,
        help="Sets number of CPUs per XGBoost training worker.")
    parser.add_argument(
        "--num-actors",
        type=int,
        default=20,
        help="Sets number of XGBoost workers to use.")
    parser.add_argument(
        "--num-samples",
        type=int,
        default=4,
        help="Number of samples to use for Tune.")
    parser.add_argument("--smoke-test", action="store_true", default=False)

    # Unknown arguments are tolerated and ignored.
    args, _ = parser.parse_known_args()

    if args.smoke_test:
        # Start a local Ray instance instead of joining a cluster.
        ray.init(num_cpus=args.num_actors * args.num_samples)
    elif args.server_address:
        ray.util.connect(args.server_address)
    else:
        # Honor --address when given (it was previously parsed but ignored);
        # otherwise keep the former behavior of connecting with "auto".
        ray.init(address=args.address or "auto")

    # Run the tuning job exactly once; the original invoked main() twice,
    # which repeated the whole search.
    main(args.cpus_per_actor, args.num_actors, args.num_samples)
I got the following error:
(train_breast_cancer pid=236691, ip=192.168.0.33) Final validation error: 0.0629
2022-08-21 10:19:45,356 INFO tune.py:748 – Total run time: 27.83 seconds (27.08 seconds for the tuning loop).
Traceback (most recent call last):
File “ray-example.py”, line 117, in
main(args.cpus_per_actor, args.num_actors, args.num_samples)
File “ray-example.py”, line 69, in main
os.path.join(analysis.best_logdir, “tuned.xgb”))
File “/usr/local/lib/python3.7/site-packages/xgboost_ray/tune.py”, line 157, in load_model
bst = load_model_fn(model_path)
File “/usr/local/lib/python3.7/site-packages/xgboost_ray/tune.py”, line 146, in load_model_fn
best_bst.load_model(model_path)
File “/usr/local/lib/python3.7/site-packages/xgboost/core.py”, line 2250, in load_model
self.handle, c_str(fname)))
File “/usr/local/lib/python3.7/site-packages/xgboost/core.py”, line 203, in _check_call
raise XGBoostError(py_str(_LIB.XGBGetLastError()))
xgboost.core.XGBoostError: [10:19:45] …/dmlc-core/src/io/local_filesys.cc:209: Check failed: allow_null: LocalFileSystem::Open “/root/ray_results/train_breast_cancer_2022-08-21_10-19-17/train_breast_cancer_b6b2d_00001_1_eta=0.0672,max_depth=6,subsample=0.6811_2022-08-21_10-19-27/tuned.xgb”: No such file or directory
Stack trace:
[bt] (0) /usr/local/lib/python3.7/site-packages/xgboost/lib/libxgboost.so(+0x729ffd) [0x7f11dcd13ffd]
[bt] (1) /usr/local/lib/python3.7/site-packages/xgboost/lib/libxgboost.so(+0x72a7d8) [0x7f11dcd147d8]
[bt] (2) /usr/local/lib/python3.7/site-packages/xgboost/lib/libxgboost.so(+0x6ff719) [0x7f11dcce9719]
[bt] (3) /usr/local/lib/python3.7/site-packages/xgboost/lib/libxgboost.so(XGBoosterLoadModel+0x1eb) [0x7f11dc697c7b]
[bt] (4) /usr/lib/x86_64-linux-gnu/libffi.so.6(ffi_call_unix64+0x4c) [0x7f12907838ee]
[bt] (5) /usr/lib/x86_64-linux-gnu/libffi.so.6(ffi_call+0x22f) [0x7f12907832bf]
[bt] (6) /usr/local/lib/python3.7/lib-dynload/_ctypes.cpython-37m-x86_64-linux-gnu.so(+0xd563) [0x7f1290794563]
[bt] (7) /usr/local/lib/python3.7/lib-dynload/_ctypes.cpython-37m-x86_64-linux-gnu.so(+0x12ac3) [0x7f1290799ac3]
[bt] (8) /usr/local/lib/libpython3.7m.so.1.0(_PyObject_FastCallKeywords+0x97) [0x7f1293f96377]
I checked the directory shown in the error log and couldn’t find tuned.xgb there either.
The tuning actually finished, but the best model does not seem to have been saved.
What is especially strange to me is that it did run through successfully several times, but then it couldn’t…
Other info:
- The Ray cluster was set up manually with 2 nodes
- All packages were installed in the standard way with pip