Xgboost_ray crashes when used for multiclass text classification

I modified the official example to train a multi-class text classification model with CountVectorizer as the preprocessor on a local cluster, but I keep getting this error:

RuntimeError: A Ray actor died during training and the maximum number of retries (0) is exhausted.

I am running Ray v2.2.0 on Python 3.9.x on an M1 Mac.

Here is my code:

import ray
from ray.air.config import ScalingConfig
from ray.train.xgboost import XGBoostTrainer
from ray.data.preprocessors import CountVectorizer
import pandas as pd

from sklearn.datasets import fetch_20newsgroups


twenty_train = fetch_20newsgroups(subset='train', shuffle=True, random_state=42)

# Create a pandas DataFrame from a small subset of the data
df = pd.DataFrame({"text": twenty_train.data[:100], "target": twenty_train.target[:100]})
ds = ray.data.from_pandas(df)

# Split data into train and validation.
train_dataset, valid_dataset = ds.train_test_split(test_size=0.2, seed=42)

preprocessor = CountVectorizer(columns=["text"])
# XGBoost specific params
params = {
    "tree_method": "approx",
    "objective": "multi:softmax",
    "eval_metric": ["merror"],
    "num_class": df['target'].nunique()
}

trainer = XGBoostTrainer(
    scaling_config=ScalingConfig(num_workers=2, use_gpu=False),
    label_column="target",
    params=params,
    datasets={"train": train_dataset, "valid": valid_dataset},
    preprocessor=preprocessor,
    num_boost_round=2,
)

result = trainer.fit()

Hey, the actual exception can be seen in the logs above:

(_RemoteRayXGBoostActor pid=11076) Exception in thread Thread-2:
(_RemoteRayXGBoostActor pid=11076) Traceback (most recent call last):
(_RemoteRayXGBoostActor pid=11076)   File "/home/ray/anaconda3/lib/python3.9/threading.py", line 973, in _bootstrap_inner
(_RemoteRayXGBoostActor pid=11076)     self.run()
(_RemoteRayXGBoostActor pid=11076)   File "/home/ray/anaconda3/lib/python3.9/threading.py", line 910, in run
(_RemoteRayXGBoostActor pid=11076)     self._target(*self._args, **self._kwargs)
(_RemoteRayXGBoostActor pid=11076)   File "/home/ray/anaconda3/lib/python3.9/site-packages/xgboost_ray/main.py", line 612, in _train
(_RemoteRayXGBoostActor pid=11076)     local_dtrain = _get_dmatrix(dtrain, self._data[dtrain])
(_RemoteRayXGBoostActor pid=11076)   File "/home/ray/anaconda3/lib/python3.9/site-packages/xgboost_ray/main.py", line 345, in _get_dmatrix
(_RemoteRayXGBoostActor pid=11076)     matrix = xgb.DMatrix(**param)
(_RemoteRayXGBoostActor pid=11076)   File "/home/ray/anaconda3/lib/python3.9/site-packages/xgboost/core.py", line 620, in inner_f
(_RemoteRayXGBoostActor pid=11076)     return func(**kwargs)
(_RemoteRayXGBoostActor pid=11076)   File "/home/ray/anaconda3/lib/python3.9/site-packages/xgboost/core.py", line 766, in __init__
(_RemoteRayXGBoostActor pid=11076)     self.feature_names = feature_names
(_RemoteRayXGBoostActor pid=11076)   File "/home/ray/anaconda3/lib/python3.9/site-packages/xgboost/core.py", line 1164, in feature_names
(_RemoteRayXGBoostActor pid=11076)     raise ValueError('feature_names must be string, and may not contain [, ] or <')
(_RemoteRayXGBoostActor pid=11076) ValueError: feature_names must be string, and may not contain [, ] or <
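
You can reproduce that ValueError outside of Ray with a minimal sketch (the feature name below is made up, but it has the same shape as the names CountVectorizer produces from raw text tokens):

import numpy as np
import xgboost as xgb

# XGBoost validates feature names and rejects any containing '[', ']' or '<'.
try:
    xgb.DMatrix(np.zeros((1, 1)), feature_names=["some<token"])
except ValueError as e:
    print(e)  # feature_names must be string, and may not contain [, ] or <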

The problem is that the CountVectorizer preprocessor creates columns whose names contain exactly those characters XGBoost rejects. We can easily fix that by adding a BatchMapper preprocessor that sanitizes the column names, replacing each offending character with its name:

import ray
from ray.air.config import ScalingConfig
from ray.train.xgboost import XGBoostTrainer
from ray.data.preprocessors import BatchMapper, Chain, CountVectorizer
import pandas as pd

from sklearn.datasets import fetch_20newsgroups


twenty_train = fetch_20newsgroups(subset="train", shuffle=True, random_state=42)

# Create a pandas DataFrame from a small subset of the data
df = pd.DataFrame(
    {"text": twenty_train.data[:100], "target": twenty_train.target[:100]}
)
ds = ray.data.from_pandas(df)

# Split data into train and validation.
train_dataset, valid_dataset = ds.train_test_split(test_size=0.2, seed=42)


def fix_col_names(batch: pd.DataFrame) -> pd.DataFrame:
    batch.columns = [
        colname.replace(":", "COLON")
        .replace(",", "COMMA")
        .replace("<", "LT")
        .replace(">", "GT")
        .replace("[", "LBRACKET")
        .replace("]", "RBRACKET")
        for colname in batch.columns
    ]
    return batch


preprocessor = Chain(
    CountVectorizer(columns=["text"]), BatchMapper(fix_col_names, batch_format="pandas")
)
# XGBoost specific params
params = {
    "tree_method": "approx",
    "objective": "multi:softmax",
    "eval_metric": ["merror"],
    "num_class": df["target"].nunique(),
}

trainer = XGBoostTrainer(
    scaling_config=ScalingConfig(num_workers=2, use_gpu=False),
    label_column="target",
    params=params,
    datasets={"train": train_dataset, "valid": valid_dataset},
    preprocessor=preprocessor,
    num_boost_round=2,
)

result = trainer.fit()
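
This script should run successfully. Once training finishes, you can inspect the returned Result object; a small sketch (the exact metric keys depend on the eval_metric set above):

# Reported metrics, e.g. "train-merror" and "valid-merror".
print(result.metrics)
# Checkpoint holding the fitted preprocessor and the trained booster.
print(result.checkpoint)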

Let me know if you have any other questions, and please mark this answer as the solution if it worked for you!


Thanks! It works when I limit the data to the first 100 rows, but when I use the full dataset, I get a new error:

RayTaskError(IndexError): ray::_Inner.train() (pid=41186, ip=127.0.0.1, repr=XGBoostTrainer)
  File "/opt/anaconda3/envs/dask/lib/python3.9/site-packages/ray/tune/trainable/trainable.py", line 367, in train
    raise skipped from exception_cause(skipped)
  File "/opt/anaconda3/envs/dask/lib/python3.9/site-packages/ray/tune/trainable/function_trainable.py", line 335, in entrypoint
    return self._trainable_func(
  File "/opt/anaconda3/envs/dask/lib/python3.9/site-packages/ray/train/base_trainer.py", line 480, in _trainable_func
    super()._trainable_func(self._merged_config, reporter, checkpoint_dir)
  File "/opt/anaconda3/envs/dask/lib/python3.9/site-packages/ray/tune/trainable/function_trainable.py", line 652, in _trainable_func
    output = fn()
  File "/opt/anaconda3/envs/dask/lib/python3.9/site-packages/ray/train/base_trainer.py", line 389, in train_func
    trainer.preprocess_datasets()
  File "/opt/anaconda3/envs/dask/lib/python3.9/site-packages/ray/train/gbdt_trainer.py", line 187, in preprocess_datasets
    super().preprocess_datasets()
  File "/opt/anaconda3/envs/dask/lib/python3.9/site-packages/ray/train/base_trainer.py", line 299, in preprocess_datasets
    self.preprocessor.fit(train_dataset)
  File "/opt/anaconda3/envs/dask/lib/python3.9/site-packages/ray/data/preprocessor.py", line 105, in fit
    return self._fit(dataset)
  File "/opt/anaconda3/envs/dask/lib/python3.9/site-packages/ray/data/preprocessors/chain.py", line 73, in _fit
    ds = preprocessor.fit_transform(ds)
  File "/opt/anaconda3/envs/dask/lib/python3.9/site-packages/ray/data/preprocessor.py", line 120, in fit_transform
    self.fit(dataset)
  File "/opt/anaconda3/envs/dask/lib/python3.9/site-packages/ray/data/preprocessor.py", line 105, in fit
    return self._fit(dataset)
  File "/opt/anaconda3/envs/dask/lib/python3.9/site-packages/ray/data/preprocessors/vectorizer.py", line 233, in _fit
    total_counts[i].update(col_value_counts)
IndexError: list index out of range

Thanks, I can reproduce this. Looks to be a bug. Will keep you updated on the fix!


@Yard1 any updates on this topic? I've got an issue that seems related: Does xgboost ray supports multi-output, many y labels?