I’m trying to train a `RandomForestClassifier` in Ray with the following code and then run inference:
import ray
from ray.air.config import ScalingConfig
from ray.train.sklearn import SklearnTrainer
from sklearn.ensemble import RandomForestClassifier
from ray.train.batch_predictor import BatchPredictor
from ray.train.sklearn import SklearnPredictor
# Read the breast-cancer example CSV into a Ray Dataset.
dataset = ray.data.read_csv("s3://anonymous@air-example-data/breast_cancer.csv")

# Hold out 30% of the rows for validation; the remainder is used for training.
train_dataset, valid_dataset = dataset.train_test_split(test_size=0.3)

# The inference input is the validation split with the label column removed.
test_dataset = valid_dataset.drop_columns(cols=["target"])

# Preprocessor that standardizes two of the feature columns.
from ray.data.preprocessors import StandardScaler

scaled_columns = ["mean radius", "mean texture"]
preprocessor = StandardScaler(columns=scaled_columns)

# Build the trainer's arguments up front (in the same order the call uses
# them) so the SklearnTrainer construction below reads as plain wiring.
estimator = RandomForestClassifier()
datasets = {"train": train_dataset, "valid": valid_dataset}
scaling_config = ScalingConfig(trainer_resources={"CPU": 1})

trainer = SklearnTrainer(
    estimator=estimator,
    label_column="target",
    datasets=datasets,
    preprocessor=preprocessor,
    cv=5,
    scaling_config=scaling_config,
)

# Fit the model; the returned result carries the checkpoint used for inference.
result = trainer.fit()

# Restore a batch predictor from the checkpoint, run it over the unlabeled
# rows, and print a sample of the resulting {'predictions': ...} records.
batch_predictor = BatchPredictor.from_checkpoint(result.checkpoint, SklearnPredictor)
predicted_output = batch_predictor.predict(test_dataset)
predicted_output.show()
However, when I call `batch_predictor.predict(test_dataset)`, it outputs only the predicted class, like this:
{'predictions': 1}
{'predictions': 1}
{'predictions': 0}
{'predictions': 1}
Is there any way to get class probabilities instead, like `predict_proba(X)` in sklearn?