Trainable too slow to initialize

I’m using PB2 to tune a Random Forest classifier from scikit-learn. Here’s my code snippet:

# PB2 scheduler: every 20 training iterations the bottom quarter of trials
# copies a top-quarter trial and gets new values for the bounded hyperparameters.
# The metric, mode and resample_probability options are deliberately left unset
# on the scheduler itself.
pbt = PB2(
    time_attr="training_iteration",
    perturbation_interval=20,
    quantile_fraction=0.25,  # copy bottom % with top %
    log_config=True,
    # Search space ([min, max]) for the hyperparameters PB2 may perturb.
    hyperparam_bounds=dict(
        n_estimators=[50, 200],
        min_samples_split=[2, 6],
        min_samples_leaf=[1, 4],
    ),
)
    

    start = time.perf_counter()
    # exist_ok=True replaces the separate exists() check, which used a
    # differently spelled path and could race with another process.
    os.makedirs('./tuning/'+var, exist_ok=True)
    output = f"tuning/{var}/RandomForestClassifier_{var}_.csv"
    # Write the header line through a context manager so the handle is closed.
    with open(output, "w") as log_file:
        print('Working with '+var+' dataset...', file=log_file)
    print('Working with '+var+' dataset...')
    analysis = run(
        RF_PB2,
        name=f"RandomForestClassifier_PB2_{var}",
        verbose=0,
        scheduler=pbt,
        # Reuse actors between trials; the trainable must then apply the new
        # config itself in reset_config().
        reuse_actors=True,
        local_dir="./tune_results",
        #resources_per_trial={
        ##    "cpu": 1,
        #    "gpu": 1
        #},
        #global_checkpoint_period=np.inf,   # Do not save checkpoints based on time interval
        checkpoint_freq=20,          # Save a checkpoint every 20 training iterations
        checkpoint_at_end=True,      # ...and once more when the trial finishes
        keep_checkpoints_num=2,      # Keep only the 2 best checkpoints per trial
        checkpoint_score_attr='mean_accuracy',  # Metric used to rank checkpoints
        metric="mean_accuracy",
        mode="max",
        stop={
            "training_iteration": 50,
        },
        num_samples=2,
        fail_fast=True,
        queue_trials=True,
        config={ #https://www.geeksforgeeks.org/hyperparameters-of-random-forest-classifier/
            "var": var,
            "n_estimators" : tune.randint(50, 200),
            "min_samples_split" : tune.randint(2, 6),
            "min_samples_leaf" : tune.randint(1, 4),
            "criterion" : tune.choice(["gini", "entropy"]),
            "max_features" : tune.choice(["sqrt", "log2"]),
            "class_weight" : tune.choice(["balanced", "balanced_subsample"])
    })

It takes forever for the trials to start sampling and tuning, and I get these messages in my logs:

(pid=23487) 2021-03-07 14:11:42,588 INFO trainable.py:99 -- Trainable.setup took 24.839 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.
(pid=23482) 2021-03-07 14:11:43,058 INFO trainable.py:99 -- Trainable.setup took 25.309 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.
2021-03-07 15:02:34,648 INFO pbt.py:481 -- [pbt]: no checkpoint for trial. Skip exploit for Trial RF_PB2_3ce57_00000

Can someone please help me on what’s going on? I can share my whole script if it’s of any use.
Data frame shape - (1348645, 86)

Any help is appreciated!
Thanks in advance!

Hmm, what are you doing in RF_PB2?

Also, it’s unlikely that PB2 is going to help you with scikit-learn estimators that don’t support partial_fit. PB2 requires the training to be iterative (while scikit-learn estimators are not).

Have you tried tune-sklearn (GitHub: ray-project/tune-sklearn)? It is a drop-in replacement for Scikit-Learn’s GridSearchCV / RandomizedSearchCV, but with cutting-edge hyperparameter tuning techniques.

This is my Trainable class (RF_PB2):

class RF_PB2(Trainable):  #https://docs.ray.io/en/master/tune/examples/pbt_tune_cifar10_with_keras.html
    """Ray Tune trainable that scores a RandomForestClassifier with 10-fold CV.

    Each ``step`` cross-validates the current estimator on the training data
    and reports the mean test-fold accuracy as ``mean_accuracy``.

    NOTE(review): ``cross_validate`` fits clones of the estimator, so
    ``self.model`` itself is never fitted and every step repeats the same,
    non-incremental evaluation. The checkpoints therefore only carry the
    estimator's hyperparameters, not trained state.
    """

    @staticmethod
    def _clean_features(frame, ml_var):
        """Drop the ``ml_var`` column(s) and replace inf/NaN values with 0."""
        return frame.drop(ml_var, axis=1).replace([np.inf, -np.inf], np.nan).fillna(0)

    @staticmethod
    def _model_params(config):
        """Return the RandomForestClassifier hyperparameters held in ``config``.

        Keys missing from ``config`` fall back to the defaults used here.
        """
        return {
            "n_estimators": config.get("n_estimators", 100),
            "min_samples_split": config.get("min_samples_split", 2),
            "min_samples_leaf": config.get("min_samples_leaf", 1),
            "criterion": config.get("criterion", "gini"),
            "max_features": config.get("max_features", "sqrt"),
            "class_weight": config.get("class_weight", "balanced"),
        }

    def _build_model(self, config):
        """Create a fresh, unfitted classifier configured from ``config``."""
        return RandomForestClassifier(random_state=42, n_jobs=-1, **self._model_params(config))

    def _read_data(self, config):
        """Load the train/test CSV files for the dataset named by ``config["var"]``.

        Returns:
            Tuple ``(x_train, x_test, y_train, y_test, feature_names)``: the
            feature matrices as arrays, the labels binarized over
            ``['low_impact', 'high_impact']`` and flattened, and the training
            feature column names.
        """
        # NOTE(review): chdir changes the working directory of the whole
        # process; it is kept because the relative paths below depend on it.
        os.chdir('/data/project/worthey_lab/projects/experimental_pipelines/tarun/ditto/data/processed/')
        with open("../../configs/columns_config.yaml") as fh:
            config_dict = yaml.safe_load(fh)
        var = config.get("var")
        ml_var = config_dict['ML_VAR']
        classes = ['low_impact', 'high_impact']

        train_frame = self._clean_features(pd.read_csv(f'train_{var}/merged_data-train_{var}.csv'), ml_var)
        feature_names = train_frame.columns.tolist()
        x_train = train_frame.values
        y_train = pd.read_csv(f'train_{var}/merged_data-y-train_{var}.csv')
        y_train = label_binarize(y_train.values, classes=classes).ravel()

        # The test features get the same inf/NaN sanitising as the training
        # features so both matrices are prepared consistently.
        test_frame = self._clean_features(pd.read_csv(f'test_{var}/merged_data-test_{var}.csv'), ml_var)
        x_test = test_frame.values
        y_test = pd.read_csv(f'test_{var}/merged_data-y-test_{var}.csv')
        print('Data Loaded!')
        y_test = label_binarize(y_test.values, classes=classes).ravel()
        return x_train, x_test, y_train, y_test, feature_names

    def _load_dataset(self, config):
        """Read the data for ``config["var"]`` into attributes and remember which dataset is loaded."""
        self.x_train, self.x_test, self.y_train, self.y_test, self.feature_names = self._read_data(config)
        self._loaded_var = config.get("var")

    def setup(self, config):
        """Load the dataset and build the estimator for this trial."""
        self._load_dataset(config)
        self.model = self._build_model(self.config)

    def reset_config(self, new_config):
        """Reconfigure a reused actor for ``new_config``.

        Rebuilds the estimator so the new hyperparameters actually take
        effect, and reloads the data only when the dataset name changed.

        Returns:
            True, signalling that the actor was reconfigured in place.
        """
        if new_config.get("var") != getattr(self, "_loaded_var", None):
            self._load_dataset(new_config)
        self.n_estimators = new_config["n_estimators"]
        self.min_samples_split = new_config["min_samples_split"]
        self.min_samples_leaf = new_config["min_samples_leaf"]
        self.criterion = new_config["criterion"]
        self.max_features = new_config["max_features"]
        self.class_weight = new_config["class_weight"]
        self.config = new_config
        # Without this the reused actor would keep scoring the old estimator.
        self.model = self._build_model(new_config)
        return True

    def step(self):
        """Run one 10-fold cross-validation and report the mean test accuracy."""
        # Only 'test_score' is read, so train scores and fitted estimators are
        # not requested; this leaves the reported metric unchanged.
        cv_results = cross_validate(self.model, self.x_train, self.y_train, cv=10, n_jobs=-1, verbose=0)
        testing_score = np.mean(cv_results['test_score'])
        return {"mean_accuracy": testing_score}

    def save_checkpoint(self, checkpoint_dir):
        """Pickle the estimator to ``<checkpoint_dir>/model`` and return that path."""
        file_path = checkpoint_dir + "/model"
        with open(file_path, 'wb') as fh:
            pickle.dump(self.model, fh)
        return file_path

    def load_checkpoint(self, path):
        """Restore the estimator pickled at ``path``."""
        # NOTE: pickle must only be used on checkpoints this job wrote itself.
        # Assign only after a successful load so a failed read keeps the
        # current model.
        with open(path, 'rb') as fh:
            restored = pickle.load(fh)
        # The checkpoint may have been written by another trial (PBT/PB2
        # exploit), so re-apply this trial's own hyperparameters.
        restored.set_params(**self._model_params(self.config))
        self.model = restored

    def cleanup(self):
        """Nothing to release; the model is persisted through checkpoints."""
        pass

So, PB2 is only useful for Neural Networks?
Thanks for the link!

Yes, I would recommend using tune-sklearn here! PB2 should mostly be used for iterative training procedures, which includes neural networks.