StatusCode.RESOURCE_EXHAUSTED

I am using Ray Tune on a Google Colab TPU large instance. With a small dataset it runs, but with my full-size dataset (300 MB) I get the fatal error pasted below. The data is stored as a parquet on Google Drive and makes its way into tune.run via a "partial"ing of my training function and a Pandas dataframe.
I cannot tell if there is a setting I can configure to allow a larger "message" in the grpc module.

Python 3.7
ray[tune] 1.9.0

Any insight is appreciated.
Thanks,
tf

/usr/local/lib/python3.7/dist-packages/ray/tune/tune.py in run(run_or_experiment, name, metric, mode, stop, time_budget_s, config, resources_per_trial, num_samples, local_dir, search_alg, scheduler, keep_checkpoints_num, checkpoint_score_attr, checkpoint_freq, checkpoint_at_end, verbose, progress_reporter, log_to_file, trial_name_creator, trial_dirname_creator, sync_config, export_formats, max_failures, fail_fast, restore, server_port, resume, reuse_actors, trial_executor, raise_on_failed_trial, callbacks, max_concurrent_trials, queue_trials, loggers, _remote)
    442                 export_formats=export_formats,
    443                 max_failures=max_failures,
--> 444                 restore=restore)
    445     else:
    446         logger.debug("Ignoring some parameters passed into tune.run.")

/usr/local/lib/python3.7/dist-packages/ray/tune/experiment.py in __init__(self, name, run, stop, time_budget_s, config, resources_per_trial, num_samples, local_dir, sync_config, trial_name_creator, trial_dirname_creator, log_to_file, checkpoint_freq, checkpoint_at_end, keep_checkpoints_num, checkpoint_score_attr, export_formats, max_failures, restore)
    111                     "checkpointable function. You can specify checkpoints "
    112                     "within your trainable function.")
--> 113         self._run_identifier = Experiment.register_if_needed(run)
    114         self.name = name or self._run_identifier
    115 

/usr/local/lib/python3.7/dist-packages/ray/tune/experiment.py in register_if_needed(cls, run_object)
    256                     "No name detected on trainable. Using {}.".format(name))
    257             try:
--> 258                 register_trainable(name, run_object)
    259             except (TypeError, PicklingError) as e:
    260                 extra_msg = ("Other options: "

/usr/local/lib/python3.7/dist-packages/ray/tune/registry.py in register_trainable(name, trainable, warn)
     74         raise TypeError("Second argument must be convertable to Trainable",
     75                         trainable)
---> 76     _global_registry.register(TRAINABLE_CLASS, name, trainable)
     77 
     78 

/usr/local/lib/python3.7/dist-packages/ray/tune/registry.py in register(self, category, key, value)
    150         self._to_flush[(category, key)] = pickle.dumps_debug(value)
    151         if _internal_kv_initialized():
--> 152             self.flush_values()
    153 
    154     def contains(self, category, key):

/usr/local/lib/python3.7/dist-packages/ray/tune/registry.py in flush_values(self)
    173         for (category, key), value in self._to_flush.items():
    174             _internal_kv_put(
--> 175                 _make_key(self._prefix, category, key), value, overwrite=True)
    176         self._to_flush.clear()
    177 

/usr/local/lib/python3.7/dist-packages/ray/_private/client_mode_hook.py in wrapper(*args, **kwargs)
    103             if func.__name__ != "init" or is_client_mode_enabled_by_default:
    104                 return getattr(ray, func.__name__)(*args, **kwargs)
--> 105         return func(*args, **kwargs)
    106 
    107     return wrapper

/usr/local/lib/python3.7/dist-packages/ray/experimental/internal_kv.py in _internal_kv_put(key, value, overwrite, namespace)
     77         overwrite, bool)
     78     return global_gcs_client.internal_kv_put(key, value, overwrite,
---> 79                                              namespace) == 0
     80 
     81 

/usr/local/lib/python3.7/dist-packages/ray/_private/gcs_utils.py in wrapper(self, *args, **kwargs)
    128         while True:
    129             try:
--> 130                 return f(self, *args, **kwargs)
    131             except grpc.RpcError as e:
    132                 if remaining_retry <= 0:

/usr/local/lib/python3.7/dist-packages/ray/_private/gcs_utils.py in internal_kv_put(self, key, value, overwrite, namespace)
    247         req = gcs_service_pb2.InternalKVPutRequest(
    248             key=key, value=value, overwrite=overwrite)
--> 249         reply = self._kv_stub.InternalKVPut(req)
    250         if reply.status.code == GcsCode.OK:
    251             return reply.added_num

/usr/local/lib/python3.7/dist-packages/grpc/_channel.py in __call__(self, request, timeout, metadata, credentials, wait_for_ready, compression)
    944         state, call, = self._blocking(request, timeout, metadata, credentials,
    945                                       wait_for_ready, compression)
--> 946         return _end_unary_response_blocking(state, call, False, None)
    947 
    948     def with_call(self,

/usr/local/lib/python3.7/dist-packages/grpc/_channel.py in _end_unary_response_blocking(state, call, with_call, deadline)
    847             return state.response
    848     else:
--> 849         raise _InactiveRpcError(state)
    850 
    851 

_InactiveRpcError: <_InactiveRpcError of RPC that terminated with:
	status = StatusCode.RESOURCE_EXHAUSTED
	details = "Received message larger than max (355752448 vs. 104857600)"
	debug_error_string = "{"created":"@1638857019.251890494","description":"Error received from peer ipv4:172.28.0.2:42493","file":"src/core/lib/surface/call.cc","file_line":1063,"grpc_message":"Received message larger than max (355752448 vs. 104857600)","grpc_status":8}"

Hi @tenderfoot, it’s hard to tell what is happening without a code example. However, since you mentioned that you’re partial'ing your data into the trainable, maybe try using tune.with_parameters instead: Training (tune.Trainable, tune.report) — Ray v1.9.0

tune.with_parameters is basically the equivalent of functools.partial, with the difference that the arguments are stored in the Ray object store. This is beneficial for datasets, as only one copy is kept in the object store; with partial you'll likely run into the problem that your data is serialized a large number of times, which leads to resource exhaustion.
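For illustration, a minimal sketch of the difference (train_fn and train_df are placeholder names, not your code):

import functools
import pandas as pd
from ray import tune

def train_fn(config, data):
    # `data` is the (potentially large) object every trial needs
    tune.report(rows=len(data), lr=config["lr"])

train_df = pd.DataFrame({"x": range(1000)})  # stand-in for the real dataset

# functools.partial bakes the DataFrame into the pickled trainable, so the whole
# thing is shipped over gRPC when the trainable is registered:
# tune.run(functools.partial(train_fn, data=train_df), config={"lr": tune.loguniform(1e-4, 1e-1)})

# tune.with_parameters puts the DataFrame into the Ray object store once and only
# a small reference travels with the trainable:
tune.run(
    tune.with_parameters(train_fn, data=train_df),
    config={"lr": tune.loguniform(1e-4, 1e-1)},
    num_samples=2,
)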

Let me know if that helps!


kai,
tune.with_parameters did the trick.
Thank you so much!

Hi there, I have tried putting my dataset ref into tune.with_parameters instead: [Training (tune.Trainable, tune.report) — Ray v1.9.0]. However, the program gets stuck without any output.
htop tells me the virtual memory usage of the process is about 80 GB. I am using Ray Tune to do knowledge distillation with Hugging Face Transformers; the features have to be produced before training, and in my case they are very large.

Do you have any ideas? Thank you.

@kai Can we file a github issue to improve the error message here?

Hi, I get the same error when using Ray Tune. My code is:

def train_model(config):
    train_set = ray_lgb_build_dataset(train_data)
    val_set = ray_lgb_build_dataset(val_data)
    evals_result = {}
    bst = train(
        params=config,
        dtrain=train_set,
        evals_result=evals_result,
        valid_sets=[val_set],
        valid_names=['val'],
        ray_params=ray_params
    )

bayesopt = BayesOptSearch(metric='l2', mode='min')
hy_space = {
    'max_iter': tune.randint(64, 128),
    'max_depth': tune.randint(4, 9),
}
analysis = tune.run(
    tune.with_parameters(train_model),
    config=hy_space,
    search_alg=bayesopt,
    resources_per_trial=ray_params.get_tune_resources()
)

env:
python 3.8.5
ray 1.9.1

When I use Ray to train a model with default params it works, but when I tune I get this error. My data has more than a million rows.

Hi all,

I get the same error on a Databricks cluster with the following config:

worker type: r5.24xlarge, 96 cores, 768 GB
driver type: r5.24xlarge, 96 cores, 768 GB
max workers: 4

I am running the following code snippet:

# from tune_sklearn import TuneSearchCV
from ray import tune
import sys

sys.stdout.fileno = lambda: False


# Other imports
import numpy as np
import scipy
import sklearn.metrics
from sklearn.datasets import make_classification, load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
import lightgbm as lgb
from ray.tune.integration.mlflow import MLflowLoggerCallback
# The two imports below are used later in the snippet but were missing from it
# (assuming RayParams comes from lightgbm_ray here):
from ray.tune.integration.lightgbm import TuneReportCheckpointCallback
from lightgbm_ray import RayParams


X, y = make_classification(n_samples=600000, n_features=120, n_redundant=0, n_classes=2, class_sep=2.5)
X_train, X_test, y_train, y_test  = train_test_split(X, y, test_size=0.6, random_state=123)

X_test, X_val, y_test, y_val \
    = train_test_split(X_test, y_test, test_size=0.5, random_state=123)
def train_breast_cancer(config):
    train_set = lgb.Dataset(X_train, label=y_train)
    test_set = lgb.Dataset(X_val, label=y_val)
    gbm = lgb.train(
        config,
        train_set,
        valid_sets=[test_set],
        valid_names=["eval"],
        verbose_eval=False,
        callbacks=[
            TuneReportCheckpointCallback({
                "binary_error": "eval-binary_error",
                "binary_logloss": "eval-binary_logloss"
            })
        ])
    preds = gbm.predict(X_test)
    pred_labels = np.rint(preds)
    tune.report(
        mean_accuracy=sklearn.metrics.accuracy_score(y_test, pred_labels),
        done=True)
num_actors = 2
num_cpus_per_actor = 70

ray_params = RayParams(
    num_actors=num_actors, cpus_per_actor=num_cpus_per_actor)
config = {
    "objective": "binary",
    "metric": ["binary_error", "binary_logloss"],
    "verbose": -1,
    "boosting_type": tune.grid_search(["gbdt", "dart"]),
    "num_leaves": tune.randint(10, 1000),
    "learning_rate": tune.loguniform(1e-8, 1e-1)
}

analysis = tune.run(
    tune.with_parameters(train_breast_cancer),
    metric="binary_error",
    mode="min",
    config=config,
    num_samples=40,
    resources_per_trial=ray_params.get_tune_resources())
print("Best hyperparameters found were: ", analysis.best_config)

ERROR:

<_InactiveRpcError of RPC that terminated with:
	status = StatusCode.RESOURCE_EXHAUSTED
	details = "Sent message larger than max (580804126 vs. 536870912)"
	debug_error_string = "{"created":"@1645562954.671849508","description":"Sent message larger than max (580804126 vs. 536870912)","file":"src/core/ext/filters/message_size/message_size_filter.cc","file_line":268,"grpc_status":8}"
>

I am trying to tune LightGBM parameters on a dataset of 6 million rows. Is there a better way to handle such large datasets?

Hi @naadvar,
The problem here is that X_train etc. are instantiated in the driver script and then captured by the train_breast_cancer function. This causes these large data structures to be serialized and shipped over gRPC, hence the RESOURCE_EXHAUSTED error. Try instead instantiating X_train etc. within train_breast_cancer.
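For example, something along these lines (a rough sketch; it keeps the rest of your snippet unchanged):

def train_breast_cancer(config):
    # Build the data inside the trainable so nothing large is captured from the
    # driver's scope when the function is pickled.
    X, y = make_classification(n_samples=600000, n_features=120,
                               n_redundant=0, n_classes=2, class_sep=2.5)
    X_train, X_rest, y_train, y_rest = train_test_split(X, y, test_size=0.6, random_state=123)
    X_test, X_val, y_test, y_val = train_test_split(X_rest, y_rest, test_size=0.5, random_state=123)

    train_set = lgb.Dataset(X_train, label=y_train)
    test_set = lgb.Dataset(X_val, label=y_val)
    # ... train and report exactly as in the original function ...

# (Alternatively, add a `data` parameter to the function and pass the arrays via
# tune.with_parameters(train_breast_cancer, data=...) so they go to the object store.)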

@xwjiang2010 That worked perfectly. What if I were to read in a Spark dataframe when doing this on a real-world dataset? Do you think I should do the following within train_breast_cancer?

  • Read into spark df
  • convert to pandas dataframe

If I modify the function to convert a Spark df to pandas within the function, I get the following error:

def train_breast_cancer(config: dict):
    df_2 = df.toPandas()
    def train_x_y(train_gbm_x):
      x_train = train_gbm_x[
          train_gbm_x.columns.difference(
              ["emp_id", "mstr_dt_x", "mstr_dt_y", "total_attrition"]
          )
      ]
      y_train = train_gbm_x[["total_attrition"]]
      return x_train, y_train
    # Split into train and test set
    X,y = train_x_y(df_2)
    train_x, test_x, train_y, test_y = train_test_split(
        X, y, test_size=0.25)
    # Build input matrices for XGBoost
    train_set = xgb.DMatrix(train_x, label=train_y)
    test_set = xgb.DMatrix(test_x, label=test_y)
    # Train the classifier, using the Tune callback
    xgb.train(
        config,
        train_set,
        evals=[(test_set, "eval")],
        verbose_eval=False,
        callbacks=[TuneReportCheckpointCallback(filename="model.xgb")])

error 1:

ray.cloudpickle.dumps(<class 'ray.tune.function_runner.wrap_function.<locals>.ImplicitFunc'>) failed.
To check which non-serializable variables are captured in scope, re-run the ray script with 'RAY_PICKLE_VERBOSE_DEBUG=1'. Other options: 
-Try reproducing the issue by calling `pickle.dumps(trainable)`. 
-If the error is typing-related, try removing the type annotations and try again.

If I move the `df_2 = df.toPandas()` outside the function, then it gives the following error:

<_InactiveRpcError of RPC that terminated with:
	status = StatusCode.INTERNAL
	details = "Exception serializing request!"
	debug_error_string = "None"
>

Great question. Ray Dataset is designed exactly for that.
It solves the last-mile problem and serves as a connector between ETL and Ray ML.
Take a look here: Datasets: Distributed Data Loading and Compute — Ray v1.10.0
It offers easy conversion from a Spark DF.
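Roughly, the flow could look like this (a sketch; it assumes ray.data.from_spark is available in your setup, which requires RayDP, and spark_df stands in for your real DataFrame):

import ray
from ray import tune

ds = ray.data.from_spark(spark_df)  # distributed Ray Dataset; no pandas copy on the driver

def train_fn(config, data):
    df = data.to_pandas()  # materialize only inside the trial
    # ... build the DMatrix / lgb.Dataset from df and train as before ...
    tune.report(rows=len(df))

analysis = tune.run(tune.with_parameters(train_fn, data=ds), config={})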

Hi All,

I had a similar issue when using tune.run. Ray has already been updated to 1.12 and my data is only 850 MB. Any help will be appreciated. :slight_smile:

Code snippet

def data_store(self):
    x_train, y_train, x_test, y_test = \
        ray.put(self.x_train), ray.put(self.y_train), \
        ray.put(self.x_test), ray.put(self.y_test)

    return x_train, y_train, x_test, y_test

def train_xgb_cls_model_single(self, tuning_param=None):

    model = self.make_xgb_cls_model(tuning_param)
    model_params = model.get_params()  # used by xgb.cv below; missing from the original snippet

    ray_x_train, ray_y_train, ray_x_test, ray_y_test = self.data_store()

    eval_set = [(ray.get(ray_x_train), ray.get(ray_y_train)), (ray.get(ray_x_test), ray.get(ray_y_test))]

    cv_train = xgb.DMatrix(ray.get(ray_x_train), label=ray.get(ray_y_train), missing=None, weight=None,
                               silent=False,
                               feature_names=ray.get(ray_x_train).columns, feature_types=None, nthread=-1)

    xgb_cv_error = xgb.cv(dtrain=cv_train, params=model_params, folds=5,
                              num_boost_round=model_params['n_estimators'], early_stopping_rounds=30, metrics="logloss",
                              as_pandas=True, seed=101, maximize=False, verbose_eval=3, shuffle=False)

    model.fit(ray.get(ray_x_train), ray.get(ray_y_train), eval_metric="logloss",
              early_stopping_rounds=30, eval_set=eval_set,
              verbose=True)

    model_predictions = model.predict(ray.get(ray_x_test))

    logloss = log_loss(ray.get(ray_y_test), model_predictions)

    tune.report(logloss=logloss, done=True)

    return model

def hyper_tune_model(self):

    mod_params = {
        "n_estimators": tune.randint(20, 512),
        "max_depth": tune.randint(3, 15),
        "min_child_weight": tune.randint(1, 7),
        "subsample": tune.uniform(0.3, 1.0),
        "learning_rate": tune.loguniform(0.000001, 1.0),
        "reg_alpha": tune.uniform(0.1, 5.0),
        "reg_lambda": tune.uniform(0.1, 5.0),
        "num_parallel_tree": tune.randint(20, 500),
        "gamma": tune.uniform(0.1, 10.0),
        "max_bin": tune.randint(10, 512),
        "colsample_bytree": tune.loguniform(0.5, 1.0),
        "colsample_bylevel": tune.loguniform(0.5, 1.0),
        "max_delta_step": tune.randint(0, 5),
    }

    optuna_opt = OptunaSearch(
        metric="logloss",
        seed=101,
        mode="min")

    analysis = tune.run(
        tune.with_parameters(self.train_xgb_cls_model_single),
        reuse_actors=True,
        metric="logloss",
        checkpoint_at_end=True,
        max_concurrent_trials=None,
        mode="min",
        resources_per_trial={"cpu": 64, "gpu": 3},
        config=mod_params,
        search_alg=optuna_opt,
        verbose=1,
        resume="AUTO",
        num_samples=1000)

    best_result, best_params = analysis.best_dataframe, analysis.best_config

    return best_params, best_result

Traceback

Traceback (most recent call last):
  File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/experiment.py", line 164, in __init__
    self._run_identifier = Experiment.register_if_needed(run)
  File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/experiment.py", line 353, in register_if_needed
    register_trainable(name, run_object)
  File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/registry.py", line 84, in register_trainable
    _global_registry.register(TRAINABLE_CLASS, name, trainable)
  File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/registry.py", line 168, in register
    self.flush_values()
  File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/registry.py", line 190, in flush_values
    _internal_kv_put(
  File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/_private/client_mode_hook.py", line 105, in wrapper
    return func(*args, **kwargs)
  File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/experimental/internal_kv.py", line 88, in _internal_kv_put
    return global_gcs_client.internal_kv_put(key, value, overwrite, namespace) == 0
  File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/_private/gcs_utils.py", line 104, in wrapper
    return f(self, *args, **kwargs)
  File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/_private/gcs_utils.py", line 195, in internal_kv_put
    reply = self._kv_stub.InternalKVPut(req)
  File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/grpc/_channel.py", line 946, in __call__
    return _end_unary_response_blocking(state, call, False, None)
  File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/grpc/_channel.py", line 849, in _end_unary_response_blocking
    raise _InactiveRpcError(state)
grpc._channel._InactiveRpcError: <_InactiveRpcError of RPC that terminated with:
	status = StatusCode.RESOURCE_EXHAUSTED
	details = "Sent message larger than max (1789998348 vs. 536870912)"
	debug_error_string = "{"created":"@1651587413.094226455","description":"Sent message larger than max (1789998348 vs. 536870912)","file":"src/core/ext/filters/message_size/message_size_filter.cc","file_line":264,"grpc_status":8}"
>
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
  File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3444, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-5-febb8a874c56>", line 1, in <module>
    full = XGBoostModel(**params1 | bb).hyper_tune_model()
  File "<ipython-input-3-b6b4eff61339>", line 128, in hyper_tune_model
    analysis = tune.run(
  File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/tune.py", line 470, in run
    experiments[i] = Experiment(
  File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/experiment.py", line 167, in __init__
    raise TuneError(
ray.tune.error.TuneError: The Trainable/training function is too large for grpc resource limit. Check that its definition is not implicitly capturing a large array or other object in scope. Tip: use tune.with_parameters() to put large objects in the Ray object store. 
Original exception: Traceback (most recent call last):
  File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/experiment.py", line 164, in __init__
    self._run_identifier = Experiment.register_if_needed(run)
  File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/experiment.py", line 353, in register_if_needed
    register_trainable(name, run_object)
  File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/registry.py", line 84, in register_trainable
    _global_registry.register(TRAINABLE_CLASS, name, trainable)
  File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/registry.py", line 168, in register
    self.flush_values()
  File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/registry.py", line 190, in flush_values
    _internal_kv_put(
  File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/_private/client_mode_hook.py", line 105, in wrapper
    return func(*args, **kwargs)
  File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/experimental/internal_kv.py", line 88, in _internal_kv_put
    return global_gcs_client.internal_kv_put(key, value, overwrite, namespace) == 0
  File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/_private/gcs_utils.py", line 104, in wrapper
    return f(self, *args, **kwargs)
  File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/_private/gcs_utils.py", line 195, in internal_kv_put
    reply = self._kv_stub.InternalKVPut(req)
  File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/grpc/_channel.py", line 946, in __call__
    return _end_unary_response_blocking(state, call, False, None)
  File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/grpc/_channel.py", line 849, in _end_unary_response_blocking
    raise _InactiveRpcError(state)
grpc._channel._InactiveRpcError: <_InactiveRpcError of RPC that terminated with:
	status = StatusCode.RESOURCE_EXHAUSTED
	details = "Sent message larger than max (1789998348 vs. 536870912)"
	debug_error_string = "{"created":"@1651587413.094226455","description":"Sent message larger than max (1789998348 vs. 536870912)","file":"src/core/ext/filters/message_size/message_size_filter.cc","file_line":264,"grpc_status":8}"
>

The reason is that self.x_train etc. are probably big and they are captured implicitly in your train_xgb_cls_model_single.
Could you restructure the code so that they are passed into the train function itself?
You can try

def train_func(config, data):
    ...

and then

tune.run(tune.with_parameters(train_func, data=data))  # this will automatically put `data` into the object store without you having to use `ray.put` explicitly.

Thanks @xwjiang2010.

I have changed the code in train_xgb_cls_model_single by adding a parameter called "data", which is a list containing x_train, y_train, x_test and y_test, but it seems that I am still getting the same error from tune.run. Am I doing something wrong?

Code snippet

def train_xgb_cls_model_single(self, data, tuning_param=None, cv=False, cv_folds_t=None):

    model = self.make_xgb_cls_model(tuning_param)
    
    #Fetch train test data from data, which is a list
    x_train, y_train, x_test, y_test = data[0], data[1], data[2], data[3]
    eval_set = [(x_train, y_train), (x_test, y_test)]


    model_params = model.get_params()

    cv_train = xgb.DMatrix(x_train, label=y_train, missing=None, weight=None, silent=False,
                           feature_names=x_train.columns, feature_types=None, nthread=-1)

    xgb_cv_error = xgb.cv(dtrain=cv_train, params=model_params, folds=5,
                          num_boost_round=model_params['n_estimators'], early_stopping_rounds=30, metrics="logloss",
                          as_pandas=True, seed=101, maximize=False, verbose_eval=3, shuffle=False)

    model.fit(x_train, y_train, eval_metric="logloss",
              early_stopping_rounds=30, eval_set=eval_set,
              verbose=True)

    model_predictions = model.predict(x_test)

    logloss = log_loss(y_test, model_predictions)

    tune.report(logloss=logloss, done=True)

    return model

def hyper_tune_model(self):
  
    mod_params = {
        "n_estimators": tune.randint(20, 512),
        "max_depth": tune.randint(3, 15),
        "min_child_weight": tune.randint(1, 7),
        "subsample": tune.uniform(0.3, 1.0),
        "learning_rate": tune.loguniform(0.000001, 1.0),
        "reg_alpha": tune.uniform(0.1, 5.0),
        "reg_lambda": tune.uniform(0.1, 5.0),
        "num_parallel_tree":tune.randint(20, 500),
        "gamma" :tune.uniform(0.1, 10.0),
        "max_bin":tune.randint(10, 512),
        "colsample_bytree":tune.loguniform(0.5, 1.0),
        "colsample_bylevel": tune.loguniform(0.5, 1.0),
        "max_delta_step":tune.randint(0, 5),
    }
  
    optuna_opt = OptunaSearch(
        metric="logloss",
        seed=101,
        mode="min")
  
    analysis = tune.run(
        tune.with_parameters(self.train_xgb_cls_model_single,data=[self.x_train,self.y_train,self.x_test,self.y_test]),
        reuse_actors=True,
        metric="logloss",
        checkpoint_at_end=True,
        max_concurrent_trials=None,
        mode="min",
        resources_per_trial={"cpu":64, "gpu": 3},
        config=mod_params,
        search_alg = optuna_opt,
        verbose=1,
        resume="AUTO",
        num_samples=1000)
  
    best_result, best_params = analysis.best_dataframe, analysis.best_config
  
    return best_params, best_result

Traceback

Traceback (most recent call last):
  File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/experiment.py", line 164, in __init__
    self._run_identifier = Experiment.register_if_needed(run)
  File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/experiment.py", line 353, in register_if_needed
    register_trainable(name, run_object)
  File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/registry.py", line 84, in register_trainable
    _global_registry.register(TRAINABLE_CLASS, name, trainable)
  File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/registry.py", line 168, in register
    self.flush_values()
  File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/registry.py", line 190, in flush_values
    _internal_kv_put(
  File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/_private/client_mode_hook.py", line 105, in wrapper
    return func(*args, **kwargs)
  File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/experimental/internal_kv.py", line 88, in _internal_kv_put
    return global_gcs_client.internal_kv_put(key, value, overwrite, namespace) == 0
  File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/_private/gcs_utils.py", line 104, in wrapper
    return f(self, *args, **kwargs)
  File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/_private/gcs_utils.py", line 195, in internal_kv_put
    reply = self._kv_stub.InternalKVPut(req)
  File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/grpc/_channel.py", line 946, in __call__
    return _end_unary_response_blocking(state, call, False, None)
  File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/grpc/_channel.py", line 849, in _end_unary_response_blocking
    raise _InactiveRpcError(state)
grpc._channel._InactiveRpcError: <_InactiveRpcError of RPC that terminated with:
	status = StatusCode.RESOURCE_EXHAUSTED
	details = "Sent message larger than max (2091700267 vs. 1073741824)"
	debug_error_string = "{"created":"@1651625808.218316975","description":"Sent message larger than max (2091700267 vs. 1073741824)","file":"src/core/ext/filters/message_size/message_size_filter.cc","file_line":264,"grpc_status":8}"
>
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
  File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3444, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-5-febb8a874c56>", line 1, in <module>
    full = XGBoostModel(**params1 | bb).hyper_tune_model()
  File "<ipython-input-4-5fa792b9aef4>", line 124, in hyper_tune_model
    analysis = tune.run(
  File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/tune.py", line 470, in run
    experiments[i] = Experiment(
  File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/experiment.py", line 167, in __init__
    raise TuneError(
ray.tune.error.TuneError: The Trainable/training function is too large for grpc resource limit. Check that its definition is not implicitly capturing a large array or other object in scope. Tip: use tune.with_parameters() to put large objects in the Ray object store. 
Original exception: Traceback (most recent call last):
  File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/experiment.py", line 164, in __init__
    self._run_identifier = Experiment.register_if_needed(run)
  File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/experiment.py", line 353, in register_if_needed
    register_trainable(name, run_object)
  File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/registry.py", line 84, in register_trainable
    _global_registry.register(TRAINABLE_CLASS, name, trainable)
  File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/registry.py", line 168, in register
    self.flush_values()
  File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/registry.py", line 190, in flush_values
    _internal_kv_put(
  File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/_private/client_mode_hook.py", line 105, in wrapper
    return func(*args, **kwargs)
  File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/experimental/internal_kv.py", line 88, in _internal_kv_put
    return global_gcs_client.internal_kv_put(key, value, overwrite, namespace) == 0
  File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/_private/gcs_utils.py", line 104, in wrapper
    return f(self, *args, **kwargs)
  File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/_private/gcs_utils.py", line 195, in internal_kv_put
    reply = self._kv_stub.InternalKVPut(req)
  File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/grpc/_channel.py", line 946, in __call__
    return _end_unary_response_blocking(state, call, False, None)
  File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/grpc/_channel.py", line 849, in _end_unary_response_blocking
    raise _InactiveRpcError(state)
grpc._channel._InactiveRpcError: <_InactiveRpcError of RPC that terminated with:
	status = StatusCode.RESOURCE_EXHAUSTED
	details = "Sent message larger than max (2091700267 vs. 1073741824)"
	debug_error_string = "{"created":"@1651625808.218316975","description":"Sent message larger than max (2091700267 vs. 1073741824)","file":"src/core/ext/filters/message_size/message_size_filter.cc","file_line":264,"grpc_status":8}"
>

Update: I ran the code again and I think I got the same error as @naadvar got.

Error

Traceback (most recent call last):
  File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3444, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-2-44929bcce122>", line 1434, in <module>
    full = XGBoostModel(**params1 | bb).create_tuned_base_model()
  File "<ipython-input-2-44929bcce122>", line 825, in create_tuned_base_model
    best_params, best_result = self.hyper_tune_model()
  File "<ipython-input-2-44929bcce122>", line 804, in hyper_tune_model
    analysis = tune.run(
  File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/tune.py", line 470, in run
    experiments[i] = Experiment(
  File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/experiment.py", line 176, in __init__
    raise e
  File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/experiment.py", line 164, in __init__
    self._run_identifier = Experiment.register_if_needed(run)
  File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/experiment.py", line 353, in register_if_needed
    register_trainable(name, run_object)
  File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/registry.py", line 84, in register_trainable
    _global_registry.register(TRAINABLE_CLASS, name, trainable)
  File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/registry.py", line 168, in register
    self.flush_values()
  File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/tune/registry.py", line 190, in flush_values
    _internal_kv_put(
  File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/_private/client_mode_hook.py", line 105, in wrapper
    return func(*args, **kwargs)
  File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/experimental/internal_kv.py", line 88, in _internal_kv_put
    return global_gcs_client.internal_kv_put(key, value, overwrite, namespace) == 0
  File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/_private/gcs_utils.py", line 104, in wrapper
    return f(self, *args, **kwargs)
  File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/ray/_private/gcs_utils.py", line 195, in internal_kv_put
    reply = self._kv_stub.InternalKVPut(req)
  File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/grpc/_channel.py", line 944, in __call__
    state, call, = self._blocking(request, timeout, metadata, credentials,
  File "/home/tigertimwu/anaconda3/lib/python3.9/site-packages/grpc/_channel.py", line 924, in _blocking
    raise rendezvous  # pylint: disable-msg=raising-bad-type
grpc._channel._InactiveRpcError: <_InactiveRpcError of RPC that terminated with:
	status = StatusCode.INTERNAL
	details = "Exception serializing request!"
	debug_error_string = "None"
>

@tigertimwu Thanks for trying it out.

I think the problem lies in the fact that your train_func is a method of a class instead of a standalone function.

Consider the following examples:
====== using a standalone function =====
Compare

data = np.random.rand(10000, 10000)
def func(config, data):
	print(data.size)

tune.run(tune.with_parameters(func, data=data))       # this works fine

v.s.

data = np.random.rand(10000, 10000)
def func(config):
	print(data.size)

tune.run(func)    # not working as `data` is captured implicitly

======== using a class method ==========

class MyTuneJob:
	def __init__(self):
		self.data = np.random.rand(10000, 10000)

	def func(self, config, data):
		print(data.size)

	def hyper_tune_model(self):
		tune.run(tune.with_parameters(self.func, data=self.data))     # failing as `self` is still captured.

v.s.

class MyTuneJob:
	def __init__(self):
		self.data = np.random.rand(10000, 10000)

	def func(self, config):
		print(self.data.size)

	def hyper_tune_model(self):
		tune.run(self.func)   # failing not surprisingly...

Long story short, you should try moving the training function out of the class.
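Applied to the earlier snippet, the restructuring could look roughly like this (a sketch, not the exact code: the XGBClassifier(**config) call stands in for make_xgb_cls_model, and mod_params is the search space defined earlier):

import xgboost as xgb
from ray import tune
from sklearn.metrics import log_loss

# Standalone, module-level trainable: no `self`, so nothing large is captured.
def train_xgb_cls_model_single(config, data):
    x_train, y_train, x_test, y_test = data
    model = xgb.XGBClassifier(**config)
    model.fit(x_train, y_train, eval_set=[(x_test, y_test)],
              eval_metric="logloss", early_stopping_rounds=30, verbose=False)
    preds = model.predict_proba(x_test)[:, 1]
    tune.report(logloss=log_loss(y_test, preds), done=True)

# Inside hyper_tune_model, only the plain arrays are handed to with_parameters;
# they go to the object store instead of being pickled with the trainable:
analysis = tune.run(
    tune.with_parameters(
        train_xgb_cls_model_single,
        data=[self.x_train, self.y_train, self.x_test, self.y_test],
    ),
    metric="logloss",
    mode="min",
    config=mod_params,  # plus search_alg, resources_per_trial, etc. as before
)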

Hi all, I tried to pass the data into my tuner file, but I still receive two errors:

_InactiveRpcError: <_InactiveRpcError of RPC that terminated with:
status = StatusCode.RESOURCE_EXHAUSTED
details = "Sent message larger than max (845011379 vs. 536870912)"
debug_error_string = "UNKNOWN:Error received from peer 10.178.134.118:60685 {created_time:"2023-03-30T17:58:36.265450971-04:00", grpc_status:8, grpc_message:"Sent message larger than max (845011379 vs. 536870912)"}"

During handling of the above exception, another exception occurred:

TuneError: The Trainable/training function is too large for grpc resource limit. Check that its definition is not implicitly capturing a large array or other object in scope. Tip: use tune.with_parameters() to put large objects in the Ray object store.
Original exception: Traceback (most recent call last):

I tried removing everything other than the model training and I still hit the message-size limit. Can we increase its upper bound? Thanks a lot.

Interesting! I just addressed this problem.

My situation was: I call another helper function in my train function, and that helper uses some large data. Although I passed the data in via the arguments of train(), I still had to account for the helper function used during the parameter search. If I move the helper function inside my training function, I can run the tuner.
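In other words, the pattern looks roughly like this (a sketch with made-up names; big_features stands in for the large precomputed data):

import numpy as np
from ray import tune

big_features = np.random.rand(5000, 2000)  # stand-in for the large precomputed features

# Problematic: the module-level helper closes over big_features, so pickling the
# trainable (e.g. from a notebook / __main__) can drag the whole array over gRPC.
def helper():
    return big_features.mean()

def train_bad(config):
    tune.report(score=helper())

# Workaround described above: define the helper inside the training function and
# give it only the data that arrives via tune.with_parameters.
def train_good(config, data):
    def helper(d):
        return d.mean()
    tune.report(score=helper(data))

tune.run(tune.with_parameters(train_good, data=big_features), config={})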

Hi, I also had the same error, and I wonder: is it possible to use tune.with_parameters just because I have problems with the dataset size, but not necessarily because I want to tune hyperparameters?

For a quick PoC (if I can do it with a large dataset, then I should also be able to do it with a smaller dataset using tune.with_parameters), I tried to apply it to the Fashion-MNIST dataset. Based on this, I modified the example (truncated a bit):

import argparse
from typing import Dict
from ray.air import session

import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor

import ray.train as train
from ray.train.torch import TorchTrainer
from ray.air.config import ScalingConfig
from ray.tune import Tuner
from ray import tune

# Download training data from open datasets.
training_data = datasets.FashionMNIST(
    root="~/data",
    train=True,
    download=True,
    transform=ToTensor(),
)

testing_data = datasets.FashionMNIST(...
   # exactly same as docs
)


# Define model
class NeuralNetwork(nn.Module):
    # exactly same as docs

def train_epoch(dataloader, model, loss_fn, optimizer):
    # exactly same as docs

def validate_epoch(dataloader, model, loss_fn):
    # exactly same as docs

def train_func_tune(config, train_data, test_data):
    batch_size = config["batch_size"]
    lr = config["lr"]
    epochs = config["epochs"]

    worker_batch_size = batch_size // session.get_world_size()

    # Create data loaders.
    train_dataloader = DataLoader(train_data, batch_size=worker_batch_size)
    test_dataloader = DataLoader(test_data, batch_size=worker_batch_size)

    train_dataloader = train.torch.prepare_data_loader(train_dataloader)
    test_dataloader = train.torch.prepare_data_loader(test_dataloader)

    # Create model.
    model = NeuralNetwork()
    model = train.torch.prepare_model(model)

    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    for _ in range(epochs):
        train_epoch(train_dataloader, model, loss_fn, optimizer)
        loss = validate_epoch(test_dataloader, model, loss_fn)
        session.report(dict(loss=loss))


if __name__ == "__main__":
    tuner = Tuner(
        tune.with_parameters(
            train_func_tune,
            config=config,
            train_data=training_data,
            test_data=testing_data,
        )
    )
    results = tuner.fit()
    results

Then it only says the trials did not complete; it doesn't specify which part is causing it.

any help is appreciated

Update: following some suggestions I cd'd into the ray_results dir and got this error.txt. Why does it complain about my config getting multiple arguments?

Failure # 1 (occurred at 2023-04-10_10-46-11)
ray::ImplicitFunc.train() (pid=863, ip=10.42.59.67, repr=train_func_tune)
  File "/opt/conda/lib/python3.8/site-packages/ray/tune/trainable/trainable.py", line 368, in train
    raise skipped from exception_cause(skipped)
  File "/opt/conda/lib/python3.8/site-packages/ray/tune/trainable/function_trainable.py", line 337, in entrypoint
    return self._trainable_func(
  File "/opt/conda/lib/python3.8/site-packages/ray/tune/trainable/function_trainable.py", line 654, in _trainable_func
    output = fn()
  File "/opt/conda/lib/python3.8/site-packages/ray/tune/trainable/util.py", line 406, in _inner
    return inner(config, checkpoint_dir=None)
  File "/opt/conda/lib/python3.8/site-packages/ray/tune/trainable/util.py", line 398, in inner
    return trainable(config, **fn_kwargs)
TypeError: train_func_tune() got multiple values for argument 'config'