Again a "The actor ImplicitFunc is too large" error

Hello everyone,

As the title says, I am facing a “The actor ImplicitFunc is too large” error. The problem appeared after I upgraded Ray to 1.7 today; everything was running smoothly before that.

I saw two conversations on this forum: one where using tune.with_parameters() solved the issue, and another where the issue was caused by XGBoost. Neither seems to apply to me: I tried using tune.with_parameters() to pass my datasets to the training function but the issue is still there, and I am fine-tuning a torch model, not XGBoost.

I used the following code to compute the pickled size of the model and of both datasets.
My training set is 13 MiB, my validation set is 1 MiB and the model is 0 MiB (i.e. less than 1 MiB, since the size is rounded down).

from ray import cloudpickle as pickle

# Serialize the object of interest with the cloudpickle module shipped with Ray;
# the `...` stands in for that object (dataset, model, config, ...).
pickled = pickle.dumps(...)
# Floor division: the size is reported in whole MiB, so anything under 1 MiB prints 0.
length_mib = len(pickled) // (1024 * 1024)
print(length_mib)

Here is part of my code; maybe you can see where the issue might be:

def train(self, config, data):
        # Trainable handed to tune.run() through tune.with_parameters() in main().
        #   config: one sampled hyperparameter set; the keys read in the code shown
        #           are "k1", "k2", "out1", "out2", "l1", "lr" and "batch_size".
        #   data:   indexable pair; data[0] feeds the training loader and
        #           data[1] the validation loader.
        # NOTE(review): this is a method, so the `self.train` passed to Tune is
        # bound to the whole instance. According to the resolution at the end of
        # the post, that is what made the serialized trainable too large.
        # NOTE(review): the signature has no `checkpoint_dir` parameter even though
        # tune.checkpoint_dir() is used below; the log warns that function
        # checkpointing is disabled and suggests `func(config, checkpoint_dir=None)`.
   
        print("Train")

        # Build the network from the sampled hyperparameters only when self.df is
        # truthy; otherwise fall back to Net's default constructor arguments.
        net = None
        if self.df:
            net = Net(k1=config["k1"], k2=config["k2"], out1=config["out1"], out2=config["out2"], L1=config["l1"])
        else:
            net = Net()
        net.to(self.device)

        # Loss and optimizer; presumably consumed by the elided training loop below.
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.SGD(net.parameters(), lr=config["lr"], momentum=0.9)

        # Both loaders use the sampled batch size, shuffle, and 8 worker processes.
        trainloader = torch.utils.data.DataLoader(
            data[0],
            batch_size=int(config["batch_size"]),
            shuffle=True,
            num_workers=8)

        valloader = torch.utils.data.DataLoader(
            data[1],
            batch_size=int(config["batch_size"]),
            shuffle=True,
            num_workers=8)

         # Trains the network (loop elided in the post). It presumably defines
         # `epoch`, `val_loss`, `val_steps`, `correct` and `total`, which are used
         # below but not shown here.

         # Save model and optimizer state together, as one tuple, in a file named
         # "checkpoint" inside the directory Tune provides for this step.
         with tune.checkpoint_dir(epoch) as checkpoint_dir:
                path = os.path.join(checkpoint_dir, "checkpoint")
                torch.save((net.state_dict(), optimizer.state_dict()), path)

         # Report the averaged validation loss and the accuracy to Tune; the
         # scheduler in main() is configured to minimize the "loss" metric.
         tune.report(loss=(val_loss / val_steps), accuracy= correct/total)#eval(self.part.init_data["val"]["label"].to_numpy(), predicted_labels.astype(int))["F1"])

def main(self, num_samples=50, max_num_epochs=20, gpus_per_trial=1):
        # Runs the hyperparameter search: defines the search space, prints the
        # pickled size of a few objects for debugging, then launches tune.run()
        # with an ASHA scheduler.
        #   num_samples:    forwarded to tune.run(num_samples=...).
        #   max_num_epochs: used as ASHAScheduler's max_t.
        #   gpus_per_trial: NOTE(review): not used in the code shown;
        #                   resources_per_trial below hard-codes "gpu": 1.
        config = None
        print("Main")

        # Search space; each trial receives one sample of these values as `config`.
        config = {
                "l1": tune.sample_from(lambda _: 2**np.random.randint(2, 10)),
                "lr": tune.loguniform(1e-4, 1e-1),
                "k1": tune.choice([4, 5]),
                "k2": tune.choice([4, 5]),
                "out1": tune.choice([16, 32, 64, 128]),
                "out2": tune.choice([16, 32, 64, 128]),
                "batch_size": tune.choice([16, 32, 50, 64, 128]),
                "epoch": tune.choice([5, 10, 15, 20, 25, 30, 40, 50, 75, 100])
            }
   
        # Debugging aid: print the pickled size, in whole MiB (floor division), of
        # the training data, the validation data, the config and a sample Net.
        # NOTE(review): none of these measures `self`, which is also serialized
        # because the trainable passed to Tune below is the bound method self.train.
        pickled = pickle.dumps(self.train_data)
        length_mib = len(pickled) // (1024 * 1024)
        print("Length mb: {}".format(length_mib))

        pickled = pickle.dumps(self.val)
        length_mib = len(pickled) // (1024 * 1024)
        print("Length mb: {}".format(length_mib))

        pickled = pickle.dumps(config)
        length_mib = len(pickled) // (1024 * 1024)
        print("Length mb: {}".format(length_mib))

        pickled = pickle.dumps(Net(k1=4, k2=4, out1=16, out2=16, L1=10))
        length_mib = len(pickled) // (1024 * 1024)
        print("Length mb: {}".format(length_mib))

        # ASHA scheduler minimizing the "loss" metric reported by train().
        scheduler = ASHAScheduler(
            metric="loss",
            mode="min",
            max_t=max_num_epochs,
            grace_period=1,
            reduction_factor=2)

        # Progress marker printed just before tune.run() is called.
        print(1)

        # The datasets are attached via tune.with_parameters() rather than being
        # captured directly by the training function.
        result = tune.run(
            tune.with_parameters(self.train, data=(self.train_data,self.val)),
            resources_per_trial={"cpu": 4, "gpu": 1},
            config=config,
            num_samples=num_samples,
            scheduler=scheduler,
            progress_reporter=ExperimentTerminationReporter(),
            verbose=1)
         # Progress marker printed after tune.run() returns.
         print(2)

And the complete log in case it helps:

2021-10-29 18:01:03,649 INFO services.py:1250 -- View the Ray dashboard at http://127.0.0.1:8265
2021-10-29 18:01:04,916 WARNING function_runner.py:558 -- Function checkpointing is disabled. This may result in unexpected behavior when using checkpointing features or certain schedulers. To enable, set the train function arguments to be `func(config, checkpoint_dir=None)`.
2021-10-29 18:01:11,619 ERROR ray_trial_executor.py:599 -- Trial train_6c120_00000: Unexpected error starting runner.
Traceback (most recent call last):
  File "/home/.../anaconda3/envs/raytune/lib/python3.8/site-packages/ray/tune/ray_trial_executor.py", line 590, in start_trial
    return self._start_trial(trial, checkpoint, train=train)
  File "/home/.../anaconda3/envs/raytune/lib/python3.8/site-packages/ray/tune/ray_trial_executor.py", line 465, in _start_trial
    runner = self._setup_remote_runner(trial)
  File "/home/.../anaconda3/envs/raytune/lib/python3.8/site-packages/ray/tune/ray_trial_executor.py", line 382, in _setup_remote_runner
    return full_actor_class.remote(**kwargs)
  File "/home/.../anaconda3/envs/raytune/lib/python3.8/site-packages/ray/actor.py", line 480, in remote
    return actor_cls._remote(
  File "/home/.../anaconda3/envs/raytune/lib/python3.8/site-packages/ray/util/tracing/tracing_helper.py", line 371, in _invocation_actor_class_remote_span
    return method(self, args, kwargs, *_args, **_kwargs)
  File "/home/.../anaconda3/envs/raytune/lib/python3.8/site-packages/ray/actor.py", line 713, in _remote
    worker.function_actor_manager.export_actor_class(
  File "/home/.../anaconda3/envs/raytune/lib/python3.8/site-packages/ray/_private/function_manager.py", line 383, in export_actor_class
    check_oversized_function(actor_class_info["class"],
  File "/home/.../anaconda3/envs/raytune/lib/python3.8/site-packages/ray/_private/utils.py", line 641, in check_oversized_function
    raise ValueError(error)
ValueError: The actor ImplicitFunc is too large (177 MiB > FUNCTION_SIZE_ERROR_THRESHOLD=95 MiB). Check that its definition is not implicitly capturing a large array or other object in scope. Tip: use ray.put() to put large objects in the Ray object store.

When I use Ray 1.6.0, everything works, but I get the warning: "The actor ImplicitFunc is very large (88 MiB)".
Thank you very much for any help you can give.

So I just found my issue. train() was a method of an object, so passing self.train to tune.with_parameters() serialized the entire object together with the parameters: (self.train, data=(self.train_data, self.val)) was already around 145 MB. I moved the train function out of the object and it seems to work now.