Hello everyone,
As the title says, I am facing a “The actor ImplicitFunc is too large” error. The problem is that I upgraded Ray to 1.7 today and everything was running smoothly before.
I saw two conversations on this forum, one where using tune.with_parameters()
solved the issues and another where the issue was caused by XGboost. I tried using tune.with_parameters()
to pass my datasets to the training function but the issue is still there and I am fine-tuning a torch model.
I used the following code to compute the size of the model and both data sets.
My training set is 13Mb, my validation set is 1Mb and the model is 0 Mb.
from ray import cloudpickle as pickle
pickled = pickle.dumps(...)
length_mib = len(pickled) // (1024 * 1024)
print(length_mib)
Here is part of my code, maybe you can have an idea where the issue might be:
def train(self, config, data):
print("Train")
net = None
if self.df:
net = Net(k1=config["k1"], k2=config["k2"], out1=config["out1"], out2=config["out2"], L1=config["l1"])
else:
net = Net()
net.to(self.device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=config["lr"], momentum=0.9)
trainloader = torch.utils.data.DataLoader(
data[0],
batch_size=int(config["batch_size"]),
shuffle=True,
num_workers=8)
valloader = torch.utils.data.DataLoader(
data[1],
batch_size=int(config["batch_size"]),
shuffle=True,
num_workers=8)
# Trains the network
with tune.checkpoint_dir(epoch) as checkpoint_dir:
path = os.path.join(checkpoint_dir, "checkpoint")
torch.save((net.state_dict(), optimizer.state_dict()), path)
tune.report(loss=(val_loss / val_steps), accuracy= correct/total)#eval(self.part.init_data["val"]["label"].to_numpy(), predicted_labels.astype(int))["F1"])
def main(self, num_samples=50, max_num_epochs=20, gpus_per_trial=1):
config = None
print("Main")
config = {
"l1": tune.sample_from(lambda _: 2**np.random.randint(2, 10)),
"lr": tune.loguniform(1e-4, 1e-1),
"k1": tune.choice([4, 5]),
"k2": tune.choice([4, 5]),
"out1": tune.choice([16, 32, 64, 128]),
"out2": tune.choice([16, 32, 64, 128]),
"batch_size": tune.choice([16, 32, 50, 64, 128]),
"epoch": tune.choice([5, 10, 15, 20, 25, 30, 40, 50, 75, 100])
}
pickled = pickle.dumps(self.train_data)
length_mib = len(pickled) // (1024 * 1024)
print("Length mb: {}".format(length_mib))
pickled = pickle.dumps(self.val)
length_mib = len(pickled) // (1024 * 1024)
print("Length mb: {}".format(length_mib))
pickled = pickle.dumps(config)
length_mib = len(pickled) // (1024 * 1024)
print("Length mb: {}".format(length_mib))
pickled = pickle.dumps(Net(k1=4, k2=4, out1=16, out2=16, L1=10))
length_mib = len(pickled) // (1024 * 1024)
print("Length mb: {}".format(length_mib))
scheduler = ASHAScheduler(
metric="loss",
mode="min",
max_t=max_num_epochs,
grace_period=1,
reduction_factor=2)
print(1)
result = tune.run(
tune.with_parameters(self.train, data=(self.train_data,self.val)),
resources_per_trial={"cpu": 4, "gpu": 1},
config=config,
num_samples=num_samples,
scheduler=scheduler,
progress_reporter=ExperimentTerminationReporter(),
verbose=1)
print(2)
And the complete log in case it helps:
2021-10-29 18:01:03,649 INFO services.py:1250 -- View the Ray dashboard at http://127.0.0.1:8265
2021-10-29 18:01:04,916 WARNING function_runner.py:558 -- Function checkpointing is disabled. This may result in unexpected behavior when using checkpointing features or certain schedulers. To enable, set the train function arguments to be `func(config, checkpoint_dir=None)`.
2021-10-29 18:01:11,619 ERROR ray_trial_executor.py:599 -- Trial train_6c120_00000: Unexpected error starting runner.
Traceback (most recent call last):
File "/home/.../anaconda3/envs/raytune/lib/python3.8/site-packages/ray/tune/ray_trial_executor.py", line 590, in start_trial
return self._start_trial(trial, checkpoint, train=train)
File "/home/.../anaconda3/envs/raytune/lib/python3.8/site-packages/ray/tune/ray_trial_executor.py", line 465, in _start_trial
runner = self._setup_remote_runner(trial)
File "/home/.../anaconda3/envs/raytune/lib/python3.8/site-packages/ray/tune/ray_trial_executor.py", line 382, in _setup_remote_runner
return full_actor_class.remote(**kwargs)
File "/home/.../anaconda3/envs/raytune/lib/python3.8/site-packages/ray/actor.py", line 480, in remote
return actor_cls._remote(
File "/home/.../anaconda3/envs/raytune/lib/python3.8/site-packages/ray/util/tracing/tracing_helper.py", line 371, in _invocation_actor_class_remote_span
return method(self, args, kwargs, *_args, **_kwargs)
File "/home/.../anaconda3/envs/raytune/lib/python3.8/site-packages/ray/actor.py", line 713, in _remote
worker.function_actor_manager.export_actor_class(
File "/home/.../anaconda3/envs/raytune/lib/python3.8/site-packages/ray/_private/function_manager.py", line 383, in export_actor_class
check_oversized_function(actor_class_info["class"],
File "/home/.../anaconda3/envs/raytune/lib/python3.8/site-packages/ray/_private/utils.py", line 641, in check_oversized_function
raise ValueError(error)
ValueError: The actor ImplicitFunc is too large (177 MiB > FUNCTION_SIZE_ERROR_THRESHOLD=95 MiB). Check that its definition is not implicitly capturing a large array or other object in scope. Tip: use ray.put() to put large objects in the Ray object store.
When I use Ray 1.6.0, everything works but I get the warning: The actor ImplicitFunc is very large (88 MiB)
Thank you very much for any help you can give