Hi all! In some code I wrote recently, trials take longer to execute as I allocate more `cpus_per_trial`.
I created the script below to reproduce the behavior I’m seeing. I am doing grid search on hyperparameters for Bayesian optimization algorithms (meta I know!).
As you can see in my notes, the code executes quickly in local_mode (the same as if I ran the script without Tune). However, when I allocate the equivalent number of CPUs per trial in cluster mode (4 on my MacBook Pro), the code takes more than 3x as long to execute!
Any ideas on what might be going on here?
"""
1. Install dependencies: `pip install botorch ray[default] numpy`
2. Run script in local mode: `python reproduce_botorch.py --local_mode`
3. Run script on normal cluster: `python reproduce_botorch.py --cpus_per_trial <CPUS>`
Running on a 2015 MacBook Pro (Intel i5, 16 GB RAM):
- Local mode: 8.75 seconds
- Cluster mode with 1 cpu per trial: 17.06 seconds
- Cluster mode with 2 cpus per trial: 20.22 seconds
- Cluster mode with 3 cpus per trial: 32.47 seconds
- Cluster mode with 4 cpus per trial: 30.85 seconds
"""
import numpy as np
import torch
from ray import tune
import ray
from botorch.models import MultiTaskGP
from botorch.fit import fit_gpytorch_model
from gpytorch.mlls.exact_marginal_log_likelihood import (
ExactMarginalLogLikelihood,
)
from botorch.acquisition import ExpectedImprovement as EI
from botorch.optim import optimize_acqf
import argparse
def main(local_mode, cpus_per_trial):
    """Initialise Ray and run ``trainable`` through Tune ten times.

    Args:
        local_mode: Forwarded unchanged to ``ray.init(local_mode=...)``.
        cpus_per_trial: CPU count requested for every trial; coerced to
            ``int`` before it is handed to Tune.
    """
    ray.init(local_mode=local_mode)

    # Each of the 10 samples asks Tune for this resource bundle.
    per_trial_resources = {"cpu": int(cpus_per_trial)}
    tune.run(
        trainable,
        num_samples=10,
        resources_per_trial=per_trial_resources,
    )
def trainable(config):
    """Run one trial: fit a multi-task GP on fresh data, then optimize EI.

    Args:
        config: Trial configuration passed in by Tune; it is not read here.

    Returns:
        None. The result of the acquisition optimization is discarded.
    """
    train_inputs, train_targets = generate_data(noise=20, n_t1=50, n_t2=5)

    # Build the GP (last input column is the task feature) and fit it via
    # its exact marginal log likelihood.
    gp = MultiTaskGP(
        train_inputs,
        train_targets,
        task_feature=-1,
        output_tasks=[1],
    )
    marginal_ll = ExactMarginalLogLikelihood(gp.likelihood, gp)
    fit_gpytorch_model(marginal_ll)

    # Expected improvement for a minimisation problem, measured against the
    # smallest observed target.
    acquisition = EI(gp, best_f=train_targets.min(), maximize=False)

    # NOTE(review): min/max are taken over ALL of train_inputs, i.e. they
    # also see the task-indicator column -- confirm this is intended.
    lower = train_inputs.min()
    upper = train_inputs.max()
    search_bounds = torch.tensor([[lower, upper]]).T.float()

    optimize_acqf(
        acq_function=acquisition,
        bounds=search_bounds,
        q=1,
        num_restarts=20,
        raw_samples=100,
    )
### Data Generation
def f1(X):
    """Return ``cos(5 * x) ** 2`` evaluated on the first column of ``X``."""
    first_col = X[:, 0]
    return torch.cos(5 * first_col) ** 2


def f2(X):
    """Return ``1.5 * cos(5 * x) ** 2`` evaluated on the first column of ``X``."""
    first_col = X[:, 0]
    return 1.5 * torch.cos(5 * first_col) ** 2


def gen_inputs(n):
    """Draw an ``(n, 1)`` tensor of samples from ``torch.rand``."""
    return torch.rand(n, 1)


def gen_obs(X, f, noise):
    """Evaluate ``f`` on ``X`` and add ``noise``-scaled ``torch.rand`` samples.

    One random value is drawn per row of ``X``.
    """
    signal = f(X)
    jitter = torch.rand(X.shape[0])
    return signal + noise * jitter
def generate_data(noise, n_t1, n_t2):
    """Build a standardised two-task training set.

    Args:
        noise: Scale applied to the random term added by ``gen_obs``.
        n_t1: Number of points generated for task 0 (objective ``f1``).
        n_t2: Number of points generated for task 1 (objective ``f2``).

    Returns:
        A ``(train_X, train_Y)`` pair. ``train_X`` stacks the task-0 rows on
        top of the task-1 rows, each row being ``[input, task_indicator]``.
        ``train_Y`` is a column of observations with the overall mean
        subtracted and divided by the overall standard deviation.
    """
    # Inputs are drawn first (task 0, then task 1) so the random draws
    # happen in a fixed order.
    inputs_t1 = gen_inputs(n_t1)
    inputs_t2 = gen_inputs(n_t2)

    # Append the task indicator (0 or 1) as the last feature column.
    rows_t1 = torch.cat([inputs_t1, torch.zeros(n_t1, 1)], -1)
    rows_t2 = torch.cat([inputs_t2, torch.ones(n_t2, 1)], -1)
    train_X = torch.cat([rows_t1, rows_t2])

    obs_t1 = gen_obs(inputs_t1, f1, noise)
    obs_t2 = gen_obs(inputs_t2, f2, noise)
    observations = torch.cat([obs_t1, obs_t2]).unsqueeze(-1)

    # Standardise jointly across both tasks.
    centred = observations - observations.mean()
    return train_X, centred / observations.std()
if __name__ == "__main__":
    # Script entry point: read the two command-line options and hand them
    # to main().
    cli = argparse.ArgumentParser()
    # Flag is False unless given on the command line.
    cli.add_argument("--local_mode", action="store_true")
    cli.add_argument("--cpus_per_trial", type=int, default=4)
    options = cli.parse_args()
    main(
        local_mode=options.local_mode,
        cpus_per_trial=options.cpus_per_trial,
    )