Hi there. I’m new to ML engineering and I’m currently running experiments on my training loop to tune my model’s hyperparameters. However, after fully completing the first epoch, I’m running into the following exception partway through the second epoch:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
Cell In[3], line 60
41 scheduler = ASHAScheduler(
42 max_t=10,
43 grace_period=1,
44 reduction_factor=2)
45 tuner = tune.Tuner(
46 tune.with_resources(
47 tune.with_parameters(train_fbm),
(...)
57 run_config= RunConfig(verbose=3)
58 )
---> 60 results = tuner.fit()
61 best_result = results.get_best_result("loss", "min")
62 print("Best trial config: {}".format(best_result.config))
File c:\Users\user\miniconda3\envs\fbm_v2\lib\site-packages\ray\tune\tuner.py:347, in Tuner.fit(self)
345 if not self._is_ray_client:
346 try:
--> 347 return self._local_tuner.fit()
348 except TuneError as e:
349 raise TuneError(
350 _TUNER_FAILED_MSG.format(
351 path=self._local_tuner.get_experiment_checkpoint_dir()
...
--> 970 return self.numpy()
971 else:
972 return self.numpy().astype(dtype, copy=False)
TypeError: can't convert cuda:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.
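From what I understand, this error comes up whenever a CUDA tensor is converted straight to NumPy without being copied to host memory first, e.g. (a minimal standalone sketch, not part of my code, assuming a CUDA device is available):

import torch

t = torch.tensor([1.0], device="cuda")
# t.numpy()               # raises the same TypeError: can't convert cuda:0 device type tensor to numpy
host = t.cpu().numpy()    # works: the tensor is copied to host memory first

But I can’t see where that conversion is being triggered in my own code.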
Below is the code I am running in a Jupyter notebook:
from torchvision import transforms
from filelock import FileLock
from torch.utils.data import DataLoader, random_split
from FBMDataset import FBMDataset
def get_datasets(training_data_dir, cleaned_images_dir):
    transform_list = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406],
                             [0.229, 0.224, 0.225])
    ])
    with FileLock(os.path.expanduser("~/.data.lock")):
        dataset = FBMDataset(training_data_dir, cleaned_images_dir, transform=transform_list)
    # obtain the list of targets
    train_dataset, test_dataset, val_dataset = random_split(dataset, [0.7, 0.15, 0.15])
    return (train_dataset, test_dataset, val_dataset)
import torch
import os
from datetime import datetime
import torch.nn.functional as F
from torch.optim import SGD
from FBMClassifier import FBMClassifier
from ray import tune
from ray.air import session
#from torch.utils.tensorboard import SummaryWriter
def create_model_dir_path():
    parent_dir = 'model_evaluation'
    current_datetime = datetime.now().strftime('%y%m%d%H%M%S')
    child_dir = 'weights'
    path = os.path.join(os.getcwd(), parent_dir, current_datetime, child_dir)
    return path
def training_loop(model, optimiser, train_loader, epoch_num, device=None):
    # writer = SummaryWriter()
    batch_id = 0
    # Set the model to run on the device
    model = model.to(device)
    model.train(True)
    print(f'Beginning Batches for epoch {epoch_num}')
    print(len(train_loader))
    for batch in train_loader:
        # get features and labels from the batch
        features, labels = batch
        features = features.to(device)
        labels = labels.to(device, non_blocking=True)
        # loss.backward() accumulates gradients rather than overwriting them, so reset the .grad of all optimised tensors to zero first
        optimiser.zero_grad()
        # make a prediction
        prediction = model(features)
        # calculate loss
        criterion = F.cross_entropy(prediction, labels)
        # backward() computes the gradient of the loss w.r.t. the graph leaves
        criterion.backward()
        # moves each parameter in the opposite direction of its gradient, proportional to the learning rate
        optimiser.step()
        # writer.add_scalar('Loss', criterion.item(), batch_id)
        batch_id += 1
    print("Completed")
def validate(model, val_loader, device):
    # Set the model to evaluation mode
    model.eval()
    running_vloss = 0.0
    with torch.no_grad():
        for i, vdata in enumerate(val_loader):
            vinputs, vlabels = vdata
            vinputs = vinputs.to(device)
            vlabels = vlabels.to(device, non_blocking=True)
            voutputs = model(vinputs)
            vloss = F.cross_entropy(voutputs, vlabels)
            running_vloss += vloss
    avg_vloss = running_vloss / (i + 1)
    return avg_vloss
def train_fbm(config, device=None):
    model = FBMClassifier()
    optimiser = SGD(model.resnet50.fc.parameters(), lr=config["lr"])
    base_dir = "D:/Documents/AICore/facebook-marketplaces-recommendation-ranking-system"
    train_dataset, test_dataset, val_dataset = get_datasets(os.path.join(base_dir, "training_data.csv"), os.path.join(base_dir, "cleaned_images"))
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # device = torch.device("cpu")
    print(device)
    # Set the model to run on the device
    model = model.to(device)
    train_loader = DataLoader(train_dataset, batch_size=config["batch_size"], shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=config["batch_size"], shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=config["batch_size"], shuffle=True)
    path = create_model_dir_path()
    os.makedirs(path)
    for epoch in range(10):
        print(f"Beginning {epoch} ...")
        # tune.utils.wait_for_gpu()
        training_loop(model, optimiser, train_loader, epoch, device=device)
        print('Training complete ...')
        loss = validate(model, val_loader, device=device)
        print('Validation complete ...')
        print(f'Epoch {epoch} - Average Loss: {loss}')
        session.report(metrics={"loss": loss})  # tune.report(mean_accuracy=loss)
        # torch.save(model.state_dict(), path + f'/epoch_{epoch}.pt')
        print(f"Ending {epoch} ...")
# analysis = tune.run(train_fbm, config={"lr": tune.loguniform(1e-4, 1e-1),"batch_size": tune.choice([2, 4, 8, 16])})
# print("Best config: ", analysis.get_best_config(metric="mean_accuracy"))
from ray.tune.schedulers import ASHAScheduler
from ray.air import RunConfig
config = {
    "lr": tune.loguniform(1e-4, 1e-1),
    "batch_size": tune.choice([2, 4, 8, 16])
}

scheduler = ASHAScheduler(
    max_t=10,
    grace_period=1,
    reduction_factor=2)

tuner = tune.Tuner(
    tune.with_resources(
        tune.with_parameters(train_fbm),
        resources=tune.PlacementGroupFactory([{"CPU": 1, "GPU": 0.5}])
    ),
    tune_config=tune.TuneConfig(
        metric="loss",
        mode="min",
        scheduler=scheduler,
        num_samples=3,
    ),
    param_space=config,
    run_config=RunConfig(verbose=3)
)

results = tuner.fit()
best_result = results.get_best_result("loss", "min")
print("Best trial config: {}".format(best_result.config))
print("Best trial final validation loss: {}".format(
    best_result.metrics["loss"]))
print("Best trial final validation accuracy: {}".format(
    best_result.metrics["accuracy"]))
Based on the error and some experimentation, I think it’s due to how memory is allocated across the various trials, because when I ran a single trial on the GPU (CUDA) it didn’t fail at the same point. I used the following code and it successfully completed the second epoch:
analysis = tune.run(train_fbm, config={"lr": tune.loguniform(1e-4, 1e-1), "batch_size": tune.choice([2, 4, 8, 16])})
print("Best config: ", analysis.get_best_config(metric="mean_accuracy"))
I have also tried passing a dictionary instead of a PlacementGroupFactory to tune.with_resources, so that only a single trial runs at a time, but I’m getting the same error:
resources={"cpu": 2, "gpu": 1}
I’m running all of this on Windows 10 with a single GPU (GTX 1660 Super), an AMD Ryzen 5 2600X processor, and 16GB of RAM.
Any help would be greatly appreciated, and if you need any more information, let me know. Thank you!