TypeError: can't convert cuda:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first

Hi there. I’m new to ML engineering and am currently running experiments on my training loop to tune my model’s hyperparameters. However, after the first epoch completes fully, I’m running into the following exception partway through the second epoch:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[3], line 60
     41 scheduler = ASHAScheduler(
     42         max_t=10,
     43         grace_period=1,
     44         reduction_factor=2)
     45 tuner = tune.Tuner(
     46         tune.with_resources(
     47             tune.with_parameters(train_fbm),
   (...)
     57         run_config= RunConfig(verbose=3)
     58     )
---> 60 results = tuner.fit()
     61 best_result = results.get_best_result("loss", "min")
     62 print("Best trial config: {}".format(best_result.config))

File c:\Users\user\miniconda3\envs\fbm_v2\lib\site-packages\ray\tune\tuner.py:347, in Tuner.fit(self)
    345 if not self._is_ray_client:
    346     try:
--> 347         return self._local_tuner.fit()
    348     except TuneError as e:
    349         raise TuneError(
    350             _TUNER_FAILED_MSG.format(
    351                 path=self._local_tuner.get_experiment_checkpoint_dir()
...
--> 970     return self.numpy()
    971 else:
    972     return self.numpy().astype(dtype, copy=False)

TypeError: can't convert cuda:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.
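
From what I understand, the error itself means a CUDA tensor is being handed to something that calls .numpy() on it, which only works on CPU tensors. For example, this minimal snippet (outside of Ray, assuming a CUDA-capable machine) reproduces the same TypeError:

import torch

t = torch.ones(1, device="cuda")
# t.numpy()      # raises: TypeError: can't convert cuda:0 device type tensor to numpy
t.cpu().numpy()  # works: copies the tensor to host memory first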

Below is the code I am running in a Jupyter notebook:

import os

from torchvision import transforms
from filelock import FileLock
from torch.utils.data import DataLoader, random_split
from FBMDataset import FBMDataset

def get_datasets(training_data_dir, cleaned_images_dir):    
    transform_list = transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], 
                                [0.229, 0.224, 0.225])
        ])
    with FileLock(os.path.expanduser("~/.data.lock")):
        dataset = FBMDataset(training_data_dir, cleaned_images_dir, transform=transform_list)
        # split into train/test/validation sets (70/15/15)
        train_dataset, test_dataset, val_dataset = random_split(dataset, [0.7, 0.15, 0.15])
    return (train_dataset, test_dataset, val_dataset)

import torch
import os
from datetime import datetime
import torch.nn.functional as F
from torch.optim import SGD
from FBMClassifier import FBMClassifier
from ray import tune
from ray.air import session
#from torch.utils.tensorboard import SummaryWriter

def create_model_dir_path():
    parent_dir = 'model_evaluation'
    current_datetime = datetime.now().strftime('%y%m%d%H%M%S')
    child_dir = 'weights'
    path = os.path.join(os.getcwd(), parent_dir, current_datetime, child_dir)
    return path

def training_loop(model, optimiser, train_loader, epoch_num, device=None):
    # writer = SummaryWriter()
    batch_id = 0     
    # Set the model to run on the device
    model = model.to(device)
    model.train(True)     
    print(f'Beginning Batches for epoch {epoch_num}')
    print(len(train_loader))
    for batch in train_loader:
        # get features and labels from the batch
        features, labels = batch
        features = features.to(device)
        labels = labels.to(device, non_blocking=True)
        # loss.backward() accumulates gradients rather than overwriting them, so zero the .grad of all optimised tensors first
        optimiser.zero_grad()
        # make a prediction
        prediction = model(features)
        # calculate the loss
        loss = F.cross_entropy(prediction, labels)
        # backward() computes the gradient of the loss w.r.t. the graph leaves
        loss.backward()
        # move each parameter in the opposite direction of its gradient, proportional to the learning rate
        optimiser.step()
        # writer.add_scalar('Loss', loss.item(), batch_id)
        batch_id += 1
    print("Completed")
    

def validate(model, val_loader, device):
    # Set the model to evaluation mode
    model.eval()
    running_vloss = 0.0
    with torch.no_grad():
        for i, vdata in enumerate(val_loader):
            vinputs, vlabels = vdata                
            vinputs = vinputs.to(device)
            vlabels = vlabels.to(device, non_blocking=True)
            
            voutputs = model(vinputs)
            vloss = F.cross_entropy(voutputs, vlabels)
            running_vloss += vloss

    avg_vloss = running_vloss / (i + 1)
    return avg_vloss


def train_fbm(config, device=None):    
    model = FBMClassifier()
    optimiser = SGD(model.resnet50.fc.parameters(), lr=config["lr"])
    base_dir = "D:/Documents/AICore/facebook-marketplaces-recommendation-ranking-system"
    train_dataset, test_dataset, val_dataset = get_datasets(os.path.join(base_dir, "training_data.csv"), os.path.join(base_dir, "cleaned_images"))
    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # device = torch.device("cpu")
    print(device)
    # Set the model to run on the device
    model = model.to(device)

    train_loader = DataLoader(train_dataset, batch_size=config["batch_size"], shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=config["batch_size"], shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=config["batch_size"], shuffle=True)
    
    path = create_model_dir_path()
    os.makedirs(path)
    
    for epoch in range(10):
        print(f"Beginning {epoch} ...")
        # tune.utils.wait_for_gpu()
        training_loop(model,optimiser,train_loader, epoch, device=device)
        print('Training complete ...')
        loss = validate(model, val_loader, device=device)
        print('Validation complete ...')        
        print(f'Epoch {epoch} - Average Loss: {loss}')
        session.report(metrics={"loss": loss})  # tune.report(mean_accuracy=loss)
        # torch.save(model.state_dict(), path + f'/epoch_{epoch}.pt')
        print(f"Ending {epoch} ...")

# analysis = tune.run(train_fbm, config={"lr": tune.loguniform(1e-4, 1e-1),"batch_size": tune.choice([2, 4, 8, 16])})
# print("Best config: ", analysis.get_best_config(metric="mean_accuracy"))

from ray.tune.schedulers import ASHAScheduler
from ray.air import RunConfig
config = {        
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": tune.choice([2, 4, 8, 16])
    }
scheduler = ASHAScheduler(
        max_t=10,
        grace_period=1,
        reduction_factor=2)
tuner = tune.Tuner(
        tune.with_resources(
            tune.with_parameters(train_fbm),
            resources=tune.PlacementGroupFactory([{"CPU": 1, "GPU": 0.5}])
        ),
        tune_config=tune.TuneConfig(
            metric="loss",
            mode="min",
            scheduler=scheduler,
            num_samples=3,
        ),
        param_space=config,
        run_config= RunConfig(verbose=3)
    )

results = tuner.fit()
best_result = results.get_best_result("loss", "min")
print("Best trial config: {}".format(best_result.config))
print("Best trial final validation loss: {}".format(
    best_result.metrics["loss"]))
print("Best trial final validation accuracy: {}".format(
    best_result.metrics["accuracy"]))

Based on the error and some experimentation, I think it’s due to how memory is allocated across the various trials: when I ran a single trial on the GPU (CUDA), it didn’t fail at the same point. I used the following code and it successfully completed the second epoch:

analysis = tune.run(train_fbm, config={"lr": tune.loguniform(1e-4, 1e-1),"batch_size": tune.choice([2, 4, 8, 16])})
print("Best config: ", analysis.get_best_config(metric="mean_accuracy"))

I have also tried passing a dictionary to tune.with_resources instead of a PlacementGroupFactory, so that only a single trial runs at a time, but I’m getting the same error:

resources={"cpu":2, "gpu": 1}

I’m running all of this on Windows 10 with a single GPU (GTX 1660 Super), an AMD Ryzen 5 2600X processor, and 16 GB of RAM.

Any help would be greatly appreciated; if you need any more information, let me know. Thank you!