DDP tuning not returning loss from TuneReportCallback

Hi,

I basically copied the DDP tuning example from here. I am tuning my own model with a custom validation loss and accuracy. However, the loss is never present in the results dict from the TrialRunner, and every trial triggers the same error:

Trial returned a result which did not include the specified metric(s) 'loss' that 'tune.TuneConfig()' expects

Any idea what could be wrong?
This is my tuning code:

import os
import sys

import numpy as np
from projection import setup_projection
import torch.utils.data as data_utils
from ray_lightning.tune import TuneReportCallback, get_tune_resources
from ray import tune
from ray_lightning import RayStrategy

sys.path.append(os.path.abspath(os.path.join(os.path.dirname(sys.modules[__name__].__file__), "../../")))

from module import Inpaint_IcoPartConv
from argparse import ArgumentParser
import pytorch_lightning as pl
from dataset import DatasetFromTxt, my_collate


def hparam_search(config, use_gpu=False, num_workers=1, num_epochs=10, callbacks=None):
    model = Inpaint_IcoPartConv(SUBDIVISION, config=config)
    strategy = RayStrategy(num_workers=num_workers, use_gpu=use_gpu)
    trainer = pl.Trainer(log_every_n_steps=2, num_sanity_val_steps=2, callbacks=callbacks,
                         max_epochs=num_epochs, strategy=strategy)
    trainer.fit(model, train_loader)


parser = ArgumentParser()

# define run specific arguments
parser.add_argument("--data_path", required=True, type=str,
                    help='The path to the pickle file containing the projected MNIST data')
parser.add_argument("--batch_size", default=64, type=int, help='The batch size')
parser.add_argument("--num_workers", default=1, type=int)
parser.add_argument("--use_gpu", action='store_true')
parser.add_argument("--epochs", type=int, default=1)
parser.add_argument("--icosahedral_subdiv", default=4, type=int, help='icosahedral subdivision')
parser.add_argument("--erp_height", default=256, type=int,
                    help='Height of the resulting ERP image (width is double the height)')

# add params from Trainer
parser = pl.Trainer.add_argparse_args(parser)
args = parser.parse_args()
assert args.icosahedral_subdiv >= 0, "Negative subdivision level for icosahedral projection not possible"
assert args.icosahedral_subdiv <= 10, "Subdivision level for icosahedral projection too big"
assert args.erp_height % 2 == 0, "ERP height should be multiple of two"

# args
DATA_PATH = args.data_path
BATCH_SIZE = args.batch_size
SUBDIVISION = args.icosahedral_subdiv
ERP_HEIGHT = args.erp_height
NUM_EPOCHS = args.epochs
USE_GPU = args.use_gpu
NUM_WORKERS = args.num_workers

# data
train_dataset = DatasetFromTxt(os.path.join(DATA_PATH, "toy_test_set.txt"), SUBDIVISION)
train_loader = data_utils.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True,
                                     num_workers=np.minimum(os.cpu_count(), 16), drop_last=False, pin_memory=True,
                                     collate_fn=my_collate)

val_dataset = DatasetFromTxt(os.path.join(DATA_PATH, "toy_test_set.txt"), SUBDIVISION)
val_loader = data_utils.DataLoader(val_dataset, batch_size=1, shuffle=False,
                                   num_workers=np.minimum(os.cpu_count(), 16), drop_last=False, pin_memory=True,
                                   collate_fn=my_collate)

# Preprocessing
mask_charts_erp, chart2erp_interp, ico_xx, ico_yy = setup_projection(SUBDIVISION, train_dataset.img_h, ERP_HEIGHT)

train_dataset.ico_yy = ico_yy
train_dataset.ico_xx = ico_xx
val_dataset.ico_yy = ico_yy
val_dataset.ico_xx = ico_xx

# Hyperparameter search space
config = {
    "vgg_weight": tune.loguniform(1e-4, 10),
    "style_weight": tune.loguniform(1e-4, 10),
    "tv_weight": tune.loguniform(1e-4, 10),
    "valid_weight": tune.loguniform(1e-4, 10),
    "hole_weight": tune.loguniform(1e-4, 10),
    "grad_weight": tune.loguniform(1e-4, 10),
}

metrics = {"loss": "ptl/val_loss", "lpips": "ptl/val_lpips"}
callbacks = [TuneReportCallback(metrics, on="validation_end")]

trainable = tune.with_parameters(
    hparam_search,
    use_gpu=USE_GPU,
    num_workers=NUM_WORKERS,
    num_epochs=NUM_EPOCHS,
    callbacks=callbacks)

analysis = tune.run(
    trainable,
    metric="loss",
    local_dir="./ray_results",
    mode="min",
    config=config,
    num_samples=50,
    resources_per_trial=get_tune_resources(
        num_workers=NUM_WORKERS, use_gpu=USE_GPU),
    name="tune_inpainting")

best_trial = analysis.best_trial  # Get best trial
best_config = analysis.best_config  # Get best trial's hyperparameters
best_logdir = analysis.best_logdir  # Get best trial's logdir
best_checkpoint = analysis.best_checkpoint  # Get best trial's best checkpoint
best_result = analysis.best_result  # Get best trial's last results

print(best_config)

And my model hooks are implemented like this:

    def training_step(self, batch, batch_idx):
        x, mask, _ = batch
        assert x.shape[0] == mask.shape[0], "batch size mismatch"
        batch_size = x.shape[0]
        x_hat = self(x, mask)
        l_hole = F.l1_loss(torch.mul(1 - mask, x_hat), torch.mul(1 - mask, x), reduction='sum') / (1 - mask).sum()
        l_valid = F.l1_loss(torch.mul(mask, x_hat), torch.mul(mask, x), reduction='sum') / mask.sum()
        _, l_vgg, l_style, l_tv = self.VGG16PartialLoss[0].forward(x_hat, x, mask)

        loss = l_vgg * self.l_vgg_weight + l_style * self.l_style_weight + l_tv * self.l_tv_weight + \
               l_valid * self.l_valid_weight + l_hole * self.l_hole_weight
        self.log('train_loss', {'total_loss': loss, 'l_hole': l_hole * self.l_hole_weight,
                                'l_valid': l_valid * self.l_valid_weight, 'l_vgg': l_vgg * self.l_vgg_weight,
                                'l_style': l_style * self.l_style_weight, 'l_tv': l_tv * self.l_tv_weight},
                 batch_size=batch_size)
        return loss

    def validation_step(self, batch, batch_idx):
        x, mask, _ = batch
        assert x.shape[0] == mask.shape[0], "batch size mismatch"
        batch_size = x.shape[0]
        x_hat = self(x, mask)
        l_hole = F.l1_loss(torch.mul(1 - mask, x_hat), torch.mul(1 - mask, x), reduction='sum') / (1 - mask).sum()
        l_valid = F.l1_loss(torch.mul(mask, x_hat), torch.mul(mask, x), reduction='sum') / mask.sum()
        _, l_vgg, l_style, l_tv = self.VGG16PartialLoss[0].forward(x_hat, x, mask)

        loss = l_vgg * self.l_vgg_weight + l_style * self.l_style_weight + l_tv * self.l_tv_weight + \
               l_valid * self.l_valid_weight + l_hole * self.l_hole_weight
        self.log('val_loss', loss, batch_size=batch_size)

        # [-1, 1] for lpips
        x_hat_norm = x_hat * 2 - 1
        x_norm = x * 2 - 1
        lpips = self.lpips[0](x_hat_norm, x_norm)
        self.lpips[0].reset()  # Don't accumulate magnitudes

        ssim = self.ssim(x_hat, x)
        psnr = self.psnr(x_hat, x)

        self.log('lpips', lpips, on_step=False, on_epoch=True, prog_bar=False, batch_size=batch_size)
        self.log('ssim', ssim, on_step=False, on_epoch=True, prog_bar=False, batch_size=batch_size)
        self.log('psnr', psnr, on_step=False, on_epoch=True, prog_bar=False, batch_size=batch_size)
        return {"val_loss": loss, "lpips": lpips, "ssim": ssim, "psnr": psnr}

    def on_validation_start(self):
        self.lpips[0].to(self.device)

    def on_validation_end(self):
        self.lpips[0].cpu()

    def validation_epoch_end(self, outputs):
        avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
        avg_acc = torch.stack([x["lpips"] for x in outputs]).mean()
        self.log("ptl/val_loss", avg_loss)
        self.log("ptl/val_lpips", avg_acc)

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lrs[0])
        return optimizer

Definitely my bad. I forgot to pass the val dataloader to

trainer.fit(model, train_loader)

Therefore the validation loop never ran, the ptl/ metrics were never logged, and the on_validation_end hook was never called.
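
For anyone hitting the same thing, this is a sketch of the corrected trainable (same as above, just with the val_loader defined earlier also passed to fit):

def hparam_search(config, use_gpu=False, num_workers=1, num_epochs=10, callbacks=None):
    model = Inpaint_IcoPartConv(SUBDIVISION, config=config)
    strategy = RayStrategy(num_workers=num_workers, use_gpu=use_gpu)
    trainer = pl.Trainer(log_every_n_steps=2, num_sanity_val_steps=2, callbacks=callbacks,
                         max_epochs=num_epochs, strategy=strategy)
    # Passing val_loader makes the validation loop actually run, so validation_epoch_end
    # logs "ptl/val_loss" / "ptl/val_lpips" and TuneReportCallback has something to report.
    trainer.fit(model, train_loader, val_loader)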
