Issues with uniform/loguniform and batch_size after adjustments

Hi,

I used to work with this code a couple of months ago and it worked perfectly. However, when I set up my new Colab workspace, I am running into several issues with Tune. Full workflow:

!pip install ray[tune]
!pip install hpbandster ConfigSpace
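
Since the errors below look version-related, the installed versions could be recorded with something like this (a small sketch; it is not part of my original workflow):

from importlib.metadata import version

# print the versions of the packages involved in the tuning workflow
for pkg in ["ray", "ConfigSpace", "hpbandster", "torch"]:
    print(pkg, version(pkg))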

import numpy as np
import os
import pandas as pd
import seaborn as sns
import torch
import torchvision

from functools import partial
from matplotlib import pyplot as plt
from random import randrange
from ray import train
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import HyperBandForBOHB, ASHAScheduler
from ray.tune.search.bohb import TuneBOHB
from ray.tune.search.hyperopt import HyperOptSearch
from ray.train import Checkpoint, report, get_context
from tabulate import tabulate
from torch import nn, optim
from torch.nn import functional as F
from torch.utils.data import random_split
from torch.utils.data.sampler import SubsetRandomSampler
from torchsummary import summary

import tempfile

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# standard cast into Tensor and pixel values normalization in [-1, 1] range
transform = torchvision.transforms.Compose([
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

# extra transform for the training data, in order to achieve better performance
train_transform = torchvision.transforms.Compose([
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    torchvision.transforms.RandomCrop(32, padding=4, padding_mode='reflect'),
    torchvision.transforms.RandomHorizontalFlip(),
])

trainset = torchvision.datasets.CIFAR10(
    root='./data', train=True, download=True, transform=train_transform
)
validationset = torchvision.datasets.CIFAR10(
    root='./data', train=True, download=True, transform=transform
)
testset = torchvision.datasets.CIFAR10(
    root='./data', train=False, download=True, transform=transform
)

IDX_TO_LABEL = {v: k for k, v in trainset.class_to_idx.items()}

class ResNet(nn.Module):
    def __init__(self):
        super(ResNet, self).__init__()

        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True)
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2)
        )

        self.res1 = nn.Sequential(nn.Sequential(
            nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(inplace=True)
        ), nn.Sequential(
            nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(inplace=True))
        )

        self.conv3 = nn.Sequential(
            nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2)
        )
        self.conv4 = nn.Sequential(
            nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2)
        )

        self.res2 = nn.Sequential(nn.Sequential(
            nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True)
        ), nn.Sequential(
            nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True))
        )

        self.classifier = nn.Sequential(
            nn.MaxPool2d(4),
            nn.Flatten(),
            nn.Linear(512, 10)
        )

    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.res1(x) + x
        x = self.conv3(x)
        x = self.conv4(x)
        x = self.res2(x) + x
        x = self.classifier(x)

        return x
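
As a quick sanity check of the architecture (a minimal sketch, not part of the tuning workflow itself), a single CIFAR-sized input should come out as ten logits:

# sanity check: one dummy CIFAR-10 image in, ten class logits out
check_net = ResNet().to(device)
check_net.eval()
with torch.no_grad():
    print(check_net(torch.randn(1, 3, 32, 32, device=device)).shape)  # torch.Size([1, 10])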

# percentage of training set to use as validation
valid_size = 0.2

# obtain training indices that will be used for validation
num_train = len(trainset)
indices = list(range(num_train))
split = int(np.floor(valid_size * num_train))
train_idx, valid_idx = indices[split:], indices[:split]

# define samplers for obtaining training and validation batches
TRAIN_SAMPLER = SubsetRandomSampler(train_idx)
VALID_SAMPLER = SubsetRandomSampler(valid_idx)

# number of subprocesses to use for data loading
NUM_WORKERS = 2

def data_loaders(trainset, validationset, testset, size):
    trainloader = torch.utils.data.DataLoader(
        trainset, batch_size=size,
        sampler=TRAIN_SAMPLER, num_workers=NUM_WORKERS
    )
    validloader = torch.utils.data.DataLoader(
        validationset, batch_size=size,
        sampler=VALID_SAMPLER, num_workers=NUM_WORKERS
    )
    testloader = torch.utils.data.DataLoader(
        testset, batch_size=size, num_workers=NUM_WORKERS
    )

    return trainloader, validloader, testloader

def train_cifar(
    config, trainset, validationset, testset,
    epochs=10, tuning=False
):
    net = ResNet()
    net.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(
        net.parameters(),
        lr=config['lr'],
        betas=(config['beta1'], config['beta2']),
        amsgrad=config['amsgrad'],
    )

    # With the current Ray Train/Tune API, a checkpoint to restore from is
    # obtained via train.get_checkpoint() (it is None for a fresh trial)
    checkpoint: train.Checkpoint = train.get_checkpoint()
    if checkpoint:
        with checkpoint.as_directory() as checkpoint_dir:
            torch.load(os.path.join(checkpoint_dir, "checkpoint.pt"))
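            # Note: the tuple loaded above is currently discarded. Since the
            # checkpoint is saved below as (net.state_dict(), optimizer.state_dict()),
            # applying it would presumably look like this (a sketch, untested):
            #   model_state, optimizer_state = torch.load(
            #       os.path.join(checkpoint_dir, "checkpoint.pt"))
            #   net.load_state_dict(model_state)
            #   optimizer.load_state_dict(optimizer_state)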

    trainloader, validloader, testloader = data_loaders(
        trainset, validationset, testset, config['batch_size']
    )

    train_loss_list = []
    accuracy_list = []

    # track minimum validation loss
    valid_loss_min = np.inf

    for epoch in range(epochs):

        train_loss = 0.0
        net.train()
        for inputs, labels in trainloader:
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()

            outputs = net(inputs)
            loss = criterion(outputs, labels)

            loss.backward()
            optimizer.step()

            train_loss += loss.item() * inputs.size(0)

        correct = 0
        valid_loss = 0.0
        net.eval()
        for inputs, labels in validloader:
            with torch.no_grad():
                inputs, labels = inputs.to(device), labels.to(device)

                outputs = net(inputs)
                loss = criterion(outputs, labels)

                valid_loss += loss.item() * inputs.size(0)

                predicted = torch.max(outputs.data, 1)[1]

                correct += (predicted == labels).sum().item()

        train_loss = train_loss / len(TRAIN_SAMPLER)
        valid_loss = valid_loss / len(VALID_SAMPLER)
        accuracy = correct / len(VALID_SAMPLER)

        train_loss_list.append(train_loss)
        accuracy_list.append(accuracy)

        if not tuning:
            print(
                f'Epoch: {epoch} \tTraining Loss: {train_loss:.6f} \t'
                f'Validation Loss: {valid_loss:.6f} \t'
                f'Validation Accuracy: {accuracy:.6f}'
            )

            if valid_loss <= valid_loss_min:
                print(
                    'Validation loss decreased ('
                    f'{valid_loss_min:.6f} --> {valid_loss:.6f}).  '
                    'Saving model ...'
                )

                torch.save(net.state_dict(), 'cnn.pt')
                valid_loss_min = valid_loss
        else:
            # Here we save a checkpoint. It is automatically registered with
            # Ray Tune and can be restored via train.get_checkpoint() in a
            # later iteration or when the trial is resumed.
            # (Old Tune API version, kept for reference:)
            # with tune.checkpoint_dir(step=epoch) as checkpoint_dir:
            #     path = os.path.join(checkpoint_dir, 'checkpoint')
            #     torch.save((net.state_dict(), optimizer.state_dict()), path)
            # tune.report(mean_loss=valid_loss, accuracy=accuracy)


            with tempfile.TemporaryDirectory() as temp_checkpoint_dir:
                path = os.path.join(temp_checkpoint_dir, "checkpoint.pt")
                torch.save(
                    (net.state_dict(), optimizer.state_dict()), path
                )
                checkpoint = Checkpoint.from_directory(temp_checkpoint_dir)
                train.report(
                    {"mean_loss": valid_loss, "accuracy": accuracy},
                    checkpoint=checkpoint,
                )

    print('\n----------- Finished Training -----------')

    if not tuning:
        return train_loss_list, accuracy_list

config = {
    'batch_size': tune.choice([16, 32, 128]),
    'lr': tune.loguniform(1e-6, 1e-2),
    'beta1': tune.uniform(0.8, 0.96),
    'beta2': tune.uniform(0.98, 0.999999),
    'amsgrad': tune.choice([True, False]),
}

search_alg = TuneBOHB(metric="mean_loss", mode="min")
scheduler = HyperBandForBOHB(
    time_attr='training_iteration',
    metric='mean_loss',
    mode='min',
    max_t=100
)
reporter = CLIReporter(
    metric_columns=['mean_loss', 'accuracy', 'training_iteration'],
    metric='mean_loss',
    mode='min',
    sort_by_metric=True
)

result = tune.run(
    tune.with_parameters(
        train_cifar,
        tuning=True,
        trainset=trainset,
        validationset=validationset,
        testset=testset
    ),
    num_samples=30,
    resources_per_trial={'gpu': 1},
    scheduler=scheduler,
    config=config,
    search_alg=search_alg,
    progress_reporter=reporter
)

After running the last piece I get:

TypeError                                 Traceback (most recent call last)
in <cell line: 23>()
     21 )
     22
---> 23 result = tune.run(
     24     tune.with_parameters(
     25         train_cifar,

8 frames
/usr/local/lib/python3.10/dist-packages/ray/tune/search/bohb/bohb_search.py in resolve_value(par, domain)
    309     lower = math.ceil(domain.lower / quantize) * quantize
    310     upper = math.floor(domain.upper / quantize) * quantize
--> 311     return ConfigSpace.UniformFloatHyperparameter(
    312         par, lower=lower, upper=upper, q=quantize, log=True
    313     )

TypeError: UniformFloatHyperparameter.__init__() got an unexpected keyword argument 'q'
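
For what it is worth, the failing call seems reproducible directly against ConfigSpace (a minimal sketch, assuming a ConfigSpace release where the `q` keyword has been removed):

import ConfigSpace

# Passing q= at all appears to trigger the same error that Tune's BOHB converter hits
try:
    ConfigSpace.UniformFloatHyperparameter("lr", lower=1e-6, upper=1e-2, q=None, log=True)
    print("no error: this ConfigSpace version still accepts q=")
except TypeError as exc:
    print(exc)  # ...__init__() got an unexpected keyword argument 'q'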

Then, if I replace the uniform/loguniform spaces with plain choices, for example

'lr': ray.tune.choice([0.0001, 0.1])

and so on (an illustrative sketch of such a choice-only config follows below), I get errors related to batch_size:
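
A choice-only search space of that kind might look like this (illustrative only; not necessarily the exact config that produced the log below):

config = {
    'batch_size': tune.choice([16, 32, 128]),
    'lr': tune.choice([0.0001, 0.1]),
    'beta1': tune.choice([0.8, 0.9, 0.96]),
    'beta2': tune.choice([0.98, 0.999]),
    'amsgrad': tune.choice([True, False]),
}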

2024-08-30 09:32:28,133 WARNING tune.py:902 -- AIR_VERBOSITY is set, ignoring passed-in ProgressReporter for now.
+--------------------------------------------------------------+
| Configuration for experiment train_cifar_2024-08-30_09-32-28 |
+--------------------------------------------------------------+
| Search algorithm                 SearchGenerator              |
| Scheduler                        HyperBandForBOHB             |
| Number of trials                 30                           |
+--------------------------------------------------------------+

View detailed results here: /root/ray_results/train_cifar_2024-08-30_09-32-28
To visualize your results with TensorBoard, run: tensorboard --logdir /tmp/ray/session_2024-08-30_07-55-06_272177_260/artifacts/2024-08-30_09-32-28/train_cifar_2024-08-30_09-32-28/driver_artifacts

Trial status: 1 PENDING
Current time: 2024-08-30 09:32:28. Total running time: 0s
Logical resource usage: 0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+--------------------------------+
| Trial name            status   |
+--------------------------------+
| train_cifar_c4eccae3  PENDING  |
+--------------------------------+

Trial train_cifar_c4eccae3 started with configuration:
+-----------------------------------------------+
| Trial train_cifar_c4eccae3 config             |
+-----------------------------------------------+
| amsgrad                      1                |
| batch_size                   16               |
| beta1                        0.90686          |
| beta2                        0.98488          |
| lr                           0.06593          |
+-----------------------------------------------+
(train_cifar pid=31271) Configuration received: {'amsgrad': 1, 'batch_size': 16, 'beta1': 0.906863121917, 'beta2': 0.9848844711283, 'lr': 0.0659343170261}
2024-08-30 09:32:34,367 ERROR tune_controller.py:1331 -- Trial task failed for trial train_cifar_c4eccae3
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/worker.py", line 2661, in get
    values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/worker.py", line 871, in get_objects
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(ValueError): ray::ImplicitFunc.train() (pid=31271, ip=172.28.0.12, actor_id=812653cc902351e920cc5e6401000000, repr=train_cifar)
  File "/usr/local/lib/python3.10/dist-packages/ray/tune/trainable/trainable.py", line 331, in train
    raise skipped from exception_cause(skipped)
  File "/usr/local/lib/python3.10/dist-packages/ray/air/_internal/util.py", line 98, in run
    self._ret = self._target(*self._args, **self._kwargs)
  File "/usr/local/lib/python3.10/dist-packages/ray/tune/trainable/function_trainable.py", line 45, in <lambda>
    training_func=lambda: self._trainable_func(self.config),
  File "/usr/local/lib/python3.10/dist-packages/ray/tune/trainable/function_trainable.py", line 250, in _trainable_func
    output = fn()
  File "/usr/local/lib/python3.10/dist-packages/ray/tune/trainable/util.py", line 130, in inner
    return trainable(config, **fn_kwargs)
  File "", line 29, in train_cifar
  File "", line 5, in data_loaders
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 357, in __init__
    batch_sampler = BatchSampler(sampler, batch_size, drop_last)
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/sampler.py", line 268, in __init__
    raise ValueError(f"batch_size should be a positive integer value, but got batch_size={batch_size}")
ValueError: batch_size should be a positive integer value, but got batch_size=16
(train_cifar pid=31271) Batch size received in data_loaders: 16
(train_cifar pid=31271) TRAIN_SAMPLER: <torch.utils.data.sampler.SubsetRandomSampler object at 0x7903c59f1f90>, VALID_SAMPLER: <torch.utils.data.sampler.SubsetRandomSampler object at 0x7903c59f1ed0>

Trial train_cifar_c4eccae3 errored after 0 iterations at 2024-08-30 09:32:34. Total running time: 6s
Error file: /tmp/ray/session_2024-08-30_07-55-06_272177_260/artifacts/2024-08-30_09-32-28/train_cifar_2024-08-30_09-32-28/driver_artifacts/train_cifar_c4eccae3_1_amsgrad=1,batch_size=16,beta1=0.9069,beta2=0.9849,lr=0.0659_2024-08-30_09-32-28/error.txt

I printed out the batch_size and, well, it is a positive integer.
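
The debug prints behind the "Configuration received" and "Batch size received" lines in the log above were roughly the following (a sketch; they are not included in the workflow listed above):

# added at the top of train_cifar(...)
print(f"Configuration received: {config}")

# added at the top of data_loaders(...)
print(f"Batch size received in data_loaders: {size}")
print(f"TRAIN_SAMPLER: {TRAIN_SAMPLER}, VALID_SAMPLER: {VALID_SAMPLER}")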

Sorry for the chaos, but I have no clue what might be wrong here…