Ray Tune - CUDA OOM Error

Hello, I am trying to build an HPO process with Ray Tune, but in the middle of the run I hit a CUDA OOM error without any message about how much memory was allocated.

I already checked the maximum GPU usage with the largest model and the largest batch size, and a single run fits on the GPU. So my guess is that Ray does not release GPU memory between trials properly.

I would like to ask for your help: what should I try to troubleshoot this?
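
For context, this is roughly how I would log per-trial GPU memory while debugging; a minimal sketch (the helper name and the device index 0 are just placeholders):

import torch

def log_gpu_memory(tag):
    # Peak memory the CUDA caching allocator has handed out since the last reset.
    peak_mb = torch.cuda.max_memory_allocated(0) / 1024 ** 2
    # Memory currently held by live tensors.
    alloc_mb = torch.cuda.memory_allocated(0) / 1024 ** 2
    print(f"[{tag}] peak: {peak_mb:.0f} MiB, currently allocated: {alloc_mb:.0f} MiB")
    torch.cuda.reset_peak_memory_stats(0)

Calling this at the start of each trial's setup() should show whether the previous trial's memory is actually gone before the new model is built.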

import json
import os

import numpy as np
import ray
import torch

from ray import tune
from ray.tune import Stopper
from ray.tune.schedulers import ASHAScheduler
from ray.tune.suggest import ConcurrencyLimiter

from classifitron.config import setup, default_argument_parser
from classifitron.engine.trainer import DefaultTrainer



class Trainable(tune.Trainable):
    
    def setup(self, config):
        self.cfg = config["base"]

        self.cfg.defrost()

        # This is for hyperparameters
        self.cfg.MODEL.ARCHITECTURE = config["ARCHITECTURE"]
        self.cfg.SOLVER.BASE_LR = config["BASE_LR"]
        self.cfg.SOLVER.IMS_PER_BATCH = config["IMS_PER_BATCH"]
        self.cfg.SOLVER.LR_SCHEDULER_TYPE = config["LR_SCHEDULER_TYPE"]
        self.cfg.SOLVER.TYPE = config["SOLVER"]
        self.cfg.DATALOADER.SAMPLER = config["SAMPLER"]
        self.cfg.TEST.IMS_PER_BATCH = 64

        self.cfg.freeze()

        self.trainer = DefaultTrainer(self.cfg)
        self.dataloader_val = self.trainer.data_loader_val

    def step(self):

        total_loss = 0.0
        for i in range(self.cfg.TEST.EVAL_PERIOD):
            self.trainer.global_step += 1
            batch = next(self.trainer.data_loader)

            losses = self.trainer.training_step(batch)
            losses = self.trainer.training_step_end(losses)
            total_loss += losses

        else:
            # no break in the loop above, so this runs once per EVAL_PERIOD training steps
            valid_info = self.trainer.validate(data_loader_val=self.dataloader_val)

        # keep Tune's internal iteration counter in sync with the steps taken above
        self._iteration += self.cfg.TEST.EVAL_PERIOD - 1

        return {
            "loss": float(total_loss.numpy() / self.cfg.TEST.EVAL_PERIOD),
            "acc": float(valid_info["acc"].numpy()),
        }

    def cleanup(self):
        torch.cuda.empty_cache()
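
    # One thing I am considering here (a sketch, not verified yet): drop the trainer and
    # dataloader references explicitly before emptying the cache, so the tensors they
    # hold can be garbage-collected and the caching allocator can actually return memory.
    #
    # def cleanup(self):
    #     del self.trainer
    #     del self.dataloader_val
    #     gc.collect()              # would need "import gc" at the top
    #     torch.cuda.empty_cache()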


class LossStopper(Stopper):
    def __call__(self, trial_id, result):
        if np.isnan(result["loss"]):
            return True
        if result["loss"] > 5000:
            return True

        return False

    def stop_all(self):
        return False


def run_tune(args):
    cfg = setup(args)

    config = {
        "base": cfg,
        "ARCHITECTURE": tune.choice(
            [
                "DenseNet-121",
                "ResNet-50",
                "ResNeXt-101",
            ]
        ),
        "BASE_LR": tune.loguniform(0.00001, 0.05),
        "IMS_PER_BATCH": tune.sample_from(
            lambda _: int(np.random.choice([2 ** k for k in range(5, 8)]))
        ),
        "SOLVER": tune.choice(
            [
                "SGD",
                "Adam",
            ]
        ),
    }

    ray.init(num_gpus=torch.cuda.device_count())

    asha_scheduler = ASHAScheduler(time_attr="training_iteration", max_t=12000, grace_period=1000)
    search_alg = tune.suggest.basic_variant.BasicVariantGenerator(max_concurrent=2)
    search_alg.mode = None

    analysis = tune.run(
        Trainable,
        name="classification",
        config=config,
        scheduler=asha_scheduler,
        metric="acc",
        search_alg=search_alg,
        mode="max",
        num_samples=100,
        stop=LossStopper(),
        resources_per_trial={
            "gpu": cfg.TRAIN.RESOURCES_PER_TRIAL.GPU,
        },
    )



def main():
    parser = default_argument_parser()
    args = parser.parse_args()
    run_tune(args)


if __name__ == "__main__":
    main()
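
One more thing I plan to try is giving each trial a full GPU, to rule out the case where a fractional cfg.TRAIN.RESOURCES_PER_TRIAL.GPU lets two trials share one device so that their peak usage adds up. A minimal sketch of the only argument I would change in the tune.run call above (the value 1 is an assumption; my real value comes from the config):

resources_per_trial={
    "gpu": 1,  # one full GPU per trial, so two trials never share a device
},

Any other ideas for tracking down where the GPU memory goes would be appreciated.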