Hello, I am trying to build an HPO process with Ray Tune.
However, in the middle of the run I hit a CUDA OOM error that does not report any allocated-memory details.
I have already checked the maximum GPU usage with the largest model and the largest batch size.
So my guess is that Ray does not return GPU memory properly between trials.
I would like to ask what I should try to troubleshoot this.
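For reference, this is roughly how I measured the peak usage outside of Ray (a minimal sketch; measure_peak_usage and num_steps are names I made up for this post, and the DefaultTrainer calls mirror the script below):

import torch

from classifitron.config import setup, default_argument_parser
from classifitron.engine.trainer import DefaultTrainer


def measure_peak_usage(args, num_steps=50):
    # Build the trainer with the largest ARCHITECTURE / IMS_PER_BATCH set in the config,
    # run a few training steps, and report the peak CUDA memory statistics.
    cfg = setup(args)
    trainer = DefaultTrainer(cfg)
    torch.cuda.reset_peak_memory_stats()
    for _ in range(num_steps):
        batch = next(trainer.data_loader)
        losses = trainer.training_step(batch)
        trainer.training_step_end(losses)
    # Peak bytes allocated by tensors vs. reserved by the caching allocator
    print("max allocated:", torch.cuda.max_memory_allocated() / 1024 ** 2, "MiB")
    print("max reserved: ", torch.cuda.max_memory_reserved() / 1024 ** 2, "MiB")

The full tuning script follows.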
import json
import os
import numpy as np
import ray
import torch
from ray import tune
from ray.tune import Stopper
from ray.tune.schedulers import ASHAScheduler
from ray.tune.suggest import ConcurrencyLimiter
from classifitron.config import setup, default_argument_parser
from classifitron.engine.trainer import DefaultTrainer

class Trainable(tune.Trainable):
    def setup(self, config):
        self.cfg = config["base"]
        self.cfg.defrost()
        # Apply the hyperparameters sampled by Tune to the config.
        self.cfg.MODEL.ARCHITECTURE = config["ARCHITECTURE"]
        self.cfg.SOLVER.BASE_LR = config["BASE_LR"]
        self.cfg.SOLVER.IMS_PER_BATCH = config["IMS_PER_BATCH"]
        self.cfg.SOLVER.LR_SCHEDULER_TYPE = config["LR_SCHEDULER_TYPE"]
        self.cfg.SOLVER.TYPE = config["SOLVER"]
        self.cfg.DATALOADER.SAMPLER = config["SAMPLER"]
        self.cfg.TEST.IMS_PER_BATCH = 64
        self.cfg.freeze()

        self.trainer = DefaultTrainer(self.cfg)
        self.trainer.data_loader = self.trainer.data_loader
        self.dataloader_val = self.trainer.data_loader_val

    def step(self):
        total_loss = 0.0
        for i in range(self.cfg.TEST.EVAL_PERIOD):
            self.trainer.global_step += 1
            batch = next(self.trainer.data_loader)
            losses = self.trainer.training_step(batch)
            losses = self.trainer.training_step_end(losses)
            total_loss += losses
        valid_info = self.trainer.validate(data_loader_val=self.dataloader_val)
        self._iteration += self.cfg.TEST.EVAL_PERIOD - 1
        return {
            "loss": float(total_loss.numpy() / self.cfg.TEST.EVAL_PERIOD),
            "acc": float(valid_info["acc"].numpy()),
        }

    def cleanup(self):
        torch.cuda.empty_cache()

class LossStopper(Stopper):
    def __call__(self, trial_id, result):
        if np.isnan(result["loss"]):
            return True
        if result["loss"] > 5000:
            return True
        return False

    def stop_all(self):
        return False

def run_tune(args):
    cfg = setup(args)
    config = {
        "base": cfg,
        "ARCHITECTURE": tune.choice(
            [
                "DenseNet-121",
                "ResNet-50",
                "ResNeXt-101",
            ]
        ),
        "BASE_LR": tune.loguniform(0.00001, 0.05),
        "IMS_PER_BATCH": tune.sample_from(
            lambda _: int(np.random.choice([2 ** k for k in range(5, 8)]))
        ),
        "SOLVER": tune.choice(
            [
                "SGD",
                "Adam",
            ]
        ),
    }

    ray.init(num_gpus=torch.cuda.device_count())
    asha_scheduler = ASHAScheduler(time_attr="training_iteration", max_t=12000, grace_period=1000)
    search_alg = tune.suggest.basic_variant.BasicVariantGenerator(max_concurrent=2)
    search_alg.mode = None
    analysis = tune.run(
        Trainable,
        name="classification",
        config=config,
        scheduler=asha_scheduler,
        metric="acc",
        search_alg=search_alg,
        mode="max",
        num_samples=100,
        stop=LossStopper(),
        resources_per_trial={
            "gpu": cfg.TRAIN.RESOURCES_PER_TRIAL.GPU,
        },
    )

def main():
    parser = default_argument_parser()
    args = parser.parse_args()
    run_tune(args)


if __name__ == "__main__":
    main()
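One direction I am considering, but have not verified, is a more aggressive cleanup() that drops the trainer itself instead of only calling empty_cache(), so a trial's model and optimizer tensors can actually be freed before the next trial lands on the same GPU. A sketch of what I mean:

import gc

import torch


def cleanup(self):
    # Drop the references that keep the model/optimizer tensors alive,
    # then force a GC pass and return cached blocks to the CUDA driver.
    del self.trainer
    del self.dataloader_val
    gc.collect()
    torch.cuda.empty_cache()

Is this the right direction, or is there a recommended way to make Ray release GPU memory between trials?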