Hello, I am trying to build an HPO process with Ray Tune.
However, in the middle of the run I hit a CUDA OOM error that does not report any allocated-memory details.
I have already checked the maximum GPU usage with the largest model and the largest batch size.
So my guess is that Ray does not return GPU memory properly between trials.
I would like to ask what I should try to troubleshoot this.
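For reference, this is roughly how I measured the peak usage outside of Ray (a minimal sketch; measure_peak_usage and num_steps are names I made up for this post, and the DefaultTrainer calls mirror the script below):

import torch

from classifitron.config import setup, default_argument_parser
from classifitron.engine.trainer import DefaultTrainer


def measure_peak_usage(args, num_steps=50):
    # Build the trainer with the largest ARCHITECTURE / IMS_PER_BATCH set in the config,
    # run a few training steps, and report the peak CUDA memory statistics.
    cfg = setup(args)
    trainer = DefaultTrainer(cfg)
    torch.cuda.reset_peak_memory_stats()
    for _ in range(num_steps):
        batch = next(trainer.data_loader)
        losses = trainer.training_step(batch)
        trainer.training_step_end(losses)
    # Peak bytes allocated by tensors vs. reserved by the caching allocator
    print("max allocated:", torch.cuda.max_memory_allocated() / 1024 ** 2, "MiB")
    print("max reserved: ", torch.cuda.max_memory_reserved() / 1024 ** 2, "MiB")

The full tuning script follows.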
import json
import os
import numpy as np
import ray
import torch
from ray import tune
from ray.tune import Stopper
from ray.tune.schedulers import ASHAScheduler
from ray.tune.suggest import ConcurrencyLimiter
from classifitron.config import setup, default_argument_parser
from classifitron.engine.trainer import DefaultTrainer

class Trainable(tune.Trainable):
    def setup(self, config):
        self.cfg = config["base"]
        self.cfg.defrost()
        # Apply the hyperparameters sampled by Tune to the config.
        self.cfg.MODEL.ARCHITECTURE = config["ARCHITECTURE"]
        self.cfg.SOLVER.BASE_LR = config["BASE_LR"]
        self.cfg.SOLVER.IMS_PER_BATCH = config["IMS_PER_BATCH"]
        self.cfg.SOLVER.LR_SCHEDULER_TYPE = config["LR_SCHEDULER_TYPE"]
        self.cfg.SOLVER.TYPE = config["SOLVER"]
        self.cfg.DATALOADER.SAMPLER = config["SAMPLER"]
        self.cfg.TEST.IMS_PER_BATCH = 64
        self.cfg.freeze()

        self.trainer = DefaultTrainer(self.cfg)
        self.trainer.data_loader = self.trainer.data_loader
        self.dataloader_val = self.trainer.data_loader_val

    def step(self):
        total_loss = 0.0
        for i in range(self.cfg.TEST.EVAL_PERIOD):
            self.trainer.global_step += 1
            batch = next(self.trainer.data_loader)
            losses = self.trainer.training_step(batch)
            losses = self.trainer.training_step_end(losses)
            total_loss += losses
        valid_info = self.trainer.validate(data_loader_val=self.dataloader_val)
        self._iteration += self.cfg.TEST.EVAL_PERIOD - 1
        return {
            "loss": float(total_loss.numpy() / self.cfg.TEST.EVAL_PERIOD),
            "acc": float(valid_info["acc"].numpy()),
        }

    def cleanup(self):
        torch.cuda.empty_cache()

class LossStopper(Stopper):
    def __call__(self, trial_id, result):
        if np.isnan(result["loss"]):
            return True
        if result["loss"] > 5000:
            return True
        return False

    def stop_all(self):
        return False

def run_tune(args):
    cfg = setup(args)
    config = {
        "base": cfg,
        "ARCHITECTURE": tune.choice(
            [
                "DenseNet-121",
                "ResNet-50",
                "ResNeXt-101",
            ]
        ),
        "BASE_LR": tune.loguniform(0.00001, 0.05),
        "IMS_PER_BATCH": tune.sample_from(
            lambda _: int(np.random.choice([2 ** k for k in range(5, 8)]))
        ),
        "SOLVER": tune.choice(
            [
                "SGD",
                "Adam",
            ]
        ),
    }

    ray.init(num_gpus=torch.cuda.device_count())
    asha_scheduler = ASHAScheduler(time_attr="training_iteration", max_t=12000, grace_period=1000)
    search_alg = tune.suggest.basic_variant.BasicVariantGenerator(max_concurrent=2)
    search_alg.mode = None
    analysis = tune.run(
        Trainable,
        name="classification",
        config=config,
        scheduler=asha_scheduler,
        metric="acc",
        search_alg=search_alg,
        mode="max",
        num_samples=100,
        stop=LossStopper(),
        resources_per_trial={
            "gpu": cfg.TRAIN.RESOURCES_PER_TRIAL.GPU,
        },
    )

def main():
    parser = default_argument_parser()
    args = parser.parse_args()
    run_tune(args)


if __name__ == "__main__":
    main()
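One direction I am considering, but have not verified, is a more aggressive cleanup() that drops the trainer itself instead of only calling empty_cache(), so a trial's model and optimizer tensors can actually be freed before the next trial lands on the same GPU. A sketch of what I mean:

import gc

import torch


def cleanup(self):
    # Drop the references that keep the model/optimizer tensors alive,
    # then force a GC pass and return cached blocks to the CUDA driver.
    del self.trainer
    del self.dataloader_val
    gc.collect()
    torch.cuda.empty_cache()

Is this the right direction, or is there a recommended way to make Ray release GPU memory between trials?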