How severely does this issue affect your experience of using Ray?
- High: It blocks me from completing my task.
I’m running Ray Tune 2.2.0 with a PyTorch Lightning module and found that tune.report(…) inside TuneReportCallback is unable to relay metrics back to the Ray session. Diving deeper, I found that the Ray session is disabled during the training/validation steps of the PyTorch Lightning module.
This is the error I have been receiving:
Session not detected. You should not be calling report outside tuner.fit() or while using the class API.
…
ValueError: Trial returned a result which did not include the specified metric(s) ptl/val_loss that tune.TuneConfig() expects. Make sure your calls to tune.report() include the metric, or set the TUNE_DISABLE_STRICT_METRIC_CHECKING environment variable to 1.
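For reference, this is how I understand the reporting contract the error refers to: every result a trial sends back must contain the metric named in tune.TuneConfig, and the report call only works while a Tune session is active. A toy function trainable that satisfies the check would look roughly like this (purely illustrative, not my actual code):

from ray import tune
from ray.air import session

def toy_trainable(config):
    # Each reported result must contain the metric that
    # tune.TuneConfig(metric=...) is configured to track, and
    # session.report() only works inside a live Tune session.
    for step in range(3):
        fake_val_loss = 1.0 / (step + 1)
        session.report({"ptl/val_loss": fake_val_loss})

tuner = tune.Tuner(
    toy_trainable,
    tune_config=tune.TuneConfig(metric="ptl/val_loss", mode="min", num_samples=1),
)
tuner.fit()

In my setup the equivalent report is supposed to come from TuneReportCallback, but the session check seems to fail before that point.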
This is a simplified version of my code:
from typing import Dict

import torch
import torch.nn as nn
import pytorch_lightning as pl
from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from torchmetrics import MeanAbsoluteError, MeanSquaredError
from hydra.utils import instantiate  # dataset is instantiated from a Hydra config

import ray
from ray import air, tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler
from ray.tune.integration.pytorch_lightning import TuneReportCallback

def inside_tune():
    return ray.tune.is_session_enabled()
class HPDeepRegressionModel(pl.LightningModule):
    def __init__(self, cfg):
        print("Initializing HPDeepRegressionModel...")
        super().__init__()
        self.save_hyperparameters()
        print("Init: Inside raytune session ", inside_tune())

        # hyperparameters
        self.in_length = cfg['in_length']
        self.num_blocks = cfg['num_blocks']
        self.layer_length = cfg['layer_width']
        self.out_length = cfg['out_length']
        self.dropout_rate = cfg['dropout']
        self.optimizer_lr = cfg['optimizer_lr']

        self._create_layers()
        self.init_metrics()

        print("Using default MSELoss...")
        self.loss = nn.MSELoss()
        print("Init end: Inside raytune session ", inside_tune())

    def _create_layers(self):
        self.layers = nn.ModuleList()
        self.layers.append(nn.Linear(self.in_length, self.layer_length))
        for _ in range(self.num_blocks - 1):
            self.layers.append(nn.Linear(self.layer_length, self.layer_length))
            self.layers.append(nn.ReLU())
            self.layers.append(nn.Dropout(self.dropout_rate))
        self.layers.append(nn.Linear(self.layer_length, self.out_length))

    def init_metrics(self):
        # loss metrics
        self.train_mse_loss = MeanSquaredError()
        self.val_mse_loss = MeanSquaredError()
        self.train_mae_loss = MeanAbsoluteError()
        self.val_mae_loss = MeanAbsoluteError()

    def _shared_forward(self, x: Dict[str, torch.Tensor]):
        out = x['features']
        for layer in self.layers:
            out = layer(out)
        return out

    def forward(self, x: Dict[str, torch.Tensor]):
        return self._shared_forward(x)

    def training_step(self, batch, batch_idx):
        print("Train: Inside raytune session ", inside_tune())
        model_output = self.forward(batch)
        batch_size = batch['target'].shape[0]
        targets = batch['target'].to(model_output.dtype)

        loss = self.loss(model_output, targets)
        self.log("train_loss", loss, on_step=True, on_epoch=True,
                 prog_bar=True, logger=True, batch_size=batch_size)
        self.train_mse_loss(model_output, targets)
        self.log("train_mse_loss", self.train_mse_loss, on_step=True, on_epoch=True,
                 prog_bar=True, logger=True, batch_size=batch_size)
        self.train_mae_loss(model_output, targets)
        self.log("train_mae_loss", self.train_mae_loss, on_step=True, on_epoch=True,
                 prog_bar=True, logger=True, batch_size=batch_size)
        print("Train end: Inside raytune session ", inside_tune())
        return loss

    def validation_step(self, batch, batch_idx):
        model_output = self.forward(batch)
        batch_size = batch['target'].shape[0]
        targets = batch['target'].to(model_output.dtype)
        print("Valid: Inside raytune session ", inside_tune())

        loss = self.loss(model_output, targets)
        self.log("val_loss", loss, on_step=True, on_epoch=True,
                 prog_bar=True, logger=True, batch_size=batch_size)
        self.val_mse_loss(model_output, targets)
        self.log("valid_mse_loss", self.val_mse_loss, on_step=True, on_epoch=True,
                 prog_bar=True, logger=True, batch_size=batch_size)
        self.val_mae_loss(model_output, targets)
        self.log("valid_mae_loss", self.val_mae_loss, on_step=True, on_epoch=True,
                 prog_bar=True, logger=True, batch_size=batch_size)
        return loss

    def validation_epoch_end(self, output):
        # `output` is the list of per-batch losses returned by validation_step
        print("On validation epoch end: ray session: ", inside_tune())
        self.log("ptl/val_loss", float(output[0]))

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.optimizer_lr)

    # Note: the Lightning hooks are named on_train_epoch_start/on_train_epoch_end,
    # so the two methods below are never called (their prints don't appear in the
    # output further down).
    def train_epoch_start(self):
        print("On train epoch start: ray session: ", inside_tune())

    def train_epoch_end(self):
        print("On train epoch end: ray session: ", inside_tune())
def main(cfg, verbose: bool = True):
    logger = TensorBoardLogger(save_dir=cfg.logging.path, version=cfg.logging.name, name="")
    dm = instantiate(cfg.dataset)  # Hydra-instantiated LightningDataModule

    # Ray Tune hyperparameter search
    results = ray_tune_train(cfg, dm, logger, verbose=verbose)
    return results
def train_fn(model_cfg: Dict, cfg, data_module: pl.LightningDataModule,
             logger: TensorBoardLogger, verbose: bool = True):
    model = HPDeepRegressionModel(model_cfg)
    print("Initialized model: Inside raytune session: ", inside_tune())

    lr_monitor = LearningRateMonitor(logging_interval='step')
    checkpoint_callback = ModelCheckpoint(save_top_k=cfg.checkpoint.save_top_k, monitor="epoch",
                                          mode="max", filename="model-{epoch}")
    # Forwards "ptl/val_loss" to Ray Tune after each validation run
    tune_report_callback = TuneReportCallback(["ptl/val_loss"], on="validation_end")

    print("Initializing trainer: Inside raytune session: ", inside_tune())
    trainer = pl.Trainer(
        **cfg.trainer,
        logger=logger,
        callbacks=[lr_monitor, checkpoint_callback, tune_report_callback]
    )
    print("Initialized trainer: Inside raytune session: ", inside_tune())

    print("Trainer fit: Inside raytune session: ", inside_tune())
    # get_checkpoint_path is a project helper (not shown here)
    trainer.fit(model, datamodule=data_module, ckpt_path=get_checkpoint_path(cfg))
    print("Trainer fit end: Inside raytune session: ", inside_tune())
    print("Trainer callback metrics: ", trainer.callback_metrics)
def ray_tune_train(cfg, data_module: pl.LightningDataModule, logger: TensorBoardLogger, verbose: bool = True):
    print("--Started raytune session: ", inside_tune())
    num_epochs = cfg.trainer.max_epochs
    num_gpus = 0
    gpus_per_trial = 0
    num_hp_samples = 5  # number of times to sample from the hyperparameter space

    hp_configs = {
        "layer_width": tune.randint(8, 32),
        "num_blocks": tune.randint(1, 32),
        "dropout": tune.uniform(0, 0.5),
        "optimizer_lr": tune.loguniform(1e-4, 1e-1),
        "in_length": cfg.model.nn.in_length,
        "out_length": cfg.model.nn.out_length,
    }

    train_fn_with_parameters = tune.with_parameters(train_fn,
                                                    cfg=cfg,
                                                    data_module=data_module,
                                                    logger=logger,
                                                    verbose=verbose)
    scheduler = ASHAScheduler(
        max_t=num_epochs,
        grace_period=1,
        reduction_factor=2)
    reporter = CLIReporter(
        parameter_columns=["layer_width", "num_blocks", "dropout", "optimizer_lr"],
        metric_columns=["ptl/val_loss", "training_iteration"])

    tuner = tune.Tuner(
        train_fn_with_parameters,
        tune_config=tune.TuneConfig(
            metric="ptl/val_loss",
            mode="min",
            scheduler=scheduler,
            num_samples=num_hp_samples,
        ),
        run_config=air.RunConfig(
            local_dir="./ray_results",
            name="tune_asha_regression",
            progress_reporter=reporter,
        ),
        param_space=hp_configs,
    )
    results = tuner.fit()

    print("Best hyperparameters found were: ", results.get_best_result().config)
    return results
if __name__ == "__main__":
    main(cfg)  # cfg is built elsewhere (Hydra config in the full script)
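For context, my understanding is that TuneReportCallback essentially pulls the requested metrics out of trainer.callback_metrics and forwards them with tune.report when the configured hook fires. A rough sketch of that behaviour (my own simplification, not the actual Ray implementation):

import pytorch_lightning as pl
from ray import tune

class SketchTuneReportCallback(pl.Callback):
    # Rough equivalent of what I understand
    # TuneReportCallback(["ptl/val_loss"], on="validation_end") to do.
    def __init__(self, metrics):
        self.metrics = metrics

    def on_validation_end(self, trainer, pl_module):
        report = {name: trainer.callback_metrics[name].item() for name in self.metrics}
        tune.report(**report)  # this is the call that runs into "Session not detected"

Either way, the report is attempted from inside Lightning's validation loop, which is exactly where the session check fails for me.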
From the inside_tune() checks sprinkled through the code above, I received the following print output:
Init: Inside raytune session True
Init end: Inside raytune session True
Initialized model: Inside raytune session: True
Initializing trainer: Inside raytune session: True
Initialized trainer: Inside raytune session: True
Trainer fit: Inside raytune session: True
Train: Inside raytune session False
Train end: Inside raytune session False
Valid: Inside raytune session False
On validation epoch end: ray session: False
Trainer fit end: Inside raytune session: True
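So the session is detected everywhere except inside Lightning's step and epoch hooks. For comparison, a bare function trainable with no Lightning involved could probe the same flag; a minimal sketch (illustrative only, not from my project):

import ray
from ray import tune
from ray.air import session

def probe_trainable(config):
    # Same check as inside_tune(), but from a plain Tune function trainable.
    print("Bare trainable: Inside raytune session ", ray.tune.is_session_enabled())
    session.report({"ptl/val_loss": 0.0})

tune.Tuner(
    probe_trainable,
    tune_config=tune.TuneConfig(metric="ptl/val_loss", mode="min"),
).fit()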
Any insights on how to solve this issue would be of great help. Thanks!