def main(num_samples=10, max_num_epochs=10):
data_dir = os.path.abspath("./state-10")
config = {
'encoder': [501,1024,512,64,32,8,8],
'decoder': [8,16,32,64,128,501],
'loss1': tune.grid_search([0.001, 0.01, 0.1, 1.0]),
'loss2': tune.grid_search([0.001, 0.01, 0.1, 1.0]),
'loss3': tune.grid_search([0.001, 0.01, 0.1, 1.0]),
'lr': tune.loguniform(1e-4, 1e-1),
'batch_size': tune.choice([2, 4, 8, 16]),
'ts': tune.choice([2, 4, 5, 10]),
'IsOld':'N'
}
trainer = train_PMV_01(config,checkpoint_dir=None, data_dir=data_dir)
scheduler = ASHAScheduler(
metric="loss",
mode="min",
max_t=max_num_epochs,
grace_period=1,
reduction_factor=2)
reporter = CLIReporter(
metric_columns=["loss", "training_iteration"])
results = tune.run(
train_PMV_01,
config=config,
fail_fast="raise",
num_samples=num_samples,
scheduler=scheduler,
progress_reporter=reporter)
best_trial = results.get_best_trial("loss", "min", "last")
print("Best trial config: {}".format(best_trial.config))
print("Best trial final validation loss: {}".format(
best_trial.last_result["loss"]))
print("Best trial final validation accuracy: {}".format(
best_trial.last_result["accuracy"]))
best_trained_model = model(best_trial.config['encoder'], best_trial.config['decoder'])
device = "cpu"
if torch.cuda.is_available():
device = "cuda:0"
if gpus_per_trial > 1:
best_trained_model = nn.DataParallel(best_trained_model)
best_trained_model.to(device)
best_checkpoint_dir = best_trial.checkpoint.value
model_state, optimizer_state = torch.load(os.path.join(
best_checkpoint_dir, "checkpoint"))
best_trained_model.load_state_dict(model_state)
Thanks for responding. The config parameters are passed through the train_PMV_01 as the following:
def train_PMV_01(config,checkpoint_dir=None, data_dir=None):
model = K_autoencoder_01(config['encoder'],config['decoder'])
model.train()
device = "cpu"
model.to(device)
loss_function = loss(config['loss1'], config['loss2'], config['loss3'], config['ts'])
train_loader, val_loader, test_loader = get_dataloaders(data_dir, config['batch_size'])
optimizer = torch.optim.Adam (model.parameters(), lr=config['lr'])
if checkpoint_dir:
model_state, optimizer_state = torch.load(os.path.join(checkpoint_dir, "checkpoint"))
model.load_state_dict(model_state)
optimizer.load_state_dict(optimizer_state)
for epoch in range(10):
running_loss = 0.0
epoch_steps = 0
for i, data in enumerate(train_loader):
X=data
X = X.to(device)
model.zero_grad()
Loss = loss_function(model,X)
Loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), 10.)
optimizer.step()
running_loss += Loss.item()
epoch_steps += 1
if i % 2000 == 1999:
print("[%d, %5d] loss: %.3f" % (epoch + 1, i + 1,
running_loss / epoch_steps))
running_loss = 0.0
val_loss = 0.0
val_steps = 0
for i, data in enumerate(val_loader):
with torch.no_grad():
X=data
X = X.to(device)
Loss = loss_function(model,X)
Loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), 10.)
optimizer.step()
val_loss += Loss.item()
val_steps += 1
with tune.checkpoint_dir(epoch) as checkpoint_dir:
path = os.path.join(checkpoint_dir, "checkpoint")
torch.save((model.state_dict(), optimizer.state_dict()), path)
tune.report(loss=(val_loss / val_steps))
print("Finished Training")
I would receive this error when I try to run the main function.
TypeError: unsupported operand type(s) for +: 'NoneType' and 'str'
RayTaskError(TypeError): ray::ImplicitFunc.train() (pid=14256, ip=127.0.0.1, repr=train_PMV_01)