Hi Kai,
This issue happens 25-50% of the time; the other weight and parameter transfers succeed. Also, I am sure that all checkpoints are saved.
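(I checked that with a quick scan of the trial directories, roughly like the snippet below; a sketch, assuming Tune's default "checkpoint_<step>" directory naming under local_dir.)

    import glob
    import os

    # count checkpoint directories under the Tune working dir
    # (assumes Tune's default "checkpoint_<step>" naming)
    ckpt_dirs = glob.glob(
        os.path.join(self.tuner_workdir, "**", "checkpoint_*"),
        recursive=True)
    print(f"{len(ckpt_dirs)} checkpoints on disk")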
I have a pretty long class; I hope this excerpt is enough:
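For completeness, these are the imports the snippets below rely on (Ray 1.x module paths; adjust if your version differs):

    import numpy as np
    from tensorflow.keras.layers import LeakyReLU
    from ray import tune
    from ray.tune.schedulers import PopulationBasedTraining
    from ray.tune.stopper import MaximumIterationStopper
    from ray.tune.integration.keras import TuneReportCheckpointCallback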
Hyperparameters
self.hypers = {
    'inc_layers'       : [0, 1, 2],
    'ker1'             : {'0': [1, 2, 3], '1': [1, 2, 3]},
    'ker2'             : {'0': [1, 2, 3], '1': [1, 2, 3]},
    'FEAT_ker_mult1'   : lambda: np.random.randint(1, 4),    # upper bound exclusive
    'FEAT_ker_mult2'   : lambda: np.random.randint(1, 4),
    'FEAT_n_fil'       : [16, 32, 64, 98],
    'GRU_encoder'      : lambda: np.random.randint(0, 4),
    'GRU_nodes'        : lambda: np.random.randint(5, 100),  # upper bound exclusive
    'attention'        : ['both', 'first', 'last'],
    'BahN'             : lambda: np.random.randint(5, 50),
    'comm_D_Drop'      : lambda: np.random.uniform(0.85, 0.95),
    'common_dense_num' : lambda: np.random.randint(1, 4),
    'comm_D_N'         : lambda: np.random.randint(50, 300),
    'maxP'             : lambda: np.random.randint(1, 5),
    'bs'               : [8, 16, 24],
    'lr'               : [0.01, 0.005, 0.001, 0.0005, 0.0001]
}
Fixed parameters
self.conf = {
    # model related
    'loss'       : 'mse',
    'opt'        : 'adam',
    'metrics'    : 'mape',
    'val_split'  : 0.25,
    'verb'       : 1,
    'bs'         : 64,
    'activation' : LeakyReLU(alpha=0.05)
}
Tuner
def tune(self):
    scheduler = PopulationBasedTraining(
        time_attr="training_iteration",
        perturbation_interval=2,
        hyperparam_mutations=self.hypers
    )
    stopper = MaximumIterationStopper(max_iter=10)
    analysis = tune.run(
        self.fit_tuner,
        scheduler=scheduler,
        stop=stopper,                  # the stopper was defined but never passed in
        raise_on_failed_trial=False,
        metric='val_loss',             # must match the metric the callback reports
        mode='min',
        verbose=1,
        local_dir=self.tuner_workdir,
        num_samples=5,
        config=self.conf
    )
    return analysis
Trainable
def fit_tuner(self, hp, checkpoint_dir=None):
    # note: checkpoint_dir is accepted but never used, so weights
    # are not restored manually here
    callbacks = [TuneReportCheckpointCallback(
        metrics={"val_loss": "val_loss"},
        filename="model",
        on="epoch_end")]
    nn = self.model(hp)  # self.model returns a compiled model
    nn.fit(
        self.train_X, self.train_Y,
        validation_split=self.conf['val_split'],
        verbose=self.conf['verb'],
        batch_size=hp['bs'],
        epochs=10,
        callbacks=callbacks)
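For what it's worth, I am not reloading anything from checkpoint_dir myself. If I had to restore manually, I would expect something along these lines (a minimal sketch, assuming TuneReportCheckpointCallback saved the full model as "<checkpoint_dir>/model" via model.save):

    import os
    from tensorflow import keras

    def fit_tuner(self, hp, checkpoint_dir=None):
        callbacks = [TuneReportCheckpointCallback(
            metrics={"val_loss": "val_loss"},
            filename="model",
            on="epoch_end")]
        if checkpoint_dir is not None:
            # resume from the model saved by the checkpoint callback
            nn = keras.models.load_model(os.path.join(checkpoint_dir, "model"))
        else:
            nn = self.model(hp)
        nn.fit(
            self.train_X, self.train_Y,
            validation_split=self.conf['val_split'],
            verbose=self.conf['verb'],
            batch_size=hp['bs'],
            epochs=10,
            callbacks=callbacks)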
Thanks!