I'm running the following code to pass environment variables to the Ray Tune `Tuner`:
```python
import os

import ray
import torch
from ray import train, tune
from ray.train import Checkpoint, CheckpointConfig, RunConfig, SyncConfig


def train_model(config):
    .....
    # Create a checkpoint for this epoch.
    torch.save(
        {"epoch": epoch, "model_state_dict": model.state_dict()},
        os.path.join(
            checkpoint_dir,
            f"{train.get_context().get_trial_name()}_model_epoch{str(epoch).zfill(8)}.pt",
        ),
    )
    checkpoint = Checkpoint.from_directory(checkpoint_dir)
    metrics = {"loss": running_loss / epoch_steps}
    train.report(metrics=metrics, checkpoint=checkpoint)
run_config = RunConfig(
    name="LSTM_AE",
    storage_path=os.path.abspath(TUNE_CHECKPOINT),
    checkpoint_config=CheckpointConfig(num_to_keep=1),
    sync_config=SyncConfig(
        sync_artifacts=True,
        sync_timeout=600000,
    ),
    log_to_file=True,
    verbose=1,
)
tune_resources = {"cpu": 8}
ray.init(
    runtime_env={
        "env_vars": {
            "PYTHONWARNINGS": "ignore::DeprecationWarning",
            "TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S": "600",
        }
    },
    ignore_reinit_error=True,
)
trainable_with_resources = tune.with_resources(train_model, tune_resources)
tuner = tune.Tuner(
    trainable=trainable_with_resources,
    tune_config=tune_config,
    param_space=param_space,
    run_config=run_config,
)
results = tuner.fit()
```
but I still get the following warning:
```
Experiment checkpoint syncing has been triggered multiple times in the last 30.0 seconds. A sync will be triggered whenever a trial has checkpointed more than `num_to_keep` times since last sync or if 300 seconds have passed since last sync. If you have set `num_to_keep` in your `CheckpointConfig`, consider increasing the checkpoint frequency or keeping more checkpoints. You can supress this warning by changing the `TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S` environment variable.
```
The same problem occurs even if I comment out the `sync_config` option.
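My understanding (possibly wrong) is that `env_vars` under `runtime_env` are applied to Ray worker processes; if this warning is emitted by the driver process, the variable might need to be set there before Ray starts. A minimal sketch of what I could try instead, assuming that is the case:

```python
import os

# Assumption: the excessive-sync warning is emitted by the Tune driver
# process, so the variable may need to exist in the driver's environment
# before Ray starts, rather than only in the workers' runtime_env.
os.environ["TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S"] = "600"

import ray

ray.init(ignore_reinit_error=True)
```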
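Following the warning's own suggestion, I could also keep more checkpoints per trial so syncs are triggered less often; a sketch, where the value 10 is an arbitrary choice:

```python
from ray.train import CheckpointConfig

# Keep more checkpoints per trial so a sync is not triggered as soon as a
# trial has checkpointed more than `num_to_keep` times; 10 is arbitrary.
checkpoint_config = CheckpointConfig(num_to_keep=10)
```

But I would still like to understand why the environment variable is not being picked up.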