In my current setup I would like to upload any checkpoints created during tune runs to an S3-compatible storage, so that I can serve the best models at a later point. To achieve this I created a sync config and a sync function.
When I start this setup, the sync function is called once at the start of the tune run but never after that. Currently I’m trying to get this to work on my local machine, but I would also like to run it in a Ray cluster.
def sync_func(local, remote):
    """Upload every file found under ``local`` to the "ray-tests" bucket.

    The directory tree rooted at ``local`` is walked recursively. Each file
    is uploaded under the key ``<remote>/<dir>/<file>``, where ``<dir>`` is
    the name of the directory that directly contains the file. When that
    name is empty the key is ``<remote>/<file>``.

    Args:
        local: Path of the local directory whose files are uploaded.
        remote: Key prefix inside the bucket. Backslashes are converted to
            forward slashes before the prefix is used.
    """
    # Function-scope imports keep this callback self-contained.
    import os

    import boto3

    s3 = boto3.resource(
        "s3",
        endpoint_url="...",
        aws_access_key_id="...",
        aws_secret_access_key="...",
    )
    bucket = s3.Bucket("ray-tests")

    # Object keys use "/" as separator; normalise the prefix once instead of
    # once per uploaded file.
    remote_prefix = remote.replace("\\", "/")

    for root, _subdirs, filenames in os.walk(local):
        # NOTE(review): only the last path component of ``root`` ends up in
        # the key, so files in deeper directories that share a directory
        # name map to the same prefix -- confirm this layout is intended.
        dir_name = os.path.basename(root)
        for filename in filenames:
            local_path = os.path.join(root, filename)
            # Compare by value: ``is not ""`` tests object identity, which
            # is implementation-dependent for strings.
            if dir_name != "":
                key_suffix = f"{dir_name}/{filename}"
            else:
                key_suffix = filename
            key = f"{remote_prefix}/{key_suffix}"
            print(f"Trying to upload file {key}")
            bucket.upload_file(local_path, key)
# Sync settings handed to tune.run: uploads go through the custom
# ``sync_func`` callback, target the "trials" upload directory, and are
# requested on every checkpoint.
# NOTE(review): no sync interval is configured here, so the library default
# applies -- confirm it against how often checkpoints are written.
sync_config = SyncConfig(
    upload_dir="trials",
    sync_to_cloud=sync_func,
    sync_on_checkpoint=True,
)
# Launch the Tune experiment with the IMPALA trainer.
tune.run(
    ImpalaTrainer,
    config=config,
    # Optimisation target: maximise the mean episode reward.
    metric="episode_reward_mean",
    mode="max",
    # Checkpoint with a frequency of 15 and once more when a trial ends.
    checkpoint_freq=15,
    checkpoint_at_end=True,
    # Name each trial directory after the trial's id.
    trial_dirname_creator=lambda trial: trial.trial_id,
    sync_config=sync_config,
)