I am trying to understand how to use ray.tune.Tuner.restore
method and I wrote the below piece of code:
import argparse
from ray import tune, train
import tempfile
import numpy as np
import os
import pickle
import time
from ray.train import Checkpoint
def trainable(config: dict):
values = []
checkpoint = train.get_checkpoint()
if checkpoint:
with checkpoint.as_directory() as checkpoint_dir:
with open(os.path.join(checkpoint_dir, 'values.pkl'), 'rb') as f:
values = pickle.load(f)
for i in range(100):
value = np.random.uniform(config['mean'], config['std'])
time.sleep(1)
values.append(value)
with tempfile.TemporaryDirectory() as tempdir:
with open(os.path.join(tempdir, 'values.pkl'), 'wb') as f:
pickle.dump(values, f)
train.report(metrics={'score': np.mean(values)},
checkpoint=Checkpoint.from_directory(tempdir))
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--restore', action='store_true', default=False)
args = parser.parse_args()
tune_config = tune.TuneConfig(mode='min', num_samples=10, metric='score')
storage_dir = '/Users/arun/code/ml/ray/fault_tolerance'
run_config = train.RunConfig(storage_path=storage_dir)
if not args.restore:
tuner = tune.Tuner(trainable,
param_space={
"mean": tune.uniform(5, 10),
"std": tune.uniform(1, 2)
},
tune_config=tune_config,
run_config=run_config)
else:
tuner = tune.Tuner.restore(storage_dir,
trainable=trainable,
resume_errored=True)
results = tuner.fit()
best_result = results.get_best_result()
all_results = results.get_dataframe()
print(all_results)
print(best_result)
After I started the job as python3 restore.py
, I terminated it manually and restarted it as python3 restore.py --restore
to check whether the process is getting restored or not. When I run python3 restore.py --restore
, I get the following error:
Traceback (most recent call last):
File "/Users/arun/code/ml/ray/restore.py", line 47, in <module>
tuner = tune.Tuner.restore(storage_dir,
File "/Users/arun/Applications/miniconda3/envs/dc/lib/python3.9/site-packages/ray/tune/tuner.py", line 251, in restore
tuner_internal = TunerInternal(
File "/Users/arun/Applications/miniconda3/envs/dc/lib/python3.9/site-packages/ray/tune/impl/tuner_internal.py", line 119, in __init__
self._restore_from_path_or_uri(
File "/Users/arun/Applications/miniconda3/envs/dc/lib/python3.9/site-packages/ray/tune/impl/tuner_internal.py", line 337, in _restore_from_path_or_uri
with fs.open_input_file(os.path.join(fs_path, _TUNER_PKL)) as f:
File "pyarrow/_fs.pyx", line 770, in pyarrow._fs.FileSystem.open_input_file
File "pyarrow/error.pxi", line 144, in pyarrow.lib.pyarrow_internal_check_status
File "pyarrow/error.pxi", line 113, in pyarrow.lib.check_status
FileNotFoundError: [Errno 2] Failed to open local file '/Users/arun/code/ml/ray/fault_tolerance/tuner.pkl'. Detail: [errno 2] No such file or directory
I have the following questions: qn 1: Is the above piece of logically correct? qn2: What is the right way to use tune.Tuner.restore?