I am trying to run a second piece of Ray code after Ray Tune completes, but I am met with this error:
2023-12-14 22:10:04,454 WARNING tune_controller.py:746 -- Trial controller checkpointing failed: [Errno 2] No such file or directory:
If I run only the tuning step below and stop there, there are no problems:
# Stage 1: hyperparameter search with Ray Tune.
# The trainable is tune_obj with the data and forecast settings bound to it.
_trainable = tune.with_parameters(tune_obj, df=df, horizon=horizon, freq=freq)

# Keyword options for tune.run(), listed in the order they are passed.
_tune_options = dict(
    num_samples=2,
    config=config,
    scheduler=scheduler,
    # NOTE(review): local_dir and storage_path both point at output_path;
    # recent Ray releases appear to favour storage_path alone -- confirm
    # against the installed Ray version before dropping either one.
    local_dir=output_path,
    # tried fiddling with checkpoint opts
    checkpoint_config=CheckpointConfig(num_to_keep=1, checkpoint_frequency=0),
    storage_path=output_path,
    resume=False,
    search_alg=HyperOptSearch(metric='mse', mode='min'),
    max_concurrent_trials=2,
    progress_reporter=reporter,
    resources_per_trial={"gpu": 1},
)
analysis = tune.run(_trainable, **_tune_options)

# Look up the best configuration, ranking trials by "mse" in "min" mode.
best_config = analysis.get_best_config(metric="mse", mode="min")
print("Best Config:", best_config)
After tuning, I want to run the second stage shown below. However, any other Ray operation that runs after the tuning step produces the Ray Tune checkpoint error shown above.
@ray.remote(num_gpus=1)
class BatchPredictor:
    """Ray actor, declared with ``num_gpus=1``, that returns forecasts.

    The method bodies are elided (``...``) in this snippet, so only the
    parts that are actually shown are described here.
    """

    def __init__(self, model, data):
        """Store ``model`` on the actor; the remaining setup is elided."""
        self.model = model
        ...

    def predict(self, feature_name):
        """Return ``forecasts`` for ``feature_name`` (computation elided)."""
        ...
        return forecasts
# Stage 2: batch prediction with a pool of actors.
# Put the model and the test frame into the Ray object store once; the
# resulting references are what gets handed to every actor below.
model_ref = ray.put(nf_best)
test_df_ref = ray.put(test_df)

# Create the predictor actors and wrap them in an ActorPool.
num_actors = 24
actors = [
    BatchPredictor.remote(model_ref, test_df_ref)
    for _ in range(num_actors)
]
pool = ActorPool(actors)

feature_names = test_df['unique_id'].unique()


def _predict_on(actor, feature_name):
    """Launch ``predict`` on ``actor`` for a single feature name."""
    return actor.predict.remote(feature_name)


# Submit one prediction task per feature name.
for name in feature_names:
    pool.submit(_predict_on, name)

# Drain the pool until no submitted task is left.
# NOTE(review): tdf is rebound on every pass, so as written only the last
# result is still referenced after the loop -- confirm whether the results
# are accumulated in code that is not shown here.
while pool.has_next():
    tdf = pool.get_next()