Hi, I am facing an issue with Ray Tune, probably specific to newer versions.
Let's say I start Ray Tune with the minimal code below:
import os
import time
from ray import tune
from ray.air import session, RunConfig, CheckpointConfig
import ray
from ray.air import session
import json
# Working directory at launch; joined with exp_output_root to form the Tune storage path.
curdir = os.getcwd()
# Sub-directory (under curdir) that holds all experiment outputs.
exp_output_root = "ray_outputs"
# Experiment name passed to RunConfig(name=...).
exp_name = "demo_exp"
def train_fn(config):
    """Demo trainable: report every iteration and periodically dump results.

    Each iteration reports ``{"iter": i}`` through ``session.report`` and
    records a fixed-accuracy entry. Every 1000th iteration the accumulated
    entries are written to ``exps_outputs.json`` inside the directory returned
    by ``session.get_trial_dir()``.

    Args:
        config: Configuration passed in by Ray Tune; not read by this function.
    """
    outdir = session.get_trial_dir()
    print("======> Trial dir seen inside trainable:", outdir)

    # The dump target never changes, so build the path once.
    dump_path = os.path.join(outdir, "exps_outputs.json")
    history = {}

    for i in range(100000):
        session.report({"iter": i})
        history[i] = {"accuracy": 100, "iter": i}

        # Only every 1000th iteration is written to disk.
        if i % 1000 != 0:
            continue

        with open(dump_path, "w") as fh:
            json.dump(history, fh, indent=4)
        print(f"saving the results file to {outdir} at iter: {i}")
        time.sleep(2)

    print("======> Trial dir seen inside trainable:", session.get_trial_dir())
if True:
    # First (fresh) run: results are stored under <curdir>/<exp_output_root>/<exp_name>.
    run_config = RunConfig(
        name=exp_name,
        # Keep experiment outputs under the project directory.
        storage_path=os.path.join(curdir, exp_output_root),
        checkpoint_config=CheckpointConfig(num_to_keep=1),
    )
    tune_config = tune.TuneConfig(num_samples=1)

    tuner = tune.Tuner(
        trainable=train_fn,
        run_config=run_config,
        tune_config=tune_config,
    )
    results = tuner.fit()
    print("\nResults saved to:", results.experiment_path)
Let's say the run is interrupted in the middle, and I restore it using the code below, which also tries to load the JSON file from disk before starting the loop. But this JSON file is not present, because session.get_trial_dir() now points to a new location:
import os
import time
from ray import tune
from ray.air import session, RunConfig, CheckpointConfig
import ray
from ray.air import session
import json
# Working directory at launch; base of the experiment path that is restored below.
curdir = os.getcwd()
# Sub-directory (under curdir) that holds all experiment outputs.
exp_output_root = "ray_outputs"
# Experiment name; last component of the restore path.
exp_name = "demo_exp"
def train_fn(config):
    """Demo trainable that reloads earlier results before continuing.

    On start, ``exps_outputs.json`` is read from the directory returned by
    ``session.get_trial_dir()`` if it exists; otherwise the trainable starts
    with an empty result set instead of crashing. Each iteration reports
    ``{"iter": i}`` and records a fixed-accuracy entry, and every 1000th
    iteration the accumulated entries are written back to the same file.

    Args:
        config: Configuration passed in by Ray Tune; not read by this function.
    """
    outdir = session.get_trial_dir()
    print("======> Trial dir seen inside trainable:", outdir)
    results_path = os.path.join(outdir, "exps_outputs.json")

    # NOTE(review): the trial dir reportedly resolves to a fresh
    # /tmp/ray/session_* location after Tuner.restore on newer Ray versions,
    # so a file written by the previous run may not be found here. State that
    # must survive a restore presumably needs to go through Ray's checkpoint
    # API (report a checkpoint, read it back on restore) - confirm against the
    # installed Ray version.
    try:
        with open(results_path, "r") as f:
            loaded = json.load(f)
    except FileNotFoundError:
        print(f"no previous results file at {results_path}; starting fresh")
        results = {}
    else:
        # JSON object keys are always strings. Convert them back to int so the
        # loop below overwrites existing entries instead of adding a second,
        # int-keyed copy (which would be dumped as duplicate keys).
        results = {int(key): value for key, value in loaded.items()}

    for i in range(100000):
        session.report({"iter": i})
        results[i] = {"accuracy": 100, "iter": i}
        if i % 1000 == 0:
            # Write to a temp file and swap it in, so an interruption during
            # the dump cannot leave a truncated JSON file for the next restore.
            tmp_path = results_path + ".tmp"
            with open(tmp_path, "w") as f:
                json.dump(results, f, indent=4)
            os.replace(tmp_path, results_path)
            print(f"saving the results file to {outdir} at iter: {i}")
            time.sleep(2)
    print("======> Trial dir seen inside trainable:", session.get_trial_dir())
if True:
    # Second run: resume the experiment created by the first run.
    # The path must be the exact experiment directory:
    # <curdir>/<exp_output_root>/<exp_name>.
    restore_path = os.path.join(curdir, exp_output_root, exp_name)
    tuner2 = tune.Tuner.restore(
        trainable=train_fn,
        path=restore_path,
        resume_errored=True,
    )
    results2 = tuner2.fit()
But I am now unable to load exps_outputs.json, which should be present in the trial directory; instead, the trial directory now points to /tmp/ray/session_xxxnewtime/.