Tuner.restore does not load the existing files in the trial directory correctly: get_trial_dir() points to a new location when restored

KKNakka · August 18, 2025, 1:20pm

Hi, I am facing issues with Ray Tune, probably with newer versions…

Let's say I start Ray Tune with the minimal code below:

import os
import time
from ray import tune
from ray.air import session, RunConfig, CheckpointConfig
import ray 
from ray.air import session
import json 



# Working directory at the moment the script starts; the storage path below is built from it.
curdir = os.getcwd()
# Sub-directory name joined onto curdir to form RunConfig's storage_path.
exp_output_root = "ray_outputs"
# Experiment name handed to RunConfig(name=...).
exp_name = "demo_exp"


def train_fn(config):
    """Report 100000 iterations and periodically dump the results to JSON.

    Each iteration is reported through ``session.report``. On every 1000th
    iteration (starting with 0) the accumulated results are rewritten to
    ``exps_outputs.json`` inside the directory returned by
    ``session.get_trial_dir()``, followed by a two-second sleep.

    Args:
        config: Trial configuration passed in by the caller; not read here.
    """
    outdir = session.get_trial_dir()
    print("======> Trial dir seen inside trainable:", outdir)

    results_file = os.path.join(outdir, "exps_outputs.json")
    history = {}

    for i in range(100000):
        session.report({"iter": i})
        history[i] = {"accuracy": 100, "iter": i}

        # Only every 1000th iteration touches the disk.
        if i % 1000:
            continue

        with open(results_file, "w") as fh:
            json.dump(history, fh, indent=4)
        print(f"saving the results file to {outdir} at iter: {i}")
        time.sleep(2)

    # Ask the session again instead of reusing outdir.
    print("======> Trial dir seen inside trainable:", session.get_trial_dir())


if True:

    # First (fresh) run: one sample of train_fn, stored under
    # <curdir>/<exp_output_root>/ with the experiment name exp_name.
    storage_dir = os.path.join(curdir, exp_output_root)
    run_config = RunConfig(
        name=exp_name,
        storage_path=storage_dir,
        checkpoint_config=CheckpointConfig(num_to_keep=1),
    )
    tune_config = tune.TuneConfig(num_samples=1)

    tuner = tune.Tuner(
        trainable=train_fn,
        run_config=run_config,
        tune_config=tune_config,
    )
    results = tuner.fit()

    print("\nResults saved to:", results.experiment_path)

Let's say the run is interrupted in the middle and I restore it using the code below, which also loads the JSON file from disk before starting the loop. But this JSON file is not present, because session.get_trial_dir() now points to a new location:

import os
import time
from ray import tune
from ray.air import session, RunConfig, CheckpointConfig
import ray 
from ray.air import session
import json 



# Working directory at the moment the script starts; restore_path below is built from it.
curdir = os.getcwd()
# Sub-directory name joined onto curdir when building restore_path.
exp_output_root = "ray_outputs"
# Experiment name; last component of restore_path.
exp_name = "demo_exp"


def train_fn(config):
    """Reload earlier results if present, then report and log 100000 iterations.

    Looks for ``exps_outputs.json`` in the directory returned by
    ``session.get_trial_dir()``. If the file exists its contents seed the
    results dict; if it does not, the run starts from an empty dict instead
    of raising ``FileNotFoundError``. Every iteration is reported through
    ``session.report`` and, on every 1000th iteration, the results are
    rewritten to the same JSON file followed by a two-second sleep.

    Args:
        config: Trial configuration passed in by the caller; not read here.
    """
    outdir = session.get_trial_dir()
    results_path = os.path.join(outdir, "exps_outputs.json")

    print("======> Trial dir seen inside trainable:", outdir)

    # NOTE(review): files written into the trial dir are not guaranteed to be
    # there after Tuner.restore; Ray's checkpoint API is presumably the
    # supported way to carry state across a restore -- confirm against the
    # Ray Tune docs for the installed version.
    try:
        with open(results_path, "r") as f:
            loaded = json.load(f)
    except FileNotFoundError:
        # Nothing to resume from in this directory: start empty, don't crash.
        print(f"no previous results found at {results_path}; starting empty")
        results = {}
    else:
        # JSON object keys always come back as strings. Convert them to int
        # so results[i] below overwrites the reloaded entry instead of
        # creating a second, duplicate key for the same iteration.
        results = {int(key): value for key, value in loaded.items()}

    for i in range(100000):
        session.report({"iter": i})

        results[i] = {"accuracy": 100, "iter": i}

        if i % 1000 == 0:
            with open(results_path, "w") as f:
                json.dump(results, f, indent=4)
            print(f"saving the results file to {outdir} at iter: {i}")

            time.sleep(2)

    print("======> Trial dir seen inside trainable:", session.get_trial_dir())


if True:
    # Second run: restore the experiment stored at
    # <curdir>/<exp_output_root>/<exp_name> and continue it with train_fn.
    storage_dir = os.path.join(curdir, exp_output_root)
    restore_path = os.path.join(storage_dir, exp_name)

    # resume_errored=True is passed through unchanged; presumably it asks Ray
    # to also resume trials that had errored -- confirm against the Ray docs.
    tuner2 = tune.Tuner.restore(path=restore_path, trainable=train_fn, resume_errored=True)

    results2 = tuner2.fit()

But I am now unable to load exps_outputs.json, which should be present in the trial directory; the trial directory now points to /tmp/ray/session_xxxnewtime/ instead.

PhilippWillms · August 18, 2025, 3:03pm

Is there any particular reason for using the storage_path in the air.RunConfig?

If possible, try using the trial_dirname_creator option in the TuneConfig:

tune.TuneConfig(
    trial_dirname_creator=...    
)

Topic		Replies	Views
Unable to restore Ray Tune previous experiment checkpoint Ray Tune	8	1028	June 1, 2023
Ray Tune stores absolute paths in checkpoints and cannot resume if checkpoints are moved Ray Tune	7	977	March 6, 2023
Ray Tune - how to load trial results from a different location?	2	458	October 23, 2023
Using Tuner.restore in ray Checkpointing, Restoring	0	514	November 29, 2023
Tuner cannot restore the checkpoints! Ray Tune	10	955	November 20, 2023

Tuner.restore does not load the existing files in the trial directory correctly: get_trial_dir() points to a new location when restored

Related topics