Restoring an agent with Ray Tune

What happened + What you expected to happen

I trained an RL agent using Ray Tune on my local PC, and all the results, including checkpoints, are saved in a directory named “RL_LTR”, as shown below:
[Screenshot: contents of the RL_LTR directory]

Next, I uploaded this directory to Colab and attempted to restore the agent using the tune.Tuner.restore method. However, when I tried to test my agent, I encountered the following error message:
Path not found: (<pyarrow._fs.LocalFileSystem object at 0x7f22d0608e30>, /mainfs/scratch/sb5e19/RL_LTR/TD3_TRAIN/TD3_TRAIN/TD3_RankingEnv_ce7f8b78_1_AlgorithmConfig__prior_exploration_config=None,disable_action_flattening=False,disable_execution_plan_ap_2023-10-19_18-22-51/checkpoint_000019)
It appears that I cannot access the checkpoints from a system other than the one where the model was trained.
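
For illustration, here is a minimal sketch of the kind of restore I attempt on Colab (the /content path is an assumption for the example, not my exact script):

from ray import tune

# The RL_LTR directory uploaded from the local PC (hypothetical location).
experiment_path = "/content/RL_LTR/TD3_TRAIN/TD3_TRAIN"

restored = tune.Tuner.restore(experiment_path, trainable="TD3")
best_ckpt = restored.get_results().get_best_result().checkpoint
# best_ckpt still resolves to the original /mainfs/scratch/... path recorded on
# the training machine, which does not exist here, hence the "Path not found" error.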

Reproduction script

from __future__ import annotations  # for "str | Any" hints on Python 3.9

from pathlib import Path
from typing import Any

from ray import tune
from ray.rllib.algorithms.algorithm import Algorithm
from ray.train import CheckpointConfig, FailureConfig, RunConfig  # Ray >= 2.7 import path
from ray.tune import TuneConfig
from ray.tune.registry import register_env
from ray.tune.search import ConcurrencyLimiter


class DRLlibv2:
    def __init__(
        self,
        trainable: str | Any,
        params: dict,
        train_env=None,
        run_name: str = "tune_run",
        local_dir: str = "tune_results",
        search_alg=None,
        concurrent_trials: int = 0,
        num_samples: int = 0,
        scheduler_=None,
        # num_cpus: float | int = 2,
        dataframe_save: str = "tune.csv",
        metric: str = "episode_reward_mean",
        mode: str | list[str] = "max",
        max_failures: int = 0,
        training_iterations: int = 100,
        checkpoint_num_to_keep: None | int = None,
        checkpoint_freq: int = 0,
        reuse_actors: bool = True
    ):
        self.params = params

        # if train_env is not None:
        #     register_env(self.params['env'], lambda env_config: train_env(env_config))


        self.train_env = train_env
        self.run_name = run_name
        self.local_dir = local_dir
        self.search_alg = search_alg
        if concurrent_trials != 0:
            self.search_alg = ConcurrencyLimiter(
                self.search_alg, max_concurrent=concurrent_trials
            )
        self.scheduler_ = scheduler_
        self.num_samples = num_samples
        self.trainable = trainable
        if isinstance(self.trainable, str):
            self.trainable = self.trainable.upper()
        # self.num_cpus = num_cpus
        self.dataframe_save = dataframe_save
        self.metric = metric
        self.mode = mode
        self.max_failures = max_failures
        self.training_iterations = training_iterations
        self.checkpoint_freq = checkpoint_freq
        self.checkpoint_num_to_keep = checkpoint_num_to_keep
        self.reuse_actors = reuse_actors

    def train_tune_model(self):
        """
        Tuning and training the model
        Returns the results object
        """
        # if ray.is_initialized():
        #   ray.shutdown()

        # ray.init(num_cpus=self.num_cpus, num_gpus=self.params['num_gpus'], ignore_reinit_error=True)

        if self.train_env is not None:
            register_env(self.params['env'], lambda env_config: self.train_env(env_config))


        tuner = tune.Tuner(
            self.trainable,
            param_space=self.params,
            tune_config=TuneConfig(
                search_alg=self.search_alg,
                scheduler=self.scheduler_,
                num_samples=self.num_samples,
                # metric=self.metric,
                # mode=self.mode,
                **({'metric': self.metric, 'mode': self.mode} if self.scheduler_ is None else {}),
                reuse_actors=self.reuse_actors,

            ),
            run_config=RunConfig(
                name=self.run_name,
                storage_path=self.local_dir,
                failure_config=FailureConfig(
                    max_failures=self.max_failures, fail_fast=False
                ),
                stop={"training_iteration": self.training_iterations},
                checkpoint_config=CheckpointConfig(
                    num_to_keep=self.checkpoint_num_to_keep,
                    checkpoint_score_attribute=self.metric,
                    checkpoint_score_order=self.mode,
                    checkpoint_frequency=self.checkpoint_freq,
                    checkpoint_at_end=True,
                ),
                verbose=3,  # Verbosity mode: 0 = silent, 1 = default, 2 = verbose, 3 = detailed
            ),
        )

        self.results = tuner.fit()
        if self.search_alg is not None:
            self.search_alg.save_to_dir(self.local_dir)
        # ray.shutdown()
        return self.results

    def infer_results(self, to_dataframe: str = None, mode: str = "a"):
        """
        Get tune results in a dataframe and best results object
        """
        results_df = self.results.get_dataframe()

        if to_dataframe is None:
            to_dataframe = self.dataframe_save

        results_df.to_csv(to_dataframe, mode=mode)

        best_result = self.results.get_best_result()
        # best_result = self.results.get_best_result()
        # best_metric = best_result.metrics
        # best_checkpoint = best_result.checkpoint
        # best_trial_dir = best_result.log_dir
        # results_df = self.results.get_dataframe()

        return results_df, best_result

    def restore_agent(
        self,
        checkpoint_path: str = "",
        restore_search: bool = False,
        resume_unfinished: bool = True,
        resume_errored: bool = False,
        restart_errored: bool = False,
    ):
        """
        Restore errored or stopped trials
        """
        # if restore_search:
        # self.search_alg = self.search_alg.restore_from_dir(self.local_dir)
        if checkpoint_path == "":
            checkpoint_path = self.results.get_best_result().checkpoint._local_path

        restored_agent = tune.Tuner.restore(
            checkpoint_path,
            trainable=self.trainable,
            param_space=self.params,
            restart_errored=restart_errored,
            resume_unfinished=resume_unfinished,
            resume_errored=resume_errored,
        )
        print(restored_agent)
        self.results = restored_agent.get_results()

        if self.search_alg is not None:
            self.search_alg.save_to_dir(self.local_dir)
        return self.results

    def get_test_agent(self, test_env_name: str = None, test_env=None, checkpoint=None):
        """
        Get test agent
        """
        # if test_env is not None:
        #     register_env(test_env_name, lambda config: [test_env])

        if checkpoint is None:
            checkpoint = self.results.get_best_result().checkpoint

        testing_agent = Algorithm.from_checkpoint(checkpoint)
        # testing_agent.config['env'] = test_env_name

        return testing_agent


local_dir = Path.cwd() / "TD3_TRAIN"

drl_agent = DRLlibv2(
    trainable="TD3",
    # train_env = RankingEnv,
    # num_cpus = num_cpus,
    run_name = "TD3_TRAIN",
    local_dir = local_dir,
    params = train_config.to_dict(),
    num_samples = 1,  # number of hyperparameter-config samples to run
    # training_iterations=5,
    checkpoint_freq=5,
    # scheduler_=scheduler_,
    search_alg=search_alg,
    metric = "episode_reward_mean",
    mode = "max"
    # callbacks=[wandb_callback]
)
results = drl_agent.restore_agent((local_dir/"TD3_TRAIN").as_posix())
test_agent = drl_agent.get_test_agent()

Hi @fardinabbasi,

This is a known issue that’s being tracked here: [Train/Tune] Restore an experiment from a different machine/path · Issue #40585 · ray-project/ray · GitHub

We're targeting a fix for Ray 2.9, but I'll keep this thread updated if a nightly build becomes available earlier for you to use. Thanks for raising this issue.


This should be fixed in the nightly build by [tune/train] Restore Tuner and results properly from moved storage path by justinvyu · Pull Request #40647 · ray-project/ray · GitHub.

Let me know if you get the chance to try it out!

Hey @justinvyu,
Thanks for letting me know!

After installing the nightly build, I've encountered new issues, and I'm unsure of their origin. I'm running my code on Slurm with this template.

My system is Linux (x86_64), and py39 is a Conda environment with Python 3.9.
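
For context, the template follows the usual Ray-on-Slurm pattern: the batch script starts a head node with ray start --head, and the Python driver then connects to that running cluster. A minimal sketch of that driver-side step, assuming the common address="auto" form (my actual script may instead pass the explicit ip:port shown in the log):

import ray

# Attach the driver to the cluster started by `ray start --head` in the batch script.
# In my run, this init step is where the "GCS has started but no raylets have
# registered yet" error in the log below surfaces.
ray.init(address="auto", ignore_reinit_error=True)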

Error Log

/tmp/slurmd/job4985450/slurm_script: line 22: py39: command not found
IP Head: 10.13.33.111:6379
STARTING HEAD at red365
2023-11-10 11:33:59,339	ERROR services.py:1329 -- Failed to start the dashboard , return code 1
2023-11-10 11:33:59,340	ERROR services.py:1354 -- Error should be written to 'dashboard.log' or 'dashboard.err'. We are printing the last 20 lines for you. See 'https://docs.ray.io/en/master/ray-observability/ray-logging.html#logging-directory-structure' to find where the log file is.
2023-11-10 11:33:59,340	ERROR services.py:1398 -- 
The last 20 lines of /tmp/ray/session_2023-11-10_11-33-56_646084_2327/logs/dashboard.log (it contains the error message from the dashboard): 
  File "/mainfs/scratch/sb5e19/.conda/envs/py39/lib/python3.9/site-packages/ray/dashboard/dashboard.py", line 75, in run
    await self.dashboard_head.run()
  File "/mainfs/scratch/sb5e19/.conda/envs/py39/lib/python3.9/site-packages/ray/dashboard/head.py", line 325, in run
    modules = self._load_modules(self._modules_to_load)
  File "/mainfs/scratch/sb5e19/.conda/envs/py39/lib/python3.9/site-packages/ray/dashboard/head.py", line 219, in _load_modules
    head_cls_list = dashboard_utils.get_all_modules(DashboardHeadModule)
  File "/mainfs/scratch/sb5e19/.conda/envs/py39/lib/python3.9/site-packages/ray/dashboard/utils.py", line 121, in get_all_modules
    importlib.import_module(name)
  File "/mainfs/scratch/sb5e19/.conda/envs/py39/lib/python3.9/importlib/__init__.py", line 127, in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
  File "<frozen importlib._bootstrap>", line 1030, in _gcd_import
  File "<frozen importlib._bootstrap>", line 1007, in _find_and_load
  File "<frozen importlib._bootstrap>", line 986, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 680, in _load_unlocked
  File "<frozen importlib._bootstrap_external>", line 850, in exec_module
  File "<frozen importlib._bootstrap>", line 228, in _call_with_frames_removed
  File "/mainfs/scratch/sb5e19/.conda/envs/py39/lib/python3.9/site-packages/ray/dashboard/modules/log/log_manager.py", line 26, in <module>
    class ResolvedStreamFileInfo(BaseModel):
TypeError: NoneType takes no arguments
2023-11-10 11:33:56,642	INFO usage_lib.py:416 -- Usage stats collection is enabled by default without user confirmation because this terminal is detected to be non-interactive. To disable this, add `--disable-usage-stats` to the command that starts the cluster, or run the following command: `ray disable-usage-stats` before starting the cluster. See https://docs.ray.io/en/master/cluster/usage-stats.html for more details.
2023-11-10 11:33:56,644	INFO scripts.py:744 -- Local node IP: 10.13.33.111
2023-11-10 11:34:00,636	SUCC scripts.py:781 -- --------------------
2023-11-10 11:34:00,636	SUCC scripts.py:782 -- Ray runtime started.
2023-11-10 11:34:00,637	SUCC scripts.py:783 -- --------------------
2023-11-10 11:34:00,637	INFO scripts.py:785 -- Next steps
2023-11-10 11:34:00,637	INFO scripts.py:788 -- To add another node to this Ray cluster, run
2023-11-10 11:34:00,637	INFO scripts.py:791 --   ray start --address='10.13.33.111:6379'
2023-11-10 11:34:00,637	INFO scripts.py:800 -- To connect to this Ray cluster:
2023-11-10 11:34:00,637	INFO scripts.py:802 -- import ray
2023-11-10 11:34:00,637	INFO scripts.py:803 -- ray.init(_node_ip_address='10.13.33.111')
2023-11-10 11:34:00,637	INFO scripts.py:834 -- To terminate the Ray runtime, run
2023-11-10 11:34:00,637	INFO scripts.py:835 --   ray stop
2023-11-10 11:34:00,637	INFO scripts.py:838 -- To view the status of the cluster, use
2023-11-10 11:34:00,637	INFO scripts.py:839 --   ray status
2023-11-10 11:34:00,637	INFO scripts.py:952 -- --block
2023-11-10 11:34:00,637	INFO scripts.py:953 -- This command will now block forever until terminated by a signal.
2023-11-10 11:34:00,637	INFO scripts.py:956 -- Running subprocesses are monitored and a message will be printed if any of them terminate unexpectedly. Subprocesses exit with SIGTERM will be treated as graceful, thus NOT reported.
2023-11-10 11:35:04,955	INFO worker.py:1502 -- Connecting to existing Ray cluster at address: 10.13.33.111:6379...
[2023-11-10 11:35:04,964 W 2716 2716] global_state_accessor.cc:429: Some processes that the driver needs to connect to have not registered with GCS, so retrying. Have you run 'ray start' on this node?
[2023-11-10 11:35:05,964 W 2716 2716] global_state_accessor.cc:429: Some processes that the driver needs to connect to have not registered with GCS, so retrying. Have you run 'ray start' on this node?
[2023-11-10 11:35:06,966 W 2716 2716] global_state_accessor.cc:429: Some processes that the driver needs to connect to have not registered with GCS, so retrying. Have you run 'ray start' on this node?
[2023-11-10 11:35:07,967 W 2716 2716] global_state_accessor.cc:429: Some processes that the driver needs to connect to have not registered with GCS, so retrying. Have you run 'ray start' on this node?
[2023-11-10 11:35:08,968 W 2716 2716] global_state_accessor.cc:429: Some processes that the driver needs to connect to have not registered with GCS, so retrying. Have you run 'ray start' on this node?
[2023-11-10 11:35:09,969 W 2716 2716] global_state_accessor.cc:429: Some processes that the driver needs to connect to have not registered with GCS, so retrying. Have you run 'ray start' on this node?
[2023-11-10 11:35:10,970 W 2716 2716] global_state_accessor.cc:429: Some processes that the driver needs to connect to have not registered with GCS, so retrying. Have you run 'ray start' on this node?
[2023-11-10 11:35:11,971 W 2716 2716] global_state_accessor.cc:429: Some processes that the driver needs to connect to have not registered with GCS, so retrying. Have you run 'ray start' on this node?
[2023-11-10 11:35:12,971 W 2716 2716] global_state_accessor.cc:429: Some processes that the driver needs to connect to have not registered with GCS, so retrying. Have you run 'ray start' on this node?
[2023-11-10 11:35:13,972 W 2716 2716] global_state_accessor.cc:429: Some processes that the driver needs to connect to have not registered with GCS, so retrying. Have you run 'ray start' on this node?
2023-11-10 11:35:14,972	INFO worker.py:1651 -- Failed to connect to the default Ray cluster address at 10.13.33.111:6379. This is most likely due to a previous Ray instance that has since crashed. To reset the default address to connect to, run `ray stop` or restart Ray with `ray start`.
num_gpus: 0, num_cpus: 40
Traceback (most recent call last):
  File "/mainfs/scratch/sb5e19/.conda/envs/py39/lib/python3.9/site-packages/ray/_private/worker.py", line 1642, in init
    _global_node = ray._private.node.Node(
  File "/mainfs/scratch/sb5e19/.conda/envs/py39/lib/python3.9/site-packages/ray/_private/node.py", line 254, in __init__
    node_info = ray._private.services.get_node_to_connect_for_driver(
  File "/mainfs/scratch/sb5e19/.conda/envs/py39/lib/python3.9/site-packages/ray/_private/services.py", line 475, in get_node_to_connect_for_driver
    return global_state.get_node_to_connect_for_driver(node_ip_address)
  File "/mainfs/scratch/sb5e19/.conda/envs/py39/lib/python3.9/site-packages/ray/_private/state.py", line 772, in get_node_to_connect_for_driver
    return self.global_state_accessor.get_node_to_connect_for_driver(
  File "python/ray/includes/global_state_accessor.pxi", line 228, in ray._raylet.GlobalStateAccessor.get_node_to_connect_for_driver
RuntimeError: b'GCS has started but no raylets have registered yet.'