Ray checkpointing does not save the policy_spec configuration in the checkpoint state

How severely does this issue affect your experience of using Ray?

  • High: It blocks me from completing my task and I cannot work around it.

Hello, I have a problem that blocks my work and that I cannot solve on my own. I tuned the hyperparameters of the QMix algorithm with Tune and now want to use the trained policy for evaluation, but I cannot restore the policy from its checkpoint.
My tuning code is as follows:

import logging
import gymnasium as gym

import ray
from ray import air, tune
from ray.tune import register_env
from ray.rllib.algorithms.qmix import QMixConfig, QMix
from ray.rllib.utils.replay_buffers.replay_buffer import StorageUnit
from ray.rllib.policy.policy import PolicySpec
from ray.rllib.env.multi_agent_env import ENV_STATE

from tempfile import TemporaryDirectory
from GENERAL_QMIX_ep_gym_env import EnergyPlusEnvWithGroupedAgents_v0

logger = logging.getLogger(__name__)

#path = "/home/german"
path = 'E:/Usuario/Cliope'
#path = 'C:/Users/grhen'

EPW_PATH = path+'/Documents/GitHub/EP_RLlib/EP_Wheater_Configuration/Mendoza_Obs_-hour-historico1.epw'
IDF_PATH = path+'/Documents/GitHub/EP_RLlib/EP_IDF_Configuration/model_1.epJSON'
CLIMATIC_STADS_PATH = path+'/Documents/GitHub/EP_RLlib/EP_Wheater_Configuration'
    
tune_runner = True

ray.init()

register_env(
    "EPEnv",
    lambda args: EnergyPlusEnvWithGroupedAgents_v0(args)
)

def policy_mapping_fn(agent_id, episode, worker, **kwargs):
    return "default_policy"
    
algo = QMixConfig().training(
        train_batch_size = 30, #if not tune_runner else tune.grid_search([30, 80, 100]),
        gamma = 0.7, #if not tune_runner else tune.grid_search([0.7, 0.9, 0.99]),
        lr = 0.1, #if not tune_runner else tune.grid_search([0.001, 0.01, 0.1]),
        # QMixConfig-specific settings
        mixer = 'qmix',
        # Mixing network. Either "qmix", "vdn", or None.
        mixing_embed_dim = 32,
        # Size of the mixing network embedding.
        double_q = True,
        # Whether to use double Q-learning.
        target_network_update_freq = 100, #if not tune_runner else tune.grid_search([100, 300, 1000]),
        # Update the target network every target_network_update_freq sample steps.
        replay_buffer_config = {
            "type": "MultiAgentReplayBuffer",
            "capacity": 70000,
            "storage_unit": StorageUnit.FRAGMENTS,
        },
        #optim_alpha = 0.99,
        # RMSProp alpha.
        #optim_eps = 0.00001,
        # RMSProp epsilon.
        #grad_clip = None,
        # If not None, clip gradients during optimization at this value.
    ).environment(
        env="EPEnv",
        env_config={
            "csv": False,
            "epw": EPW_PATH,
            "output": TemporaryDirectory("output","QMIX_",'E:/Resultados_RLforEP').name,
            "idf": IDF_PATH,
            "idf_folderpath": path+"/Documents/GitHub/EP_RLlib/EP_IDF_Configuration",
            'idf_output_folder': path+"/Documents/models",
            'beta': 0.5, # Parameter to weight the user's preferences (value between 0 and 1)
            'E_max': 2.5/6, # in epJSON file: maximum_total_cooling_capacity/1000 / number_of_timesteps_per_hour
            'climatic_stads': CLIMATIC_STADS_PATH,
            'latitud':0,
            'longitud':0,
            'altitud':0,
            "separate_state_space": True,
            "one_hot_state_encoding": True,
            "episode": -1
        },
    ).framework(
        framework = 'torch',
    ).fault_tolerance(
        recreate_failed_workers = True,
        restart_failed_sub_environments=False,
    ).rollouts(
        num_rollout_workers = 0,
        create_env_on_local_worker=True,
        rollout_fragment_length = 'auto',# if not tune_runner else tune.grid_search([10, 20]),
        enable_connectors = True,
        batch_mode="truncate_episodes",
        num_envs_per_worker=1,
    ).exploration(
        explore = True,
        exploration_config = {
            "type": 'EpsilonGreedy',
            "initial_epsilon": 1.0,
            "final_epsilon": 0,
            "epsilon_timesteps": 900000
        },
    ).reporting(
        min_sample_timesteps_per_iteration = 1440,
    ).debugging(
        log_level = "INFO",
        seed=7,
    ).resources(
        num_gpus = 0,
    ).checkpointing(
        export_native_model_files = True
    ).rl_module(
        _enable_rl_module_api=False
    )

if tune_runner:
    tune.Tuner( # 3. train it,
        "QMIX",
        run_config=air.RunConfig(
            name='training_best_QMIX_4',
            local_dir='E:/ray_results',
            storage_path='E:/ray_results',
            stop={"episodes_total": 250},
            log_to_file=True,
            
            checkpoint_config=air.CheckpointConfig(
                checkpoint_at_end=True,
                checkpoint_frequency=2,
                num_to_keep=20
            ),
            failure_config=air.FailureConfig(
                # Tries to recover a run up to this many times.
                max_failures=10
            )
        ),
        param_space=algo.to_dict(),
    ).fit()
    
else:
    algo = algo.build()  # 2. build the algorithm,
    
    for i in range(1500):
        print("Train N°" + str(i + 1))
        print(algo.train()) # 3. train it,
        if i % 10 == 0:
            print(algo.save(checkpoint_dir='E:/ray_results/save_algo'))
            algo.save_checkpoint(checkpoint_dir='E:/ray_results/save_policy')
            print(algo.get_policy())
            print(algo.get_policy().get_weights())
            print(algo.get_weights())

    for _ in range(40):
        print(algo.evaluate())  # 4. and evaluate it.

ray.shutdown()

With both configurations (tune_runner = True and tune_runner = False) the runs execute correctly, and I can print the policies and the weights of the NN in the terminal. However, when I try to restore the policy from the checkpoint with the script below, it fails:

import ray
from ray.rllib.policy.policy import Policy

checkpoint_path = "E:/ray_results/training_best_QMIX_4/QMIX_EPEnv_1fd3f_00000_0_2023-10-04_15-52-10/checkpoint_000001"
#checkpoint_path = "C:/Users/grhen/ray_results/ajuste_modelo_general_QMIX_5/QMIX_EPEnv_24cb3_00005_5_gamma=0.9900,lr=0.0100,mixing_embed_dim=32_2023-09-17_09-38-46/checkpoint_001400"
# Use the `from_checkpoint` utility of the Policy class:
my_restored_policy = Policy.from_checkpoint(checkpoint_path)
print(my_restored_policy)
#print(algo.evaluate())  # 4. and evaluate it.
ray.shutdown()

Running this script raises the following error:

Traceback (most recent call last):
  File "e:\Usuario\Cliope\Documents\GitHub\EP_RLlib\GENERAL_QMIX_init_evaluation.py", line 7, in <module>
    my_restored_policy = Policy.from_checkpoint(checkpoint_path)
  File "C:\Users\Usuario\anaconda3\envs\qmix_env\lib\site-packages\ray\rllib\policy\policy.py", line 338, in from_checkpoint
    policies[policy_id] = Policy.from_state(policy_state)
  File "C:\Users\Usuario\anaconda3\envs\qmix_env\lib\site-packages\ray\rllib\policy\policy.py", line 365, in from_state
    raise ValueError(
ValueError: No `policy_spec` key was found in given `state`! Cannot create new Policy.
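
I have not yet verified whether restoring the entire Algorithm instead of only the Policy works around this; a minimal sketch of that idea (assuming Algorithm.from_checkpoint can read this checkpoint directory and that the custom environment is registered again in the same session) would be:

import ray
from ray.tune import register_env
from ray.rllib.algorithms.algorithm import Algorithm
from GENERAL_QMIX_ep_gym_env import EnergyPlusEnvWithGroupedAgents_v0

ray.init()

# The custom env has to be registered again in this session, otherwise
# Algorithm.from_checkpoint() cannot rebuild "EPEnv".
register_env("EPEnv", lambda args: EnergyPlusEnvWithGroupedAgents_v0(args))

checkpoint_path = "E:/ray_results/training_best_QMIX_4/QMIX_EPEnv_1fd3f_00000_0_2023-10-04_15-52-10/checkpoint_000001"

# Unverified: load the whole Algorithm instead of only the Policy,
# then pull the grouped default policy out of it.
restored_algo = Algorithm.from_checkpoint(checkpoint_path)
restored_policy = restored_algo.get_policy("default_policy")
print(restored_policy.get_weights())

ray.shutdown()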

I also tried declaring the policy explicitly through the algorithm's multi_agent() method (sketched below), but the error persists.
Could someone tell me whether I need to change something in my configuration, or how to fix this problem? Thank you very much.
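
This is roughly what the multi_agent() attempt looked like (a sketch only; the PolicySpec fields are left as None so RLlib infers the grouped spaces from the env):

from ray.rllib.policy.policy import PolicySpec

algo = algo.multi_agent(
    # Declare the single grouped policy explicitly, hoping that a policy_spec
    # ends up in the checkpoint state.
    policies={
        "default_policy": PolicySpec(
            policy_class=None,       # use the algorithm's default QMix policy
            observation_space=None,  # inferred from the grouped env when None
            action_space=None,
            config={},
        )
    },
    policy_mapping_fn=policy_mapping_fn,
    policies_to_train=["default_policy"],
)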