I created a DRL environment whose action and observation spaces are defined by:
self.action_space = Box(low=0, high=self.T / (self.N - 1), shape=(self.N,), dtype=np.float16)
self.observation_space = Box(low=0, high=high_value, shape=(self.N,), dtype=np.float16)
When I print the spaces inside my env I get:
(RolloutWorker pid=76410) action_space: Box(0.0, 5.0, (5,), float16)
(RolloutWorker pid=76410) observation_space: Box(0.0, 6.438, (5,), float16)
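For reference, these bounds match the config I use further down (T = 20, N = 5, channel_gain_mean = 2); reproducing them by hand with the same formula as in centralized_creation below:

import numpy as np

T, N, mean = 20, 5, 2
print(T / (N - 1))                                                    # 5.0   -> action-space high
print(mean * np.sqrt(np.pi / 2) + 3 * np.sqrt(2 - np.pi / 2) * mean)  # ~6.437 -> observation-space high (6.438 in float16)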
After training for a large number of iterations with ray.tune, I load the model from the checkpoint and want to compute a single action. However, the actions I get are outside the action_space: they are much larger than they used to be during training, as if the training had been lost.
Here is the code used for that:
def ray_model_C(channel_gains, algo):
    channel_gains = np.array(channel_gains, dtype=np.float16)
    print(f"Observation (channel gains): {channel_gains}")

    policy = algo.get_policy("default_policy")
    print(f"Policy action space: {policy.action_space}")
    print(f"Policy observation space: {policy.observation_space}")
    print(f"Clipping actions: {algo.config.clip_actions}")

    action, state_out, info = algo.compute_single_action(
        observation=channel_gains,
        state=None,
        prev_action=None,
        prev_reward=None,
        info=None,
        policy_id="default_policy",
        full_fetch=True,
        explore=False,
        unsquash_action=True,
        clip_action=True,
        episode=None
    )
    print(f"Raw action: {action}")
    print(f"State out: {state_out}")
    print(f"Info: {info}")
def centralized_creation(config, checkpoint_path):
    ray.init(ignore_reinit_error=True)
    ray.tune.register_env("TDMCentralized-v1", lambda config: TDMCentralized(config))

    high_value = (config["channel_gain_mean"] * np.sqrt(np.pi / 2)) + 3 * (np.sqrt(2 - np.pi / 2) * config["channel_gain_mean"])

    myModel = PPOConfig()
    myModel = myModel.framework("torch")
    myModel = myModel.training(
        grad_clip=0.5,
        model={
            "fcnet_hiddens": [64, 64],
            "fcnet_activation": "relu"
        },
    )
    myModel = myModel.exploration(explore=False)
    myModel = myModel.environment(
        env="TDMCentralized-v1",
        env_config=config,
        action_space=Box(low=0, high=config["T"] / (config["N"] - 1), shape=(config["N"],), dtype=np.float16),
        observation_space=Box(low=0, high=high_value, shape=(config["N"],), dtype=np.float16)
    )
    myModel.clip_actions = True  # Enable action clipping
    # myModel.config.clip_actions = True

    algo = myModel.build()
    algo.restore(os.path.expanduser(checkpoint_path))

    policy = algo.get_policy("default_policy")
    print(policy.action_space)
    return algo
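# Side note: above I set clip_actions by assigning the attribute directly.
# My understanding (untested assumption, not verified) is that it could also be
# passed through the environment() call, e.g.
#   myModel = myModel.environment(env="TDMCentralized-v1", env_config=config,
#                                 clip_actions=True, normalize_actions=True)
# but I have not checked whether that behaves any differently here.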
checkpoint_C = "~/ray_results/TDMCentralized-vf/488bd_00000_0_2024-08-13_18-01-21/checkpoint_000004"

config = {
    "N": 5,
    "SNR_db": 25,
    "T": 20,
    "R_threshold": 8,
    "R_factor": 50,
    "time_factor": 100,
    "max_steps": 100,
    "channel_gain_mean": 2
}

algo = centralized_creation(config, checkpoint_C)
channel_gains = np.random.rayleigh(config["channel_gain_mean"], config["N"])
ray_model_C(channel_gains, algo)
Here is the output:
Box(0.0, 5.0, (5,), float16)
Observation (channel gains): [3.44 5.727 5.2 1.207 3.389]
Policy action space: Box(0.0, 5.0, (5,), float16)
Policy observation space: Box(0.0, 6.438, (5,), float16)
Clipping actions: True
Raw action: [24.664581 23.697128 21.950151 33.310787 25.196285]
State out: []
Info: {'vf_preds': -91.99593, 'action_dist_inputs': array([24.664581 , 23.697128 , 21.950151 , 33.310787 , 25.196285 ,
-8.420193 , -6.4748316, -9.53656 , -7.5164757, -7.0803776],
dtype=float32), 'action_prob': 1.0, 'action_logp': 0.0}
(RolloutWorker pid=76410) action_space: Box(0.0, 5.0, (5,), float16)
(RolloutWorker pid=76410) observation_space: Box(0.0, 6.438, (5,), float16)
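To make the "outside the action space" point concrete, here is a quick check against the Box bounds (just a sketch; raw_action is copied from the output above):

raw_action = np.array([24.664581, 23.697128, 21.950151, 33.310787, 25.196285], dtype=np.float16)
box = Box(low=0, high=config["T"] / (config["N"] - 1), shape=(config["N"],), dtype=np.float16)
print(box.contains(raw_action))                # False: every entry is above the high bound of 5.0
print(np.clip(raw_action, box.low, box.high))  # clipping would collapse everything to 5.0

I also notice that the first five entries of action_dist_inputs are exactly the returned action, which (if I understand the Gaussian output correctly) looks like the raw distribution mean is being returned without being mapped back into the Box.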
I am running this on Windows 10 under WSL2 (Ubuntu 22.04).
If anything else is needed, feel free to ask.
Thanks.