I am trying to use PPO with an environment that uses action masking, but I have a problem with the configuration:
import gym
from gym.spaces import Box, Dict, Discrete
import numpy as np
import random
from ray import tune
from ray.tune.registry import register_env
class ParametricActionsCartPoleNoEmbeddings(gym.Env):
    """CartPole wrapper exposing a larger discrete action space with a mask.

    Only two of the ``max_avail_actions`` discrete actions are valid; which
    two is drawn at random when the env is constructed and stays fixed for
    the lifetime of the instance.

    Every observation is a dict with two entries:

    - ``"valid_avail_actions_mask"``: float32 vector holding 1 at the two
      valid action indices and 0 elsewhere
      (e.g., [0, 0, 1, 0, 0, 1] for 6 max avail).
    - ``"cart"``: the observation of the wrapped CartPole env.

    Action embeddings are not part of the observation; they are expected to
    be learnt by the model.
    """

    def __init__(self, max_avail_actions):
        """Build the env.

        Args:
            max_avail_actions: Size of the discrete action space. Two
                distinct indices are sampled from it, so it must be >= 2
                (``random.sample`` raises ``ValueError`` otherwise).
        """
        # Randomly pick the two action indices that map to "left" / "right".
        chosen = random.sample(range(max_avail_actions), 2)
        self.left_idx, self.right_idx = chosen

        # 1.0 marks a valid action, 0.0 an invalid one.
        mask = np.zeros(max_avail_actions, dtype=np.float32)
        mask[self.left_idx] = 1
        mask[self.right_idx] = 1
        self.valid_avail_actions_mask = mask

        self.action_space = Discrete(max_avail_actions)
        self.wrapped = gym.make("CartPole-v0")

        mask_space = Box(0, 1, shape=(max_avail_actions,))
        self.observation_space = Dict(
            {
                "valid_avail_actions_mask": mask_space,
                "cart": self.wrapped.observation_space,
            }
        )
        self._skip_env_checking = True

    def reset(self):
        """Reset the wrapped env and return the initial dict observation."""
        return self._make_obs(self.wrapped.reset())

    def step(self, action):
        """Translate ``action`` to a CartPole action and step the wrapped env.

        Args:
            action: Index into the discrete action space; must equal
                ``left_idx`` or ``right_idx``.

        Returns:
            Tuple ``(obs, rew, done, info)`` where ``obs`` is the dict
            observation and the rest is passed through from the wrapped env.

        Raises:
            ValueError: If ``action`` is neither of the two valid indices.
        """
        chose_left = action == self.left_idx
        # Guard clause: anything other than the two valid indices is an error.
        if not chose_left and not (action == self.right_idx):
            raise ValueError(
                "Chosen action was not one of the non-zero action embeddings",
                action,
                self.valid_avail_actions_mask,
                self.left_idx,
                self.right_idx,
            )

        actual_action = 0 if chose_left else 1
        cart_obs, rew, done, info = self.wrapped.step(actual_action)
        return self._make_obs(cart_obs), rew, done, info

    def _make_obs(self, cart_obs):
        """Combine the fixed action mask with a raw CartPole observation."""
        return {
            "valid_avail_actions_mask": self.valid_avail_actions_mask,
            "cart": cart_obs,
        }
if __name__ == "__main__":
    # Imported locally because they are only needed when this file is run as
    # a training script.
    # NOTE(review): this module path matches RLlib releases that still ship
    # `PPOTrainer`; confirm it against the installed Ray version.
    from ray.rllib.examples.models.parametric_actions_model import (
        ParametricActionsModelThatLearnsEmbeddings,
    )
    from ray.rllib.models import ModelCatalog

    def env_creator(env_config=None):
        """Return a new env instance with 6 available actions.

        Args:
            env_config: Config passed in by RLlib; unused here. Defaults to
                ``None`` rather than a shared mutable ``{}``.
        """
        return ParametricActionsCartPoleNoEmbeddings(max_avail_actions=6)

    register_env("my_env", env_creator)

    # The default PPO model never reads "valid_avail_actions_mask", so it can
    # sample an invalid action and make the env raise ValueError. This custom
    # model reads the mask from the observation dict and is meant to keep the
    # policy from selecting masked-out actions.
    # NOTE(review): this is the TensorFlow variant; for PyTorch, presumably
    # register `TorchParametricActionsModelThatLearnsEmbeddings` instead and
    # add `"framework": "torch"` to the config — confirm.
    ModelCatalog.register_custom_model(
        "pa_model", ParametricActionsModelThatLearnsEmbeddings
    )

    tune.run(
        "PPO",
        # algorithm specific configuration
        config={
            "env": "my_env",
            # Use the mask-aware model instead of the default network.
            "model": {"custom_model": "pa_model"},
            "evaluation_interval": 2,
            "evaluation_num_episodes": 20,
        },
        local_dir="cartpole_v1",  # directory to save results
        checkpoint_freq=2,  # frequency between checkpoints
        keep_checkpoints_num=6,
    )
The error is:
(PPOTrainer pid=330560) raise ValueError(
(PPOTrainer pid=330560) ValueError: ('Chosen action was not one of the non-zero action embeddings', 5, array([0., 0., 1., 0., 1., 0.], dtype=float32), 4, 2)
Traceback (most recent call last):
Does anyone know how to change the agent configuration so that it works with an environment that uses action masking?