I want to solve a problem that requires action shielding and has a mixed continuous–discrete action space. I found code that can mask discrete actions: it works by adding -inf to the logits of the masked actions in the action distribution.
However, I don't know how to mask continuous actions — the method above doesn't work for them. Maybe it's easy; could I get some effective suggestions here? My English is not very good, so please forgive me if anything reads awkwardly.
Below is an example environment that requires action shielding and has a mixed continuous and discrete action space.
Can I use PPO to solve this problem?
import random
import gym
import gym.spaces
import numpy as np
import traceback
import pprint
class GridEnv1(gym.Env):
    """A 1-D corridor task with a mixed continuous/discrete action space.

    The agent moves along a line segment. It starts at position 0 and
    the episode alternates between two action modes each step:

    * discrete mode (`isOdd == 1`): ``action['odd']`` in {0, 1} moves the
      agent -1 or +1 unit;
    * continuous mode (`isOdd == 0`): ``action['even']`` is a Box(-10, 10)
      value added directly to the position.

    Rewards: landing in [9, 10] wins (+100 and done). Leaving [0, 10] or
    exceeding 10 steps ends the episode with the step cost plus a -100
    penalty (net -101 on that step).

    NOTE(review): the dict keys 'even'/'odd' look swapped relative to the
    `isOdd` flag and the description above — confirm the intended pairing.
    The observation key 'postion' is a (misspelled) part of the public
    interface, so it is kept as-is.
    """

    def __init__(self, env_config=None):
        # `isOdd` selects which sub-action is consumed; reset() re-seeds it.
        self.isOdd = 0
        self.action_space = gym.spaces.Dict({
            'even': gym.spaces.Box(-10, 10, shape=(1,)),
            'odd': gym.spaces.Discrete(2),
        })
        # `action_mask` exposes 4 flags so a shield/policy can mask the
        # currently invalid half of the hybrid action.
        self.observation_space = gym.spaces.Dict({
            'postion': gym.spaces.Box(-10, 20, shape=(1,)),
            'action_mask': gym.spaces.Box(0, 1, shape=(4,)),
        })
        self.reset()

    def reset(self):
        """Start a new episode at position 0 in discrete mode.

        :return: the initial observation dict
        """
        self.observation = {
            'postion': [0],
            'action_mask': np.array([1, 1, 0, 0]),
        }
        self.done = False
        self.step_num = 0
        # 1 -> discrete sub-action is consumed first.
        self.isOdd = 1
        return self.observation

    def step(self, action) -> tuple:
        """Apply one hybrid action and advance the episode.

        :param action: dict with keys 'even' (Box) and 'odd' (Discrete);
            only the one matching the current mode is read.
        :return: (observation, reward, done, info)
        """
        self.step_num += 1
        reward = -1.0  # per-step cost

        # Pick the displacement from whichever sub-action is active.
        if self.isOdd == 0:
            delta = action['even'][0]
        else:
            # Discrete: map 0 -> -1, 1 -> +1.
            delta = -1 if action['odd'] != 1 else action['odd']

        position = self.observation['postion']
        position[0] += delta

        # Flip the mode and the corresponding action mask together.
        self.isOdd = 1 - self.isOdd
        self.observation['action_mask'] = 1 - self.observation['action_mask']

        x = position[0]
        if self.step_num > 10 or x < 0 or x > 10:
            # Timeout or left the corridor: heavy penalty, episode over.
            reward -= 100.0
            self.done = True
        elif 9 <= x <= 10:
            # Reached the goal interval.
            reward = 100.0
            self.done = True
        return self.observation, reward, self.done, {}

    def render(self, mode='human'):
        # Visualisation intentionally not implemented.
        pass