Hi there,
I have been trying to train an agent in a custom environment that has a Box-type action space with shape (6,).
I noticed that during training all of the actions taken were within the low and high boundaries defined in the environment's initialisation, but at inference, when I load a checkpointed agent using agent.restore() followed by agent.compute_single_action(…), the actions do not adhere to these boundaries.
To sanity-check this, I trained a PPO agent on the MountainCarContinuous-v0 environment. I see the same behaviour if I change the default action_space to one with additional dimensions, as shown in the code below (the extra dimensions are simply ignored during step() so that the environment still functions).
I would really appreciate some help with this. It appears to happen with both the TensorFlow ('tf2') and the Torch backends.
Ray version: 2.0.0.dev0
Torch version: 1.5.1
Gym version: 0.17.2
Thanks in advance!
Training code:
import argparse
import os
import json
import random

import gym
import numpy as np
from gym.spaces import Box, Dict, Discrete

import ray
import ray.rllib.agents.ppo as ppo
from ray import tune
from ray.rllib.agents.trainer_template import build_trainer
from ray.rllib.models import ModelCatalog
from ray.rllib.policy.policy import Policy
from ray.rllib.utils.framework import try_import_tf, try_import_torch
from ray.rllib.utils.test_utils import check_learning_achieved
from ray.tune.registry import register_env

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ["CUDA_VISIBLE_DEVICES"] = "1,2,3"

parser = argparse.ArgumentParser()
parser.add_argument("--as-test", action="store_true")
# Needed by check_learning_achieved() below; 90.0 is the usual
# MountainCarContinuous-v0 reward threshold.
parser.add_argument("--stop-reward", type=float, default=90.0)
save_path = "PATH TO CHECKPOINT DIRECTORY"
if __name__ == "__main__":
args = parser.parse_args()
ray.shutdown()
ray.init()
config = dict({
"env": "MountainCarContinuous-v0",
"framework":"torch",
"timesteps_per_iteration":256,
"rollout_fragment_length": 64,
"train_batch_size":128,
"sgd_minibatch_size":16,
})
stop = {
"training_iteration":100
}
results = tune.run("PPO",
local_dir=save_path,
stop=stop,
config=config,
verbose=3,
restore=checkpoint_path,
checkpoint_freq=1)
print(results)
#check values of particular fields in the 'results' dict and stop if any conditions are met
if args.as_test:
check_learning_achieved(results, args.stop_reward)
ray.shutdown()
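For completeness: in the testing script below I just hard-code the checkpoint path, but it should also be possible to pull it out of the tune results object, roughly like this (a sketch only; get_best_trial / get_best_checkpoint and the episode_reward_mean metric are my assumption about the ExperimentAnalysis API in this Ray version):

# Sketch: recover a checkpoint path from the tune.run() results object.
best_trial = results.get_best_trial(metric="episode_reward_mean", mode="max")
best_checkpoint = results.get_best_checkpoint(
    best_trial, metric="episode_reward_mean", mode="max")
print("best checkpoint: {}".format(best_checkpoint))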
Testing code:
import argparse
import os
import json
import random

import matplotlib.pyplot as plt
import torch
import gym
import numpy as np
from gym.spaces import Box, Dict, Discrete

import ray
import ray.rllib.agents.ppo as ppo
from ray import tune
from ray.rllib.models import ModelCatalog
from ray.tune.registry import register_env

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ["CUDA_VISIBLE_DEVICES"] = "1,2,3"

path = "PATH TO CHECKPOINT"

ray.shutdown()
ray.init()

# Rebuild the trainer from the saved config, then restore the checkpointed weights.
with open("{}/params.json".format(path), 'r') as f:
    config = json.load(f)
agent = ppo.PPOTrainer(config=config)
agent.restore("{}".format(path))

env = gym.make('MountainCarContinuous-v0')
obs = env.reset()
print('action: {}'.format(agent.compute_single_action(obs)))
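This is where the boundary violation shows up for me: comparing the returned action against the Box bounds, roughly as below (a minimal sketch reusing env, agent and obs from the script above), gives values outside low/high.

# Sketch: compare the restored agent's action against the env's Box bounds.
action = agent.compute_single_action(obs)
print("low: {}, high: {}".format(env.action_space.low, env.action_space.high))
print("action: {}".format(action))
print("within bounds: {}".format(
    env.action_space.contains(np.asarray(action, dtype=np.float32))))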
Original MountainCarContinuous-v0 action space:
self.action_space = spaces.Box(
    low=-1.0,
    high=1.0,
    shape=(1,),
    dtype=np.float32
)
Modified MountainCarContinuous-v0 action space:
self.action_space = spaces.Box(
    low=-1.0,
    high=1.0,
    shape=(6,),
    dtype=np.float32
)
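For reference, the gist of the modification is just to pad the action space with extra dimensions and drop them again in step(). Expressed as a gym wrapper instead of an edit to the gym source, it would look roughly like this (the wrapper class name and the registered env id are only illustrative):

import gym
import numpy as np
from gym.spaces import Box
from ray.tune.registry import register_env

class PaddedActionWrapper(gym.ActionWrapper):
    """Pads the Box action space with extra dimensions that step() ignores."""

    def __init__(self, env, extra_dims=5):
        super().__init__(env)
        orig = env.action_space
        self.action_space = Box(
            low=float(orig.low[0]),
            high=float(orig.high[0]),
            shape=(orig.shape[0] + extra_dims,),
            dtype=np.float32)

    def action(self, action):
        # Only the original dimensions are forwarded to the underlying env.
        return np.asarray(action[:self.env.action_space.shape[0]], dtype=np.float32)

register_env(
    "PaddedMountainCarContinuous-v0",
    lambda env_config: PaddedActionWrapper(gym.make("MountainCarContinuous-v0")))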