Hey all,
I am using a custom Gym environment whose action space is a gym.spaces.Box of shape (2,), with actions bounded in [0, 1]. I am training a PPO agent in this environment. Training seems to work: the reward is roughly on par with what Stable-Baselines3 reaches. I use Ray Tune for training.
However, when I load a checkpoint for evaluation and compute single actions with eval_agent.compute_single_action(obs), I get actions outside the environment bounds (e.g., [-0.16, -0.6]).
I followed, for instance, the Serve Tutorial (without Serve).
I am using Ray 1.9.1 on Windows 10 with Python 3.8.
I barely change the config between training and evaluation. What am I missing here? Is it something to do with clipping?
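To make concrete what I mean by clipping, here is a minimal sketch of the workaround I have in mind (assuming the Box bounds shown at the end of this post; `clip_action` is a hypothetical helper of mine, not an RLlib API):

```python
import numpy as np

# Hypothetical helper: clamp a policy output back into the Box bounds
# before stepping the env.
def clip_action(action, action_space):
    return np.clip(action, action_space.low, action_space.high)
```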
Thanks!
Thomas
My rollout script:
import ray
from ray.rllib.agents.ppo import PPOTrainer
import config as conf
import custom_gyms
from custom_gyms.gyms.envs.ui_env_movement import UiEnvMovement
import pygame
from itertools import count

ray.init()

env_config = conf.config.copy()
rllib_config = {
    "env_config": env_config,
    "num_workers": 0,
    "framework": "torch",
    "num_gpus": 0,
    "evaluation_config": {
        "explore": False
    }
}


def rollout_ppo(config, checkpoint_path):
    eval_agent = PPOTrainer(config, env=UiEnvMovement)
    eval_agent.restore(checkpoint_path)
    env = eval_agent.workers.local_worker().env
    for rollout in count():
        # env.environment.setup_render()
        # env.environment.update_folder(f"results/test/")
        obs = env.reset()
        done = False
        # env.render()
        # env.environment.screenshot()
        # gifs = []
        while not done:
            action = eval_agent.compute_single_action(obs)
            print(obs, action)
            obs, reward, done, info = env.step(action)
            # env.render()
            # env.environment.screenshot()
            # if done:
            #     gif = env.environment.to_gif()
            #     gifs.append(gif)
            #     break
        pygame.quit()
        if rollout == 20:
            break


rollout_ppo(rllib_config,
            checkpoint_path="ray_results/PPO/PPO_UIMovement_0dede_00000_0_2021-12-31_00-48-57/checkpoint_000060/checkpoint-60")
My training script:
import ray
from ray.rllib.agents.ppo import PPOTrainer
from ray import tune
from ray.tune.registry import register_env
import config as conf
import custom_gyms
from custom_gyms.gyms.envs.ui_env_movement import UiEnvMovement

ray.init()

select_env = "UIMovement"
register_env(select_env, lambda c: UiEnvMovement(c))

env_config = conf.config.copy()
rllib_config = {
    "env": select_env,
    "env_config": env_config,
    "num_workers": 10,
    "framework": "torch",
    "num_gpus": 0
}
stop = {
    "training_iteration": 60
}

tune_analysis = tune.run(
    PPOTrainer,
    config=rllib_config,
    verbose=1,
    fail_fast=True,
    local_dir="ray_results",  # output dir; matches the checkpoint path in the rollout script
    checkpoint_freq=10,
    checkpoint_at_end=True,
    num_samples=1,
    stop=stop
)
ray.shutdown()
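Side note: instead of hard-coding the checkpoint path in the rollout script, I could probably recover it from the Tune results; a sketch, assuming the ExperimentAnalysis API of Ray 1.9 and RLlib's standard episode_reward_mean metric:

```python
# Hypothetical: pull the best checkpoint out of the Tune results
# instead of hard-coding the path in the rollout script.
best_trial = tune_analysis.get_best_trial(metric="episode_reward_mean", mode="max")
checkpoint_path = tune_analysis.get_best_checkpoint(
    best_trial, metric="episode_reward_mean", mode="max")
```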
How I set my env action space:
self.action_space = spaces.Box(low=np.array([0., 0.]),
                               high=np.array([1.0, 1.0]), dtype=np.float64)
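The space itself behaves as expected; a quick standalone check (same construction as above):

```python
import numpy as np
from gym import spaces

space = spaces.Box(low=np.array([0., 0.]),
                   high=np.array([1.0, 1.0]), dtype=np.float64)
for _ in range(100):
    assert space.contains(space.sample())  # samples stay in [0, 1]
print(space.low, space.high)  # [0. 0.] [1. 1.]
```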