Hi everyone,
we are having trouble with RLlib when using Tuple action spaces in our gym environment. It seems that the preprocessor is not applied to actions if we use a view requirement that accesses the action from the previous time step. This only happens when the environment uses a Tuple action space. We created a minimal example:
env.py:
import gym
import numpy as np


class DebugEnv(gym.Env):
    def __init__(self, config=None):
        # Using this Tuple action space instead of the Box below makes RLlib crash:
        # self.action_space = gym.spaces.Tuple((
        #     gym.spaces.Discrete(2),
        #     gym.spaces.Box(-5, 5, (2,), dtype=np.float32),
        # ))
        self.action_space = gym.spaces.Box(-5, 5, (2,), dtype=np.float32)
        self.counter = 0
        self.observation_space = gym.spaces.Box(0, 1, (3,), dtype=np.float32)

    def reset(self):
        self.counter = 0
        return self.observation_space.sample()

    def step(self, action):
        self.counter += 1
        # Random observation, zero reward, episode ends after 9 steps.
        return self.observation_space.sample(), 0, self.counter == 9, {}
model.py:
import gym
import torch
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.rllib.policy.view_requirement import ViewRequirement
from ray.rllib.utils.annotations import override
from ray.rllib.utils.typing import ModelConfigDict


class Model(TorchModelV2, torch.nn.Module):
    def __init__(
        self,
        obs_space: gym.spaces.Space,
        action_space: gym.spaces.Space,
        num_outputs: int,
        model_config: ModelConfigDict,
        name: str,
        *args,
        **kwargs
    ):
        TorchModelV2.__init__(
            self, obs_space, action_space, num_outputs, model_config, name
        )
        torch.nn.Module.__init__(self)
        # Request the observation and the action from the previous time step.
        self.view_requirements["prev_obs"] = ViewRequirement(
            data_col="obs", space=self.obs_space, shift=-1
        )
        self.view_requirements["prev_act"] = ViewRequirement(
            data_col="actions", space=self.action_space, shift=-1
        )
        self.linear = torch.nn.Linear(3, num_outputs)

    @override(TorchModelV2)
    def forward(self, input_dict, state, seq_lens):
        # Accessing the previous action is what triggers the problem with Tuple spaces.
        prev_obs = input_dict["prev_obs"]
        prev_act = input_dict["prev_act"]
        self.batch_size = prev_obs.shape[0]
        x = self.linear(input_dict["obs"])
        return x, []

    @override(TorchModelV2)
    def value_function(self):
        return torch.zeros([self.batch_size])
run.py:
import matplotlib as mpl
mpl.use('Agg')
import numpy as np
import random
import ray
from ray import tune
from ray.rllib.agents.ppo import PPOTrainer
from ray.rllib.models import ModelCatalog
from env import DebugEnv
from model import Model
import torch

seed = 123
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

ModelCatalog.register_custom_model("my_torch_model", Model)

config = {
    "framework": "torch",
    "env": DebugEnv,
    "evaluation_config": {
        "num_envs_per_worker": 1,
    },
    "evaluation_interval": 10,
    "evaluation_duration": 1,
    "evaluation_num_workers": 1,
    "rollout_fragment_length": 1,
    "train_batch_size": 640,
    "num_workers": 4,
    "num_envs_per_worker": 5,
    "num_sgd_iter": 10,
    "num_gpus": 1,
    "lr": 1e-5,
    "model": {
        "custom_model": "my_torch_model",
    },
}


def max_min_stopper(trial_id, result):
    # Stop once the reward spread is small enough (after at least 10 iterations)
    # or once the maximum number of iterations is reached.
    max_iter = 10000
    if result["training_iteration"] < 10:
        return False
    if result["episode_reward_min"] >= result["episode_reward_max"] * 0.95:
        return True
    elif result["training_iteration"] >= max_iter:
        return True
    return False


def run_ray_experiment(debug=False):
    if debug:
        ray.init(local_mode=True)
        config["num_workers"] = 1
        config["evaluation_interval"] = 1
    else:
        ray.init(local_mode=True)
    tune.run(
        PPOTrainer,
        config=config,
        local_dir=logdir,
        name=name,
        log_to_file="experiment.log",
        stop=max_min_stopper,
    )


if __name__ == "__main__":
    logdir = "results"
    name = "test"
    run_ray_experiment()
This code works as expected. If the action space in env.py is changed to the commented-out Tuple space, however, RLlib crashes when trying to concatenate the collected samples.
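For context, our (possibly mistaken) understanding is that RLlib's preprocessor flattens a Tuple action into a single flat vector, so we would have expected prev_act to show up in the input dict in that flattened form. This standalone snippet illustrates what we mean (a sketch, assuming get_preprocessor from ray.rllib.models.preprocessors can be used this way):

import gym
import numpy as np
from ray.rllib.models.preprocessors import get_preprocessor

# The Tuple action space from env.py that triggers the crash.
action_space = gym.spaces.Tuple((
    gym.spaces.Discrete(2),
    gym.spaces.Box(-5, 5, (2,), dtype=np.float32),
))

# get_preprocessor returns the preprocessor class registered for the space;
# transform() flattens one sample: one-hot(2) + Box(2,) -> length-4 vector.
prep = get_preprocessor(action_space)(action_space)
flat_action = prep.transform(action_space.sample())
print(flat_action.shape)  # we expect (4,)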
Might this be a bug, or are we overlooking something?
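One workaround we thought about, although we are not sure it would even address the concatenation error, is to declare the prev_act view requirement over a flattened Box space instead of the raw Tuple (untested sketch for Model.__init__ in model.py, assuming gym.spaces.flatten_space produces the layout RLlib expects):

# Untested sketch for Model.__init__ in model.py: request previous actions
# over a flattened Box space instead of the raw Tuple action space.
flat_action_space = gym.spaces.flatten_space(self.action_space)
self.view_requirements["prev_act"] = ViewRequirement(
    data_col="actions", space=flat_action_space, shift=-1
)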
We would appreciate any help!
Best regards
Fedor