RLlib with Tuple action space

Hi everyone,

we are having trouble with RLlib when using a Tuple action space in our gym environment. It seems that the preprocessor is not applied to actions when a view requirement requests the action from the previous time step. However, this only happens if the environment uses a Tuple action space. We created a minimal example.

env.py:

import gym
import numpy as np


class DebugEnv(gym.Env):
    def __init__(self, config=None):
        # Swapping in this Tuple action space (instead of the Box below)
        # triggers the crash described at the end of this post.
        # self.action_space = gym.spaces.Tuple((
        #     gym.spaces.Discrete(2),
        #     gym.spaces.Box(-5, 5, (2, ), dtype=np.float32)
        # ))
        self.action_space = gym.spaces.Box(-5, 5, (2, ), dtype=np.float32)

        self.counter = 0

        self.observation_space = gym.spaces.Box(0, 1, (3,), dtype=np.float32)

    def reset(self):
        # Reset the step counter so that every episode terminates after 9 steps.
        self.counter = 0
        return self.observation_space.sample()

    def step(self, action):
        self.counter += 1
        # Random observation, zero reward, done after 9 steps.
        return self.observation_space.sample(), 0, self.counter == 9, {}
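
For reference, the environment itself can be sanity-checked with a short standalone rollout (independent of RLlib):

from env import DebugEnv

env = DebugEnv()
obs = env.reset()
done, steps = False, 0
while not done:
    # Random actions are fine here; the env ignores them anyway.
    obs, reward, done, info = env.step(env.action_space.sample())
    steps += 1
print("episode finished after", steps, "steps")  # expect 9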

model.py:

import gym
import torch

from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.rllib.policy.view_requirement import ViewRequirement
from ray.rllib.utils.annotations import override
from ray.rllib.utils.typing import ModelConfigDict


class Model(TorchModelV2, torch.nn.Module):
    def __init__(
        self,
        obs_space: gym.spaces.Space,
        action_space: gym.spaces.Space,
        num_outputs: int,
        model_config: ModelConfigDict,
        name: str,
        *args,
        **kwargs
    ):

        TorchModelV2.__init__(
            self, obs_space, action_space, num_outputs, model_config, name
        )
        torch.nn.Module.__init__(self)

        # Request the observation and action from the previous time step.
        self.view_requirements["prev_obs"] = ViewRequirement(
            data_col="obs", space=self.obs_space, shift=-1
        )
        self.view_requirements["prev_act"] = ViewRequirement(
            data_col="actions", space=self.action_space, shift=-1
        )

        self.linear = torch.nn.Linear(3, num_outputs)

    @override(TorchModelV2)
    def forward(self, input_dict, state, seq_lens):
        # The previous observation and action are accessed only to exercise the
        # view requirements; prev_obs is also used to record the batch size.
        prev_obs = input_dict["prev_obs"]
        prev_act = input_dict["prev_act"]
        self.batch_size = prev_obs.shape[0]
        x = self.linear(input_dict["obs"])
        return x, []

    @override(TorchModelV2)
    def value_function(self):
        return torch.zeros([self.batch_size])

run.py:

import matplotlib as mpl
mpl.use('Agg')
import numpy as np
import random
import ray
from ray.rllib.agents.ppo import PPOTrainer
from ray.rllib.models import ModelCatalog
from env import DebugEnv
from model import Model
import torch

seed = 123
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)


ModelCatalog.register_custom_model("my_torch_model", Model)


config = {
    "framework": "torch",
    "env": DebugEnv,

    "evaluation_config": {
        'num_envs_per_worker': 1,
    },
    "evaluation_interval": 10,
    "evaluation_duration": 1,
    "evaluation_num_workers": 1,

    'rollout_fragment_length': 1,
    'train_batch_size': 640,
    'num_workers': 4,
    'num_envs_per_worker': 5,
    'num_sgd_iter': 10,
    'num_gpus': 1,
    "lr": 1e-5,
    "model": {
        "custom_model": "my_torch_model",
    }
}

def max_min_stopper(trial_id, result):
    max_iter = 10000
    if result["training_iteration"] < 10:
        return False
    if result["episode_reward_min"] >= result["episode_reward_max"] * 0.95:
        return True
    elif result["training_iteration"] >= max_iter:
        return True
    return False


def run_ray_experiment(debug=False):
    if debug:
        # Local mode and a single worker make stepping through the code easier.
        ray.init(local_mode=True)
        config['num_workers'] = 1
        config['evaluation_interval'] = 1
    else:
        ray.init()

    ray.tune.run(
        PPOTrainer,
        config=config,
        local_dir=logdir,
        name=name,
        log_to_file="experiment.log",
        stop=max_min_stopper,
    )


if __name__ == "__main__":
    logdir = 'results'
    name = 'test'
    run_ray_experiment()

This code works as expected. If the action space in env.py is changed to the commented-out Tuple space, however, RLlib crashes when trying to concatenate the samples.
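
One thing we are not sure about: RLlib may store Tuple actions in flattened form in the sample batch, in which case the prev_act view requirement would have to be declared over the flattened space rather than the Tuple itself. The snippet below is only a sketch of that idea; the use of gym.spaces.utils.flatten_space is our assumption and not verified against RLlib internals.

import gym
import numpy as np
from gym.spaces.utils import flatten_space

# The Tuple action space from env.py ...
tuple_space = gym.spaces.Tuple((
    gym.spaces.Discrete(2),
    gym.spaces.Box(-5, 5, (2,), dtype=np.float32),
))

# ... and its flattened counterpart: the Discrete(2) part becomes a one-hot
# vector of length 2, so the flattened space is a Box with shape (4,).
flat_space = flatten_space(tuple_space)
print(flat_space.shape)  # (4,)

# Hypothetical change in Model.__init__: declare prev_act over the flattened
# space instead of self.action_space.
# self.view_requirements["prev_act"] = ViewRequirement(
#     data_col="actions", space=flatten_space(self.action_space), shift=-1
# )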

Might this be a bug, or are we overlooking something?
We would be happy about any help!

Best regards
Fedor

It turns out that this problem no longer exists in Ray 2.2.0, which was released ~30 minutes after I posted this. Good timing!

Best regards
Fedor
