Multi-agent PPO with custom model gives actions outside the action space

I am running a multi-agent PPO model in RLlib where the action space is defined as follows:

self.action_space = gym.spaces.Discrete(5)
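For reference, a Discrete(5) space only ever contains the integers 0 through 4, which is easy to verify directly:

import gym

space = gym.spaces.Discrete(5)
assert all(space.sample() in range(5) for _ in range(1000))
assert space.contains(4) and not space.contains(4000)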

The code is very similar to the custom Keras model implementation found here:

When I call the self.client.get_action or self.client.compute_single_action function, I get actions that are not within the action space. I expect actions between 0 and 4, but instead I receive values such as 4000 or 1335.
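For context, a typical query looks roughly like the sketch below (this assumes self.client is RLlib's PolicyClient from the external-application API; the server address and the dummy observation are placeholders):

import numpy as np
from ray.rllib.env.policy_client import PolicyClient

client = PolicyClient("http://localhost:9900")  # placeholder server address
episode_id = client.start_episode()
obs = np.zeros((16, 256, 10), dtype=np.float32)  # dummy observation
action = client.get_action(episode_id, obs)
assert action in range(5)  # for Discrete(5), actions should be the ints 0-4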

Do you have a reproduction script?

Maybe using ray/random_env.py from the ray-project/ray repo on GitHub?
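A minimal repro along those lines might look like the following sketch (module paths and the PPOTrainer/compute_single_action names vary across Ray versions, so adjust to yours):

import gym
import numpy as np
import ray
from ray.rllib.agents.ppo import PPOTrainer
from ray.rllib.examples.env.random_env import RandomEnv

ray.init()
config = {
    "env": RandomEnv,
    "env_config": {
        "action_space": gym.spaces.Discrete(5),
        "observation_space": gym.spaces.Box(
            -1000.0, 1000.0, (16, 256, 10), np.float32),
    },
    "framework": "tf",
}
trainer = PPOTrainer(config=config)
obs = RandomEnv(config["env_config"]).reset()
print(trainer.compute_single_action(obs))  # expect an int in [0, 4]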

import gym
import numpy as np
from ray.rllib.env.multi_agent_env import MultiAgentEnv


class RLAgent_v0(MultiAgentEnv):
    def __init__(self, return_agent_actions=False, part=False):
        super().__init__()
        low = np.full((16, 256, 10), -1000.0, dtype=np.float32)
        high = np.full((16, 256, 10), 1000.0, dtype=np.float32)
        self.obs = []
        # Five discrete actions per agent: valid samples are the ints 0-4.
        self.action_space = gym.spaces.Discrete(5)
        self.observation_space = gym.spaces.Box(
            low=low, high=high, shape=(16, 256, 10), dtype=np.float32
        )
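The snippet omits reset() and step(); in a MultiAgentEnv both must return dicts keyed by agent id. A minimal sketch with a placeholder agent id and trivial dynamics:

    def reset(self):
        # One observation per agent id.
        return {"agent_0": self.observation_space.sample()}

    def step(self, action_dict):
        obs = {aid: self.observation_space.sample() for aid in action_dict}
        rewards = {aid: 0.0 for aid in action_dict}
        dones = {aid: False for aid in action_dict}
        dones["__all__"] = False  # episode-level done flag required by RLlib
        infos = {aid: {} for aid in action_dict}
        return obs, rewards, dones, infos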


import tensorflow as tf

from ray.rllib.models.tf.tf_modelv2 import TFModelV2
from ray.rllib.models.tf.misc import normc_initializer


class KerasModel(TFModelV2):
    """Custom model for policy gradient algorithms."""

    def __init__(self, obs_space, action_space, num_outputs, model_config,
                 name):
        super(KerasModel, self).__init__(obs_space, action_space,
                                         num_outputs, model_config, name)

        self.inputs = tf.keras.layers.Input(
            shape=obs_space.shape, name="observations")

        layer_1 = tf.keras.layers.Conv2D(
            2, 3,
            name="layer1",
            padding="same",
            activation=tf.nn.relu,
            kernel_initializer=normc_initializer(1.0))(self.inputs)
        # Flattened conv features; with a (16, 256, 10) input and 2 filters
        # this is 16 * 256 * 2 = 8192 units wide.
        layer_2 = tf.keras.layers.Flatten()(layer_1)
        value_out = tf.keras.layers.Dense(
            1,
            name="value_out",
            activation=tf.nn.relu,
            kernel_initializer=normc_initializer(0.01))(layer_2)

        # The flattened features are used directly as the policy output.
        self.base_model = tf.keras.Model(self.inputs, [layer_2, value_out])
        self.register_variables(self.base_model.variables)
        self.base_model.summary()

    def forward(self, input_dict, state, seq_lens):
        model_out, self._value_out = self.base_model(input_dict["obs"])
        return model_out, state

    def value_function(self):
        return tf.reshape(self._value_out, [-1])

    def metrics(self):
        return {"foo": tf.constant(42.0)}

This is the environment and model I have defined.
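One thing worth double-checking here: TFModelV2.forward() is expected to return a tensor of shape [BATCH, num_outputs], where num_outputs is 5 for Discrete(5) under PPO (the logits of the Categorical action distribution). The model above instead returns layer_2, which is 16 * 256 * 2 = 8192 units wide, so sampled "actions" could land anywhere in 0-8191, which would be consistent with values like 4000 or 1335. A minimal sketch of a logits head that matches num_outputs:

        # Inside KerasModel.__init__: project the features down to
        # num_outputs logits instead of returning layer_2 directly.
        logits_out = tf.keras.layers.Dense(
            num_outputs,
            name="logits_out",
            activation=None,  # raw logits; RLlib's action dist samples from these
            kernel_initializer=normc_initializer(0.01))(layer_2)
        self.base_model = tf.keras.Model(self.inputs, [logits_out, value_out])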