RLlib PPO custom model only gets flattened observations

I want to create a custom model for my PPO agent, and it seemed like it should be easy enough. But I have run into some problems that I haven’t managed to debug myself, so I created an MWE in the hope that someone here has a clue.

The initial problem seems to be that my environment has Tuple state spaces, and I read that __call__ for the TFModelV2 will flatten the state and make it available as input_dict["obs_flat"], while the original observation should be in input_dict["obs"]. This seems not to be true, and I seem to get the same flattened state in both. I don’t understand why, and have tracked the flattened observation to the function _get_input_dict_and_dummy_batch in dynamic_tf_policy.py.

I then tried to run it with just a Dense network assuming the flattened observation, but then I get other errors from Keras that I don’t really understand. The Keras model runs fine by itself, so I assume the problem is still some error in the interaction with Ray.

import numpy as np
import tensorflow as tf
import gym

import ray
import ray.tune as tune
from ray.rllib.models.tf.tf_modelv2 import TFModelV2
from ray.rllib.models.modelv2 import ModelV2
from ray.rllib.utils.annotations import override
from ray.rllib.agents.ppo import PPOTrainer
from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.models.modelv2 import restore_original_dimensions
from ray.rllib.models import ModelCatalog

class TestEnv(gym.Env):
    """Toy environment with Tuple action and observation spaces.

    The state consists of two per-item feature vectors. An observation is the
    triple ``(feat1, feat2, [sum(feat1), sum(feat2)])``.

    Note: the feature vectors are returned as-is, i.e. they are the very
    arrays the environment keeps mutating, and their values are not clipped
    to the bounds declared in ``observation_space``.
    """

    def __init__(self, config):
        """Create the spaces from ``config["n_items"]``."""
        n = config["n_items"]
        self.n_items = n

        item_choice = gym.spaces.Discrete(n)
        amount = gym.spaces.Box(-1, 1, shape=(1,))
        self.action_space = gym.spaces.Tuple((item_choice, amount))

        per_item_1 = gym.spaces.Box(-100, 100, shape=(n,))
        per_item_2 = gym.spaces.Box(-100, 100, shape=(n,))
        totals = gym.spaces.Box(-100, 100, shape=(2,))
        self.observation_space = gym.spaces.Tuple((per_item_1, per_item_2, totals))

    def _current_state(self):
        """Return the observation triple for the current feature vectors."""
        sums = np.array([np.sum(self.items_feat1), np.sum(self.items_feat2)])
        return (self.items_feat1, self.items_feat2, sums)

    def reset(self):
        """Zero both feature vectors and return the initial observation."""
        self.items_feat1 = np.zeros(self.n_items)
        self.items_feat2 = np.zeros(self.n_items)
        return self._current_state()

    def step(self, action):
        """Apply ``action = (item index, amount)`` and advance one step.

        The amount is added to the second feature of the chosen item, then the
        second feature vector is accumulated into the first. The episode never
        terminates (``done`` is always ``False``).
        """
        item, amount = action
        self.items_feat2[item] += amount
        self.items_feat1 += self.items_feat2
        observation = self._current_state()
        reward = np.sum(np.abs(1 - self.items_feat1))
        return observation, reward, False, {}

class MyConvNetwork(TFModelV2):
    """Custom RLlib TF model wrapping a three-input Keras network.

    The Keras model takes two ``n_items``-wide tensors and one 2-wide tensor
    and returns ``n_items + 2`` action outputs together with one value
    estimate per sample.
    """

    def __init__(self, obs_space, action_space, num_outputs, model_config, name, n_items):
        """Build the Keras graph and register its variables.

        Args:
            obs_space: Observation space handed over by the caller; if it has
                an ``original_space`` attribute, that space is used instead.
            action_space, num_outputs, model_config, name: Forwarded unchanged
                to ``TFModelV2.__init__``.
            n_items: Width of the two per-item input vectors.

        Raises:
            AssertionError: If the (original) observation space is not a
                ``gym.spaces.Tuple``.
        """
        # Prefer the un-flattened space when the given one wraps it.
        self.original_space = obs_space.original_space if \
            hasattr(obs_space, "original_space") else obs_space
        assert isinstance(self.original_space, (gym.spaces.Tuple)), \
            "`obs_space.original_space` must be Tuple!"

        # Note: the base class receives the original Tuple space, not the
        # `obs_space` argument itself.
        super().__init__(self.original_space, action_space, num_outputs,
                         model_config, name)
        
        ## Test 1: Assuming input is tuples
        # One Keras input per element of the Tuple observation.
        inputs_conv1 = tf.keras.layers.Input(shape=(n_items,))
        inputs_conv2 = tf.keras.layers.Input(shape=(n_items,))
        inputs_rest = tf.keras.layers.Input(shape=(2,))
        inputs = [inputs_conv1, inputs_conv2, inputs_rest]
        # Flat (2 * n_items + 2)-wide view of all inputs for the dense branches.
        inputs_concat = tf.keras.layers.Concatenate()(inputs)

        # Per-item branch: pair up the two features of each item and apply a
        # kernel-size-1 convolution with a single filter, giving one output
        # per item.
        conv1 = tf.keras.layers.Reshape((n_items, 1))(inputs_conv1)
        conv2 = tf.keras.layers.Reshape((n_items, 1))(inputs_conv2)
        convall = tf.keras.layers.Concatenate()([conv1, conv2])
        prob_conv = tf.keras.layers.Conv1D(1, 1)(convall)
        prob_reshaped = tf.keras.layers.Reshape((-1,))(prob_conv)

        # Dense branch producing two further action outputs (the "Test 2"
        # variant below labels these as the Gaussian outputs).
        dense = tf.keras.layers.Dense(256, activation='relu')(inputs_concat)
        dense_out = tf.keras.layers.Dense(2)(dense)

        # NOTE(review): the action output width is fixed at n_items + 2 and is
        # not checked against `num_outputs`.
        action_out = tf.keras.layers.Concatenate()([prob_reshaped, dense_out])

        # Value head with its own hidden layer; the local name `dense` is
        # reused, the policy hidden layer above remains part of the graph.
        dense = tf.keras.layers.Dense(256, activation='relu')(inputs_concat)
        value_out = tf.keras.layers.Dense(1)(dense)

        ## Test 2: Assuming input is flattened, and making net simpler
        # inputs = tf.keras.layers.Input(shape=(2 * n_items + 2,))

        # catprob = tf.keras.layers.Dense(n_items)(inputs)
        # dense_out = tf.keras.layers.Dense(2)(inputs) # 2 outputs for Gaussian
        # action_out = tf.keras.layers.Concatenate()([catprob, dense_out])

        # dense = tf.keras.layers.Dense(256, activation='relu')(inputs)
        # value_out = tf.keras.layers.Dense(1)(dense)

        ## This is the same for both
        self.base_model = tf.keras.Model(inputs=inputs, outputs=[action_out, value_out])
        self.base_model.summary()
        self.register_variables(self.base_model.variables)

    @override(ModelV2)
    def forward(self, input_dict, state, seq_lens):
        """Run the Keras model on ``input_dict["obs"]``.

        Returns the action outputs and the unchanged ``state``; the value
        output is cached for ``value_function()``.
        """
        # obs = restore_original_dimensions(input_dict["obs"], self.obs_space)
        # NOTE(review): `base_model` was built with three inputs, so `obs`
        # must be a sequence of three tensors. The "Test 1" traceback shows a
        # single (?, 22) tensor arriving here, which Keras rejects.
        obs = input_dict["obs"]
        logit_tuple, values = self.base_model(obs)
        # Flatten the value output to one entry per sample.
        self._value_out = tf.reshape(values, [-1])
        return logit_tuple, state

    @override(ModelV2)
    def value_function(self):
        """Return the value estimates cached by the most recent ``forward()``."""
        return self._value_out

# Make the model class available under the name used in "custom_model" below.
ModelCatalog.register_custom_model("my_cnn_model", MyConvNetwork)

if __name__ == "__main__":
    # Connect to an already running Ray cluster.
    ray.init(address="auto")
    trainer = PPOTrainer( 
        env=TestEnv,
        config={
            # Read by TestEnv.__init__ (config["n_items"]).
            "env_config": {
                "n_items": 10
            },
            "model": {
                "custom_model": "my_cnn_model",
                # Supplies the extra `n_items` argument of
                # MyConvNetwork.__init__; must match env_config above.
                "custom_model_config": {
                    "n_items": 10
                }
            }
        })
    # Run 1000 training iterations; the result of each one is discarded.
    for i in range(1000):
        result = trainer.train()

Error from Test 1:

2021-04-28 16:02:27.686484: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
2021-04-28 16:02:29,703 INFO worker.py:656 -- Connecting to existing Ray cluster at address: 10.10.124.35:6379
2021-04-28 16:02:29,769 INFO trainer.py:616 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
Traceback (most recent call last):
  File "mwe.py", line 101, in <module>
    trainer = PPOTrainer( 
  File "/home/ubuntu/.local/lib/python3.8/site-packages/ray/rllib/agents/trainer_template.py", line 106, in __init__
    Trainer.__init__(self, config, env, logger_creator)
  File "/home/ubuntu/.local/lib/python3.8/site-packages/ray/rllib/agents/trainer.py", line 465, in __init__
    super().__init__(config, logger_creator)
  File "/home/ubuntu/.local/lib/python3.8/site-packages/ray/tune/trainable.py", line 96, in __init__
    self.setup(copy.deepcopy(self.config))
  File "/home/ubuntu/.local/lib/python3.8/site-packages/ray/rllib/agents/trainer.py", line 629, in setup
    self._init(self.config, self.env_creator)
  File "/home/ubuntu/.local/lib/python3.8/site-packages/ray/rllib/agents/trainer_template.py", line 133, in _init
    self.workers = self._make_workers(
  File "/home/ubuntu/.local/lib/python3.8/site-packages/ray/rllib/agents/trainer.py", line 700, in _make_workers
    return WorkerSet(
  File "/home/ubuntu/.local/lib/python3.8/site-packages/ray/rllib/evaluation/worker_set.py", line 79, in __init__
    remote_spaces = ray.get(self.remote_workers(
  File "/home/ubuntu/.local/lib/python3.8/site-packages/ray/worker.py", line 1379, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(ValueError): ray::RolloutWorker.foreach_policy() (pid=33668, ip=10.10.124.35)
  File "python/ray/_raylet.pyx", line 422, in ray._raylet.execute_task
  File "python/ray/_raylet.pyx", line 456, in ray._raylet.execute_task
  File "python/ray/_raylet.pyx", line 459, in ray._raylet.execute_task
  File "python/ray/_raylet.pyx", line 463, in ray._raylet.execute_task
  File "python/ray/_raylet.pyx", line 415, in ray._raylet.execute_task.function_executor
  File "/home/ubuntu/.local/lib/python3.8/site-packages/ray/rllib/evaluation/rollout_worker.py", line 460, in __init__
    self._build_policy_map(policy_dict, policy_config)
  File "/home/ubuntu/.local/lib/python3.8/site-packages/ray/rllib/evaluation/rollout_worker.py", line 1077, in _build_policy_map
    policy_map[name] = cls(obs_space, act_space, merged_conf)
  File "/home/ubuntu/.local/lib/python3.8/site-packages/ray/rllib/policy/tf_policy_template.py", line 217, in __init__
    DynamicTFPolicy.__init__(
  File "/home/ubuntu/.local/lib/python3.8/site-packages/ray/rllib/policy/dynamic_tf_policy.py", line 282, in __init__
    dist_inputs, self._state_out = self.model(
  File "/home/ubuntu/.local/lib/python3.8/site-packages/ray/rllib/models/modelv2.py", line 209, in __call__
    res = self.forward(restored, state or [], seq_lens)
  File "mwe.py", line 89, in forward
    logit_tuple, values = self.base_model(obs)
  File "/home/ubuntu/.local/lib/python3.8/site-packages/tensorflow/python/keras/engine/base_layer_v1.py", line 760, in __call__
    input_spec.assert_input_compatibility(self.input_spec, inputs,
  File "/home/ubuntu/.local/lib/python3.8/site-packages/tensorflow/python/keras/engine/input_spec.py", line 204, in assert_input_compatibility
    raise ValueError('Layer ' + layer_name + ' expects ' +
ValueError: Layer model expects 3 input(s), but it received 1 input tensors. Inputs received: [<tf.Tensor 'default_policy/Placeholder:0' shape=(?, 22) dtype=float32>]

Error from Test 2:

2021-04-28 15:49:19.241016: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
2021-04-28 15:49:21,260 INFO worker.py:656 -- Connecting to existing Ray cluster at address: 10.10.124.35:6379
2021-04-28 15:49:21,322 INFO trainer.py:616 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
2021-04-28 15:49:24.836316: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-04-28 15:49:24.837247: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2021-04-28 15:49:24.843727: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:82:00.0 name: GeForce GTX 1080 Ti computeCapability: 6.1
coreClock: 1.582GHz coreCount: 28 deviceMemorySize: 10.92GiB deviceMemoryBandwidth: 451.17GiB/s
2021-04-28 15:49:24.843790: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
2021-04-28 15:49:24.846549: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublas.so.11
2021-04-28 15:49:24.846595: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublasLt.so.11
2021-04-28 15:49:24.847591: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcufft.so.10
2021-04-28 15:49:24.847924: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcurand.so.10
2021-04-28 15:49:24.850834: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcusolver.so.10
2021-04-28 15:49:24.851528: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcusparse.so.11
2021-04-28 15:49:24.851704: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudnn.so.8
2021-04-28 15:49:24.852989: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1862] Adding visible gpu devices: 0
2021-04-28 15:49:24.853550: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-04-28 15:49:24.855360: I tensorflow/compiler/jit/xla_gpu_device.cc:99] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-04-28 15:49:24.856966: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:82:00.0 name: GeForce GTX 1080 Ti computeCapability: 6.1
coreClock: 1.582GHz coreCount: 28 deviceMemorySize: 10.92GiB deviceMemoryBandwidth: 451.17GiB/s
2021-04-28 15:49:24.857017: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
2021-04-28 15:49:24.857077: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublas.so.11
2021-04-28 15:49:24.857129: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublasLt.so.11
2021-04-28 15:49:24.857177: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcufft.so.10
2021-04-28 15:49:24.857225: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcurand.so.10
2021-04-28 15:49:24.857273: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcusolver.so.10
2021-04-28 15:49:24.857321: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcusparse.so.11
2021-04-28 15:49:24.857369: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudnn.so.8
2021-04-28 15:49:24.860264: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1862] Adding visible gpu devices: 0
2021-04-28 15:49:24.860345: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
2021-04-28 15:49:25.508508: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1261] Device interconnect StreamExecutor with strength 1 edge matrix:
2021-04-28 15:49:25.508561: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1267]      0 
2021-04-28 15:49:25.508585: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1280] 0:   N 
2021-04-28 15:49:25.510938: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1406] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 9575 MB memory) -> physical GPU (device: 0, name: GeForce GTX 1080 Ti, pci bus id: 0000:82:00.0, compute capability: 6.1)
Traceback (most recent call last):
  File "mwe.py", line 101, in <module>
    trainer = PPOTrainer( 
  File "/home/ubuntu/.local/lib/python3.8/site-packages/ray/rllib/agents/trainer_template.py", line 106, in __init__
    Trainer.__init__(self, config, env, logger_creator)
  File "/home/ubuntu/.local/lib/python3.8/site-packages/ray/rllib/agents/trainer.py", line 465, in __init__
    super().__init__(config, logger_creator)
  File "/home/ubuntu/.local/lib/python3.8/site-packages/ray/tune/trainable.py", line 96, in __init__
    self.setup(copy.deepcopy(self.config))
  File "/home/ubuntu/.local/lib/python3.8/site-packages/ray/rllib/agents/trainer.py", line 629, in setup
    self._init(self.config, self.env_creator)
  File "/home/ubuntu/.local/lib/python3.8/site-packages/ray/rllib/agents/trainer_template.py", line 133, in _init
    self.workers = self._make_workers(
  File "/home/ubuntu/.local/lib/python3.8/site-packages/ray/rllib/agents/trainer.py", line 700, in _make_workers
    return WorkerSet(
  File "/home/ubuntu/.local/lib/python3.8/site-packages/ray/rllib/evaluation/worker_set.py", line 87, in __init__
    self._local_worker = self._make_worker(
  File "/home/ubuntu/.local/lib/python3.8/site-packages/ray/rllib/evaluation/worker_set.py", line 315, in _make_worker
    worker = cls(
  File "/home/ubuntu/.local/lib/python3.8/site-packages/ray/rllib/evaluation/rollout_worker.py", line 462, in __init__
    self.policy_map, self.preprocessors = self._build_policy_map(
  File "/home/ubuntu/.local/lib/python3.8/site-packages/ray/rllib/evaluation/rollout_worker.py", line 1077, in _build_policy_map
    policy_map[name] = cls(obs_space, act_space, merged_conf)
  File "/home/ubuntu/.local/lib/python3.8/site-packages/ray/rllib/policy/eager_tf_policy.py", line 251, in __init__
    self.model = ModelCatalog.get_model_v2(
  File "/home/ubuntu/.local/lib/python3.8/site-packages/ray/rllib/models/catalog.py", line 347, in get_model_v2
    raise e
  File "/home/ubuntu/.local/lib/python3.8/site-packages/ray/rllib/models/catalog.py", line 332, in get_model_v2
    instance = model_cls(obs_space, action_space,
  File "mwe.py", line 73, in __init__
    catprob = tf.keras.layers.Dense(n_items)(inputs)
  File "/home/ubuntu/.local/lib/python3.8/site-packages/tensorflow/python/keras/engine/base_layer.py", line 951, in __call__
    return self._functional_construction_call(inputs, args, kwargs,
  File "/home/ubuntu/.local/lib/python3.8/site-packages/tensorflow/python/keras/engine/base_layer.py", line 1090, in _functional_construction_call
    outputs = self._keras_tensor_symbolic_call(
  File "/home/ubuntu/.local/lib/python3.8/site-packages/tensorflow/python/keras/engine/base_layer.py", line 822, in _keras_tensor_symbolic_call
    return self._infer_output_signature(inputs, args, kwargs, input_masks)
  File "/home/ubuntu/.local/lib/python3.8/site-packages/tensorflow/python/keras/engine/base_layer.py", line 862, in _infer_output_signature
    self._maybe_build(inputs)
  File "/home/ubuntu/.local/lib/python3.8/site-packages/tensorflow/python/keras/engine/base_layer.py", line 2710, in _maybe_build
    self.build(input_shapes)  # pylint:disable=not-callable
  File "/home/ubuntu/.local/lib/python3.8/site-packages/tensorflow/python/keras/layers/core.py", line 1185, in build
    self.kernel = self.add_weight(
  File "/home/ubuntu/.local/lib/python3.8/site-packages/tensorflow/python/keras/engine/base_layer.py", line 623, in add_weight
    variable = self._add_variable_with_custom_getter(
  File "/home/ubuntu/.local/lib/python3.8/site-packages/tensorflow/python/training/tracking/base.py", line 805, in _add_variable_with_custom_getter
    new_variable = getter(
  File "/home/ubuntu/.local/lib/python3.8/site-packages/tensorflow/python/keras/engine/base_layer_utils.py", line 130, in make_variable
    return tf_variables.VariableV1(
  File "/home/ubuntu/.local/lib/python3.8/site-packages/tensorflow/python/ops/variables.py", line 260, in __call__
    return cls._variable_v1_call(*args, **kwargs)
  File "/home/ubuntu/.local/lib/python3.8/site-packages/tensorflow/python/ops/variables.py", line 206, in _variable_v1_call
    return previous_getter(
  File "/home/ubuntu/.local/lib/python3.8/site-packages/tensorflow/python/ops/variables.py", line 67, in getter
    return captured_getter(captured_previous, **kwargs)
  File "/home/ubuntu/.local/lib/python3.8/site-packages/ray/rllib/models/catalog.py", line 324, in track_var_creation
    created.add(v)
  File "/home/ubuntu/.local/lib/python3.8/site-packages/tensorflow/python/ops/variables.py", line 1081, in __hash__
    raise TypeError("Variable is unhashable. "
TypeError: Variable is unhashable. Instead, use tensor.ref() as the key.

Hi @albheim,

This ran without issue for me. The observations were sometimes larger than 100, which was your high limit in the space, so I added a method to clip them in step. I also changed the way I imported tensorflow since I have tf2 installed and you were using tf1 keras methods. Other than that I did not really change anything.

import numpy as np

import gym

import ray
from ray.rllib.models.tf.tf_modelv2 import TFModelV2
from ray.rllib.models.modelv2 import ModelV2
from ray.rllib.utils import try_import_tf
from ray.rllib.utils.annotations import override
from ray.rllib.utils.debug import summarize
from ray.rllib.agents.ppo import PPOTrainer

from ray.rllib.models import ModelCatalog
tf, tf2, _ = try_import_tf()

class TestEnv(gym.Env):
    """Toy environment with Tuple action and observation spaces.

    The state consists of two per-item feature vectors. An observation is the
    triple ``(feat1, feat2, [sum(feat1), sum(feat2)])``, clipped to the bounds
    declared in ``observation_space``.
    """

    def __init__(self, config):
        """Create the spaces from ``config["n_items"]``."""
        self.n_items = config["n_items"]
        self.action_space = gym.spaces.Tuple((
            gym.spaces.Discrete(self.n_items),
            gym.spaces.Box(-1, 1, shape=(1,)),
        ))
        self.observation_space = gym.spaces.Tuple((
            gym.spaces.Box(-100, 100, shape=(self.n_items,)),
            gym.spaces.Box(-100, 100, shape=(self.n_items,)),
            gym.spaces.Box(-100, 100, shape=(2,)),
        ))

    def reset(self):
        """Zero both feature vectors and return the initial observation."""
        self.items_feat1 = np.zeros(self.n_items)
        self.items_feat2 = np.zeros(self.n_items)
        # Go through the same path as step() so the caller receives copies
        # rather than the arrays that step() mutates in place.
        return self._observation()

    def step(self, action):
        """Mark the chosen item and advance the state by one step.

        Only the discrete component of ``action`` is used; the continuous
        component is ignored. The reward is computed from the unclipped
        state, and the episode never terminates (``done`` is always ``False``).
        """
        i, _ = action
        self.items_feat2[i] = 1
        self.items_feat1 += self.items_feat2
        reward = np.sum(np.abs(1 - self.items_feat1))
        return self._observation(), reward, False, {}

    def _observation(self):
        """Build the clipped observation triple from the current state."""
        totals = np.array([np.sum(self.items_feat1), np.sum(self.items_feat2)])
        return self.clip_observation_spaces(
            (self.items_feat1, self.items_feat2, totals))

    def clip_observation_spaces(self, state):
        """Clip each element of ``state`` to the bounds of its sub-space.

        Returns a tuple of new arrays; the inputs are left untouched.
        """
        return tuple(
            np.clip(arry, space.low, space.high)
            for arry, space in zip(state, self.observation_space.spaces)
        )

class MyConvNetwork(TFModelV2):
    """Custom RLlib TF model wrapping a three-input Keras network.

    The Keras model takes two ``n_items``-wide tensors and one 2-wide tensor
    and returns ``n_items + 2`` action outputs together with one value
    estimate per sample.
    """

    def __init__(self, obs_space, action_space, num_outputs, model_config, name, n_items):
        """Build the Keras graph and register its variables.

        Args:
            obs_space: Observation space handed over by the caller; if it has
                an ``original_space`` attribute, that space is used instead.
            action_space, num_outputs, model_config, name: Forwarded unchanged
                to ``TFModelV2.__init__``.
            n_items: Width of the two per-item input vectors.

        Raises:
            AssertionError: If the (original) observation space is not a
                ``gym.spaces.Tuple``.
        """
        # Prefer the un-flattened space when the given one wraps it.
        self.original_space = obs_space.original_space if \
            hasattr(obs_space, "original_space") else obs_space
        assert isinstance(self.original_space, (gym.spaces.Tuple)), \
            "`obs_space.original_space` must be Tuple!"

        # Note: the base class receives the original Tuple space, not the
        # `obs_space` argument itself.
        super().__init__(self.original_space, action_space, num_outputs,
                         model_config, name)

        ## Test 1: Assuming input is tuples
        # One Keras input per element of the Tuple observation.
        inputs_conv1 = tf.keras.layers.Input(shape=(n_items,))
        inputs_conv2 = tf.keras.layers.Input(shape=(n_items,))
        inputs_rest = tf.keras.layers.Input(shape=(2,))
        inputs = [inputs_conv1, inputs_conv2, inputs_rest]
        # Flat (2 * n_items + 2)-wide view of all inputs for the dense branches.
        inputs_concat = tf.keras.layers.Concatenate()(inputs)

        # Per-item branch: pair up the two features of each item and apply a
        # kernel-size-1 convolution with a single filter, giving one output
        # per item.
        conv1 = tf.keras.layers.Reshape((n_items, 1))(inputs_conv1)
        conv2 = tf.keras.layers.Reshape((n_items, 1))(inputs_conv2)
        convall = tf.keras.layers.Concatenate()([conv1, conv2])
        prob_conv = tf.keras.layers.Conv1D(1, 1)(convall)
        prob_reshaped = tf.keras.layers.Reshape((-1,))(prob_conv)

        # Dense branch producing two further action outputs (the "Test 2"
        # variant below labels these as the Gaussian outputs).
        dense = tf.keras.layers.Dense(256, activation='relu')(inputs_concat)
        dense_out = tf.keras.layers.Dense(2)(dense)

        # NOTE(review): the action output width is fixed at n_items + 2 and is
        # not checked against `num_outputs`.
        action_out = tf.keras.layers.Concatenate()([prob_reshaped, dense_out])

        # Value head with its own hidden layer; the local name `dense` is
        # reused, the policy hidden layer above remains part of the graph.
        dense = tf.keras.layers.Dense(256, activation='relu')(inputs_concat)
        value_out = tf.keras.layers.Dense(1)(dense)

        ## Test 2: Assuming input is flattened, and making net simpler
        # inputs = tf.keras.layers.Input(shape=(2 * n_items + 2,))

        # catprob = tf.keras.layers.Dense(n_items)(inputs)
        # dense_out = tf.keras.layers.Dense(2)(inputs) # 2 outputs for Gaussian
        # action_out = tf.keras.layers.Concatenate()([catprob, dense_out])

        # dense = tf.keras.layers.Dense(256, activation='relu')(inputs)
        # value_out = tf.keras.layers.Dense(1)(dense)

        ## This is the same for both
        self.base_model = tf.keras.Model(inputs=inputs, outputs=[action_out, value_out])
        self.base_model.summary()
        self.register_variables(self.base_model.variables)

    @override(ModelV2)
    def forward(self, input_dict, state, seq_lens):
        """Run the Keras model on ``input_dict["obs"]``.

        Returns the action outputs and the unchanged ``state``; the value
        output is cached for ``value_function()``.
        """
        # obs = restore_original_dimensions(input_dict["obs"], self.obs_space)
        # NOTE(review): `base_model` was built with three inputs, so `obs`
        # must be a sequence of three tensors; a single flattened tensor is
        # rejected by Keras ("expects 3 input(s), but it received 1").
        obs = input_dict["obs"]
        logit_tuple, values = self.base_model(obs)
        # Flatten the value output to one entry per sample.
        self._value_out = tf.reshape(values, [-1])
        return logit_tuple, state

    @override(ModelV2)
    def value_function(self):
        """Return the value estimates cached by the most recent ``forward()``."""
        return self._value_out

# Make the model class available under the name used in "custom_model" below.
ModelCatalog.register_custom_model("my_cnn_model", MyConvNetwork)

if __name__ == "__main__":
    # Start Ray in local mode instead of attaching to a cluster.
    ray.init(local_mode=True)
    trainer = PPOTrainer(
        env=TestEnv,
        config={
            # NOTE(review): presumably disables remote rollout workers so that
            # sampling happens in the trainer process; confirm in RLlib docs.
            "num_workers": 0,
            # Read by TestEnv.__init__ (config["n_items"]).
            "env_config": {
                "n_items": 10
            },
            "model": {
                "custom_model": "my_cnn_model",
                # Supplies the extra `n_items` argument of
                # MyConvNetwork.__init__; must match env_config above.
                "custom_model_config": {
                    "n_items": 10
                }
            }
        })
    # Run 1000 training iterations and print a summary of each result.
    for i in range(1000):
        result = trainer.train()
        print(summarize(result))

Thanks for the response!

What version of ray are you running? I’m on 1.1.0, maybe that is the difference?

Running the code you posted gives me the same (or at least very similar) error I had before.

Traceback (most recent call last):
  File "mwe3.py", line 104, in <module>
    trainer = PPOTrainer(
  File "/home/ubuntu/.local/lib/python3.8/site-packages/ray/rllib/agents/trainer_template.py", line 106, in __init__
    Trainer.__init__(self, config, env, logger_creator)
  File "/home/ubuntu/.local/lib/python3.8/site-packages/ray/rllib/agents/trainer.py", line 465, in __init__
    super().__init__(config, logger_creator)
  File "/home/ubuntu/.local/lib/python3.8/site-packages/ray/tune/trainable.py", line 96, in __init__
    self.setup(copy.deepcopy(self.config))
  File "/home/ubuntu/.local/lib/python3.8/site-packages/ray/rllib/agents/trainer.py", line 629, in setup
    self._init(self.config, self.env_creator)
  File "/home/ubuntu/.local/lib/python3.8/site-packages/ray/rllib/agents/trainer_template.py", line 133, in _init
    self.workers = self._make_workers(
  File "/home/ubuntu/.local/lib/python3.8/site-packages/ray/rllib/agents/trainer.py", line 700, in _make_workers
    return WorkerSet(
  File "/home/ubuntu/.local/lib/python3.8/site-packages/ray/rllib/evaluation/worker_set.py", line 87, in __init__
    self._local_worker = self._make_worker(
  File "/home/ubuntu/.local/lib/python3.8/site-packages/ray/rllib/evaluation/worker_set.py", line 315, in _make_worker
    worker = cls(
  File "/home/ubuntu/.local/lib/python3.8/site-packages/ray/rllib/evaluation/rollout_worker.py", line 460, in __init__
    self._build_policy_map(policy_dict, policy_config)
  File "/home/ubuntu/.local/lib/python3.8/site-packages/ray/rllib/evaluation/rollout_worker.py", line 1077, in _build_policy_map
    policy_map[name] = cls(obs_space, act_space, merged_conf)
  File "/home/ubuntu/.local/lib/python3.8/site-packages/ray/rllib/policy/tf_policy_template.py", line 217, in __init__
    DynamicTFPolicy.__init__(
  File "/home/ubuntu/.local/lib/python3.8/site-packages/ray/rllib/policy/dynamic_tf_policy.py", line 282, in __init__
    dist_inputs, self._state_out = self.model(
  File "/home/ubuntu/.local/lib/python3.8/site-packages/ray/rllib/models/modelv2.py", line 209, in __call__
    res = self.forward(restored, state or [], seq_lens)
  File "mwe3.py", line 92, in forward
    logit_tuple, values = self.base_model(obs)
  File "/home/ubuntu/.local/lib/python3.8/site-packages/tensorflow/python/keras/engine/base_layer_v1.py", line 760, in __call__
    input_spec.assert_input_compatibility(self.input_spec, inputs,
  File "/home/ubuntu/.local/lib/python3.8/site-packages/tensorflow/python/keras/engine/input_spec.py", line 204, in assert_input_compatibility
    raise ValueError('Layer ' + layer_name + ' expects ' +
ValueError: Layer model expects 3 input(s), but it received 1 input tensors. Inputs received: [<tf.Tensor 'default_policy/Placeholder:0' shape=(?, 22) dtype=float32>]

@albheim,

It worked for me with the ray 1.3.0 and nightly wheels; it did not work with the ray 1.1.0 or 1.2.0 wheels.

@albheim It seems that the Tuple implementation in ray 1.1 and 1.2 was not complete, but it is in 1.3. Dict spaces seem to work in the older versions, so the following might work for you if you have a hard dependency on 1.1 for some reason.

import numpy as np

import gym

import ray
from ray.rllib.models.tf.tf_modelv2 import TFModelV2
from ray.rllib.models.modelv2 import \
    ModelV2, \
    restore_original_dimensions
from ray.rllib.utils import try_import_tf
from ray.rllib.utils.annotations import override
from ray.rllib.utils.debug import summarize
from ray.rllib.agents.ppo import PPOTrainer

from ray.rllib.models import ModelCatalog
tf, tf2, _ = try_import_tf()

class TestEnv(gym.Env):
    """Toy environment with a Tuple action space and a Dict observation space.

    The state consists of two per-item feature vectors. An observation is a
    dict with the keys ``"obs1"`` (first feature vector), ``"obs2"`` (second
    feature vector) and ``"obs3"`` (the sums of both vectors), each clipped to
    the bounds declared in ``observation_space``.
    """

    # Observation keys, in the order in which the raw state tuple is assembled.
    _OBS_KEYS = ("obs1", "obs2", "obs3")

    def __init__(self, config):
        """Create the spaces from ``config["n_items"]``."""
        self.n_items = config["n_items"]
        self.action_space = gym.spaces.Tuple((
            gym.spaces.Discrete(self.n_items),
            gym.spaces.Box(-1, 1, shape=(1,)),
        ))
        self.observation_space = gym.spaces.Dict({
            "obs1": gym.spaces.Box(-100, 100, shape=(self.n_items,)),
            "obs2": gym.spaces.Box(-100, 100, shape=(self.n_items,)),
            "obs3": gym.spaces.Box(-100, 100, shape=(2,)),
        })

    def reset(self):
        """Zero both feature vectors and return the initial observation."""
        self.items_feat1 = np.zeros(self.n_items)
        self.items_feat2 = np.zeros(self.n_items)
        # Go through the same path as step() so the caller receives copies
        # rather than the arrays that step() mutates in place.
        return self._observation()

    def step(self, action):
        """Mark the chosen item and advance the state by one step.

        Only the discrete component of ``action`` is used; the continuous
        component is ignored. The reward is computed from the unclipped
        state, and the episode never terminates (``done`` is always ``False``).
        """
        i, _ = action
        self.items_feat2[i] = 1
        self.items_feat1 += self.items_feat2
        reward = np.sum(np.abs(1 - self.items_feat1))
        return self._observation(), reward, False, {}

    def _observation(self):
        """Build the clipped observation dict from the current state."""
        totals = np.array([np.sum(self.items_feat1), np.sum(self.items_feat2)])
        raw = (self.items_feat1, self.items_feat2, totals)
        return dict(zip(self._OBS_KEYS, self.clip_observation_spaces(raw)))

    def clip_observation_spaces(self, state):
        """Clip each element of ``state`` to the bounds of its sub-space.

        Elements are paired with the sub-spaces in the iteration order of
        ``observation_space.spaces``. Returns a list of new arrays; the inputs
        are left untouched.
        """
        return [
            np.clip(arry, space.low, space.high)
            for arry, space in zip(state, self.observation_space.spaces.values())
        ]

class MyConvNetwork(TFModelV2):
    """Custom RLlib TF model for the Dict observation with keys obs1/obs2/obs3.

    A single Keras model with three inputs produces ``n_items + 2`` action
    outputs and one value estimate per sample.
    """

    def __init__(self, obs_space, action_space, num_outputs, model_config, name, n_items):
        """Build the Keras graph and register its variables.

        Args:
            obs_space: Observation space handed over by the caller; its
                ``original_space`` attribute (if present) must be a
                ``gym.spaces.Dict``.
            action_space, num_outputs, model_config, name: Forwarded unchanged
                to ``TFModelV2.__init__``.
            n_items: Width of the two per-item input vectors.
        """
        if hasattr(obs_space, "original_space"):
            self.original_space = obs_space.original_space
        else:
            self.original_space = obs_space
        assert isinstance(self.original_space, (gym.spaces.Dict)), \
            "`obs_space.original_space` must be Dictionary!"

        super().__init__(obs_space, action_space, num_outputs,
                         model_config, name)

        layers = tf.keras.layers

        # One input per observation entry, plus a flat view of all of them.
        feat1_in = layers.Input(shape=(n_items,))
        feat2_in = layers.Input(shape=(n_items,))
        totals_in = layers.Input(shape=(2,))
        model_inputs = [feat1_in, feat2_in, totals_in]
        flat_features = layers.Concatenate()(model_inputs)

        # Per-item branch: a kernel-size-1, single-filter convolution over the
        # two features of each item yields one output per item.
        feat1_col = layers.Reshape((n_items, 1))(feat1_in)
        feat2_col = layers.Reshape((n_items, 1))(feat2_in)
        item_features = layers.concatenate([feat1_col, feat2_col])
        item_conv = layers.Conv1D(1, 1)(item_features)
        item_outputs = layers.Flatten()(item_conv)

        # Dense branch contributing the remaining two action outputs.
        policy_hidden = layers.Dense(256, activation='relu')(flat_features)
        extra_outputs = layers.Dense(2)(policy_hidden)

        action_out = layers.Concatenate()([item_outputs, extra_outputs])

        # Value head with its own hidden layer.
        value_hidden = layers.Dense(256, activation='relu')(flat_features)
        value_out = layers.Dense(1)(value_hidden)

        self.base_model = tf.keras.Model(
            inputs=model_inputs, outputs=[action_out, value_out])
        self.base_model.summary()
        self.register_variables(self.base_model.variables)

    @override(ModelV2)
    def forward(self, input_dict, state, seq_lens):
        """Feed the three observation entries to the Keras model.

        Returns the action outputs and the unchanged ``state``; the value
        output is cached for ``value_function()``.
        """
        observation = input_dict["obs"]
        branch_inputs = [observation[key] for key in ("obs1", "obs2", "obs3")]
        action_logits, value = self.base_model(branch_inputs)
        self._value_out = tf.reshape(value, [-1])
        return action_logits, state

    @override(ModelV2)
    def value_function(self):
        """Return the value estimates cached by the most recent ``forward()``."""
        return self._value_out

# Make the model class available under the name used in the model settings.
ModelCatalog.register_custom_model("my_cnn_model", MyConvNetwork)

if __name__ == "__main__":
    ray.init(local_mode=True)

    # The custom model receives `n_items` through custom_model_config; it has
    # to match the value given to the environment.
    model_settings = {
        "custom_model": "my_cnn_model",
        "custom_model_config": {"n_items": 10},
    }
    trainer = PPOTrainer(
        env=TestEnv,
        config={
            "num_workers": 0,
            "env_config": {"n_items": 10},
            "model": model_settings,
        },
    )

    # Run 1000 training iterations and print a summary of each result.
    for _ in range(1000):
        print(summarize(trainer.train()))

Thanks, I tried 1.3 earlier today and it worked. Just forgot to come back and mark it as solved :slight_smile: