Debugging proof of concept env with custom GCN model

I’m working on a problem where the observation space is an undirected graph, and I created a proof-of-concept environment to debug any issues, since I’m also using a custom model. However, I’m seeing that the average reward doesn’t improve even though the task is relatively simple. The observation is an n×n adjacency matrix in which each node has a state (either 1 or 0), stored on the diagonal. The node states are then used as features for the graph convolutional network. The action space is Discrete(n), one action per node: an action gives +1 reward for a 0→1 state change; otherwise the agent gets -1 reward for the time step. I’m using PPO with mostly out-of-the-box settings. From the policy model outputs, it seems like all actions are equally likely no matter the observation, which is consistent with the reward never increasing.
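One way to see the uniform-policy symptom directly is to inspect the action-distribution inputs the policy produces (a minimal sketch, assuming an RLlib version whose Trainer.compute_single_action supports full_fetch=True; trainer here is the PPO trainer, and GraphEnv is the environment defined below):

import numpy as np

env = GraphEnv()
obs = env.reset()

# full_fetch=True also returns the extra model outputs, including the raw
# logits under the "action_dist_inputs" key
action, _, extra = trainer.compute_single_action(obs, full_fetch=True)
logits = extra["action_dist_inputs"]
probs = np.exp(logits) / np.exp(logits).sum()
print(probs)  # near-uniform (~1/num_nodes per action) regardless of obs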

Here’s the environment:

import gym
import networkx as nx
import numpy as np


class GraphEnv(gym.Env):
    def __init__(self):
        self.num_nodes = 8
        self.MAX_STEP = 100
        self.action_space = gym.spaces.Discrete(self.num_nodes)
        # Adjacency matrix observation; node states (0/1) live on the diagonal
        self.observation_space = gym.spaces.Box(low=0, high=2, shape=(self.num_nodes, self.num_nodes))

    def reset(self):
        graph = nx.generators.random_regular_graph(3, self.num_nodes) #nx.generators.barbell_graph(self.num_nodes // 2 - 1, 2)
        self.obs = nx.to_numpy_array(graph, dtype=np.float32)

        np.fill_diagonal(self.obs, 0)
        self.current_steps = 0
        return self.obs

    def step(self, action):
        done = False
        info = {}
        if self.obs[action][action] == 0:
            # Setting an unset node yields +1 (before the per-step penalty below)
            self.obs[action][action] = 1
            reward = 1
        else:
            reward = 0.0

        if sum(self.obs.diagonal()) == self.num_nodes:
            done = True
            info = {}
            #reward = 2 / self.num_nodes

        self.current_steps += 1
        if self.current_steps >= self.MAX_STEP:
            done = True
        else:
            reward -= 1  # per-step penalty; also applies to successful flips, netting 0

        return self.obs, reward, done, info
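As a point of reference: because the per-step penalty also applies on steps where a node is flipped, even the optimal policy nets 0 per flip, so the best possible episode return is 0 and a random policy scores below that. A quick baseline rollout (a sketch, assuming the GraphEnv above):

env = GraphEnv()
returns = []
for _ in range(100):
    env.reset()
    done, total = False, 0.0
    while not done:
        _, reward, done, _ = env.step(env.action_space.sample())
        total += reward
    returns.append(total)
print(sum(returns) / len(returns))  # random baseline the agent should beat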

and the custom model:

import gym
import networkx as nx
import numpy as np
import pandas as pd
import stellargraph as sg
from stellargraph.mapper import PaddedGraphGenerator
from stellargraph.layer import GCNSupervisedGraphClassification
from tensorflow.keras.utils import to_categorical
from ray.rllib.models.tf.tf_modelv2 import TFModelV2
from ray.rllib.models.tf.misc import normc_initializer
from ray.rllib.utils.framework import try_import_tf

tf1, tf, tfv = try_import_tf()


class GraphModel(TFModelV2):
    def __init__(
        self, obs_space, action_space, num_outputs, model_config, name, **kwargs
    ):
        assert isinstance(obs_space, gym.spaces.Box)

        super().__init__(obs_space, action_space, num_outputs, model_config, name)
        batch_size = 32  # dummy batch, only used to build the StellarGraph generator
        self.num_states = 2

        if obs_space.shape[0] == obs_space.shape[1]:
            self.num_nodes = obs_space.shape[0]
        else:
            raise Exception(f"Mismatch in shape of observation space {obs_space.shape[0]} {obs_space.shape[1]}")
        nx_graphs = [nx.generators.random_regular_graph(3, self.num_nodes) for i in range(batch_size)]
        graphs = [nx.to_numpy_array(nx_graph, dtype=np.float32) for nx_graph in nx_graphs]

        # Random one-hot node features, only needed to construct the dummy graphs
        features = tf.constant(np.random.randint(0, 2, size=(batch_size, self.num_nodes, 1)))
        features = tf.keras.utils.to_categorical(features, num_classes=2)

        sg_graphs = []
        for i, graph in enumerate(graphs):
            np.fill_diagonal(graph, 0)

            edge_coords = np.where(graph == 1)
            edges = pd.DataFrame({"source": edge_coords[0], "target": edge_coords[1]})

            sg_graph = sg.StellarGraph(features[i], edges)
            sg_graphs.append(sg_graph)
        self.generator = PaddedGraphGenerator(graphs=sg_graphs)

        gc_model_1 = GCNSupervisedGraphClassification(
            layer_sizes=[8],
            activations=["tanh"],
            dropout=0,
            bias=True,
            generator=self.generator,
            kernel_initializer=normc_initializer(1.0)
        )
        gc_model_2 = GCNSupervisedGraphClassification(
            layer_sizes=[8],
            activations=["tanh"],
            dropout=0,
            bias=True,
            generator=self.generator,
            kernel_initializer=normc_initializer(1.0)
        )
        x_inp_policy, x_out_policy = gc_model_1.in_out_tensors()

        # Fresh Keras Input layers matching the GCN's expected input tensors
        model_inputs = [
            tf.keras.layers.Input(gc_input.shape[1:])
            for gc_input in x_inp_policy
        ]

        gc_policy = gc_model_1(model_inputs)
        gc_value = gc_model_2(model_inputs)

        f1 = tf.keras.layers.Dense(128, name="fc_1", activation="tanh", kernel_initializer=normc_initializer(1.0))(gc_policy)
        fcv1 = tf.keras.layers.Dense(128, name="fc_value_1", activation="tanh", kernel_initializer=normc_initializer(1.0))(gc_value)

        f2 = tf.keras.layers.Dense(128, name="fc_2", activation="tanh", kernel_initializer=normc_initializer(1.0))(f1)
        fcv2 = tf.keras.layers.Dense(128, name="fc_value_2", activation="tanh", kernel_initializer=normc_initializer(1.0))(fcv1)

        fc_out = tf.keras.layers.Dense(self.num_nodes, name="fc_out", activation="linear", kernel_initializer=normc_initializer(0.01))(f2)
        value_out = tf.keras.layers.Dense(1, name="fc_value_out", activation="linear", kernel_initializer=normc_initializer(0.01))(fcv2)

        self.base_model = tf.keras.Model(inputs=model_inputs, outputs=[fc_out, value_out])

    def forward(self, input_dict, state, seq_lens):
        obs_batch_tensor = input_dict["obs"]

        # Only needed for the batch size; .numpy() requires eager execution
        if not isinstance(obs_batch_tensor, np.ndarray):
            obs_batch_ndarray = obs_batch_tensor.numpy()
        else:
            obs_batch_ndarray = obs_batch_tensor

        batch_size = obs_batch_ndarray.shape[0]
        
        # Node states live on the diagonal; one-hot them as GCN node features
        features = tf.linalg.diag_part(obs_batch_tensor)
        one_hot_features = to_categorical(features, num_classes=self.num_states)

        # Zero the diagonal so only the edges remain in the adjacency matrix
        zero_fill_array = tf.cast(tf.zeros((batch_size, self.num_nodes)), dtype=tf.float32)
        obs_batch_edges = tf.linalg.set_diag(obs_batch_tensor, zero_fill_array)

        normalized_graphs = self.normalize_batch(obs_batch_edges)

        # Model inputs: node features, node mask (all ones, no padding), adjacency
        logits, self._value_out = self.base_model([
            one_hot_features,
            np.ones((batch_size, self.num_nodes)),
            normalized_graphs
        ])

        return logits, state

    def value_function(self):
        return tf.reshape(self._value_out, [-1])

    def get_initial_state(self):
        return []
    
    def normalize_batch(self, obs_batch: tf.Tensor) -> tf.Tensor:
        # Symmetric normalization D^-1/2 A D^-1/2; epsilon guards zero degrees
        adj_batch_tensor = obs_batch
        epsilon = 1e-5

        node_degrees = tf.reduce_sum(adj_batch_tensor, axis=1) + epsilon
        sqrt_tensor = tf.ones(node_degrees.shape) * -0.5

        pow_tensor = tf.math.pow(node_degrees, sqrt_tensor)
        zeros_tensor = tf.zeros(adj_batch_tensor.shape, dtype=tf.float32)
        normal_diag = tf.linalg.set_diag(zeros_tensor, pow_tensor)

        normalized_adjacency = tf.transpose(tf.matmul(adj_batch_tensor, normal_diag), perm=[0, 2, 1])
        return tf.matmul(normalized_adjacency, normal_diag)
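To rule out the normalization itself, normalize_batch can be checked against an explicit NumPy computation (a sketch, assuming eager TF; the method never touches self, so it can be called unbound for a quick test):

g = nx.random_regular_graph(3, 8)
a = nx.to_numpy_array(g, dtype=np.float32)
d = np.diag(a.sum(axis=1) ** -0.5)
expected = d @ a @ d  # reference D^-1/2 A D^-1/2

out = GraphModel.normalize_batch(None, tf.constant(a[None, ...])).numpy()[0]
print(np.allclose(expected, out, atol=1e-3))  # True up to the epsilon term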

Any insight or feedback would be appreciated.

[Screenshots of training results attached]

For debugging, I simplified the observation to see if the issue was the model. Instead of the 2D Box observation, I’m now passing just the diagonal (which is really the relevant part). I also removed the custom model, so now I’m running with the default FCNet. I’m still seeing high entropy and no convergence. Here are the changes I made:

class GraphEnv(gym.Env):
    def __init__(self):
        self.num_nodes = 32
        self.MAX_STEP = 100
        self.action_space = gym.spaces.Discrete(self.num_nodes)
        #self.observation_space = gym.spaces.Box(low=0, high=2, shape=(self.num_nodes, self.num_nodes))
        self.observation_space = gym.spaces.Box(low=0, high=2, shape=(self.num_nodes,))

    def reset(self):
        graph = nx.generators.random_regular_graph(3, self.num_nodes) #nx.generators.barbell_graph(self.num_nodes // 2 - 1, 2)
        self.obs = nx.to_numpy_array(graph, dtype=np.float32)

        np.fill_diagonal(self.obs, 0)
        self.current_steps = 0
        return self.obs.diagonal() #self.obs

    def step(self, action):
        done = False
        info = {}

        # No per-flip reward anymore; only the completion bonus below
        if self.obs[action][action] == 0:
            self.obs[action][action] = 1
            reward = 0.0
        else:
            reward = 0.0

        if all(self.obs.diagonal()):
            done = True
            info = {}
            reward = 5

        if self.current_steps >= self.MAX_STEP:
            done = True
        else:
            reward -= 1

        self.current_steps += 1

        return self.obs.diagonal(), reward, done, info#self.obs, reward, done, info
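For completeness, the training loop around this is essentially stock PPO (a minimal sketch, assuming a pre-2.0 Ray where PPOTrainer is the entry point; everything except the env is left at defaults):

import ray
from ray.rllib.agents.ppo import PPOTrainer

ray.init()
trainer = PPOTrainer(env=GraphEnv, config={"framework": "tf2"})
for _ in range(20):
    result = trainer.train()
    print(result["episode_reward_mean"])  # stays flat in my runs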

Hi @guillermo,

Can you post a full reproduction script? I think that would help people better understand the setup.

Yeah for sure, here’s a Colab script.