Debugging proof of concept env with custom GCN model

guillermo · July 2, 2023, 10:48am

I’m working on a problem where the observation space is an undirected graph, and I created a proof-of-concept environment to debug any issues, since I’m also using a custom model. However, I’m seeing that the average reward doesn’t improve even though the task is relatively simple. Basically, the observation is the n×n adjacency matrix of the graph, and each node has a state (either 1 or 0) stored on the diagonal. The states are then used as features for the graph convolutional network. The action space has one discrete action per node in the graph; an action gives a +1 reward when it causes a state change of 0->1. Otherwise, the agent gets a -1 reward for each time step. Additionally, I’m using PPO with mostly out-of-the-box settings. From the policy model outputs, it seems like all actions are equally likely no matter the observation. This is backed up by the reward, which never increases.

Here’s the environment:

class GraphEnv(gym.Env):
    """Toy graph environment used to debug a custom GCN policy.

    The observation is the ``num_nodes x num_nodes`` adjacency matrix of a
    random 3-regular graph. The diagonal is repurposed to hold each node's
    binary state (0 = not yet activated, 1 = activated). Action ``i`` tries to
    activate node ``i``.

    Reward per step, as implemented in ``step``:
      * +1 if the chosen node flipped 0 -> 1, otherwise 0;
      * minus a 1-point time penalty on every step except the one that hits
        ``MAX_STEP``.

    The episode ends when every node is activated or after ``MAX_STEP`` steps.
    Uses the classic gym API (``reset() -> obs``,
    ``step() -> (obs, reward, done, info)``).
    """

    def __init__(self):
        self.num_nodes = 8
        self.MAX_STEP = 100
        self.action_space = gym.spaces.Discrete(self.num_nodes)
        # Values actually produced are only 0 and 1; the upper bound of 2 is
        # kept as originally declared.
        self.observation_space = gym.spaces.Box(
            low=0, high=2, shape=(self.num_nodes, self.num_nodes)
        )

    def reset(self):
        """Sample a fresh random 3-regular graph and clear all node states.

        Returns:
            A copy of the ``(num_nodes, num_nodes)`` float32 observation.
        """
        graph = nx.generators.random_regular_graph(3, self.num_nodes)
        self.obs = nx.to_numpy_array(graph, dtype=np.float32)

        # The diagonal carries node states, so start with every node at 0.
        np.fill_diagonal(self.obs, 0)
        self.current_steps = 0
        # Return a copy: ``self.obs`` is mutated in place by ``step`` and any
        # caller that stores observations by reference would otherwise see
        # its earlier observations change retroactively.
        return self.obs.copy()

    def step(self, action):
        """Activate node ``action`` and advance the episode by one step.

        Args:
            action: Index of the node to activate.

        Returns:
            ``(obs, reward, done, info)`` where ``obs`` is a copy of the
            current observation matrix.
        """
        info = {}

        if self.obs[action][action] == 0:
            self.obs[action][action] = 1
            reward = 1.0
        else:
            reward = 0.0

        # All node states are 0 or 1, so the diagonal sums to ``num_nodes``
        # exactly when every node has been activated.
        done = bool(sum(self.obs.diagonal()) == self.num_nodes)

        self.current_steps += 1
        if self.current_steps >= self.MAX_STEP:
            done = True
        else:
            # Per-step time penalty (not applied on the timeout step).
            reward -= 1

        # Copy for the same aliasing reason as in ``reset``.
        return self.obs.copy(), reward, done, info

and the custom model:

tf1, tf, tfv = try_import_tf()


class GraphModel(TFModelV2):
    """Custom RLlib TF model with separate GCN trunks for policy and value.

    The observation is expected to be a square ``(num_nodes, num_nodes)`` Box
    whose off-diagonal entries are the adjacency matrix and whose diagonal
    holds each node's discrete state. ``forward`` splits the two apart,
    one-hot encodes the states as node features, symmetrically normalises the
    adjacency matrix and feeds both to a Keras model built from two
    StellarGraph ``GCNSupervisedGraphClassification`` stacks followed by
    dense heads.

    NOTE(review): ``GCNSupervisedGraphClassification`` presumably pools the
    per-node embeddings into one graph-level vector before the dense layers.
    If so, the policy head cannot tell *which* node is in which state, only
    aggregate statistics -- confirm against the StellarGraph documentation.
    """

    def __init__(
        self, obs_space, action_space, num_outputs, model_config, name, **kwargs
    ):
        assert (
            isinstance(obs_space, gym.spaces.Box)
        )

        super().__init__(obs_space, action_space, num_outputs, model_config, name)
        # Number of dummy graphs used only to construct the StellarGraph
        # generator below; unrelated to the training batch size.
        batch_size = 32
        self.num_states = 2

        if obs_space.shape[0] != obs_space.shape[1]:
            # ValueError is still an Exception, so existing handlers keep working.
            raise ValueError(
                f"Mismatch in shape of observation space {obs_space.shape[0]} {obs_space.shape[1]}"
            )
        self.num_nodes = obs_space.shape[0]

        # Build throw-away graphs with random node features. They exist only
        # so that PaddedGraphGenerator can be constructed and handed to the
        # GCN builders.
        nx_graphs = [
            nx.generators.random_regular_graph(3, self.num_nodes)
            for _ in range(batch_size)
        ]
        graphs = [
            nx.to_numpy_array(nx_graph, dtype=np.float32) for nx_graph in nx_graphs
        ]

        # to_categorical operates on NumPy data, so pass the NumPy array
        # directly instead of wrapping it in a TF constant first.
        features = np.random.randint(0, 2, size=(batch_size, self.num_nodes, 1))
        features = tf.keras.utils.to_categorical(features, num_classes=2)

        sg_graphs = []
        for i, graph in enumerate(graphs):
            np.fill_diagonal(graph, 0)

            edge_coords = np.where(graph == 1)
            edges = pd.DataFrame({"source": edge_coords[0], "target": edge_coords[1]})

            sg_graphs.append(sg.StellarGraph(features[i], edges))
        self.generator = PaddedGraphGenerator(graphs=sg_graphs)

        # Two independent GCN stacks: one feeds the policy head, the other
        # the value head.
        gc_model_1 = GCNSupervisedGraphClassification(
            layer_sizes=[8],
            activations=["tanh"],
            dropout=0,
            bias=True,
            generator=self.generator,
            kernel_initializer=normc_initializer(1.0),
        )
        gc_model_2 = GCNSupervisedGraphClassification(
            layer_sizes=[8],
            activations=["tanh"],
            dropout=0,
            bias=True,
            generator=self.generator,
            kernel_initializer=normc_initializer(1.0),
        )
        # Only the input tensors are needed, to copy their per-sample shapes.
        x_inp_policy, _ = gc_model_1.in_out_tensors()

        model_inputs = [
            tf.keras.layers.Input(gc_input.shape[1:])
            for gc_input in x_inp_policy
        ]

        gc_policy = gc_model_1(model_inputs)
        gc_value = gc_model_2(model_inputs)

        f1 = tf.keras.layers.Dense(
            128, name="fc_1", activation="tanh",
            kernel_initializer=normc_initializer(1.0),
        )(gc_policy)
        fcv1 = tf.keras.layers.Dense(
            128, name="fc_value_1", activation="tanh",
            kernel_initializer=normc_initializer(1.0),
        )(gc_value)

        f2 = tf.keras.layers.Dense(
            128, name="fc_2", activation="tanh",
            kernel_initializer=normc_initializer(1.0),
        )(f1)
        fcv2 = tf.keras.layers.Dense(
            128, name="fc_value_2", activation="tanh",
            kernel_initializer=normc_initializer(1.0),
        )(fcv1)

        # One logit per node (action) and a scalar state value.
        fc_out = tf.keras.layers.Dense(
            self.num_nodes, name="fc_out", activation="linear",
            kernel_initializer=normc_initializer(0.01),
        )(f2)
        value_out = tf.keras.layers.Dense(
            1, name="fc_value_out", activation="linear",
            kernel_initializer=normc_initializer(0.01),
        )(fcv2)

        self.base_model = tf.keras.Model(inputs=model_inputs, outputs=[fc_out, value_out])

    def forward(self, input_dict, state, seq_lens):
        """Compute action logits for a batch of observations.

        Uses only TensorFlow ops (no ``.numpy()`` round trip), so it does not
        depend on eager execution or on a statically known batch size.

        Args:
            input_dict: Dict whose ``"obs"`` entry is the observation batch;
                assumed to be ``(batch, num_nodes, num_nodes)``.
            state: Recurrent state, passed through unchanged.
            seq_lens: Unused.

        Returns:
            ``(logits, state)``.
        """
        obs_batch = tf.cast(input_dict["obs"], tf.float32)

        # Node states live on the diagonal -> (batch, num_nodes).
        node_states = tf.linalg.diag_part(obs_batch)
        one_hot_features = tf.one_hot(
            tf.cast(node_states, tf.int32), depth=self.num_states
        )

        # Every node is real (no padding), so the node mask is all ones.
        node_mask = tf.ones_like(node_states)

        # Strip the node states off the diagonal to recover the pure adjacency.
        obs_batch_edges = tf.linalg.set_diag(obs_batch, tf.zeros_like(node_states))
        normalized_graphs = self.normalize_batch(obs_batch_edges)

        logits, self._value_out = self.base_model([
            one_hot_features,
            node_mask,
            normalized_graphs,
        ])

        return logits, state

    def value_function(self):
        """Return the value estimates from the latest ``forward`` call, flattened."""
        return tf.reshape(self._value_out, [-1])

    def get_initial_state(self):
        """This model is not recurrent, so there is no initial state."""
        return []

    def normalize_batch(self, obs_batch: tf.Tensor) -> tf.Tensor:
        """Symmetrically normalise a batch of adjacency matrices.

        Computes ``(A @ D^-1/2)^T @ D^-1/2`` per batch element, which equals
        ``D^-1/2 A D^-1/2`` for symmetric ``A``. ``D`` is the diagonal matrix
        of column sums of ``A``.

        NOTE(review): no self-loops are added before normalising, so a node's
        own features are not part of its own aggregation. The usual GCN
        formulation normalises ``A + I`` -- confirm this is intended.

        Args:
            obs_batch: Adjacency matrices, assumed ``(batch, n, n)``.

        Returns:
            Tensor of the same shape holding the normalised adjacency.
        """
        # Guards against division by zero for isolated nodes.
        epsilon = 1e-5

        node_degrees = tf.reduce_sum(obs_batch, axis=1) + epsilon
        # A scalar exponent and tf.linalg.diag avoid building helper tensors
        # from static shapes, which breaks when the batch size is unknown.
        normal_diag = tf.linalg.diag(tf.math.pow(node_degrees, -0.5))

        normalized_adjacency = tf.transpose(
            tf.matmul(obs_batch, normal_diag), perm=[0, 2, 1]
        )
        return tf.matmul(normalized_adjacency, normal_diag)

Any insight or feedback would be appreciated.

Screenshot from 2023-07-02 06-41-25

guillermo · July 3, 2023, 8:55am

For debugging, I simplified the observation to see if the issue was the model. Instead of a 2D Box observation, I’m now passing just the diagonal (which is really the relevant part). I also removed the custom model, so now I’m running with the default FCNet. I’m still seeing high entropy and no convergence. Here are the changes I made:

class GraphEnv(gym.Env):
    """Simplified debugging variant that exposes only the node states.

    A random 3-regular graph is still generated and stored as an adjacency
    matrix whose diagonal holds the binary node states, but the observation
    handed to the agent is just that diagonal: a ``(num_nodes,)`` float32
    vector. Action ``i`` activates node ``i``.

    Reward, as implemented in ``step``: 0 for an ordinary step and 5 on the
    step that activates the last node, minus a 1-point time penalty on every
    step except the one that hits ``MAX_STEP``.

    Uses the classic gym API (``reset() -> obs``,
    ``step() -> (obs, reward, done, info)``).
    """

    def __init__(self):
        self.num_nodes = 32
        self.MAX_STEP = 100
        self.action_space = gym.spaces.Discrete(self.num_nodes)
        # Values actually produced are only 0 and 1; the upper bound of 2 is
        # kept as originally declared.
        self.observation_space = gym.spaces.Box(low=0, high=2, shape=(self.num_nodes,))

    def reset(self):
        """Sample a fresh graph, clear all node states and reset the step count.

        Returns:
            A copy of the ``(num_nodes,)`` node-state vector.
        """
        graph = nx.generators.random_regular_graph(3, self.num_nodes)
        self.obs = nx.to_numpy_array(graph, dtype=np.float32)

        np.fill_diagonal(self.obs, 0)
        self.current_steps = 0
        # ``diagonal()`` is a view into ``self.obs``, which ``step`` mutates
        # in place. Return a copy so observations already handed out are not
        # changed by later steps.
        return self.obs.diagonal().copy()

    def step(self, action):
        """Activate node ``action`` and advance the episode by one step.

        Args:
            action: Index of the node to activate.

        Returns:
            ``(obs, reward, done, info)`` where ``obs`` is a copy of the
            node-state vector.
        """
        done = False
        info = {}
        # Activating a node carries no reward of its own in this variant.
        reward = 0.0

        if self.obs[action][action] == 0:
            self.obs[action][action] = 1

        if all(self.obs.diagonal()):
            done = True
            reward = 5

        # Count the step before testing the limit so that an episode lasts at
        # most MAX_STEP steps (incrementing after the check allowed one extra).
        self.current_steps += 1
        if self.current_steps >= self.MAX_STEP:
            done = True
        else:
            # Per-step time penalty (not applied on the timeout step).
            reward -= 1

        # Copy for the same aliasing reason as in ``reset``.
        return self.obs.diagonal().copy(), reward, done, info

mannyv · July 3, 2023, 12:53pm

Hi @guillermo,

Can you post a full reproduction script? I think that would help people better understand the setup.

guillermo · July 3, 2023, 3:31pm

Yeah for sure, here’s a Colab script.

Topic		Replies	Views
Observation dependent continuous action space ("Masking" continuous action space) RLlib	4	1075	February 9, 2022
Policy returning NaN weights and NaN biases. In addition, Policy observation space is different than expected RLlib	9	1355	January 31, 2023
Training mean reward vs. evaluation mean rewward RLlib	4	1293	November 17, 2022
Prediction outside outside action space during inference	0	104	March 18, 2024
Callback on_episode_end does not report correct actions Configure Algorithm, Training, Evaluation, Scaling	2	17	February 12, 2025

Debugging proof of concept env with custom GCN model

Related topics