I’m working on a problem where the observation space is an undirected graph, and I created a proof-of-concept environment to debug any issues, since I’m also using a custom model. However, I’m seeing that the average reward doesn’t improve even though the task is relatively simple. Basically, the observation is an n×n matrix representing the graph, where each node has a state (either 1 or 0). The states are then used as features for the graph convolutional network. The action space has one discrete action per node in the graph; selecting a node gives a +1 reward if its state changes 0->1. Otherwise, the agent gets a -1 reward for each time step. Additionally, I’m using PPO with mostly out-of-the-box settings. From the policy model outputs, it seems like all actions are equally likely no matter the observation. This is backed up by the reward, which never increases.
Here’s the environment:
class GraphEnv(gym.Env):
    """Toy environment: switch on every node of a random 3-regular graph.

    The observation is the ``num_nodes x num_nodes`` adjacency matrix of the
    graph (float32), with the diagonal reused to hold each node's binary
    state (0 = off, 1 = on).  Action ``i`` selects node ``i``.

    NOTE(review): the pasted source had lost its indentation; the nesting
    below is the only syntactically valid reading (the trailing ``else``
    pairs with the ``MAX_STEP`` check) -- confirm against the original file.
    """

    def __init__(self):
        self.num_nodes = 8
        self.MAX_STEP = 100
        self.action_space = gym.spaces.Discrete(self.num_nodes)
        self.observation_space = gym.spaces.Box(
            low=0, high=2, shape=(self.num_nodes, self.num_nodes)
        )
        # Populated by reset(); declared here so the attributes always exist.
        self.obs = None
        self.current_steps = 0

    def reset(self):
        """Sample a fresh random 3-regular graph with every node switched off.

        Returns:
            A copy of the initial observation matrix.
        """
        graph = nx.generators.random_regular_graph(3, self.num_nodes)
        self.obs = nx.to_numpy_array(graph, dtype=np.float32)
        # The diagonal stores node states, so start with all nodes off.
        np.fill_diagonal(self.obs, 0)
        self.current_steps = 0
        # Return a copy: self.obs is mutated in place by step(), and a caller
        # that keeps a reference would otherwise see its stored observation
        # change underneath it.
        return self.obs.copy()

    def step(self, action):
        """Switch node ``action`` on (if it is off) and advance one time step.

        Reward: +1 when the selected node flips 0 -> 1, otherwise 0; then -1
        is subtracted on every step except the one that reaches ``MAX_STEP``.
        A useful flip therefore nets 0 and a wasted step nets -1.

        Returns:
            ``(observation, reward, done, info)`` where ``observation`` is a
            copy of the internal matrix and ``info`` is always empty.
        """
        info = {}

        if self.obs[action, action] == 0:
            self.obs[action, action] = 1
            reward = 1.0
        else:
            reward = 0.0

        # Episode is solved once every diagonal entry (node state) is 1.
        done = bool(self.obs.diagonal().sum() == self.num_nodes)

        self.current_steps += 1
        if self.current_steps >= self.MAX_STEP:
            done = True
        else:
            reward -= 1

        # Copy for the same aliasing reason as in reset().
        return self.obs.copy(), reward, done, info
and the custom model:
tf1, tf, tfv = try_import_tf()
class GraphModel(TFModelV2):
    """RLlib TF model with separate GCN branches for policy logits and value.

    Observations are batches of square matrices whose off-diagonal entries
    are the graph adjacency and whose diagonal holds each node's state.
    ``forward`` splits the two apart, one-hot encodes the states as node
    features, normalizes the adjacency and feeds both to a Keras model made
    of a StellarGraph ``GCNSupervisedGraphClassification`` block followed by
    two 128-unit tanh layers per branch.

    NOTE(review): ``GCNSupervisedGraphClassification`` is a graph-level
    model; if it pools node embeddings into a single vector (confirm in the
    StellarGraph docs), the action head cannot tell *which* node is off,
    which would be consistent with a near-uniform policy.
    """

    def __init__(
        self, obs_space, action_space, num_outputs, model_config, name, **kwargs
    ):
        """Build the Keras model.

        Raises:
            ValueError: if the observation space is not square.
        """
        assert isinstance(obs_space, gym.spaces.Box)
        super().__init__(obs_space, action_space, num_outputs, model_config, name)

        batch_size = 32
        self.num_states = 2
        if obs_space.shape[0] != obs_space.shape[1]:
            raise ValueError(
                f"Mismatch in shape of observation space {obs_space.shape[0]} {obs_space.shape[1]}"
            )
        self.num_nodes = obs_space.shape[0]

        # Throw-away random graphs: they are only handed to the generator
        # that the GCN constructors below take as an argument.
        nx_graphs = [
            nx.generators.random_regular_graph(3, self.num_nodes)
            for _ in range(batch_size)
        ]
        graphs = [
            nx.to_numpy_array(nx_graph, dtype=np.float32) for nx_graph in nx_graphs
        ]
        # Plain NumPy is enough here; wrapping in tf.constant only to convert
        # straight back to NumPy would additionally require eager execution.
        features = tf.keras.utils.to_categorical(
            np.random.randint(0, 2, size=(batch_size, self.num_nodes, 1)),
            num_classes=self.num_states,
        )
        sg_graphs = []
        for node_features, graph in zip(features, graphs):
            np.fill_diagonal(graph, 0)
            sources, targets = np.where(graph == 1)
            edges = pd.DataFrame({"source": sources, "target": targets})
            sg_graphs.append(sg.StellarGraph(node_features, edges))
        self.generator = PaddedGraphGenerator(graphs=sg_graphs)

        def build_gcn():
            # Policy and value branches use identical, independent GCN blocks.
            return GCNSupervisedGraphClassification(
                layer_sizes=[8],
                activations=["tanh"],
                dropout=0,
                bias=True,
                generator=self.generator,
                kernel_initializer=normc_initializer(1.0),
            )

        def dense(units, layer_name, activation="tanh", gain=1.0):
            return tf.keras.layers.Dense(
                units,
                name=layer_name,
                activation=activation,
                kernel_initializer=normc_initializer(gain),
            )

        gc_model_1 = build_gcn()
        gc_model_2 = build_gcn()

        # Re-create the GCN's own inputs as fresh Keras inputs (same shapes,
        # batch dimension dropped) so both branches share them.
        x_inp_policy, _ = gc_model_1.in_out_tensors()
        model_inputs = [
            tf.keras.layers.Input(gc_input.shape[1:]) for gc_input in x_inp_policy
        ]
        gc_policy = gc_model_1(model_inputs)
        gc_value = gc_model_2(model_inputs)

        f1 = dense(128, "fc_1")(gc_policy)
        fcv1 = dense(128, "fc_value_1")(gc_value)
        f2 = dense(128, "fc_2")(f1)
        fcv2 = dense(128, "fc_value_2")(fcv1)
        fc_out = dense(self.num_nodes, "fc_out", activation="linear", gain=0.01)(f2)
        value_out = dense(1, "fc_value_out", activation="linear", gain=0.01)(fcv2)
        self.base_model = tf.keras.Model(
            inputs=model_inputs, outputs=[fc_out, value_out]
        )

    def forward(self, input_dict, state, seq_lens):
        """Compute action logits for a batch of observations.

        Uses TensorFlow ops only (no ``.numpy()`` / NumPy helpers), so it
        does not depend on eager tensors or a statically known batch size.

        Returns:
            ``(logits, state)``; the value prediction is cached for
            ``value_function``.
        """
        obs = tf.cast(input_dict["obs"], tf.float32)

        # Diagonal = per-node state; one-hot encode it as the node features.
        node_states = tf.linalg.diag_part(obs)
        one_hot_features = tf.one_hot(
            tf.cast(node_states, tf.int32), depth=self.num_states, dtype=tf.float32
        )

        # Remove the states from the matrix so only the edges remain.
        obs_batch_edges = tf.linalg.set_diag(obs, tf.zeros_like(node_states))
        normalized_graphs = self.normalize_batch(obs_batch_edges)

        # Every node is real (no padding), hence an all-ones mask.
        node_mask = tf.ones_like(node_states)

        logits, self._value_out = self.base_model(
            [one_hot_features, node_mask, normalized_graphs]
        )
        return logits, state

    def value_function(self):
        """Return the value prediction of the last ``forward`` call, flattened."""
        return tf.reshape(self._value_out, [-1])

    def get_initial_state(self):
        """Return an empty state list."""
        return []

    def normalize_batch(self, obs_batch: tf.Tensor) -> tf.Tensor:
        """Scale each adjacency by the inverse square root of the node degrees.

        Computes ``(A @ D)^T @ D`` per batch element, where ``D`` is diagonal
        with entries ``(degree + 1e-5) ** -0.5``; for a symmetric ``A`` this
        equals ``D @ A @ D``.

        NOTE(review): the input has a zeroed diagonal (no self-loops), so a
        node's own features do not enter its own row of the result -- confirm
        whether ``A + I`` was intended.
        """
        # Epsilon keeps the power finite for isolated (degree-0) nodes.
        epsilon = 1e-5
        node_degrees = tf.reduce_sum(obs_batch, axis=1) + epsilon
        inv_sqrt_degrees = tf.math.pow(node_degrees, -0.5)
        # Batched diagonal matrix; avoids needing the static batch shape.
        normal_diag = tf.linalg.diag(inv_sqrt_degrees)
        normalized_adjacency = tf.transpose(
            tf.matmul(obs_batch, normal_diag), perm=[0, 2, 1]
        )
        return tf.matmul(normalized_adjacency, normal_diag)
Any insight or feedback would be appreciated.