1. Severity of the issue: (select one)
High: Completely blocks me.
2. Environment:
- Ray version: 2.46.0
- Python version: 3.12.9
- OS: linux
- Cloud/Infrastructure:
3. What happened vs. what you expected:
- Expected: Setting the seed parameter in the .debugging() section of the algorithm config makes training runs reproducible
- Actual: Setting the parameter (or seeding some components manually) only made some aspects reproducible (e.g. the layer weights when initializing my custom network), while training as a whole did not become reproducible
I am running a multi-agent learning experiment using Farama PettingZoo environments, for example TicTacToe. I'm currently trying to make my training runs reproducible by seeding the random components. Since there is a seed parameter in the .debugging() part of the algorithm configuration (I'm using PPO), I tried using that, but different training runs with the same seed did not produce the same results.
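For reference, this is essentially how I set the seed (a minimal sketch; the environment id is a placeholder for my registered PettingZoo env, and the multi-agent wiring is shown further below):

from ray.rllib.algorithms.ppo import PPOConfig

config = (
    PPOConfig()
    .environment("my_pettingzoo_env")  # placeholder for my registered PettingZoo env
    .debugging(seed=42)                # the seed parameter this issue is about
)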
When trying out the example at https://github.com/ray-project/ray/blob/master/rllib/examples/debugging/deterministic_sampling_and_training.py, I noticed that simply running the script produced the expected identical results. However, when running it with num_agents = 2, the results began to diverge (this did not happen every time; some runs produced the same results while others did not). I then set the .debugging seed in the example script at https://github.com/ray-project/ray/blob/master/rllib/examples/multi_agent/two_step_game_with_grouped_agents.py, since it uses a multi-agent environment but only a single policy for both agents. In this case the results stayed consistent over multiple training runs with the same seed.
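This is roughly how I compare runs for determinism (a sketch using a stand-in single-agent env; the metric path assumes the new API stack's result dict layout):

from ray.rllib.algorithms.ppo import PPOConfig

def run_once(seed: int, num_iters: int = 3):
    config = PPOConfig().environment("CartPole-v1").debugging(seed=seed)
    algo = config.build()
    returns = [
        algo.train()["env_runners"]["episode_return_mean"] for _ in range(num_iters)
    ]
    algo.stop()
    return returns

# Two runs with the same seed should produce identical return curves;
# in my multi-agent setup they diverge.
assert run_once(42) == run_once(42)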
Because of this, I believe the problem might have something to do with using multiple agents and policies. I also tried seeding some components manually, but that only achieved the same results as the .debugging seed parameter. The layer weights of my network are initialized identically when using the same seed, so weight initialization is not part of the issue.
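The manual seeding I tried looks like this (set at the top of my training script; listing it here so it is clear what "manually" means):

import random

import numpy as np
import torch

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# Optional strictness; only relevant on GPU and may require
# CUBLAS_WORKSPACE_CONFIG to be set.
torch.use_deterministic_algorithms(True)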
I am currently testing against a random opponent (implemented as a custom module) that picks actions via gymnasium's Space.sample() method. I have seeded that space manually with success, so it should not be part of the problem.
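The opponent's sampling is seeded like this (names are placeholders, but it mirrors what my random module does):

import gymnasium as gym

opponent_action_space = gym.spaces.Discrete(9)  # e.g. the 9 cells in TicTacToe
opponent_action_space.seed(42)
action = opponent_action_space.sample()  # reproducible given the seed above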
Here are the relevant custom modules I am using (closely modelled after the RLlib action-masking module examples):
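For completeness, these are the imports the modules below rely on (reconstructed; the RLlib paths are my best guess for Ray 2.46 and may need adjusting):

from typing import Any, Dict, Optional, Tuple, Union

import torch
from torch import nn
from gymnasium.spaces import Dict as Dictionary, Space

from ray.rllib.algorithms.ppo.torch.default_ppo_torch_rl_module import (
    DefaultPPOTorchRLModule,
)
from ray.rllib.core.columns import Columns
from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig
from ray.rllib.core.rl_module.rl_module import RLModule
from ray.rllib.utils.annotations import override
from ray.rllib.utils.torch_utils import FLOAT_MIN
from ray.rllib.utils.typing import TensorType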
class ActMskModule(RLModule):
    @override(RLModule)
    def __init__(
        self,
        *,
        observation_space: Optional[Space] = None,
        action_space: Optional[Space] = None,
        inference_only: Optional[bool] = None,
        learner_only: bool = False,
        model_config: Optional[Union[dict, DefaultModelConfig]] = None,
        catalog_class=None,
        **kwargs,
    ):
        if not isinstance(observation_space, Dictionary):
            raise ValueError(
                "This module requires a Dictionary observation space with "
                f"'observation' and 'action_mask' keys, yours is {observation_space}"
            )
        # Keep the full dict space around, but pass only the plain observation
        # space to the parent so the default networks are built without the mask.
        self.obs_space_with_mask = observation_space
        self.observation_space = observation_space["observation"]
        self._checked_observation = False
        super().__init__(
            observation_space=self.observation_space,
            action_space=action_space,
            inference_only=inference_only,
            learner_only=learner_only,
            model_config=model_config,
            catalog_class=catalog_class,
            **kwargs,
        )

    def _preprocess_batch(
        self, batch: Dict[str, TensorType], **kwargs  # type: ignore
    ) -> Tuple[TensorType, Dict[str, TensorType]]:  # type: ignore
        # Split the dict observation into the action mask and the plain observation.
        self._check_batch(batch)
        action_mask = batch[Columns.OBS].pop("action_mask")
        batch[Columns.OBS] = batch[Columns.OBS].pop("observation")
        return action_mask, batch

    def _mask_action_logits(
        self, batch: Dict[str, TensorType], action_mask: TensorType  # type: ignore
    ) -> Dict[str, TensorType]:  # type: ignore
        # Add a large negative value to the logits of masked-out actions.
        inf_mask = torch.clamp(torch.log(action_mask), min=FLOAT_MIN)
        batch[Columns.ACTION_DIST_INPUTS] += inf_mask
        return batch

    def _check_batch(self, batch: Dict[str, TensorType]) -> None:  # type: ignore
        # Validate the dict observation layout once, on the first batch seen.
        if not self._checked_observation:
            if "action_mask" not in batch[Columns.OBS]:
                raise ValueError("No action_mask key found in observation")
            if "observation" not in batch[Columns.OBS]:
                raise ValueError("No observation key found in observation")
            self._checked_observation = True
class ActMskTorchModuleSeperateMLPNet(ActMskModule, DefaultPPOTorchRLModule):
    @override(DefaultPPOTorchRLModule)
    def setup(self):
        super().setup()
        width, height, in_channels = self.observation_space.shape
        output_dim = self.action_space.n
        input_dim = width * height * in_channels
        hiddens = list(self.model_config.get("fcnet_hiddens", [])) + list(
            self.model_config.get("post_fcnet_hiddens", [])
        )
        # Build separate MLP stacks for the policy (actor) and value function (critic).
        layers = []
        critic_layers = []
        prev_layer_size = input_dim
        for size in hiddens:
            layer = nn.Linear(prev_layer_size, size)
            nn.init.xavier_uniform_(layer.weight)
            nn.init.zeros_(layer.bias)
            critic_layer = nn.Linear(prev_layer_size, size)
            nn.init.xavier_uniform_(critic_layer.weight)
            nn.init.zeros_(critic_layer.bias)
            layers.append(layer)
            layers.append(nn.Tanh())
            critic_layers.append(critic_layer)
            critic_layers.append(nn.Tanh())
            prev_layer_size = size
        self._base_mlp_stack = nn.Sequential(nn.Flatten(), *layers)
        self._critic_mlp_stack = nn.Sequential(nn.Flatten(), *critic_layers)
        self._logits = nn.Linear(prev_layer_size, output_dim)
        self._values = nn.Linear(prev_layer_size, 1)
        # Restore the full dict space so the connectors keep passing the action mask through.
        self.observation_space = self.obs_space_with_mask

    @override(DefaultPPOTorchRLModule)
    def _forward(self, batch, **kwargs):
        _, logits = self._compute_embeddings_and_logits(batch)
        return {Columns.ACTION_DIST_INPUTS: logits}

    @override(DefaultPPOTorchRLModule)
    def _forward_inference(self, batch: Dict[str, TensorType], **kwargs) -> Dict[str, TensorType]:  # type: ignore
        action_mask, batch = self._preprocess_batch(batch)
        outs = super()._forward_inference(batch, **kwargs)
        return self._mask_action_logits(outs, action_mask)

    @override(DefaultPPOTorchRLModule)
    def _forward_exploration(self, batch: Dict[str, TensorType], **kwargs) -> Dict[str, TensorType]:  # type: ignore
        action_mask, batch = self._preprocess_batch(batch)
        outs = super()._forward_exploration(batch, **kwargs)
        return self._mask_action_logits(outs, action_mask)

    @override(DefaultPPOTorchRLModule)
    def _forward_train(self, batch, **kwargs):
        # Expects the mask at the top level of the batch; compute_values() below
        # moves it there when it unpacks the dict observation.
        embeddings, logits = self._compute_embeddings_and_logits(batch)
        return self._mask_action_logits(
            {
                Columns.ACTION_DIST_INPUTS: logits,
                Columns.EMBEDDINGS: embeddings,
            },
            batch["action_mask"],
        )

    @override(DefaultPPOTorchRLModule)
    def compute_values(self, batch: Dict[str, TensorType], embeddings: Optional[Any] = None) -> TensorType:  # type: ignore
        if isinstance(batch[Columns.OBS], dict):
            action_mask, batch = self._preprocess_batch(batch)
            batch["action_mask"] = action_mask
        obs = batch[Columns.OBS].permute(0, 3, 1, 2).float()
        embeddings = self._critic_mlp_stack(obs)
        return self._values(embeddings).squeeze(-1)

    @override(DefaultPPOTorchRLModule)
    def _compute_embeddings_and_logits(self, batch):
        obs = batch[Columns.OBS].permute(0, 3, 1, 2).float()
        embeddings = self._base_mlp_stack(obs)
        logits = self._logits(embeddings)
        return embeddings, logits
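And this is roughly how the modules are wired into the multi-agent PPO config (a sketch; the env id, agent ids, module names, and the RandomOpponentModule are placeholders from my setup):

from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.core.rl_module.multi_rl_module import MultiRLModuleSpec
from ray.rllib.core.rl_module.rl_module import RLModuleSpec

config = (
    PPOConfig()
    .environment("tictactoe_v3")  # placeholder; a registered PettingZoo env
    .multi_agent(
        policies={"learning_policy", "random_opponent"},
        policy_mapping_fn=lambda agent_id, episode, **kwargs: (
            "learning_policy" if agent_id == "player_1" else "random_opponent"
        ),
        policies_to_train=["learning_policy"],
    )
    .rl_module(
        rl_module_spec=MultiRLModuleSpec(
            rl_module_specs={
                "learning_policy": RLModuleSpec(
                    module_class=ActMskTorchModuleSeperateMLPNet
                ),
                "random_opponent": RLModuleSpec(
                    module_class=RandomOpponentModule  # my custom random module
                ),
            }
        )
    )
    .debugging(seed=42)
)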