1. Severity of the issue: High (completely blocks me).
2. Environment:
- Ray version: 2.51.1
- Python version: 3.12.10
- OS: Windows 11
3. What happened vs. what you expected:
- Expected: Setting the appropriate seeds ensures reproducibility across Ray Tune experiments.
- Actual: I set the seeds as described in the documentation, but I cannot get reproducible results between Ray Tune experiments.

Below is the code of my main.py, where I seed everything I could find, yet runs still diverge. Am I missing something? The docs on this are not very comprehensive, and all the posts I could find here are over three years old. (Two sketches of what I am considering next follow after the listing.)
```python
import random
from pathlib import Path

import numpy as np
import ray
import torch
from ray import tune
from ray.air.integrations.wandb import WandbLoggerCallback
from ray.rllib.core.rl_module import MultiRLModuleSpec, RLModuleSpec
from ray.rllib.examples.algorithms.mappo.mappo import MAPPOConfig
from ray.rllib.examples.algorithms.mappo.torch.shared_critic_torch_rl_module import SharedCriticTorchRLModule
from ray.tune.registry import register_env

from callbacks import MetricsLoggerCallback
from config.config import GENERATE_RANDOM_ROUTES, DEBUG, LOG_TO_WANDB
from rl_environment.observation_classes import CameraObservation, NoisyCameraObservation
from rl_environment.sumo_traffic_env import SumoTrafficEnv

SHARED_CRITIC_ID = "shared_critic"
SEED = 100
def env_creator(env_config):
    current_file = Path(__file__)
    project_base = current_file.parent.parent
    # Set the file paths and the maximum simulation time
    net: Path = project_base / "simulation_files" / "net.net.xml"
    route: Path = project_base / "simulation_files" / "random.rou.xml"
    trip: Path = project_base / "simulation_files" / "random.trips.xml"
    additional: Path = project_base / "simulation_files" / "mytypes.add.xml"
    return SumoTrafficEnv(
        sumo_net_file=net,
        sumo_route_file=route,
        sumo_trip_file=trip,
        sumo_additional_file=additional,
        reward_function="negative_accumulated_waiting_time_since_last_step",
        observation_class=NoisyCameraObservation,
        show_gui=False,
        simulation_time=600,
        generate_random_routes=GENERATE_RANDOM_ROUTES,
        sumo_simulation_seed=str(SEED),
    )
if __name__ == "__main__":
    # Seed the driver-side RNGs.
    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)

    initial_data_env = env_creator({})
    agent_ids = initial_data_env.agents

    ray.init(local_mode=DEBUG)
    register_env("sumo_marl", env_creator)

    observation_space = initial_data_env.observation_spaces[agent_ids[0]]
    action_space = initial_data_env.action_spaces[agent_ids[0]]

    # One policy per agent, plus a shared critic module that only lives on the learner.
    policies = [f"p_{agent_id}" for agent_id in agent_ids]
    specs = {p: RLModuleSpec() for p in policies}
    specs[SHARED_CRITIC_ID] = RLModuleSpec(
        module_class=SharedCriticTorchRLModule,
        observation_space=observation_space,
        action_space=action_space,
        learner_only=True,
        model_config={"observation_spaces": initial_data_env.observation_spaces},
    )
    config = (
        MAPPOConfig()
        .environment("sumo_marl")
        .env_runners(
            num_env_runners=1 if not DEBUG else 0,
            num_envs_per_env_runner=1,
            num_cpus_per_env_runner=3,
            sample_timeout_s=50000,
        )
        .multi_agent(
            policies=policies + [SHARED_CRITIC_ID],
            policy_mapping_fn=lambda aid, *a, **kw: f"p_{aid}",
        )
        .rl_module(
            rl_module_spec=MultiRLModuleSpec(
                rl_module_specs=specs,
            ),
        )
        .learners(
            num_learners=0,
            num_cpus_per_learner=3,
        )
        .training(
            train_batch_size=6000,
            minibatch_size=256,
        )
        .callbacks(MetricsLoggerCallback)
        .debugging(
            seed=SEED,
        )
    )

    param_space = config.to_dict()
    param_space["seed"] = SEED
    # Start training.
    tuner = tune.Tuner(
        config.algo_class,
        param_space=param_space,
        run_config=tune.RunConfig(
            name="SharedCritic",
            stop={"training_iteration": 100},
            verbose=1,
            callbacks=(
                [
                    WandbLoggerCallback(
                        project="new_env",
                        name="MAPPO CTDE Individual Policies NoisyCameraObservation NegativeAccumulatedWaitingTimeSinceLastStep",
                    )
                ]
                if LOG_TO_WANDB
                else None
            ),
            checkpoint_config=tune.CheckpointConfig(
                checkpoint_frequency=10,
                checkpoint_at_end=True,
            ),
        ),
    )
    tuner.fit()

    # algo = config.build_algo()
    # print(algo.train())
```
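For reference, this is the fuller set of determinism settings I am aware of. It comes from the PyTorch reproducibility notes, not from the Ray docs, and I have not wired it in yet, so treat it as a sketch; I also do not know whether any of it matters for CPU-only rollouts:

```python
import os

import torch

SEED = 100  # same constant as in main.py

# PYTHONHASHSEED only affects hashing if it is set before the interpreter
# starts, so setting it here mostly documents the intent.
os.environ["PYTHONHASHSEED"] = str(SEED)

torch.manual_seed(SEED)                    # CPU RNG
torch.cuda.manual_seed_all(SEED)           # all GPU RNGs (no-op without CUDA)
torch.use_deterministic_algorithms(True)   # raise on non-deterministic kernels
torch.backends.cudnn.deterministic = True  # force deterministic cuDNN kernels
torch.backends.cudnn.benchmark = False     # disable cuDNN autotuning
```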
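And this is how I would derive deterministic per-worker seeds, since with num_env_runners > 0 each env runner builds its own copy of the environment and they currently all get the same fixed SUMO seed. make_env_seed is a hypothetical helper of mine; worker_index and vector_index are the documented attributes on RLlib's EnvContext, which is what actually gets passed into env_creator:

```python
SEED = 100  # same constant as in main.py


def make_env_seed(env_config) -> int:
    """Return a seed that is distinct per env runner but identical across runs.

    The getattr fallbacks keep the plain-dict call ``env_creator({})`` in
    main.py working, where no EnvContext attributes exist.
    """
    worker_index = getattr(env_config, "worker_index", 0)
    vector_index = getattr(env_config, "vector_index", 0)
    return SEED + 1000 * worker_index + vector_index
```

Inside env_creator I would then pass sumo_simulation_seed=str(make_env_seed(env_config)) instead of the fixed str(SEED). Does that line up with how .debugging(seed=SEED) seeds the env runners, or does RLlib already handle this for me?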