1. Severity of the issue: (select one)
High: Completely blocks me.
2. Environment:
- Ray version: 2.4.0 (RLlib)
- Python version: 3.6
- OS: Ubuntu 18
- Cloud/Infrastructure: local running in a docker container
- Other libs/tools (if relevant):
3. What happened vs. what you expected:
- Expected: 4 parallel rollout workers (one per environment) training a single PPO policy
- Actual: only 2 workers are created, and only one environment/worker is used for training (see the diagnostic sketch after the script)

Reproduction script:
import ray
from ray import tune
from ray.rllib.algorithms.ppo import PPOConfig
from ray.tune.registry import register_env

ray.init()
register_my_stuff()  # user-defined helper: registers the custom model and the OhlcvEnv* classes

def env_creator(env_config):
    # env_config is an EnvContext; worker_index is 0 for the local worker
    # and 1..num_rollout_workers for the remote rollout workers.
    worker_index = env_config.worker_index
    env_id = (worker_index - 1) % 4
    print(f"[ENV] Worker {worker_index} assigned env_id: {env_id}")
    if env_id == 0:
        return OhlcvEnv1()
    elif env_id == 1:
        return OhlcvEnv2()
    elif env_id == 2:
        return OhlcvEnv3()
    else:
        return OhlcvEnv()

register_env("MultiParallelEnv", lambda config: env_creator(config))
config = (
    PPOConfig()
    .environment(
        env="MultiParallelEnv",
        # Note: tune.grid_search only expands into separate trials under
        # tune.Tuner; with config.build() it is passed through as a plain
        # dict, and env_creator ignores env_id in favor of worker_index
        # anyway (see the Tuner sketch after the script).
        env_config={"env_id": tune.grid_search([0, 1, 2, 3])},
    )
    .resources(
        num_gpus=1,  # use 1 GPU for the local (training) worker
        num_cpus_per_worker=1,
    )
    .framework("tf2")
    .rollouts(
        num_rollout_workers=4,
        num_envs_per_worker=1,
    )
    .training(
        model={
            "custom_model": "my_model",
        },
        gamma=0.99,
        lr=1e-4,  # actor LR
        train_batch_size=6000,
        entropy_coeff=0.02,
        entropy_coeff_schedule=[
            [0, 0.02],
            [1000, 0.001],
        ],
    )
    .evaluation(
        evaluation_interval=1,
        evaluation_duration=1,
        evaluation_duration_unit="episodes",
        evaluation_parallel_to_training=False,
        evaluation_num_workers=1,
    )
)
algo = config.build()

max_iterations = 100000
for i in range(max_iterations):
    result = algo.train()
    # Note: evaluation_interval=1 already runs evaluation inside train();
    # this call triggers a second evaluation pass per iteration.
    eval_results = algo.evaluate()

ray.shutdown()
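
To check how many remote rollout workers were actually created (and which worker_index each one got), the WorkerSet on the built algorithm can be queried directly. A minimal diagnostic sketch, run right after config.build() in the script above; num_remote_workers/foreach_worker are the Ray 2.4 WorkerSet methods, but verify the names against your installed version:

# How many CPUs/GPUs did Ray detect inside the container, and what is
# left after PPO placed its workers? Too few CPUs is a common reason
# fewer rollout workers start than requested.
print(ray.cluster_resources())
print(ray.available_resources())

# Count the remote rollout workers that actually exist, and ask every
# worker for its index (the returned list includes the local worker, 0).
print("remote rollout workers:", algo.workers.num_remote_workers())
print("worker indices:", algo.workers.foreach_worker(lambda w: w.worker_index))

If fewer [ENV] lines are printed than workers requested, compare against ray.available_resources(): worker actors that cannot get their num_cpus_per_worker stay pending and never create their environment, which would match the "only 2 workers" symptom in a CPU-limited docker container.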
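Separately, if the grid_search in env_config was meant to produce four separate trials (one per env_id) rather than four workers inside one trial, it only takes effect when the config goes through Tune. A hedged sketch, assuming the config object above and the Ray 2.4 Tuner/RunConfig API; the stop criterion is illustrative:

from ray import air, tune

tuner = tune.Tuner(
    "PPO",
    # to_dict() turns the PPOConfig into a param space; Tune expands the
    # grid_search in env_config into one trial per env_id value.
    param_space=config.to_dict(),
    run_config=air.RunConfig(stop={"training_iteration": 100}),
)
results = tuner.fit()

For this to actually vary the environment per trial, env_creator would also have to read env_config["env_id"] instead of deriving it from worker_index.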