Issue with multiple environments training one PPO policy

1. Severity of the issue: (select one)
High: Completely blocks me.

2. Environment:

  • Ray version: 2.4.0 (RLlib)
  • Python version: 3.6
  • OS: Ubuntu 18
  • Cloud/Infrastructure: local, running in a Docker container
  • Other libs/tools (if relevant):

3. What happened vs. what you expected:

  • Expected: 4 parallel rollout workers (one per environment), all training a single shared PPO policy
  • Actual: only 2 workers are created, and only one environment/worker is used for training

Repro script:

import ray
from ray import tune
from ray.tune.registry import register_env
from ray.rllib.algorithms.ppo import PPOConfig

ray.init()
register_my_stuff()  # registers the custom model and env dependencies

def env_creator(env_config):
    # env_config is an EnvContext; worker_index identifies the rollout
    # worker building this env (0 = local worker, 1..N = remote workers).
    worker_index = env_config.worker_index

    # Round-robin: workers 1-4 should map to env_ids 0-3.
    env_id = (worker_index - 1) % 4

    print(f"[ENV] Worker {worker_index} assigned env_id: {env_id}")
    if env_id == 0:
        return OhlcvEnv1()  # custom env classes, defined elsewhere
    elif env_id == 1:
        return OhlcvEnv2()
    elif env_id == 2:
        return OhlcvEnv3()
    else:
        return OhlcvEnv()

register_env("MultiParallelEnv", env_creator)

config = (
    PPOConfig()
    .environment(
        env="MultiParallelEnv",
        # env_id is set here, but env_creator above keys off worker_index,
        # not env_id.
        env_config={"env_id": tune.grid_search([0, 1, 2, 3])},
    )
    .resources(
        num_gpus=1,  # use 1 GPU for the local (training) worker
        num_cpus_per_worker=1,
    )
    .framework("tf2")
    .rollouts(
        num_rollout_workers=4,  # expect 4 remote rollout workers
        num_envs_per_worker=1,  # one env per worker
    )
    .training(
        model={"custom_model": "my_model"},
        gamma=0.99,
        lr=1e-4,  # actor learning rate
        train_batch_size=6000,
        entropy_coeff=0.02,
        entropy_coeff_schedule=[
            [0, 0.02],      # start at 0.02
            [1000, 0.001],  # anneal to 0.001 by timestep 1000
        ],
    )
    .evaluation(
        evaluation_interval=1,
        evaluation_duration=1,
        evaluation_duration_unit="episodes",
        evaluation_parallel_to_training=False,
        evaluation_num_workers=1,
    )
)

algo = config.build()

max_iterations = 100000
for i in range(max_iterations):
    result = algo.train()
    eval_results = algo.evaluate()

ray.shutdown()
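
To show what I mean by "only one environment/worker is used", this is the kind of check I run right after config.build() (a sketch against the Ray 2.4 WorkerSet API; accessor names may differ in other versions):

# Diagnostic sketch (assumes Ray 2.4 WorkerSet API): run right after
# config.build(), before ray.shutdown().
print("num remote rollout workers:", algo.workers.num_remote_workers())
# For each worker, report its index and the class name of each sub-env it built.
per_worker = algo.workers.foreach_worker(
    lambda w: (w.worker_index, w.foreach_env(lambda env: type(env).__name__))
)
print(per_worker)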
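One thing I'm unsure about: the env_config above contains a tune.grid_search placeholder, and as far as I know such placeholders are only expanded when the config is run through Tune, not by config.build(). In case that's related, this is roughly what the Tune path would look like (a sketch assuming the Ray 2.4 Tuner API; note it would produce 4 separate trials, not 4 envs in one trial):

from ray import air, tune

# Hypothetical alternative: let Tune expand the grid_search placeholder
# into 4 trials, each with one env_id value.
tuner = tune.Tuner(
    "PPO",
    param_space=config.to_dict(),
    run_config=air.RunConfig(stop={"training_iteration": 100}),
)
results = tuner.fit()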