Compute Advantages broadcasting issue

How severely does this issue affect your experience of using Ray?

  • High: It blocks me from completing my task.

I have been testing continual backpropagation, and the model below is just a baseline I have been using for comparison purposes, so don't mind the odd layer setup (unless that turns out to be part of the problem). I can't pinpoint exactly what is going on, but it seems as though the rewards and vpred_t arrays end up with mismatched lengths inside compute_advantages.

What is odd is that the error is intermittent: it first showed up at the beginning of the week around iteration 500, but lately it has been happening within ~5 iterations. Does anyone have an idea of what might be causing this?
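
For context, here is a minimal sketch (my own reconstruction, not the RLlib source) of what the failing line in compute_advantages is doing. vpred_t is the batch's value-prediction column with the bootstrap value last_r appended, so the shapes in the trace at the end of this post (571 vs. 570) suggest the batch carries one more reward than value predictions:

import numpy as np

# Hypothetical shapes matching the error below: 571 rewards, but only 570 value predictions.
gamma = 0.99
rewards = np.zeros(571)       # SampleBatch rewards column
vf_preds = np.zeros(570)      # SampleBatch vf_preds column -- one element short
last_r = 0.0                  # bootstrap value for the truncated episode

vpred_t = np.concatenate([vf_preds, np.array([last_r])])  # shape (571,)
# The line from the traceback: rewards is (571,), vpred_t[1:] is (570,).
delta_t = rewards + gamma * vpred_t[1:] - vpred_t[:-1]    # ValueError: (571,) vs (570,)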

model:

import torch
import torch.nn as nn
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as TorchFC
from ray.rllib.utils.annotations import override


class SimpleCustomTorchModel(TorchModelV2, nn.Module):
    def __init__(self, obs_space, action_space, num_outputs, model_config, name):
        TorchModelV2.__init__(self, obs_space, action_space, num_outputs, model_config, name)
        nn.Module.__init__(self)

        # self.critic_fcnet = TorchFC(obs_space, action_space, 1, model_config, name + "_critic")
        self.actor_fcnet = TorchFC(
            obs_space, action_space, action_space.shape[0] * 2, model_config, name + "_actor"
        )
        # Build the value branch the same way as the CBP network, for an equivalent comparison.
        hidden_layer_size = model_config['fcnet_hiddens'][0]
        self.act = nn.LeakyReLU()
        self.fc1 = nn.Linear(obs_space.shape[0], hidden_layer_size)
        self.fc2 = nn.Linear(hidden_layer_size, hidden_layer_size)
        self.fc3 = nn.Linear(hidden_layer_size, hidden_layer_size)
        self.fc4 = nn.Linear(hidden_layer_size, 1)

    @override(TorchModelV2)
    def forward(self, input_dict, state, seq_lens):
        # Get the model output
        logits, _ = self.actor_fcnet(input_dict, state, seq_lens)
        means, log_stds = torch.chunk(logits, 2, -1)
        # Assuming the action means are normalized to [-1, 1].
        means_clamped = torch.clamp(means, -1, 1)
        # Clamp the log-stds to [-10, 0] so that std = exp(log_std) lies in
        # [exp(-10), 1], i.e. roughly the (0, 1] range implied by means in [-1, 1].
        log_stds_clamped = torch.clamp(log_stds, -10, 0)
        logits = torch.cat((means_clamped, log_stds_clamped), dim = -1)

        obs = input_dict['obs']
        x = self.act(self.fc1(obs))
        x = self.act(self.fc2(x))
        x = self.act(self.fc3(x))
        # No activation on the output, since this is the scalar state-value estimate.
        self.value = self.fc4(x)
        return logits, state

    @override(TorchModelV2)
    def value_function(self):
        return self.value.squeeze(-1)
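
For reference, this is a minimal shape sanity check one could run on the model in isolation (a hypothetical standalone test, not part of my training code; the 17/6 sizes are the HalfCheetah-v4 observation and action dimensions, and MODEL_DEFAULTS fills in the rest of the model config while mirroring the fcnet settings used below), to confirm that value_function() returns exactly one value per observation in the forward batch:

import numpy as np
import torch
import gymnasium as gym
from ray.rllib.models import MODEL_DEFAULTS

obs_space = gym.spaces.Box(-np.inf, np.inf, shape=(17,), dtype=np.float32)
act_space = gym.spaces.Box(-1.0, 1.0, shape=(6,), dtype=np.float32)
model_config = dict(MODEL_DEFAULTS, fcnet_hiddens=[256, 256], fcnet_activation="LeakyReLU")

model = SimpleCustomTorchModel(obs_space, act_space, 12, model_config, "shape_test")

for batch_size in (1, 5, 571):
    obs = torch.rand(batch_size, 17)
    logits, _ = model.forward({"obs": obs, "obs_flat": obs}, [], None)
    values = model.value_function()
    # logits: (batch_size, 12) -- 6 means + 6 log-stds; values: (batch_size,)
    assert logits.shape == (batch_size, 12)
    assert values.shape == (batch_size,)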

training loop:

%%time
import ray
from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.models import ModelCatalog

# Register the custom model under the name referenced in the config below.
ModelCatalog.register_custom_model("SimpleCustomTorchModel", SimpleCustomTorchModel)

config = (
    PPOConfig()
    .training(
        gamma=0.99,
        lambda_=0.95,
        # kl_coeff=0.5,
        num_sgd_iter=15,
        lr_schedule=[[0, 0.0003], [15_000_000, 0.00025], [30_000_000, 0.0002], [50_000_000, 0.0001]],
        vf_loss_coeff=0.5,
        vf_clip_param=15.0,
        clip_param=0.2,
        grad_clip_by='norm',
        train_batch_size=16_000,
        sgd_minibatch_size=4_000,
        grad_clip=0.5,
        model={
            'custom_model': 'SimpleCustomTorchModel',
            'vf_share_layers': False,
            'fcnet_hiddens': [256, 256],
            'fcnet_activation': 'LeakyReLU',
            # this isn't used for some models, but doesn't hurt to keep it
            'custom_model_config': {
                'num_gaussians': 2,
            },
        },
    )
    .environment(env='HalfCheetah-v4')
    .rollouts(num_rollout_workers=28)
    .resources(num_gpus=1)
)


algo = config.build()

num_iterations = 1000
results = []

for i in range(num_iterations):
    result = algo.train()
    print(f"Iteration: {i}, Mean Reward: {result['episode_reward_mean']}")
    results.append([result['episode_reward_mean'], result['episode_len_mean']])
    
ray.shutdown()

Edit: it would help if I provided the error trace:

RayTaskError(ValueError): ray::RolloutWorker.apply() (pid=83165, ip=172.17.0.2, actor_id=c958ad749106f876415dadf901000000, repr=<ray.rllib.evaluation.rollout_worker.RolloutWorker object at 0x7fd5449b3430>)
  File "/usr/local/lib/python3.8/dist-packages/ray/rllib/utils/actor_manager.py", line 189, in apply
    raise e
  File "/usr/local/lib/python3.8/dist-packages/ray/rllib/utils/actor_manager.py", line 178, in apply
    return func(self, *args, **kwargs)
  File "/usr/local/lib/python3.8/dist-packages/ray/rllib/execution/rollout_ops.py", line 89, in <lambda>
    lambda w: w.sample(), local_worker=False, healthy_only=True
  File "/usr/local/lib/python3.8/dist-packages/ray/rllib/evaluation/rollout_worker.py", line 694, in sample
    batches = [self.input_reader.next()]
  File "/usr/local/lib/python3.8/dist-packages/ray/rllib/evaluation/sampler.py", line 91, in next
    batches = [self.get_data()]
  File "/usr/local/lib/python3.8/dist-packages/ray/rllib/evaluation/sampler.py", line 273, in get_data
    item = next(self._env_runner)
  File "/usr/local/lib/python3.8/dist-packages/ray/rllib/evaluation/env_runner_v2.py", line 348, in run
    outputs = self.step()
  File "/usr/local/lib/python3.8/dist-packages/ray/rllib/evaluation/env_runner_v2.py", line 374, in step
    active_envs, to_eval, outputs = self._process_observations(
  File "/usr/local/lib/python3.8/dist-packages/ray/rllib/evaluation/env_runner_v2.py", line 703, in _process_observations
    sample_batch = self._try_build_truncated_episode_multi_agent_batch(
  File "/usr/local/lib/python3.8/dist-packages/ray/rllib/evaluation/env_runner_v2.py", line 1004, in _try_build_truncated_episode_multi_agent_batch
    episode.postprocess_episode(batch_builder=batch_builder, is_done=False)
  File "/usr/local/lib/python3.8/dist-packages/ray/rllib/evaluation/episode_v2.py", line 320, in postprocess_episode
    post_batch = policy.postprocess_trajectory(post_batch, other_batches, self)
  File "/usr/local/lib/python3.8/dist-packages/ray/rllib/algorithms/ppo/ppo_torch_policy.py", line 215, in postprocess_trajectory
    return compute_gae_for_sample_batch(
  File "/usr/local/lib/python3.8/dist-packages/ray/rllib/evaluation/postprocessing.py", line 204, in compute_gae_for_sample_batch
    batch = compute_advantages(
  File "/usr/local/lib/python3.8/dist-packages/ray/rllib/evaluation/postprocessing.py", line 128, in compute_advantages
    delta_t = rewards + gamma * vpred_t[1:] - vpred_t[:-1]
ValueError: operands could not be broadcast together with shapes (571,) (570,)