How severely does this issue affect your experience of using Ray?
- High: It blocks me from completing my task.
I have been testing continual backpropagation (CBP), and the following is just a basic model I have been using for comparison purposes, so don't mind the odd layer creation (unless that is part of the problem). I cannot pinpoint exactly what is going on, but it seems as though the rewards and vpred_t dimensions are not aligned.
What is odd is that the error is intermittent: it first appeared at the beginning of the week around iteration 500, but lately it has been happening within ~5 iterations. Does anyone have an idea of what this might be?
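For context, vpred_t in RLlib's GAE postprocessing is the batch's value predictions with the bootstrap value appended, so it has to be exactly one element longer than rewards. A rough numpy sketch of that invariant (paraphrasing compute_advantages; the lengths here are just placeholders):

import numpy as np

gamma = 0.99
rewards = np.zeros(571)                          # one reward per timestep
vf_preds = np.zeros(571)                         # one value prediction per timestep
last_r = 0.0                                     # bootstrap value for the truncated rollout
vpred_t = np.concatenate([vf_preds, [last_r]])   # len(rewards) + 1
# GAE temporal-difference errors; only valid when the lengths line up:
delta_t = rewards + gamma * vpred_t[1:] - vpred_t[:-1]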
model:
import torch
import torch.nn as nn

from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as TorchFC
from ray.rllib.utils.annotations import override


class SimpleCustomTorchModel(TorchModelV2, nn.Module):
    def __init__(self, obs_space, action_space, num_outputs, model_config, name):
        TorchModelV2.__init__(self, obs_space, action_space, num_outputs, model_config, name)
        nn.Module.__init__(self)
        # self.critic_fcnet = TorchFC(obs_space, action_space, 1, model_config, name + "_critic")
        self.actor_fcnet = TorchFC(
            obs_space, action_space, action_space.shape[0] * 2, model_config, name + "_actor"
        )
        # Create the value network shaped the same as the CBP model for equivalent testing.
        hidden_layer_size = model_config["fcnet_hiddens"][0]
        self.act = nn.LeakyReLU()
        self.fc1 = nn.Linear(obs_space.shape[0], hidden_layer_size)
        self.fc2 = nn.Linear(hidden_layer_size, hidden_layer_size)
        self.fc3 = nn.Linear(hidden_layer_size, hidden_layer_size)
        self.fc4 = nn.Linear(hidden_layer_size, 1)

    @override(TorchModelV2)
    def forward(self, input_dict, state, seq_lens):
        # Get the actor output: means and log-stds for a diagonal Gaussian.
        logits, _ = self.actor_fcnet(input_dict, state, seq_lens)
        means, log_stds = torch.chunk(logits, 2, -1)
        # Assuming means are normalized between -1 and 1.
        means_clamped = torch.clamp(means, -1, 1)
        # Clamp log-stds to [-10, 0] so that std_dev stays in (exp(-10), 1],
        # consistent with means living in [-1, 1].
        log_stds_clamped = torch.clamp(log_stds, -10, 0)
        logits = torch.cat((means_clamped, log_stds_clamped), dim=-1)
        obs = input_dict["obs"]
        x = self.act(self.fc1(obs))
        x = self.act(self.fc2(x))
        x = self.act(self.fc3(x))
        # No activation on the output since this is a scalar value estimate.
        self.value = self.fc4(x)
        return logits, state

    @override(TorchModelV2)
    def value_function(self):
        return self.value.squeeze(-1)
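For reference, the model is registered by name with ModelCatalog before building the config. Below is the registration plus a quick shape sanity check; the spaces, batch size, and bare model_config dict are illustrative stand-ins (TorchFC reads "obs_flat" from the input dict, hence both keys):

import gymnasium as gym
import numpy as np
import torch
from ray.rllib.models import ModelCatalog

ModelCatalog.register_custom_model("SimpleCustomTorchModel", SimpleCustomTorchModel)

# Illustrative stand-ins for HalfCheetah-v4's spaces (17-dim obs, 6-dim actions).
obs_space = gym.spaces.Box(-np.inf, np.inf, (17,), np.float32)
act_space = gym.spaces.Box(-1.0, 1.0, (6,), np.float32)
model_config = {"fcnet_hiddens": [256, 256], "fcnet_activation": "LeakyReLU"}
model = SimpleCustomTorchModel(obs_space, act_space, 12, model_config, "shape_test")

obs = torch.randn(32, 17)
logits, _ = model.forward({"obs": obs, "obs_flat": obs}, [], None)
assert logits.shape == (32, 12)                # action_dim * 2 outputs
assert model.value_function().shape == (32,)   # one value per obs in the batch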
training loop:
%%time
import ray
from ray.rllib.algorithms.ppo import PPOConfig

config = PPOConfig().training(
    gamma=0.99,
    lambda_=0.95,
    # kl_coeff=0.5,
    num_sgd_iter=15,
    lr_schedule=[[0, 0.0003], [15_000_000, 0.00025], [30_000_000, 0.0002], [50_000_000, 0.0001]],
    vf_loss_coeff=0.5,
    vf_clip_param=15.0,
    clip_param=0.2,
    grad_clip_by='norm',
    train_batch_size=16_000,
    sgd_minibatch_size=4_000,
    grad_clip=0.5,
    model={
        'custom_model': 'SimpleCustomTorchModel',
        'vf_share_layers': False,
        'fcnet_hiddens': [256, 256],
        'fcnet_activation': 'LeakyReLU',
        # This isn't used for some models, but it doesn't hurt to keep it.
        'custom_model_config': {
            'num_gaussians': 2,
        },
    },
).environment(
    env='HalfCheetah-v4',
).rollouts(
    num_rollout_workers=28,
).resources(
    num_gpus=1,
)

algo = config.build()

num_iterations = 1000
results = []
for i in range(num_iterations):
    result = algo.train()
    print(f"Iteration: {i}, Mean Reward: {result['episode_reward_mean']}")
    results.append([result['episode_reward_mean'], result['episode_len_mean']])

ray.shutdown()
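Since the failure is intermittent, one thing that may help before digging further is pinning a seed and raising the log level so the bad batch is reproducible across runs; a sketch using the config's debugging block:

# Sketch: make the intermittent failure reproducible before digging further.
config = config.debugging(seed=0, log_level="DEBUG")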
edit: it would help to provide the error trace:
RayTaskError(ValueError): ray::RolloutWorker.apply() (pid=83165, ip=172.17.0.2, actor_id=c958ad749106f876415dadf901000000, repr=<ray.rllib.evaluation.rollout_worker.RolloutWorker object at 0x7fd5449b3430>)
  File "/usr/local/lib/python3.8/dist-packages/ray/rllib/utils/actor_manager.py", line 189, in apply
    raise e
  File "/usr/local/lib/python3.8/dist-packages/ray/rllib/utils/actor_manager.py", line 178, in apply
    return func(self, *args, **kwargs)
  File "/usr/local/lib/python3.8/dist-packages/ray/rllib/execution/rollout_ops.py", line 89, in <lambda>
    lambda w: w.sample(), local_worker=False, healthy_only=True
  File "/usr/local/lib/python3.8/dist-packages/ray/rllib/evaluation/rollout_worker.py", line 694, in sample
    batches = [self.input_reader.next()]
  File "/usr/local/lib/python3.8/dist-packages/ray/rllib/evaluation/sampler.py", line 91, in next
    batches = [self.get_data()]
  File "/usr/local/lib/python3.8/dist-packages/ray/rllib/evaluation/sampler.py", line 273, in get_data
    item = next(self._env_runner)
  File "/usr/local/lib/python3.8/dist-packages/ray/rllib/evaluation/env_runner_v2.py", line 348, in run
    outputs = self.step()
  File "/usr/local/lib/python3.8/dist-packages/ray/rllib/evaluation/env_runner_v2.py", line 374, in step
    active_envs, to_eval, outputs = self._process_observations(
  File "/usr/local/lib/python3.8/dist-packages/ray/rllib/evaluation/env_runner_v2.py", line 703, in _process_observations
    sample_batch = self._try_build_truncated_episode_multi_agent_batch(
  File "/usr/local/lib/python3.8/dist-packages/ray/rllib/evaluation/env_runner_v2.py", line 1004, in _try_build_truncated_episode_multi_agent_batch
    episode.postprocess_episode(batch_builder=batch_builder, is_done=False)
  File "/usr/local/lib/python3.8/dist-packages/ray/rllib/evaluation/episode_v2.py", line 320, in postprocess_episode
    post_batch = policy.postprocess_trajectory(post_batch, other_batches, self)
  File "/usr/local/lib/python3.8/dist-packages/ray/rllib/algorithms/ppo/ppo_torch_policy.py", line 215, in postprocess_trajectory
    return compute_gae_for_sample_batch(
  File "/usr/local/lib/python3.8/dist-packages/ray/rllib/evaluation/postprocessing.py", line 204, in compute_gae_for_sample_batch
    batch = compute_advantages(
  File "/usr/local/lib/python3.8/dist-packages/ray/rllib/evaluation/postprocessing.py", line 128, in compute_advantages
    delta_t = rewards + gamma * vpred_t[1:] - vpred_t[:-1]
ValueError: operands could not be broadcast together with shapes (571,) (570,)
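Reading the shapes in the last frame: vpred_t is VF_PREDS with the bootstrap value appended, so rewards being (571,) while vpred_t[1:] is (570,) means this truncated fragment collected 571 rewards but only 570 value predictions. That points at the VF_PREDS side (the value_function() output cached during sampling) rather than the GAE math itself. The arithmetic, as a sketch:

# rewards:  (571,)  -> 571 timesteps in the truncated fragment
# VF_PREDS: (570,)  -> one value prediction short
# vpred_t = concatenate([VF_PREDS, [last_r]])  -> (571,)
# vpred_t[1:] -> (570,), so rewards + gamma * vpred_t[1:] cannot broadcast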