[RLlib] Using Attention_Net + Prev_Action or Prev_Reward gives: Index error

How severely does this issue affect your experience of using Ray?

  • High: It blocks me from completing my task.

Hi all,

There appears to be a bug in the attention wrapper: when prev_action or prev_reward is fed in (attention_use_n_prev_actions / attention_use_n_prev_rewards > 0), policy initialization crashes with the traceback below:

ray.exceptions.RayActorError: The actor died because of an error raised in its creation task, ray::RolloutWorker.__init__() (pid=3744, ip=127.0.0.1, repr=<ray.rllib.evaluation.rollout_worker.RolloutWorker object at 0x000002BB8F98F790>)
  File "python\ray\_raylet.pyx", line 875, in ray._raylet.execute_task
  File "python\ray\_raylet.pyx", line 879, in ray._raylet.execute_task
  File "python\ray\_raylet.pyx", line 819, in ray._raylet.execute_task.function_executor
  File "C:\personal\ai\ray_venv\lib\site-packages\ray\_private\function_manager.py", line 674, in actor_method_executor
    return method(__ray_actor, *args, **kwargs)
  File "C:\personal\ai\ray_venv\lib\site-packages\ray\util\tracing\tracing_helper.py", line 460, in _resume_span
    return method(self, *_args, **_kwargs)
  File "C:\personal\ai\ray_venv\lib\site-packages\ray\rllib\evaluation\rollout_worker.py", line 738, in __init__
    self._update_policy_map(policy_dict=self.policy_dict)
  File "C:\personal\ai\ray_venv\lib\site-packages\ray\util\tracing\tracing_helper.py", line 460, in _resume_span
    return method(self, *_args, **_kwargs)
  File "C:\personal\ai\ray_venv\lib\site-packages\ray\rllib\evaluation\rollout_worker.py", line 1985, in _update_policy_map
    self._build_policy_map(
  File "C:\personal\ai\ray_venv\lib\site-packages\ray\util\tracing\tracing_helper.py", line 460, in _resume_span
    return method(self, *_args, **_kwargs)
  File "C:\personal\ai\ray_venv\lib\site-packages\ray\rllib\evaluation\rollout_worker.py", line 2097, in _build_policy_map
    new_policy = create_policy_for_framework(
  File "C:\personal\ai\ray_venv\lib\site-packages\ray\rllib\utils\policy.py", line 142, in create_policy_for_framework
    return policy_class(observation_space, action_space, merged_config)
  File "C:\personal\ai\ray_venv\lib\site-packages\ray\rllib\algorithms\ppo\ppo_torch_policy.py", line 67, in __init__
    self._initialize_loss_from_dummy_batch()
  File "C:\personal\ai\ray_venv\lib\site-packages\ray\rllib\policy\policy.py", line 1401, in _initialize_loss_from_dummy_batch
    actions, state_outs, extra_outs = self.compute_actions_from_input_dict(
  File "C:\personal\ai\ray_venv\lib\site-packages\ray\rllib\policy\torch_policy_v2.py", line 518, in compute_actions_from_input_dict
    return self._compute_action_helper(
  File "C:\personal\ai\ray_venv\lib\site-packages\ray\rllib\utils\threading.py", line 24, in wrapper
    return func(self, *a, **k)
  File "C:\personal\ai\ray_venv\lib\site-packages\ray\rllib\policy\torch_policy_v2.py", line 1133, in _compute_action_helper
    dist_inputs, state_out = self.model(input_dict, state_batches, seq_lens)
  File "C:\personal\ai\ray_venv\lib\site-packages\ray\rllib\models\modelv2.py", line 259, in __call__
    res = self.forward(restored, state or [], seq_lens)
  File "C:\personal\ai\ray_venv\lib\site-packages\ray\rllib\models\torch\attention_net.py", line 406, in forward
    one_hot(
  File "C:\personal\ai\ray_venv\lib\site-packages\ray\rllib\utils\torch_utils.py", line 465, in one_hot
    [nn.functional.one_hot(x[:, i].long(), n) for i, n in enumerate(nvec)],
  File "C:\personal\ai\ray_venv\lib\site-packages\ray\rllib\utils\torch_utils.py", line 465, in <listcomp>
    [nn.functional.one_hot(x[:, i].long(), n) for i, n in enumerate(nvec)],
IndexError: index 2 is out of bounds for dimension 1 with size 2

During handling of the above exception, another exception occurred:
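From the traceback, the IndexError apparently comes from one_hot() in torch_utils.py: it iterates over all 9 entries of the MultiDiscrete nvec, but the prev-actions tensor it receives only has attention_use_n_prev_actions (= 2) columns. The minimal sketch below triggers the same error outside the trainer; the [batch, 2] shape is my assumption based on the error message, not something I pulled out of RLlib internals:

import torch
from gymnasium.spaces import MultiDiscrete
from ray.rllib.utils.torch_utils import one_hot

# Same 9-dim MultiDiscrete action space as in the reproduction script below.
action_space = MultiDiscrete([2] * 9)

# Hypothetical prev-actions batch of shape [batch, n_prev_actions] = [32, 2],
# i.e. what the error message suggests reaches one_hot() when
# attention_use_n_prev_actions=2.
prev_actions = torch.zeros(32, 2)

# one_hot() indexes prev_actions[:, i] for every i in range(len(nvec)) == 9,
# so it fails at i == 2 with:
# IndexError: index 2 is out of bounds for dimension 1 with size 2
one_hot(prev_actions, action_space)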

Versions / Dependencies

OS: Win 11
Ray: Nightly Wheel
Python: 3.10

Reproduction script

import ray
from ray import tune
from ray.rllib.env import PolicyServerInput
from ray.rllib.algorithms.ppo import PPOConfig

import numpy as np
from gymnasium.spaces import MultiDiscrete, Box


def _input(ioctx):
    # We are a remote worker, or the local worker with num_workers=0:
    # Create a PolicyServerInput.
    if ioctx.worker_index > 0 or ioctx.worker.num_workers == 0:
        return PolicyServerInput(
            ioctx,
            '127.0.0.1',
            55556 + ioctx.worker_index - (1 if ioctx.worker_index > 0 else 0),
        )
    # No InputReader (PolicyServerInput) needed.
    else:
        return None


ppo_config = PPOConfig()
ppo_config.model = {
    "vf_share_layers": True,

    # Attention (GTrXL) wrapper settings.
    "use_attention": True,
    "max_seq_len": 64,
    "attention_num_transformer_units": 1,
    "attention_dim": 256,
    "attention_memory_inference": 64,
    "attention_memory_training": 64,
    "attention_num_heads": 8,
    "attention_head_dim": 32,
    "attention_position_wise_mlp_dim": 128,
    # Feeding previous actions/rewards into the attention net triggers the crash.
    "attention_use_n_prev_actions": 2,
    # "attention_use_n_prev_rewards": 64,
    "attention_init_gru_gate_bias": 2.0,

    # Conv stack for the (240, 320, 1) image observations.
    "conv_filters": [
        [64, [12, 16], [7, 9]],
        [128, [6, 6], 4],
        [256, [9, 9], 1],
    ],
    "conv_activation": "relu",
}

ppo_config.rollouts(num_rollout_workers=2, enable_connectors=False)
ppo_config.offline_data(input_=_input)

ppo_config.framework_str = 'torch'
ppo_config.log_sys_usage = False
ppo_config.compress_observations = True
ppo_config.shuffle_sequences = False

ppo_config.env = None
ppo_config.observation_space = Box(low=0, high=1, shape=(240, 320, 1), dtype=np.float32)
ppo_config.action_space = MultiDiscrete(
    [
        2,  # W
        2,  # A
        2,  # S
        2,  # D
        2,  # Space
        2,  # H
        2,  # J
        2,  # K
        2  # L
    ]
)

name = "Attention_Repro1"
print(f"Starting: {name}")
tune.run(
    "PPO",
    config=ppo_config.to_dict(),
    name=name,
    resume="AUTO",
    metric="episode_reward_mean",
    mode="max",
    keep_checkpoints_num=20,
    checkpoint_score_attr="episode_reward_mean",
    checkpoint_freq=1,
    checkpoint_at_end=True,
    max_failures=10,
)
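For completeness: leaving the prev-action/prev-reward inputs out of the attention wrapper avoids the failing code path, so the regression seems isolated to that wiring. A possible temporary workaround (not a fix for the underlying MultiDiscrete handling):

# Temporary workaround sketch: disable the prev-action / prev-reward inputs
# to the attention wrapper so the failing one_hot() call is never reached.
ppo_config.model["attention_use_n_prev_actions"] = 0
ppo_config.model["attention_use_n_prev_rewards"] = 0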

Edit:
Also opened a GitHub issue for this: [RLlib] Using Attention_Net + Prev_Action or Prev_Reward gives: Index error · Issue #35334 · ray-project/ray