Tensor dimension error while evaluating an IMPALA agent with attention

1. Severity of the issue:
High: Completely blocks me.

2. Environment:

  • Ray version: 2.4.0
  • Python version: 3.8
  • OS: Linux
  • Other libs/tools (if relevant): Highway-env

Hello everyone,

I’m trying to run an experiment using IMPALA with attention on Highway-env.
When I run the script, training finishes successfully, but when it attempts to evaluate the model it raises a tensor dimension error:

The Error:

Error in compute_single_action: Tensors must have same number of dimensions: got 2 and 3
Traceback (most recent call last):
  File "highway_rllib_impela with attention_claude_fix.py", line 305, in evaluate_agent
    action, state, _ = algo.compute_single_action(
  File "/home/xyz/miniconda3/envs/highway_to_keep_unchange/lib/python3.8/site-packages/ray/rllib/algorithms/algorithm.py", line 1595, in compute_single_action
    action, state, extra = policy.compute_single_action(
  File "/home/xyz/miniconda3/envs/highway_to_keep_unchange/lib/python3.8/site-packages/ray/rllib/policy/policy.py", line 545, in compute_single_action
    out = self.compute_actions_from_input_dict(
  File "/home/xyz/miniconda3/envs/highway_to_keep_unchange/lib/python3.8/site-packages/ray/rllib/policy/torch_policy_v2.py", line 522, in compute_actions_from_input_dict
    return self._compute_action_helper(
  File "/home/xyz/miniconda3/envs/highway_to_keep_unchange/lib/python3.8/site-packages/ray/rllib/utils/threading.py", line 24, in wrapper
    return func(self, *a, **k)
  File "/home/xyz/miniconda3/envs/highway_to_keep_unchange/lib/python3.8/site-packages/ray/rllib/policy/torch_policy_v2.py", line 1141, in _compute_action_helper
    dist_inputs, state_out = self.model(input_dict, state_batches, seq_lens)
  File "/home/xyz/miniconda3/envs/highway_to_keep_unchange/lib/python3.8/site-packages/ray/rllib/models/modelv2.py", line 259, in __call__
    res = self.forward(restored, state or [], seq_lens)
  File "/home/xyz/miniconda3/envs/highway_to_keep_unchange/lib/python3.8/site-packages/ray/rllib/models/torch/modules/relative_multi_head_attention.py", line 120, in forward
    inputs = torch.cat((memory.detach(), inputs), dim=1)
RuntimeError: Tensors must have same number of dimensions: got 2 and 3

I can run the script with IMPALA and LSTM, but with attention enabled I hit the error above. From the traceback, the failure happens in the torch.cat that prepends the attention memory to the current inputs, so it looks like the state I pass to compute_single_action has a different number of dimensions than the model expects. I’d appreciate it if someone could help me with this. The full script is given below. Because I need Highway-env v1.8, dependency issues forced me to stay on Ray/RLlib 2.4.
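For context, my understanding of the attention wrapper is that the state passed to compute_single_action should contain one memory tensor per transformer unit, each of shape [attention_memory_inference, attention_dim]. Here is a rough sketch of how I would build that state by hand from the model config in my script; I'm not sure this is actually what Ray 2.4 expects, which is part of what I'm asking:

import numpy as np

# Sketch only: manually building the attention "memory" state for
# compute_single_action, using the values from the model config below.
# I am not certain this matches what RLlib 2.4 expects internally.
num_transformer_units = 2   # attention_num_transformer_units
memory_inference = 50       # attention_memory_inference
attention_dim = 128         # attention_dim

state = [
    np.zeros((memory_inference, attention_dim), dtype=np.float32)
    for _ in range(num_transformer_units)
]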

The Code:

#!/usr/bin/env python3
"""
Requirements:
pip install ray[rllib]==2.4.0
pip install highway-env==1.8.0
pip install gymnasium
pip install torch  # required: the config below uses framework="torch"
"""

import os
import sys
import gymnasium as gym
import numpy as np
import ray
from ray.rllib.algorithms.impala import Impala
from ray import tune

from ray.rllib.env.env_context import EnvContext
import highway_env

# Suppress some warnings for cleaner output
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)


def make_highway_env(config: EnvContext):
    """
    Environment factory function for RLlib.
    Creates and configures a Highway-env intersection environment.
    """

    highway_env.register_highway_envs()

    # Create the base environment
    env = gym.make("intersection-v0")

    # Configure the environment with your exact specifications
    env.configure({
        "observation": {
            "type": "Kinematics",
            "vehicles_count": 5,
            "features": ["presence", "x", "y", "vx", "vy"],  # Include presence feature
            "features_range": {
                "x": [-100, 100],
                "y": [-100, 100],
                "vx": [-20, 20],
                "vy": [-20, 20],
                "presence": [0, 1]  # Binary presence indicator
            },
            "absolute": False,  # Use relative coordinates
            "normalize": True,
            "see_behind": True,  # Important for intersection safety
            "order": "sorted",
            "flatten": False,
            "observe_intentions": False
        },
        "action": {
            "type": "DiscreteMetaAction",
            "longitudinal": True,
            "lateral": False,
            "target_speeds": [0, 4.5, 9]  # Conservative speeds for intersection
        },

        # Intersection-specific parameters
        "duration": 20,  # Longer episodes for better learning
        "destination": "o1",
        "controlled_vehicles": 1,
        "initial_vehicle_count": 8,  # Fewer vehicles initially for easier learning
        "spawn_probability": 0.4,  # Lower spawn rate for simpler scenarios

        # Reward configuration (binary reward machine)
        "collision_reward": -20,
        "high_speed_reward": 1,
        "arrived_reward": 400,
        "reward_speed_range": [7.0, 9.0],
        "normalize_reward": False,
        "offroad_terminal": False,

        # Simulation parameters
        "policy_frequency": 5,  # Even lower frequency for more reaction time
        "simulation_frequency": 12,

        # Vehicle behavior - make other vehicles more predictable
        "other_vehicles_type": "highway_env.vehicle.behavior.IDMVehicle",


    })

    obs, info = env.reset()

    return env
    # return wrapped_env


def train_agent(num_iterations=10, checkpoint_dir="./highway_checkpoints"):
    """
    Train an agent on the Highway intersection environment.

    Args:
        num_iterations: Number of training iterations
        checkpoint_dir: Directory to save checkpoints

    Returns:
        Path to the final checkpoint
    """
    print("Starting RLlib training with Highway-env...")

    # Initialize Ray
    ray.init(ignore_reinit_error=True, log_to_driver=False)

    # Register the environment
    tune.register_env("highway-intersection", make_highway_env)

    # Create checkpoint directory
    os.makedirs(checkpoint_dir, exist_ok=True)

    # Build and train the algorithm

    # model config dictionary
    config = {
        "env": "highway-intersection",
        "framework": "torch",

        # IMPALA parameters
        "lr": 1e-4,
        "gamma": 0.95,
        "vtrace": True,
        "vtrace_clip_rho_threshold": 1.0,
        "vtrace_clip_pg_rho_threshold": 1.0,
        "entropy_coeff": 0.01,
        "vf_loss_coeff": 0.5,
        "grad_clip": 40.0,

        # Adjusted for attention
        "num_workers": 1,
        "num_envs_per_worker": 1,
        "rollout_fragment_length": 50,
        "train_batch_size": 500,

        # ATTENTION MODEL
        "model": {
            "fcnet_hiddens": [256, 256],
            "fcnet_activation": "relu",
            "use_attention": True,
            "attention_num_transformer_units": 2,
            "attention_dim": 128,
            "attention_num_heads": 8,
            "attention_head_dim": 16,
            "attention_memory_inference": 50,
            "attention_memory_training": 50,
            "attention_position_wise_mlp_dim": 256,
            "use_lstm": False,
        },

        "num_gpus": 1,
        "num_cpus_per_worker": 1,
        "evaluation_interval": 10,
        "evaluation_num_episodes": 5,
    }

    algo = Impala(config=config)

    print(f"Training for {num_iterations} iterations...")
    print("Iteration | Episode Reward Mean | Episode Length Mean")
    print("-" * 55)

    try:
        best_reward = -float('inf')
        best_checkpoint = None

        for i in range(num_iterations):
            # Train for one iteration
            result = algo.train()

            # Extract metrics
            episode_reward_mean = result.get('episode_reward_mean', 0)
            episode_len_mean = result.get('episode_len_mean', 0)

            print(f"{i + 1:9d} | {episode_reward_mean:17.2f} | {episode_len_mean:17.2f}")

            # Save checkpoint every 5 iterations or if it's the best so far
            if (i + 1) % 5 == 0 or episode_reward_mean > best_reward:
                checkpoint_path = algo.save(checkpoint_dir)
                if episode_reward_mean > best_reward:
                    best_reward = episode_reward_mean
                    best_checkpoint = checkpoint_path
                    print(f"New best checkpoint saved: {checkpoint_path}")

        # Save final checkpoint
        final_checkpoint = algo.save(checkpoint_dir)
        print(f"Training completed! Final checkpoint: {final_checkpoint}")

        return best_checkpoint or final_checkpoint

    except KeyboardInterrupt:
        print("\nTraining interrupted by user")
        final_checkpoint = algo.save(checkpoint_dir)
        return final_checkpoint
    finally:
        algo.stop()


def evaluate_agent(checkpoint_path, num_episodes=5, render=True):
    """
    Evaluate a trained agent.

    Args:
        checkpoint_path: Path to the saved checkpoint
        num_episodes: Number of episodes to evaluate
        render: Whether to render the environment
    """
    print(f"\nEvaluating agent from checkpoint: {checkpoint_path}")

    # Register the environment (needed for loading the checkpoint)
    tune.register_env("highway-intersection", make_highway_env)

    # Load the trained algorithm
    # Detect available framework (same as training)
    try:
        import torch
        framework = "torch"
    except ImportError:
        framework = "tf2"

    # model config dictionary
    config = {
        "env": "highway-intersection",
        "framework": framework,

        # IMPALA parameters
        "lr": 1e-4,
        "gamma": 0.95,
        "vtrace": True,
        "vtrace_clip_rho_threshold": 1.0,
        "vtrace_clip_pg_rho_threshold": 1.0,
        "entropy_coeff": 0.01,
        "vf_loss_coeff": 0.5,
        "grad_clip": 40.0,

        # Adjusted for attention
        "num_workers": 1,
        "num_envs_per_worker": 1,
        "rollout_fragment_length": 50,
        "train_batch_size": 500,

        # ATTENTION MODEL
        "model": {
            "fcnet_hiddens": [256, 256],
            "fcnet_activation": "relu",
            "use_attention": True,
            "attention_num_transformer_units": 2,
            "attention_dim": 128,
            "attention_num_heads": 8,
            "attention_head_dim": 16,
            "attention_memory_inference": 50,
            "attention_memory_training": 50,
            "attention_position_wise_mlp_dim": 256,
            "use_lstm": False,
        },

        "num_gpus": 1,
        "num_cpus_per_worker": 1,
        "evaluation_interval": 10,
        "evaluation_num_episodes": 5,
    }

    algo = Impala(config=config)

    algo.restore(checkpoint_path)

    # Create evaluation environment
    env = make_highway_env({})

    episode_rewards = []
    episode_lengths = []
    crash_count = 0
    success_count = 0

    print(f"Running {num_episodes} evaluation episodes...")
    print("Episode | Reward | Length | Result")
    print("-" * 35)

    for episode in range(num_episodes):
        obs, info = env.reset()
        episode_reward = 0
        episode_length = 0
        done = truncated = False

        # Get the initial recurrent state from the policy (attention memory, since use_lstm=False)
        state = algo.get_policy().get_initial_state()

        while not (done or truncated):
            # Get action from trained agent
            # action = algo.compute_single_action(obs, explore=False)
            action, state, _ = algo.compute_single_action(obs, state=state, explore=False)

            # Step environment
            obs, reward, done, truncated, info = env.step(action)
            episode_reward += reward
            episode_length += 1

            # Render if requested
            if render:
                try:
                    env.render()
                except Exception:
                    pass  # Skip rendering if it fails

        episode_rewards.append(episode_reward)
        episode_lengths.append(episode_length)

        # Determine episode result
        if info.get('crashed', False):
            result = "Crashed"
            crash_count += 1
        elif done:
            result = "Success"
            success_count += 1
        else:
            result = "Timeout"

        print(f"{episode + 1:7d} | {episode_reward:6.2f} | {episode_length:6d} | {result}")

    # Print summary statistics
    mean_reward = np.mean(episode_rewards)
    std_reward = np.std(episode_rewards)
    mean_length = np.mean(episode_lengths)

    print("\nEvaluation Summary:")
    print(f"Mean Episode Reward: {mean_reward:.2f} ± {std_reward:.2f}")
    print(f"Mean Episode Length: {mean_length:.1f}")
    print(f"Success Rate: {sum(1 for r in episode_rewards if r > 0) / len(episode_rewards) * 100:.1f}%")
    print(f"Crash count: {crash_count}")
    print(f"Success count: {success_count}")

    env.close()
    algo.stop()

    return {
        'mean_reward': mean_reward,
        'std_reward': std_reward,
        'mean_length': mean_length,
        'episode_rewards': episode_rewards
    }


def main():
    print("=" * 50)

    # Check if checkpoint already exists
    checkpoint_dir = "./highway_checkpoints"

    # Training phase
    print("Training Phase")
    print("-" * 20)

    NUM_TRAINING_ITER = 1
    NUM_EVALUATION_EPISODE = 1
    RENDER = True
    checkpoint_path = train_agent(num_iterations=NUM_TRAINING_ITER, checkpoint_dir=checkpoint_dir)
    print(f"Training completed successfully!")
    # print(f"Final checkpoint path {checkpoint_path}")

    # Evaluation phase
    print("\nEvaluation Phase")
    print("-" * 20)
    results = evaluate_agent(checkpoint_path, num_episodes=NUM_EVALUATION_EPISODE, render=RENDER)

    print(f"Final performance: {results['mean_reward']:.2f} average reward")

    ray.shutdown()

    return 0


if __name__ == "__main__":
    # Run main demo
    exit_code = main()
    sys.exit(exit_code)