1. Severity of the issue:
High: Completely blocks me.
2. Environment:
- Ray version: 2.4.0
- Python version: 3.8
- OS: Linux
- Other libs/tools (if relevant): Highway-env
Hello everyone,
I’m trying to run an experiment using IMPALA with attention on Highway-env.
The script finishes training successfully, but when it attempts to evaluate the trained model, compute_single_action raises a tensor dimension error:
The Error:
Error in compute_single_action: Tensors must have same number of dimensions: got 2 and 3
Traceback (most recent call last):
File "highway_rllib_impela with attention_claude_fix.py", line 305, in evaluate_agent
action, state, _ = algo.compute_single_action(
File "/home/xyz/miniconda3/envs/highway_to_keep_unchange/lib/python3.8/site-packages/ray/rllib/algorithms/algorithm.py", line 1595, in compute_single_action
action, state, extra = policy.compute_single_action(
File "/home/xyz/miniconda3/envs/highway_to_keep_unchange/lib/python3.8/site-packages/ray/rllib/policy/policy.py", line 545, in compute_single_action
out = self.compute_actions_from_input_dict(
File "/home/xyz/miniconda3/envs/highway_to_keep_unchange/lib/python3.8/site-packages/ray/rllib/policy/torch_policy_v2.py", line 522, in compute_actions_from_input_dict
return self._compute_action_helper(
File "/home/xyz/miniconda3/envs/highway_to_keep_unchange/lib/python3.8/site-packages/ray/rllib/utils/threading.py", line 24, in wrapper
return func(self, *a, **k)
File "/home/xyz/miniconda3/envs/highway_to_keep_unchange/lib/python3.8/site-packages/ray/rllib/policy/torch_policy_v2.py", line 1141, in _compute_action_helper
dist_inputs, state_out = self.model(input_dict, state_batches, seq_lens)
File "/home/xyz/miniconda3/envs/highway_to_keep_unchange/lib/python3.8/site-packages/ray/rllib/models/modelv2.py", line 259, in __call__
res = self.forward(restored, state or [], seq_lens)
return forward_call(*args, **kwargs)
File "/home/xyz/miniconda3/envs/highway_to_keep_unchange/lib/python3.8/site-packages/ray/rllib/models/torch/modules/relative_multi_head_attention.py", line 120, in forward
inputs = torch.cat((memory.detach(), inputs), dim=1)
RuntimeError: Tensors must have same number of dimensions: got 2 and 3
I can run the script with IMPALA and LSTM, but with attention enabled I get the error above. I’d appreciate any help with this. The full script is given below. Because I need Highway-env v1.8 and ran into dependency issues, I have to stay on Ray/RLlib 2.4.
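For quick reference, the evaluation loop boils down to the pattern below (condensed from the full script further down; the names match the script). The compute_single_action call is the one that raises the RuntimeError once use_attention is enabled:

# Condensed from evaluate_agent() in the full script below.
state = algo.get_policy().get_initial_state()  # initial recurrent/attention state
obs, info = env.reset()
done = truncated = False
while not (done or truncated):
    # Raises "Tensors must have same number of dimensions: got 2 and 3"
    # when the model is configured with use_attention=True.
    action, state, _ = algo.compute_single_action(obs, state=state, explore=False)
    obs, reward, done, truncated, info = env.step(action)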
The Code:
#!/usr/bin/env python3
"""
Requirements:
pip install ray[rllib]==2.4.0
pip install highway-env==1.8.0
pip install gymnasium
pip install torch  # needed here, since the config below uses framework="torch"
"""
import os
import sys
import gymnasium as gym
import numpy as np
import ray
from ray.rllib.algorithms.impala import Impala
from ray import tune
from ray.rllib.env.env_context import EnvContext
import highway_env
# Suppress some warnings for cleaner output
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
def make_highway_env(config: EnvContext):
"""
Environment factory function for RLlib.
Creates and configures a Highway-env intersection environment.
"""
highway_env.register_highway_envs()
# Create the base environment
env = gym.make("intersection-v0")
# Configure the environment with your exact specifications
env.configure({
"observation": {
"type": "Kinematics",
"vehicles_count": 5,
"features": ["presence", "x", "y", "vx", "vy"], # Include presence feature
"features_range": {
"x": [-100, 100],
"y": [-100, 100],
"vx": [-20, 20],
"vy": [-20, 20],
"presence": [0, 1] # Binary presence indicator
},
"absolute": False, # Use relative coordinates
"normalize": True,
"see_behind": True, # Important for intersection safety
"order": "sorted",
"flatten": False,
"observe_intentions": False
},
"action": {
"type": "DiscreteMetaAction",
"longitudinal": True,
"lateral": False,
"target_speeds": [0, 4.5, 9] # Conservative speeds for intersection
},
# Intersection-specific parameters
"duration": 20, # Longer episodes for better learning
"destination": "o1",
"controlled_vehicles": 1,
"initial_vehicle_count": 8, # Fewer vehicles initially for easier learning
"spawn_probability": 0.4, # Lower spawn rate for simpler scenarios
# Reward configuration (binary reward machine)
"collision_reward": -20,
"high_speed_reward": 1,
"arrived_reward": 400,
"reward_speed_range": [7.0, 9.0],
"normalize_reward": False,
"offroad_terminal": False,
# Simulation parameters
"policy_frequency": 5, # Even lower frequency for more reaction time
"simulation_frequency": 12,
# Vehicle behavior - make other vehicles more predictable
"other_vehicles_type": "highway_env.vehicle.behavior.IDMVehicle",
})
obs, info = env.reset()
return env
# return wrapped_env
def train_agent(num_iterations=10, checkpoint_dir="./highway_checkpoints"):
"""
Train an agent on the Highway intersection environment.
Args:
num_iterations: Number of training iterations
checkpoint_dir: Directory to save checkpoints
Returns:
Path to the final checkpoint
"""
print("Starting RLlib training with Highway-env...")
# Initialize Ray
ray.init(ignore_reinit_error=True, log_to_driver=False)
# Register the environment
tune.register_env("highway-intersection", make_highway_env)
# Create checkpoint directory
os.makedirs(checkpoint_dir, exist_ok=True)
# Build and train the algorithm
# model config dictionary
config = {
"env": "highway-intersection",
"framework": "torch",
# IMPALA parameters
"lr": 1e-4,
"gamma": 0.95,
"vtrace": True,
"vtrace_clip_rho_threshold": 1.0,
"vtrace_clip_pg_rho_threshold": 1.0,
"entropy_coeff": 0.01,
"vf_loss_coeff": 0.5,
"grad_clip": 40.0,
# Adjusted for attention
"num_workers": 1,
"num_envs_per_worker": 1,
"rollout_fragment_length": 50,
"train_batch_size": 500,
# ATTENTION MODEL
"model": {
"fcnet_hiddens": [256, 256],
"fcnet_activation": "relu",
"use_attention": True,
"attention_num_transformer_units": 2,
"attention_dim": 128,
"attention_num_heads": 8,
"attention_head_dim": 16,
"attention_memory_inference": 50,
"attention_memory_training": 50,
"attention_position_wise_mlp_dim": 256,
"use_lstm": False,
},
"num_gpus": 1,
"num_cpus_per_worker": 1,
"evaluation_interval": 10,
"evaluation_num_episodes": 5,
}
algo = Impala(config=config)
print(f"Training for {num_iterations} iterations...")
print("Iteration | Episode Reward Mean | Episode Length Mean")
print("-" * 55)
try:
best_reward = -float('inf')
best_checkpoint = None
for i in range(num_iterations):
# Train for one iteration
result = algo.train()
# Extract metrics
episode_reward_mean = result.get('episode_reward_mean', 0)
episode_len_mean = result.get('episode_len_mean', 0)
print(f"{i + 1:9d} | {episode_reward_mean:17.2f} | {episode_len_mean:17.2f}")
# Save checkpoint every 5 iterations or if it's the best so far
if (i + 1) % 5 == 0 or episode_reward_mean > best_reward:
checkpoint_path = algo.save(checkpoint_dir)
if episode_reward_mean > best_reward:
best_reward = episode_reward_mean
best_checkpoint = checkpoint_path
print(f"New best checkpoint saved: {checkpoint_path}")
# Save final checkpoint
final_checkpoint = algo.save(checkpoint_dir)
print(f"Training completed! Final checkpoint: {final_checkpoint}")
return best_checkpoint or final_checkpoint
except KeyboardInterrupt:
print("\nTraining interrupted by user")
final_checkpoint = algo.save(checkpoint_dir)
return final_checkpoint
finally:
algo.stop()
def evaluate_agent(checkpoint_path, num_episodes=5, render=True):
"""
Evaluate a trained agent.
Args:
checkpoint_path: Path to the saved checkpoint
num_episodes: Number of episodes to evaluate
render: Whether to render the environment
"""
print(f"\nEvaluating agent from checkpoint: {checkpoint_path}")
# Register the environment (needed for loading the checkpoint)
tune.register_env("highway-intersection", make_highway_env)
# Load the trained algorithm
# Detect available framework (same as training)
try:
import torch
framework = "torch"
except ImportError:
framework = "tf2"
# model config dictionary
config = {
"env": "highway-intersection",
"framework": framework,
# IMPALA parameters
"lr": 1e-4,
"gamma": 0.95,
"vtrace": True,
"vtrace_clip_rho_threshold": 1.0,
"vtrace_clip_pg_rho_threshold": 1.0,
"entropy_coeff": 0.01,
"vf_loss_coeff": 0.5,
"grad_clip": 40.0,
# Adjusted for attention
"num_workers": 1,
"num_envs_per_worker": 1,
"rollout_fragment_length": 50,
"train_batch_size": 500,
        # ATTENTION MODEL (same as in training)
"model": {
"fcnet_hiddens": [256, 256],
"fcnet_activation": "relu",
"use_attention": True,
"attention_num_transformer_units": 2,
"attention_dim": 128,
"attention_num_heads": 8,
"attention_head_dim": 16,
"attention_memory_inference": 50,
"attention_memory_training": 50,
"attention_position_wise_mlp_dim": 256,
"use_lstm": False,
},
"num_gpus": 1,
"num_cpus_per_worker": 1,
"evaluation_interval": 10,
"evaluation_num_episodes": 5,
}
algo = Impala(config=config)
algo.restore(checkpoint_path)
# Create evaluation environment
env = make_highway_env({})
episode_rewards = []
episode_lengths = []
crash_count = 0
success_count = 0
print(f"Running {num_episodes} evaluation episodes...")
print("Episode | Reward | Length | Result")
print("-" * 35)
for episode in range(num_episodes):
obs, info = env.reset()
episode_reward = 0
episode_length = 0
done = truncated = False
        # Initialize the model's recurrent state (attention memory here, not LSTM)
state = algo.get_policy().get_initial_state()
while not (done or truncated):
# Get action from trained agent
# action = algo.compute_single_action(obs, explore=False)
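            # NOTE: with use_attention=True, this next call raises the RuntimeError shown in the traceback above.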
action, state, _ = algo.compute_single_action(obs, state=state, explore=False)
# Step environment
obs, reward, done, truncated, info = env.step(action)
episode_reward += reward
episode_length += 1
# Render if requested
if render:
try:
env.render()
except:
pass # Skip rendering if it fails
episode_rewards.append(episode_reward)
episode_lengths.append(episode_length)
# Determine episode result
if info.get('crashed', False):
result = "Crashed"
crash_count += 1
elif done:
result = "Success"
success_count += 1
else:
result = "Timeout"
print(f"{episode + 1:7d} | {episode_reward:6.2f} | {episode_length:6d} | {result}")
# Print summary statistics
mean_reward = np.mean(episode_rewards)
std_reward = np.std(episode_rewards)
mean_length = np.mean(episode_lengths)
print("\nEvaluation Summary:")
print(f"Mean Episode Reward: {mean_reward:.2f} ± {std_reward:.2f}")
print(f"Mean Episode Length: {mean_length:.1f}")
print(f"Success Rate: {sum(1 for r in episode_rewards if r > 0) / len(episode_rewards) * 100:.1f}%")
print(f"Crash count: {crash_count}")
print(f"Success count: {success_count}")
env.close()
algo.stop()
return {
'mean_reward': mean_reward,
'std_reward': std_reward,
'mean_length': mean_length,
'episode_rewards': episode_rewards
}
def main():
print("=" * 50)
    # Checkpoint directory
    checkpoint_dir = "./highway_checkpoints"
# Training phase
print("Training Phase")
print("-" * 20)
NUM_TRAINING_ITER = 1
NUM_EVALUATION_EPISODE = 1
RENDER = True
checkpoint_path = train_agent(num_iterations=NUM_TRAINING_ITER, checkpoint_dir=checkpoint_dir)
print(f"Training completed successfully!")
# print(f"Final checkpoint path {checkpoint_path}")
# Evaluation phase
print("\nEvaluation Phase")
print("-" * 20)
results = evaluate_agent(checkpoint_path, num_episodes=NUM_EVALUATION_EPISODE, render=RENDER)
print(f"Final performance: {results['mean_reward']:.2f} average reward")
ray.shutdown()
return 0
if __name__ == "__main__":
# Run main demo
exit_code = main()
sys.exit(exit_code)