I have the following algorithm config with IMPALA, which works just fine:
from functools import partial

from ray.rllib.algorithms import ImpalaConfig
config = (
ImpalaConfig()
.environment(env=env_name, disable_env_checking=True)
.rollouts(
num_rollout_workers=args.num_rollout_workers,
num_envs_per_worker=args.num_envs_per_worker,
rollout_fragment_length=200 * args.num_envs_per_worker,
batch_mode="truncate_episodes"
)
.evaluation(
evaluation_parallel_to_training=False,
evaluation_interval=100,
evaluation_duration=10,
evaluation_num_workers=0,
evaluation_sample_timeout_s=60
)
.callbacks(partial(callback, path=output_folder_path))
.training(
gamma=0.99, # Discount factor
lr=1e-5, # Learning rate
train_batch_size=200 * args.num_envs_per_worker * args.num_rollout_workers, # Batch size
entropy_coeff=0.001, # Entropy cost
vf_loss_coeff=40, # Baseline cost
grad_clip=42, # Max norm gradient
optimizer={"type": "RMSProp"},
model={
"dim": 88,
"conv_filters": [
[32, [3, 3], 5], # Layer 1
[64, [3, 3], 5], # Layer 2
[128, [3, 3], 2], # Layer 3
],
"conv_activation": "relu",
"fcnet_hiddens": [1024, 1024],
"post_fcnet_activation": "tanh",
"use_lstm": True,
"lstm_cell_size": 1024,
"max_seq_len": 16, # LSTM unroll length
"vf_share_layers": False,
"lstm_use_prev_action": True,
"lstm_use_prev_reward": True,
"post_fcnet_hiddens": [1024],
}
)
.resources(
num_gpus=args.num_gpus,
num_cpus_per_worker=1,
num_gpus_per_worker=(args.num_gpus / args.num_rollout_workers if args.num_rollout_workers > 0 else 0),
)
.framework("torch")
.fault_tolerance(
recreate_failed_workers=True,
restart_failed_sub_environments=True
)
)
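
For context, both configs are consumed by a standard build-and-train loop roughly like the sketch below (simplified, not my exact script; num_iterations is just a placeholder name):

algo = config.build()
for _ in range(num_iterations):  # placeholder iteration count
    result = algo.train()
    print(result["episode_reward_mean"])  # reported in the old API stack results
algo.stop()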
Then, using a similar configuration, I would like to implement DQNConfig:
from functools import partial

from ray.rllib.algorithms.dqn import DQNConfig
config = (
DQNConfig()
.environment(env=env_name, disable_env_checking=True)
.rollouts(
num_rollout_workers=args.num_rollout_workers,
num_envs_per_worker=args.num_envs_per_worker,
rollout_fragment_length=200 * args.num_envs_per_worker,
batch_mode="truncate_episodes", # Necessary for RNNs
)
.exploration(
explore=True,
exploration_config={
"type": "EpsilonGreedy",
"initial_epsilon": 1.0,
"final_epsilon": 0.1,
"epsilon_timesteps": 10000,
}
).evaluation(
evaluation_parallel_to_training=False,
evaluation_interval=100,
evaluation_duration=10,
evaluation_num_workers=0,
evaluation_sample_timeout_s=60
)
.callbacks(partial(callback, path=output_folder_path))
.training(
gamma=0.99, # Discount factor
lr=1e-5, # Learning rate
train_batch_size=200 * args.num_envs_per_worker * args.num_rollout_workers, # Batch size
grad_clip=42, # Max norm gradient
optimizer={"type": "RMSProp"},
model={
"dim": 88,
"conv_filters": [
[32, [3, 3], 5], # Layer 1
[64, [3, 3], 5], # Layer 2
[128, [3, 3], 2], # Layer 3
],
"conv_activation": "relu",
"fcnet_hiddens": [1024, 1024],
"post_fcnet_activation": "tanh",
"use_lstm": True,
"lstm_cell_size": 1024,
"max_seq_len": 16, # LSTM unroll length
"vf_share_layers": False,
"lstm_use_prev_action": True,
"lstm_use_prev_reward": True,
"post_fcnet_hiddens": [1024],
},
dueling=True,
double_q=True,
n_step=3,
target_network_update_freq=500,
replay_buffer_config={
"type": "ReplayBuffer",
"capacity": 50000, # Replay buffer capacity
"replay_sequence_length": 16,
"seq_lens": 16,
}
)
.resources(
num_gpus=args.num_gpus,
num_cpus_per_worker=1,
num_gpus_per_worker=(args.num_gpus / args.num_rollout_workers if args.num_rollout_workers > 0 else 0),
)
.framework("torch")
.fault_tolerance(
recreate_failed_workers=True,
restart_failed_sub_environments=True
)
)
But then it gives me the following error:
File "/Users/berkayeren/PycharmProjects/rl-learning/.venv/lib/python3.9/site-packages/ray/rllib/models/torch/recurrent_net.py", line 217, in forward [repeated 7x across cluster]
(RolloutWorker pid=8413) assert seq_lens is not None [repeated 7x across cluster]
It seems the problem originates in ModelV2, around line 237:

237        if seq_lens is None:
238            seq_lens = input_dict.get(SampleBatch.SEQ_LENS)

seq_lens is None here, and line 238 assigns None again (input_dict apparently has no SampleBatch.SEQ_LENS entry), which then trips the assertion a few lines later.
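
For what it's worth, here is a debugging sketch (not part of my training script, and assuming num_rollout_workers=0 so the local worker owns an env) of how one could check whether SEQ_LENS makes it into a sampled batch at all:

from ray.rllib.policy.sample_batch import SampleBatch

algo = config.build()
batch = algo.workers.local_worker().sample()  # one sampled rollout batch
print(SampleBatch.SEQ_LENS in batch)          # does the batch carry "seq_lens"?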
I cannot figure out how to solve this.

I'm using:
python 3.9
ray 2.9.3
torch 2.5.1
minigrid 3.0.0
gymnasium 1.0.0