Custom eval returning episode return mean as NaN

I am using a custom evaluation function because I have some custom metrics I want to save, but the episode return mean, max, and min are always NaN. I debugged it and realized that the collected metrics list is empty.

Ray: 2.10.0

.rollouts(
    num_rollout_workers=1,
    num_envs_per_worker=10,
    create_env_on_local_worker=False,
    # rollout_fragment_length=100,
    # batch_mode="complete_episodes",
)
.resources(
    num_gpus=0.1 if torch.cuda.is_available() else 0,
    num_cpus_per_worker=1,
    num_gpus_per_worker=(1 - 0.1) if torch.cuda.is_available() else 0,
)
.evaluation(
    evaluation_interval=1,
    evaluation_duration=1,
    evaluation_duration_unit="episode",
    always_attach_evaluation_results=True,
    evaluation_num_workers=1,
    custom_evaluation_function=custom_eval_function,
    evaluation_config={"explore": False},
)
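
For context, all of these calls are chained on a single AlgorithmConfig. Roughly, with PPO and the env name as placeholders for my actual setup:

import torch
from ray.rllib.algorithms.ppo import PPOConfig

config = (
    PPOConfig()  # placeholder algorithm class
    .environment(env="MyEnv-v0")  # placeholder env name
    .rollouts(
        num_rollout_workers=1,
        num_envs_per_worker=10,
        create_env_on_local_worker=False,
    )
    .resources(num_gpus=0.1 if torch.cuda.is_available() else 0)
    .evaluation(
        evaluation_interval=1,
        evaluation_duration=1,
        evaluation_duration_unit="episodes",  # documented values: "episodes" or "timesteps"
        always_attach_evaluation_results=True,
        evaluation_num_workers=1,
        custom_evaluation_function=custom_eval_function,  # defined below
        evaluation_config={"explore": False},
    )
)
algo = config.build()
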
from ray.rllib.evaluation.metrics import summarize_episodes


def custom_eval_function(algorithm, eval_workers):
    """Custom evaluation function to collect the best validation loss from eval_env.

    Args:
        algorithm: Algorithm class to evaluate.
        eval_workers: Evaluation WorkerSet.

    Returns:
        metrics: Evaluation metrics dict.
    """
    # Optional: modify the evaluation environments if needed.
    # For this example, assume the environments are already set up.

    # Number of evaluation episodes to run.
    num_eval_episodes = 1
    for i in range(num_eval_episodes):
        # print("Custom evaluation round", i)
        # Run one evaluation episode on each remote eval worker.
        eval_workers.foreach_worker(
            func=lambda w: w.sample(),
            local_worker=False,
            timeout_seconds=99999,
        )

    # Collect the RolloutMetrics accumulated on each eval worker.
    metric_lists = eval_workers.foreach_worker(
        func=lambda w: w.get_metrics(),
        timeout_seconds=99999,
    )

    print(f"Metric lists: {metric_lists}")

    # Flatten the per-worker lists and summarize them into one metrics dict
    # (episode_reward_mean/max/min, episode_len_mean, hist_stats, ...),
    # following the same pattern as RLlib's custom evaluation example.
    episodes = [ep for worker_metrics in metric_lists for ep in worker_metrics]
    metrics = summarize_episodes(episodes)
    return metrics
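
When the function returns a metrics dict (the summarize_episodes part above), RLlib attaches it under the "evaluation" key of each training result, so it can be read back like this; my_custom_metric is just a placeholder for any extra key the function adds:

result = algo.train()
eval_metrics = result["evaluation"]
print(eval_metrics["episode_reward_mean"])
print(eval_metrics.get("my_custom_metric"))  # any custom key returned by custom_eval_function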

Also, the evaluation/hist_stats/episode_reward and evaluation/hist_stats/episode_lengths were both empty. Doesn’t that mean the episode did not finish?

Even running the default evaluation (without the custom eval function) also returns NaN.

I increased the evaluation_sample_timeout_s and it worked!
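
For anyone else hitting this: evaluation_sample_timeout_s goes in the same .evaluation() block. With the built-in evaluation it bounds how long RLlib waits for the eval workers to return their samples, so if a single episode of a slow env takes longer than the (fairly short) default, nothing is collected and the returns come back as NaN. Roughly:

.evaluation(
    evaluation_interval=1,
    evaluation_duration=1,
    evaluation_duration_unit="episodes",
    evaluation_num_workers=1,
    custom_evaluation_function=custom_eval_function,
    evaluation_config={"explore": False},
    # Allow enough time for a full evaluation episode to finish;
    # 3600 is just an example value, pick one that covers your env.
    evaluation_sample_timeout_s=3600,
)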