Env Rendering and Recording during Evaluation

Hi folks, I’m trying to render my RL environment on every evaluation. I have a somewhat working solution; however, when I actually run my code, I end up with two videos instead of one. Any insight would be much appreciated.

1. Severity of the issue: (select one)

  • Low: Annoying but doesn’t hinder my work.
  • Medium: Significantly affects my productivity but can find a workaround.

2. Environment:

  • Ray version: 2.50.1
  • Python version: 3.11
  • OS: MacOS
  • Cloud/Infrastructure: local
  • Other libs/tools (if relevant): gymnasium, MetaDrive

3. What happened vs. what you expected:

  • Expected: 1 gif video uploaded to wandb
  • Actual: 2 gif videos. Sometimes, just a still frame.

This is my custom RLlibCallback. It should look similar to this example:

class EpisodeReturn(RLlibCallback):
    """A custom callback that records episode videos and logs evalulation episodes."""

    def __init__(self, env_runner_indices: Sequence[int] | None = None):
        super().__init__()
        # Keep some global state in between individual callback events.
        self.overall_sum_of_rewards = 0.0
        self._env_runner_indices = env_runner_indices
        self.episode_and_return = (None, float("-inf"))

    def on_episode_step(
        self,
        *,
        episode,  # noqa: ANN001
        env_runner,  # noqa: ANN001
        metrics_logger,  # noqa: ANN001, ARG002
        env,  # noqa: ANN001
        env_index,  # noqa: ARG002, ANN001
        rl_module,  # noqa: ARG002, ANN001
        **kwargs,  # noqa: ANN003, ARG002
    ) -> None:
        """Capture render images at each step of the episode."""
        if not env_runner.config.in_evaluation:
            return

        if self._env_runner_indices is not None and env_runner.worker_index not in self._env_runner_indices:
            return

        if isinstance(env.unwrapped, gym.vector.VectorEnv | VectorMultiAgentEnv):
            image = env.unwrapped.envs[0].render()
        else:
            image = env.unwrapped.render()
        image = resize(image, 64, 96)
        image = np.transpose(image, axes=[2, 0, 1])
        if "render_images" not in episode.custom_data:
            episode.custom_data["render_images"] = []
        episode.custom_data["render_images"].append(image)

    def on_episode_end(
        self,
        *,
        episode,  # noqa: ANN001
        env_runner,  # noqa: ANN001
        env,  # noqa: ARG002, ANN001
        env_index,  # noqa: ARG002, ANN001
        rl_module,  # noqa: ANN001, ARG002
        **kwargs,  # noqa: ANN003, ARG002
    ) -> None:
        """Log the episode return and save episode videos during evaluation."""
        episode_return = episode.get_return()
        # Save episode videos during evaluation.
        if not env_runner.config.in_evaluation:
            return

        images = episode.custom_data["render_images"]
        video = np.expand_dims(np.stack(images, axis=0), axis=0)
        self.episode_and_return = (video, episode_return)

    def on_sample_end(
        self,
        *,
        env_runner,  # noqa: ANN001
        metrics_logger,  # noqa: ANN001
        **kwargs,  # noqa: ANN003, ARG002
    ) -> None:
        """Log the best and worst video to this EnvRunner's MetricsLogger."""
        if not env_runner.config.in_evaluation:
            return

        if self.episode_and_return[0] is not None:
            metrics_logger.log_value(
                "episode_videos_best",
                self.episode_and_return[0],
                reduce=None,
                clear_on_reduce=True,
            )
            self.episode_and_return = (None, float("-inf"))

My PPO config is as follows:

    config = (
        PPOConfig()
        .environment(
            MyScenarioEnv,
            env_config=env_config,
        )
        .env_runners(
            num_env_runners=8,
            num_envs_per_env_runner=1,
        )
        .learners(
            num_learners=1,
        )
        .training(
            lr=0.0001,
        )
        .framework(
            "torch",
            torch_compile_learner=True,
            torch_compile_learner_dynamo_backend="inductor",
            torch_compile_learner_dynamo_mode="reduce-overhead",
        )
        .callbacks(EpisodeReturn)
        .debugging(
            logger_config={
                "type": MyPrintLogger,
                "prefix": "LOG:",
            },
        )
        .evaluation(
            evaluation_interval=args.eval_interval,
            evaluation_num_env_runners=1,
            evaluation_duration_unit="episodes",
            evaluation_duration=1,
        )
    )
    ppo = config.build_algo()

The issue of getting two videos (or sometimes a still frame) per evaluation most likely comes from how state is managed in your callback, specifically from using self.episode_and_return as a single piece of shared state. With multiple evaluation episodes (or multiple env runners), that variable can be overwritten before on_sample_end runs, so only the last episode’s video is logged, or stale/duplicate videos leak into the next evaluation. The official RLlib example keeps best and worst videos per env runner and per iteration, stores each in its own tuple (self.best_episode_and_return, self.worst_episode_and_return), compares returns in on_episode_end, and only logs and resets in on_sample_end; a short sketch of that bookkeeping follows. See the official example for the recommended approach.
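
For reference, here is a minimal sketch of that per-EnvRunner bookkeeping. The class name is illustrative and not a drop-in replacement; it assumes frames are already collected into episode.custom_data["render_images"] by an on_episode_step hook like yours, and the "episode_videos_worst" key is my naming, simply mirroring "episode_videos_best".

import numpy as np
from ray.rllib.callbacks.callbacks import RLlibCallback

class BestAndWorstVideoBookkeeping(RLlibCallback):
    """Sketch: keep exactly one best and one worst eval video per sample cycle."""

    def __init__(self):
        super().__init__()
        self.best_episode_and_return = (None, float("-inf"))
        self.worst_episode_and_return = (None, float("inf"))

    def on_episode_end(self, *, episode, env_runner, **kwargs):
        if not env_runner.config.in_evaluation:
            return
        # Frames are assumed to have been appended by an on_episode_step hook.
        images = episode.custom_data.get("render_images", [])
        if not images:
            return
        episode_return = episode.get_return()
        video = np.expand_dims(np.stack(images, axis=0), axis=0)
        # Compare returns so later episodes can only *replace* the stored
        # candidates, never add a second video.
        if episode_return > self.best_episode_and_return[1]:
            self.best_episode_and_return = (video, episode_return)
        if episode_return < self.worst_episode_and_return[1]:
            self.worst_episode_and_return = (video, episode_return)

    def on_sample_end(self, *, env_runner, metrics_logger, **kwargs):
        if not env_runner.config.in_evaluation:
            return
        # Log once per sample cycle, then reset so stale videos never leak
        # into the next evaluation.
        if self.best_episode_and_return[0] is not None:
            metrics_logger.log_value(
                "episode_videos_best",
                self.best_episode_and_return[0],
                reduce=None,
                clear_on_reduce=True,
            )
        if self.worst_episode_and_return[0] is not None:
            metrics_logger.log_value(
                "episode_videos_worst",
                self.worst_episode_and_return[0],
                reduce=None,
                clear_on_reduce=True,
            )
        self.best_episode_and_return = (None, float("-inf"))
        self.worst_episode_and_return = (None, float("inf"))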

To fix this, track the best (and/or worst) episode video and its return per evaluation cycle, and reset them after logging in on_sample_end. This avoids overwriting and ensures only one video is logged per evaluation. If you only want one video, skip the “worst” logic and keep just the best. Would you like a step-by-step code rewrite based on the official example?

@RunLLM yes, I’d like an example

Certainly! Here’s a minimal example based on the official RLlib pattern, ensuring only one video (the best episode) is logged per evaluation cycle. The key is to store the best video and its return in the callback, compare on each episode end, and log/reset in on_sample_end:

import numpy as np
from ray.rllib.callbacks.callbacks import RLlibCallback

class EpisodeReturn(RLlibCallback):
    def __init__(self, env_runner_indices=None):
        super().__init__()
        self.best_episode_and_return = (None, float("-inf"))
        self._env_runner_indices = env_runner_indices

    def on_episode_step(self, *, episode, env_runner, env, **kwargs):
        # Only capture frames on evaluation EnvRunners (and, if given, only
        # on the requested worker indices).
        if not env_runner.config.in_evaluation:
            return
        if self._env_runner_indices is not None and env_runner.worker_index not in self._env_runner_indices:
            return
        # If your env is vectorized, render the first sub-env instead
        # (env.unwrapped.envs[0].render(), as in your original callback).
        image = env.unwrapped.render()
        image = np.transpose(image, axes=[2, 0, 1])  # HWC -> CHW
        if "render_images" not in episode.custom_data:
            episode.custom_data["render_images"] = []
        episode.custom_data["render_images"].append(image)

    def on_episode_end(self, *, episode, env_runner, **kwargs):
        if not env_runner.config.in_evaluation:
            return
        episode_return = episode.get_return()
        images = episode.custom_data["render_images"]
        video = np.expand_dims(np.stack(images, axis=0), axis=0)
        # Keep only the best episode video
        if episode_return > self.best_episode_and_return[1]:
            self.best_episode_and_return = (video, episode_return)

    def on_sample_end(self, *, env_runner, metrics_logger, **kwargs):
        if not env_runner.config.in_evaluation:
            return
        # Log at most one video per evaluation sample, then reset so a stale
        # video is never re-logged on the next evaluation.
        if self.best_episode_and_return[0] is not None:
            metrics_logger.log_value(
                "episode_videos_best",
                self.best_episode_and_return[0],
                reduce=None,
                clear_on_reduce=True,
            )
            self.best_episode_and_return = (None, float("-inf"))

This ensures only the best video per evaluation interval is logged, matching the official example.
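
If you also want to make sure only the evaluation EnvRunner captures frames, you can pass env_runner_indices when wiring the callback into the config, for example via functools.partial (the same pattern the official rendering example uses). The index [1] below is an assumption: with evaluation_num_env_runners=1 the single eval runner typically has worker_index 1, but it is worth printing env_runner.worker_index once in the callback to confirm.

import functools

config = (
    PPOConfig()
    # ... same environment/env_runners/training settings as above ...
    .callbacks(
        # Restrict frame capture to the (assumed) evaluation EnvRunner index.
        functools.partial(EpisodeReturn, env_runner_indices=[1])
    )
    .evaluation(
        evaluation_interval=args.eval_interval,
        evaluation_num_env_runners=1,
        evaluation_duration_unit="episodes",
        evaluation_duration=1,
    )
)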
