Ray RLlib PPO does not solve a very simple problem

Hello,

RLlib PPO does not seem to solve a very easy instance of a 1D positioning problem. The environment simply asks the agent to move to a target point with a maximum velocity of 1. I have tried various parameters, but the reward never seems to break -150, even though other simple PPO implementations converge very quickly to about -60. Do you know what is happening here? (I am using Ray 2.7.1.)

import torch
import torch.nn.functional as F
from torch import nn

import ray
from ray import train, tune
from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.models import ModelCatalog
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2

class CustomTorchModel(TorchModelV2, nn.Module):
    def __init__(self, obs_space, action_space, num_outputs, model_config, name):
        TorchModelV2.__init__(self, obs_space, action_space, num_outputs, model_config, name)
        nn.Module.__init__(self)
        self.actor = nn.Sequential(
                            nn.Linear(obs_space.shape[0], 64),
                            nn.Tanh(),
                            nn.Linear(64, 2),
                            nn.Tanh()
                        )
        self.critic = nn.Sequential(
                        nn.Linear(obs_space.shape[0], 16),
                        nn.Tanh(),
                        nn.Linear(16, 1)
                    )

    def forward(self, input_dict, state, seq_lens):
        obs = input_dict["obs"]
        action_logits = self.actor(obs)
        mean, log_std = torch.chunk(action_logits, 2, dim=1)
        
        # mean = torch.tanh(mean)
        std = F.softplus(log_std)
        action = torch.cat([mean, std], dim=1)

        self.val = self.critic(obs)

        return action, []

    def value_function(self):
        return self.val.flatten()



ModelCatalog.register_custom_model("my_torch_model", CustomTorchModel)


if __name__ == "__main__":
    import env_0

    context = ray.init(local_mode=True)
    print(context.dashboard_url)

    env = env_0.TestEnv()
    ray.rllib.utils.check_env(env)
    config = PPOConfig()
    config = config.environment(
        env="test_env"
    )
    config.rl_module(_enable_rl_module_api=False)
    config = config.training(
        _enable_learner_api=False,
        model={"custom_model": "my_torch_model"},
    )
    config = config.evaluation(
        evaluation_interval=10,
        evaluation_num_episodes=10,
    )

    trainer = config.build()
    trainer.train()

    tuner = tune.Tuner(
        "PPO",
        run_config=train.RunConfig(
            stop={"training_iteration": 100000},
            checkpoint_config=train.CheckpointConfig(
                checkpoint_frequency=1000, checkpoint_at_end=True
            ),
        ),
        param_space=config,
    )

    tuner.fit()
    print("Done")

Here is my env:

from typing import Any

import numpy as np
from gymnasium import Env, spaces
from ray.tune.registry import register_env



class TestEnv(Env):
    def __init__(self) -> None:
        
        super().__init__()
        self.bound = 10
        self.observation_space = spaces.Box(
            low=-3*self.bound,
            high=3*self.bound,
            shape=(3,),
        )
        self.action_space = spaces.Box(
            low=-1,
            high=1,
            shape=(1,),
        )

    def observe(self):
        obs = np.concatenate(
            [
                self.curr_state-self.target_state,
                self.target_state,
                self.curr_state,
            ]
        )
        return obs

    def reset(
        self, *, seed: int | None = None, options: dict[str, Any] | None = None
    ) -> tuple[Any, dict[str, Any]]:
        self.target_state = np.random.uniform(low=-self.bound, high=self.bound, size=(1,))
        self.curr_state = np.zeros_like(self.target_state)
        self.n_steps = 0
        return self.observe(), {}

    def step(self, action: Any):
        self.curr_state += action
        self.curr_state = np.clip(self.curr_state, -2*self.bound, 2*self.bound)
        distance_to_target = np.abs(self.curr_state - self.target_state)
        reward = -distance_to_target.item()
        self.n_steps += 1
        return self.observe(), reward, self.n_steps >= 100, False, {}


def env_creator(env_config):
    return TestEnv()

register_env("test_env", env_creator)

Hi @Benedikt_Schesch,

If I were having this issue, the first place I would check is increasing the vf_clip_param. By default it clips the value-function loss at a maximum of 10, which is probably too low given your rewards.
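
Something like this, building on your config above (a sketch: 1000.0 is just an arbitrary large value I picked so the clip likely stops binding at your reward scale, not a tuned recommendation):

config = config.training(
    _enable_learner_api=False,
    model={"custom_model": "my_torch_model"},
    # PPO clips the squared value-function error at vf_clip_param (default 10.0).
    # With episode returns around -150, that clip likely saturates almost
    # immediately, so raise it well above the return scale.
    vf_clip_param=1000.0,
)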


The solution was to use the change in distance (i.e., the per-step improvement) as the reward, rather than the distance itself.
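
Concretely, that looks something like this in the TestEnv above (prev_distance is bookkeeping I've added for illustration; it is not in the original code):

    def reset(self, *, seed=None, options=None):
        self.target_state = np.random.uniform(low=-self.bound, high=self.bound, size=(1,))
        self.curr_state = np.zeros_like(self.target_state)
        self.n_steps = 0
        # Remember the current distance so step() can reward improvement.
        self.prev_distance = np.abs(self.curr_state - self.target_state).item()
        return self.observe(), {}

    def step(self, action):
        self.curr_state += action
        self.curr_state = np.clip(self.curr_state, -2 * self.bound, 2 * self.bound)
        distance_to_target = np.abs(self.curr_state - self.target_state).item()
        # Reward the per-step improvement in distance rather than the raw distance,
        # which keeps returns small and well scaled for the value function.
        reward = self.prev_distance - distance_to_target
        self.prev_distance = distance_to_target
        self.n_steps += 1
        return self.observe(), reward, self.n_steps >= 100, False, {}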