I have this very simple env and PPO converges on it but SAC doesn’t learn anything.
Env script:
from typing import Any
from torch_geometric.data import HeteroData
import grid2op
from grid2op.Reward import LinesCapacityReward
from grid2op.Chronics import MultifolderWithCache
from lightsim2grid import LightSimBackend
from grid2op.gym_compat import GymActionSpace
from ray.tune.registry import register_env
from ray.rllib.utils.spaces.repeated import Repeated
from gymnasium import Env
import matplotlib.pyplot as plt
from gymnasium import spaces
import io
import numpy as np
from PIL import Image
from gymnasium import spaces
from collections import defaultdict
import torch
import networkx as nx
from collections import OrderedDict
class TestEnv(Env):
    """Toy continuous-control task: move a point onto a randomly drawn target.

    The state is a point ``curr_state`` that starts at the origin; each action
    is a displacement in ``[-1, 1]`` per dimension. The reward for a step is
    the reduction in Euclidean distance to ``target_state`` achieved by that
    step. Episodes are cut off after ``max_steps`` steps.

    Observations have shape ``(3, n_dim)`` and stack, in order:
    ``curr_state - target_state``, ``target_state`` and ``curr_state``.
    """

    def __init__(self) -> None:
        super().__init__()
        self.bound = 10
        self.n_dim = 1
        self.n_agents = 1
        # Episode length limit; hitting it is a truncation, not a terminal state.
        self.max_steps = 100
        self.observation_space = spaces.Box(
            low=-3 * self.bound,
            high=3 * self.bound,
            shape=(3, self.n_dim),
            dtype=np.float32,
        )
        self.action_space = spaces.Box(
            low=-1,
            high=1,
            shape=(self.n_dim,),
            dtype=np.float32,
        )

    def observe(self):
        """Return the current observation as a float32 array of shape (3, n_dim)."""
        obs = np.stack([
            self.curr_state - self.target_state,
            self.target_state,
            self.curr_state,
        ]).astype(np.float32)
        assert self.observation_space.contains(obs)
        return obs

    def reset(self, *, seed: int | None = None, options: dict[str, Any] | None = None) -> tuple[Any, dict[str, Any]]:
        """Start a new episode: draw a new target and put the agent at the origin.

        ``seed`` is forwarded to ``Env.reset`` so that ``self.np_random`` is
        seeded and the sampled target is reproducible. ``options`` is unused.
        """
        super().reset(seed=seed)
        self.target_state = self.np_random.uniform(
            low=-self.bound, high=self.bound, size=(self.n_dim,)
        ).astype(np.float32)
        self.curr_state = np.zeros_like(self.target_state).astype(np.float32)
        self.n_steps = 0
        return self.observe(), {}

    def step(self, action: Any):
        """Apply ``action`` as a displacement and return the Gymnasium 5-tuple.

        Returns ``(obs, reward, terminated, truncated, info)``. ``terminated``
        is always ``False`` because the task has no terminal state;
        ``truncated`` becomes ``True`` once ``max_steps`` steps have been
        taken. Reporting the time limit as truncation lets value-based
        learners keep bootstrapping from the last observation.
        """
        # Coerce to float32 so the state keeps the dtype of the observation space.
        action = np.asarray(action, dtype=np.float32)
        initial_distance = np.linalg.norm(self.curr_state - self.target_state)
        self.curr_state = np.clip(
            self.curr_state + action, -2 * self.bound, 2 * self.bound
        ).astype(np.float32)
        new_distance = np.linalg.norm(self.curr_state - self.target_state)
        # Plain Python float rather than a NumPy scalar.
        reward = float(initial_distance - new_distance)
        # reward = min(2*reward,reward)
        self.n_steps += 1
        truncated = self.n_steps >= self.max_steps
        return self.observe(), reward, False, truncated, {}

    def render(self, mode='human'):
        """Draw the target (red) and the agent (blue).

        With ``mode='human'`` the figure is shown and ``None`` is returned;
        with ``mode='rgb_array'`` the figure is returned as an image array.
        Any other mode draws nothing visible and returns ``None``.
        """
        fig, ax = plt.subplots(tight_layout=True)
        ax.set_xlim(-3 * self.bound, 3 * self.bound)
        ax.set_ylim(-3 * self.bound, 3 * self.bound)

        def _xy(state):
            # A 1-D state has no second coordinate; place it on the x-axis.
            y = state[1] if state.shape[0] > 1 else 0.0
            return state[0], y

        # Draw target state
        ax.scatter(*_xy(self.target_state), c='red', label='Target')
        # Draw current state
        ax.scatter(*_xy(self.curr_state), c='blue', label='Agent')
        ax.legend()

        if mode == 'human':
            plt.show()
            plt.close(fig)
            return None
        if mode == 'rgb_array':
            buf = io.BytesIO()
            fig.savefig(buf, format='png')
            plt.close(fig)
            buf.seek(0)
            return np.array(Image.open(buf))
        # Unknown mode: release the figure instead of leaking it.
        plt.close(fig)
        return None
def env_creator(env_config):
    """Build a fresh ``TestEnv``; ``env_config`` is accepted but not used."""
    environment = TestEnv()
    return environment


# Register the factory with Ray Tune's registry under the name "test_env".
register_env("test_env", env_creator)
Train script:
from torch import nn
from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.algorithms.sac import SACConfig
import imageio
import ray
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
import torch
from ray.rllib.models import ModelCatalog
from tqdm import tqdm
from ray import tune, train
from ray.rllib.models.torch.misc import normc_initializer
import torch.nn.functional as F
if __name__ == "__main__":
    # Imported under an alias so the module is not shadowed by an env instance.
    # NOTE(review): presumably this is the env script that also calls
    # register_env("test_env", ...) on import -- confirm the file name.
    import env as env_module

    context = ray.init()
    print(context.dashboard_url)
    try:
        # Sanity-check the environment before spending time on training.
        ray.rllib.utils.check_env(env_module.TestEnv())

        config: SACConfig = SACConfig()
        config = config.framework("tf2")  # type: ignore
        config = config.environment(  # type: ignore
            env="test_env",
            # env_config={"env_name": "l2rpn_case14_sandbox"},
            # normalize_actions=True,
        )
        # config = config.rollouts(  # type: ignore
        #     observation_filter="MeanStdFilter",
        # )
        config = config.rl_module(_enable_rl_module_api=False)
        config = config.training(
            _enable_learner_api=False,
            # model={"custom_model": "my_torch_model"},
            # model={"fcnet_hiddens": [64, 64], "fcnet_activation": "relu"},
            gamma=0.99,
            # target_network_update_freq=32,
            # tau=1.0,
            # train_batch_size=32,
            # optimization_config={'actor_learning_rate': 0.005, 'critic_learning_rate': 0.005, 'entropy_learning_rate': 0.0001},
            # vf_clip_param=100,
        )
        # config = config.exploration(
        #     explore=True,
        #     exploration_config={
        #         "type": "StochasticSampling",
        #     }
        # )
        config = config.evaluation(  # type: ignore
            evaluation_interval=10,
            # `evaluation_num_episodes` is deprecated; this is its replacement.
            evaluation_duration=10,
            evaluation_duration_unit="episodes",
        )
        config = config.resources(num_gpus=1).rollouts(num_rollout_workers=4)  # type: ignore

        # One-iteration smoke test. The algorithm must be stopped afterwards,
        # otherwise it keeps its GPU and rollout workers and the Tune trial
        # below may wait indefinitely for the same resources.
        trainer = config.build()
        try:
            trainer.train()
        finally:
            trainer.stop()

        tuner = tune.Tuner(
            "SAC",
            run_config=train.RunConfig(
                stop={"training_iteration": 1000},
                checkpoint_config=train.CheckpointConfig(
                    checkpoint_frequency=10,
                    checkpoint_at_end=True,
                ),
            ),
            param_space=config,
        )
        tuner.fit()
        print("Done")
    finally:
        # Release the local Ray cluster even if training raised.
        ray.shutdown()
The reward keeps decreasing and then fluctuates more or less randomly. I tried adjusting the learning rate and other settings, but nothing seems to help. Any clues?