Reward function not converging during training

Hi @mannyv

Sorry for not providing this information.

Here is my environment:

from gym import Env
from gym.spaces import Box, Dict
import numpy as np
import matplotlib.pyplot as plt
import plotext
import math

inf = 1e4  # large but finite value used as "infinity" in the observation space
TICK = 0.5
GAMMA_TRACTION_MAX = 1.5
GAMMA_FS_MAX = -1.5
VMAX = 38.8
PAEX = 1000.
PAF = 1100.
LIMITE_DOMAINE = PAF + 130
SPEED_LIMIT = np.array([[0., 30.], [250., VMAX], [800., 20.], [PAEX, 0], [PAF, 0], [LIMITE_DOMAINE, 0]], dtype=np.float32)
PENALTY_OVERSPEED = 10
PENALTY_DEPASSEMENT = 1000
BONUS_SPEED = 1000
EPSILON = 25

class SimuMiniEnv_v0(Env):
    MAX_STEPS = 10000

    def __init__(self):

        self.action_space = Box(low=np.array([GAMMA_FS_MAX / 10]), high=np.array([GAMMA_TRACTION_MAX / 10]))
        
        spaces = {
            'acceleration': Box(low=GAMMA_FS_MAX, high=GAMMA_TRACTION_MAX, shape=(1,), dtype=np.float32),
            'speed': Box(low=0, high=VMAX, shape=(1,), dtype=np.float32),
            'position': Box(low=0, high=LIMITE_DOMAINE+EPSILON, shape=(1,), dtype=np.float32),
            'LV': Box(low=-inf, high=inf, shape=(6, 2), dtype=np.float32),
        }
        self.observation_space = Dict(spaces)
        self.state = {
            'acceleration': np.array([0.]),
            'speed': np.array([0.]),
            'position': np.array([0.]),
            'LV': SPEED_LIMIT,
        }
        self.acceleration = [0.]
        self.speed = [0.]
        self.position = [0.]
        self.simone_length = 0.  # elapsed simulated time (one TICK per step)

    def step(self, action):

        bonus = 0
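        # The action is an acceleration increment added to the previous acceleration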
        acc_next_step = self.acceleration[-1] + action[0]

        # Clamp the commanded acceleration to [GAMMA_FS_MAX, GAMMA_TRACTION_MAX]
        if acc_next_step < GAMMA_FS_MAX:
            acceleration_temp = GAMMA_FS_MAX
        elif acc_next_step > GAMMA_TRACTION_MAX:
            acceleration_temp = GAMMA_TRACTION_MAX
        else:
            acceleration_temp = acc_next_step

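        # Integrate speed over one TICK and clamp it to [0, VMAX], adjusting acceleration if clipped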
        speed_temp = acceleration_temp * TICK + self.speed[-1]
        if speed_temp < 0:
            acceleration_temp = self.speed[-1] / TICK
            speed_temp = 0.
        elif speed_temp > VMAX:
            acceleration_temp = (VMAX - self.speed[-1]) / TICK
            speed_temp = VMAX

        self.acceleration.append(acceleration_temp)
        self.speed.append(speed_temp)

        self.position.append(self.speed[-1] * TICK + self.position[-1])

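        # Express the speed-limit positions relative to the current position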
        SL_updated = np.add(SPEED_LIMIT, np.array([-self.position[-1], 0]))

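        # Speed limit currently in force (last entry at or behind the train)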
        index_current_SL = [np.where(SL_updated[:, 0] <= 0)[0][-1], 1]

        current_SL = SL_updated[tuple(index_current_SL)]  

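        # Hide limits already passed (position and speed set to inf), keeping the current one at position 0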
        SL_updated[SL_updated < 0] = inf
        SL_updated[index_current_SL[0], 0] = 0

        for i in range(np.size(SL_updated, 0)):
            if SL_updated[i, 0] == inf:
                SL_updated[i, 1] = inf

        done = False
        self.simone_length += TICK  

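        # Reward shaping: overspeed penalty, stop bonus/penalty around PAEX and PAF, timeout and standstill penalties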
        if self.speed[-1] >= current_SL:  
            bonus -= PENALTY_OVERSPEED

        if self.position[-1] >= PAEX:
            if self.speed[-1] <= 0.1:
                done = True
                bonus += self.simone_length * TICK * BONUS_SPEED
            else:
                bonus -= PENALTY_DEPASSEMENT / 10

        if self.position[-1] >= PAF:
            if self.speed[-1] <= 0.01:
                done = True
            else:
                bonus -= PENALTY_DEPASSEMENT
                done = True

        if self.simone_length / TICK >= self.MAX_STEPS:
            bonus -= PENALTY_DEPASSEMENT * 10
            done = True

        if self.position[-1] >= LIMITE_DOMAINE:
            done = True

        if self.speed[-1] <= 0.001 :
            bonus -= PENALTY_OVERSPEED / 10

        info = {} 

        state = {
                'acceleration': np.array([self.acceleration[-1]]),
                'speed': np.array([self.speed[-1]]),
                'position': np.array([self.position[-1]]),
                'LV': SL_updated,
        }
        self.state = state

        return self.state, bonus, done, info

    def render(self):
        pass

    def reset(self):
        self.state = {
            'acceleration': np.array([0.]),
            'speed': np.array([0.]),
            'position': np.array([0.]),
            'LV': SPEED_LIMIT,
        }
        self.position = [0.]
        self.speed = [0.]
        self.acceleration = [0.]
        self.simone_length = 0.
        return self.state

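For reference, the environment can be sanity-checked with a quick random-action rollout along these lines (the episode count here is arbitrary):

env = SimuMiniEnv_v0()
for episode in range(3):
    obs = env.reset()
    done, steps, total_reward = False, 0, 0.
    while not done:
        action = env.action_space.sample()
        obs, reward, done, info = env.step(action)
        steps += 1
        total_reward += reward
    print(f"episode {episode}: steps={steps}, return={total_reward:.1f}")
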
and here is how I set up my training:

from ray import tune
from ray.tune.registry import register_env

register_env("SimuMini-v0", lambda config: SimuMiniEnv_v0())

results = tune.run(
    "PPO",
    stop={"training_iteration": 300},
    config={"env": "SimuMini-v0", "lr": 1e-4, "vf_clip_param": 1e8},
)

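For context, the reward curve can be pulled out of the returned analysis roughly like this (this assumes the standard RLlib progress columns, which may vary between Ray versions):

import matplotlib.pyplot as plt

# `results` is the ExperimentAnalysis returned by tune.run above
for trial_dir, df in results.trial_dataframes.items():
    plt.plot(df["training_iteration"], df["episode_reward_mean"])
plt.xlabel("training iteration")
plt.ylabel("episode_reward_mean")
plt.show()
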
I’m using Ray 1.12.1 with Python 3.9 (macOS 11, in case that’s useful).