`compute_single_action` always gives the same result

I am currently running the following simple corridor example using A2C and Tune, and when I restore the best model and compute actions, the predicted action is always the same no matter what input I provide. Is this the expected behavior, or am I doing something completely wrong by using Tune to train an RLlib model?


import logging

import gym
import ray
from ray import air
from ray import tune
from ray.rllib.algorithms.a2c import A2C
from ray.tune import register_env

logging.basicConfig(format='%(levelname)s:%(name)s: %(message)s (%(asctime)s; %(filename)s:%(lineno)d)',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO)

LOGGER = logging.getLogger(__name__)


class SimpleCorridor(gym.Env):
    """Corridor in which an agent must learn to move right to reach the exit.

    ---------------------
    | S | 1 | 2 | 3 | G |   S=start; G=goal; corridor_length=5
    ---------------------

    Possible actions to choose from are: 0=left; 1=right
    Observations are floats indicating the current field index, e.g. 0.0 for
    the starting position, 1.0 for the field next to the starting position, etc.
    Rewards are -1.0 for all steps, except when reaching the goal (+1.0).
    """

    def __init__(self, config):
        LOGGER.warning(f'Initializing the corridor of length {config["corridor_length"]}')
        self.end_pos = config["corridor_length"]
        self.cur_pos = 0
        self.action_space = gym.spaces.Discrete(2)  # left and right
        self.observation_space = gym.spaces.Box(0.0, self.end_pos, shape=(1,))

    def reset(self, **kwargs):
        """Resets the episode and returns the initial observation of the new one."""
        LOGGER.warning(f'Resetting current position to the beginning')
        self.cur_pos = 0
        return [self.cur_pos]

    def step(self, action):
        """Takes a single step in the episode given `action`

        Returns:
            New observation, reward, done-flag, info-dict (empty).
        """
        # Walk left.
        LOGGER.warning(f'inside step function')
        if action == 0 and self.cur_pos > 0:
            self.cur_pos -= 1
        # Walk right.
        elif action == 1:
            self.cur_pos += 1
        # Set `done` flag when end of corridor (goal) reached.
        done = self.cur_pos >= self.end_pos
        # +1 when goal reached, otherwise -1.
        reward = 1.0 if done else -1.0
        LOGGER.warning(f'Step result: {self.cur_pos}, {reward}, {done}')
        return [self.cur_pos], reward, done, {}


LOGGER.info(f'Registering env')
environment = SimpleCorridor(config={'corridor_length': 5})

LOGGER.critical('If using a remote ray cluster, it should be initialized before the environment is registered, '
                'otherwise the remote ray cluster never learns the name of the registered environment')

register_env('simplecorridor', lambda env_config: environment)

agent_params = {
    "num_workers": 1,
    "horizon": 10,
    "model": {
        [32,32]
    },
    "explore": True,
    "lr": 0.001,
    "entropy_coeff": 0.01,
    "vf_loss_coeff": 0.25,
}

metric_dict = {
    "metric": "episode_reward_mean",
    "mode": "max"
}

exp_name = 'test-experiment'
local_dir = 'tmp/ray2/simplecorridor'

trainer_config = agent_params  # note: this aliases agent_params rather than copying it
trainer_config['env'] = 'simplecorridor'

# concerned with metrics and stuff
tune_config = tune.TuneConfig(
    mode=metric_dict['mode'],
    metric=metric_dict['metric']
)

# concerned with the stopping and the checkpointing logic
run_config = air.RunConfig(
    name=exp_name,
    local_dir=local_dir,
    stop={
        "timesteps_total": 100
    },
    checkpoint_config=air.CheckpointConfig(checkpoint_at_end=True, checkpoint_frequency=5,
                                           num_to_keep=5, checkpoint_score_attribute='episode_reward_mean')
)

experiment_tuner = tune.Tuner(
    trainable=A2C,
    param_space=trainer_config,
    tune_config=tune_config,
    run_config=run_config,
)

ray.init()
result_grid = experiment_tuner.fit()
LOGGER.info('The training is now finished; the best checkpoint as per the metric will be found and loaded. For'
            ' loading it back, we will use the same metric that was passed into the tune config')

# no need to pass a metric or mode, default picked from the experiment_tuner config
best_result = result_grid.get_best_result()
best_checkpoint = best_result.checkpoint

LOGGER.info(f'The best checkpoint path is {str(best_checkpoint)}')
LOGGER.info(f'The directory of the best checkpoint for the current experiment run is {best_checkpoint._local_path}')

LOGGER.info(f'Let us now load back the trainer from the best checkpoint from the path we saw above')

LOGGER.info(f'Checking the path of the best checkpoint available from the trainer object')

# this also restores the configuration that the best checkpoint had
new_trainer_for_inference = A2C.from_checkpoint(best_checkpoint)

# we also have to check how to resume training if the continuous-training flag is enabled via Airflow
LOGGER.info(f'Make a prediction to check if the model works from the loaded checkpoint')

# the predictions are always the same no matter what input observation you pass, be it 6 or 6000
predictions = new_trainer_for_inference.compute_single_action([6])

LOGGER.info(f'The predictions are now working')

Here, for the predictions, it does not matter what value you pass; the action is always the same. I have the same issue with my actual environment, where I always get the same action no matter what the input is. How do I start fixing this?
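To make the question concrete, this is the kind of probe I would add after the script above. It is only a minimal sketch: it assumes `new_trainer_for_inference` and `LOGGER` from the code above, and that `explore=False` and `get_policy()` (both standard in Ray 2.x) are the right way to get deterministic actions and the policy's extra fetches.

import numpy as np

# Query the greedy (explore=False) action for every non-terminal corridor position.
policy = new_trainer_for_inference.get_policy()
for pos in range(5):
    obs = np.array([float(pos)], dtype=np.float32)
    greedy_action = new_trainer_for_inference.compute_single_action(obs, explore=False)
    # Policy.compute_single_action also returns extra fetches; for A2C these
    # usually include the raw action-distribution inputs (logits).
    _, _, extra_fetches = policy.compute_single_action(obs, explore=False)
    LOGGER.info(f'obs={obs} -> greedy action={greedy_action}, extra fetches={extra_fetches}')

If the logits in the extra fetches barely change across observations, that would at least tell me whether the network is ignoring the input, or whether the greedy action simply happens to be the same in every state.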

I’m unclear what the issue is: the predicted optimal action is always 1, i.e. going to the right, which is exactly what the algorithm is supposed to learn here, correct?
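To double-check, you could roll the restored policy out in the environment itself. This is a minimal sketch that reuses the `environment` instance and the restored `new_trainer_for_inference` from your script, with `explore=False` so the greedy action is taken:

# Roll out one episode with the restored, deterministic (explore=False) policy.
obs = environment.reset()
total_reward, done = 0.0, False
for _ in range(20):  # safety cap so a broken policy cannot loop forever
    action = new_trainer_for_inference.compute_single_action(obs, explore=False)
    obs, reward, done, _ = environment.step(action)
    total_reward += reward
    if done:
        break
print(f'Reached goal: {done}, total reward: {total_reward}')  # expect done=True and 4 * (-1.0) + 1.0 = -3.0

If that episode reaches the goal, the checkpoint restore is working as intended: in this corridor the optimal action is 1 in every state, so the observation only changes the value estimate and the action probabilities, not the greedy action itself.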