RLlib computed actions don't match the model output

How severe does this issue affect your experience of using Ray?

  • High: It blocks me to complete my task.

I'm trying to train a custom TensorFlow model with RLlib's PG algorithm, initialized from a pretrained TF model. The model file looks like this:

```python
"""A custom model created for RLlib policy gradient implementation."""
import importlib
import os
import typing as T
from pathlib import Path

import gym
import numpy as np
import yaml
from azureml.core import Run
from azureml.core.model import Model
from keras.models import load_model
from keras.optimizers import Adam
from ray.rllib.models.modelv2 import ModelV2
from ray.rllib.models.tf.tf_modelv2 import TFModelV2
from ray.rllib.utils.annotations import override
from ray.rllib.utils.framework import try_import_tf

tf1, tf, tfv = try_import_tf()

# Assumption: model artifacts land in the current working directory, matching
# the `download(target_dir=os.getcwd())` call below.
BASE_DIR = os.getcwd()


class RBCModel(TFModelV2):
    """Using the RBC model implementation and freezing final layers."""

    def __init__(
        self,
        observation_space: gym.spaces.Space,
        action_space: gym.spaces.Space,
        num_outputs: int,
        model_config: dict,
        name: str,
    ):
        """Init method for the custom RLlib model."""
        super().__init__(
            observation_space, action_space, num_outputs, model_config, name
        )
        # Download the model from the registry if it is not already on disk.
        custom_config = model_config["custom_model_config"]
        model_name = custom_config["model_name"]
        model_version = custom_config["model_version"]
        trainable_upto = custom_config["last_trainable_layer"]
        trainable_upto_block = custom_config["last_trainable_block"]
        model_dir = os.path.join(BASE_DIR, model_name)
        if not os.path.exists(model_dir):
            run = Run.get_context()
            ws = run.experiment.workspace
            rbc_model = Model(workspace=ws, name=model_name, version=model_version)
            rbc_model.download(target_dir=os.getcwd())

        self.base_model = load_model(model_dir, compile=False)

        # Re-compile with the loss/metric factories declared in the model's
        # loss_function.yaml, resolved dynamically from the model package.
        with Path(model_dir).joinpath("loss_function.yaml").open("r") as f:
            loss_config = yaml.safe_load(f)
        loss_module = importlib.import_module(
            f".{loss_config['module_name']}", package=model_name
        )
        self.base_model.compile(
            optimizer=Adam(learning_rate=1e-4),
            loss=getattr(loss_module, loss_config["loss_function"])(
                **loss_config["loss_init_args"]
            ),
            metrics=getattr(loss_module, loss_config["metric_function"])(
                **loss_config["metric_init_args"]
            ),
        )
        self.base_model.summary()

    @override(ModelV2)
    def forward(self, input_dict: dict, state: T.Any, seq_lens: T.Any):
        """Perform one forward pass over the base model."""
        batch_size = input_dict["obs"]["speed"].shape[0]
        # Reshape the scalar observation components to (batch, 1) for the model.
        model_inputs = (
            input_dict["obs"]["image"],
            tf.reshape(input_dict["obs"]["speed"], (batch_size, 1)),
            tf.reshape(input_dict["obs"]["angle_prev"], (batch_size, 1)),
            tf.reshape(input_dict["obs"]["throttle_prev"], (batch_size, 1)),
            tf.reshape(input_dict["obs"]["brake_prev"], (batch_size, 1)),
            tf.reshape(input_dict["obs"]["target_speed"], (batch_size, 1)),
        )
        model_out = self.base_model(model_inputs)
        # tf.print logs the actual values in both eager and graph mode.
        tf.print("Model output:", model_out, "shape:", tf.shape(model_out))
        return model_out, []

    @override(ModelV2)
    def trainable_variables(self, as_dict: bool = False):
        variable_list = self.base_model.trainable_variables
        if not as_dict:
            return variable_list
        return {var.name: var for var in variable_list}

```
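For reference, the model is registered and plugged into PG roughly like this (a sketch: the env id and the `custom_model_config` values are placeholders, not my real ones):

```python
from ray.rllib.algorithms.pg import PGConfig
from ray.rllib.models import ModelCatalog

ModelCatalog.register_custom_model("rbc_model", RBCModel)

config = (
    PGConfig()
    .environment("MyDrivingEnv-v0")  # placeholder env id
    .framework("tf")
    .training(
        model={
            "custom_model": "rbc_model",
            "custom_model_config": {
                "model_name": "rbc",                # placeholder
                "model_version": 1,                 # placeholder
                "last_trainable_layer": "dense_2",  # placeholder
                "last_trainable_block": "block_3",  # placeholder
            },
        }
    )
)
algo = config.build()
```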
While the forward pass prints the expected values, the action returned by `compute_actions()` does not match them.
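As far as I understand the ModelV2 stack, the model output is not used as the action directly; it parameterizes the policy's action distribution, and with `explore=False` the action should be that distribution's deterministic sample. A sketch of the flow as I understand it (`obs_batch` is a placeholder batched observation):

```python
policy = algo.get_policy()
logits, _ = policy.model({"obs": obs_batch})    # the tensor forward() returns
dist = policy.dist_class(logits, policy.model)  # e.g. DiagGaussian for a Box space
action = dist.deterministic_sample()            # what explore=False should yield
```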

Model output:

  • Tensor size: [1, 2]
  • Dtype: float32
  • Example: [[0.5, 0.5]]

Action space: Box(np.array([-1.0, -1.0]), np.array([1.0, 1.0]), dtype=np.float32)

Computed action:

  • Tensor size: [1, 2]
  • Dtype: float32

I have also set `explore=False` to rule out exploration noise.
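A minimal repro sketch (`env` and `algo` are placeholders for my custom environment and the built PG algorithm):

```python
obs = env.observation_space.sample()  # placeholder single observation
action = algo.compute_single_action(obs, explore=False)
print(action)  # always comes back as [0.5, 0.5]
```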

For a model output of [[0.5, 0.6]] I expect the action [0.5, 0.6], but it always comes out as [0.5, 0.5]: the first element of the model output is repeated and the second is lost.
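One detail that may be relevant (a sketch, assuming the default ModelCatalog behavior for Box spaces): RLlib's DiagGaussian action distribution expects the model to emit 2 * action_dim outputs (a mean and a log-std per dimension), so for this action space the expected `num_outputs` would be 4, not 2:

```python
import numpy as np
import gymnasium as gym  # Ray 2.x uses gymnasium spaces
from ray.rllib.models import ModelCatalog

action_space = gym.spaces.Box(
    np.array([-1.0, -1.0]), np.array([1.0, 1.0]), dtype=np.float32
)
dist_cls, expected_outputs = ModelCatalog.get_action_dist(
    action_space, config=None, framework="tf"
)
print(dist_cls.__name__, expected_outputs)  # DiagGaussian 4
```

Could the [1, 2] model output be getting re-interpreted (e.g. split into mean and log-std) rather than used as the action itself?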

Versions / Dependencies

protobuf==3.20.0
ray[rllib]==2.7.0
scp==0.14.5
tensorboard==2.9.0
fsspec==2023.6.0
tensorflow-gpu==2.9
azureml-sdk==1.52.0
psutil==5.9.0
matplotlib==3.7.2