Using the current pip installs of ray[debug] and ray[rllib], here is minimal reproducible example #1, which uses dictionary observations. It fails with the following error:

RuntimeError: Expected hidden[0] size (1, 1, 256), got [1, 32, 256]
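For context, here is my understanding of the shapes torch.nn.LSTM works with (a standalone sketch, nothing RLlib-specific), since the mismatch above looks like a batch/time mix-up:

import torch
import torch.nn as nn

lstm = nn.LSTM(input_size=32, hidden_size=256, batch_first=True)
x = torch.zeros(32, 5, 32)    # (batch, time, features) with batch_first=True
h0 = torch.zeros(1, 32, 256)  # hidden state is (num_layers, batch, hidden), even with batch_first=True
c0 = torch.zeros(1, 32, 256)
out, (h, c) = lstm(x, (h0, c0))  # OK: hidden batch dim (32) matches the input batch dim
# If x were reshaped to batch size 1 while h0/c0 kept batch size 32, PyTorch would
# raise exactly the error above: Expected hidden[0] size (1, 1, 256), got [1, 32, 256].

With that in mind, the full reproduction: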
import sys
import numpy as np
import gym
from gym import spaces
from ray.rllib.models import ModelCatalog
from ray.tune.registry import register_env
from ray import tune
from ray.rllib.models.modelv2 import restore_original_dimensions
from ray.rllib.models.modelv2 import ModelV2
from ray.rllib.models.preprocessors import get_preprocessor
from ray.rllib.models.tf.recurrent_net import RecurrentNetwork
from ray.rllib.models.torch.recurrent_net import RecurrentNetwork as TorchRNN
from ray.rllib.utils.annotations import override
from ray.rllib.utils.framework import try_import_torch
torch, nn = try_import_torch()
class TestingGym(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, timesteps=5):
        self.timesteps = timesteps
        super(TestingGym, self).__init__()
        self.reward_range = (-1000, 1000)
        self.action_space = spaces.Box(low=np.array([0, 0]), high=np.array([4, 1]))
        self.done_counter = 0
        self.input_1_shape = (16,)
        self.input_2_shape = (16,)
        self.observation_space = spaces.Dict(
            dict(
                input_1=spaces.Box(low=-np.inf, high=np.inf, shape=self.input_1_shape, dtype=np.float32),
                input_2=spaces.Box(low=-np.inf, high=np.inf, shape=self.input_2_shape, dtype=np.float32),
            )
        )

    def get_observation(self):
        # Random observations are fine here; only the spaces matter for the repro.
        return dict(
            input_1=np.random.random(self.input_1_shape),
            input_2=np.random.random(self.input_2_shape),
        )

    def step(self, action):
        self.done_counter += 1
        done = self.done_counter > 1000
        return self.get_observation(), 1, done, {}

    def reset(self):
        self.done_counter = 0
        return self.get_observation()


def env_creator(env_config):
    return TestingGym()
class TorchRNNModel(TorchRNN, nn.Module):
    def __init__(self, obs_space, action_space, num_outputs, model_config,
                 name, fc_size=32, lstm_state_size=256):
        nn.Module.__init__(self)
        super().__init__(obs_space, action_space, num_outputs, model_config, name)
        self.obs_space = obs_space
        self.obs_size = get_preprocessor(obs_space)(obs_space).size
        self.fc_size = fc_size
        self.lstm_state_size = lstm_state_size
        # One encoder branch per dict key, joined before the LSTM.
        self.input_1_fc = nn.Linear(16, 16)
        self.input_2_fc = nn.Linear(16, 16)
        self.fc1 = nn.Linear(32, self.fc_size)
        self.lstm = nn.LSTM(self.fc_size, self.lstm_state_size, batch_first=True)
        self.action_branch = nn.Linear(self.lstm_state_size, num_outputs)
        self.value_branch = nn.Linear(self.lstm_state_size, 1)
        self._features = None

    @override(ModelV2)
    def get_initial_state(self):
        h = [
            self.fc1.weight.new(1, self.lstm_state_size).zero_().squeeze(0),
            self.fc1.weight.new(1, self.lstm_state_size).zero_().squeeze(0),
        ]
        return h

    @override(ModelV2)
    def value_function(self):
        assert self._features is not None, "must call forward() first"
        return torch.reshape(self.value_branch(self._features), [-1])

    @override(TorchRNN)
    def forward_rnn(self, inputs, state, seq_lens):
        # Unflatten the obs back into the original dict structure.
        original_obs = restore_original_dimensions(
            torch.squeeze(inputs, 1), self.obs_space, "torch")
        x1 = nn.functional.relu(self.input_1_fc(original_obs['input_1']))
        x2 = nn.functional.relu(self.input_2_fc(original_obs['input_2']))
        # Join the branch outputs.
        x = torch.cat((x2, x1), dim=1)
        x = nn.functional.relu(self.fc1(x))
        x = torch.unsqueeze(x, 0)
        self._features, [h, c] = self.lstm(
            x, [torch.unsqueeze(state[0], 0),
                torch.unsqueeze(state[1], 0)])
        action_out = self.action_branch(self._features)
        return action_out, [torch.squeeze(h, 0), torch.squeeze(c, 0)]
import ray
import ray.rllib.agents.a3c as a3c
from ray.tune.logger import pretty_print
from IPython.display import clear_output

ray.shutdown(); ray.init()
ModelCatalog.register_custom_model("torch_rnn_model", TorchRNNModel)
tune.registry.register_env("TestingGym", env_creator)
trainer = a3c.A2CTrainer(
    env="TestingGym",
    config={
        "num_workers": 1,
        "lr": 0.000001,
        "framework": "torch",
        "model": {"custom_model": "torch_rnn_model"},
    },
)
for i in range(10):
    result = trainer.train()
    clear_output()
    print(pretty_print(result))
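For what it's worth, my reading of ray.rllib.models.torch.recurrent_net is that RecurrentNetwork.forward() calls add_time_dimension() before delegating, so forward_rnn() receives inputs of shape (batch, max_seq_len, obs_size). If that's right, the squeeze/unsqueeze in my forward_rnn above mangles the batch and time axes. A hedged sketch of what I think forward_rnn should look like instead (an assumption, not a confirmed fix; it also ignores seq_lens padding for brevity):

@override(TorchRNN)
def forward_rnn(self, inputs, state, seq_lens):
    # inputs: (batch, max_seq_len, obs_size), already time-padded by the wrapper.
    B, T = inputs.shape[0], inputs.shape[1]
    obs = restore_original_dimensions(
        inputs.reshape(B * T, -1), self.obs_space, "torch")  # encode per timestep
    x1 = nn.functional.relu(self.input_1_fc(obs['input_1']))
    x2 = nn.functional.relu(self.input_2_fc(obs['input_2']))
    x = nn.functional.relu(self.fc1(torch.cat((x2, x1), dim=1)))
    x = x.reshape(B, T, self.fc_size)  # restore (batch, time, features) for the LSTM
    self._features, [h, c] = self.lstm(
        x, [state[0].unsqueeze(0), state[1].unsqueeze(0)])  # hidden: (1, B, hidden)
    return self.action_branch(self._features), [h.squeeze(0), c.squeeze(0)]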
Here is a second attempt, this time using TorchModelV2 with use_lstm or use_attention; both fail. use_lstm fails with:

RuntimeError: input.size(-1) must be equal to input_size. Expected 32, got 16

The auto-wrapped LSTM apparently expects the original flattened observation size (32) as its input, even though our forward() has already encoded the observation down to 16 features. This is the biggest issue for us, because we want to process the observation first and only then pass it to the LSTM (my guess at the cause is sketched after the code below). Are we missing something?
from IPython.display import clear_output
import torch
import torch.nn as nn
import ray
from ray import tune
from ray.tune.logger import pretty_print
from ray.rllib.agents import ppo
from ray.rllib.models import ModelCatalog
from ray.rllib.models.modelv2 import ModelV2
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.rllib.utils.annotations import override


class CustomTorchModel(TorchModelV2, nn.Module):
    def __init__(self, obs_space, action_space, num_outputs, model_config, name):
        TorchModelV2.__init__(self, obs_space, action_space, num_outputs, model_config, name)
        nn.Module.__init__(self)
        self.input_1_fc = nn.Linear(16, 16)
        self.input_2_fc = nn.Linear(16, 16)
        self.fc1 = nn.Linear(32, 16)
        num_outputs = 2  # note: this only rebinds the local name, not self.num_outputs
        # self.action_branch = nn.Linear(32, num_outputs)
        self.value_branch = nn.Linear(32, 1)
        # self._logits = ...
        self._features = None

    def forward(self, input_dict, state, seq_lens):
        x1 = nn.functional.relu(self.input_1_fc(input_dict['obs']['input_1']))
        x2 = nn.functional.relu(self.input_2_fc(input_dict['obs']['input_2']))
        x = torch.cat((x2, x1), dim=1)
        self._features = nn.functional.relu(self.fc1(x))
        # action_out = self.action_branch(self._features)
        return self._features, state

    @override(ModelV2)
    def value_function(self):
        assert self._features is not None, "must call forward() first"
        return torch.reshape(self.value_branch(self._features), [-1])


ray.shutdown(); ray.init()
ModelCatalog.register_custom_model("my_torch_model", CustomTorchModel)
tune.registry.register_env("TestingGym", env_creator)
trainer = ppo.PPOTrainer(
    env="TestingGym",
    config={
        "framework": "torch",
        "model": {
            "use_lstm": True,
            "custom_model": "my_torch_model",
        },
    },
)
for i in range(10):
    result = trainer.train()
    clear_output()
    print(pretty_print(result))
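As flagged above, here is my guess at the cause. My assumption (from skimming the LSTM wrapper code) is that use_lstm sizes its LSTM input from the wrapped model's self.num_outputs, which here stays at the flattened obs size (32) because the bare num_outputs = 2 only rebinds a local name. A sketch of the constructor change I think is expected, under that assumption:

class CustomTorchModel(TorchModelV2, nn.Module):
    def __init__(self, obs_space, action_space, num_outputs, model_config, name):
        TorchModelV2.__init__(self, obs_space, action_space, num_outputs, model_config, name)
        nn.Module.__init__(self)
        self.input_1_fc = nn.Linear(16, 16)
        self.input_2_fc = nn.Linear(16, 16)
        self.fc1 = nn.Linear(32, 16)
        # Assumption: the wrapper reads self.num_outputs to size its LSTM input,
        # so advertise the actual feature width instead of rebinding a local name.
        self.num_outputs = 16
        self.value_branch = nn.Linear(16, 1)  # sized to the 16-wide features
        self._features = None

forward() would stay as above and keep returning the 16-wide self._features.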
Please correct me; be critical and point out the obvious things I'm missing, plus any other mistakes you see, like returning self._features directly or commenting out the action_branch. Side question: aren't self._logits and self._features the same thing? I understand the logits to be the raw output of the final layer, so perhaps self._features is the designated output of the hidden layers.
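For concreteness, here is how I currently picture that split (a toy sketch, not RLlib internals):

import torch
import torch.nn as nn

hidden = nn.Linear(32, 16)        # last hidden layer
logits_layer = nn.Linear(16, 2)   # final output layer

x = torch.zeros(4, 32)
features = torch.relu(hidden(x))  # what I'd store as self._features
logits = logits_layer(features)   # raw pre-softmax scores, i.e. self._logits

If that picture is right, they are different tensors: the logits are one more linear projection of the features.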