I wrote a test environment and model to see how Ray RLlib, PyTorch and gym work together, and printed out the observations, actions and other information from the script. I am trying to get a better understanding of how to work with these libraries, but some of the print output is unexpected. On the first call to the forward function of my custom rllib.models.torch.torch_modelv2.TorchModelV2, the inputs argument is a SampleBatch of size 32, and I don't know why that is. As a result, the observations I send through the network have shape (32, < observation size >). The subsequent print statements match my expectation better: I get a SampleBatch of size 1 and an observation of shape (1, < observation size >). I have read through the Ray[RLlib] documentation, but I can't find an explanation. It might be specific to the Ray[RLlib] algorithm I am using (PPO), but I am unsure. Here is the simple test I wrote, for more information (a small standalone shape check follows after the script):
from ray import rllib
import gym
import numpy
import ray
import torch
class MoreComplexNetwork(
    rllib.models.torch.torch_modelv2.TorchModelV2,
    torch.nn.Module
):

    def __init__(
        self,
        observation_space,
        action_space,
        num_outputs,
        config,
        name
    ):
        print( "observation_space: ", observation_space )
        print( "action_space: ", action_space )
        print( "num_outputs: ", num_outputs )
        print( name )
        rllib.models.torch.torch_modelv2.TorchModelV2.__init__(
            self,
            observation_space,
            action_space,
            num_outputs,
            config,
            name
        )
        torch.nn.Module.__init__( self )
        # flattened observation size (8 for the environment below)
        num_observations = int( numpy.prod( observation_space.shape ) )
        layers = [
            rllib.models.torch.misc.SlimFC(
                in_size = num_observations,
                out_size = 8,
                initializer = rllib.models.torch.misc.normc_initializer( 1 ),
                activation_fn = rllib.models.utils.get_activation_fn( "relu", framework="torch" )
            ),
            rllib.models.torch.misc.SlimFC(
                in_size = 8,
                out_size = 8,
                initializer = rllib.models.torch.misc.normc_initializer( 1 ),
                activation_fn = rllib.models.utils.get_activation_fn( "relu", framework="torch" )
            ),
            rllib.models.torch.misc.SlimFC(
                in_size = 8,
                out_size = num_outputs,
                initializer = rllib.models.torch.misc.normc_initializer( 1 ),
                activation_fn = rllib.models.utils.get_activation_fn( "relu", framework="torch" )
            )
        ]
        self.main_network = torch.nn.Sequential( *layers )
        self.value_network = rllib.models.torch.misc.SlimFC(
            in_size = num_observations,
            out_size = 1,
            initializer = rllib.models.torch.misc.normc_initializer( 1 ),
            activation_fn = rllib.models.utils.get_activation_fn( "relu", framework="torch" )
        )
        self.current_value = None
    def forward( self, inputs, state, seq_lens ):
        # BONUS QUESTION: what is the state argument?
        # it doesn't seem to be any state from the simulation environment?
        # is it state specific to the forward passes of the network model used for more complex networks such as RNNs?
        print( "forward inputs: ", inputs )
        print( "forward inputs[obs]: ", inputs["obs"] )
        print( "forward inputs[obs_flat]: ", inputs["obs_flat"] )
        print( "forward state: ", state )
        print( "forward seq_lens: ", seq_lens )
        observation = inputs["obs_flat"].float()
        print( observation.shape )
        # observation = observation.reshape( observation.shape[ 0 ], -1 )
        print( "forward observation: ", observation )
        logits = self.main_network( observation )
        print( "forward logits: ", logits )
        self.current_value = self.value_network( observation ).squeeze( 1 )
        return logits, state

    def value_function( self ):
        print( "value_function result: ", self.current_value )
        print( "-------------------------------------------------------------" )
        return self.current_value
class EnvironmentManager( gym.Env ):

    def __init__( self, config ):
        self.observation_space = gym.spaces.Box(
            float("-inf") * numpy.ones( 8 ),
            float("inf") * numpy.ones( 8 ),
            dtype = numpy.float64
        )
        self.action_space = gym.spaces.Box(
            -1 * numpy.ones( 4 ),
            numpy.ones( 4 ),
            dtype = numpy.float64
        )
        self.goals = numpy.array( [ 10, 10, 10, 10, 10, 10, 10, 10 ] )
        # float values so the observations match the declared float64 observation space
        self.values = numpy.zeros( 8 )

    def reset( self ):
        print( "reset!" )
        # start each episode from scratch
        self.values = numpy.zeros( 8 )
        return ( self.goals - self.values )

    def step( self, action ):
        print( "step!" )
        print( "action: ", action )
        # the 4 action values are applied to both halves of the 8-dimensional state
        self.values = self.values + numpy.array( [ *action, *action ] )
        observations = ( self.goals - self.values )
        reward = 0
        for observation in observations:
            if observation != 0:
                reward = reward + numpy.absolute( 1 / observation )
            else:
                reward = reward + float( "inf" )
        print( "reward: ", reward )
        is_end_of_episode = numpy.allclose( self.goals, self.values, atol=1 )
        return (
            observations,
            reward,
            is_end_of_episode,
            {}
        )
if __name__ == "__main__":
    ray.init()
    rllib.models.ModelCatalog.register_custom_model(
        "more_complex_network",
        MoreComplexNetwork
    )
    ray.tune.registry.register_env( "environment_manager", lambda config: EnvironmentManager( config ) )
    ray.tune.run(
        "PPO",
        name = "more_complex_training_test",
        local_dir = "./output",
        checkpoint_at_end = True,
        config = {
            "env": "environment_manager",
            "log_level": "WARN",
            "gamma": 0.95,
            "lambda": 0.95,
            "clip_param": 0.2,
            "kl_coeff": 0.0,
            "vf_clip_param": 100,
            "num_sgd_iter": 2,
            "lr": 0.00001,
            "sgd_minibatch_size": 1,
            "horizon": 5,
            "rollout_fragment_length": 1,
            "train_batch_size": 1,
            "framework": "torch",
            "model": {
                "custom_model": "more_complex_network"
            },
            "num_workers": 0
        },
        stop = {
            "time_total_s": 12,
            "training_iteration": 2
        }
    )
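
For completeness, here is a minimal standalone shape check (plain torch, no RLlib; the layer sizes mirror the model above and the num_outputs value of 8 is just a placeholder, the real value is whatever RLlib passes to the constructor and is printed by the script). It only shows that the network itself handles a leading batch dimension of 32 or 1 equally well, so my question is purely about where the 32 comes from:

import torch

# stand-in for the SlimFC stack above: 8 -> 8 -> 8 -> num_outputs, relu activations
num_outputs = 8  # placeholder; the script above prints the actual value RLlib passes in
main_network = torch.nn.Sequential(
    torch.nn.Linear( 8, 8 ), torch.nn.ReLU(),
    torch.nn.Linear( 8, 8 ), torch.nn.ReLU(),
    torch.nn.Linear( 8, num_outputs ), torch.nn.ReLU()
)
value_network = torch.nn.Sequential( torch.nn.Linear( 8, 1 ), torch.nn.ReLU() )

for batch_size in ( 32, 1 ):
    observation = torch.zeros( batch_size, 8 )          # same shape as inputs["obs_flat"]
    logits = main_network( observation )                # -> (batch_size, num_outputs)
    value = value_network( observation ).squeeze( 1 )   # -> (batch_size,)
    print( batch_size, logits.shape, value.shape )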