Why is my `rllib.models.torch.torch_modelv2.TorchModelV2` receiving a Tensor of shape (32, <observation size>)?

I wrote a small test environment and model to get a better feel for how Ray RLlib, PyTorch, and gym work together, and I printed observations, actions, and other information from the script. Some of the print statements were unexpected: inside the forward function of my custom rllib.models.torch.torch_modelv2.TorchModelV2, the inputs argument is a SampleBatch of size 32, so the observations I send through the network have shape (32, <observation size>), and I don't know where that 32 comes from. The next few print statements match my expectations better: a SampleBatch of size 1 and an observation of shape (1, <observation size>). I have read through the Ray RLlib documentation but can't find an explanation. It might be specific to the algorithm I am using, but I am not sure. Here is the simple test I wrote; a short standalone PyTorch check of the network follows the script.

from ray import rllib
import gym
import numpy
import ray
import torch

class MoreComplexNetwork (
	rllib.models.torch.torch_modelv2.TorchModelV2,
	torch.nn.Module
) :
	def __init__ (
		self,
		observation_space,
		action_space,
		num_outputs,
		config,
		name
	):
		print( "observation_space: ", observation_space )
		print( "action_space: ", action_space )
		print( "num_outputs: ", num_outputs )
		print( name )
		rllib.models.torch.torch_modelv2.TorchModelV2.__init__ (
			self,
			observation_space,
			action_space,
			num_outputs,
			config,
			name
		)
		torch.nn.Module.__init__( self )
		num_observations = int ( numpy.product ( observation_space.shape ) )
		layers = [
			rllib.models.torch.misc.SlimFC (
				in_size = num_observations,
				out_size = 8,
				initializer = rllib.models.torch.misc.normc_initializer ( 1 ),
				activation_fn = rllib.models.utils.get_activation_fn ( "relu", framework="torch" )
			),
			rllib.models.torch.misc.SlimFC (
				in_size = 8,
				out_size = 8,
				initializer = rllib.models.torch.misc.normc_initializer ( 1 ),
				activation_fn = rllib.models.utils.get_activation_fn ( "relu", framework="torch" )
			),
			rllib.models.torch.misc.SlimFC (
				in_size = 8,
				out_size = num_outputs,
				initializer = rllib.models.torch.misc.normc_initializer ( 1 ),
				activation_fn = rllib.models.utils.get_activation_fn ( "relu", framework="torch" )
			)
		]
		self.main_network = torch.nn.Sequential( *layers )
		self.value_network = rllib.models.torch.misc.SlimFC (
			in_size = num_observations,
			out_size = 1,
			initializer = rllib.models.torch.misc.normc_initializer ( 1 ),
			activation_fn = rllib.models.utils.get_activation_fn ( "relu", framework="torch" )
		)
		self.current_value = None

	def forward( self, inputs, state, seq_lens ):
		# BONUS QUESTION: what is the state argument?
		# It doesn't seem to be any state from the simulation environment.
		# Is it state specific to the forward passes of the network model, used for more complex networks such as RNNs?
		print( "forward inputs: ", inputs )
		print( "forward inputs[obs]: ", inputs["obs"] )
		print( "forward inputs[obs_flat]: ", inputs["obs_flat"] )
		print( "forward state: ", state )
		print( "forward seq_lens: ", seq_lens )
		observation = inputs["obs_flat"].float()
		print( observation.shape )
#		observation = observation.reshape ( observation.shape [ 0 ], -1 )
		print( "forward observation: ", observation )
		logits = self.main_network ( observation )
		print( "forward logits: ", logits )
		self.current_value = self.value_network( observation ).squeeze ( 1 )
		return logits, state

	def value_function( self ):
		print( "value_function result: ", self.current_value )
		print( "-------------------------------------------------------------" )
		return self.current_value


class EnvironmentManager( gym.Env ):
	def __init__ ( self, config ):
		self.observation_space = gym.spaces.Box(
			float("-inf") * numpy.ones(8),
			float("inf") * numpy.ones(8),
			dtype = numpy.float64
		)
		self.action_space = gym.spaces.Box(
			-1 * numpy.ones( 4 ),
			numpy.ones( 4 ),
			dtype = numpy.float64
		)
		self.goals = numpy.array( [ 10, 10, 10, 10, 10, 10, 10, 10 ] )
		self.values = numpy.array( [ 0, 0, 0, 0, 0, 0, 0, 0] )

	def reset ( self ):
		print( "reset!" )
		self.values = numpy.zeros( 8 )  # start each episode from a clean slate
		return ( self.goals - self.values )

	def step ( self, action ):
		print( "step!" )
		print ( "action: ", action )
		self.values = self.values + numpy.array( [ *action, *action ] )
		observations = ( self.goals - self.values )
		reward = 0
		for observation in observations:
			if observation != 0:
				reward = reward + numpy.absolute( 1 / observation )
			else:
				reward = float( "inf" )
		print ( "reward: ", reward )
		is_end_of_episode = numpy.allclose( self.goals, self.values, atol=1 )
		return (
			observations,
			reward,
			is_end_of_episode,
			{}
		)

if __name__ == "__main__":
	ray.init( )
	rllib.models.ModelCatalog.register_custom_model(
		"more_complex_network",
		MoreComplexNetwork
	)
	ray.tune.registry.register_env( "environment_manager", lambda config: EnvironmentManager( config ) )
	ray.tune.run(
		"PPO",
		name = "more_complex_training_test",
		local_dir = "./output",
		checkpoint_at_end = True,
		config = {
			"env": "environment_manager",
			"log_level": "WARN",
			"gamma": 0.95,
			"lambda": 0.95,
			"clip_param": 0.2,
			"kl_coeff": 0.0,
			"vf_clip_param": 100,
			"num_sgd_iter": 2,
			"lr": 0.00001,
			"sgd_minibatch_size": 1,
			"horizon": 5,
			"rollout_fragment_length": 1,
			"train_batch_size": 1,
			"framework": "torch",
			"model": {
				"custom_model": "more_complex_network"
			},
			"num_workers": 0
		},
		stop = {
			"time_total_s": 12,
			"training_iteration": 2
		}
	)
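
To rule out the network itself, here is a minimal standalone check with plain PyTorch (no RLlib). The layer sizes mirror the model above, and I assume num_outputs = 8, which is what PPO's default Gaussian action head should request for a Box of 4 continuous actions (a mean and a log-std per dimension). The same stack of linear layers accepts both batch shapes, so the leading dimension is just whatever batch size RLlib decides to send:

# Standalone sanity check: the network is agnostic to the leading (batch) dimension.
import torch

num_outputs = 8
network = torch.nn.Sequential(
	torch.nn.Linear( 8, 8 ), torch.nn.ReLU(),
	torch.nn.Linear( 8, 8 ), torch.nn.ReLU(),
	torch.nn.Linear( 8, num_outputs )
)

for batch_size in ( 32, 1 ):
	observation = torch.zeros( batch_size, 8 )
	logits = network( observation )
	print( batch_size, logits.shape )  # torch.Size([32, 8]), then torch.Size([1, 8])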

Hi @MrDracoG,

Welcome to the forum.

This post may have the information you are looking for.
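
As a short summary of what I believe is going on there: the size-32 batch comes from a one-off dummy forward pass that RLlib runs while it builds the policy, to work out the model's outputs and view requirements; the size-1 batches you see afterwards are the real rollout steps (one environment, num_workers=0), and during optimization the batch size follows sgd_minibatch_size.

Regarding the bonus question in your code: state is not the environment's state. It is the model's own recurrent state (e.g. LSTM/GRU hidden vectors), and seq_lens carries the sequence lengths when such a model is trained on padded sequences. A feed-forward model just passes the empty list straight back, exactly as you do. A rough, untested sketch of how a recurrent variant could use it with the same forward signature (self.rnn_cell, self.logits_layer and self.value_layer are hypothetical layers you would create in __init__):

	def get_initial_state( self ):
		# RLlib copies this along the batch dimension for each episode
		return [ torch.zeros( 8 ) ]

	def forward( self, inputs, state, seq_lens ):
		observation = inputs["obs_flat"].float()
		hidden = self.rnn_cell( observation, state[ 0 ] )  # e.g. torch.nn.GRUCell( 8, 8 )
		logits = self.logits_layer( hidden )
		self.current_value = self.value_layer( hidden ).squeeze( 1 )
		return logits, [ hidden ]  # the updated state is handed back to RLlib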