I found the answer with the help of this RLlib custom-policy example:
from typing import Type
from ray.rllib.agents.trainer import Trainer
from ray.rllib.policy.policy import Policy
from ray.rllib.utils.typing import ModelWeights, TrainerConfigDict
class FIFO(Policy):
    """Heuristic FIFO (first-in-first-out) dispatching policy.

    A fixed rule-based policy: it holds no trainable model and performs no
    learning, it only maps observations to actions.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # No neural network is needed for a fixed heuristic.
        self.model = None
        self.exploration = self._create_exploration()

    def compute_actions(self,
                        obs_batch,
                        state_batches=None,
                        prev_action_batch=None,
                        prev_reward_batch=None,
                        info_batch=None,
                        episodes=None,
                        **kwargs):
        """Return one action per observation in *obs_batch*.

        RLlib expects a 3-tuple ``(actions, rnn_state_outs, extra_info)``;
        the original stub returned ``...`` (Ellipsis), which would crash the
        rollout worker as soon as it tried to unpack the result.
        """
        # TODO: replace the placeholder 0-action with real FIFO dispatching
        # logic (pick the transport order that entered the queue first).
        actions = [0 for _ in obs_batch]
        return actions, [], {}

    def learn_on_batch(self, samples):
        """No-op: a fixed heuristic has nothing to learn."""
        return {}  # return (empty) learner stats

    def get_weights(self) -> ModelWeights:
        """No weights to save."""
        return {}

    def set_weights(self, weights: ModelWeights) -> None:
        """No weights to restore (counterpart of ``get_weights``)."""
        pass
class FIFOTrainer(Trainer):
    """Trainer whose default policy is the rule-based ``FIFO`` heuristic."""

    def get_default_policy_class(self, config: TrainerConfigDict) -> Type[Policy]:
        # Hand RLlib the FIFO class; it instantiates one policy per worker.
        return FIFO