I am trying to write a custom action sampler and use it to train the PPO agent. However, the approach described in the documentation section "Extending Existing Policies" seems to have no effect: the default policy is still used for training.
The dummy example is shown below:
import ray
import ray.rllib.agents.ppo as ppo
from ray.tune.logger import pretty_print
from ray.rllib.agents.ppo.ppo_torch_policy import PPOTorchPolicy
from ray.rllib.agents.ppo import PPOTrainer


def build_action_sampler(policy, model, input_dict, state, explore, timestep):
    # Degenerate sampler: always emit action 0. If this override were
    # actually picked up, the agent should crash or fail to learn —
    # which is exactly how we test whether the custom sampler is used.
    return 0, None, None


# Build a policy class with the custom sampler plugged in, then a
# trainer class that uses that policy by default.
CustomPPOTorchPolicy = PPOTorchPolicy.with_updates(
    name="CustomPPOTorchPolicy",
    action_sampler_fn=build_action_sampler,
)
CustomTrainer = PPOTrainer.with_updates(
    default_policy=CustomPPOTorchPolicy,
)

# Minimal single-worker, CPU-only PPO configuration on the torch backend.
config = ppo.DEFAULT_CONFIG.copy()
config["num_gpus"] = 0
config["num_workers"] = 1
config["framework"] = "torch"

trainer = CustomTrainer(config=config, env="CartPole-v0")
for _ in range(1000):
    result = trainer.train()
    print(pretty_print(result))
What’s happening here?