I was wondering if there is a way to have a dynamic entropy schedule that does not have to be fixed before training starts, but can instead be adjusted at runtime based on arbitrary metrics.
I experimented a bit and came up with the following minimal example script for CartPole-v1 with PPO:
```python
from types import MethodType

import ray
import torch
from ray.rllib.algorithms.algorithm_config import AlgorithmConfig
from ray.rllib.algorithms.callbacks import DefaultCallbacks
from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.core.rl_module.multi_rl_module import MultiRLModuleSpec
from ray.rllib.core.rl_module.rl_module import RLModule, RLModuleSpec


class EntropyScheduleCallback(DefaultCallbacks):
    def on_train_result(self, *, algorithm, result, **kwargs):
        # Pick the entropy coefficient based on the current mean episode return.
        mean_reward = result["env_runners"].get("episode_return_mean")
        if mean_reward is None:
            return
        if mean_reward > 450:
            new_entropy_coeff = 0.01
        elif mean_reward > 300:
            new_entropy_coeff = 0.05
        else:
            new_entropy_coeff = 0.1
        print(f"[Callback] Scheduling entropy coeff = {new_entropy_coeff}")
        # Reach into the (local) learner and call the patched setter.
        learner = algorithm.learner_group._learner
        learner.set_entropy_coeff(new_entropy_coeff)


def custom_learner_factory(
    config: AlgorithmConfig,
    module_spec: RLModuleSpec | MultiRLModuleSpec | None = None,
    module: RLModule | None = None,
):
    # Build the algorithm's default learner and patch a setter method onto it.
    learner_cls = config.get_default_learner_class()
    base_learner = learner_cls(config=config, module_spec=module_spec, module=module)

    def set_entropy_coeff(self, new_value: float):
        # "Hijack" the per-module entropy-coeff schedulers and overwrite their
        # (private) current value directly.
        for module_id, scheduler in getattr(
            self, "entropy_coeff_schedulers_per_module", {}
        ).items():
            scheduler._curr_value = torch.tensor(float(new_value))
        setattr(self, "_last_set_entropy_coeff", float(new_value))
        return True

    base_learner.set_entropy_coeff = MethodType(set_entropy_coeff, base_learner)
    return base_learner


def main():
    ray.init(runtime_env={"env_vars": {"RAY_DEBUG": "1"}})
    config = (
        PPOConfig()
        .environment(env="CartPole-v1")
        .env_runners(num_env_runners=7)
        .framework("torch")
        .training(learner_class=custom_learner_factory)
        .callbacks(EntropyScheduleCallback)
    )
    algo = config.build()
    for i in range(50):
        result = algo.train()
        print(
            f"Iteration {i + 1}: episode_reward_mean="
            f"{result['env_runners']['episode_return_mean']}"
        )
        print(
            f"entropy coefficient: "
            f"{result['learners']['default_policy']['curr_entropy_coeff']}"
        )
    algo.stop()
    ray.shutdown()


if __name__ == "__main__":
    main()
```
Taking advantage of the new Learner API, I patch an entropy-coefficient setter method onto the learner instance via a custom learner factory. The dynamic entropy schedule itself lives in an `on_train_result` callback that invokes this patched setter. The actual coefficient is then changed in a somewhat hacky way by “hijacking” the entropy scheduler and overwriting its (private) current value directly.
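One caveat I noticed: `algorithm.learner_group._learner` is only populated when the learner runs locally (the default, as far as I can tell). With remote learner actors I assume you would have to go through the learner group instead, e.g. via `LearnerGroup.foreach_learner()` if your Ray version provides it. Untested sketch of what the callback body might look like in that case (the patched `set_entropy_coeff()` should exist on every learner, since the factory runs wherever the learner is built):

```python
# Untested sketch, assuming LearnerGroup.foreach_learner() is available.
def on_train_result(self, *, algorithm, result, **kwargs):
    mean_reward = result["env_runners"].get("episode_return_mean")
    if mean_reward is None:
        return
    if mean_reward > 450:
        new_entropy_coeff = 0.01
    elif mean_reward > 300:
        new_entropy_coeff = 0.05
    else:
        new_entropy_coeff = 0.1
    # Apply the patched setter on every learner (local or remote).
    algorithm.learner_group.foreach_learner(
        lambda learner: learner.set_entropy_coeff(new_entropy_coeff)
    )
```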
I am posting this because I think it might be helpful to others, and because I was wondering whether such functionality is planned for future Ray versions. The concept could also be generalized to all kinds of hyperparameters, but I haven’t found a better approach than “hijacking” the scheduler so far; a rough sketch of what I mean is below.
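Here is the generalized version of the hack, parameterized by the name of the per-module scheduler dict on the learner. Both the helper and its argument are my own invention, not an RLlib API, so treat this as an untested sketch patched onto the learner the same way as `set_entropy_coeff()` above:

```python
import torch


# Hypothetical generic setter. `scheduler_attr` names a per-module Scheduler
# dict on the learner, e.g. "entropy_coeff_schedulers_per_module".
def set_scheduled_value(self, scheduler_attr: str, new_value: float) -> bool:
    schedulers = getattr(self, scheduler_attr, None)
    if not schedulers:
        return False
    for module_id, scheduler in schedulers.items():
        # Same hack as above: overwrite the scheduler's private current value.
        scheduler._curr_value = torch.tensor(float(new_value))
    return True


# Usage from the callback, e.g.:
#   learner.set_scheduled_value("entropy_coeff_schedulers_per_module", 0.05)
```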
I need this functionality because I am doing curriculum learning, where it might be beneficial to adjust the entropy coefficient dynamically to escape local optima as the curriculum progresses.