Send and tune additional parameters in a custom algorithm object

Since initial experiments have failed, @avnishn, how do we:

  • send additional parameters to a custom algorithm object? For now, I’ve put them into custom_model_config, because if I put them anywhere else Tune gives me the Unknown config parameter error.
  • once we have the above, make these parameters tunable? Here’s what I tried:
    alg_config = config.to_dict()
    # note, policies are defined in the multi-agent dictionary / function call
    alg_config['model']['custom_model_config'] = {
        'k': args.k,
        's': args.s,
        'mc_threshold': args.mc_threshold,
        'meta_reward': args.meta_reward,
        'win_rate_threshold': args.win_rate_threshold,
    }

    perturbation_interval = 10
    pbt = PopulationBasedTraining(
        time_attr="training_iteration",
        perturbation_interval=perturbation_interval,
        hyperparam_mutations={
            # distribution for resampling
            'model/custom_model_config/k': tune.grid_search([1, 5, 10, 25, 100]),
            'model/custom_model_config/s': tune.grid_search([10, 25, 50, 100]),
            'model/custom_model_config/mc_threshold': tune.grid_search([0.0, 0.25, 0.5, 0.8, 0.9, 0.95]),
            'model/custom_model_config/win_rate_threshold': tune.grid_search([0.5, 0.75, 0.85, 0.95]),
        },
    )

Sadly, that hasn’t worked:

[2023-03-27 14:46:47,224 E 12352 31608] core_worker.cc:1449: Pushed Error with JobID: 01000000 of type: task with message: ray::CustomPPO.__init__() (pid=12352, ip=127.0.0.1, repr=CustomPPO)
  File "python\ray\_raylet.pyx", line 859, in ray._raylet.execute_task
  File "python\ray\_raylet.pyx", line 863, in ray._raylet.execute_task
  File "python\ray\_raylet.pyx", line 810, in ray._raylet.execute_task.function_executor
  File "D:\miniconda\envs\mapo\lib\site-packages\ray\_private\function_manager.py", line 674, in actor_method_executor
    return method(__ray_actor, *args, **kwargs)
  File "D:\miniconda\envs\mapo\lib\site-packages\ray\util\tracing\tracing_helper.py", line 466, in _resume_span
    return method(self, *_args, **_kwargs)
  File "D:\PycharmProjects\ubc\mapo\custom_ppo.py", line 74, in __init__
    super().__init__(config, env, logger_creator)
  File "D:\miniconda\envs\mapo\lib\site-packages\ray\rllib\algorithms\algorithm.py", line 414, in __init__
    super().__init__(config=config, logger_creator=logger_creator, **kwargs)
  File "D:\miniconda\envs\mapo\lib\site-packages\ray\tune\trainable\trainable.py", line 161, in __init__
    self.setup(copy.deepcopy(self.config))
  File "D:\miniconda\envs\mapo\lib\site-packages\ray\util\tracing\tracing_helper.py", line 466, in _resume_span
    return method(self, *_args, **_kwargs)
  File "D:\miniconda\envs\mapo\lib\site-packages\ray\rllib\algorithms\algorithm.py", line 439, in setup
    self.config = self.merge_trainer_configs(
  File "D:\miniconda\envs\mapo\lib\site-packages\ray\rllib\algorithms\algorithm.py", line 2228, in merge_trainer_configs
    return deep_update(
  File "D:\miniconda\envs\mapo\lib\site-packages\ray\_private\dict.py", line 60, in deep_update
    raise Exception("Unknown config parameter `{}` ".format(k))
Exception: Unknown config parameter `model/custom_model_config/k` at time: 1.67994e+09

Here’s an example of the tune call:

results = tune.Tuner(
        CustomPPO,
        param_space=alg_config,
        tune_config=tune.TuneConfig(
            metric="batch_meta_reward",
            mode='min',
            scheduler=pbt,
        ),
        run_config=air.RunConfig(
            name="mapo_rpsw",
            stop=stop,
            verbose=0,
            local_dir=os.path.join('..', 'results'),
            progress_reporter=CLIReporter(
                metric_columns={
                    "training_iteration": "iter",
                    "time_total_s": "time_total_s",
                    "timesteps_total": "ts",
                    "episodes_this_iter": "train_episodes",
                    "policy_reward_mean/main0": "reward",
                    "batch_meta_reward": "batch_meta_reward",
                    "win_rate": "win_rate",
                    "league_size": "league_size",
                },
                sort_by_metric=True,
            ),
            checkpoint_config=air.CheckpointConfig(
                checkpoint_at_end=True,
                checkpoint_frequency=10,
            ),
        ),
    ).fit()

And here is the __init__ of the custom PPO object:

class CustomPPO(PPO):
    def __init__(self, config=None, env=None, logger_creator=None):
        # todo make this a real replay buffer object to take
        #  advantage of things like prioritized sampling to determine
        #  which samples to use for meta reward-shaping / learning
        self.reward_buffer = OrderedDict()
        self.seen_snapshot_ids = set()
        self.batch_order = []
        self.passed_mc = False
        self.meta_reward_value = 0
        self.n_opponents = 0
        # configs passed in through the custom_model_config
        # cmc = config['multiagent']['policies'][config['env_config']["players_ids"][1]][3]
        custom_model_config = config.get('model', {}).get('custom_model_config', {})
        # knn to calculate meta-reward against
        self.k = custom_model_config.get("k", 5)
        # whether to use the meta-reward or not
        self.meta_reward = custom_model_config.get("meta_reward", True)
        # hyperparameters for the self-play callback
        # how often to snapshot the current policy
        self.s = custom_model_config.get("s", 20)
        # when applicable, above what win-rate threshold should we snapshot the policy
        self.win_rate_threshold = custom_model_config.get("win_rate_threshold", 0.95)
        # above what win-rate threshold should we switch to the meta-reward
        self.mc_threshold = custom_model_config.get("mc_threshold", 0.8)

        # at the end so that the self-play callback has access to these attributes above
        super().__init__(config, env, logger_creator)

        # initialize a worker to compute rewards on demand
        policy_config = {'model': {'custom_model': 'mlp'}}
        # custom rollout worker: band-aid fix for rllib not
        # letting me do e.g., algorithm.evaluate(p1, p2)
        self.crw = CRW(self.env_creator,
                       config['env_config'],
                       policy_config)

I tried writing a custom PPOConfig object with the parameters, but got the same error:

[2023-03-27 22:55:59,076 E 8404 14056] core_worker.cc:1449: Pushed Error with JobID: 01000000 of type: task with message: ray::CustomPPO.__init__() (pid=8404, ip=127.0.0.1, repr=CustomPPO)
  File "python\ray\_raylet.pyx", line 859, in ray._raylet.execute_task
  File "python\ray\_raylet.pyx", line 863, in ray._raylet.execute_task
  File "python\ray\_raylet.pyx", line 810, in ray._raylet.execute_task.function_executor
  File "D:\miniconda\envs\mapo\lib\site-packages\ray\_private\function_manager.py", line 674, in actor_method_executor
    return method(__ray_actor, *args, **kwargs)
  File "D:\miniconda\envs\mapo\lib\site-packages\ray\util\tracing\tracing_helper.py", line 466, in _resume_span
    return method(self, *_args, **_kwargs)
  File "D:\PycharmProjects\ubc\mapo\custom_ppo.py", line 104, in __init__
    super().__init__(config, env, logger_creator)
  File "D:\miniconda\envs\mapo\lib\site-packages\ray\rllib\algorithms\algorithm.py", line 414, in __init__
    super().__init__(config=config, logger_creator=logger_creator, **kwargs)
  File "D:\miniconda\envs\mapo\lib\site-packages\ray\tune\trainable\trainable.py", line 161, in __init__
    self.setup(copy.deepcopy(self.config))
  File "D:\miniconda\envs\mapo\lib\site-packages\ray\util\tracing\tracing_helper.py", line 466, in _resume_span
    return method(self, *_args, **_kwargs)
  File "D:\miniconda\envs\mapo\lib\site-packages\ray\rllib\algorithms\algorithm.py", line 439, in setup
    self.config = self.merge_trainer_configs(
  File "D:\miniconda\envs\mapo\lib\site-packages\ray\rllib\algorithms\algorithm.py", line 2228, in merge_trainer_configs
    return deep_update(
  File "D:\miniconda\envs\mapo\lib\site-packages\ray\_private\dict.py", line 60, in deep_update
    raise Exception("Unknown config parameter `{}` ".format(k))
Exception: Unknown config parameter `k` at time: 1.67997e+09

Here’s the custom config:

class CustomPPOConfig(PPOConfig):
    def __init__(self, algo_class=None):
        super().__init__(algo_class=algo_class)
        # knn to calculate meta-reward against
        self.k = 5
        # whether to use the meta-reward or not
        self.meta_reward = True
        # hyperparameters for the self-play callback
        # how often to snapshot the current policy
        self.s = 20
        # when applicable, above what win-rate threshold should we snapshot the policy
        self.win_rate_threshold = 0.95
        # above what win-rate threshold should we switch to the meta-reward
        self.mc_threshold = 0.8

    def hparams(self, *,
                knn: Optional[int] = 5,
                meta_reward: Optional[bool] = True,
                snapshot_timer: Optional[int] = 25,
                win_rate_threshold: Optional[float] = 0.95,
                mc_threshold: Optional[float] = 0.8) -> "PPOConfig":
        """Returns a copy of this config with the given hyperparameters"""
        self.k = knn
        self.meta_reward = meta_reward
        self.s = snapshot_timer
        self.win_rate_threshold = win_rate_threshold
        self.mc_threshold = mc_threshold
        return self

config = (
        CustomPPOConfig(algo_class=CustomPPO)
        .environment(env=game_name,
                     env_config=env_config)
        .framework(args.framework)
        .callbacks(MultiCallbacks([SelfPlayCallback,
                                   ActionFreqCallback]))
        .rollouts(num_envs_per_worker=5,
                  num_rollout_workers=10,
                  recreate_failed_workers=True,
                  create_env_on_local_worker=True)
        .training(train_batch_size=4192*2,
                  lr=3e-4,
                  gamma=0.99,
                  lambda_=0.95,
                  use_gae=True,
                  clip_param=0.4,
                  grad_clip=None,
                  entropy_coeff=0.2,
                  vf_loss_coeff=0.6,
                  sgd_minibatch_size=1024,
                  num_sgd_iter=4)
        .multi_agent(
            # Initial policy map: Random and PPO. This will be expanded
            # to more policy snapshots taken from "main" against which "main"
            # will then play (instead of "random"). This is done in the
            # custom callback defined above (`SelfPlayCallback`).
            policies={
                    env_config["players_ids"][0]: PolicySpec(policy_class=RandomPolicy,
                                                             observation_space=IteratedRPS_W.OBSERVATION_SPACE,
                                                             action_space=IteratedRPS_W.P1_ACTION_SPACE),
                    env_config["players_ids"][1]: (
                        None,
                        IteratedRPS_W.OBSERVATION_SPACE,
                        IteratedRPS_W.P2_ACTION_SPACE,
                        {'model': {'custom_model': 'mlp'}},
                    ),
                },
            # Assign agent 0 and 1 randomly to the "main" policy or
            # to the opponent ("random" at first). Make sure (via episode_id)
            # that "main" always plays against "random" (and not against
            # another "main").
            policy_mapping_fn=policy_mapping_fn,
            # Always just train the "main" policy.
            policies_to_train=["main0"],
        )
        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
        .resources(num_gpus=0)
        .hparams(knn=args.k,
                 snapshot_timer=args.s,
                 mc_threshold=args.mc_threshold,
                 meta_reward=args.meta_reward,
                 win_rate_threshold=args.win_rate_threshold)
    )

Here’s a gist with just the necessary files: README.md · GitHub

Hi @aadharna ,

Try adding the following in addition to what you already have and see if it helps.

class CustomPPO(PPO):
    @classmethod
    def get_default_config(cls) -> AlgorithmConfig:
        return CustomPPOConfig()


class CustomPPOConfig(PPOConfig):
    def to_dict(self):
        config = super().to_dict()
        config["k"] = self.k
        config["meta_reward"] = self.meta_reward
        config["s"] = self.s
        config["win_rate_threshold"] = self.win_rate_threshold
        config["mc_threshold"] = self.mc_threshold
        return config

    def update_from_dict(self, config_dict) -> "AlgorithmConfig":
        for k in ["k", "meta_reward", "s", "win_rate_threshold", "mc_threshold"]:
            if k in config_dict:
                setattr(self, k, config_dict.pop(k))
        return super().update_from_dict(config_dict)
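
Why this helps: in your traceback, Algorithm.setup() merges the incoming config into the defaults from get_default_config() via merge_trainer_configs / deep_update, and deep_update rejects any key it does not find in those defaults. With get_default_config() returning CustomPPOConfig, whose to_dict() exposes k, meta_reward, s, win_rate_threshold, and mc_threshold, those keys become known to the merge. As a rough, untested sketch (adapting the __init__ you posted), CustomPPO could then read them straight off the top level of the config dict instead of digging into custom_model_config:

# hypothetical adaptation of the __init__ shown earlier: pull the custom
# hyperparameters from the top level of the merged config dict, keeping
# the same fallback defaults as before
for key, default in {"k": 5, "meta_reward": True, "s": 20,
                     "win_rate_threshold": 0.95, "mc_threshold": 0.8}.items():
    setattr(self, key, config.get(key, default))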

That did the trick (mostly)!

I removed the update_from_dict function because neither the base AlgorithmConfig nor the PPOConfig class has an update_from_dict or __setstate__ method that I could call into (which seems like an oversight).

So it seems there’s currently no way to simply update the parameters of a config object from a dictionary.

Besides that, though, this did the trick exactly, and now we can pass arguments in as normal without having to go through the custom_model_config dict!
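
For reference, here’s a rough sketch (I haven’t actually run this yet) of what the PBT mutations from the first post could look like with the keys at the top level of the config, using plain lists of candidate values (one of the formats PopulationBasedTraining accepts for resampling) instead of tune.grid_search:

pbt = PopulationBasedTraining(
    time_attr="training_iteration",
    perturbation_interval=10,
    hyperparam_mutations={
        # candidate values to resample from when a trial is perturbed
        "k": [1, 5, 10, 25, 100],
        "s": [10, 25, 50, 100],
        "mc_threshold": [0.0, 0.25, 0.5, 0.8, 0.9, 0.95],
        "win_rate_threshold": [0.5, 0.75, 0.85, 0.95],
    },
)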

Hi @aadharna,

Awesome that that worked.

What version of Ray are you using? I ask because update_from_dict exists and is called in 2.3. You may want to keep it around, perhaps commented out, in case of future upgrades.
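
Or, rather than commenting it out, one option (just a sketch, untested on either version) is to only define the override when the parent class actually has the method:

class CustomPPOConfig(PPOConfig):
    # ... __init__ and to_dict overrides as above ...

    # only override update_from_dict when the base class defines it
    # (it does in 2.3, per the above)
    if hasattr(PPOConfig, "update_from_dict"):
        def update_from_dict(self, config_dict) -> "AlgorithmConfig":
            for k in ["k", "meta_reward", "s", "win_rate_threshold", "mc_threshold"]:
                if k in config_dict:
                    setattr(self, k, config_dict.pop(k))
            return super().update_from_dict(config_dict)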

Noted!

I am using 2.1.0 at the moment because 2.2.0 (which is what was in pip when I started the code) had a weird bug.

Thanks!