Since initial experiments have failed, @avnishn how do we:
- send additional parameters to a custom algorithm object? For now, I’ve put those into the custom_model_configbecause if I put it anywhere else tune gives me theUnknown config parametererror
- Once we have the above, how do we make these parameters tunable?
    alg_config = config.to_dict()
    # note, policies are defined in the multi-agent dictionary / function call
    alg_config['model']['custom_model_config'] = {
        'k': args.k,
        's': args.s,
        'mc_threshold': args.mc_threshold,
        'meta_reward': args.meta_reward,
        'win_rate_threshold': args.win_rate_threshold,
    }
    perturbation_interval = 10
    pbt = PopulationBasedTraining(
                time_attr="training_iteration",
                perturbation_interval=perturbation_interval,
                hyperparam_mutations={
                    # distribution for resampling
                    'model/custom_model_config/k': tune.grid_search([1, 5, 10, 25, 100]),
                    'model/custom_model_config/s': tune.grid_search([10, 25, 50, 100]),
                    'model/custom_model_config/mc_threshold': tune.grid_search([0.0, 0.25, 0.5, 0.8, 0.9, 0.95]),
                    'model/custom_model_config/win_rate_threshold': tune.grid_search([0.5, 0.75, 0.85, 0.95]),
                },
        )
Sadly, that hasn’t worked:
[2023-03-27 14:46:47,224 E 12352 31608] core_worker.cc:1449: Pushed Error with JobID: 01000000 of type: task with message: ray::CustomPPO.__init__() (pid=12352, ip=127.0.0.1, repr=CustomPPO)
  File "python\ray\_raylet.pyx", line 859, in ray._raylet.execute_task
  File "python\ray\_raylet.pyx", line 863, in ray._raylet.execute_task
  File "python\ray\_raylet.pyx", line 810, in ray._raylet.execute_task.function_executor
  File "D:\miniconda\envs\mapo\lib\site-packages\ray\_private\function_manager.py", line 674, in actor_method_executor
    return method(__ray_actor, *args, **kwargs)
  File "D:\miniconda\envs\mapo\lib\site-packages\ray\util\tracing\tracing_helper.py", line 466, in _resume_span
    return method(self, *_args, **_kwargs)
  File "D:\PycharmProjects\ubc\mapo\custom_ppo.py", line 74, in __init__
    super().__init__(config, env, logger_creator)
  File "D:\miniconda\envs\mapo\lib\site-packages\ray\rllib\algorithms\algorithm.py", line 414, in __init__
    super().__init__(config=config, logger_creator=logger_creator, **kwargs)
  File "D:\miniconda\envs\mapo\lib\site-packages\ray\tune\trainable\trainable.py", line 161, in __init__
    self.setup(copy.deepcopy(self.config))
  File "D:\miniconda\envs\mapo\lib\site-packages\ray\util\tracing\tracing_helper.py", line 466, in _resume_span
    return method(self, *_args, **_kwargs)
  File "D:\miniconda\envs\mapo\lib\site-packages\ray\rllib\algorithms\algorithm.py", line 439, in setup
    self.config = self.merge_trainer_configs(
  File "D:\miniconda\envs\mapo\lib\site-packages\ray\rllib\algorithms\algorithm.py", line 2228, in merge_trainer_configs
    return deep_update(
  File "D:\miniconda\envs\mapo\lib\site-packages\ray\_private\dict.py", line 60, in deep_update
    raise Exception("Unknown config parameter `{}` ".format(k))
Exception: Unknown config parameter `model/custom_model_config/k` at time: 1.67994e+09
Here’s an example of the tune call:
results = tune.Tuner(
        CustomPPO,
        param_space=alg_config,
        tune_config=tune.TuneConfig(
            metric="batch_meta_reward",
            mode='min',
            scheduler=pbt,
        ),
        run_config=air.RunConfig(
            name="mapo_rpsw",
            stop=stop,
            verbose=0,
            local_dir=os.path.join('..', 'results'),
            progress_reporter=CLIReporter(
                metric_columns={
                    "training_iteration": "iter",
                    "time_total_s": "time_total_s",
                    "timesteps_total": "ts",
                    "episodes_this_iter": "train_episodes",
                    "policy_reward_mean/main0": "reward",
                    "batch_meta_reward": "batch_meta_reward",
                    "win_rate": "win_rate",
                    "league_size": "league_size",
                },
                sort_by_metric=True,
            ),
            checkpoint_config=air.CheckpointConfig(
                checkpoint_at_end=True,
                checkpoint_frequency=10,
            ),
        ),
    ).fit()
And the init for the custom PPO object
class CustomPPO(PPO):
    def __init__(self, config=None, env=None, logger_creator=None):
        # todo make this a real replay buffer object to take
        #  advantage of things like prioritized sampling to determine
        #  which samples to use for meta reward-shaping / learning
        self.reward_buffer = OrderedDict()
        self.seen_snapshot_ids = set()
        self.batch_order = []
        self.passed_mc = False
        self.meta_reward_value = 0
        self.n_opponents = 0
        # configs passed in through the custom_model_config
        # cmc = config['multiagent']['policies'][config['env_config']["players_ids"][1]][3]
        custom_model_config = config.get('model', {}).get('custom_model_config', {})
        # knn to calculate meta-reward against
        self.k = custom_model_config.get("k", 5)
        # whether to use the meta-reward or not
        self.meta_reward = custom_model_config.get("meta_reward", True)
        # hyperparameters for the self-play callback
        # how often to snapshot the current policy
        self.s = custom_model_config.get("s", 20)
        # when applicable, above what win-rate threshold should we snapshot the policy
        self.win_rate_threshold = custom_model_config.get("win_rate_threshold", 0.95)
        # above what win-rate threshold should we switch to the meta-reward
        self.mc_threshold = custom_model_config.get("mc_threshold", 0.8)
        # at the end so that the self-play callback has access to these attributes above
        super().__init__(config, env, logger_creator)
        # initialize a worker to compute rewards on demand
        policy_config = {'model': {'custom_model': 'mlp'}}
        # custom rollout worker: band-aid fix for rllib not
        # letting me do e.g., algorithm.evaluate(p1, p2)
        self.crw = CRW(self.env_creator,
                       config['env_config'],
                       policy_config)