Since the initial experiments have failed, @avnishn, how do we:
- send additional parameters to a custom algorithm object? For now, I've put them into the custom_model_config, because if I put them anywhere else, Tune gives me the "Unknown config parameter" error
- once we have the above, make these parameters tunable?
from ray import tune
from ray.tune.schedulers import PopulationBasedTraining

alg_config = config.to_dict()
# note: policies are defined in the multi-agent dictionary / function call
alg_config['model']['custom_model_config'] = {
    'k': args.k,
    's': args.s,
    'mc_threshold': args.mc_threshold,
    'meta_reward': args.meta_reward,
    'win_rate_threshold': args.win_rate_threshold,
}
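As an aside, one pattern that might avoid the "Unknown config parameter" error altogether (I haven't verified it on this exact Ray version) is to declare the extra keys on an AlgorithmConfig subclass and return it from get_default_config(); since merge_trainer_configs merges the incoming dict into the default config, keys that already exist there should no longer count as unknown. CustomPPOConfig is a made-up name:

from ray.rllib.algorithms.ppo import PPOConfig

class CustomPPOConfig(PPOConfig):
    # hypothetical subclass: declares the extra keys as known,
    # top-level parameters, with their defaults
    def __init__(self, algo_class=None):
        super().__init__(algo_class=algo_class)
        self.k = 5
        self.s = 20
        self.mc_threshold = 0.8
        self.meta_reward = True
        self.win_rate_threshold = 0.95

# and on CustomPPO (full class further below):
#     @classmethod
#     def get_default_config(cls):
#         return CustomPPOConfig()

If that works, PBT could mutate plain top-level keys such as 'k' directly. In any case, here's the PBT setup I actually tried: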
perturbation_interval = 10
pbt = PopulationBasedTraining(
    time_attr="training_iteration",
    perturbation_interval=perturbation_interval,
    hyperparam_mutations={
        # distribution for resampling
        'model/custom_model_config/k': tune.grid_search([1, 5, 10, 25, 100]),
        'model/custom_model_config/s': tune.grid_search([10, 25, 50, 100]),
        'model/custom_model_config/mc_threshold': tune.grid_search([0.0, 0.25, 0.5, 0.8, 0.9, 0.95]),
        'model/custom_model_config/win_rate_threshold': tune.grid_search([0.5, 0.75, 0.85, 0.95]),
    },
)
Sadly, that hasn’t worked:
[2023-03-27 14:46:47,224 E 12352 31608] core_worker.cc:1449: Pushed Error with JobID: 01000000 of type: task with message: ray::CustomPPO.__init__() (pid=12352, ip=127.0.0.1, repr=CustomPPO)
File "python\ray\_raylet.pyx", line 859, in ray._raylet.execute_task
File "python\ray\_raylet.pyx", line 863, in ray._raylet.execute_task
File "python\ray\_raylet.pyx", line 810, in ray._raylet.execute_task.function_executor
File "D:\miniconda\envs\mapo\lib\site-packages\ray\_private\function_manager.py", line 674, in actor_method_executor
return method(__ray_actor, *args, **kwargs)
File "D:\miniconda\envs\mapo\lib\site-packages\ray\util\tracing\tracing_helper.py", line 466, in _resume_span
return method(self, *_args, **_kwargs)
File "D:\PycharmProjects\ubc\mapo\custom_ppo.py", line 74, in __init__
super().__init__(config, env, logger_creator)
File "D:\miniconda\envs\mapo\lib\site-packages\ray\rllib\algorithms\algorithm.py", line 414, in __init__
super().__init__(config=config, logger_creator=logger_creator, **kwargs)
File "D:\miniconda\envs\mapo\lib\site-packages\ray\tune\trainable\trainable.py", line 161, in __init__
self.setup(copy.deepcopy(self.config))
File "D:\miniconda\envs\mapo\lib\site-packages\ray\util\tracing\tracing_helper.py", line 466, in _resume_span
return method(self, *_args, **_kwargs)
File "D:\miniconda\envs\mapo\lib\site-packages\ray\rllib\algorithms\algorithm.py", line 439, in setup
self.config = self.merge_trainer_configs(
File "D:\miniconda\envs\mapo\lib\site-packages\ray\rllib\algorithms\algorithm.py", line 2228, in merge_trainer_configs
return deep_update(
File "D:\miniconda\envs\mapo\lib\site-packages\ray\_private\dict.py", line 60, in deep_update
raise Exception("Unknown config parameter `{}` ".format(k))
Exception: Unknown config parameter `model/custom_model_config/k` at time: 1.67994e+09
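If I read that traceback right, the slash-separated strings are passed through to RLlib verbatim as top-level config keys, which deep_update then rejects. According to the Tune docs, a key in hyperparam_mutations can instead hold a dict for nested hyperparameters, and the values should be plain lists or tune.choice/distribution objects rather than tune.grid_search (grid search isn't supported there). An untested sketch of the scheduler rewritten that way:

pbt = PopulationBasedTraining(
    time_attr="training_iteration",
    perturbation_interval=perturbation_interval,
    hyperparam_mutations={
        # nested dict instead of slash-separated paths;
        # plain lists are resampled from uniformly on perturbation
        "model": {
            "custom_model_config": {
                "k": [1, 5, 10, 25, 100],
                "s": [10, 25, 50, 100],
                "mc_threshold": [0.0, 0.25, 0.5, 0.8, 0.9, 0.95],
                "win_rate_threshold": [0.5, 0.75, 0.85, 0.95],
            }
        }
    },
)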
Here’s an example of the tune call:
import os

from ray import air, tune
from ray.tune import CLIReporter

results = tune.Tuner(
    CustomPPO,
    param_space=alg_config,
    tune_config=tune.TuneConfig(
        metric="batch_meta_reward",
        mode='min',
        scheduler=pbt,
    ),
    run_config=air.RunConfig(
        name="mapo_rpsw",
        stop=stop,
        verbose=0,
        local_dir=os.path.join('..', 'results'),
        progress_reporter=CLIReporter(
            metric_columns={
                "training_iteration": "iter",
                "time_total_s": "time_total_s",
                "timesteps_total": "ts",
                "episodes_this_iter": "train_episodes",
                "policy_reward_mean/main0": "reward",
                "batch_meta_reward": "batch_meta_reward",
                "win_rate": "win_rate",
                "league_size": "league_size",
            },
            sort_by_metric=True,
        ),
        checkpoint_config=air.CheckpointConfig(
            checkpoint_at_end=True,
            checkpoint_frequency=10,
        ),
    ),
).fit()
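On the second question, for a plain sweep without PBT, Tune resolves search-space objects anywhere inside the nested param_space dict, so (as far as I can tell) the values can be embedded directly in custom_model_config rather than addressed by path:

alg_config['model']['custom_model_config'] = {
    # grid_search multiplies the number of trials;
    # tune.choice samples one value per trial
    'k': tune.grid_search([1, 5, 10, 25, 100]),
    's': tune.grid_search([10, 25, 50, 100]),
    'mc_threshold': tune.choice([0.0, 0.25, 0.5, 0.8, 0.9, 0.95]),
    'meta_reward': args.meta_reward,
    'win_rate_threshold': tune.choice([0.5, 0.75, 0.85, 0.95]),
}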
And here's the __init__ of the custom PPO class:
from collections import OrderedDict

from ray.rllib.algorithms.ppo import PPO

class CustomPPO(PPO):
    def __init__(self, config=None, env=None, logger_creator=None):
        # todo: make this a real replay buffer object to take
        # advantage of things like prioritized sampling to determine
        # which samples to use for meta reward-shaping / learning
        self.reward_buffer = OrderedDict()
        self.seen_snapshot_ids = set()
        self.batch_order = []
        self.passed_mc = False
        self.meta_reward_value = 0
        self.n_opponents = 0
        # configs passed in through the custom_model_config
        # cmc = config['multiagent']['policies'][config['env_config']["players_ids"][1]][3]
        custom_model_config = config.get('model', {}).get('custom_model_config', {})
        # number of nearest neighbors (k-NN) to calculate the meta-reward against
        self.k = custom_model_config.get("k", 5)
        # whether to use the meta-reward or not
        self.meta_reward = custom_model_config.get("meta_reward", True)
        # hyperparameters for the self-play callback:
        # how often to snapshot the current policy
        self.s = custom_model_config.get("s", 20)
        # when applicable, the win-rate threshold above which we snapshot the policy
        self.win_rate_threshold = custom_model_config.get("win_rate_threshold", 0.95)
        # the win-rate threshold above which we switch to the meta-reward
        self.mc_threshold = custom_model_config.get("mc_threshold", 0.8)
        # called last so that the self-play callback has access to the attributes above
        super().__init__(config, env, logger_creator)
        # initialize a worker to compute rewards on demand
        policy_config = {'model': {'custom_model': 'mlp'}}
        # custom rollout worker: band-aid fix for RLlib not
        # letting me do e.g., algorithm.evaluate(p1, p2)
        self.crw = CRW(self.env_creator,
                       config['env_config'],
                       policy_config)
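Finally, as a quick sanity check that the values actually reach the algorithm independent of Tune, the class can be instantiated directly. This sketch assumes a registered stand-in environment ("CartPole-v1") and that CRW tolerates the default (empty) env_config; adjust to your setup:

from ray.rllib.algorithms.ppo import PPOConfig

config = (
    PPOConfig()
    .environment("CartPole-v1")       # stand-in env, not the real one
    .rollouts(num_rollout_workers=0)  # keep it lightweight
    .to_dict()
)
config['model']['custom_model_config'] = {'k': 10, 'mc_threshold': 0.5}

algo = CustomPPO(config=config)
assert algo.k == 10 and algo.mc_threshold == 0.5  # values made it through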