What is the correct way to pass a custom policy in a multi-agent setting? For now I want to pass just the regular DDPG policy, before making further modifications. I have tried to pass it like this:
from custom_ddpg import DDPGTFModel
from ray.rllib.models import ModelCatalog
ModelCatalog.register_custom_model("my_tf_model", DDPGTFModel)
experiment_params = {
    "training": {
        "env": env_name,
        "run": "DDPG",
        ...
            "model": {
                "fcnet_hiddens": [1028, 512, 512, 128],
                "fcnet_activation": "swish",
                # Specify our custom model from above.
                "custom_model": "my_tf_model",
                # Extra kwargs to be passed to your model's c'tor.
                "custom_model_config": {}
            },
            "multiagent": {
                "policies": {
                    "agent_DA": (DDPGTFModel, env.observation_space_DA, env.action_space_DA, {}),
                    "agent_BM_Ask": (DDPGTFModel, env.observation_space_BM, env.action_space_BM, {}),
                    "agent_BM_Bid": (DDPGTFModel, env.observation_space_BM, env.action_space_BM, {})
                },
                "policy_mapping_fn": lambda x: policy_mapping(x),
            },
        },
    }
}
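Here policy_mapping is just a small helper that returns the policy ID for a given agent ID. A minimal sketch of it, assuming the agent IDs in my env match the policy IDs above (simplified; the actual function may derive the IDs differently):

def policy_mapping(agent_id):
    # Illustrative only: each agent uses the policy registered under its own ID
    # ("agent_DA", "agent_BM_Ask", "agent_BM_Bid").
    return agent_id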
When I try to run the tests I get the following error:
TypeError: __init__() missing 2 required positional arguments: 'model_config' and 'name'
If I give these two parameters default values in the policy class, like this:
from typing import List, Optional

import gym

from ray.rllib.models.tf.tf_modelv2 import TFModelV2
from ray.rllib.utils.typing import ModelConfigDict

model = {
    "fcnet_hiddens": [1028, 512, 512, 128],
    "fcnet_activation": "swish",
}


class DDPGTFModel(TFModelV2):
    """Extension of standard TFModel to provide DDPG action- and q-outputs.

    Data flow:
        obs -> forward() -> model_out
        model_out -> get_policy_output() -> deterministic actions
        model_out, actions -> get_q_values() -> Q(s, a)
        model_out, actions -> get_twin_q_values() -> Q_twin(s, a)

    Note that this class by itself is not a valid model unless you
    implement forward() in a subclass.
    """

    def __init__(
            self,
            obs_space: gym.spaces.Space,
            action_space: gym.spaces.Space,
            num_outputs: int,
            model_config: ModelConfigDict = model,
            name: str = "DDPG",
            # Extra DDPGActionModel args:
            actor_hiddens: Optional[List[int]] = None,
            actor_hidden_activation: str = "relu",
            critic_hiddens: Optional[List[int]] = None,
            critic_hidden_activation: str = "relu",
            twin_q: bool = False,
            add_layer_norm: bool = False):
        ...
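For reference, the forward() mentioned in the docstring follows the standard TFModelV2 signature (input_dict, state, seq_lens) -> (model_out, state); a minimal, purely illustrative subclass sketch (not my actual model) would be:

class MyDDPGTFModel(DDPGTFModel):
    def forward(self, input_dict, state, seq_lens):
        # Minimal pass-through: use the flattened observation as model_out;
        # a real model would apply its hidden layers here.
        return input_dict["obs_flat"], state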
With these default values in place, the error becomes:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/site-packages/ray/tune/trial_runner.py", line 890, in _process_trial
    results = self.trial_executor.fetch_result(trial)
  File "/usr/local/lib/python3.7/site-packages/ray/tune/ray_trial_executor.py", line 788, in fetch_result
    result = ray.get(trial_future[0], timeout=DEFAULT_GET_TIMEOUT)
  File "/usr/local/lib/python3.7/site-packages/ray/_private/client_mode_hook.py", line 105, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.7/site-packages/ray/worker.py", line 1627, in get
    raise value
ray.exceptions.RayActorError: The actor died because of an error raised in its creation task, ray::DDPG.__init__() (pid=184, ip=172.18.0.2)
  File "/usr/local/lib/python3.7/site-packages/ray/rllib/agents/trainer_template.py", line 137, in __init__
    Trainer.__init__(self, config, env, logger_creator)
  File "/usr/local/lib/python3.7/site-packages/ray/rllib/agents/trainer.py", line 623, in __init__
    super().__init__(config, logger_creator)
  File "/usr/local/lib/python3.7/site-packages/ray/tune/trainable.py", line 107, in __init__
    self.setup(copy.deepcopy(self.config))
  File "/usr/local/lib/python3.7/site-packages/ray/rllib/agents/trainer_template.py", line 147, in setup
    super().setup(config)
  File "/usr/local/lib/python3.7/site-packages/ray/rllib/agents/trainer.py", line 776, in setup
    self._init(self.config, self.env_creator)
  File "/usr/local/lib/python3.7/site-packages/ray/rllib/agents/trainer_template.py", line 176, in _init
    num_workers=self.config["num_workers"])
  File "/usr/local/lib/python3.7/site-packages/ray/rllib/agents/trainer.py", line 864, in _make_workers
    logdir=self.logdir)
  File "/usr/local/lib/python3.7/site-packages/ray/rllib/evaluation/worker_set.py", line 89, in __init__
    lambda p, pid: (pid, p.observation_space, p.action_space)))
ray.exceptions.RayActorError: The actor died because of an error raised in its creation task, ray::RolloutWorker.__init__() (pid=183, ip=172.18.0.2)
  File "/usr/local/lib/python3.7/site-packages/tensorflow/python/framework/tensor_shape.py", line 1305, in as_shape
    return TensorShape(shape)
  File "/usr/local/lib/python3.7/site-packages/tensorflow/python/framework/tensor_shape.py", line 765, in __init__
    self._dims = [Dimension(d) for d in dims]
  File "/usr/local/lib/python3.7/site-packages/tensorflow/python/framework/tensor_shape.py", line 765, in <listcomp>
    self._dims = [Dimension(d) for d in dims]
  File "/usr/local/lib/python3.7/site-packages/tensorflow/python/framework/tensor_shape.py", line 209, in __init__
    .format(value, type(value))), None)
  File "<string>", line 3, in raise_from
TypeError: Dimension value must be integer or None or have an __index__ method, got value '{'num_workers': 1, 'num_envs_per_worker': 1, 'create_env_on_driver': False, 'rollout_fragment_length': 1, 'batch_mode': 'complete_episodes', 'gamma': 0.99, 'lr': 0.001, 'train_batch_size': 16, 'model': DictWrapper({'_use_default_native_models': False, '_disable_preprocessor_api': False, 'fcnet_hiddens': ListWrapper([1028, 512, 512, 128]), 'fcnet_activation': 'swish', 'conv_filters': None, 'conv_activation': 'relu', 'post_fcnet_hiddens': ListWrapper([]), 'post_fcnet_activation': 'relu', 'free_log_std': False, 'no_final_linear': False, 'vf_share_layers': True, 'use_lstm': False, 'max_seq_len': 20, 'lstm_cell_size': 256, 'lstm_use_prev_action': False, 'lstm_use_prev_reward': False, '_time_major': False, 'use_attention': False, 'attention_num_transformer_units': 1, 'attention_dim': 64, 'attention_num_heads': 1, 'attention_head_dim': 32, 'attention_memory_inference': 50, 'attention_memory_training': 50, 'attention_position_wise_mlp_dim': 32, 'attention_init_gru_gate_bias': 2.0, 'attention_use_n_prev_actions': 0, 'attention_use_n_prev_rewards': 0, 'framestack': True, 'dim': 84, 'grayscale': False, 'zero_mean': True, 'custom_model': 'my_tf_model', 'custom_model_config': DictWrapper({}), 'custom_action_dist': None, 'custom_preprocessor': None, 'lstm_use_prev_action_reward': -1}), 'optimizer': DictWrapper({}), 'horizon': None, 'soft_horizon': False, 'no_done_at_end': False, 'env': 'PowerTrading-v0', 'observation_space': None, 'action_space': None, 'env_config': DictWrapper({'file_name': '/opt/ml/output/intermediate'}), 'remote_worker_envs': False, 'remote_env_batch_wait_ms': 0, 'env_task_fn': None, 'render_env': False, 'record_env': False, 'clip_rewards': None, 'normalize_actions': True, 'clip_actions': False, 'preprocessor_pref': 'deepmind', 'log_level': 'WARN', 'callbacks': <class 'ray.rllib.agents.callbacks.DefaultCallbacks'>, 'ignore_worker_failures': False, 'log_sys_usage': True, 'fake_sampler': False, 'framework': 'tf', 'eager_tracing': False, 'explore': True, 'exploration_config': DictWrapper({'type': 'OrnsteinUhlenbeckNoise', 'random_timesteps': 1000, 'ou_base_scale': 0.1, 'ou_theta': 0, 'ou_sigma': 0.2, 'initial_scale': 1.0, 'final_scale': 0.02, 'scale_timesteps': 10000}), 'evaluation_interval': None, 'evaluation_num_episodes': 10, 'evaluation_parallel_to_training': False, 'in_evaluation': False, 'evaluation_config': DictWrapper({'explore': False}), 'evaluation_num_workers': 0, 'custom_eval_function': None, 'sample_async': False, 'sample_collector': <class 'ray.rllib.evaluation.collectors.simple_list_collector.SimpleListCollector'>, 'observation_filter': 'NoFilter', 'synchronize_filters': True, 'tf_session_args': DictWrapper({'intra_op_parallelism_threads': 2, 'inter_op_parallelism_threads': 2, 'gpu_options': DictWrapper({'allow_growth': True}), 'log_device_placement': False, 'device_count': DictWrapper({'CPU': 1}), 'allow_soft_placement': True}), 'local_tf_session_args': DictWrapper({'intra_op_parallelism_threads': 8, 'inter_op_parallelism_threads': 8}), 'compress_observations': False, 'collect_metrics_timeout': 10080, 'metrics_smoothing_episodes': 100, 'min_iter_time_s': 1, 'timesteps_per_iteration': 1000, 'seed': 1, 'extra_python_environs_for_driver': DictWrapper({}), 'extra_python_environs_for_worker': DictWrapper({}), 'num_gpus': 0, '_fake_gpus': False, 'num_cpus_per_worker': 1, 'num_gpus_per_worker': 0, 'custom_resources_per_worker': DictWrapper({}), 
'num_cpus_for_driver': 1, 'placement_strategy': 'PACK', 'input': 'sampler', 'input_config': DictWrapper({}), 'actions_in_input_normalized': False, 'input_evaluation': ListWrapper(['is', 'wis']), 'postprocess_inputs': False, 'shuffle_buffer_size': 0, 'output': None, 'output_compress_columns': ListWrapper(['obs', 'new_obs']), 'output_max_file_size': 67108864, 'multiagent': DictWrapper({'policies': DictWrapper({'agent_DA': _TupleWrapper(PolicySpec(policy_class=<class 'custom_ddpg.DDPGTFModel'>, observation_space=Box(-inf, inf, (344,), float32), action_space=Box(20.0, 200.0, (1,), float32), config=DictWrapper({}))), 'agent_BM_Ask': _TupleWrapper(PolicySpec(policy_class=<class 'custom_ddpg.DDPGTFModel'>, observation_space=Box(-inf, inf, (156,), float32), action_space=Box(-500.0, 500.0, (1,), float32), config=DictWrapper({}))), 'agent_BM_Bid': _TupleWrapper(PolicySpec(policy_class=<class 'custom_ddpg.DDPGTFModel'>, observation_space=Box(-inf, inf, (156,), float32), action_space=Box(-500.0, 500.0, (1,), float32), config=DictWrapper({})))}), 'policy_map_capacity': 100, 'policy_map_cache': None, 'policy_mapping_fn': <function <lambda> at 0x7f88ee6eb710>, 'policies_to_train': None, 'observation_fn': None, 'replay_mode': 'independent', 'count_steps_by': 'env_steps'}), 'logger_config': None, '_tf_policy_handles_more_than_one_loss': False, '_disable_preprocessor_api': False, 'simple_optimizer': True, 'monitor': -1, 'twin_q': False, 'policy_delay': 1, 'smooth_target_policy': False, 'target_noise': 0.2, 'target_noise_clip': 0.5, 'use_state_preprocessor': True, 'actor_hiddens': ListWrapper([400, 300]), 'actor_hidden_activation': 'relu', 'critic_hiddens': ListWrapper([400, 300]), 'critic_hidden_activation': 'relu', 'n_step': 1, 'buffer_size': 50000, 'store_buffer_in_checkpoints': False, 'prioritized_replay': True, 'prioritized_replay_alpha': 0.6, 'prioritized_replay_beta': 0.4, 'prioritized_replay_beta_annealing_timesteps': 20000, 'final_prioritized_replay_beta': 0.4, 'prioritized_replay_eps': 1e-06, 'training_intensity': None, 'critic_lr': 0.001, 'actor_lr': 0.001, 'target_network_update_freq': 0, 'tau': 0.002, 'use_huber': False, 'huber_threshold': 1.0, 'l2_reg': 1e-06, 'grad_clip': None, 'learning_starts': 1500, 'worker_side_prioritization': False, 'worker_index': 1}' with type '<class 'dict'>'