Passing a custom policy in a multi-agent setting

What is the correct way of passing a custom policy in a multi-agent setting? For now I just want to use the regular DDPG policy, before making further modifications. I have tried to pass it as:

from custom_ddpg import DDPGTFModel
from ray.rllib.models import ModelCatalog
ModelCatalog.register_custom_model("my_tf_model", DDPGTFModel)


experiment_params = {
    "training": {
        "env": env_name,
        "run": "DDPG",
        ...
            "model": {
                "fcnet_hiddens": [1028, 512, 512, 128],
                "fcnet_activation": "swish",
                # Specify our custom model from above.
                "custom_model": "my_tf_model",
                # Extra kwargs to be passed to your model's c'tor.
                "custom_model_config": {},
            },
            "multiagent": {
                "policies": {
                    "agent_DA": (DDPGTFModel, env.observation_space_DA, env.action_space_DA, {}),
                    "agent_BM_Ask": (DDPGTFModel, env.observation_space_BM, env.action_space_BM, {}),
                    "agent_BM_Bid": (DDPGTFModel, env.observation_space_BM, env.action_space_BM, {}),
                },
                "policy_mapping_fn": lambda x: policy_mapping(x),
            },
        },
    },
}
    

When I try to run the tests I get the following error:

TypeError: __init__() missing 2 required positional arguments: 'model_config' and 'name'

If I initialize these parameters in the policy class as:

from typing import List, Optional

import gym

from ray.rllib.models.tf.tf_modelv2 import TFModelV2
from ray.rllib.utils.typing import ModelConfigDict

model = {
    "fcnet_hiddens": [1028, 512, 512, 128],
    "fcnet_activation": "swish",
}

class DDPGTFModel(TFModelV2):
    """Extension of standard TFModel to provide DDPG action- and q-outputs.
    Data flow:
        obs -> forward() -> model_out
        model_out -> get_policy_output() -> deterministic actions
        model_out, actions -> get_q_values() -> Q(s, a)
        model_out, actions -> get_twin_q_values() -> Q_twin(s, a)
    Note that this class by itself is not a valid model unless you
    implement forward() in a subclass."""

    def __init__(
            self,
            obs_space: gym.spaces.Space,
            action_space: gym.spaces.Space,
            num_outputs: int,
            model_config: ModelConfigDict = model,
            name: str = "DDPG",
            # Extra DDPGActionModel args:
            actor_hiddens: Optional[List[int]] = None,
            actor_hidden_activation: str = "relu",
            critic_hiddens: Optional[List[int]] = None,
            critic_hidden_activation: str = "relu",
            twin_q: bool = False,
            add_layer_norm: bool = False):
        ...

The error becomes:

Traceback (most recent call last):
rsl1olntz5-algo-1-lwryd |   File "/usr/local/lib/python3.7/site-packages/ray/tune/trial_runner.py", line 890, in _process_trial
rsl1olntz5-algo-1-lwryd |     results = self.trial_executor.fetch_result(trial)
rsl1olntz5-algo-1-lwryd |   File "/usr/local/lib/python3.7/site-packages/ray/tune/ray_trial_executor.py", line 788, in fetch_result
rsl1olntz5-algo-1-lwryd |     result = ray.get(trial_future[0], timeout=DEFAULT_GET_TIMEOUT)
rsl1olntz5-algo-1-lwryd |   File "/usr/local/lib/python3.7/site-packages/ray/_private/client_mode_hook.py", line 105, in wrapper
rsl1olntz5-algo-1-lwryd |     return func(*args, **kwargs)
rsl1olntz5-algo-1-lwryd |   File "/usr/local/lib/python3.7/site-packages/ray/worker.py", line 1627, in get
rsl1olntz5-algo-1-lwryd |     raise value
rsl1olntz5-algo-1-lwryd | ray.exceptions.RayActorError: The actor died because of an error raised in its creation task, ray::DDPG.__init__() (pid=184, ip=172.18.0.2)
rsl1olntz5-algo-1-lwryd |   File "/usr/local/lib/python3.7/site-packages/ray/rllib/agents/trainer_template.py", line 137, in __init__
rsl1olntz5-algo-1-lwryd |     Trainer.__init__(self, config, env, logger_creator)
rsl1olntz5-algo-1-lwryd |   File "/usr/local/lib/python3.7/site-packages/ray/rllib/agents/trainer.py", line 623, in __init__
rsl1olntz5-algo-1-lwryd |     super().__init__(config, logger_creator)
rsl1olntz5-algo-1-lwryd |   File "/usr/local/lib/python3.7/site-packages/ray/tune/trainable.py", line 107, in __init__
rsl1olntz5-algo-1-lwryd |     self.setup(copy.deepcopy(self.config))
rsl1olntz5-algo-1-lwryd |   File "/usr/local/lib/python3.7/site-packages/ray/rllib/agents/trainer_template.py", line 147, in setup
rsl1olntz5-algo-1-lwryd |     super().setup(config)
rsl1olntz5-algo-1-lwryd |   File "/usr/local/lib/python3.7/site-packages/ray/rllib/agents/trainer.py", line 776, in setup
rsl1olntz5-algo-1-lwryd |     self._init(self.config, self.env_creator)
rsl1olntz5-algo-1-lwryd |   File "/usr/local/lib/python3.7/site-packages/ray/rllib/agents/trainer_template.py", line 176, in _init
rsl1olntz5-algo-1-lwryd |     num_workers=self.config["num_workers"])
rsl1olntz5-algo-1-lwryd |   File "/usr/local/lib/python3.7/site-packages/ray/rllib/agents/trainer.py", line 864, in _make_workers
rsl1olntz5-algo-1-lwryd |     logdir=self.logdir)
rsl1olntz5-algo-1-lwryd |   File "/usr/local/lib/python3.7/site-packages/ray/rllib/evaluation/worker_set.py", line 89, in __init__
rsl1olntz5-algo-1-lwryd |     lambda p, pid: (pid, p.observation_space, p.action_space)))
rsl1olntz5-algo-1-lwryd | ray.exceptions.RayActorError: The actor died because of an error raised in its creation task, ray::RolloutWorker.__init__() (pid=183, ip=172.18.0.2)
rsl1olntz5-algo-1-lwryd |   File "/usr/local/lib/python3.7/site-packages/tensorflow/python/framework/tensor_shape.py", line 1305, in as_shape
rsl1olntz5-algo-1-lwryd |     return TensorShape(shape)
rsl1olntz5-algo-1-lwryd |   File "/usr/local/lib/python3.7/site-packages/tensorflow/python/framework/tensor_shape.py", line 765, in __init__
rsl1olntz5-algo-1-lwryd |     self._dims = [Dimension(d) for d in dims]
rsl1olntz5-algo-1-lwryd |   File "/usr/local/lib/python3.7/site-packages/tensorflow/python/framework/tensor_shape.py", line 765, in <listcomp>
rsl1olntz5-algo-1-lwryd |     self._dims = [Dimension(d) for d in dims]
rsl1olntz5-algo-1-lwryd |   File "/usr/local/lib/python3.7/site-packages/tensorflow/python/framework/tensor_shape.py", line 209, in __init__
rsl1olntz5-algo-1-lwryd |     .format(value, type(value))), None)
rsl1olntz5-algo-1-lwryd |   File "<string>", line 3, in raise_from
rsl1olntz5-algo-1-lwryd | TypeError: Dimension value must be integer or None or have an __index__ method, got value '{'num_workers': 1, 'num_envs_per_worker': 1, 'create_env_on_driver': False, 'rollout_fragment_length': 1, 'batch_mode': 'complete_episodes', 'gamma': 0.99, 'lr': 0.001, 'train_batch_size': 16, 'model': DictWrapper({'_use_default_native_models': False, '_disable_preprocessor_api': False, 'fcnet_hiddens': ListWrapper([1028, 512, 512, 128]), 'fcnet_activation': 'swish', 'conv_filters': None, 'conv_activation': 'relu', 'post_fcnet_hiddens': ListWrapper([]), 'post_fcnet_activation': 'relu', 'free_log_std': False, 'no_final_linear': False, 'vf_share_layers': True, 'use_lstm': False, 'max_seq_len': 20, 'lstm_cell_size': 256, 'lstm_use_prev_action': False, 'lstm_use_prev_reward': False, '_time_major': False, 'use_attention': False, 'attention_num_transformer_units': 1, 'attention_dim': 64, 'attention_num_heads': 1, 'attention_head_dim': 32, 'attention_memory_inference': 50, 'attention_memory_training': 50, 'attention_position_wise_mlp_dim': 32, 'attention_init_gru_gate_bias': 2.0, 'attention_use_n_prev_actions': 0, 'attention_use_n_prev_rewards': 0, 'framestack': True, 'dim': 84, 'grayscale': False, 'zero_mean': True, 'custom_model': 'my_tf_model', 'custom_model_config': DictWrapper({}), 'custom_action_dist': None, 'custom_preprocessor': None, 'lstm_use_prev_action_reward': -1}), 'optimizer': DictWrapper({}), 'horizon': None, 'soft_horizon': False, 'no_done_at_end': False, 'env': 'PowerTrading-v0', 'observation_space': None, 'action_space': None, 'env_config': DictWrapper({'file_name': '/opt/ml/output/intermediate'}), 'remote_worker_envs': False, 'remote_env_batch_wait_ms': 0, 'env_task_fn': None, 'render_env': False, 'record_env': False, 'clip_rewards': None, 'normalize_actions': True, 'clip_actions': False, 'preprocessor_pref': 'deepmind', 'log_level': 'WARN', 'callbacks': <class 'ray.rllib.agents.callbacks.DefaultCallbacks'>, 'ignore_worker_failures': False, 'log_sys_usage': True, 'fake_sampler': False, 'framework': 'tf', 'eager_tracing': False, 'explore': True, 'exploration_config': DictWrapper({'type': 'OrnsteinUhlenbeckNoise', 'random_timesteps': 1000, 'ou_base_scale': 0.1, 'ou_theta': 0, 'ou_sigma': 0.2, 'initial_scale': 1.0, 'final_scale': 0.02, 'scale_timesteps': 10000}), 'evaluation_interval': None, 'evaluation_num_episodes': 10, 'evaluation_parallel_to_training': False, 'in_evaluation': False, 'evaluation_config': DictWrapper({'explore': False}), 'evaluation_num_workers': 0, 'custom_eval_function': None, 'sample_async': False, 'sample_collector': <class 'ray.rllib.evaluation.collectors.simple_list_collector.SimpleListCollector'>, 'observation_filter': 'NoFilter', 'synchronize_filters': True, 'tf_session_args': DictWrapper({'intra_op_parallelism_threads': 2, 'inter_op_parallelism_threads': 2, 'gpu_options': DictWrapper({'allow_growth': True}), 'log_device_placement': False, 'device_count': DictWrapper({'CPU': 1}), 'allow_soft_placement': True}), 'local_tf_session_args': DictWrapper({'intra_op_parallelism_threads': 8, 'inter_op_parallelism_threads': 8}), 'compress_observations': False, 'collect_metrics_timeout': 10080, 'metrics_smoothing_episodes': 100, 'min_iter_time_s': 1, 'timesteps_per_iteration': 1000, 'seed': 1, 'extra_python_environs_for_driver': DictWrapper({}), 'extra_python_environs_for_worker': DictWrapper({}), 'num_gpus': 0, '_fake_gpus': False, 'num_cpus_per_worker': 1, 'num_gpus_per_worker': 0, 'custom_resources_per_worker': DictWrapper({}), 
'num_cpus_for_driver': 1, 'placement_strategy': 'PACK', 'input': 'sampler', 'input_config': DictWrapper({}), 'actions_in_input_normalized': False, 'input_evaluation': ListWrapper(['is', 'wis']), 'postprocess_inputs': False, 'shuffle_buffer_size': 0, 'output': None, 'output_compress_columns': ListWrapper(['obs', 'new_obs']), 'output_max_file_size': 67108864, 'multiagent': DictWrapper({'policies': DictWrapper({'agent_DA': _TupleWrapper(PolicySpec(policy_class=<class 'custom_ddpg.DDPGTFModel'>, observation_space=Box(-inf, inf, (344,), float32), action_space=Box(20.0, 200.0, (1,), float32), config=DictWrapper({}))), 'agent_BM_Ask': _TupleWrapper(PolicySpec(policy_class=<class 'custom_ddpg.DDPGTFModel'>, observation_space=Box(-inf, inf, (156,), float32), action_space=Box(-500.0, 500.0, (1,), float32), config=DictWrapper({}))), 'agent_BM_Bid': _TupleWrapper(PolicySpec(policy_class=<class 'custom_ddpg.DDPGTFModel'>, observation_space=Box(-inf, inf, (156,), float32), action_space=Box(-500.0, 500.0, (1,), float32), config=DictWrapper({})))}), 'policy_map_capacity': 100, 'policy_map_cache': None, 'policy_mapping_fn': <function <lambda> at 0x7f88ee6eb710>, 'policies_to_train': None, 'observation_fn': None, 'replay_mode': 'independent', 'count_steps_by': 'env_steps'}), 'logger_config': None, '_tf_policy_handles_more_than_one_loss': False, '_disable_preprocessor_api': False, 'simple_optimizer': True, 'monitor': -1, 'twin_q': False, 'policy_delay': 1, 'smooth_target_policy': False, 'target_noise': 0.2, 'target_noise_clip': 0.5, 'use_state_preprocessor': True, 'actor_hiddens': ListWrapper([400, 300]), 'actor_hidden_activation': 'relu', 'critic_hiddens': ListWrapper([400, 300]), 'critic_hidden_activation': 'relu', 'n_step': 1, 'buffer_size': 50000, 'store_buffer_in_checkpoints': False, 'prioritized_replay': True, 'prioritized_replay_alpha': 0.6, 'prioritized_replay_beta': 0.4, 'prioritized_replay_beta_annealing_timesteps': 20000, 'final_prioritized_replay_beta': 0.4, 'prioritized_replay_eps': 1e-06, 'training_intensity': None, 'critic_lr': 0.001, 'actor_lr': 0.001, 'target_network_update_freq': 0, 'tau': 0.002, 'use_huber': False, 'huber_threshold': 1.0, 'l2_reg': 1e-06, 'grad_clip': None, 'learning_starts': 1500, 'worker_side_prioritization': False, 'worker_index': 1}' with type '<class 'dict'>'

@carlorop,

Your code is not registering a custom policy; it is registering a custom model. Those are different: the policy defines the loss and how actions are computed, and it holds instances of the model (the neural network).

Do you want a custom policy or a custom model?
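
If all you want is DDPG's standard policy with your own network plugged in, a minimal sketch (assuming Ray 1.x and the same spaces and policy_mapping function from your snippet; shown here as a plain RLlib config dict, the same keys go into the config section of your experiment_params) would leave the policy class as None in the multiagent spec, so each policy falls back to the trainer's default DDPG policy, and only reference the registered custom model via "custom_model". The first element of each "policies" entry has to be a Policy class or None, not a ModelV2 class like DDPGTFModel, which is why its constructor is being called with the wrong arguments:

from ray.rllib.models import ModelCatalog
from ray.rllib.policy.policy import PolicySpec

from custom_ddpg import DDPGTFModel  # a ModelV2 subclass, not a Policy

# Register the network so the DDPG policy can build it by name.
ModelCatalog.register_custom_model("my_tf_model", DDPGTFModel)

config = {
    "env": env_name,
    "model": {
        "custom_model": "my_tf_model",
        "fcnet_hiddens": [1028, 512, 512, 128],
        "fcnet_activation": "swish",
    },
    "multiagent": {
        "policies": {
            # policy_class=None -> use the trainer's default (DDPG) policy.
            "agent_DA": PolicySpec(None, env.observation_space_DA, env.action_space_DA, {}),
            "agent_BM_Ask": PolicySpec(None, env.observation_space_BM, env.action_space_BM, {}),
            "agent_BM_Bid": PolicySpec(None, env.observation_space_BM, env.action_space_BM, {}),
        },
        "policy_mapping_fn": lambda agent_id: policy_mapping(agent_id),
    },
}

Plain (policy_class, obs_space, act_space, config) tuples work the same way; PolicySpec just makes the None explicit. If I remember correctly, DDPG only routes observations through the "model" section (and hence the custom model) when "use_state_preprocessor": True, which your dumped config already has. If you later need a truly custom policy (e.g. a modified loss), that would mean deriving from DDPG's policy class (DDPGTFPolicy) rather than from TFModelV2.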

Hi @mannyv

I am not familiar with the difference between a model and a policy. My problem is that my agents tend to get stuck at the boundaries of the action range, even when those actions are far from optimal. I suspect NN saturation, so I wanted to print the outputs of the layers to check them. Should I modify the model or the policy in this case?

Some docs that may help:

policies:
https://docs.ray.io/en/latest/rllib-concepts.html
https://docs.ray.io/en/latest/rllib-package-ref.html#module-ray.rllib.policy

models:
https://docs.ray.io/en/latest/rllib-models.html
https://docs.ray.io/en/latest/rllib-package-ref.html#module-ray.rllib.models
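
For checking saturation specifically, a custom model is usually enough: the model is the neural network itself, while the policy wraps it with the DDPG loss, exploration, and action computation. Below is a rough, generic TFModelV2 sketch (not DDPG-specific; the class name and the way activations are stored are just illustrative) that keeps its hidden-layer outputs around so you can log, say, the max absolute activation per layer:

import numpy as np

from ray.rllib.models.tf.tf_modelv2 import TFModelV2
from ray.rllib.utils.framework import try_import_tf

tf1, tf, tfv = try_import_tf()


class InspectableFCNet(TFModelV2):
    """Fully connected net that also exposes its hidden-layer activations."""

    def __init__(self, obs_space, action_space, num_outputs, model_config, name):
        super().__init__(obs_space, action_space, num_outputs, model_config, name)
        hiddens = model_config.get("fcnet_hiddens", [256, 256])
        activation = model_config.get("fcnet_activation", "relu")

        inputs = tf.keras.layers.Input(shape=(int(np.prod(obs_space.shape)),))
        last = inputs
        hidden_outs = []
        for i, size in enumerate(hiddens):
            last = tf.keras.layers.Dense(size, activation=activation, name=f"fc_{i}")(last)
            hidden_outs.append(last)
        out = tf.keras.layers.Dense(num_outputs, activation=None, name="out")(last)

        # The Keras model returns the final output plus every hidden activation.
        self.base_model = tf.keras.Model(inputs, [out] + hidden_outs)

    def forward(self, input_dict, state, seq_lens):
        outputs = self.base_model(input_dict["obs_flat"])
        self._last_hidden = outputs[1:]  # inspect these to look for saturation
        return outputs[0], state

Note that with "framework": "tf" (graph mode, which is what your config shows) these are symbolic tensors, so you would log them with tf.print or a session run; with "framework": "tfe" or "tf2" they are eager tensors you can print or convert with .numpy() directly.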