How to use a RecSim env with SlateQ and Bandits?

How severely does this issue affect your experience of using Ray?

  • High: It blocks me from completing my task.

My env details are:
ray == 1.13.0
gym == 0.15.3
recsim == 0.2.4

My code:

import ray
from ray.rllib.agents.slateq import SlateQTrainer
from ray.rllib.examples.env.recommender_system_envs_with_recsim import InterestEvolutionRecSimEnv

config = {
    "env": InterestEvolutionRecSimEnv,
    "env_config": {
        "num_candidates": 10,  # 10 candidate documents per timestep
        "slate_size": 1,  # recommend a single item at a time
        "wrap_for_bandits": False,  # keep the native "doc"/"user" observation dict
        "resample_documents": True,  # draw new candidate documents each timestep
        "convert_to_discrete_action_space": False,  # keep MultiDiscrete slate actions for SlateQ
    },
    "framework": "torch",
}
trainer = SlateQTrainer(config=config)


I keep getting an error that `observation_space["doc"]` has no `len()`.

Here is the error message:

NotImplementedError                       Traceback (most recent call last)
~/SageMaker/python_for_rl_venv/lib/python3.7/site-packages/ray/rllib/agents/trainer.py in setup(self, config)
    934         try:
--> 935             self._init(self.config, self.env_creator)
    936         # New design: Override `Trainable.setup()` (as indented by Trainable)

~/SageMaker/python_for_rl_venv/lib/python3.7/site-packages/ray/rllib/agents/trainer.py in _init(self, config, env_creator)
   1073     def _init(self, config: TrainerConfigDict, env_creator: EnvCreator) -> None:
-> 1074         raise NotImplementedError
   1075 

NotImplementedError: 

During handling of the above exception, another exception occurred:

TypeError                                 Traceback (most recent call last)
/tmp/ipykernel_82885/3574079892.py in <module>
     14     "framework": "torch",
     15 }
---> 16 trainer = SlateQTrainer(config=config)
     17 
     18 

~/SageMaker/python_for_rl_venv/lib/python3.7/site-packages/ray/rllib/agents/trainer.py in __init__(self, config, env, logger_creator, remote_checkpoint_dir, sync_function_tpl)
    869 
    870         super().__init__(
--> 871             config, logger_creator, remote_checkpoint_dir, sync_function_tpl
    872         )
    873 

~/SageMaker/python_for_rl_venv/lib/python3.7/site-packages/ray/tune/trainable.py in __init__(self, config, logger_creator, remote_checkpoint_dir, sync_function_tpl)
    154         start_time = time.time()
    155         self._local_ip = self.get_current_ip()
--> 156         self.setup(copy.deepcopy(self.config))
    157         setup_time = time.time() - start_time
    158         if setup_time > SETUP_TIME_THRESHOLD:

~/SageMaker/python_for_rl_venv/lib/python3.7/site-packages/ray/rllib/agents/trainer.py in setup(self, config)
    955                 num_workers=self.config["num_workers"],
    956                 local_worker=True,
--> 957                 logdir=self.logdir,
    958             )
    959             # By default, collect metrics for all remote workers.

~/SageMaker/python_for_rl_venv/lib/python3.7/site-packages/ray/rllib/evaluation/worker_set.py in __init__(self, env_creator, validate_env, policy_class, trainer_config, num_workers, local_worker, logdir, _setup)
    176                     num_workers=num_workers,
    177                     config=self._local_config,
--> 178                     spaces=spaces,
    179                 )
    180 

~/SageMaker/python_for_rl_venv/lib/python3.7/site-packages/ray/rllib/evaluation/worker_set.py in _make_worker(self, cls, env_creator, validate_env, policy_cls, worker_index, num_workers, recreated_worker, config, spaces)
    670             extra_python_environs=extra_python_environs,
    671             spaces=spaces,
--> 672             disable_env_checking=config["disable_env_checking"],
    673         )
    674 

~/SageMaker/python_for_rl_venv/lib/python3.7/site-packages/ray/rllib/evaluation/rollout_worker.py in __init__(self, env_creator, validate_env, policy_spec, policy_mapping_fn, policies_to_train, tf_session_creator, rollout_fragment_length, count_steps_by, batch_mode, episode_horizon, preprocessor_pref, sample_async, compress_observations, num_envs, observation_fn, observation_filter, clip_rewards, normalize_actions, clip_actions, env_config, model_config, policy_config, worker_index, num_workers, recreated_worker, record_env, log_dir, log_level, callbacks, input_creator, input_evaluation, output_creator, remote_worker_envs, remote_env_batch_wait_ms, soft_horizon, no_done_at_end, seed, extra_python_environs, fake_sampler, spaces, policy, monitor_path, disable_env_checking)
    632             policy_config,
    633             session_creator=tf_session_creator,
--> 634             seed=seed,
    635         )
    636 

~/SageMaker/python_for_rl_venv/lib/python3.7/site-packages/ray/rllib/evaluation/rollout_worker.py in _build_policy_map(self, policy_dict, policy_config, session_creator, seed)
   1787             # Create the actual policy object.
   1788             self.policy_map.create_policy(
-> 1789                 name, orig_cls, obs_space, act_space, conf, merged_conf
   1790             )
   1791 

~/SageMaker/python_for_rl_venv/lib/python3.7/site-packages/ray/rllib/policy/policy_map.py in create_policy(self, policy_id, policy_cls, observation_space, action_space, config_override, merged_config)
    150         else:
    151             class_ = policy_cls
--> 152             self[policy_id] = class_(observation_space, action_space, merged_config)
    153 
    154         # Store spec (class, obs-space, act-space, and config overrides) such

~/SageMaker/python_for_rl_venv/lib/python3.7/site-packages/ray/rllib/policy/policy_template.py in __init__(self, obs_space, action_space, config)
    277             elif make_model_and_action_dist:
    278                 self.model, dist_class = make_model_and_action_dist(
--> 279                     self, obs_space, action_space, config
    280                 )
    281             # Use default model and default action dist.

~/SageMaker/python_for_rl_venv/lib/python3.7/site-packages/ray/rllib/agents/slateq/slateq_torch_policy.py in build_slateq_model_and_distribution(policy, obs_space, action_space, config)
     53         model_config=config["model"],
     54         name="slateq_model",
---> 55         fcnet_hiddens_per_candidate=config["fcnet_hiddens_per_candidate"],
     56     )
     57 

~/SageMaker/python_for_rl_venv/lib/python3.7/site-packages/ray/rllib/agents/slateq/slateq_torch_model.py in __init__(self, obs_space, action_space, num_outputs, model_config, name, fcnet_hiddens_per_candidate, double_q)
    169         self.choice_model = UserChoiceModel()
    170 
--> 171         self.q_model = QValueModel(self.obs_space, fcnet_hiddens_per_candidate)
    172 
    173     def get_q_values(self, user: TensorType, docs: List[TensorType]) -> TensorType:

~/SageMaker/python_for_rl_venv/lib/python3.7/site-packages/ray/rllib/agents/slateq/slateq_torch_model.py in __init__(self, obs_space, fcnet_hiddens_per_candidate)
     33         self.orig_obs_space = obs_space
     34         self.embedding_size = self.orig_obs_space["doc"]["0"].shape[0]
---> 35         self.num_candidates = len(self.orig_obs_space["doc"])
     36         assert self.orig_obs_space["user"].shape[0] == self.embedding_size
     37 

TypeError: object of type 'Dict' has no len()

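If I read the traceback right, the failure reduces to RLlib's SlateQ model calling len() on the gym Dict space that holds the candidate documents (obs_space["doc"]). A minimal sketch of that call, assuming gym 0.15.3's Dict space does not implement __len__ (while its underlying .spaces dict does):

import numpy as np
from gym.spaces import Box, Dict

# Mimic the "doc" sub-space RecSim builds: one Box per candidate document.
doc_space = Dict({
    str(i): Box(low=-1.0, high=1.0, shape=(20,), dtype=np.float32)
    for i in range(10)
})

print(len(doc_space.spaces))  # 10 -- the dict of sub-spaces has a length
print(len(doc_space))         # TypeError if this gym version's Dict lacks __len__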
I am also getting the same error when using BanditLinUCBTrainer with the same RecSim env.
My code:

from ray.rllib.agents.bandit import BanditLinUCBTrainer
from ray.rllib.examples.env.recommender_system_envs_with_recsim import InterestEvolutionRecSimEnv

bandits_config = {
    "env": InterestEvolutionRecSimEnv,
    "env_config": {
        "num_candidates": 10,  # action_space = Discrete(10) -> int 0-9
        "slate_size": 1,  # only a single-item recommendation
        # False: re-use the same candidate documents each timestep;
        # True: resample the candidate documents at each timestep.
        "resample_documents": True,
        # Convert MultiDiscrete actions to Discrete (flatten the action space),
        # e.g. slate_size=2 and num_candidates=10:
        # MultiDiscrete([10, 10]) -> Discrete(100)  # 10x10
        "convert_to_discrete_action_space": True,
        # Convert the "doc" key into an "item" key.
        "wrap_for_bandits": True,
        "seed": 0,
    },
    # Trainer-level settings:
    "seed": 0,
    # Reward metrics are smoothed as a mean over this many recent episodes.
    "metrics_num_episodes_for_smoothing": 100,
    # Generate a result dict every single timestep.
    "timesteps_per_iteration": 1,
}

bandits_trainer = BanditLinUCBTrainer(config=bandits_config)
bandits_trainer

The error is:

---------------------------------------------------------------------------
NotImplementedError                       Traceback (most recent call last)
~/SageMaker/python_for_rl_venv/lib/python3.7/site-packages/ray/rllib/agents/trainer.py in setup(self, config)
    934         try:
--> 935             self._init(self.config, self.env_creator)
    936         # New design: Override `Trainable.setup()` (as indented by Trainable)

~/SageMaker/python_for_rl_venv/lib/python3.7/site-packages/ray/rllib/agents/trainer.py in _init(self, config, env_creator)
   1073     def _init(self, config: TrainerConfigDict, env_creator: EnvCreator) -> None:
-> 1074         raise NotImplementedError
   1075 

NotImplementedError: 

During handling of the above exception, another exception occurred:

TypeError                                 Traceback (most recent call last)
/tmp/ipykernel_82885/1175594674.py in <module>
     26 }
     27 
---> 28 bandits_trainer = BanditLinUCBTrainer(config = bandits_config)
     29 bandits_trainer

~/SageMaker/python_for_rl_venv/lib/python3.7/site-packages/ray/rllib/agents/trainer.py in __init__(self, config, env, logger_creator, remote_checkpoint_dir, sync_function_tpl)
    869 
    870         super().__init__(
--> 871             config, logger_creator, remote_checkpoint_dir, sync_function_tpl
    872         )
    873 

~/SageMaker/python_for_rl_venv/lib/python3.7/site-packages/ray/tune/trainable.py in __init__(self, config, logger_creator, remote_checkpoint_dir, sync_function_tpl)
    154         start_time = time.time()
    155         self._local_ip = self.get_current_ip()
--> 156         self.setup(copy.deepcopy(self.config))
    157         setup_time = time.time() - start_time
    158         if setup_time > SETUP_TIME_THRESHOLD:

~/SageMaker/python_for_rl_venv/lib/python3.7/site-packages/ray/rllib/agents/trainer.py in setup(self, config)
    955                 num_workers=self.config["num_workers"],
    956                 local_worker=True,
--> 957                 logdir=self.logdir,
    958             )
    959             # By default, collect metrics for all remote workers.

~/SageMaker/python_for_rl_venv/lib/python3.7/site-packages/ray/rllib/evaluation/worker_set.py in __init__(self, env_creator, validate_env, policy_class, trainer_config, num_workers, local_worker, logdir, _setup)
    176                     num_workers=num_workers,
    177                     config=self._local_config,
--> 178                     spaces=spaces,
    179                 )
    180 

~/SageMaker/python_for_rl_venv/lib/python3.7/site-packages/ray/rllib/evaluation/worker_set.py in _make_worker(self, cls, env_creator, validate_env, policy_cls, worker_index, num_workers, recreated_worker, config, spaces)
    670             extra_python_environs=extra_python_environs,
    671             spaces=spaces,
--> 672             disable_env_checking=config["disable_env_checking"],
    673         )
    674 

~/SageMaker/python_for_rl_venv/lib/python3.7/site-packages/ray/rllib/evaluation/rollout_worker.py in __init__(self, env_creator, validate_env, policy_spec, policy_mapping_fn, policies_to_train, tf_session_creator, rollout_fragment_length, count_steps_by, batch_mode, episode_horizon, preprocessor_pref, sample_async, compress_observations, num_envs, observation_fn, observation_filter, clip_rewards, normalize_actions, clip_actions, env_config, model_config, policy_config, worker_index, num_workers, recreated_worker, record_env, log_dir, log_level, callbacks, input_creator, input_evaluation, output_creator, remote_worker_envs, remote_env_batch_wait_ms, soft_horizon, no_done_at_end, seed, extra_python_environs, fake_sampler, spaces, policy, monitor_path, disable_env_checking)
    504         ):
    505             # Run the `env_creator` function passing the EnvContext.
--> 506             self.env = env_creator(copy.deepcopy(self.env_context))
    507 
    508         if self.env is not None:

~/SageMaker/python_for_rl_venv/lib/python3.7/site-packages/ray/rllib/agents/trainer.py in <lambda>(cfg)
   2773                 register_env(name, lambda cfg: _wrapper.remote(cfg))
   2774             else:
-> 2775                 register_env(name, lambda cfg: env_object(cfg))
   2776             return name
   2777         elif env_object is None:

~/SageMaker/python_for_rl_venv/lib/python3.7/site-packages/ray/rllib/env/wrappers/recsim.py in __init__(self, config)
    258                 gym_env,
    259                 config["convert_to_discrete_action_space"],
--> 260                 config["wrap_for_bandits"],
    261             )
    262             # Call the super (Wrapper constructor) passing it the created env.

~/SageMaker/python_for_rl_venv/lib/python3.7/site-packages/ray/rllib/env/wrappers/recsim.py in recsim_gym_wrapper(recsim_gym_env, convert_to_discrete_action_space, wrap_for_bandits)
    193         env = MultiDiscreteToDiscreteActionWrapper(env)
    194     if wrap_for_bandits:
--> 195         env = RecSimObservationBanditWrapper(env)
    196     return env
    197 

~/SageMaker/python_for_rl_venv/lib/python3.7/site-packages/ray/rllib/env/wrappers/recsim.py in __init__(self, env)
     80         obs_space = self.env.observation_space
     81 
---> 82         num_items = len(obs_space["doc"])
     83         embedding_dim = next(iter(obs_space["doc"].values())).shape[-1]
     84         self.observation_space = Dict(

TypeError: object of type 'Dict' has no len()

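To take the trainer out of the picture, I believe the same error can be reproduced by constructing the wrapped env directly with the same env_config dict, since the traceback shows the failure inside RecSimObservationBanditWrapper.__init__ during env creation. A sketch (assuming the env class accepts a plain config dict):

from ray.rllib.examples.env.recommender_system_envs_with_recsim import (
    InterestEvolutionRecSimEnv,
)

# The TypeError should already be raised here, inside
# RecSimObservationBanditWrapper, before any trainer is involved.
env = InterestEvolutionRecSimEnv({
    "num_candidates": 10,
    "slate_size": 1,
    "resample_documents": True,
    "convert_to_discrete_action_space": True,
    "wrap_for_bandits": True,
    "seed": 0,
})
print(env.observation_space)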
I am not sure what I am doing wrong here.