RLlib doesn't support tuple action spaces with a Unity ML-Agents environment

How severely does this issue affect your experience of using Ray?

Hello everyone, I have encountered a problem. I have created a Unity ML-Agents environment as follows:

Link to the main code notebook: MultiAgentContiniousPCGRL/Untitled2.ipynb at main · danianamir/MultiAgentContiniousPCGRL (github.com)

import random
import time
from typing import Optional

import numpy as np

import mlagents_envs.exception
from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.base_env import ActionTuple

from ray.rllib.env.multi_agent_env import MultiAgentEnv


class Unity3DEnv(MultiAgentEnv):

    _BASE_PORT_EDITOR = 5004
    _BASE_PORT_ENVIRONMENT = 5005
    _WORKER_ID = 0

    def __init__(
        self,
        file_name: Optional[str] = None,
        port: Optional[int] = None,
        seed: int = 0,
        no_graphics: bool = False,
        timeout_wait: int = 300,
        episode_horizon: Optional[int] = None,
    ):

        self._skip_env_checking = True
        super().__init__()

        if file_name is None:
            print(
                "No game binary provided, will use a running Unity editor "
                "instead.\nMake sure you are pressing the Play (|>) button in "
                "your editor to start."
            )

        # Try connecting to the Unity3D game instance. If the port is blocked,
        # keep retrying (on a new worker id) until the connection succeeds.
        port_ = None
        while True:
            # Sleep for random time to allow for concurrent startup of many
            # environments (num_workers >> 1). Otherwise, would lead to port
            # conflicts sometimes.
            if port_ is not None:
                time.sleep(random.randint(1, 10))
            port_ = port or (
                self._BASE_PORT_ENVIRONMENT if file_name else self._BASE_PORT_EDITOR
            )
            # cache the worker_id and
            # increase it for the next environment
            worker_id_ = Unity3DEnv._WORKER_ID if file_name else 0
            Unity3DEnv._WORKER_ID += 1
            try:
                self.unity_env = UnityEnvironment(
                    file_name=file_name,
                    worker_id=worker_id_,
                    base_port=port_,
                    seed=seed,
                    no_graphics=no_graphics,
                    timeout_wait=timeout_wait,
                )

                print("Created UnityEnvironment for port {}".format(port_ + worker_id_))
            except mlagents_envs.exception.UnityWorkerInUseException:
                pass
            else:
                break

        self.episode_horizon = episode_horizon
        self.episode_timesteps = 0

    def step(self, action_dict):

        # For each behavior, gather the actions of its agents and send them
        # to Unity as a single ActionTuple.
        for behavior_name in self.unity_env.behavior_specs:
            actions = []
            for agent_id in self.unity_env.get_steps(behavior_name)[0].agent_id:
                key = behavior_name + "_{}".format(agent_id)
                actions.append(action_dict[key])

            # Each behavior has a single agent here, so actions[0] is that
            # agent's (continuous, discrete) tuple action.
            if behavior_name == "instantiator Behaviour?team=0":
                # ML-Agents expects 2D arrays of shape (num_agents, action_size).
                continuous_action = np.array([actions[0][0]])
                print(continuous_action)
                discrete_action = np.array([[actions[0][1]]])
                print(discrete_action)
                action_tuple = ActionTuple(
                    continuous=continuous_action, discrete=discrete_action
                )

            if behavior_name == "modifyer Behavior?team=0":
                continuous_action = np.array([actions[0][0]])
                print(continuous_action)
                discrete_action = np.array([actions[0][1]])
                print(discrete_action)
                action_tuple = ActionTuple(
                    continuous=continuous_action, discrete=discrete_action
                )

            # Send the batched actions for this behavior once per step.
            self.unity_env.set_actions(behavior_name, action_tuple)

        self.unity_env.step()
        self.episode_timesteps += 1
        obs, rewards, terminateds, truncateds, infos = self._get_step_results()
        return obs, rewards, terminateds, truncateds, infos

    def reset(self, *, seed=None, options=None):
        self.episode_timesteps = 0
        self.unity_env.reset()
        obs, _, _, _, infos = self._get_step_results()
        return obs, infos

    def _get_step_results(self):

        # First, initialize the dicts that step() returns.
        obs = {}
        rewards = {}
        terminated = {}
        truncated = {}
        infos = {}

        num_active = 0
        num_done = 0
        num_all = 0

        # Go through all behaviors. get_steps() returns:
        #  - decision_steps: the batch of agents (sharing this behavior) that
        #    request an action this step.
        #  - terminal_steps: the batch of agents that ended their episode.
        for behavior_name in self.unity_env.behavior_specs:
            decision_steps, terminal_steps = self.unity_env.get_steps(behavior_name)
            num_active = num_active + len(decision_steps)
            num_done = num_done + len(terminal_steps)
            num_all = num_active + num_done

            # Set the obs/reward from decision_steps for each agent, like:
            # {"behavior_name_agent_id": numpy array}
            for agent_id, idx in decision_steps.agent_id_to_index.items():
                key = behavior_name + "_{}".format(agent_id)
                os = tuple(o[idx] for o in decision_steps.obs)
                os = os[0] if len(os) == 1 else os
                obs[key] = os
                rewards[key] = (
                    decision_steps.reward[idx] + decision_steps.group_reward[idx]
                )

            # Set the obs/reward from terminal_steps for each agent, like:
            # {"behavior_name_agent_id": numpy array}
            for agent_id, idx in terminal_steps.agent_id_to_index.items():
                key = behavior_name + "_{}".format(agent_id)
                if key not in obs:
                    os = tuple(o[idx] for o in terminal_steps.obs)
                    obs[key] = os = os[0] if len(os) == 1 else os
                rewards[key] = (
                    terminal_steps.reward[idx] + terminal_steps.group_reward[idx]
                )

        # infos = {"active": num_active, "done": num_done, "all": num_all}

        # Set terminated["__all__"] to True only once all agents have ended
        # their own episodes.
        if num_active > 0:
            terminated["__all__"] = False
        else:
            terminated["__all__"] = True

        # Set truncated["__all__"] if the env reached the episode horizon.
        if self.episode_timesteps == self.episode_horizon:
            truncated["__all__"] = True
        else:
            truncated["__all__"] = False

        return obs, rewards, terminated, truncated, infos

Then I used a RolloutWorker to sample randomly from the environment, like this:

from gymnasium import spaces

from ray.rllib.algorithms.algorithm_config import AlgorithmConfig
from ray.rllib.evaluation.rollout_worker import RolloutWorker
from ray.rllib.examples.policy.random_policy import RandomPolicy

continuous_action_space_initializer = spaces.Box(low=-100, high=100, shape=(2,), dtype=float)
discrete_action_space_initializer = spaces.Discrete(3)

continuous_action_space_modifier = spaces.Box(low=-100, high=100, shape=(2,), dtype=float)
multi_discrete_action_space_modifier = spaces.MultiDiscrete([6, 2])


policies = {
    "policy_initializer": (
        RandomPolicy,
        spaces.Box(float("-inf"), float("inf"), (28,)),
        spaces.Tuple((continuous_action_space_initializer, discrete_action_space_initializer)),
        AlgorithmConfig(),
    ),
    "policy_modifier": (
        RandomPolicy,
        spaces.Box(float("-inf"), float("inf"), (28,)),
        spaces.Tuple((continuous_action_space_modifier, multi_discrete_action_space_modifier)),
        AlgorithmConfig(),
    ),
}


def policy_mapping_fn(agent_id, episode, worker, **kwargs):
    if agent_id == 0:
        return "policy_initializer"
    else:
        return "policy_modifier"


config = (
    AlgorithmConfig()
    .multi_agent(policies=policies, policy_mapping_fn=policy_mapping_fn)
    .rollouts(rollout_fragment_length=10)
)

worker = RolloutWorker(
    env_creator=lambda _: Unity3DEnv(
        file_name="/content/MultiAgentContiniousPCGRL/unity_built/my_game.x86_64",
        episode_horizon=10,
    ),
    config=config,
)

multi_batch = worker.sample()

But when I run the above code, I get this error:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
/usr/local/lib/python3.10/dist-packages/tree/__init__.py in assert_same_structure(a, b, check_types)
    283   try:
--> 284     _tree.assert_same_structure(a, b, check_types)
    285   except (ValueError, TypeError) as e:

ValueError: The two structures don't have the same nested structure.

First structure: type=ndarray str=[[92.19893   -4.3250813]
 [ 3.         1.       ]]

Second structure: type=tuple str=(Box(-100.0, 100.0, (2,), float64), MultiDiscrete([6 2]))

More specifically: Substructure "type=tuple str=(Box(-100.0, 100.0, (2,), float64), MultiDiscrete([6 2]))" is a sequence, while substructure "type=ndarray str=[[92.19893   -4.3250813]
 [ 3.         1.       ]]" is not

During handling of the above exception, another exception occurred:

ValueError                                Traceback (most recent call last)
12 frames
/usr/local/lib/python3.10/dist-packages/tree/__init__.py in assert_same_structure(a, b, check_types)
    286     str1 = str(map_structure(lambda _: _DOT, a))
    287     str2 = str(map_structure(lambda _: _DOT, b))
--> 288     raise type(e)("%s\n"
    289                   "Entire first structure:\n%s\n"
    290                   "Entire second structure:\n%s"

ValueError: The two structures don't have the same nested structure.

First structure: type=ndarray str=[[92.19893   -4.3250813]
 [ 3.         1.       ]]

Second structure: type=tuple str=(Box(-100.0, 100.0, (2,), float64), MultiDiscrete([6 2]))

More specifically: Substructure "type=tuple str=(Box(-100.0, 100.0, (2,), float64), MultiDiscrete([6 2]))" is a sequence, while substructure "type=ndarray str=[[92.19893   -4.3250813]
 [ 3.         1.       ]]" is not
Entire first structure:
.
Entire second structure:
(., .)

What is the problem? I think the policy outputs its actions as a single numpy array, like:

[[92.19893 -4.3250813] [ 3. 1. ]]

while the environment's tuple action space expects the nested structure:

(Box(-100.0, 100.0, (2,), float64), MultiDiscrete([6 2]))
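
To illustrate what I mean, here is a small standalone sketch (not from my notebook) that reproduces the same structure check with the dm-tree package that appears in the traceback. I'm assuming the expected structure is the plain tuple of sub-spaces printed there:

import numpy as np
import tree  # dm-tree, the package raising the error in the traceback
from gymnasium import spaces

# The modifier policy's tuple action space, written as a plain Python tuple
# of sub-spaces (this is how it is printed in the traceback).
modifier_space = (
    spaces.Box(low=-100, high=100, shape=(2,), dtype=float),
    spaces.MultiDiscrete([6, 2]),
)

# A tuple-structured action: one Box sample and one MultiDiscrete sample.
tuple_action = (np.array([92.19893, -4.3250813]), np.array([3, 1]))

# A single stacked ndarray, which is what the policy output looks like above.
flat_action = np.array([[92.19893, -4.3250813], [3.0, 1.0]])

# Passes: both structures are tuples with two leaves.
tree.assert_same_structure(tuple_action, modifier_space)

# Raises the same ValueError as in my traceback: an ndarray is a single leaf,
# while the expected structure is a tuple of two sub-spaces.
tree.assert_same_structure(flat_action, modifier_space)

So the flat array fails the check even though it contains the right numbers.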

I would appreciate any guidance on how I can modify my environment to accept the policy output. Thank you.