Ray Tune tensor([[nan]]) for HRL (custom MultiAgentEnv)

Hi there,
I am an RLlib newbie (coming from SB3). I have created a custom MultiAgentEnv: an HRL setup with a manager sitting on top of 13 individual workers doing stock trading. My tune.run continuously fails with tensor([[nan]]) at some point. I have already reduced the learning rate, and the trials now run for longer, but eventually they always fail and I cannot narrow down where the issue is coming from.
Below is the code I use for the tune.run call. Is there something fundamentally wrong with it?
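In case it helps narrow things down, this is the kind of finiteness check I could drop into the env's step() to see whether the NaNs already originate in my observations/rewards (the check_finite helper below is purely illustrative, not part of my actual code, and assumes obs/rewards are plain arrays or scalars):

import numpy as np

def check_finite(name, agent_dict):
    # agent_dict: {agent_id: array or float}, as returned by MultiAgentEnv.step()
    for agent_id, value in agent_dict.items():
        if not np.all(np.isfinite(value)):
            raise ValueError(f"non-finite {name} from agent {agent_id}: {value}")

# e.g. inside HRL.step(), just before returning:
#     check_finite("obs", obs)
#     check_finite("reward", rewards)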

Thank you so much for your help!!

from ray import tune
from ray.air import CheckpointConfig
from ray.rllib.policy.policy import PolicySpec
from ray.tune import CLIReporter
from ray.tune.registry import register_env

env = HRL()

def env_creator(env_config):
    return HRL(env_config)

register_env("hrl", env_creator)

def create_policy_spec(worker_id):
    # print(f"Creating policy for {worker_id} with obs space {env.observation_space[worker_id]} and action space {env.action_space[worker_id]}")
    return PolicySpec(
        observation_space=env.observation_space[worker_id],
        action_space=env.action_space[worker_id],
        config={},
    )

manager_policy_spec = PolicySpec(
    observation_space=env.observation_space["manager"],
    action_space=env.action_space["manager"],
    config={},
)

policies = {
    "manager_policy": manager_policy_spec,
}

for worker_id in env.workers:
    policies[worker_id] = create_policy_spec(worker_id)

def policy_mapping_fn(agent_id, episode=None, worker=None, **kwargs):
    if agent_id == "manager":
        # print(f"!!! policy mapping manager: {agent_id}")
        return "manager_policy"
    elif agent_id in env.workers:
        return agent_id
    else:
        print("default policy triggered")
        return "default_policy"

param_space = {
    "env": "hrl",
    "multiagent": {
        "policies": policies,
        "policy_mapping_fn": policy_mapping_fn,
    },
    "rollout_fragment_length": "auto",
    "lr": tune.uniform(1e-5, 1e-4),
    "gamma": tune.uniform(0.95, 0.9999),
    "lambda": tune.uniform(0.9, 1.0),
    "entropy_coeff": tune.uniform(0.01, 0.1),
    "vf_loss_coeff": tune.uniform(0.1, 0.3),
    "num_workers": 4,
    "log_level": "ERROR",
    "output": "logdir",
    "monitor": True,
}

analysis = tune.run(
    "A2C",
    metric="episode_reward_mean",
    num_samples=10,
    mode="max",
    config=param_space,
    storage_path="/Volumes/SSD980/ray/results/test_tunerun3",
    search_alg=None,
    scheduler=None,
    progress_reporter=CLIReporter(max_progress_rows=10, max_report_frequency=120),
    max_concurrent_trials=1,
    # checkpoint_config not checked yet
    checkpoint_config=CheckpointConfig(
        num_to_keep=3,
        checkpoint_score_attribute="episode_reward_mean",
        checkpoint_score_order="max",
        checkpoint_frequency=10,
    ),
)
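For completeness, two knobs I could also try are gradient clipping and a fixed seed (grad_clip and seed are standard RLlib config keys; the values below are placeholders, not tuned):

# Stabilisation tweaks under consideration (values are illustrative):
param_space["grad_clip"] = 10.0  # bound gradient norms so one bad batch cannot blow up the weights
param_space["seed"] = 42         # make failing trials reproducible while debugging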