When constructing dictionary via Iterator during tune.run, element skipped

Hi, I've run into a problem where, during training, an element is sometimes (though not consistently) skipped during the construction of a dictionary inside my MultiAgentEnv subclass. As a result I get None for that element, which causes tune.run to fail. I'm not sure what is causing this since it doesn't happen consistently; moreover, if I add print statements inside the environment, the problem seems to vanish, which makes me think it is some concurrency issue.

Any idea what I can do to fix this issue? I attach my code as well as the associated output:

import numpy as np
import gym
from gym import spaces
from gym.utils import seeding, EzPickle
from ray.rllib.utils.typing import MultiAgentDict, AgentID
from typing import Tuple, Dict, List
from gym.envs.registration import EnvSpec
from ray.rllib.env.multi_agent_env import MultiAgentEnv
import ray
from ray.tune.logger import pretty_print
from ray.tune.registry import register_env
from ray.rllib.agents import ppo
from ray.rllib.agents import ddpg
import ray.tune as tune

INITIAL_ASSET_HOLDINGS = 1
BORROW_LIM = -0.01
R_VALUE = 1.03
DELTA = 0.01
W_VALUE = 0.98
GAMMA = 2.0
AGENT_NUM = 1
N = 5
BETA = 0.95
ALPHA = 0.33
Z = 1.0
np.random.seed(2020)



class AiyagariEnvironment(gym.Env):
    """ An environment for value function sampling from a basic RA GE model with capital"""


    # idea pass assets to multiagent, and then return interest rate back to environment.
    metadata = {"render.modes": ["human"]}

    def __init__(self):
        super(AiyagariEnvironment, self).__init__()
        self.reward_range = (0, 100000)
        self.seed()
        # next period asset space bounds [borrow_lim, inf)
        self.action_space = spaces.Box(
            low=np.array([BORROW_LIM]), high=np.array([100000]), dtype=np.float32
        )
        # observation space -- everything the agent observes before making a new decision. Since r is
        # treated as fixed here, this is assets, prices, and income. Because assets act as a summary
        # statistic in this model, we only provide current-period values; this could be extended to
        # multiple periods later.
        self.observation_space = spaces.Box(
            low=np.array([BORROW_LIM, 0, 0]),
            high=np.array([100000, 100000, 100000]),
            dtype=np.float32,
        )
        self.assets = INITIAL_ASSET_HOLDINGS
        self.price = R_VALUE - DELTA
        self.W = W_VALUE
        self.current_step = 0
        self.cons = 0
        self.net_worth = self.assets * self.price
        self.reward = 0
        self.shock = np.exp(self.np_random.normal(0, 1))
        self.income = self.W * self.shock
        self.obs = np.array([self.assets, self.price, self.income])
        self.current_step = 0

    # resets state to initial value
    #  def u(cons):
    #      util = cons**(1-GAMMA)/(1-GAMMA)
    #      return utily
    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def reset(self):
        self.assets = INITIAL_ASSET_HOLDINGS
        self.price = R_VALUE - DELTA
        self.W = W_VALUE
        self.current_step = 0
        self.cons = 0
        self.net_worth = self.assets * self.price
        self.reward = 0
        self.shock = np.exp(self.np_random.normal(0, 1))
        self.income = self.W * self.shock
        self.obs = np.array([self.assets, self.price, self.income])
        self.current_step = 0
        # shifted exponential for time being, can impose own distribution with custom sampling later on.
        # for time being will use default distribution for sampling.
        return self.obs

    # updating function
    @property
    def n(self):
        return AGENT_NUM

    def step(self, action, R, W):
        self.current_step += 1
        self.price = R - DELTA
        self.W = W
        self.shock = np.exp(np.random.normal(0, 1))
        self.income = self.W * self.shock
        self.net_worth = (self.price) * self.assets + self.income
        if action in self.action_space:
            if action <= self.net_worth:
                self.assets = action + (self.price) * self.assets
                self.cons = self.net_worth - action
            else:
                self.assets = self.net_worth + (self.price) * self.assets
        else:
            raise ValueError(
                "Received invalid action={:f} which is not part of the action space".format(
                    action
                )
            )
        self.obs = np.array([self.assets, self.price, self.income])
        done = self.cons <= 0
        self.done = done
        if self.cons > 0:
            self.reward = (self.cons ** (1 - GAMMA) / (1 - GAMMA)).item()
        else:
            self.reward = -10000
        return self.obs, self.reward, self.done, {}

    def render(self, mode="human", close=False):
        # work on render to make graph.
        results = str(
            f"Step: {self.current_step}\n"
            f"Assets: {self.assets}\n"
            f"Income: {self.income}\n"
            f"Consumption: {self.cons}\n"
            f"Net worth: {self.net_worth}\n"
            f"Interest Rate: {self.price}\n"
            f"Wage Rate: {self.W}\n"
            f"Utility: {self.reward}\n"
        )
        return results


class AiyagariMultiAgentEnv(MultiAgentEnv):
    def __init__(self, num):
        self.agents = [AiyagariEnvironment() for _ in range(num)]
        self.dones = set()
        # needs to be 7-dimensional because the aggregate variables are appended to each agent's observation.
        self.observation_space = gym.spaces.Box(
            low=np.array([BORROW_LIM, 0, 0, 0, 0, 0, 0]),
            high=np.array([100000, 100000, 100000, 100000, 100000, 100000, 100000]),
            dtype=np.float32,
        )
        self.action_space = gym.spaces.Box(
            low=np.array([BORROW_LIM]), high=np.array([100000]), dtype=np.float32
        )
        self.resetted = False
        self.num = num

    def reset(self):
        self.resetted = True
        self.dones = set()
        dict_agents = {str(i): np.zeros(7) for i, a in enumerate(self.agents)}
        # initial holdings
        self.K = sum(self.agents[i].assets for i in range(self.num))
        self.N = self.num
        self.R = Z * (1 - ALPHA) * (self.N / self.K) ** (ALPHA)
        self.W = Z * (ALPHA) * (self.K / self.N) ** (1 - ALPHA)
        agg_obs_list = [self.K, self.N, self.R, self.W]
        for i in range(self.num):
            dict_agents[str(i)][0:3] = self.agents[i].reset()
            dict_agents[str(i)][3:7] = np.array(agg_obs_list)
        return dict_agents

    def step(self, action_dict):
        keylist = [str(i) for i in range(self.num)]
        obs = dict.fromkeys(keylist)
        rew = dict.fromkeys(keylist)
        done = dict.fromkeys(keylist)
        info = dict.fromkeys(keylist)
        obs_temp_list = dict.fromkeys(keylist)
        obs_temp = np.zeros(7)

        for i, action in action_dict.items():
            # get each agent's observation (tomorrow's capital); use it to construct tomorrow's prices, then feed those back in.
            obs_temp[0:3], rew[str(i)], done[str(i)], info[str(i)] = self.agents[int(i)].step(
                action, self.R, self.W
            ) 
            obs_temp_list[i] = obs_temp[0:7]
            # append aggregate observations to each i.
            if done[str(i)]:
                self.dones.add(str(i))
        # construct and append aggregate states
        try:
            self.K = sum(obs_val[0] for obs_val in obs_temp_list.values())
        except:
            print('Compare 1')
            print({i: self.agents[int(i)].step(action,self.R,self.W)[0] for i, action in action_dict.items()})
            print('Compare 2')
            print(obs_temp_list)
            print('Action Dict')
            print(action_dict)
            print(self.agents[0])
        self.N = self.num
        self.R = Z * (1 - ALPHA) * (self.N / self.K) ** (ALPHA)
        self.W = Z * (ALPHA) * (self.K / self.N) ** (1 - ALPHA)
        for i in range(0, self.num):
            obs_temp_list[str(i)][3:7] = [self.K, self.N, self.R, self.W]
            obs[str(i)] = obs_temp_list[str(i)]
        done["__all__"] = len(self.dones) == len(self.agents)
        return obs, rew, done, info
    def render(self, mode="human", close=True):
        results_n = []
        for agent in self.agents:
            # results += env.render(mode, close)
            results = agent.render(mode, close)
            results_n.append(results)
        return results_n


if __name__ == "__main__":

    env = AiyagariMultiAgentEnv(5)
    obs = env.reset()
    for idx, item in enumerate(env.render(), start=1):
        print(f"Agent: {idx} \n")
        print(item)
    print(env.action_space)

    tune.register_env("my_env", lambda config: AiyagariMultiAgentEnv(5))

    #obs_space = env.observation_space
    #act_spc = env.action_space
    #policies = {agent: (None, obs_space, act_spc, {}) for agent in env.agents}
    ray.init()
    config = {
            "env": "my_env",
            # General
            "num_gpus": 0,
            "num_workers": 2,
            # Method specific
            }  


    analysis=tune.run(
        "PPO",
        stop={"training_iteration": 10},
        checkpoint_freq=10,
        config=config,
        checkpoint_at_end = True
     )
    checkpoints = analysis.get_trial_checkpoints_paths(
        trial=analysis.get_best_trial("episode_reward_mean"),
        metric="episode_reward_mean")

    agent = ppo.PPOTrainer(config=config, env="my_env")
    agent.restore(checkpoints[-1][0])
    episode_reward = 0
    done = False
    obs = env.reset()
    while not done:
        action = agent.compute_action(obs)
        obs, reward, done, info = env.step(action)
        episode_reward += reward

and output:

  File "~/aiyagari.py", line 213, in step
    obs_temp_list[str(i)][3:7] = [self.K, self.N, self.R, self.W]
TypeError: 'NoneType' object does not support item assignment
Result for PPO_my_env_5f4a6_00000:
  {}
  
== Status ==
Memory usage on this node: 19.0/32.0 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/12 CPUs, 0/0 GPUs, 0.0/8.2 GiB heap, 0.0/2.83 GiB objects
Result logdir: /Users/brandonkaplowitz/ray_results/PPO
Number of trials: 1/1 (1 ERROR)
+------------------------+----------+-------+
| Trial name             | status   | loc   |
|------------------------+----------+-------|
| PPO_my_env_5f4a6_00000 | ERROR    |       |
+------------------------+----------+-------+
Number of errored trials: 1
+------------------------+--------------+------------------------------------------------------------------------------------------------+
| Trial name             |   # failures | error file                                                                                     |
|------------------------+--------------+------------------------------------------------------------------------------------------------|
| PPO_my_env_5f4a6_00000 |            1 | /Users/brandonkaplowitz/ray_results/PPO/PPO_my_env_5f4a6_00000_0_2021-06-18_17-30-15/error.txt |
+------------------------+--------------+------------------------------------------------------------------------------------------------+

(pid=30050) Compare 1
(pid=30050) {'0': array([array([0.6450821], dtype=float32), 0.7623486417300875,
(pid=30050)        0.08079125864003082], dtype=object), '2': array([array([1.3112853], dtype=float32), 0.7623486417300875,
(pid=30050)        0.111011391232876], dtype=object), '3': array([array([0.8796276], dtype=float32), 0.7623486417300875,
(pid=30050)        0.27394019528008057], dtype=object), '4': array([array([0.3601406], dtype=float32), 0.7623486417300875,
(pid=30050)        0.2598886475183971], dtype=object)}
(pid=30050) Compare 2
(pid=30050) {'0': array([0.48552665, 0.76234864, 0.24534271, 0.        , 0.        ,
(pid=30050)        0.        , 0.        ]), '1': None, '2': array([0.48552665, 0.76234864, 0.24534271, 0.        , 0.        ,
(pid=30050)        0.        , 0.        ]), '3': array([0.48552665, 0.76234864, 0.24534271, 0.        , 0.        ,
(pid=30050)        0.        , 0.        ]), '4': array([0.48552665, 0.76234864, 0.24534271, 0.        , 0.        ,
(pid=30050)        0.        , 0.        ])}
(pid=30050) Action Dict
(pid=30050) {'0': array([-0.01], dtype=float32), '2': array([0.27017367], dtype=float32), '3': array([0.28476968], dtype=float32), '4': array([-0.01], dtype=float32)}
(pid=30050) <AiyagariEnvironment instance>
(pid=30046) Compare 1
(pid=30046) {'0': array([array([3.5576801], dtype=float32), 0.7331338777458356,
(pid=30046)        0.8155875202662485], dtype=object), '1': array([array([0.8347855], dtype=float32), 0.7331338777458356,
(pid=30046)        0.2332221717077834], dtype=object), '3': array([array([4.1018376], dtype=float32), 0.7331338777458356,
(pid=30046)        0.45530475275933174], dtype=object), '4': array([array([2.3350573], dtype=float32), 0.7331338777458356,
(pid=30046)        0.19900735236483508], dtype=object)}
(pid=30046) Compare 2
(pid=30046) {'0': array([1.65634084, 0.73313388, 1.81556862, 0.        , 0.        ,
(pid=30046)        0.        , 0.        ]), '1': array([1.65634084, 0.73313388, 1.81556862, 0.        , 0.        ,
(pid=30046)        0.        , 0.        ]), '2': None, '3': array([1.65634084, 0.73313388, 1.81556862, 0.        , 0.        ,
(pid=30046)        0.        , 0.        ]), '4': array([1.65634084, 0.73313388, 1.81556862, 0.        , 0.        ,
(pid=30046)        0.        , 0.        ])}
(pid=30046) Action Dict
(pid=30046) {'0': array([1.4794791], dtype=float32), '1': array([-0.01], dtype=float32), '3': array([2.2348056], dtype=float32), '4': array([1.1207378], dtype=float32)}
(pid=30046) <AiyagariEnvironment instance>
Traceback (most recent call last):

There is also a secondary issue: PPO still reports NaN for some statistics even when tune.run runs without erroring.

Hi @bkaplowitz,

Looking at your code I can tell you what is happening but I do not know why.

So the error you are seeing happens because the sum that creates self.K raises an exception, which you catch. A few lines down you then slice-assign into obs_temp_list[str(i)], and that fails because the entry for the missing agent is still None.

Now, the reason the sum raised in the first place is that you defined obs_temp_list as a dictionary with keys '0' through str(self.num - 1), and the way you constructed it (dict.fromkeys), every key starts out mapped to None.

A few lines below that you overwrite the values of this dictionary with the real observations, based on the keys in the action dictionary, but as you can see there is no key for agent 1 in there, so that value remains None and the sum for K fails.
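
For concreteness, here is a tiny standalone illustration (made-up values, not your env) of that failure mode:

import numpy as np

obs_temp_list = dict.fromkeys(["0", "1", "2"])   # {'0': None, '1': None, '2': None}
obs_temp_list["0"] = np.zeros(7)
obs_temp_list["2"] = np.zeros(7)                 # agent "1" never appeared in action_dict

try:
    K = sum(v[0] for v in obs_temp_list.values())
except TypeError as err:
    print(err)   # 'NoneType' object is not subscriptable, so K is never updated

# and the later slice assignment hits the same None:
# obs_temp_list["1"][3:7] = [0, 0, 0, 0]   -> TypeError: 'NoneType' object does not support item assignment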

The obvious question is why agent 1 is not in the action dictionary. I did not run your code so I am not certain, but my guess is that it was marked done in the previous step. Once that happens, two things occur. First, rllib will no longer produce actions for it. Second, rllib expects that it will not appear in the observations returned by subsequent steps. You are tracking done agents, but as far as I can see you are not using that info to make sure they are excluded from the values returned by step.

I do not know if this is appropriate for the goals of your environment, but one way to fix the error is to build keylist from the keys in your action dictionary rather than from self.num.
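
A sketch of what that could look like, keeping your aggregate-price logic but keying everything off action_dict (untested, meant as a drop-in for AiyagariMultiAgentEnv.step):

    def step(self, action_dict):
        obs, rew, done, info = {}, {}, {}, {}
        for agent_id, action in action_dict.items():
            vec = np.zeros(7)   # fresh buffer per agent, not one shared obs_temp
            vec[0:3], rew[agent_id], done[agent_id], info[agent_id] = self.agents[int(agent_id)].step(
                action, self.R, self.W
            )
            obs[agent_id] = vec
            if done[agent_id]:
                self.dones.add(agent_id)
        # aggregates are computed only over agents that actually stepped, so no None entries
        self.K = sum(o[0] for o in obs.values())
        self.N = self.num
        self.R = Z * (1 - ALPHA) * (self.N / self.K) ** ALPHA
        self.W = Z * ALPHA * (self.K / self.N) ** (1 - ALPHA)
        for o in obs.values():
            o[3:7] = [self.K, self.N, self.R, self.W]
        done["__all__"] = len(self.dones) == len(self.agents)
        return obs, rew, done, info

As a side note, the current loop writes every agent into the same obs_temp array and stores slices (views) of it, which would explain why every agent shows identical values in your "Compare 2" printout; a fresh array per agent avoids that.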

Hope this helps!

Hi @mannyv,

Thanks so much. I agree the short-term fix is building from the action dictionary. However, I have no idea why the action dictionary sometimes drops an agent. As you say, it may be because the agent was previously marked done, but I don't remove agents based on that and I don't want that to happen right now. The goal right now is just to get the iterative method working, and then use a reasonable criterion to stop updating all agents at once.

Do you have any suggestions as to how to achieve this?

If you tell rllib that an agent is done at step t it will not compute an action for it on that step and it will require that it is not present in the observations for steps > t. If it is done there are no actions it can take.

One thing you could do internally is not return done until all the agents are done, and then return a fixed observation for agents that are done (all 0s for example) and either ignore their actions or force a no-op action. This is a common approach taken by some environments.
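
Roughly, that pattern could look like the sketch below (illustrative only, as a replacement for AiyagariMultiAgentEnv.step): per-agent done stays False, finished agents get a fixed dummy observation and zero reward, and the episode only ends for everyone at once.

    def step(self, action_dict):
        obs, rew, done, info = {}, {}, {}, {}
        stepped = []                                  # agents that actually acted this turn
        for agent_id, action in action_dict.items():
            if agent_id in self.dones:
                # finished agent: ignore its action and hand back a fixed dummy observation
                obs[agent_id] = np.zeros(7)
                rew[agent_id], done[agent_id], info[agent_id] = 0.0, False, {}
                continue
            vec = np.zeros(7)
            vec[0:3], rew[agent_id], agent_done, info[agent_id] = self.agents[int(agent_id)].step(
                action, self.R, self.W
            )
            obs[agent_id] = vec
            done[agent_id] = False                    # never report a lone agent as done
            stepped.append(agent_id)
            if agent_done:
                self.dones.add(agent_id)
        if stepped:                                   # recompute aggregates from agents that stepped
            self.K = sum(obs[a][0] for a in stepped)
            self.R = Z * (1 - ALPHA) * (self.num / self.K) ** ALPHA
            self.W = Z * ALPHA * (self.K / self.num) ** (1 - ALPHA)
        for o in obs.values():
            o[3:7] = [self.K, self.num, self.R, self.W]
        done["__all__"] = len(self.dones) == len(self.agents)   # everyone terminates together
        return obs, rew, done, info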

Hi @mannyv, this works! I did a modified version where I return done for all agents if even a single agent reaches done, as I wasn't sure whether I wanted to impute values. Effectively I want to be sure they never choose 'c' larger than the net assets they have accumulated, but because the bounds change over time I can't impose this directly.

Now the tuning 'works'. However, there are still two issues. First, I want the choice 'c' to never go below 0. Originally I used a very large negative reward when this happened, but then the reward barely budged from that huge negative value during training. I have since made the penalty smaller, but agents still consistently return negative rewards over the horizon, so they are presumably still hitting 0. Training also terminates after 24 iterations (not sure why), so they never learn to avoid this outcome, and I'm not sure how else to encourage them to. Any suggestions would be greatly appreciated. It is possibly because I terminate the episode as soon as a single agent hits done, so maybe the environment is too hard to learn? Maybe it is better to just have a done agent play 0 and receive the negative reward thereafter once it hits the boundary?
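
One thing I may try (just a sketch, untested): clip the proposed savings to the feasible interval inside AiyagariEnvironment.step, so consumption is non-negative by construction rather than only discouraged through the penalty.

    def step(self, action, R, W):
        self.current_step += 1
        self.price = R - DELTA
        self.W = W
        self.shock = np.exp(self.np_random.normal(0, 1))
        self.income = self.W * self.shock
        self.net_worth = self.price * self.assets + self.income
        # clip the requested savings to [BORROW_LIM, net_worth]; anything outside is infeasible
        savings = float(np.clip(action, BORROW_LIM, self.net_worth))
        self.cons = self.net_worth - savings                 # >= 0 by construction
        self.assets = savings + self.price * self.assets     # same law of motion as before
        self.obs = np.hstack((self.assets, np.array(self.price), np.array(self.income)))
        self.done = self.cons <= 0
        self.reward = self.cons ** (1 - GAMMA) / (1 - GAMMA) if self.cons > 0 else -1
        return self.obs, self.reward, self.done, {}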

Second, when I evaluate now, I still get the following error:

Traceback (most recent call last):
  File "/Users/brandonkaplowitz/opt/anaconda3/lib/python3.7/site-packages/ray/rllib/models/preprocessors.py", line 62, in check_shape
    if not self._obs_space.contains(observation):
  File "/Users/brandonkaplowitz/opt/anaconda3/lib/python3.7/site-packages/gym/spaces/box.py", line 115, in contains
    return x.shape == self.shape and np.all(x >= self.low) and np.all(x <= self.high)
AttributeError: 'dict' object has no attribute 'shape'
During handling of the above exception, another exception occurred:
...
ValueError: ('Observation for a Box/MultiBinary/MultiDiscrete space should be an np.array, not a Python list.', {'0': array([1.        , 1.02      , 4.63189596, 5.        , 5.        ,
       0.67      , 0.33      ]), '1': array([1.        , 1.02      , 1.11957128, 5.        , 5.        ,
       0.67      , 0.33      ]), '2': array([1.       , 1.02     , 6.7787408, 5.       , 5.       , 0.67     ,
       0.33     ]), '3': array([1.        , 1.02      , 0.16358127, 5.        , 5.        ,
       0.67      , 0.33      ]), '4': array([1.        , 1.02      , 0.61516606, 5.        , 5.        ,
       0.67      , 0.33      ])})

Any suggestions would be greatly appreciated.
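
My current guess is that the preprocessor is being handed the whole multi-agent observation dict, while compute_action expects a single agent's Box observation, so the evaluation loop probably needs to act per agent. Something along these lines (a sketch, assuming the single shared default policy):

    episode_reward = 0.0
    done = {"__all__": False}
    obs = env.reset()
    while not done["__all__"]:
        # one action per agent; with no multiagent config they all share the default policy
        actions = {agent_id: agent.compute_action(agent_obs) for agent_id, agent_obs in obs.items()}
        obs, rewards, done, info = env.step(actions)
        episode_reward += sum(rewards.values())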

I attach the current code below and an example return:

import numpy as np
import gym
from gym import spaces
from gym.utils import seeding, EzPickle
from ray.rllib.utils.typing import MultiAgentDict, AgentID
from typing import Tuple, Dict, List
from gym.envs.registration import EnvSpec
from ray.rllib.env.multi_agent_env import MultiAgentEnv
import ray
from ray.tune.logger import pretty_print
from ray.tune.registry import register_env
from ray.rllib.agents import ppo
from ray.rllib.agents import ddpg
import ray.tune as tune

INITIAL_ASSET_HOLDINGS = 1
BORROW_LIM = -0.01
R_VALUE = 1.03
DELTA = 0.01
W_VALUE = 0.98
GAMMA = 2.0
AGENT_NUM = 1
N = 5
BETA = 0.95
ALPHA = 0.33
Z = 1.0
np.random.seed(2020)

# alternative for raylib


class AiyagariEnvironment(gym.Env):
    """ An environment for value function sampling from a basic RA GE model with capital"""

    # idea pass assets to multiagent, and then return interest rate back to environment.
    metadata = {"render.modes": ["human"]}

    def __init__(self):
        super(AiyagariEnvironment, self).__init__()
        self.reward_range = (0, 100000)
        self.seed()
        # next period asset space bounds [borrow_lim, inf)
        self.action_space = spaces.Box(
            low=np.array([BORROW_LIM]), high=np.array([100000]), dtype=np.float32
        )
        # observation space -- everything the agent observes before making a new decision. Since r is
        # treated as fixed here, this is assets, prices, and income. Because assets act as a summary
        # statistic in this model, we only provide current-period values; this could be extended to
        # multiple periods later.
        self.observation_space = spaces.Box(
            low=np.array([BORROW_LIM, 0, 0]),
            high=np.array([100000, 100000, 100000]),
            dtype=np.float32,
        )
        self.assets = INITIAL_ASSET_HOLDINGS
        self.price = R_VALUE - DELTA
        self.W = W_VALUE
        self.current_step = 0
        self.cons = 0
        self.net_worth = self.assets * self.price
        self.reward = 0
        self.shock = np.exp(self.np_random.normal(0, 1))
        self.income = self.W * self.shock
        self.obs = np.array([self.assets, self.price, self.income])
        self.current_step = 0

    # resets state to initial value
    #  def u(cons):
    #      util = cons**(1-GAMMA)/(1-GAMMA)
    #      return utily
    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def reset(self):
        self.assets = INITIAL_ASSET_HOLDINGS
        self.price = R_VALUE - DELTA
        self.W = W_VALUE
        self.current_step = 0
        self.cons = 0
        self.net_worth = self.assets * self.price
        self.reward = 0
        self.shock = np.exp(self.np_random.normal(0, 1))
        self.income = self.W * self.shock
        self.obs = np.array([self.assets, self.price, self.income])
        self.current_step = 0
        # shifted exponential for time being, can impose own distribution with custom sampling later on.
        # for time being will use default distribution for sampling.
        return self.obs

    # updating function
    @property
    def n(self):
        return AGENT_NUM

    def step(self, action, R, W):
        self.current_step += 1
        self.price = R - DELTA
        self.W = W
        self.shock = np.exp(np.random.normal(0, 1))
        self.income = self.W * self.shock
        self.net_worth = (self.price) * self.assets + self.income
        if action in self.action_space:
            if action <= self.net_worth:
                self.assets = action + (self.price) * self.assets
                self.cons = self.net_worth - action
            else:
                self.assets = self.net_worth + (self.price) * self.assets
        else:
            raise ValueError(
                "Received invalid action={:f} which is not part of the action space".format(
                    action
                )
            )
        # stack ndarray self.assets with newly created ndarrays of self.price and self.income into a single flat observation
        self.obs = np.hstack((self.assets, np.array(self.price), np.array(self.income)))
        done = (self.cons <= 0)
        self.done = done
        if self.cons > 0:
            self.reward = (self.cons ** (1 - GAMMA) / (1 - GAMMA)).item()
        else:
            self.reward = -1
        return self.obs, self.reward, self.done, {}

    def render(self, mode="human", close=False):
        # work on render to make graph.
        results = str(
            f"Step: {self.current_step}\n"
            f"Assets: {self.assets}\n"
            f"Income: {self.income}\n"
            f"Consumption: {self.cons}\n"
            f"Net worth: {self.net_worth}\n"
            f"Interest Rate: {self.price}\n"
            f"Wage Rate: {self.W}\n"
            f"Utility: {self.reward}\n"
        )
        return results


class AiyagariMultiAgentEnv(MultiAgentEnv):
    def __init__(self, num):
        self.agents = [AiyagariEnvironment() for _ in range(num)]
        self.dones = set()
        # needs to be 7-dimensional because the aggregate variables are appended to each agent's observation.
        self.observation_space = gym.spaces.Box(
            low=np.array([BORROW_LIM, 0, 0, 0, 0, 0, 0]),
            high=np.array([100000, 100000, 100000, 100000, 100000, 100000, 100000]),
            dtype=np.float32,
        )
        self.action_space = gym.spaces.Box(
            low=np.array([BORROW_LIM]), high=np.array([100000]), dtype=np.float32
        )
        self.resetted = False
        self.num = num

    def reset(self):
        self.resetted = True
        self.dones = set()
        dict_agents = {str(i): np.zeros(7) for i, a in enumerate(self.agents)}
        # initial holdings
        self.K = sum(self.agents[i].assets for i in range(self.num))
        self.N = self.num
        self.R = Z * (1 - ALPHA) * (self.N / self.K) ** (ALPHA)
        self.W = Z * (ALPHA) * (self.K / self.N) ** (1 - ALPHA)
        agg_obs_list = [self.K, self.N, self.R, self.W]
        for i in range(self.num):
            dict_agents[str(i)][0:3] = self.agents[i].reset()
            dict_agents[str(i)][3:7] = np.array(agg_obs_list)
        return dict_agents

    def step(self, action_dict):
        keylist = [str(i) for i in range(self.num)]
        obs = dict.fromkeys(keylist)
        rew = dict.fromkeys(keylist)
        done = dict.fromkeys(keylist)
        info = dict.fromkeys(keylist)
        obs_temp_list = dict.fromkeys(keylist)
        obs_temp = np.zeros(7)

        for i, action in action_dict.items():
            # get each agent's observation (tomorrow's capital); use it to construct tomorrow's prices, then feed those back in.
            obs_temp[0:3], rew[str(i)], done[str(i)], info[str(i)] = self.agents[int(i)].step(
                action, self.R, self.W
            ) 
            #if max(done)==1:
            #    print('Agent hit 0. Restarting!')
            #    done[str(i)]=True
            obs_temp_list[i] = obs_temp[0:7]
            # append aggregate observations to each i
        # If any done value is still None (that agent did not step), treat all agents as not done;
        # otherwise, mark every agent done as soon as any single agent is done.
        for v in done.values():
            if v is None:
                done = dict.fromkeys(done, False)
        done = dict.fromkeys(done, bool(max(done.values())))
        for i, action in action_dict.items():
            if done[str(i)]:
                self.dones.add(str(i))


        # construct and append aggregate states
        
        try:
            self.K = sum(obs_val[0] for obs_val in obs_temp_list.values())
        except:
            print('Compare 1')
            print({i: self.agents[int(i)].step(action,self.R,self.W)[0] for i, action in action_dict.items()})
            print('Compare 2')
            print(obs_temp_list)
            print('Action Dict')
            print(action_dict)
            print(self.agents[0])
        self.N = self.num
        self.R = Z * (1 - ALPHA) * (self.N / self.K) ** (ALPHA)
        self.W = Z * (ALPHA) * (self.K / self.N) ** (1 - ALPHA)
        for i in range(0, self.num):
            obs_temp_list[str(i)][3:7] = [self.K, self.N, self.R, self.W]
            obs[str(i)] = obs_temp_list[str(i)]
        done["__all__"] = len(self.dones) == len(self.agents)
        return obs, rew, done, info
    def render(self, mode="human", close=True):
        # TODO: work on nice render
        results_n = []
        for agent in self.agents:
            # results += env.render(mode, close)
            results = agent.render(mode, close)
            results_n.append(results)
        return results_n


if __name__ == "__main__":

    env = AiyagariMultiAgentEnv(5)
    obs = env.reset()
    for idx, item in enumerate(env.render(), start=1):
        print(f"Agent: {idx} \n")
        print(item)
    print(env.action_space)

    tune.register_env("my_env", lambda config: AiyagariMultiAgentEnv(5))

    #obs_space = env.observation_space
    #act_spc = env.action_space
    #policies = {agent: (None, obs_space, act_spc, {}) for agent in env.agents}
    ray.init()
    config = {
            "env": "my_env",
            # General
            "horizon":5, #controls how far ahead they wait before calculating. Effectively 'memory limit'
            "num_gpus": 0,
            "num_workers": 2,
            # Method specific
            }  
    #

    #analysis=tune.run(
    #    "DDPG",
    #    mode="max",
    #    metric="episode_reward_mean",
    #    stop={"episodes_total": 1},
    #    checkpoint_freq=10,
    #    config=config,
    #)   
    #checkpoints = analysis.get_trial_checkpoints_paths(trial= analysis.get_best_trial("epsiode_reward_mean"), metric="episode_reward_mean")
    #agent = ddpg.DDPGTrainer(config=config, env="my_env")
    #agent.restore(checkpoints)  
    analysis=tune.run(
        "PPO",
        stop={"episodes_total": 10000},
        mode="max",
        metric="episode_reward_mean",
        checkpoint_freq=10,
        config=config,
        checkpoint_at_end = True
     )
    checkpoints = analysis.get_trial_checkpoints_paths(
        trial=analysis.get_best_trial("episode_reward_mean"),
        metric="episode_reward_mean")
    agent = ppo.PPOTrainer(config=config, env="my_env")
    print('checkpoints!', checkpoints)
    # fetch the path of the best checkpoint and load the trained model
    agent.restore(checkpoints[-1][0])
    episode_reward = 0
    done = False
    obs = env.reset()
    while not done:
        action = agent.compute_action(obs)
        obs, reward, done, info = env.step(action)
        episode_reward += reward

Last return:


== Status ==
Memory usage on this node: 21.5/32.0 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/12 CPUs, 0/0 GPUs, 0.0/10.25 GiB heap, 0.0/3.52 GiB objects
Current best trial: 75f5a_00000 with episode_reward_mean=-7.398013870831993 and
 parameters={'num_workers': 2, 'num_envs_per_worker': 1, 'create_env_on_driver': False, 
'rollout_fragment_length': 200, 'batch_mode': 'truncate_episodes', 'num_gpus': 0, 'train_batch_size': 
4000, 
'model': {'fcnet_hiddens': [256, 256], 'fcnet_activation': 'tanh', 'conv_filters': None, 'conv_activation': 
'relu', 'free_log_std': False, 'no_final_linear': False, 'vf_share_layers': True, 'use_lstm': False, 
'max_seq_len': 20, 'lstm_cell_size': 256, 'lstm_use_prev_action': False, 'lstm_use_prev_reward': False,
 '_time_major': False, 'framestack': True, 'dim': 84, 'grayscale': False, 'zero_mean': True, 
'custom_model': None, 'custom_model_config': {}, 'custom_action_dist': None,
 'custom_preprocessor': None, 'lstm_use_prev_action_reward': -1}, 'optimizer': {}, 'gamma': 0.99, 
'horizon': 5, 'soft_horizon': False, 'no_done_at_end': False, 'env_config': {}, 'env': 'my_env', 
'normalize_actions': False, 'clip_rewards': None, 'clip_actions': True, 'preprocessor_pref': 'deepmind',
 'lr': 5e-05, 'monitor': False, 'log_level': 'WARN', 'callbacks': <class 
'ray.rllib.agents.callbacks.DefaultCallbacks'>, 'ignore_worker_failures': False, 'log_sys_usage': True,
 'fake_sampler': False, 'framework': 'tf', 'eager_tracing': False, 'explore': True, 'exploration_config': 
{'type': 'StochasticSampling'}, 'evaluation_interval': None, 'evaluation_num_episodes': 10, 
'in_evaluation': False, 'evaluation_config': {}, 'evaluation_num_workers': 0, 'custom_eval_function':
 None, 'sample_async': False, '_use_trajectory_view_api': True, 'observation_filter': 'NoFilter', 
'synchronize_filters': True, 'tf_session_args': {'intra_op_parallelism_threads': 2, 
'inter_op_parallelism_threads': 2, 'gpu_options': {'allow_growth': True}, 'log_device_placement': False,
 'device_count': {'CPU': 1}, 'allow_soft_placement': True}, 'local_tf_session_args': 
{'intra_op_parallelism_threads': 8, 'inter_op_parallelism_threads': 8}, 'compress_observations': False,
 'collect_metrics_timeout': 180, 'metrics_smoothing_episodes': 100, 'remote_worker_envs': False, 
'remote_env_batch_wait_ms': 0, 'min_iter_time_s': 0, 'timesteps_per_iteration': 0, 'seed': None, 
'extra_python_environs_for_driver': {}, 'extra_python_environs_for_worker': {}, 
'num_cpus_per_worker': 1, 'num_gpus_per_worker': 0, 'custom_resources_per_worker': {}, 
'num_cpus_for_driver': 1, 'memory': 0, 'object_store_memory': 0, 'memory_per_worker': 0, 
'object_store_memory_per_worker': 0, 'input': 'sampler', 'input_evaluation': ['is', 'wis'], 
'postprocess_inputs': False, 'shuffle_buffer_size': 0, 'output': None, 'output_compress_columns': 
['obs', 'new_obs'], 'output_max_file_size': 67108864, 'multiagent': {'policies': {}, 'policy_mapping_fn': 
None, 'policies_to_train': None, 'observation_fn': None, 'replay_mode': 'independent'}, 
'logger_config': None, 'replay_sequence_length': 1, 'use_critic': True, 'use_gae': True, 'lambda': 1.0, 
'kl_coeff': 0.2, 'sgd_minibatch_size': 128, 'shuffle_sequences': True, 'num_sgd_iter': 30, 'lr_schedule': 
None, 'vf_share_layers': False, 'vf_loss_coeff': 1.0, 'entropy_coeff': 0.0, 'entropy_coeff_schedule': 
None, 'clip_param': 0.3, 'vf_clip_param': 10.0, 'grad_clip': None, 'kl_target': 0.01, 'simple_optimizer': 
False, '_fake_gpus': False}
Result logdir: ~/ray_results/PPO
Number of trials: 1/1 (1 TERMINATED)
+------------------------+------------+-------+--------+------------------+-------+----------+----------------------+----------------------+--------------------+
| Trial name             | status     | loc   |   iter |   total time (s) |    ts |   reward |   episode_reward_max |   episode_reward_min |   episode_len_mean |
|------------------------+------------+-------+--------+------------------+-------+----------+----------------------+----------------------+--------------------|
| PPO_my_env_75f5a_00000 | TERMINATED |       |     24 |          94.7967 | 96000 | -7.39801 |             -3.30373 |             -433.294 |            1.11111 |
+------------------------+------------+-------+--------+------------------+-------+----------+----------------------+----------------------+--------------------+


2021-06-24 11:34:55,249 INFO tune.py:448 -- Total run time: 113.70 seconds (113.36 seconds for the tuning loop).