Hi, I've run into a problem where, during training, an element is sometimes (though not consistently) skipped while a dictionary is being constructed inside my MultiAgentEnv subclass. As a result, that element ends up as None, which causes tune.run to fail. I'm not sure what is causing this, since it doesn't happen consistently. Moreover, if I add print statements inside the multi-agent environment, the problem seems to vanish, which makes me think it is some kind of concurrency issue.
Any idea what I can do to fix this? I've attached my code below, followed by the associated output:
import numpy as np
import gym
from gym import spaces
from gym.utils import seeding, EzPickle
from ray.rllib.utils.typing import MultiAgentDict, AgentID
from typing import Tuple, Dict, List
from gym.envs.registration import EnvSpec
from ray.rllib.env.multi_agent_env import MultiAgentEnv
import ray
from ray.tune.logger import pretty_print
from ray.tune.registry import register_env
from ray.rllib.agents import ppo
from ray.rllib.agents import ddpg
import ray.tune as tune
INITIAL_ASSET_HOLDINGS = 1
BORROW_LIM = -0.01
R_VALUE = 1.03
DELTA = 0.01
W_VALUE = 0.98
GAMMA = 2.0
AGENT_NUM = 1
N = 5
BETA = 0.95
ALPHA = 0.33
Z = 1.0
np.random.seed(2020)
class AiyagariEnvironment(gym.Env):
    """An environment for value function sampling from a basic RA GE model with capital."""

    # idea: pass assets to multiagent, and then return interest rate back to environment.
    metadata = {"render.modes": ["human"]}

    def __init__(self):
        super(AiyagariEnvironment, self).__init__()
        self.reward_range = (0, 100000)
        self.seed()
        # next period asset space bounds [borrow_lim, inf)
        self.action_space = spaces.Box(
            low=np.array([BORROW_LIM]), high=np.array([100000]), dtype=np.float32
        )
        # observation space -- all variables the agent will observe before making a new decision.
        # Since we assume r is fixed here, this includes assets, prices, and income. Because assets
        # act as a summary statistic in this model we only provide current-period assets, prices,
        # and income. We can extend this to multi-period if we wanted.
        self.observation_space = spaces.Box(
            low=np.array([BORROW_LIM, 0, 0]),
            high=np.array([100000, 100000, 100000]),
            dtype=np.float32,
        )
        self.assets = INITIAL_ASSET_HOLDINGS
        self.price = R_VALUE - DELTA
        self.W = W_VALUE
        self.current_step = 0
        self.cons = 0
        self.net_worth = self.assets * self.price
        self.reward = 0
        self.shock = np.exp(self.np_random.normal(0, 1))
        self.income = self.W * self.shock
        self.obs = np.array([self.assets, self.price, self.income])
        self.current_step = 0

    # resets state to initial value
    # def u(cons):
    #     util = cons ** (1 - GAMMA) / (1 - GAMMA)
    #     return util

    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def reset(self):
        self.assets = INITIAL_ASSET_HOLDINGS
        self.price = R_VALUE - DELTA
        self.W = W_VALUE
        self.current_step = 0
        self.cons = 0
        self.net_worth = self.assets * self.price
        self.reward = 0
        self.shock = np.exp(self.np_random.normal(0, 1))
        self.income = self.W * self.shock
        self.obs = np.array([self.assets, self.price, self.income])
        self.current_step = 0
        # shifted exponential for the time being; can impose own distribution with custom sampling later on.
        # for the time being will use the default distribution for sampling.
        return self.obs

    # updating function
    @property
    def n(self):
        return AGENT_NUM

    def step(self, action, R, W):
        self.current_step += 1
        self.price = R - DELTA
        self.W = W
        self.shock = np.exp(np.random.normal(0, 1))
        self.income = self.W * self.shock
        self.net_worth = (self.price) * self.assets + self.income
        if action in self.action_space:
            if action <= self.net_worth:
                self.assets = action + (self.price) * self.assets
                self.cons = self.net_worth - action
            else:
                self.assets = self.net_worth + (self.price) * self.assets
        else:
            raise ValueError(
                "Received invalid action={:f} which is not part of the action space".format(
                    action
                )
            )
        self.obs = np.array([self.assets, self.price, self.income])
        done = self.cons <= 0
        self.done = done
        if self.cons > 0:
            self.reward = (self.cons ** (1 - GAMMA) / (1 - GAMMA)).item()
        else:
            self.reward = -10000
        return self.obs, self.reward, self.done, {}

    def render(self, mode="human", close=False):
        # work on render to make a graph.
        results = str(
            f"Step: {self.current_step}\n"
            f"Assets: {self.assets}\n"
            f"Income: {self.income}\n"
            f"Consumption: {self.cons}\n"
            f"Net worth: {self.net_worth}\n"
            f"Interest Rate: {self.price}\n"
            f"Wage Rate: {self.W}\n"
            f"Utility: {self.reward}\n"
        )
        return results
class AiyagariMultiAgentEnv(MultiAgentEnv):
    def __init__(self, num):
        self.agents = [AiyagariEnvironment() for _ in range(num)]
        self.dones = set()
        # needed here to be 7 as added aggregates.
        self.observation_space = gym.spaces.Box(
            low=np.array([BORROW_LIM, 0, 0, 0, 0, 0, 0]),
            high=np.array([100000, 100000, 100000, 100000, 100000, 100000, 100000]),
            dtype=np.float32,
        )
        self.action_space = gym.spaces.Box(
            low=np.array([BORROW_LIM]), high=np.array([100000]), dtype=np.float32
        )
        self.resetted = False
        self.num = num

    def reset(self):
        self.resetted = True
        self.dones = set()
        dict_agents = {str(i): np.zeros(7) for i, a in enumerate(self.agents)}
        # initial holdings
        self.K = sum(self.agents[i].assets for i in range(self.num))
        self.N = self.num
        self.R = Z * (1 - ALPHA) * (self.N / self.K) ** (ALPHA)
        self.W = Z * (ALPHA) * (self.K / self.N) ** (1 - ALPHA)
        agg_obs_list = [self.K, self.N, self.R, self.W]
        for i in range(self.num):
            dict_agents[str(i)][0:3] = self.agents[i].reset()
            dict_agents[str(i)][3:7] = np.array(agg_obs_list)
        return dict_agents

    def step(self, action_dict):
        # dict_agents = {str(i): np.zeros(7) for i, a in enumerate(self.agents)}
        keylist = list(str(i) for i in range(0, self.num))
        obs, rew, done, info = (
            dict.fromkeys(keylist),
            dict.fromkeys(keylist),
            dict.fromkeys(keylist),
            dict.fromkeys(keylist),
        )
        obs_temp_list = {}
        obs = dict.fromkeys(keylist)
        obs_temp_list = dict.fromkeys(keylist)
        obs_temp = np.zeros(7)
        for i, action in action_dict.items():
            # get observations which is tomorrow's capital earnings. Use to construct tomorrow prices. then feedback in.
            obs_temp[0:3], rew[str(i)], done[str(i)], info[str(i)] = self.agents[int(i)].step(
                action, self.R, self.W
            )
            obs_temp_list[i] = obs_temp[0:7]
            # append aggregate observations to each i.
            if done[str(i)]:
                self.dones.add(str(i))
        # construct and append aggregate states
        try:
            self.K = sum(obs_val[0] for obs_val in obs_temp_list.values())
        except:
            print('Compare 1')
            print({i: self.agents[int(i)].step(action, self.R, self.W)[0] for i, action in action_dict.items()})
            print('Compare 2')
            print(obs_temp_list)
            print('Action Dict')
            print(action_dict)
            print(self.agents[0])
        self.N = self.num
        self.R = Z * (1 - ALPHA) * (self.N / self.K) ** (ALPHA)
        self.W = Z * (ALPHA) * (self.K / self.N) ** (1 - ALPHA)
        for i in range(0, self.num):
            obs_temp_list[str(i)][3:7] = [self.K, self.N, self.R, self.W]
            obs[str(i)] = obs_temp_list[str(i)]
        done["__all__"] = len(self.dones) == len(self.agents)
        return obs, rew, done, info

    def render(self, mode="human", close=True):
        results_n = []
        for agent in self.agents:
            # results += env.render(mode, close)
            results = agent.render(mode, close)
            results_n.append(results)
        return results_n
if __name__ == "__main__":
    env = AiyagariMultiAgentEnv(5)
    obs = env.reset()
    for items in env.render():
        print(f"Agent: {env.render().index(items) + 1} \n")
        print(items)
    print(env.action_space)
    tune.register_env("my_env", lambda config: AiyagariMultiAgentEnv(5))
    # obs_space = env.observation_space
    # act_spc = env.action_space
    # policies = {agent: (None, obs_space, act_spc, {}) for agent in env.agents}
    ray.init()
    config = {
        "env": "my_env",
        # General
        "num_gpus": 0,
        "num_workers": 2,
        # Method specific
    }
    analysis = tune.run(
        "PPO",
        stop={"training_iteration": 10},
        checkpoint_freq=10,
        config=config,
        checkpoint_at_end=True,
    )
    checkpoints = analysis.get_trial_checkpoints_paths(
        trial=analysis.get_best_trial("episode_reward_mean"), metric="episode_reward_mean"
    )
    agent = ppo.PPOTrainer(config=config, env="my_env")
    agent.restore(checkpoints)
    episode_reward = 0
    done = False
    obs = env.reset()
    while not done:
        action = agent.compute_action(obs)
        obs, reward, done, info = env.step(action)
        episode_reward += reward
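To make the symptom concrete, here is a stripped-down sketch of the dictionary pattern in AiyagariMultiAgentEnv.step(), with made-up actions and no RLlib involved: dict.fromkeys() starts every agent at None, and only the agent IDs that actually appear in action_dict get overwritten.

# Stripped-down sketch (hypothetical values) of the per-agent dict construction in step().
# dict.fromkeys() initializes every value to None; only keys present in action_dict are
# overwritten, so any agent ID missing from action_dict is left as None.
keylist = [str(i) for i in range(5)]
obs_temp_list = dict.fromkeys(keylist)                   # {'0': None, '1': None, ..., '4': None}
action_dict = {"0": 0.1, "2": 0.3, "3": 0.2, "4": 0.0}   # note: '1' is absent
for i, action in action_dict.items():
    obs_temp_list[i] = [action, 0.0, 0.0]                # only IDs present in action_dict get filled
try:
    total = sum(obs_val[0] for obs_val in obs_temp_list.values())
except TypeError as err:
    print("leftover None:", err)                         # 'NoneType' object is not subscriptable

This matches the debug prints in the output below: the agent ID missing from the printed Action Dict is exactly the entry that shows up as None in Compare 2.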
And here is the associated output:
File "~/aiyagari.py", line 213, in step
obs_temp_list[str(i)][3:7] = [self.K, self.N, self.R, self.W]
TypeError: 'NoneType' object does not support item assignment
Result for PPO_my_env_5f4a6_00000:
{}
== Status ==
Memory usage on this node: 19.0/32.0 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/12 CPUs, 0/0 GPUs, 0.0/8.2 GiB heap, 0.0/2.83 GiB objects
Result logdir: /Users/brandonkaplowitz/ray_results/PPO
Number of trials: 1/1 (1 ERROR)
+------------------------+----------+-------+
| Trial name | status | loc |
|------------------------+----------+-------|
| PPO_my_env_5f4a6_00000 | ERROR | |
+------------------------+----------+-------+
Number of errored trials: 1
+------------------------+--------------+------------------------------------------------------------------------------------------------+
| Trial name | # failures | error file |
|------------------------+--------------+------------------------------------------------------------------------------------------------|
| PPO_my_env_5f4a6_00000 | 1 | /Users/brandonkaplowitz/ray_results/PPO/PPO_my_env_5f4a6_00000_0_2021-06-18_17-30-15/error.txt |
+------------------------+--------------+------------------------------------------------------------------------------------------------+
(pid=30050) Compare 1
(pid=30050) {'0': array([array([0.6450821], dtype=float32), 0.7623486417300875,
(pid=30050) 0.08079125864003082], dtype=object), '2': array([array([1.3112853], dtype=float32), 0.7623486417300875,
(pid=30050) 0.111011391232876], dtype=object), '3': array([array([0.8796276], dtype=float32), 0.7623486417300875,
(pid=30050) 0.27394019528008057], dtype=object), '4': array([array([0.3601406], dtype=float32), 0.7623486417300875,
(pid=30050) 0.2598886475183971], dtype=object)}
(pid=30050) Compare 2
(pid=30050) {'0': array([0.48552665, 0.76234864, 0.24534271, 0. , 0. ,
(pid=30050) 0. , 0. ]), '1': None, '2': array([0.48552665, 0.76234864, 0.24534271, 0. , 0. ,
(pid=30050) 0. , 0. ]), '3': array([0.48552665, 0.76234864, 0.24534271, 0. , 0. ,
(pid=30050) 0. , 0. ]), '4': array([0.48552665, 0.76234864, 0.24534271, 0. , 0. ,
(pid=30050) 0. , 0. ])}
(pid=30050) Action Dict
(pid=30050) {'0': array([-0.01], dtype=float32), '2': array([0.27017367], dtype=float32), '3': array([0.28476968], dtype=float32), '4': array([-0.01], dtype=float32)}
(pid=30050) <AiyagariEnvironment instance>
(pid=30046) Compare 1
(pid=30046) {'0': array([array([3.5576801], dtype=float32), 0.7331338777458356,
(pid=30046) 0.8155875202662485], dtype=object), '1': array([array([0.8347855], dtype=float32), 0.7331338777458356,
(pid=30046) 0.2332221717077834], dtype=object), '3': array([array([4.1018376], dtype=float32), 0.7331338777458356,
(pid=30046) 0.45530475275933174], dtype=object), '4': array([array([2.3350573], dtype=float32), 0.7331338777458356,
(pid=30046) 0.19900735236483508], dtype=object)}
(pid=30046) Compare 2
(pid=30046) {'0': array([1.65634084, 0.73313388, 1.81556862, 0. , 0. ,
(pid=30046) 0. , 0. ]), '1': array([1.65634084, 0.73313388, 1.81556862, 0. , 0. ,
(pid=30046) 0. , 0. ]), '2': None, '3': array([1.65634084, 0.73313388, 1.81556862, 0. , 0. ,
(pid=30046) 0. , 0. ]), '4': array([1.65634084, 0.73313388, 1.81556862, 0. , 0. ,
(pid=30046) 0. , 0. ])}
(pid=30046) Action Dict
(pid=30046) {'0': array([1.4794791], dtype=float32), '1': array([-0.01], dtype=float32), '3': array([2.2348056], dtype=float32), '4': array([1.1207378], dtype=float32)}
(pid=30046) <AiyagariEnvironment instance>
There is also a secondary issue: even when tune.run doesn't error out, the statistics reported by PPO are still NaN.
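In case it helps with debugging either issue, here is a minimal standalone rollout sketch (random actions for every agent on every step, no RLlib) that I can use to check that the environment itself returns finite rewards:

# Minimal standalone sanity check (no RLlib): every agent receives a random action on
# each step, so the per-agent dicts are always fully populated.
env = AiyagariMultiAgentEnv(5)
obs = env.reset()
for _ in range(10):
    actions = {agent_id: env.action_space.sample() for agent_id in obs}
    obs, rew, done, info = env.step(actions)
    print(rew)
    if done["__all__"]:
        break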