Hi Sven, I tried rewriting this as a MultiAgentEnv, but what came out was essentially garbage. I suspect it has to do with the timing of the update on the world, which requires inputs from all agents and which I currently have happen inside BatchMultiAgentEnv. Do you have any suggestions on how to revise it to fit the MultiAgentEnv built into RLlib? A sketch of the direction I was attempting is just below, followed by the original code for your consideration when you have a chance to take a look.
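To make the question concrete, here is a minimal sketch of how I understand RLlib's MultiAgentEnv interface is supposed to be used (the AiyagariMultiAgentEnv name, the integer agent IDs, and the small division-by-zero guard are just placeholders of mine; it reuses the AiyagariEnvironment class and the constants from the original code further down):

from ray.rllib.env.multi_agent_env import MultiAgentEnv

class AiyagariMultiAgentEnv(MultiAgentEnv):
    """Sketch: every agent's action arrives in one dict per step, so the
    per-agent steps and the world-level update of K, R and W can happen
    inside the same step() call."""

    def __init__(self, num_agents=N):
        self.num_agents = num_agents
        self.agents = [AiyagariEnvironment() for _ in range(num_agents)]
        self.action_space = self.agents[0].action_space
        self.observation_space = self.agents[0].observation_space

    def _update_world(self, obs):
        # aggregate capital from all agents' asset holdings (placeholder guard
        # against dividing by zero when everyone starts at zero assets)
        self.K = max(sum(o[0][0] for o in obs.values()), 1e-8)
        self.N = self.num_agents
        self.R = Z * (1 - ALPHA) * (self.N / self.K) ** ALPHA
        self.W = Z * ALPHA * (self.K / self.N) ** (1 - ALPHA)

    def reset(self):
        obs = {i: agent.reset() for i, agent in enumerate(self.agents)}
        self._update_world(obs)
        return obs

    def step(self, action_dict):
        obs, rew, done, info = {}, {}, {}, {}
        for i, agent in enumerate(self.agents):
            obs[i], rew[i], done[i], info[i] = agent.step(
                action_dict[i], self.R, self.W
            )
        self._update_world(obs)
        done["__all__"] = all(done.values())
        return obs, rew, done, info

And here is the original code: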
import numpy as np
import gym
from gym import spaces
from gym.utils import seeding, EzPickle
INITIAL_ASSET_HOLDINGS = 0
BORROW_LIM = -0.01
R_VALUE = 0.03
W_VALUE = 0.98
GAMMA = 2
AGENT_NUM = 1
N = 5
BETA = 0.95
ALPHA = 0.33
Z = 1
np.random.seed(2020)
class BatchMultiAgentEnv(gym.Env):
    metadata = {"runtime.vectorized": True, "render.modes": ["human", "rgb_array"]}

    def __init__(self, env_batch):
        self.env_batch = env_batch

    @property
    def n(self):
        return np.sum([env.n for env in self.env_batch])

    @property
    def action_space(self):
        return self.env_batch[0].action_space

    @property
    def observation_space(self):
        return self.env_batch[0].observation_space

    def step(self, action_n):
        obs_n = []
        reward_n = []
        done_n = []
        info_n = {"n": []}
        W = self.W
        R = self.R
        i = 0
        for env in self.env_batch:
            # slice action_n so each agent (env.n copies per env) gets its own
            # independent actions
            obs, reward, done, _ = env.step(action_n[i : (i + env.n)], R, W)
            i += env.n
            obs_n += obs
            reward_n.append(reward)
            done_n.append(done)
        # K and N are used to compute the wage and the interest rate; they are not
        # necessarily observed by all agents unless we want them to be.
        # Aggregate capital K sums the agents' asset values, and prices follow an
        # assumed Cobb-Douglas representative-agent production function:
        #   R = Z * (1 - ALPHA) * (N / K) ** ALPHA
        #   W = Z * ALPHA * (K / N) ** (1 - ALPHA)
        # With many agents, each agent approximately takes the going interest rate as
        # given. With few agents they know they can influence it, but they can only
        # observe the current interest rate (the capital return between yesterday and
        # today). Note the distinction with rational expectations, where agents
        # explicitly use the future interest rate in their calculations because they
        # know exactly their effect on it.
        # R and W are appended to the observations, assuming all agents can see them.
        # We can either assume that all agents can see each other's positions or not.
        self.K = sum(m[0] for m in obs_n)
        self.N = self.n
        self.R = Z * (1 - ALPHA) * (self.N / self.K) ** ALPHA
        self.W = Z * ALPHA * (self.K / self.N) ** (1 - ALPHA)
        obs_n += self.K * np.ones(self.n)
        obs_n += self.N * np.ones(self.n)
        obs_n += self.R * np.ones(self.n)
        obs_n += self.W * np.ones(self.n)
        return obs_n, reward_n, done_n, info_n

    def reset(self):
        obs_n = []
        for env in self.env_batch:
            obs_n += env.reset()
        # TODO: fix obs_n here... not calculating properly... need to have obs_n
        # for each agent.
        self.K = sum(m[0] for m in obs_n)
        self.N = self.n
        self.R = Z * (1 - ALPHA) * (self.N / self.K) ** ALPHA
        self.W = Z * ALPHA * (self.K / self.N) ** (1 - ALPHA)
        obs_n += self.K * np.ones(self.n)
        obs_n += self.N * np.ones(self.n)
        obs_n += self.R * np.ones(self.n)
        obs_n += self.W * np.ones(self.n)
        print(self.R)
        return obs_n

    # render environment
    def render(self, mode="human", close=True):
        # TODO: work on a nicer render
        results_n = []
        for env in self.env_batch:
            results_n.append(env.render(mode, close))
        return results_n
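Pulling the world-level price update out on its own (the world_prices name below is just for illustration), it is only the two Cobb-Douglas factor-price expressions above applied to the aggregates:

def world_prices(K, N, Z=Z, alpha=ALPHA):
    # factor prices implied by aggregate capital K and labor N, exactly as in
    # step() and reset() above
    R = Z * (1 - alpha) * (N / K) ** alpha
    W = Z * alpha * (K / N) ** (1 - alpha)
    return R, W

# e.g. with aggregate capital K = 5.0 and N = 5 agents (so N / K = 1.0):
# world_prices(5.0, 5) -> (R, W) ≈ (0.67, 0.33)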
class AiyagariEnvironment(gym.Env):
    """An environment for value function sampling from a basic RA GE model with capital."""

    # idea: pass assets to the multi-agent env, and get the interest rate back
    # from the environment.
    metadata = {"render.modes": ["human"]}

    def __init__(self):
        super(AiyagariEnvironment, self).__init__()
        self.reward_range = (0, np.inf)
        self.seed()
        # next-period asset space bounds: [BORROW_LIM, inf)
        self.action_space = spaces.Box(
            low=np.array([BORROW_LIM]), high=np.array([np.inf]), dtype=np.float32
        )
        # Observation space: everything the agent observes before making a new
        # decision. Since r is taken as given here, this includes assets, prices
        # and income. Because assets act as a summary statistic in this model,
        # only current-period values are provided; this could be extended to
        # multiple periods if we wanted.
        self.observation_space = spaces.Box(
            low=np.array([BORROW_LIM, 0, 0]),
            high=np.array([np.inf, np.inf, 1]),
            dtype=np.float32,
        )

    # utility function, kept for reference:
    # def u(cons):
    #     util = cons ** (1 - GAMMA) / (1 - GAMMA)
    #     return util

    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    # resets state to initial values
    def reset(self):
        self.assets = INITIAL_ASSET_HOLDINGS
        self.price = R_VALUE + 1
        self.W = W_VALUE
        self.current_step = 0
        self.cons = 0
        self.net_worth = 0
        self.reward = 0
        # for the time being the shock is exp(N(0, 1)); a custom distribution
        # could be imposed with custom sampling later on.
        self.shock = np.exp(self.np_random.normal(0, 1))
        self.income = self.W * self.shock
        self.obs = [[self.assets], [self.price], [self.income]]
        return self.obs

    @property
    def n(self):
        return AGENT_NUM

    # updating function
    def step(self, action, R, W):
        self.current_step += 1
        self.price = R
        self.W = W
        self.shock = np.exp(np.random.normal(0, 1))
        self.income = self.W * self.shock
        self.net_worth = self.price * self.assets + self.income
        if action in self.action_space:
            if action <= self.net_worth:
                self.assets = action
                self.cons = self.net_worth - action
            else:
                # infeasible savings choice: save everything, consume nothing
                self.assets = self.net_worth
                self.cons = 0
        else:
            raise ValueError(
                "Received invalid action={} which is not part of the action space".format(
                    action
                )
            )
        self.obs = [[self.assets], [self.price], [self.income]]
        done = self.cons <= 0
        if self.cons != 0:
            self.reward = self.cons ** (1 - GAMMA) / (1 - GAMMA)
        else:
            self.reward = -np.inf
        return self.obs, self.reward, done, {}

    def render(self, mode="human", close=False):
        # TODO: work on render to make a graph
        results = str(
            f"Step: {self.current_step}\n"
            f"Assets: {self.assets}\n"
            f"Income: {self.income}\n"
            f"Consumption: {self.cons}\n"
            f"Net worth: {self.net_worth}\n"
            f"Interest Rate: {self.price}\n"
            f"Wage Rate: {self.W}\n"
            f"Utility: {self.reward}\n"
        )
        return results
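As a reminder of the coupling I am trying to express: a single agent can be stepped on its own, but R and W have to be supplied from outside on every call, e.g.:

single = AiyagariEnvironment()
obs = single.reset()
# the world's gross interest rate and wage must be fed in by the caller
obs, reward, done, info = single.step(
    np.array([0.5], dtype=np.float32), R_VALUE + 1, W_VALUE
)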
# Main code: creates N copies of AiyagariEnvironment, appends them to a list and
# feeds them into BatchMultiAgentEnv to be vectorized.
#####################################
AiyagariList = []
for j in range(N):
    AiyagariList.append(AiyagariEnvironment())
world = BatchMultiAgentEnv(AiyagariList)
print(world.observation_space)
world.reset()
for idx, result in enumerate(world.render()):
    print(f"Agent: {idx + 1} \n")
    print(result)
print(world.action_space)
# Test of the code with a simple policy
n_steps = 20
# "spender" agent: always chooses zero next-period assets, i.e. consumes all net worth
spender = np.zeros(N, dtype=np.float32)
for step in range(n_steps):
    # print("Step {}".format(step + 1))
    obs, reward, done, info = world.step(spender)
    for idx, result in enumerate(world.render()):
        print(f"Agent: {idx + 1} \n")
        print(result)
    # print('obs=', obs, 'reward=', reward, 'done=', done)
    if all(done):
        print("Goal reached!", "reward=", reward)
        break
# Test of the code with RLlib
import ray
from ray.tune.registry import register_env
from ray.rllib.agents import ppo

# Register an env creator so RLlib can build fresh copies of the batched env.
register_env("aiyagari_batch", lambda cfg: BatchMultiAgentEnv([AiyagariEnvironment() for _ in range(N)]))

ray.init(ignore_reinit_error=True)
trainer = ppo.PPOTrainer(env="aiyagari_batch")
while True:
    print(trainer.train())
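And for completeness, this is roughly how I expected to wire the sketched MultiAgentEnv version in (continuing from the imports above; the "aiyagari_ma" and "shared" names are placeholders), which is where things fall apart for me:

register_env("aiyagari_ma", lambda cfg: AiyagariMultiAgentEnv(N))

spec_env = AiyagariEnvironment()  # only used to read off the per-agent spaces
ma_config = {
    "multiagent": {
        "policies": {
            "shared": (None, spec_env.observation_space, spec_env.action_space, {}),
        },
        # every agent maps to the same shared policy
        "policy_mapping_fn": lambda agent_id, *args, **kwargs: "shared",
    },
}
ma_trainer = ppo.PPOTrainer(env="aiyagari_ma", config=ma_config)
print(ma_trainer.train())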