Error: TypeError: 'EnvContext' object cannot be interpreted as an integer?

Hi, I recently created a new environment using RLlib's MultiAgentEnv. However, when running a training algorithm on it, I got the following error: "TypeError: 'EnvContext' object cannot be interpreted as an integer". What exactly is causing this error? I assume it is the enumeration of the different environments as agents, but this part came from the basic multi-agent env in the examples folder.

Code:

from gym.utils import seeding, EzPickle
import gym

from gym import spaces

import numpy as np

import copy

from ray import tune

from ray.rllib.agents.ppo import PPOTrainer
class AiyagariEnvironment(gym.Env):
    """ An environment for value function sampling from a basic RA GE model with capital"""
    # resets state to initial value
    #  def u(cons):
    #      util = cons**(1-GAMMA)/(1-GAMMA)
    #      return util
    # idea pass assets to multiagent, and then return interest rate back to environment.
    metadata = {"render.modes": ["human"]}

    def __init__(self):
        super(AiyagariEnvironment, self).__init__()
        self.reward_range = (0, np.inf)
        self.seed()
        # next period asset space bounds [borrow_lim, inf)
        self.action_space = spaces.Box(
            low=np.array([BORROW_LIM]), high=np.array([np.inf]), dtype=np.float32
        )
        # observation space -- all variables agent will observe before making new decision. Since we assume r will be fixed here, this will include here assets, prices, income. Due to assets acting as summary statistic in this model we will only provide current period assets, prices, income. We can extend this to multi-period if we wanted.
        self.observation_space = spaces.Box(
            low=np.array([BORROW_LIM, 0, 0]),
            high=np.array([np.inf, np.inf, 1]),
            dtype=np.float32,
        )
    # resets state to initial value
    #  def u(cons):
    #      util = cons**(1-GAMMA)/(1-GAMMA)
    #      return utily
    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]
    def reset(self):
        self.assets = INITIAL_ASSET_HOLDINGS
        self.price = R_VALUE - DELTA
        self.W = W_VALUE
        self.current_step = 0
        self.cons = 0
        self.net_worth = 0
        self.reward = 0
        self.shock = np.exp(self.np_random.normal(0, 1))
        self.income = self.W * self.shock
        self.obs = [[self.assets], [self.price], [self.income]]
        self.current_step = 0
        # shifted exponential for time being, can impose own distribution with custom sampling later on.
        # for time being will use default distribution for sampling.
        return self.obs
    # updating function
    @property
    def n(self):
       return AGENT_NUM
    def step(self, action, R, W):
        self.current_step += 1
        self.price = R - DELTA
        self.W = W
        self.shock = np.exp(np.random.normal(0, 1))
        self.income = self.W * self.shock
        self.net_worth = (self.price) * self.assets + self.income
        if action in self.action_space:
            if action <= self.net_worth:
                self.assets = action + (self.price) * self.assets
                self.cons = self.net_worth - action
            else:
                self.assets = self.net_worth + (self.price) * self.assets
        else:
            raise ValueError(
                "Received invalid action={:f} which is not part of the action space".format(
                    action
                )
            )
        self.obs = [[self.assets], [self.price], [self.income]]
        done = self.cons <= 0
        if self.cons > 0:
            self.reward = self.cons ** (1 - GAMMA) / (1 - GAMMA)
        else:
            self.reward = -np.inf
        return self.obs, self.reward, done, {}

    def render(self, mode='human', close=False):
        #work on render to make graph.
        results = str(
            f"Step: {self.current_step}\n"
            f"Assets: {self.assets}\n"
            f"Income: {self.income}\n"
            f"Consumption: {self.cons}\n"
            f"Net worth: {self.net_worth}\n"
            f"Interest Rate: {self.price}\n"
            f"Wage Rate: {self.W}\n"
            f"Utility: {self.reward}\n")
        return results
from ray.rllib.utils.typing import MultiAgentDict, AgentID
from typing import Tuple, Dict, List
from gym.envs.registration import EnvSpec
import gym
from ray.rllib.env.multi_agent_env import MultiAgentEnv
class AiyagariMultiAgentEnv(MultiAgentEnv):
    def __init__(self, num):
        self.agents = [AiyagariEnvironment() for _ in range(num)]
        self.dones = set()
        self.observation_space = gym.spaces.Box(
            low=np.array([BORROW_LIM, 0, 0]),
            high=np.array([np.inf, np.inf, 1]),
            dtype=np.float32,
        )
        self.action_space = gym.spaces.Box(
            low=np.array([BORROW_LIM]), high=np.array([np.inf]), dtype=np.float32
        )
        self.resetted = False
        self.num = num
    def reset(self):
        self.resetted = True
        self.dones = set()
        dict_agents = {i: a.reset() for i, a in enumerate(self.agents)}
        # initial holdings
        self.K = sum(self.agents[i].assets for i in range(self.num))
        self.N = self.num
        self.R = Z*(1-ALPHA)*(self.N/self.K)**(ALPHA)
        self.W = Z*(ALPHA)*(self.K/self.N)**(1-ALPHA)
        for i in range(self.num):
            dict_agents[i].append([self.K, self.N, self.R, self.W])
        return dict_agents

    def step(self, action_dict):
        obs, rew, done, info = {}, {}, {}, {}
        for i, action in action_dict.items():
            #get observations which is tomorrow's capital earnings. Use to construct tomorrow prices. then feedback in.
            obs[i], rew[i], done[i], info[i] = self.agents[i].step(action, self.R, self.W)
            #append aggregate observations to each i.
            if done[i]:
                self.dones.add(i)
        # construct and append aggregate states
        self.K = sum(obs_agent[0] for obs_agent in obs)
        self.N = self.num
        self.R = Z*(1-ALPHA)*(self.N/self.K)**(ALPHA)
        self.W = Z*(ALPHA)*(self.K/self.N)**(1-ALPHA)
        for i in range(self.num):
            obs[i] += self.K 
            obs[i] += self.N
            obs[i] += self.R 
            obs[i] += self.W
        done["__all__"] = len(self.dones) == len(self.agents)
        return obs, rew, done, info

        
    def render(self, mode='human', close=True):
        #TODO: work on nice render
        results_n = []
        for agent in self.agents:
            #results += env.render(mode, close)
            results = agent.render(mode,close)
            results_n.append(results)
        return results_n
env = AiyagariMultiAgentEnv(5)
obs = env.reset()
for items in env.render():
    print(f"Agent: {env.render().index(items)+1} \n")
    print(items)
print(env.action_space)

Up to this point everything works fine, and it prints out the reset state of the agents correctly.

import ray
ray.init()

from ray.tune.registry import register_env
from ray.rllib.agents import ppo
 
trainer= ppo.PPOTrainer(env=AiyagariMultiAgentEnv)
while True:
    print(trainer.train())

This produces the error.

Full error is:

RayTaskError(TypeError): ray::RolloutWorker.foreach_policy() (pid=78609, ip=192.168.1.4)
  File "python/ray/_raylet.pyx", line 422, in ray._raylet.execute_task
  File "python/ray/_raylet.pyx", line 456, in ray._raylet.execute_task
  File "python/ray/_raylet.pyx", line 459, in ray._raylet.execute_task
  File "python/ray/_raylet.pyx", line 463, in ray._raylet.execute_task
  File "python/ray/_raylet.pyx", line 415, in ray._raylet.execute_task.function_executor
  File "/Users/brandonkaplowitz/opt/anaconda3/lib/python3.7/site-packages/ray/rllib/evaluation/rollout_worker.py", line 372, in __init__
    self.env = _validate_env(env_creator(env_context))
  File "/Users/brandonkaplowitz/opt/anaconda3/lib/python3.7/site-packages/ray/rllib/agents/trainer.py", line 1193, in <lambda>
    register_env(name, lambda config: env_object(config))
  File "<ipython-input-55-fe3e5d2bb727>", line 9, in __init__
TypeError: 'EnvContext' object cannot be interpreted as an integer

Line 9 in __init__ corresponds to self.agents = [AiyagariEnvironment() for _ in range(num)]


Hey @bkaplowitz thanks for posting this! However, could you please provide a concise reproduction script? Copying your snippets into one script and then running this gives me all these undefined constants, e.g. BORROW_LIM, INITIAL_ASSET_HOLDINGS, etc…

Without a reproduction script, I cannot debug your issue!

Thanks

Hi, sorry about that! Please try this script instead; I tested it and it reproduces the issue.

import numpy as np
import gym
from gym import spaces
import numpy as np
from gym.utils import seeding, EzPickle
from ray.rllib.utils.typing import MultiAgentDict, AgentID
from typing import Tuple, Dict, List
from gym.envs.registration import EnvSpec
from ray.rllib.env.multi_agent_env import MultiAgentEnv
import ray
from ray.tune.registry import register_env
from ray.rllib.agents import ppo


INITIAL_ASSET_HOLDINGS = 1
BORROW_LIM = -0.01
R_VALUE = 1.03
DELTA = 0.01
W_VALUE = 0.98
GAMMA = 2
AGENT_NUM = 1
N = 5
BETA = 0.95
ALPHA = 0.33
Z = 1
np.random.seed(2020)

# alternative for raylib


class AiyagariEnvironment(gym.Env):
    """ An environment for value function sampling from a basic RA GE model with capital"""

    # resets state to initial value
    #  def u(cons):
    #      util = cons**(1-GAMMA)/(1-GAMMA)
    #      return util
    # idea pass assets to multiagent, and then return interest rate back to environment.
    metadata = {"render.modes": ["human"]}

    def __init__(self):
        super(AiyagariEnvironment, self).__init__()
        self.reward_range = (0, np.inf)
        self.seed()
        # next period asset space bounds [borrow_lim, inf)
        self.action_space = spaces.Box(
            low=np.array([BORROW_LIM]), high=np.array([np.inf]), dtype=np.float32
        )
        # observation space -- all variables agent will observe before making new decision. Since we assume r will be fixed here, this will include here assets, prices, income. Due to assets acting as summary statistic in this model we will only provide current period assets, prices, income. We can extend this to multi-period if we wanted.
        self.observation_space = spaces.Box(
            low=np.array([BORROW_LIM, 0, 0]),
            high=np.array([np.inf, np.inf, 1]),
            dtype=np.float32,
        )

    # resets state to initial value
    #  def u(cons):
    #      util = cons**(1-GAMMA)/(1-GAMMA)
    #      return utily
    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def reset(self):
        self.assets = INITIAL_ASSET_HOLDINGS
        self.price = R_VALUE - DELTA
        self.W = W_VALUE
        self.current_step = 0
        self.cons = 0
        self.net_worth = 0
        self.reward = 0
        self.shock = np.exp(self.np_random.normal(0, 1))
        self.income = self.W * self.shock
        self.obs = [[self.assets], [self.price], [self.income]]
        self.current_step = 0
        # shifted exponential for time being, can impose own distribution with custom sampling later on.
        # for time being will use default distribution for sampling.
        return self.obs

    # updating function
    @property
    def n(self):
        return AGENT_NUM

    def step(self, action, R, W):
        self.current_step += 1
        self.price = R - DELTA
        self.W = W
        self.shock = np.exp(np.random.normal(0, 1))
        self.income = self.W * self.shock
        self.net_worth = (self.price) * self.assets + self.income
        if action in self.action_space:
            if action <= self.net_worth:
                self.assets = action + (self.price) * self.assets
                self.cons = self.net_worth - action
            else:
                self.assets = self.net_worth + (self.price) * self.assets
        else:
            raise ValueError(
                "Received invalid action={:f} which is not part of the action space".format(
                    action
                )
            )
        self.obs = [[self.assets], [self.price], [self.income]]
        done = self.cons <= 0
        if self.cons > 0:
            self.reward = self.cons ** (1 - GAMMA) / (1 - GAMMA)
        else:
            self.reward = -np.inf
        return self.obs, self.reward, done, {}

    def render(self, mode="human", close=False):
        # work on render to make graph.
        results = str(
            f"Step: {self.current_step}\n"
            f"Assets: {self.assets}\n"
            f"Income: {self.income}\n"
            f"Consumption: {self.cons}\n"
            f"Net worth: {self.net_worth}\n"
            f"Interest Rate: {self.price}\n"
            f"Wage Rate: {self.W}\n"
            f"Utility: {self.reward}\n"
        )
        return results


class AiyagariMultiAgentEnv(MultiAgentEnv):
    def __init__(self, num):
        self.agents = [AiyagariEnvironment() for _ in range(num)]
        self.dones = set()
        self.observation_space = gym.spaces.Box(
            low=np.array([BORROW_LIM, 0, 0]),
            high=np.array([np.inf, np.inf, 1]),
            dtype=np.float32,
        )
        self.action_space = gym.spaces.Box(
            low=np.array([BORROW_LIM]), high=np.array([np.inf]), dtype=np.float32
        )
        self.resetted = False
        self.num = num

    def reset(self):
        self.resetted = True
        self.dones = set()
        dict_agents = {i: a.reset() for i, a in enumerate(self.agents)}
        # initial holdings
        self.K = sum(self.agents[i].assets for i in range(self.num))
        self.N = self.num
        self.R = Z * (1 - ALPHA) * (self.N / self.K) ** (ALPHA)
        self.W = Z * (ALPHA) * (self.K / self.N) ** (1 - ALPHA)
        for i in range(self.num):
            dict_agents[i].append([self.K, self.N, self.R, self.W])
        return dict_agents

    def step(self, action_dict):
        obs, rew, done, info = {}, {}, {}, {}
        for i, action in action_dict.items():
            # get observations which is tomorrow's capital earnings. Use to construct tomorrow prices. then feedback in.
            obs[i], rew[i], done[i], info[i] = self.agents[i].step(
                action, self.R, self.W
            )
            # append aggregate observations to each i.
            if done[i]:
                self.dones.add(i)
        # construct and append aggregate states
        self.K = sum(obs_agent[0] for obs_agent in obs)
        self.N = self.num
        self.R = Z * (1 - ALPHA) * (self.N / self.K) ** (ALPHA)
        self.W = Z * (ALPHA) * (self.K / self.N) ** (1 - ALPHA)
        for i in range(self.num):
            obs[i] += self.K
            obs[i] += self.N
            obs[i] += self.R
            obs[i] += self.W
        done["__all__"] = len(self.dones) == len(self.agents)
        return obs, rew, done, info

    def render(self, mode="human", close=True):
        # TODO: work on nice render
        results_n = []
        for agent in self.agents:
            # results += env.render(mode, close)
            results = agent.render(mode, close)
            results_n.append(results)
        return results_n


env = AiyagariMultiAgentEnv(5)
obs = env.reset()
for items in env.render():
    print(f"Agent: {env.render().index(items)+1} \n")
    print(items)
print(env.action_space)

ray.init()
trainer = ppo.PPOTrainer(env=AiyagariMultiAgentEnv)
while True:
    print(trainer.train())

Perfect, thanks!

You just have to register your env with an “env-maker”, like so:

from ray import tune

# Register how your env should be constructed (always with 5 agents here, or take values from the `config` EnvContext object):
tune.register_env("my_env", lambda config: AiyagariMultiAgentEnv(5))

ray.init()
trainer = ppo.PPOTrainer(env="my_env")  # <- this will use the above lambda to create an env instance
while True:
    print(trainer.train())

Alternatively, you can change your Env class to accept a config in its c’tor (not num!) and then do this in your c’tor:


self.num = config["num"]

...

ray.init()
trainer = ppo.PPOTrainer(env=AiyagariMultiAgentEnv, config={
    "env_config": {
        "num": 5,  # <- env_config will be passed into your Env's c'tor as `config`
    },
})
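
For example, the constructor could then look something like this (just a sketch; keep the rest of your __init__ as it is):

class AiyagariMultiAgentEnv(MultiAgentEnv):
    def __init__(self, config):
        # `config` is the EnvContext RLlib builds from `env_config`;
        # "num" is the example key used above.
        self.num = config["num"]
        self.agents = [AiyagariEnvironment() for _ in range(self.num)]
        # ... keep the rest of your original __init__ unchanged ...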

Great. Thanks so much! This is super helpful and really useful to know going forward.

One last question… If I want to extract the trained policy function from the agents for graphing purposes, or to track losses over time, is there an easy way to do this from inside Ray?
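
Something like the sketch below is what I have in mind — I am only guessing that trainer.get_policy() / trainer.compute_action() and the episode_reward_mean entry of the train() results are the right entry points:

reward_curve = []
for _ in range(10):
    result = trainer.train()
    # aggregate metrics come back in the result dict of each training iteration
    reward_curve.append(result["episode_reward_mean"])

policy = trainer.get_policy()          # the trained (default) policy object
obs = env.observation_space.sample()   # any observation in the declared space
act = trainer.compute_action(obs)      # the policy's action at that observation, for graphing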

One remaining issue… my apologies.

I fixed some residual bugs after working through the errors produced by Ray and now have the following code:

import numpy as np
import gym
from gym import spaces
import numpy as np
from gym.utils import seeding, EzPickle
from ray.rllib.utils.typing import MultiAgentDict, AgentID
from typing import Tuple, Dict, List
from gym.envs.registration import EnvSpec
from ray.rllib.env.multi_agent_env import MultiAgentEnv
import ray
from ray.tune.registry import register_env
from ray.rllib.agents import ppo
import ray.tune as tune

INITIAL_ASSET_HOLDINGS = 1
BORROW_LIM = -0.01
R_VALUE = 1.03
DELTA = 0.01
W_VALUE = 0.98
GAMMA = 2
AGENT_NUM = 1
N = 5
BETA = 0.95
ALPHA = 0.33
Z = 1
np.random.seed(2020)

# alternative for raylib


class AiyagariEnvironment(gym.Env):
    """ An environment for value function sampling from a basic RA GE model with capital"""

    # resets state to initial value
    #  def u(cons):
    #      util = cons**(1-GAMMA)/(1-GAMMA)
    #      return util
    # idea pass assets to multiagent, and then return interest rate back to environment.
    metadata = {"render.modes": ["human"]}

    def __init__(self):
        super(AiyagariEnvironment, self).__init__()
        self.reward_range = (0, np.inf)
        self.seed()
        # next period asset space bounds [borrow_lim, inf)
        self.action_space = spaces.Box(
            low=np.array([BORROW_LIM]), high=np.array([np.inf]), dtype=np.float64
        )
        # observation space -- all variables agent will observe before making new decision. Since we assume r will be fixed here, this will include here assets, prices, income. Due to assets acting as summary statistic in this model we will only provide current period assets, prices, income. We can extend this to multi-period if we wanted.
        self.observation_space = spaces.Box(
            low=np.array([BORROW_LIM, 0, 0]),
            high=np.array([np.inf, np.inf, np.inf]),
            dtype=np.float64,
        )

    # resets state to initial value
    #  def u(cons):
    #      util = cons**(1-GAMMA)/(1-GAMMA)
    #      return utily
    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def reset(self):
        self.assets = INITIAL_ASSET_HOLDINGS
        self.price = R_VALUE - DELTA
        self.W = W_VALUE
        self.current_step = 0
        self.cons = 0
        self.net_worth = 0
        self.reward = 0
        self.shock = np.exp(self.np_random.normal(0, 1))
        self.income = self.W * self.shock
        self.obs = np.array([self.assets, self.price, self.income])
        self.current_step = 0
        # shifted exponential for time being, can impose own distribution with custom sampling later on.
        # for time being will use default distribution for sampling.
        return self.obs

    # updating function
    @property
    def n(self):
        return AGENT_NUM

    def step(self, action, R, W):
        self.current_step += 1
        self.price = R - DELTA
        self.W = W
        self.shock = np.exp(np.random.normal(0, 1))
        self.income = self.W * self.shock
        self.net_worth = (self.price) * self.assets + self.income
        if action in self.action_space:
            if action <= self.net_worth:
                self.assets = action + (self.price) * self.assets
                self.cons = self.net_worth - action
            else:
                self.assets = self.net_worth + (self.price) * self.assets
        else:
            raise ValueError(
                "Received invalid action={:f} which is not part of the action space".format(
                    action
                )
            )
        self.obs = np.array([self.assets, self.price, self.income])
        done = self.cons <= 0
        self.done = done
        if self.cons > 0:
            self.reward = self.cons ** (1 - GAMMA) / (1 - GAMMA)
        else:
            self.reward = -np.inf
        return self.obs, self.reward, self.done, {}

    def render(self, mode="human", close=False):
        # work on render to make graph.
        results = str(
            f"Step: {self.current_step}\n"
            f"Assets: {self.assets}\n"
            f"Income: {self.income}\n"
            f"Consumption: {self.cons}\n"
            f"Net worth: {self.net_worth}\n"
            f"Interest Rate: {self.price}\n"
            f"Wage Rate: {self.W}\n"
            f"Utility: {self.reward}\n"
        )
        return results


class AiyagariMultiAgentEnv(MultiAgentEnv):
    def __init__(self, num):
        self.agents = [AiyagariEnvironment() for _ in range(num)]
        self.dones = set()
        self.observation_space = gym.spaces.Box(
            low=np.array([BORROW_LIM, 0, 0, 0, 0, 0, 0]),
            high=np.array([np.inf, np.inf, np.inf, np.inf, np.inf, np.inf, np.inf]),
            dtype=np.float32,
        )
        self.action_space = gym.spaces.Box(
            low=np.array([BORROW_LIM]), high=np.array([np.inf]), dtype=np.float32
        )
        self.resetted = False
        self.num = num

    def reset(self):
        self.resetted = True
        self.dones = set()
        dict_agents = {i: a.reset() for i, a in enumerate(self.agents)}
        # initial holdings
        self.K = sum(self.agents[i].assets for i in range(self.num))
        self.N = self.num
        self.R = Z * (1 - ALPHA) * (self.N / self.K) ** (ALPHA)
        self.W = Z * (ALPHA) * (self.K / self.N) ** (1 - ALPHA)
        agg_obs_list = [self.K, self.N, self.R, self.W]
        for i in range(self.num):
            for agg_obs in agg_obs_list:
                dict_agents[i] = np.append(dict_agents[i], agg_obs)  # FIXME: revise to make fixed size array for performance reasons
        return dict_agents

    def step(self, action_dict):
        obs, rew, done, info = {}, {}, {}, {}
        for i, action in action_dict.items():
            # get observations which is tomorrow's capital earnings. Use to construct tomorrow prices. then feedback in.
            obs[i], rew[i], done[i], info[i] = self.agents[i].step(
                action, self.R, self.W
            )
            # append aggregate observations to each i.
            if done[i]:
                self.dones.add(i)
        # construct and append aggregate states
        self.K = sum(obs_val[0] for obs_val in obs.values())
        self.N = self.num
        self.R = Z * (1 - ALPHA) * (self.N / self.K) ** (ALPHA)
        self.W = Z * (ALPHA) * (self.K / self.N) ** (1 - ALPHA)
        for i in range(self.num):
            obs[i] += self.K
            obs[i] += self.N
            obs[i] += self.R
            obs[i] += self.W
        done["__all__"] = len(self.dones) == len(self.agents)
        return obs, rew, done, info

    def render(self, mode="human", close=True):
        # TODO: work on nice render
        results_n = []
        for agent in self.agents:
            # results += env.render(mode, close)
            results = agent.render(mode, close)
            results_n.append(results)
        return results_n


env = AiyagariMultiAgentEnv(5)
obs = env.reset()
for items in env.render():
    print(f"Agent: {env.render().index(items)+1} \n")
    print(items)
print(env.action_space)


tune.register_env("my_env", lambda config: AiyagariMultiAgentEnv(5))
ray.init()
trainer = ppo.PPOTrainer(env="my_env")
while True:
    print(trainer.train())

The code executes until I call trainer.train(). At that point, Ray produces an error on the backend because it tries to feed an observation batch of shape (2, 3) or (4, 3) instead of the declared observation size, which is 7.

ValueError: Cannot feed value of shape (4, 3) for Tensor 'default_policy/obs:0', which has shape '(?, 7)'

The full error traceback is:

(pid=27085) 2021-02-18 17:54:33,156     ERROR tf_run_builder.py:47 -- Error fetching: [<tf.Tensor 'default_policy/cond_1/Merge:0' shape=(?, 1) dtype=float32>, {'action_prob': <tf.Tensor 'default_policy/Exp_1:0' shape=(?,) dtype=float32>, 'action_logp': <tf.Tensor 'default_policy/cond_2/Merge:0' shape=(?,) dtype=float32>, 'action_dist_inputs': <tf.Tensor 'default_policy/model/fc_out/BiasAdd:0' shape=(?, 2) dtype=float32>, 'vf_preds': <tf.Tensor 'default_policy/Reshape_1:0' shape=(?,) dtype=float32>}], feed_dict={<tf.Tensor 'default_policy/obs:0' shape=(?, 7) dtype=float32>: array([[array([10.920127], dtype=float32), 10.93012774348259,
(pid=27085)         11.581740106223217],
(pid=27085)        [array([10.920127], dtype=float32), 10.93012774348259,
(pid=27085)         10.833709935969893],
(pid=27085)        [array([11.168118], dtype=float32), 10.93012774348259,
(pid=27085)         11.275902572902883],
(pid=27085)        [array([10.920127], dtype=float32), 10.93012774348259,
(pid=27085)         10.825643612648669]], dtype=object), <tf.Tensor 'default_policy/is_training:0' shape=() dtype=bool>: False, <tf.Tensor 'default_policy/is_exploring:0' shape=() dtype=bool>: True, <tf.Tensor 'default_policy/timestep:0' shape=() dtype=int64>: 5}
(pid=27085) Traceback (most recent call last):
(pid=27085)   File "/Users/brandonkaplowitz/opt/anaconda3/lib/python3.7/site-packages/ray/rllib/utils/tf_run_builder.py", line 44, in get
(pid=27085)     self.feed_dict, os.environ.get("TF_TIMELINE_DIR"))
(pid=27085)   File "/Users/brandonkaplowitz/opt/anaconda3/lib/python3.7/site-packages/ray/rllib/utils/tf_run_builder.py", line 89, in run_timeline
(pid=27085)     fetches = sess.run(ops, feed_dict=feed_dict)
(pid=27085)   File "/Users/brandonkaplowitz/opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/client/session.py", line 950, in run
(pid=27085)     run_metadata_ptr)
(pid=27085)   File "/Users/brandonkaplowitz/opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/client/session.py", line 1149, in _run
(pid=27085)     str(subfeed_t.get_shape())))
(pid=27085) ValueError: Cannot feed value of shape (4, 3) for Tensor 'default_policy/obs:0', which has shape '(?, 7)'
(pid=27086) 2021-02-18 17:54:33,159     ERROR tf_run_builder.py:47 -- Error fetching: [<tf.Tensor 'default_policy/cond_1/Merge:0' shape=(?, 1) dtype=float32>, {'action_prob': <tf.Tensor 'default_policy/Exp_1:0' shape=(?,) dtype=float32>, 'action_logp': <tf.Tensor 'default_policy/cond_2/Merge:0' shape=(?,) dtype=float32>, 'action_dist_inputs': <tf.Tensor 'default_policy/model/fc_out/BiasAdd:0' shape=(?, 2) dtype=float32>, 'vf_preds': <tf.Tensor 'default_policy/Reshape_1:0' shape=(?,) dtype=float32>}], feed_dict={<tf.Tensor 'default_policy/obs:0' shape=(?, 7) dtype=float32>: array([[array([12.05359], dtype=float32), 11.905909303426743,
(pid=27086)         11.413463566063463],
(pid=27086)        [array([11.895909], dtype=float32), 11.905909303426743,
(pid=27086)         12.312181197626595],
(pid=27086)        [array([12.013004], dtype=float32), 11.905909303426743,
(pid=27086)         11.307892761583325],
(pid=27086)        [array([12.3968115], dtype=float32), 11.905909303426743,
(pid=27086)         11.529570315688177]], dtype=object), <tf.Tensor 'default_policy/is_training:0' shape=() dtype=bool>: False, <tf.Tensor 'default_policy/is_exploring:0' shape=() dtype=bool>: True, <tf.Tensor 'default_policy/timestep:0' shape=() dtype=int64>: 5}
(pid=27086) Traceback (most recent call last):
(pid=27086)   File "/Users/brandonkaplowitz/opt/anaconda3/lib/python3.7/site-packages/ray/rllib/utils/tf_run_builder.py", line 44, in get
(pid=27086)     self.feed_dict, os.environ.get("TF_TIMELINE_DIR"))
(pid=27086)   File "/Users/brandonkaplowitz/opt/anaconda3/lib/python3.7/site-packages/ray/rllib/utils/tf_run_builder.py", line 89, in run_timeline
(pid=27086)     fetches = sess.run(ops, feed_dict=feed_dict)
(pid=27086)   File "/Users/brandonkaplowitz/opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/client/session.py", line 950, in run
(pid=27086)     run_metadata_ptr)
(pid=27086)   File "/Users/brandonkaplowitz/opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/client/session.py", line 1149, in _run
(pid=27086)     str(subfeed_t.get_shape())))
(pid=27086) ValueError: Cannot feed value of shape (4, 3) for Tensor 'default_policy/obs:0', which has shape '(?, 7)'

Do you know why this is happening and how to fix it? Is it because I am appending additional (aggregate) observations in the multi-agent environment? If so, how do I get around it? Thanks again for all of your help!
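
My best guess so far: in step(), obs[i] += self.K does element-wise addition on the length-3 NumPy array rather than appending, so each observation stays at length 3 while the declared space has length 7. I think I need to build one flat float32 vector per agent instead; flatten_obs below is just a hypothetical helper showing the shape handling I mean:

import numpy as np

def flatten_obs(local_obs, K, N, R, W):
    # local_obs: the 3 per-agent values (assets, price, income);
    # K, N, R, W: the aggregate states appended for every agent
    return np.concatenate(
        [np.asarray(local_obs, dtype=np.float32).ravel(),
         np.array([K, N, R, W], dtype=np.float32)]
    )  # shape (7,), matching the declared observation_space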

I actually believe I have fixed the above issue. However, I now get another error that I really don't understand:

Cannot feed value of shape (3968, 200) for Tensor 'default_policy/advantages:0', which has shape '(?,)'

What is the advantages tensor in Ray?
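
One thing I did notice while debugging: because each sampled action is a length-1 array from the Box action space, self.cons and therefore self.reward end up as length-1 NumPy arrays rather than plain floats. I am not sure whether that is related to the advantages shape, but I am planning to cast them; a tiny standalone illustration of what I mean:

import numpy as np

GAMMA = 2
action = np.array([0.5], dtype=np.float32)  # what a sample from the Box(1,) action space looks like
net_worth = 2.0
cons = net_worth - action                   # -> array([1.5], dtype=float32), not a Python float
reward = float(cons ** (1 - GAMMA) / (1 - GAMMA))  # cast to a plain float before returning from step()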

The newest version of the code is:

import numpy as np
import gym
from gym import spaces
import numpy as np
from gym.utils import seeding, EzPickle
from ray.rllib.utils.typing import MultiAgentDict, AgentID
from typing import Tuple, Dict, List
from gym.envs.registration import EnvSpec
from ray.rllib.env.multi_agent_env import MultiAgentEnv
import ray
from ray.tune.registry import register_env
from ray.rllib.agents import ppo
import ray.tune as tune

INITIAL_ASSET_HOLDINGS = 1
BORROW_LIM = -0.01
R_VALUE = 1.03
DELTA = 0.01
W_VALUE = 0.98
GAMMA = 2
AGENT_NUM = 1
N = 5
BETA = 0.95
ALPHA = 0.33
Z = 1
np.random.seed(2020)

# alternative for raylib


class AiyagariEnvironment(gym.Env):
    """ An environment for value function sampling from a basic RA GE model with capital"""

    # resets state to initial value
    #  def u(cons):
    #      util = cons**(1-GAMMA)/(1-GAMMA)
    #      return util
    # idea pass assets to multiagent, and then return interest rate back to environment.
    metadata = {"render.modes": ["human"]}

    def __init__(self):
        super(AiyagariEnvironment, self).__init__()
        self.reward_range = (0, np.inf)
        self.seed()
        # next period asset space bounds [borrow_lim, inf)
        self.action_space = spaces.Box(
            low=np.array([BORROW_LIM]), high=np.array([np.inf]), dtype=np.float32
        )
        # observation space -- all variables agent will observe before making new decision. Since we assume r will be fixed here, this will include here assets, prices, income. Due to assets acting as summary statistic in this model we will only provide current period assets, prices, income. We can extend this to multi-period if we wanted.
        self.observation_space = spaces.Box(
            low=np.array([BORROW_LIM, 0, 0]),
            high=np.array([np.inf, np.inf, np.inf]),
            dtype=np.float32,
        )
        self.assets = INITIAL_ASSET_HOLDINGS
        self.price = R_VALUE - DELTA
        self.W = W_VALUE
        self.current_step = 0
        self.cons = 0
        self.net_worth = self.assets*self.price
        self.reward = 0
        self.shock = np.exp(self.np_random.normal(0, 1))
        self.income = self.W * self.shock
        self.obs = np.array([self.assets, self.price, self.income])
        self.current_step = 0

    # resets state to initial value
    #  def u(cons):
    #      util = cons**(1-GAMMA)/(1-GAMMA)
    #      return utily
    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def reset(self):
        self.assets = INITIAL_ASSET_HOLDINGS
        self.price = R_VALUE - DELTA
        self.W = W_VALUE
        self.current_step = 0
        self.cons = 0
        self.net_worth = self.assets*self.price
        self.reward = 0
        self.shock = np.exp(self.np_random.normal(0, 1))
        self.income = self.W * self.shock
        self.obs = np.array([self.assets, self.price, self.income])
        self.current_step = 0
        # shifted exponential for time being, can impose own distribution with custom sampling later on.
        # for time being will use default distribution for sampling.
        return self.obs

    # updating function
    @property
    def n(self):
        return AGENT_NUM

    def step(self, action, R, W):
        self.current_step += 1
        self.price = R - DELTA
        self.W = W
        self.shock = np.exp(np.random.normal(0, 1))
        self.income = self.W * self.shock
        self.net_worth = (self.price) * self.assets + self.income
        if action in self.action_space:
            if action <= self.net_worth:
                self.assets = action + (self.price) * self.assets
                self.cons = self.net_worth - action
            else:
                self.assets = self.net_worth + (self.price) * self.assets
        else:
            raise ValueError(
                "Received invalid action={:f} which is not part of the action space".format(
                    action
                )
            )
        self.obs = np.array([self.assets, self.price, self.income])
        done = self.cons <= 0
        self.done = done
        if self.cons > 0:
            self.reward = self.cons ** (1 - GAMMA) / (1 - GAMMA)
        else:
            self.reward = -np.inf
        return self.obs, self.reward, self.done, {}

    def render(self, mode="human", close=False):
        # work on render to make graph.
        results = str(
            f"Step: {self.current_step}\n"
            f"Assets: {self.assets}\n"
            f"Income: {self.income}\n"
            f"Consumption: {self.cons}\n"
            f"Net worth: {self.net_worth}\n"
            f"Interest Rate: {self.price}\n"
            f"Wage Rate: {self.W}\n"
            f"Utility: {self.reward}\n"
        )
        return results


class AiyagariMultiAgentEnv(MultiAgentEnv):
    def __init__(self, num):
        self.agents = [AiyagariEnvironment() for _ in range(num)]
        self.dones = set()
        self.observation_space = gym.spaces.Box(
            low=np.array([BORROW_LIM, 0, 0, 0, 0, 0, 0]),
            high=np.array([np.inf, np.inf, np.inf, np.inf, np.inf, np.inf, np.inf]),
            dtype=np.float32,
        )
        self.action_space = gym.spaces.Box(
            low=np.array([BORROW_LIM]), high=np.array([np.inf]), dtype=np.float32
        )
        self.resetted = False
        self.num = num

    def reset(self):
        self.resetted = True
        self.dones = set()
        dict_agents = {i: np.zeros(7) for i, a in enumerate(self.agents)}
        # initial holdings
        self.K = sum(self.agents[i].assets for i in range(self.num))
        self.N = self.num
        self.R = Z * (1 - ALPHA) * (self.N / self.K) ** (ALPHA)
        self.W = Z * (ALPHA) * (self.K / self.N) ** (1 - ALPHA)
        agg_obs_list = [self.K, self.N, self.R, self.W]
        for i in range(self.num):
            dict_agents[i][0:3] = self.agents[i].reset()
            dict_agents[i][3:7] = np.array(agg_obs_list)
        return dict_agents

    def step(self, action_dict):
        obs, rew, done, info = {}, {}, {}, {}
        obs_temp_list = {}
        obs_temp = np.zeros(7)
        for i, action in action_dict.items():
            # get observations which is tomorrow's capital earnings. Use to construct tomorrow prices. then feedback in.
            obs_temp[0:3], rew[i], done[i], info[i] = self.agents[i].step(
                action, self.R, self.W
            )
            obs_temp_list[i] = obs_temp
            # append aggregate observations to each i.
            if done[i]:
                self.dones.add(i)
        # construct and append aggregate states
        self.K = sum(obs_val[0] for obs_val in obs_temp_list.values())
        self.N = self.num
        self.R = Z * (1 - ALPHA) * (self.N / self.K) ** (ALPHA)
        self.W = Z * (ALPHA) * (self.K / self.N) ** (1 - ALPHA)
        for i in range(0,5):
            print(i)
            print(np.size(obs_temp_list[i]))
            obs_temp_list[i][3:7] = np.array([self.K,self.N,self.R,self.W])
            obs[i]=obs_temp_list[i]
        done["__all__"] = len(self.dones) == len(self.agents)
        return obs, rew, done, info

    def render(self, mode="human", close=True):
        # TODO: work on nice render
        results_n = []
        for agent in self.agents:
            # results += env.render(mode, close)
            results = agent.render(mode, close)
            results_n.append(results)
        return results_n


# ? What is the problem with the existing size of the tensor? Why is the tensor recorded incorrectly?

env = AiyagariMultiAgentEnv(5)
obs = env.reset()
for items in env.render():
    print(f"Agent: {env.render().index(items)+1} \n")
    print(items)
print(env.action_space)


tune.register_env("my_env", lambda config: AiyagariMultiAgentEnv(5))
ray.init()
trainer = ppo.PPOTrainer(env="my_env")
while True:
    print(trainer.train())

The error may be related to the fact that, if I do not add a print statement inside the for i in range(0, 5) loop, I get a KeyError saying that 2 or 3 is not a valid key. This seems to be a timing issue caused by the parallelization more than anything else.
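
Looking at the code again, though, it may not be parallelization at all: obs_temp is allocated once and reused, so every entry of obs_temp_list points at the same array, and the for i in range(0, 5) loop asks for agent ids that are not in action_dict once some agents are done. A sketch of the restructuring I plan to try inside step():

        for i, action in action_dict.items():
            obs_i = np.zeros(7)  # fresh array per agent, so the dict entries are not aliased
            obs_i[0:3], rew[i], done[i], info[i] = self.agents[i].step(action, self.R, self.W)
            obs_temp_list[i] = obs_i
            if done[i]:
                self.dones.add(i)
        # ... recompute self.K, self.N, self.R, self.W as before ...
        for i in obs_temp_list:  # only the agents that actually acted this step
            obs_temp_list[i][3:7] = np.array([self.K, self.N, self.R, self.W])
            obs[i] = obs_temp_list[i]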
