Question about Environment/Observation construction

bkaplowitz · June 16, 2021, 6:44pm

Hi, the first time I used RLLIB, I defined the multiple agents in the MultiAgent Environment inside a dictionary indexing each agent that contained each observation and this seemed to work/was what the online guide said. Now, however, I get an error saying that the environment has to be a numpy style array containing all agents. What is the proper way to return the observations for multiple agents and define the observation and action space bounds? I attach my current code as well as the error it currently gives. Any suggested revision would be much appreciated.

import numpy as np
import gym
from gym import spaces
import numpy as np
from gym.utils import seeding, EzPickle
from ray.rllib.utils.typing import MultiAgentDict, AgentID
from typing import Tuple, Dict, List
from gym.envs.registration import EnvSpec
from ray.rllib.env.multi_agent_env import MultiAgentEnv
import ray
from ray.tune.logger import pretty_print
from ray.tune.registry import register_env
from ray.rllib.agents import ppo
from ray.rllib.agents import ddpg
import ray.tune as tune

INITIAL_ASSET_HOLDINGS = 1
BORROW_LIM = -0.01
R_VALUE = 1.03
DELTA = 0.01
W_VALUE = 0.98
GAMMA = 2.0
AGENT_NUM = 1
N = 5
BETA = 0.95
ALPHA = 0.33
Z = 1.0
np.random.seed(2020)

# alternative for raylib


class AiyagariEnvironment(gym.Env):
    """ An environment for value function sampling from a basic RA GE model with capital"""

    # resets state to initial value
    #  def u(cons):
    #      util = cons**(1-GAMMA)/(1-GAMMA)
    #      return util
    # idea pass assets to multiagent, and then return interest rate back to environment.
    metadata = {"render.modes": ["human"]}

    def __init__(self):
        super(AiyagariEnvironment, self).__init__()
        self.reward_range = (0, 100000)
        self.seed()
        # next period asset space bounds [borrow_lim, inf)
        self.action_space = spaces.Box(
            low=np.array([BORROW_LIM]), high=np.array([100000]), dtype=np.float32
        )
        # observation space -- all variables agent will observe before making new decision. Since we assume r will be fixed here, this will include here assets, prices, income. Due to assets acting as summary statistic in this model we will only provide current period assets, prices, income. We can extend this to multi-period if we wanted.
        self.observation_space = spaces.Box(
            low=np.array([BORROW_LIM, 0, 0]),
            high=np.array([100000, 100000, 100000]),
            dtype=np.float32,
        )
        self.assets = INITIAL_ASSET_HOLDINGS
        self.price = R_VALUE - DELTA
        self.W = W_VALUE
        self.current_step = 0
        self.cons = 0
        self.net_worth = self.assets * self.price
        self.reward = 0
        self.shock = np.exp(self.np_random.normal(0, 1))
        self.income = self.W * self.shock
        self.obs = np.array([self.assets, self.price, self.income])
        self.current_step = 0

    # resets state to initial value
    #  def u(cons):
    #      util = cons**(1-GAMMA)/(1-GAMMA)
    #      return utily
    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def reset(self):
        self.assets = INITIAL_ASSET_HOLDINGS
        self.price = R_VALUE - DELTA
        self.W = W_VALUE
        self.current_step = 0
        self.cons = 0
        self.net_worth = self.assets * self.price
        self.reward = 0
        self.shock = np.exp(self.np_random.normal(0, 1))
        self.income = self.W * self.shock
        self.obs = np.array([self.assets, self.price, self.income])
        self.current_step = 0
        # shifted exponential for time being, can impose own distribution with custom sampling later on.
        # for time being will use default distribution for sampling.
        return self.obs

    # updating function
    @property
    def n(self):
        return AGENT_NUM

    def step(self, action, R, W):
        #!!!ERROR: not getting to here: causing problem
        self.current_step += 1
        self.price = R - DELTA
        self.W = W
        self.shock = np.exp(np.random.normal(0, 1))
        self.income = self.W * self.shock
        self.net_worth = (self.price) * self.assets + self.income
        if action in self.action_space:
            if action <= self.net_worth:
                self.assets = action + (self.price) * self.assets
                self.cons = self.net_worth - action
            else:
                self.assets = self.net_worth + (self.price) * self.assets
        else:
            raise ValueError(
                "Received invalid action={:f} which is not part of the action space".format(
                    action
                )
            )
        self.obs = np.array([self.assets, self.price, self.income])
        done = self.cons <= 0
        self.done = done
        if self.cons > 0:
            self.reward = (self.cons ** (1 - GAMMA) / (1 - GAMMA)).item()
        else:
            self.reward = -10000
        return self.obs, self.reward, self.done, {}

    def render(self, mode="human", close=False):
        # work on render to make graph.
        results = str(
            f"Step: {self.current_step}\n"
            f"Assets: {self.assets}\n"
            f"Income: {self.income}\n"
            f"Consumption: {self.cons}\n"
            f"Net worth: {self.net_worth}\n"
            f"Interest Rate: {self.price}\n"
            f"Wage Rate: {self.W}\n"
            f"Utility: {self.reward}\n"
        )
        return results


class AiyagariMultiAgentEnv(MultiAgentEnv):
    def __init__(self, num):
        self.agents = [AiyagariEnvironment() for _ in range(num)]
        self.dones = set()
        #!!! change definition ofobservation space to account for fact now 5x7. Need to modify.
        low_val = np.array([BORROW_LIM, 0, 0, 0, 0, 0, 0])
        low_list = list
        self.observation_space = gym.spaces.Box(
            low=np.repeat(np.array([BORROW_LIM, 0, 0, 0, 0, 0, 0]), num),
            high=np.repeat(np.array([100000, 100000, 100000, 100000, 100000, 100000, 100000]),num),
            dtype=np.float32,
        )
        self.action_space = gym.spaces.Box(
            low=np.array([BORROW_LIM]), high=np.array([100000]), dtype=np.float32
        )
        self.resetted = False
        self.num = num

    def reset(self):
        self.resetted = True
        self.dones = set()
        dict_agents = {i: np.zeros(7) for i, a in enumerate(self.agents)}
        # initial holdings
        self.K = sum(self.agents[i].assets for i in range(self.num))
        self.N = self.num
        self.R = Z * (1 - ALPHA) * (self.N / self.K) ** (ALPHA)
        self.W = Z * (ALPHA) * (self.K / self.N) ** (1 - ALPHA)
        agg_obs_list = [self.K, self.N, self.R, self.W]
        for i in range(self.num):
            dict_agents[i][0:3] = self.agents[i].reset()
            dict_agents[i][3:7] = np.array(agg_obs_list)
        dict_agents = np.array(list(dict_agents.items()))
        return dict_agents

    def step(self, action_dict):
        obs, rew, done, info = {}, {}, {}, {}
        obs_temp_list = {}
        obs =  dict.fromkeys(range(5))
        

        obs_temp_list = dict.fromkeys(range(5))
        obs_temp_list = {i: self.agents[i].step(action,self.R,self.W)[0] for i, action in action_dict.items()}
        obs_temp = np.zeros(7)
        for i, action in action_dict.items():
            # get observations which is tomorrow's capital earnings. Use to construct tomorrow prices. then feedback in.
            obs_temp[0:3], rew[i], done[i], info[i] = self.agents[i].step(
                action, self.R, self.W
            ) 
            obs_temp_list[i] = obs_temp
            # append aggregate observations to each i.
            if done[i]:
                self.dones.add(i)
        # construct and append aggregate states
        self.K = sum(obs_val[0] for obs_val in obs_temp_list.values())
        self.N = self.num
        self.R = Z * (1 - ALPHA) * (self.N / self.K) ** (ALPHA)
        self.W = Z * (ALPHA) * (self.K / self.N) ** (1 - ALPHA)
        for i in range(0, 5):
            #print(i)
            #print(np.size(obs_temp_list[i]))
            obs_temp_list[i][3:7] = np.array([self.K, self.N, self.R, self.W])
            obs[i] = obs_temp_list[i]
        done["__all__"] = len(self.dones) == len(self.agents)
        obs_new = np.array(list(obs.items()))
        return obs_new, rew, done, info
    def render(self, mode="human", close=True):
        # TODO: work on nice render
        results_n = []
        for agent in self.agents:
            # results += env.render(mode, close)
            results = agent.render(mode, close)
            results_n.append(results)
        return results_n


if __name__ == "__main__":

    env = AiyagariMultiAgentEnv(5)
    obs = env.reset()
    for items in env.render():
        print(f"Agent: {env.render().index(items)+1} \n")
        print(items)
    print(env.action_space)

    tune.register_env("my_env", lambda config: AiyagariMultiAgentEnv(5))

    #obs_space = env.observation_space
    #act_spc = env.action_space
    #policies = {agent: (None, obs_space, act_spc, {}) for agent in env.agents}
    ray.init()
    config = {
            "env": "my_env",
            # General
            "num_gpus": 0,
            "num_workers": 2,
            # Method specific
            }  
#   analysis=tune.run(
#         "DDPG",
#        stop={"training_iteration": 100},
#         checkpoint_freq=10,
#        config=config,
#        checkpoint_at_end = True
#    )   
#    checkpoints = analysis.get_trial_checkpoints_paths(trial= analysis.get_best_trial("epsiode_reward_mean"), metric="episode_reward_mean")
#    agent = ddpg.DDPGTrainer(config=config, env="my_env")
#    agent.restore(checkpoints)  
    # analysis=tune.run(
    #     "PPO",
    #     stop={"episodes_total": 60000},
    #     checkpoint_freq=10,
    #     config=config,
    #     checkpoint_at_end = True
    # )
    #checkpoints = analysis.get_trial_checkpoints_paths(trial= analysis.get_best_trial("epsiode_reward_mean"), metric="episode_reward_mean")

    agent = ppo.PPOTrainer(config=config, env="my_env")
    #agent.restore(checkpoints)  
    episode_reward = 0
    done = False
    obs = env.reset()
    while not done:
        action = agent.compute_action(obs)
        obs, reward, done, info = env.step(action)
        episode_reward += reward

and it returns:

Traceback (most recent call last):
  File "~/aiyagari.py", line 264, in <module>
    action = agent.compute_action(obs)
  File "~/opt/anaconda3/lib/python3.7/site-packages/ray/rllib/agents/trainer.py", line 819, in compute_action
    policy_id].transform(observation)
  File "~/opt/anaconda3/lib/python3.7/site-packages/ray/rllib/models/preprocessors.py", line 168, in transform
    self.check_shape(observation)
  File "~/opt/anaconda3/lib/python3.7/site-packages/ray/rllib/models/preprocessors.py", line 65, in check_shape
    observation, self._obs_space)
ValueError: ('Observation ({}) outside given space ({})!', array([[0,
        array([1.        , 1.02      , 0.84956773, 5.        , 5.        ,
       0.67      , 0.33      ])],
       [1,
        array([1.      , 1.02    , 0.544302, 5.      , 5.      , 0.67    ,
       0.33    ])],
       [2,
        array([1.        , 1.02      , 0.17352003, 5.        , 5.        ,
       0.67      , 0.33      ])],
       [3,
        array([1.        , 1.02      , 0.13939275, 5.        , 5.        ,
       0.67      , 0.33      ])],
       [4,
        array([1.        , 1.02      , 0.71660652, 5.        , 5.        ,
       0.67      , 0.33      ])]], dtype=object), Box(35,))

bkaplowitz · June 17, 2021, 4:19pm

Nevermind all resolved. Wasn’t training before running.

Topic		Replies	Views
[Rllib][Bug] Custom Multi-agent Environment Observation Space " does not contain returned observation after a reset" RLlib	0	323	April 4, 2022
Ray actor error: env.observation_space.contains(dummy_obs) RLlib	4	366	November 11, 2021
How should you end a MultiAgentEnv episode? RLlib	16	1314	October 1, 2022
Error: TypeError: 'EnvContext' object cannot be interpreted as an integer? RLlib	6	1783	February 19, 2021
Obervation space and action space in multi-agent env RLlib	3	402	August 14, 2021

Question about Environment/Observation construction

Related topics