Is there a memory-cheap way to roll out multiple batches of a policy? I am working on a curriculum reinforcement learning algorithm where I need to roll out several times to find the average performance of a set of policies on a given environment, and vice versa.
As a minimal example I have a class:
class MyModel:
    """Wrap RLlib PPO training and Ray-parallel rollout evaluation.

    Holds a tune/RLlib config, trains a PPO policy to a checkpoint, and
    evaluates that checkpoint by rolling out episodes in parallel Ray tasks.
    """

    def __init__(self, config, local_dir, env, max_ep_len=1000):
        self.config = config          # RLlib trainer config dict
        self.local_dir = local_dir    # directory where tune writes results/checkpoints
        self.policy = None            # checkpoint path, set by train() (or simulate_env)
        self.env = env                # gym environment id string
        # FIX: self.max_ep_len was read in rollout() but never initialized.
        self.max_ep_len = max_ep_len  # cap on steps per rollout episode

    def train(self):
        """Train PPO via tune and remember the final checkpoint path."""
        analysis = tune.run(
            'PPO',
            config=self.config,
            local_dir=self.local_dir,
            checkpoint_at_end=True)
        self.policy = analysis.get_checkpoint()

    @ray.remote
    def rollout(self) -> float:
        """
        Roll out a single episode of the environment.

        Runs as a Ray task; note each task builds its own PPOTrainer, which
        is what makes many concurrent rollouts memory-hungry.

        :return: episodic reward
        """
        config = self.config
        # FIX: original used self.env_name, which is never assigned anywhere
        # in the class — the env id lives in self.env.
        agent = ppo.PPOTrainer(config, self.env)
        agent.restore(self.policy)
        env = gym.make(self.env)
        episode_reward = 0
        obs = env.reset()
        for _ in range(1, self.max_ep_len):
            # FIX: Trainer has no .compute(); the inference call is
            # compute_action() (compute_single_action in newer Ray — TODO
            # confirm against the installed Ray version).
            action = agent.compute_action(obs)
            obs, reward, done, _ = env.step(action)
            episode_reward += reward
            if done:
                break
        return episode_reward

    def simulate_env(self, n_episodes: int, env_config: dict = None, policy: str = None):
        """
        Roll out multiple episodes of the environment in parallel.

        :param n_episodes: number of episodes to average the rollout over
        :param env_config: environment configuration dict; if not None, stored
        :param policy: checkpoint path for the policy; if not None, stored
        :return: average episodic reward over n_episodes
        """
        if env_config:
            self.env_config = env_config
        if policy:
            self.policy = policy
        # rollout is a RemoteFunction (not a bound method), so self is passed
        # explicitly; ray.get blocks until all episode rewards are ready.
        rollouts = ray.get([self.rollout.remote(self) for _ in range(n_episodes)])
        return sum(rollouts) / len(rollouts)

    @ray.remote
    def eligible_to_reproduce(self, passing_score: float, pair: tuple, n_episodes: int = 5) -> bool:
        """
        Test if the env-agent pair is eligible to reproduce.

        :param passing_score: passing score to be eligible to reproduce
        :param pair: (env_config, policy) pair to test
        :param n_episodes: number of episodes to test over
        :return: True if the average reward meets passing_score
        """
        # FIX: the original passed the whole tuple as BOTH env_config and
        # policy; unpack it instead (assumes pair = (env_config, policy) —
        # TODO confirm against how ea_list is built).
        env_config, policy = pair
        average_reward = self.simulate_env(
            n_episodes=n_episodes, env_config=env_config, policy=policy)
        return average_reward >= passing_score
Then, in a separate class, I call a method:
def list_elgible_parents(self):
    """
    If a pair in the ea list is eligible to reproduce, add it to the parent list.

    :return: None
    """
    # FIX: the original used the ObjectRef returned by .remote() directly as
    # the comprehension's condition — an ObjectRef is always truthy, so EVERY
    # pair was marked eligible. Launch all checks in parallel first, then
    # ray.get() the actual booleans once, and filter on those.
    result_refs = [
        self.model.eligible_to_reproduce.remote(
            self.model, passing_score=self.passing_score, pair=pair)
        for pair in self.ea_list
    ]
    results = ray.get(result_refs)
    eligible_parents = [pair for pair, ok in zip(self.ea_list, results) if ok]
In this case I am calling n rollouts per environment agent pair in parallel and then calling n environments to check which is eligible to reproduce. This leads to an OOM error as my machine consumes lots of RAM spawning many workers.
Is there a smarter way to run inference rollouts for a given policy in parallel with Ray, where I don't need to instantiate multiple copies of the policy network (the agent, a ppo.PPOTrainer object), which I believe is what is consuming all the resources? Thanks for everyone's help.