Is there a memory-cheap way to roll out multiple batches of a policy? I am working on a curriculum reinforcement learning algorithm where I need to roll out several times to find the average performance of a set of policies on a given environment, and vice versa (a set of environments for a given policy).
As a minimal example, I have a class:
import gym
import ray
from ray import tune
from ray.rllib.agents import ppo


class MyModel:
    def __init__(self, config, local_dir, env, max_ep_len=1000):
        self.config = config
        self.local_dir = local_dir
        self.policy = None            # path to the trained policy checkpoint
        self.env = env                # gym environment id
        self.max_ep_len = max_ep_len  # max steps per rollout episode
    def train(self):
        analysis = tune.run(
            'PPO',
            config=self.config,
            local_dir=self.local_dir,
            checkpoint_at_end=True)
        self.policy = analysis.get_last_checkpoint()
    @ray.remote
    def rollout(self) -> float:
        """
        Roll out a single episode of the environment.
        :return: episodic reward
        """
        config = self.config
        # a new PPOTrainer is built for every remote rollout task
        agent = ppo.PPOTrainer(config=config, env=self.env)
        agent.restore(self.policy)
        env = gym.make(self.env)
        episode_reward = 0
        obs = env.reset()
        for _ in range(self.max_ep_len):
            action = agent.compute_action(obs)
            obs, reward, done, _ = env.step(action)
            episode_reward += reward
            if done:
                break
        return episode_reward
    def simulate_env(self, n_episodes: int, env_config: dict = None, policy: str = None):
        """
        Roll out multiple episodes of the environment.
        :param n_episodes: number of episodes to average the rollout reward over
        :param env_config: environment configuration dict; if not None, it is updated
        :param policy: path to the policy checkpoint; if not None, it is updated
        :return: average episodic reward over n_episodes
        """
        if env_config:
            self.env_config = env_config
        if policy:
            self.policy = policy
        rollouts = ray.get([self.rollout.remote(self) for _ in range(n_episodes)])
        return sum(rollouts) / len(rollouts)
    @ray.remote
    def eligible_to_reproduce(self, passing_score: float, pair: tuple, n_episodes: int = 5) -> bool:
        """
        Test whether the env-agent pair is eligible to reproduce.
        :param passing_score: score required to be eligible to reproduce
        :param pair: (env_config, policy_checkpoint) pair to test
        :param n_episodes: number of episodes to test over
        :return: True if the pair passes the defined conditions
        """
        average_reward = self.simulate_env(n_episodes=n_episodes, env_config=pair[0], policy=pair[1])
        return average_reward >= passing_score
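For context, this is roughly how I construct and drive the model; the config values below are placeholders rather than my real setup:

import ray

ray.init()

config = {
    "env": "CartPole-v0",   # placeholder environment id
    "num_workers": 2,
    "framework": "torch",
}

model = MyModel(config=config, local_dir="~/ray_results", env="CartPole-v0")
model.train()                              # trains with tune and stores the checkpoint path
avg_reward = model.simulate_env(n_episodes=5)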
Then in a separate class I call a method:
    def list_eligible_parents(self):
        """
        If a pair in the ea list is eligible to reproduce, add it to the parent list.
        :return: None
        """
        futures = [
            self.model.eligible_to_reproduce.remote(
                self.model, passing_score=self.passing_score, pair=pair)
            for pair in self.ea_list
        ]
        # ray.get is required here; the raw ObjectRefs would always be truthy
        eligible_parents = [pair for pair, ok in zip(self.ea_list, ray.get(futures)) if ok]
In this case I am launching n rollouts per environment-agent pair in parallel, and then checking n environment-agent pairs in parallel to see which are eligible to reproduce. This leads to an OOM error because my machine consumes a lot of RAM spawning so many workers.
Is there a smarter way to roll out a given policy for inference in parallel with Ray, without instantiating a separate copy of the policy network (the agent, a ppo.PPOTrainer object) in every task? I believe those instances are what is consuming all the resources. Thanks for everyone's help.
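One direction I have been considering, though I am not sure it is the right pattern, is to wrap the restored trainer in a Ray actor so each worker builds the PPOTrainer once and reuses it across many episodes. A rough sketch of what I mean is below; PolicyWorker is just a hypothetical name of mine, and config / checkpoint_path are assumed to already exist:

import gym
import ray
from ray.rllib.agents import ppo


@ray.remote
class PolicyWorker:
    """Hypothetical actor: builds the PPO agent once, then reuses it for many rollouts."""

    def __init__(self, config, env_name, checkpoint):
        # possibly with config["num_workers"] = 0, since this trainer is inference-only
        self.agent = ppo.PPOTrainer(config=config, env=env_name)
        self.agent.restore(checkpoint)
        self.env = gym.make(env_name)

    def rollout(self, max_ep_len=1000):
        obs = self.env.reset()
        episode_reward = 0.0
        for _ in range(max_ep_len):
            action = self.agent.compute_action(obs)
            obs, reward, done, _ = self.env.step(action)
            episode_reward += reward
            if done:
                break
        return episode_reward


# small fixed pool of actors instead of one fresh trainer per episode
workers = [PolicyWorker.remote(config, "CartPole-v0", checkpoint_path)
           for _ in range(4)]
rewards = ray.get([workers[i % len(workers)].rollout.remote() for i in range(20)])
avg_reward = sum(rewards) / len(rewards)

Would an actor pool like this be a reasonable way to keep memory down, or does Ray/RLlib have a more idiomatic mechanism for parallel inference rollouts?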