Custom_experiment.py modified version

Hello, I have just taken the custom_experiment.py example and made a few adjustments for torch and an LSTM:

"""Example of a custom experiment wrapped around an RLlib trainer."""
import argparse
import numpy as np

import ray
from ray import tune
from ray.rllib.agents import ppo

parser = argparse.ArgumentParser()
parser.add_argument("--train-iterations", type=int, default=10)


def experiment(config):
    iterations = config.pop("train-iterations")
    train_agent = ppo.PPOTrainer(config=config, env="CartPole-v0")
    checkpoint = None
    train_results = {}

    # Train
    for i in range(iterations):
        train_results = train_agent.train()
        if i % 2 == 0 or i == iterations - 1:
            checkpoint = train_agent.save(tune.get_trial_dir())
        tune.report(**train_results)
    train_agent.stop()

    # Manual Eval
    config["num_workers"] = 0
    eval_agent = ppo.PPOTrainer(config=config, env="CartPole-v0")
    eval_agent.restore(checkpoint)
    env = eval_agent.workers.local_worker().env

    lstm_cell_size = config["model"]["lstm_cell_size"]
    init_state = state = [np.zeros([lstm_cell_size], np.float32) for _ in range(2)]

    prev_a = 0
    prev_r = 0.0

    obs = env.reset()
    done = False
    eval_results = {"eval_reward": 0, "eval_eps_length": 0}
    state = init_state
    while not done:
        action, state, _ = eval_agent.compute_single_action(obs, state, prev_a, prev_r)
        obs, reward, done, info = env.step(action)
        prev_a = action
        prev_r = reward
        eval_results["eval_reward"] += reward
        eval_results["eval_eps_length"] += 1
    results = {**train_results, **eval_results}
    tune.report(results)


if __name__ == "__main__":
    args = parser.parse_args()

    ray.init(num_cpus=3)
    config = ppo.DEFAULT_CONFIG.copy()
    config["train-iterations"] = args.train_iterations
    config["framework"] = 'torch'
    config["model"]["use_lstm"] = True
    config["model"]["lstm_cell_size"] = 256
    config["model"]["lstm_use_prev_action"] = True
    config["model"]["lstm_use_prev_reward"] = True

    config["env"] = "CartPole-v0"

    tune.run(
        experiment,
        config=config,
        resources_per_trial=ppo.PPOTrainer.default_resource_request(config))

This works fine. Nice!
It uses “CartPole-v0”, so I tried replacing it with StatelessCartPole:

"""Example of a custom experiment wrapped around an RLlib trainer."""
import argparse
import numpy as np

import ray
from ray import tune
from ray.rllib.agents import ppo
from ray.rllib.examples.env.stateless_cartpole import StatelessCartPole


parser = argparse.ArgumentParser()
parser.add_argument("--train-iterations", type=int, default=10)


def experiment(config):
    iterations = config.pop("train-iterations")
    train_agent = ppo.PPOTrainer(config=config, env=StatelessCartPole)
    checkpoint = None
    train_results = {}

    # Train
    for i in range(iterations):
        train_results = train_agent.train()
        if i % 2 == 0 or i == iterations - 1:
            checkpoint = train_agent.save(tune.get_trial_dir())
        tune.report(**train_results)
    train_agent.stop()

    # Manual Eval
    config["num_workers"] = 0
    eval_agent = ppo.PPOTrainer(config=config, env=StatelessCartPole)
    eval_agent.restore(checkpoint)
    env = eval_agent.workers.local_worker().env

    lstm_cell_size = config["model"]["lstm_cell_size"]
    init_state = state = [np.zeros([lstm_cell_size], np.float32) for _ in range(2)]

    prev_a = 0
    prev_r = 0.0

    obs = env.reset()
    done = False
    eval_results = {"eval_reward": 0, "eval_eps_length": 0}
    state = init_state
    while not done:
        action, state, _ = eval_agent.compute_single_action(obs, state, prev_a, prev_r)
        obs, reward, done, info = env.step(action)
        prev_a = action
        prev_r = reward
        eval_results["eval_reward"] += reward
        eval_results["eval_eps_length"] += 1
    results = {**train_results, **eval_results}
    tune.report(results)


if __name__ == "__main__":
    args = parser.parse_args()

    ray.init(num_cpus=3)
    config = ppo.DEFAULT_CONFIG.copy()
    config["train-iterations"] = args.train_iterations
    config["framework"] = 'torch'
    config["model"]["use_lstm"] = True
    config["model"]["lstm_cell_size"] = 256
    config["model"]["lstm_use_prev_action"] = True
    config["model"]["lstm_use_prev_reward"] = True

    config["env"] = StatelessCartPole

    tune.run(
        experiment,
        config=config,
        resources_per_trial=ppo.PPOTrainer.default_resource_request(config))

This code fails with the following error:

  File "/srv/local_projects/docker/ray/examples/custom/custom_experiment2.py", line 46, in experiment
    action, state, _ = eval_agent.compute_single_action(obs, state, prev_a, prev_r)
TypeError: compute_single_action() takes from 1 to 3 positional arguments but 5 were given

Any ideas?

Hi @mg64ve,

I am not sure what the difference is, but I could not replicate the error in this colab:

Is there a reason you are not using RLlib's built-in evaluation options in the config? I think they would return these same values.
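
For context, here is a minimal sketch of what that could look like with the same model settings. The evaluation_* keys below are the standard trainer evaluation options in this Ray generation, but their exact names and defaults may differ between versions, so please double-check them against your installed RLlib:

import ray
from ray.rllib.agents import ppo
from ray.rllib.examples.env.stateless_cartpole import StatelessCartPole

ray.init(num_cpus=3)
config = ppo.DEFAULT_CONFIG.copy()
config["framework"] = "torch"
config["model"]["use_lstm"] = True
config["model"]["lstm_cell_size"] = 256
config["model"]["lstm_use_prev_action"] = True
config["model"]["lstm_use_prev_reward"] = True

# Built-in evaluation: the trainer runs the eval episodes itself, no manual loop.
config["evaluation_interval"] = 2         # evaluate every 2nd train() call
config["evaluation_num_episodes"] = 10    # episodes per evaluation run
config["evaluation_num_workers"] = 0      # evaluate on the local worker
config["evaluation_config"] = {"explore": False}  # deterministic eval actions

trainer = ppo.PPOTrainer(config=config, env=StatelessCartPole)
for i in range(4):
    results = trainer.train()
    # On eval iterations, metrics such as episode_reward_mean and
    # episode_len_mean show up under results["evaluation"].
    if "evaluation" in results:
        print(results["evaluation"]["episode_reward_mean"])
trainer.stop()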

You are right @mannyv, this code fails only with ray 2.0.0.dev0.
I have tried to put it into the following colab, but there it runs without stopping.

In my Docker container it fails.

Hi @mg64ve,

Yes, I see the error with nightly.

The issue comes from a change to the compute_single_action method definition, which now requires you to pass the previous values as kwargs:

If you check the notebook you shared, it works now with this change:

action, state, _ = eval_agent.compute_single_action(obs, state,
                                                    prev_action=prev_a,
                                                    prev_reward=prev_r)
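
For reference, this is the whole manual eval loop from the script above with only that call updated (same variables as before, just passing prev_a and prev_r as keyword arguments):

    obs = env.reset()
    done = False
    eval_results = {"eval_reward": 0, "eval_eps_length": 0}
    state = init_state
    while not done:
        # On nightly, prev_action / prev_reward are keyword-only.
        action, state, _ = eval_agent.compute_single_action(
            obs, state, prev_action=prev_a, prev_reward=prev_r)
        obs, reward, done, info = env.step(action)
        prev_a = action
        prev_r = reward
        eval_results["eval_reward"] += reward
        eval_results["eval_eps_length"] += 1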

You are right @mannyv. It seems to work.