Hello, I am experimenting with the Trajectory View API, starting from the RLlib trajectory-view example. I train with ray.tune, then restore the agent from the resulting checkpoint and run a few more episodes. Here is my code:
import argparse
import numpy as np
import ray
from ray import tune
from ray.rllib.examples.env.stateless_cartpole import StatelessCartPole
from ray.rllib.examples.models.trajectory_view_utilizing_models import \
    FrameStackingCartPoleModel, TorchFrameStackingCartPoleModel
from ray.rllib.models.catalog import ModelCatalog
from ray.rllib.utils.framework import try_import_tf
from ray.rllib.utils.test_utils import check_learning_achieved
from ray.rllib import agents
tf1, tf, tfv = try_import_tf()
parser = argparse.ArgumentParser()
parser.add_argument(
    "--run",
    type=str,
    default="PPO",
    help="The RLlib-registered algorithm to use.")
parser.add_argument(
    "--framework",
    choices=["tf", "tf2", "tfe", "torch"],
    default="tf",
    help="The DL framework specifier.")
parser.add_argument(
    "--as-test",
    action="store_true",
    help="Whether this script should be run as a test: --stop-reward must "
         "be achieved within --stop-timesteps AND --stop-iters.")
parser.add_argument(
    "--stop-iters",
    type=int,
    default=2,
    help="Number of iterations to train.")
parser.add_argument(
    "--stop-timesteps",
    type=int,
    default=200000,
    help="Number of timesteps to train.")
parser.add_argument(
    "--stop-reward",
    type=float,
    default=150.0,
    help="Reward at which we stop training.")
if __name__ == "__main__":
    args = parser.parse_args()
    ray.init(num_cpus=3)
    ModelCatalog.register_custom_model(
        "frame_stack_model", FrameStackingCartPoleModel
        if args.framework != "torch" else TorchFrameStackingCartPoleModel)
    tune.register_env("stateless_cartpole", lambda c: StatelessCartPole())
    config = {
        "env": "stateless_cartpole",
        "model": {
            # "vf_share_layers": True,
            # "custom_model": "frame_stack_model",
            # "custom_model_config": {
            #     "num_frames": 16,
            # },
            # To compare against a simple LSTM:
            "use_lstm": True,
            "lstm_cell_size": 128,
            "lstm_use_prev_action": True,
            "lstm_use_prev_reward": True,
            # To compare against a simple attention net:
            # "use_attention": True,
            # "attention_use_n_prev_actions": 1,
            # "attention_use_n_prev_rewards": 1,
        },
        "num_sgd_iter": 5,
        "vf_loss_coeff": 0.0001,
        "framework": args.framework,
    }
    stop = {
        "training_iteration": args.stop_iters,
        "timesteps_total": args.stop_timesteps,
        "episode_reward_mean": args.stop_reward,
    }
    results = tune.run(args.run,
                       config=config,
                       stop=stop,
                       checkpoint_freq=200,
                       local_dir="lstm_trajectory",
                       checkpoint_at_end=True,
                       verbose=2)
    # Get the checkpoints of the best trial (a list of (path, metric) tuples)
    # and restore a fresh PPOTrainer from the first one.
    checkpoints = results.get_trial_checkpoints_paths(
        trial=results.get_best_trial("episode_reward_mean", mode="max"),
        metric="episode_reward_mean")
    checkpoint_path = checkpoints[0][0]
    agent = agents.ppo.PPOTrainer(config, env="stateless_cartpole")
    agent.restore(checkpoint_path)
    env = StatelessCartPole()
    # Run a few episodes with the restored agent, each until it ends.
    for _ in range(10):
        episode_reward = 0
        done = False
        obs = env.reset()
        cell_size = 128
        # Initial LSTM state: zero-filled h and c vectors of size lstm_cell_size.
        state = [np.zeros(cell_size, np.float32),
                 np.zeros(cell_size, np.float32)]
        while not done:
            action, state, logits = agent.compute_action(obs, state)
            obs, reward, done, info = env.step(action)
            episode_reward += reward
        print("reward: {}".format(episode_reward))
        env.render()
    if args.as_test:
        check_learning_achieved(results, args.stop_reward)
    ray.shutdown()
This code fails with the following error:
2021-06-15 13:18:16,108 ERROR tf_run_builder.py:46 -- Error fetching: [<tf.Tensor 'default_policy/cond_1/Merge:0' shape=(?,) dtype=int64>, <tf.Tensor 'default_policy/model_1/lstm/while/Exit_3:0' shape=(?, 128) dtype=float32>, <tf.Tensor 'default_policy/model_1/lstm/while/Exit_4:0' shape=(?, 128) dtype=float32>, {'action_prob': <tf.Tensor 'default_policy/Exp:0' shape=(?,) dtype=float32>, 'action_logp': <tf.Tensor 'default_policy/cond_2/Merge:0' shape=(?,) dtype=float32>, 'action_dist_inputs': <tf.Tensor 'default_policy/Reshape_3:0' shape=(?, 2) dtype=float32>, 'vf_preds': <tf.Tensor 'default_policy/Reshape_5:0' shape=(?,) dtype=float32>}], feed_dict={<tf.Tensor 'default_policy/obs:0' shape=(?, 2) dtype=float32>: [array([-0.00997954, -0.01892092])], <tf.Tensor 'default_policy/seq_lens:0' shape=(?,) dtype=int32>: array([1.]),
<tf.Tensor 'default_policy/Placeholder:0' shape=(?, 128) dtype=float32>: array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
dtype=float32), <tf.Tensor 'default_policy/Placeholder_1:0' shape=(?, 128) dtype=float32>: array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
dtype=float32), <tf.Tensor 'default_policy/is_training:0' shape=() dtype=bool>: False, <tf.Tensor 'default_policy/is_exploring:0' shape=() dtype=bool>: True, <tf.Tensor 'default_policy/timestep:0' shape=() dtype=int64>: 0}
Traceback (most recent call last):
File "/home/condauser/.local/lib/python3.8/site-packages/tensorflow/python/client/session.py", line 1375, in _do_call
return fn(*args)
File "/home/condauser/.local/lib/python3.8/site-packages/tensorflow/python/client/session.py", line 1359, in _run_fn
return self._call_tf_sessionrun(options, feed_dict, fetch_list,
File "/home/condauser/.local/lib/python3.8/site-packages/tensorflow/python/client/session.py", line 1451, in _call_tf_sessionrun
return tf_session.TF_SessionRun_wrapper(self._session, options, feed_dict,
tensorflow.python.framework.errors_impl.InvalidArgumentError: You must feed a value for placeholder tensor 'default_policy/prev_action' with dtype int64 and shape [?]
[[{{node default_policy/prev_action}}]]
Can anyone help me understand how to create the initial state correctly, and why the 'default_policy/prev_action' placeholder is not being fed?
Thank you.
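Edit: reading the error again, I suspect that because lstm_use_prev_action and lstm_use_prev_reward are enabled, compute_action also needs the previous action and reward in addition to the recurrent state. Below is the rollout loop I would try next; it is only a sketch, and the prev_action/prev_reward keyword arguments (as well as the dummy initial values) are my assumption from reading the Trainer API:

import numpy as np

cell_size = 128  # must match "lstm_cell_size" in the config
for _ in range(10):
    episode_reward = 0.0
    done = False
    obs = env.reset()
    # Initial LSTM state: zero-filled h and c vectors of size lstm_cell_size.
    state = [np.zeros(cell_size, np.float32),
             np.zeros(cell_size, np.float32)]
    # No previous action/reward exists before the first step, so start with
    # a dummy action (0) and zero reward (my assumption).
    prev_action = 0
    prev_reward = 0.0
    while not done:
        action, state, _ = agent.compute_action(
            obs, state, prev_action=prev_action, prev_reward=prev_reward)
        obs, reward, done, info = env.step(action)
        episode_reward += reward
        prev_action, prev_reward = action, reward
    print("reward: {}".format(episode_reward))

I also noticed that the policy seems to expose a get_initial_state() method, so maybe agent.get_policy().get_initial_state() is the intended way to build the zero state instead of hard-coding the cell size, but I am not sure.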