Hello, consider the following documentation:
https://docs.ray.io/en/master/rllib-training.html#computing-actions
There is no mention that this does not apply to models using the Trajectory View API.
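For reference, the pattern on that page is essentially the following (paraphrased from the docs; the trainer construction here is mine, for a plain non-trajectory-view model):

import gym
import ray
import ray.rllib.agents.ppo as ppo

ray.init()
trainer = ppo.PPOTrainer(config={"num_workers": 0}, env="CartPole-v0")

env = gym.make("CartPole-v0")
obs = env.reset()
done = False
episode_reward = 0.0
while not done:
    # One observation in, one action out; no extra view requirements involved.
    action = trainer.compute_action(obs)
    obs, reward, done, info = env.step(action)
    episode_reward += reward

This works fine for standard models.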
Now, if you consider the following example (the trajectory_view_api.py script from the RLlib examples), it covers training only.
Suppose I want a customized replay where, at the end of each episode, I also render my environment. I could add something like the following:
checkpoints = results.get_trial_checkpoints_paths(
    trial=results.get_best_trial('episode_reward_mean', mode='max'),
    metric='episode_reward_mean')
checkpoint_path = checkpoints[0][0]

agent = agents.ppo.PPOTrainer(config, env="stateless_cartpole")
agent.restore(checkpoint_path)
env = StatelessCartPole()

# Run until the episode ends.
for _ in range(10):
    episode_reward = 0
    reward = 0.
    action = 0
    done = False
    obs = env.reset()
    state = np.zeros(2 * 256, np.float32).reshape(2, 256)
    # state = None
    while not done:
        action, state, logits = agent.compute_action(obs, state)
        obs, reward, done, info = env.step(action)
        episode_reward += reward
    print("reward: {}".format(episode_reward))
    env.render()
Now this code fails with the following error:
2021-07-05 17:23:10,845 WARNING deprecation.py:33 -- DeprecationWarning: `compute_action` has been deprecated. Use `compute_single_action` instead. This will raise an error in the future!
Traceback (most recent call last):
File "trajectory_view_api.py", line 115, in <module>
action, state, logits = agent.compute_action(obs, state)
File "/opt/conda/lib/python3.8/site-packages/ray/rllib/agents/trainer.py", line 1005, in compute_action
return self.compute_single_action(*args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/ray/rllib/agents/trainer.py", line 986, in compute_single_action
result = self.get_policy(policy_id).compute_single_action(
File "/opt/conda/lib/python3.8/site-packages/ray/rllib/policy/policy.py", line 224, in compute_single_action
out = self.compute_actions(
File "/opt/conda/lib/python3.8/site-packages/ray/rllib/policy/torch_policy.py", line 239, in compute_actions
return self._compute_action_helper(input_dict, state_batches,
File "/opt/conda/lib/python3.8/site-packages/ray/rllib/utils/threading.py", line 21, in wrapper
return func(self, *a, **k)
File "/opt/conda/lib/python3.8/site-packages/ray/rllib/policy/torch_policy.py", line 326, in _compute_action_helper
dist_inputs, state_out = self.model(input_dict, state_batches,
File "/opt/conda/lib/python3.8/site-packages/ray/rllib/models/modelv2.py", line 230, in __call__
res = self.forward(restored, state or [], seq_lens)
File "/opt/conda/lib/python3.8/site-packages/ray/rllib/examples/models/trajectory_view_utilizing_models.py", line 119, in forward
obs = input_dict["prev_n_obs"]
File "/opt/conda/lib/python3.8/site-packages/ray/rllib/policy/sample_batch.py", line 500, in __getitem__
value = dict.__getitem__(self, key)
KeyError: 'prev_n_obs'
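If I read this right, the custom frame-stacking model registers extra view requirements in its constructor ("prev_n_obs", and in this example model also "prev_n_actions" and "prev_n_rewards"), and these are only filled in automatically during sampling, not by compute_action. Printing them seems to confirm what the model expects (my own debugging, not something from the docs):

policy = agent.get_policy()
# Every key beyond "obs" listed here has to be provided manually
# when computing actions outside of a rollout worker.
for name, vr in policy.model.view_requirements.items():
    print(name, vr.data_col, vr.shift)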
And if I change the loop to call the policy directly, like this:
checkpoint_path = checkpoints[0][0]

agent = agents.ppo.PPOTrainer(config, env="stateless_cartpole")
agent.restore(checkpoint_path)
env = StatelessCartPole()
policy = agent.get_policy()

# Run until the episode ends.
for _ in range(10):
    episode_reward = 0
    reward = 0.
    action = 0
    done = False
    obs = env.reset()
    state = np.zeros(2 * 256, np.float32).reshape(2, 256)
    actions = np.zeros(16, np.float32).reshape(1, 16)
    rewards = np.zeros(16, np.float32).reshape(1, 16)
    # state = None
    while not done:
        action, state, logits = policy.compute_actions(
            obs, state, prev_action_batch=actions, prev_reward_batch=rewards)
        obs, reward, done, info = env.step(action)
        episode_reward += reward
    print("reward: {}".format(episode_reward))
    env.render()
it fails with the following error:
2021-07-05 17:27:56,257 INFO trainable.py:390 -- Current state after restoring: {'_iteration': 10, '_timesteps_total': None, '_time_total': 30.673815488815308, '_episodes_total': 1226}
/opt/conda/lib/python3.8/site-packages/gym/logger.py:30: UserWarning: WARN: Box bound precision lowered by casting to float32
warnings.warn(colorize('%s: %s'%('WARN', msg % args), 'yellow'))
Traceback (most recent call last):
File "trajectory_view_api.py", line 135, in <module>
action, state, logits = policy.compute_actions(obs, state, prev_action_batch=actions, prev_reward_batch=rewards)
File "/opt/conda/lib/python3.8/site-packages/ray/rllib/policy/torch_policy.py", line 237, in compute_actions
for s in (state_batches or [])
ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
Exception ignored in: <function ActorHandle.__del__ at 0x7fcb8a0d3790>
Traceback (most recent call last):
File "/opt/conda/lib/python3.8/site-packages/ray/actor.py", line 834, in __del__
Is there any further documentation on how to manually replay a trained policy after tune.run when the model uses the Trajectory View API?
I can't make it work.
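For completeness, here is my current best guess at a working replay loop, based on the view requirements above. This is only a sketch: it assumes Policy.compute_actions_from_input_dict exists in this Ray version and passes the extra keys through to the model's forward(), and that num_frames matches the model config (16 here):

import numpy as np
from collections import deque
from ray.rllib.policy.sample_batch import SampleBatch

num_frames = 16  # must match the "num_frames" the custom model was built with

for _ in range(10):
    episode_reward = 0.0
    done = False
    obs = env.reset()
    # Rolling windows feeding the model's extra view requirements.
    obs_hist = deque([obs] * num_frames, maxlen=num_frames)
    act_hist = deque([0] * num_frames, maxlen=num_frames)
    rew_hist = deque([0.0] * num_frames, maxlen=num_frames)
    while not done:
        input_dict = SampleBatch({
            "obs": np.array([obs]),
            "prev_n_obs": np.stack(obs_hist)[None],      # shape [1, num_frames, obs_dim]
            "prev_n_actions": np.stack(act_hist)[None],  # shape [1, num_frames]
            "prev_n_rewards": np.stack(rew_hist)[None],  # shape [1, num_frames]
        })
        action_batch, state_out, extra = policy.compute_actions_from_input_dict(input_dict)
        action = action_batch[0]
        obs, reward, done, info = env.step(action)
        obs_hist.append(obs)
        act_hist.append(action)
        rew_hist.append(reward)
        episode_reward += reward
    print("reward: {}".format(episode_reward))
    env.render()

Is this roughly the intended approach, or is there a supported API for replaying trajectory-view models that I am missing?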