How severely does this issue affect your experience of using Ray?
- High: It blocks me from completing my task.
I posted about this about a month ago and @Lars_Simon_Zehnder was kind enough to fix the problem. Unfortunately, it seems that the tf_action_dist.py
file was changed again, and I can no longer use trainer.compute_single_action when I have an LSTM network: I keep getting a ValueError about the shapes being the wrong size. The training part still works fine. Here is a link to the file in Colab if you want to reproduce it from there.
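In short, the part that fails is the recurrent inference call (condensed from the full script below; the 256 matches my lstm_cell_size):

# Condensed from the full script below; `trainer` is the PPOTrainer
# configured with use_lstm=True and lstm_cell_size=256.
state = [np.zeros([256], np.float32) for _ in range(2)]  # LSTM h- and c-state
action, state, _ = trainer.compute_single_action(
    obs, state, prev_action=0, prev_reward=0.0
)
# -> ValueError: Cannot feed value of shape (1,) for Tensor
#    default_policy/prev_actions:0, which has shape (?, 1)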
Here is the code as well:
import gym
import numpy as np
from ray.rllib.agents.ppo import PPOTrainer


# Define your problem using python and openAI's gym API:
class ParrotEnv(gym.Env):
    """Environment in which an agent must learn to repeat the seen observations.

    Observations are float numbers indicating the to-be-repeated values,
    e.g. -1.0, 5.1, or 3.2.
    The action space is always the same as the observation space.
    Rewards are r=-abs(observation - action), for all steps.
    """

    def __init__(self, config):
        # Make the space (for actions and observations) configurable.
        self.action_space = config.get(
            "parrot_shriek_range", gym.spaces.Box(-1.0, 1.0, shape=(1,))
        )
        # Since actions should repeat observations, their spaces must be the
        # same.
        self.observation_space = self.action_space
        self.cur_obs = None
        self.episode_len = 0

    def reset(self):
        """Resets the episode and returns the initial observation of the new one."""
        # Reset the episode len.
        self.episode_len = 0
        # Sample a random number from our observation space.
        self.cur_obs = self.observation_space.sample()
        # Return initial observation.
        return self.cur_obs

    def step(self, action):
        """Takes a single step in the episode given `action`.

        Returns: New observation, reward, done-flag, info-dict (empty).
        """
        # Set `done` flag after 10 steps.
        self.episode_len += 1
        done = self.episode_len >= 10
        # r = -abs(obs - action)
        reward = int(-abs(self.cur_obs - action))
        # Set a new observation (random sample).
        self.cur_obs = self.observation_space.sample()
        return self.cur_obs, reward, done, {}


# Create an RLlib Trainer instance to learn how to act in the above
# environment.
trainer = PPOTrainer(
    config={
        # Env class to use (here: our gym.Env sub-class from above).
        "env": ParrotEnv,
        # Config dict to be passed to our custom env's constructor.
        "env_config": {"parrot_shriek_range": gym.spaces.Box(-5.0, 5.0, (1,))},
        # Parallelize environment rollouts.
        "num_workers": 1,
        "model": {
            "use_lstm": True,
            "lstm_cell_size": 256,
            "lstm_use_prev_action": True,
            "lstm_use_prev_reward": True,
        },
    }
)

# Train for n iterations and report results (mean episode rewards).
# Since we have to guess 10 times and the optimal reward is 0.0
# (exact match between observation and action value),
# we can expect to reach an optimal episode reward of 0.0.
for i in range(10):
    results = trainer.train()
    print(f"Iter: {i}; avg. reward={results['episode_reward_mean']}")

# Perform inference (action computations) based on given env observations.
# Note that we are using a slightly simpler env here (-3.0 to 3.0, instead
# of -5.0 to 5.0!), however, this should still work as the agent has
# (hopefully) learned to "just always repeat the observation!".
env = ParrotEnv({"parrot_shriek_range": gym.spaces.Box(-3.0, 3.0, (1,))})
# Get the initial observation (some value between -3.0 and 3.0).
obs = env.reset()
# Initial LSTM state: zero h- and c-vectors of size lstm_cell_size (256).
state = [np.zeros([256], np.float32) for _ in range(2)]
prev_a = 0
prev_r = 0.0
done = False
total_reward = 0.0
# Play one episode.
while not done:
    # Compute a single action, given the current observation
    # from the environment.
    action, state, _ = trainer.compute_single_action(
        obs, state, prev_action=prev_a, prev_reward=prev_r
    )
    # Apply the computed action in the environment.
    obs, reward, done, info = env.step(action)
    prev_a = action
    prev_r = reward
    # Sum up rewards for reporting purposes.
    total_reward += reward
# Report results.
print(f"Played 1 episode; total-reward={total_reward}")
And the error:
2022-04-26 06:30:32,082 ERROR tf_run_builder.py:52 -- Error fetching: [<tf.Tensor 'default_policy/cond_1/Merge:0' shape=(?, 1) dtype=float32>, <tf.Tensor 'default_policy/model_1/lstm/while/Exit_3:0' shape=(?, 256) dtype=float32>, <tf.Tensor 'default_policy/model_1/lstm/while/Exit_4:0' shape=(?, 256) dtype=float32>, {'action_prob': <tf.Tensor 'default_policy/Exp_1:0' shape=(?,) dtype=float32>, 'action_logp': <tf.Tensor 'default_policy/cond_2/Merge:0' shape=(?,) dtype=float32>, 'action_dist_inputs': <tf.Tensor 'default_policy/Reshape_3:0' shape=(?, 2) dtype=float32>, 'vf_preds': <tf.Tensor 'default_policy/Reshape_4:0' shape=(?,) dtype=float32>}], feed_dict={<tf.Tensor 'default_policy/obs:0' shape=(?, 1) dtype=float32>: array([[-0.30549845]], dtype=float32), <tf.Tensor 'default_policy/state_in_0:0' shape=(?, 256) dtype=float32>: <(1, 256) float32 array of zeros>, <tf.Tensor 'default_policy/state_in_1:0' shape=(?, 256) dtype=float32>: <(1, 256) float32 array of zeros>, <tf.Tensor 'default_policy/prev_actions:0' shape=(?, 1) dtype=float32>: array([0]), <tf.Tensor 'default_policy/prev_rewards:0' shape=(?,) dtype=float32>: array([0.]), <tf.Tensor 'default_policy/seq_lens:0' shape=(?,) dtype=int32>: array([1.]), <tf.Tensor 'default_policy/is_training:0' shape=() dtype=bool>: False, <tf.Tensor 'default_policy/is_exploring:0' shape=() dtype=bool>: True, <tf.Tensor 'default_policy/timestep:0' shape=() dtype=int64>: 40000}
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/ray/rllib/utils/tf_run_builder.py", line 47, in get
    os.environ.get("TF_TIMELINE_DIR"),
  File "/usr/local/lib/python3.7/dist-packages/ray/rllib/utils/tf_run_builder.py", line 102, in run_timeline
    fetches = sess.run(ops, feed_dict=feed_dict)
  File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/client/session.py", line 968, in run
    run_metadata_ptr)
  File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/client/session.py", line 1165, in _run
    f'Cannot feed value of shape {str(np_val.shape)} for Tensor '
ValueError: Cannot feed value of shape (1,) for Tensor default_policy/prev_actions:0, which has shape (?, 1)
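From the last line it looks like prev_actions arrives as a flat (1,) array while the placeholder expects a batch of action vectors of shape (?, 1). As a guess (I am not sure this is the intended usage), a workaround I would try is to give the initial prev_action the action-space shape instead of a plain 0:

# Hypothetical workaround (just a guess): make the initial prev_a match the
# Box(shape=(1,)) action space, so it has the same rank as the actions that
# compute_single_action() returns on later steps.
prev_a = np.zeros(env.action_space.shape, dtype=np.float32)
prev_r = 0.0
action, state, _ = trainer.compute_single_action(
    obs, state, prev_action=prev_a, prev_reward=prev_r
)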