How severely does this issue affect your experience of using Ray?
- Medium: It causes significant difficulty in completing my task, but I can work around it.
Hi all,
I am experimenting with an agent that uses the LSTM and attention-net wrappers. However, I want to compare the results against a simpler frame-stacking approach.
I have read through https://github.com/ray-project/ray/blob/8e680c483ce326cefc62e44f68ab1a6948b1c3d2/doc/source/rllib/rllib-sample-collection.rst
and
https://docs.ray.io/en/latest/rllib/rllib-sample-collection.html#trajectory-view-api
but I still don't quite understand how to tell the PPO config to simply stack the last x observations.
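From the frame-stacking example referenced in those docs, my best guess so far is that I need a custom model that adds an extra ViewRequirement asking for the last N observations, roughly like the sketch below. This is untested; num_frames, the layer sizes, and the "prev_n_obs" key are just my placeholders, and a real model for my image observations would use conv layers instead of the flat MLP:

import numpy as np
from torch import nn
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.rllib.policy.view_requirement import ViewRequirement

class FrameStackingModel(TorchModelV2, nn.Module):
    def __init__(self, obs_space, action_space, num_outputs,
                 model_config, name, num_frames=4):
        TorchModelV2.__init__(self, obs_space, action_space, num_outputs,
                              model_config, name)
        nn.Module.__init__(self)
        self.num_frames = num_frames
        # Ask the sampler for the last `num_frames` observations (incl. the current one).
        self.view_requirements["prev_n_obs"] = ViewRequirement(
            data_col="obs",
            shift="-{}:0".format(num_frames - 1),
            space=obs_space,
        )
        # Placeholder head: for a (240, 320, 1) image a conv stack would make
        # more sense; this flat MLP only shows where the stacked frames arrive.
        obs_size = int(np.prod(obs_space.shape))
        self.hidden = nn.Sequential(
            nn.Linear(obs_size * num_frames, 256),
            nn.ReLU(),
        )
        self.logits = nn.Linear(256, num_outputs)
        self.value_head = nn.Linear(256, 1)
        self._features = None

    def forward(self, input_dict, state, seq_lens):
        # input_dict["prev_n_obs"] has shape [batch, num_frames, *obs_shape].
        stacked = input_dict["prev_n_obs"].float()
        stacked = stacked.reshape(stacked.shape[0], -1)
        self._features = self.hidden(stacked)
        return self.logits(self._features), state

    def value_function(self):
        return self.value_head(self._features).squeeze(1)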
My policy_server:
import ray
from ray.rllib.env import PolicyServerInput
from ray.rllib.algorithms.ppo import PPOConfig
import numpy as np
import argparse
from gymnasium.spaces import MultiDiscrete, Box
ppo_config = PPOConfig()
parser = argparse.ArgumentParser(description='Optional app description')
parser.add_argument('-ip', type=str, help='IP of this device')
parser.add_argument('-checkpoint', type=str, help='location of checkpoint to restore from')
args = parser.parse_args()
def _input(ioctx):
    return PolicyServerInput(
        ioctx,
        args.ip,
        55556,
    )
x = 320
y = 240
# kl_coeff -> default 0.2
# vf_loss_coeff used to be 0.01??
# "entropy_coeff": 0.00005,
# "clip_param": 0.1,
ppo_config.gamma = 0.998 # default 0.99
ppo_config.lambda_ = 0.99 # default 1.0???
ppo_config.kl_target = 0.01 # default 0.01
ppo_config.rollout_fragment_length = 512
ppo_config.train_batch_size = 10240
ppo_config.sgd_minibatch_size = 256
ppo_config.num_sgd_iter = 2 # default 30???
ppo_config.lr = 3.5e-5 # 5e-5
ppo_config.model = {
    # Share layers for value function. If you set this to True, it's
    # important to tune vf_loss_coeff.
    "vf_share_layers": False,
    # "use_lstm": True,
    # "max_seq_len": 32,
    # "lstm_cell_size": 128,
    # "lstm_use_prev_action": True,
    "use_attention": True,
    "max_seq_len": 64,
    "attention_num_transformer_units": 1,
    "attention_dim": 256,
    "attention_memory_inference": 128,
    "attention_memory_training": 128,
    "attention_num_heads": 8,
    "attention_head_dim": 32,
    "attention_position_wise_mlp_dim": 128,
    "attention_use_n_prev_actions": 0,
    "attention_use_n_prev_rewards": 0,
    "attention_init_gru_gate_bias": 2.0,
    "conv_filters": [],
    # "conv_activation": "relu",
    # "post_fcnet_hiddens": [512],
    # "post_fcnet_activation": "relu",
}
ppo_config.batch_mode = "complete_episodes"
ppo_config.simple_optimizer = True
ppo_config.num_gpus = 1
ppo_config.rollouts(num_rollout_workers=0, enable_connectors=False)
ppo_config.offline_data(input_=_input)
ppo_config.env = None
ppo_config.observation_space = Box(low=0, high=1, shape=(y, x, 1), dtype=np.float32)
ppo_config.action_space = MultiDiscrete(
    [
        2,  # W
        2,  # A
        2,  # S
        2,  # D
        2,  # Space
        2,  # H
        2,  # J
        2,  # K
        2,  # L
    ]
)
ppo_config.env_config = {
    "sleep": True,
    "replayOn": False,
}
ppo_config.framework_str = 'torch'
ppo_config.log_sys_usage = False
ppo_config.compress_observations = True
ppo_config.shuffle_sequences = False
ray.init(num_cpus=4, num_gpus=1, log_to_driver=False)
from ray import tune
name = "" + args.checkpoint
print(f"Starting: {name}")
tune.run("PPO",
resume='AUTO',
config=ppo_config.to_dict(),
name=name, keep_checkpoints_num=None, checkpoint_score_attr="episode_reward_mean",
max_failures=1,
checkpoint_freq=5, checkpoint_at_end=True)
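If that is the right direction, I assume I would then register the custom model and swap out the attention settings in the config above for something like this (again just a guess; "frame_stack_model" is a name I made up):

from ray.rllib.models import ModelCatalog

# Register the custom model under a chosen name, then point the model
# config at it instead of use_attention.
ModelCatalog.register_custom_model("frame_stack_model", FrameStackingModel)

ppo_config.model = {
    "vf_share_layers": False,
    "custom_model": "frame_stack_model",
    # Passed as a kwarg to FrameStackingModel.__init__.
    "custom_model_config": {"num_frames": 4},
}

Or is there a built-in config option that does the stacking for me that I'm missing?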
Thanks!