How severely does this issue affect your experience of using Ray?
- High: It blocks me from completing my task.
Hi all,
I am using RLlib with Tune + PPO to train an agent, but it seems that as epochs complete, GPU RAM is not freed and usage slowly builds up.
I have recorded this here: Ray GPU Memory Leak? - YouTube
Relevant timestamps:
04:19 - First epoch + GPU RAM spike
07:12 - Second epoch + GPU RAM spike
10:25 - Third epoch + NO spike
13:41 - Fourth epoch + GPU RAM spike
17:26 - Fifth epoch + GPU RAM spike
19:54 - Sixth epoch + GPU RAM spike
For the next few epochs nothing happens, so I stop recording; around the ~12th epoch it spikes again and there is a lot of memory thrashing as the epoch fails to complete.
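In case it helps pin down when the memory climbs, here is a rough sketch of how I could log per-iteration GPU memory from the trainer. This is not part of my actual run; it assumes Ray 2.x's DefaultCallbacks API and that the learner runs with torch + CUDA:

import torch
from ray.rllib.algorithms.callbacks import DefaultCallbacks

class GpuMemoryLogger(DefaultCallbacks):
    # Called once per training iteration on the Algorithm (local worker).
    def on_train_result(self, *, algorithm, result, **kwargs):
        if torch.cuda.is_available():
            # Bytes currently held by torch tensors, and the peak since startup.
            result["gpu_mem_allocated"] = torch.cuda.memory_allocated()
            result["gpu_mem_peak"] = torch.cuda.max_memory_allocated()

# Hypothetical hook-up: ppo_config.callbacks(GpuMemoryLogger)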
Sample server code:
import ray
from ray.rllib.env import PolicyServerInput
from ray.rllib.algorithms.ppo import PPOConfig
import numpy as np
import argparse
from gymnasium.spaces import MultiDiscrete, Box
ray.init(object_store_memory=40 * (10 ** 9), num_cpus=6, num_gpus=1, log_to_driver=False)
ppo_config = PPOConfig()
def _input(ioctx):
    # We are remote worker, or we are local worker with num_workers=0:
    # Create a PolicyServerInput.
    if ioctx.worker_index > 0 or ioctx.worker.num_workers == 0:
        return PolicyServerInput(
            ioctx,
            'localhost',
            55556 + ioctx.worker_index - (1 if ioctx.worker_index > 0 else 0),
        )
    # No InputReader (PolicyServerInput) needed.
    else:
        return None
x = 320
y = 240
ppo_config.clip_param = 0.175
ppo_config.gamma = 0.996 # default 0.99 -> how far into the future to care for rewards
ppo_config.lambda_ = 0.99
ppo_config.kl_target = 0.01 # default 0.01
ppo_config.rollout_fragment_length = 64
ppo_config.train_batch_size = 4500
ppo_config.sgd_minibatch_size = 512
ppo_config.num_sgd_iter = 1
ppo_config.lr = 9e-5
ppo_config.model = {
    # Share layers for value function. If you set this to True, it's
    # important to tune vf_loss_coeff.
    "vf_share_layers": True,
    'use_attention': True,
    "max_seq_len": 50,
    "attention_num_transformer_units": 1,
    "attention_dim": 256,
    "attention_memory_inference": 50,
    "attention_memory_training": 50,
    "attention_num_heads": 8,
    "attention_head_dim": 32,
    "attention_position_wise_mlp_dim": 128,
    "attention_init_gru_gate_bias": 2.0,
    "conv_filters": [
        [32, [12, 16], [7, 9]],
        [128, [6, 6], 4],
        [256, [9, 9], 1]
    ],
    "conv_activation": "relu"
}
ppo_config.batch_mode = "complete_episodes"
ppo_config.simple_optimizer = False
ppo_config.env = None
ppo_config.observation_space = Box(low=0, high=1, shape=(y, x, 1), dtype=np.float32)
ppo_config.action_space = MultiDiscrete(
    [
        2,  # W
        2,  # A
        2,  # S
        2,  # D
        2,  # Space
        2,  # H
        2,  # J
        2,  # K
        2   # L
    ]
)
ppo_config.env_config = {
    "sleep": True,
    'replayOn': False
}
ppo_config.rollouts(num_rollout_workers=2, enable_connectors=False)
ppo_config.offline_data(input_=_input)
ppo_config.framework_str = 'torch'
ppo_config.log_sys_usage = False
ppo_config.compress_observations = True
ppo_config.shuffle_sequences = False
ppo_config.num_gpus = 0.5
ppo_config.num_cpus_for_local_worker = 4
ppo_config.num_cpus_per_worker = 1
tempyy = ppo_config.to_dict()
print(tempyy)
from ray import tune
name = "Checkpoint1"
print(f"Starting: {name}")
tune.run("PPO",
resume='AUTO',
config=tempyy,
name=name,
keep_checkpoints_num=20, checkpoint_score_attr="episode_reward_mean", mode='max',
checkpoint_freq=1,
metric="episode_reward_mean",
max_failures=10,
# resume=True,
# restore="C:\\Users\\denys\\ray_results\\lstmV1_jump0005_batch15360_minibatch_1024_lr9e-5_sgd5_prevAction-False\\PPO_None_3866f_00000_0_2023-04-17_08-48-58\\checkpoint_000055",
checkpoint_at_end=True)
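For completeness, the external environment connects to the PolicyServerInput ports above roughly like this. It's a simplified sketch, not my real client: the observation/reward logic actually comes from my game-capture loop, and the values below are placeholders.

import numpy as np
from ray.rllib.env.policy_client import PolicyClient

# One client per rollout worker port (55556, 55557, ...).
client = PolicyClient("http://localhost:55556", inference_mode="remote")

episode_id = client.start_episode(training_enabled=True)
obs = np.zeros((240, 320, 1), dtype=np.float32)  # placeholder frame matching the Box space
for _ in range(1000):
    action = client.get_action(episode_id, obs)
    # ... apply `action` to the game, grab the next frame, compute the reward ...
    client.log_returns(episode_id, reward=0.0)
client.end_episode(episode_id, observation=obs)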
Please let me know if there is something I'm configuring wrong or misunderstanding, or whether this is a genuine bug!