How severely does this issue affect your experience of using Ray?
- High: It blocks me from completing my task.
I upgraded my development system around the time Ray 2.0 came out, and now that I'm getting back into my workflow and using the new hardware, I'm noticing that my runs use more memory than before. Just following the fractional_gpu example on GitHub (code mostly unchanged, pasted below), my system completely maxes out RAM and VRAM. It still runs in this case, but my own application quickly hits OOM errors. The training process just shouldn't need this many resources.
from ray import air, tune
from ray.tune.registry import register_env
from ray.rllib.env.wrappers.pettingzoo_env import PettingZooEnv
from pettingzoo.mpe import simple_spread_v2

# Based on code from github.com/parametersharingmadrl/parametersharingmadrl

if __name__ == "__main__":
    register_env("simple_spread", lambda _: PettingZooEnv(simple_spread_v2.env()))

    tune.Tuner(
        "PPO",
        run_config=air.RunConfig(
            stop={"episodes_total": 60000},
            checkpoint_config=air.CheckpointConfig(
                checkpoint_frequency=10,
            ),
        ),
        param_space={
            # Environment specific.
            "env": "simple_spread",
            # General: give the trainer a tiny GPU slice and split the rest
            # across the 20 rollout workers.
            "framework": "torch",
            "num_gpus": 0.001,
            "num_workers": 20,
            "num_gpus_per_worker": (1 - 0.001) / 21,
            "num_envs_per_worker": 1,
            "compress_observations": True,
            # Algorithm specific.
            "lambda": 0.99,
            "train_batch_size": 512,
            "sgd_minibatch_size": 32,
            "num_sgd_iter": 5,
            "batch_mode": "truncate_episodes",
            "entropy_coeff": 0.01,
            "lr": 2e-5,
            # Multi-agent: every agent maps to one shared policy.
            "multiagent": {
                "policies": {"shared_policy"},
                "policy_mapping_fn": (
                    lambda agent_id, episode, **kwargs: "shared_policy"
                ),
            },
        },
    ).fit()
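
For reference, here is a minimal sketch of the arithmetic behind the GPU fractions above (the variable names are mine, just for illustration, not part of the config): with 20 workers the total request stays just under one GPU, so the trainer and all rollout workers get packed onto a single device.

```python
# Illustrative arithmetic only: how the fractional GPU request in the config adds up.
# (Variable names are hypothetical and not part of the RLlib config itself.)
num_workers = 20
num_gpus_trainer = 0.001                # "num_gpus" for the trainer process
num_gpus_per_worker = (1 - 0.001) / 21  # ~0.0476 GPU per rollout worker

total_request = num_gpus_trainer + num_workers * num_gpus_per_worker
print(f"total GPU request: {total_request:.4f}")  # ~0.9524, i.e. everything on one GPU
```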