I want to use an LLM while running PPO training for MARL. Is there any problem with the definition below? There is no error message and nothing actually runs; the job just stays pending.
Thank you so much for your time.
import ray

ray.shutdown()
if not ray.is_initialized():
    print('init ray')
    ray.init(
        ignore_reinit_error=True,
        num_cpus=N_CPUS,
        num_gpus=8,
        object_store_memory=2_000_000_000,  # 2 GB
        logging_level="INFO",
    )
import asyncio

from vllm import LLM


@ray.remote(num_gpus=1, max_concurrency=10)
class LLMActor:
    def __init__(self):
        self.llm = None

    def initialize(self):
        model_name = "Qwen/Qwen2.5-7B-Instruct-1M"
        self.llm = LLM(
            model=model_name,
            dtype="float16",
            max_model_len=4096,
            gpu_memory_utilization=0.7,
            enforce_eager=True,  # tensor_parallel_size=2,
        )

    async def generate_text(self, prompt, sampling_params):
        # Run the blocking vLLM call in a thread so the async actor stays responsive.
        loop = asyncio.get_event_loop()
        outputs = await loop.run_in_executor(None, self.llm.generate, [prompt], sampling_params)
        return outputs[0].outputs[0].text


llm_actor = LLMActor.remote()
ray.get(llm_actor.initialize.remote())
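For context, the actor would later be queried roughly like this; the prompt and the SamplingParams values below are placeholders, not taken from the original code:

from vllm import SamplingParams

# Hypothetical call site: block on a single completion from the remote actor.
sampling_params = SamplingParams(temperature=0.7, max_tokens=256)
text = ray.get(llm_actor.generate_text.remote("Describe the next task.", sampling_params))
print(text)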
from ray import air, tune

tuner = tune.Tuner(
    "PPO",
    param_space=config.to_dict(),
    run_config=air.RunConfig(
        stop={"training_iteration": 50},
        verbose=1,
        log_to_file=True,
    ),
)
results = tuner.fit()
ray.shutdown()
mannyv (March 11, 2025, 3:15pm):
@sunmengyao321,
You did not provide your PPO config. Based on what you did share, it is not really possible to tell what the issue is.
sunmengyao321:
Thank you so much for your reply. I have set up my PPO config in the following way:
training_args = dict(
    lr=0.00003,
    gamma=0.99997,
    train_batch_size=200 * N_CPUS,
    num_sgd_iter=10,
    lambda_=0.95,
    use_kl_loss=False,
    clip_param=0.1,
    grad_clip=0.5,
    mini_batch_size_per_learner=30,  # 100
)
config = (
    PPOConfig()
    .framework(args.framework)
    .environment(  # .environment(TwoStepGame)
        "ConstellationTasking-RLlib",
        env_config=env_args,
    )
    .env_runners(
        num_env_runners=N_CPUS - 1,
        sample_timeout_s=1000.0,
    )  # .env_runners(batch_mode="complete_episodes", num_env_runners=0)  # .training(model={"custom_model": "cc_model"})
    .multi_agent(
        policies={
            "default": PolicySpec(
                policy_class=PPOTorchPolicy,
                observation_space=None,  # Box(-1e+16, 1e+16, (11,), float32),
                action_space=None,  # Discrete(4),
                config={},
            ),
        },
        policy_mapping_fn=lambda *args, **kwargs: "default",
    )
    .rl_module(
        model_config_dict={
            "use_lstm": False,  # Use a simpler FCNet when we also have an LSTM.
            "fcnet_hiddens": [2048, 2048],
            "vf_share_layers": False,
        }
    )
)
config.callbacks(WrappedEpisodeDataCallbacks)
config.env_runners(
    module_to_env_connector=lambda env: (discounting.ContinuePreviousAction(),)
)
config.training(
    learner_connector=lambda obs_space, act_space: (
        discounting.MakeAddedStepActionValid(expected_train_batch_size=config.train_batch_size),
        discounting.CondenseMultiStepActions(),
    ),
)
config.training(learner_class=discounting.TimeDiscountedGAEPPOTorchLearner)
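Note that training_args is defined above but never referenced in the config as shown; presumably it is unpacked into the PPO training settings somewhere along the lines of the following (an assumption, that line is not in the post):

# Assumed wiring (not shown in the post): apply the hyperparameters from
# training_args to the PPOConfig as keyword arguments.
config = config.training(**training_args)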
mannyv (March 12, 2025, 12:00pm):
The CPU budget implied by these settings is your most likely issue. You will need 1 CPU for the RLlib driver, N_CPUS - 1 CPUs for the env runners, and 1 CPU for the llm_actor, which adds up to more than the N_CPUS you give ray.init(), so something waits forever for resources.
You have two choices here: either reduce the number of env runners by 1, or request no CPU for the llm_actor:

@ray.remote(num_cpus=0, num_gpus=1, max_concurrency=10)
class LLMActor:
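In code, the first option amounts to something like the following sketch (only the env-runner count changes relative to the config above; the second option is the decorator already shown):

# Option 1 (sketch): run one fewer env runner so the driver (1 CPU), the env
# runners (N_CPUS - 2 CPUs), and llm_actor's default 1-CPU request all fit
# into the num_cpus=N_CPUS given to ray.init().
config.env_runners(
    num_env_runners=N_CPUS - 2,
    sample_timeout_s=1000.0,
)

Either way, checking ray.available_resources() right before tuner.fit() should show at least one unreserved CPU; if it does not, the trial will go back to pending.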