I am running into two issues:
- The evaluation values in the results seem to be zero/null.
- Evaluation metrics such as episode_len_mean and episode_reward_mean are missing entirely.
I am using Ray 2.0.0, and here is my PPO config:
my_ppo_config = PPOConfig()\
    .python_environment()\
    .resources(
        num_gpus=1,
        num_cpus_per_worker=1,
        num_gpus_per_worker=0,
    )\
    .framework(
        framework='tf',
        eager_tracing=False,
    )\
    .environment(
        env='pheromone_env',
        env_config=env_config,
        observation_space=None,
        action_space=None,
        clip_rewards=None,
        normalize_actions=False,  # default is True
        clip_actions=False,
        disable_env_checking=True,
    )\
    .rollouts(
        num_rollout_workers=1,
        num_envs_per_worker=1,
        # sample_collector=SimpleListCollector,
        # create_env_on_local_worker=False,
        # sample_async=False,
        rollout_fragment_length=400,
        batch_mode='complete_episodes',
        # horizon=None,
        # soft_horizon=False,
        # no_done_at_end=False,
        observation_filter='NoFilter',
    )\
    .training(
        gamma=0.99,
        lr=5e-05,
        train_batch_size=4000,
        model=model,
        lr_schedule=None,
        use_critic=True,
        use_gae=True,
        lambda_=1.0,
        kl_coeff=0.2,
        sgd_minibatch_size=128,
        num_sgd_iter=30,
        shuffle_sequences=True,
        vf_loss_coeff=1.0,
        entropy_coeff=0.0,
        entropy_coeff_schedule=None,
        clip_param=0.3,
        vf_clip_param=10,
        grad_clip=None,
        kl_target=0.01,
    )\
    .callbacks(MyCallbacks)\
    .exploration(
        explore=True,
        exploration_config={'type': 'StochasticSampling'},
    )\
    .multi_agent(
        policies=policies,
        policy_map_capacity=100,
        policy_map_cache=None,
        policy_mapping_fn=lambda agent_id: 'my_ppo',
        policies_to_train=['my_ppo'],
        observation_fn=None,
        replay_mode='independent',
        count_steps_by='env_steps',
    )\
    .offline_data(
        # postprocess_inputs=False,
    )\
    .evaluation(
        evaluation_interval=10,
        evaluation_duration=10,
        evaluation_duration_unit='episodes',
        # evaluation_sample_timeout_s=180.0,
        evaluation_parallel_to_training=False,
        evaluation_config={
            'explore': False,
            'exploration_config': {'type': 'StochasticSampling'},
        },
        evaluation_num_workers=1,
        # custom_evaluation_function=None,
        always_attach_evaluation_results=True,
        # in_evaluation=False,
        # sync_filters_on_rollout_workers_timeout_s=60.0,
    )\
    .reporting(
        keep_per_episode_custom_metrics=True,  # default is False
        metrics_episode_collection_timeout_s=60.0,
        metrics_num_episodes_for_smoothing=100,
        min_time_s_per_iteration=None,
        min_train_timesteps_per_iteration=0,
        min_sample_timesteps_per_iteration=0,
    )\
    .debugging(
        log_level='WARN',
        seed=42,
    )
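
For reference, here is roughly how the objects referenced in the config (the env registration, env_config, model, policies, MyCallbacks) are set up and how I run training and look at the results. This is a simplified sketch of my setup, not the exact script, and names like PheromoneEnv are placeholders for my actual code:

from ray import tune
from ray.rllib.algorithms.callbacks import DefaultCallbacks
from ray.rllib.policy.policy import PolicySpec

# Placeholder stand-ins for the objects referenced in the config above.
tune.register_env('pheromone_env', lambda cfg: PheromoneEnv(cfg))  # my custom multi-agent env
env_config = {}                                  # actual env settings omitted here
model = {'fcnet_hiddens': [256, 256]}            # simplified model dict
policies = {'my_ppo': PolicySpec()}              # single shared policy, spaces inferred from the env

class MyCallbacks(DefaultCallbacks):
    pass  # my custom metric callbacks live here

# Build the algorithm from the config above and watch the reported metrics.
my_ppo = my_ppo_config.build()
for _ in range(100):
    result = my_ppo.train()
    # With evaluation_interval=10 and always_attach_evaluation_results=True,
    # I expect result['evaluation'] to contain episode_reward_mean / episode_len_mean,
    # but these come back zero or are missing.
    print(result.get('episode_reward_mean'), result.get('evaluation', {}))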
Is there some issue with the configuration that could lead to this behaviour? Please guide me. Thank you.