PopulationBasedTraining Verbosity assignment not followed & no forward progress

How severe does this issue affect your experience of using Ray?

  • Low: It annoys or frustrates me for a moment.

I am having an issue where the verbose=1 setting is not being followed. Also, there doesn't seem to be any forward progress in the iterations or episode timesteps, and the command prompt keeps re-printing output for the same iteration/timestep count (I am seeing this with regular Ray Tune training as well).
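
For reference, my understanding of the Tune verbosity levels is roughly: 0 = silent, 1 = status updates only, 2 = status plus brief per-trial results, 3 = detailed results. A minimal sketch of how I am passing it (same call as in the full script below):

from ray import train

# verbose=1 should, as I understand it, limit output to periodic status
# updates, without the full per-trial parameter dumps shown in the output
# further down.
run_config = train.RunConfig(
    stop={"training_iteration": 250, "episode_reward_mean": 0},
    verbose=1,
)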

Training Script

import argparse
import random

from ray import train, tune
from ray.tune import register_env
from ray.tune.schedulers import PopulationBasedTraining
from ray.rllib.algorithms.sac import SACConfig

from RemusRayEnvSimple import CustomMultiAgentEnv


if __name__ == "__main__":
    register_env("REMUS RAY", lambda config: CustomMultiAgentEnv(config))
    config = (
        SACConfig()
        .framework("torch")
        .debugging(seed=0, log_level="ERROR")
        .rollouts(
            #batch_mode="truncate_episodes",
            #rollout_fragment_length=256
        )
        .training(
            train_batch_size=tune.choice([10000, 20000, 40000]),
            lr=1e-4,
            #replay_buffer_config={"type": "MultiAgentPrioritizedReplayBuffer"},
        )
        .environment(env=CustomMultiAgentEnv)
        .multi_agent(
            policies=["policy0", "policy1"],
            policy_mapping_fn=lambda agent_id, episode, worker, **kwargs: "policy0" if agent_id == "agent_0" else "policy1",
            policies_to_train=["policy0", "policy1"],
            count_steps_by="env_steps"
        )
        #.reporting(min_train_timesteps_per_iteration=256*10)
    )

    # Modify the actor and critic network configurations directly
    config.twin_q = True
    config.q_model_config["fcnet_hiddens"] = [512, 512]
    config.q_model_config["fcnet_activation"] = "relu"
    config.policy_model_config["fcnet_hiddens"] = [1024, 1024]
    config.policy_model_config["fcnet_activation"] = "relu"

    config.replay_buffer_config["_enable_replay_buffer_api"] = True
    config.replay_buffer_config["type"] = "MultiAgentReplayBuffer"
    config.replay_buffer_config["capacity"] = 5_000_000

    config.num_sgd_iter = tune.choice([10, 20, 30])
    config.sgd_minibatch_size = tune.choice([128, 512, 2048])
    config.clip_param = 0.2
    config.lambda_ = 0.95
    config.kl_coeff = 1.0

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--smoke-test", action="store_true", help="Finish quickly for testing"
    )
    args, _ = parser.parse_known_args()

    # Postprocess the perturbed config to ensure it's still valid
    def explore(config):
        # ensure we collect enough timesteps to do sgd
        if config["train_batch_size"] < config["sgd_minibatch_size"] * 2:
            config["train_batch_size"] = config["sgd_minibatch_size"] * 2
        # ensure we run at least one sgd iter
        if config["num_sgd_iter"] < 1:
            config["num_sgd_iter"] = 1
        return config
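    # Illustrative (hypothetical values): explore({"train_batch_size": 200,
    # "sgd_minibatch_size": 512, "num_sgd_iter": 0, ...}) would return the
    # config with train_batch_size bumped to 1024 (= 512 * 2) and
    # num_sgd_iter bumped to 1, everything else untouched.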

    hyperparam_mutations = {
        "lambda": lambda: random.uniform(0.9, 1.0),
        "clip_param": lambda: random.uniform(0.01, 0.5),
        "lr": [1e-3, 5e-4, 1e-4, 5e-5, 1e-5],
        "num_sgd_iter": lambda: random.randint(1, 30),
        "sgd_minibatch_size": lambda: random.randint(128, 16384),
        "train_batch_size": lambda: random.randint(128, 160000),
    }
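    # As I understand the PBT docs: with resample_probability=0.25 each mutated
    # hyperparameter is resampled from its list/distribution 25% of the time;
    # otherwise continuous values are perturbed by a factor of 1.2 or 0.8 and
    # list-based values are shifted to a neighboring choice.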

    pbt = PopulationBasedTraining(
        time_attr="time_total_s",
        perturbation_interval=120,
        resample_probability=0.25,
        # Specifies the mutations of these hyperparams
        hyperparam_mutations=hyperparam_mutations,
        custom_explore_fn=explore,
    )
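    # Note: perturbation_interval is in units of time_attr, so with
    # time_attr="time_total_s" a trial is only considered for exploit/explore
    # roughly every 120 seconds of accumulated training time.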

    
    # Stop when we've either reached 250 training iterations or episode_reward_mean=0
    stopping_criteria = {"training_iteration": 250, "episode_reward_mean": 0}

    tuner = tune.Tuner(
        "SAC",
        tune_config=tune.TuneConfig(
            metric="episode_reward_mean",
            mode="max",
            scheduler=pbt,
            num_samples=1 if args.smoke_test else 2,
        ),
        param_space=config,
        run_config=train.RunConfig(stop=stopping_criteria, verbose=1),
    )
    results = tuner.fit()

import pprint

best_result = results.get_best_result()

print("Best performing trial's final set of hyperparameters:\n")
pprint.pprint(
    {k: v for k, v in best_result.config.items() if k in hyperparam_mutations}
)

print("\nBest performing trial's final reported metrics:\n")

metrics_to_print = [
    "episode_reward_mean",
    "episode_reward_max",
    "episode_reward_min",
    "episode_len_mean",
]
pprint.pprint({k: v for k, v in best_result.metrics.items() if k in metrics_to_print})
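
To make it easier to see whether the trials are actually advancing (rather than re-reporting the same iteration), a small addition that should work with the Result API (a sketch; the column names are the standard RLlib/Tune keys and are not verified on this setup):

# Sketch: print the per-iteration history of the best trial so that
# training_iteration / timesteps_total progress is visible at a glance.
history = best_result.metrics_dataframe  # full progress history, if logged
if history is not None:
    print(history[["training_iteration", "timesteps_total", "episode_reward_mean"]].tail())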

Output

Trial status: 2 RUNNING
Current time: 2024-04-24 20:31:27. Total running time: 30s
Logical resource usage: 2.0/12 CPUs, 0/1 GPUs (0.0/1.0 accelerator_type:RTX)
Current best trial: 151d4_00000 with episode_reward_mean=-3429.7659050742477 and params={'extra_python_environs_for_driver': {}, 'extra_python_environs_for_worker': {}, 'num_gpus': 0, 'num_cpus_per_worker': 1, 'num_gpus_per_worker': 0, '_fake_gpus': False, 'num_learner_workers': 0, 'num_gpus_per_learner_worker': 0, 'num_cpus_per_learner_worker': 1, 'local_gpu_idx': 0, 'custom_resources_per_worker': {}, 'placement_strategy': 'PACK', 'eager_tracing': True, 'eager_max_retraces': 20, 'tf_session_args': {'intra_op_parallelism_threads': 2, 'inter_op_parallelism_threads': 2, 'gpu_options': {'allow_growth': True}, 'log_device_placement': False, 'device_count': {'CPU': 1}, 'allow_soft_placement': True}, 'local_tf_session_args': {'intra_op_parallelism_threads': 8, 'inter_op_parallelism_threads': 8}, 'torch_compile_learner': False, 'torch_compile_learner_what_to_compile': <TorchCompileWhatToCompile.FORWARD_TRAIN: 'forward_train'>, 'torch_compile_learner_dynamo_backend': 'inductor', 'torch_compile_learner_dynamo_mode': None, 'torch_compile_worker': False, 'torch_compile_worker_dynamo_backend': 'onnxrt', 'torch_compile_worker_dynamo_mode': None, 'env': <class 'RemusRayEnvSimple.CustomMultiAgentEnv'>, 'env_config': {}, 'observation_space': None, 'action_space': None, 'env_task_fn': None, 'render_env': False, 'clip_rewards': None, 'normalize_actions': True, 'clip_actions': False, 'disable_env_checking': False, 'auto_wrap_old_gym_envs': True, 'action_mask_key': 'action_mask', '_is_atari': None, 'env_runner_cls': None, 'num_envs_per_worker': 1, 'enable_connectors': True, '_env_to_module_connector': None, '_module_to_env_connector': None, 'add_default_connectors_to_env_to_module_pipeline': True, 'add_default_connectors_to_module_to_env_pipeline': True, 'episode_lookback_horizon': 1, 'rollout_fragment_length': 'auto', 'batch_mode': 'complete_episodes', 'validate_workers_after_construction': True, 'compress_observations': False, 'sampler_perf_stats_ema_coef': None, 'sample_async': -1, 'remote_worker_envs': False, 'remote_env_batch_wait_ms': 0, 'enable_tf1_exec_eagerly': False, 'sample_collector': <class 'ray.rllib.evaluation.collectors.simple_list_collector.SimpleListCollector'>, 'preprocessor_pref': 'deepmind', 'observation_filter': 'NoFilter', 'update_worker_filter_stats': True, 'use_worker_filter_stats': True, 'gamma': 0.99, 'lr': 0.0001, 'grad_clip': None, 'grad_clip_by': 'global_norm', 'train_batch_size': 10000, 'train_batch_size_per_learner': None, 'model': {'_disable_preprocessor_api': False, '_disable_action_flattening': False, 'fcnet_hiddens': [256, 256], 'fcnet_activation': 'tanh', 'fcnet_weights_initializer': None, 'fcnet_weights_initializer_config': None, 'fcnet_bias_initializer': None, 'fcnet_bias_initializer_config': None, 'conv_filters': None, 'conv_activation': 'relu', 'conv_kernel_initializer': None, 'conv_kernel_initializer_config': None, 'conv_bias_initializer': None, 'conv_bias_initializer_config': None, 'conv_transpose_kernel_initializer': None, 'conv_transpose_kernel_initializer_config': None, 'conv_transpose_bias_initializer': None, 'conv_transpose_bias_initializer_config': None, 'post_fcnet_hiddens': [], 'post_fcnet_activation': 'relu', 'post_fcnet_weights_initializer': None, 'post_fcnet_weights_initializer_config': None, 'post_fcnet_bias_initializer': None, 'post_fcnet_bias_initializer_config': None, 'free_log_std': False, 'no_final_linear': False, 'vf_share_layers': True, 'use_lstm': False, 'max_seq_len': 20, 'lstm_cell_size': 256, 'lstm_use_prev_action': False, 
'lstm_use_prev_reward': False, 'lstm_weights_initializer': None, 'lstm_weights_initializer_config': None, 'lstm_bias_initializer': None, 'lstm_bias_initializer_config': None, '_time_major': False, 'use_attention': False, 'attention_num_transformer_units': 1, 'attention_dim': 64, 'attention_num_heads': 1, 'attention_head_dim': 32, 'attention_memory_inference': 50, 'attention_memory_training': 50, 'attention_position_wise_mlp_dim': 32, 'attention_init_gru_gate_bias': 2.0, 'attention_use_n_prev_actions': 0, 'attention_use_n_prev_rewards': 0, 'framestack': True, 'dim': 84, 'grayscale': False, 'zero_mean': True, 'custom_model': None, 'custom_model_config': {}, 'custom_action_dist': None, 'custom_preprocessor': None, 'encoder_latent_dim': None, 'always_check_shapes': False, 'lstm_use_prev_action_reward': -1, '_use_default_native_models': -1}, '_learner_connector': None, 'add_default_connectors_to_learner_pipeline': True, 'optimizer': {}, 'max_requests_in_flight_per_sampler_worker': 2, '_learner_class': None, 'explore': True, 'exploration_config': {'type': 'StochasticSampling'}, 'algorithm_config_overrides_per_module': {}, '_per_module_overrides': {}, 'count_steps_by': 'env_steps', 'policies': ['policy0', 'policy1'], 'policy_map_capacity': 100, 'policy_mapping_fn': <function <lambda> at 0x000002828D0E3880>, 'policies_to_train': ['policy0', 'policy1'], 'policy_states_are_swappable': False, 'observation_fn': None, 'input_config': {}, 'actions_in_input_normalized': False, 'postprocess_inputs': False, 'shuffle_buffer_size': 0, 'output': None, 'output_config': {}, 'output_compress_columns': ['obs', 'new_obs'], 'output_max_file_size': 67108864, 'offline_sampling': False, 'evaluation_interval': None, 'evaluation_duration': 10, 'evaluation_duration_unit': 'episodes', 'evaluation_sample_timeout_s': 180.0, 'evaluation_parallel_to_training': False, 'evaluation_config': None, 'off_policy_estimation_methods': {}, 'ope_split_batch_by_episode': True, 'evaluation_num_workers': 0, 'custom_async_evaluation_function': None, 'always_attach_evaluation_results': False, 'enable_async_evaluation': False, 'in_evaluation': False, 'sync_filters_on_rollout_workers_timeout_s': 60.0, 'keep_per_episode_custom_metrics': False, 'metrics_episode_collection_timeout_s': 60.0, 'metrics_num_episodes_for_smoothing': 100, 'min_time_s_per_iteration': 1, 'min_train_timesteps_per_iteration': 0, 'min_sample_timesteps_per_iteration': 100, 'export_native_model_files': False, 'checkpoint_trainable_policies_only': False, 'logger_creator': None, 'logger_config': None, 'log_level': 'ERROR', 'log_sys_usage': True, 'fake_sampler': False, 'seed': 0, 'ignore_worker_failures': False, 'recreate_failed_workers': False, 'max_num_worker_restarts': 1000, 'delay_between_worker_restarts_s': 60.0, 'restart_failed_sub_environments': False, 'num_consecutive_worker_failures_tolerance': 100, 'worker_health_probe_timeout_s': 60, 'worker_restore_timeout_s': 1800, '_rl_module_spec': None, '_AlgorithmConfig__prior_exploration_config': None, '_enable_new_api_stack': False, '_tf_policy_handles_more_than_one_loss': False, '_disable_preprocessor_api': False, '_disable_action_flattening': False, '_disable_initialize_loss_from_dummy_batch': False, 'simple_optimizer': False, 'policy_map_cache': -1, 'worker_cls': -1, 'synchronize_filters': -1, 'replay_sequence_length': None, '_disable_execution_plan_api': -1, 'twin_q': True, 'q_model_config': {'fcnet_hiddens': [512, 512], 'fcnet_activation': 'relu', 'post_fcnet_hiddens': [], 'post_fcnet_activation': None, 'custom_model': 
None, 'custom_model_config': {}}, 'policy_model_config': {'fcnet_hiddens': [1024, 1024], 'fcnet_activation': 'relu', 'post_fcnet_hiddens': [], 'post_fcnet_activation': None, 'custom_model': None, 'custom_model_config': {}}, 'tau': 0.005, 'initial_alpha': 1.0, 'target_entropy': 'auto', 'n_step': 1, 'replay_buffer_config': {'_enable_replay_buffer_api': True, 'type': 'MultiAgentReplayBuffer', 'capacity': 5000000, 'prioritized_replay': False, 'prioritized_replay_alpha': 0.6, 'prioritized_replay_beta': 0.4, 'prioritized_replay_eps': 1e-06, 'worker_side_prioritization': False}, 'store_buffer_in_checkpoints': False, 'training_intensity': None, 'optimization': {'actor_learning_rate': 0.0003, 'critic_learning_rate': 0.0003, 'entropy_learning_rate': 0.0003}, 'target_network_update_freq': 0, 'num_steps_sampled_before_learning_starts': 1500, '_deterministic_loss': False, '_use_beta_distribution': False, 'use_state_preprocessor': -1, 'worker_side_prioritization': -1, 'num_sgd_iter': 30, 'sgd_minibatch_size': 2048, 'clip_param': 0.2, 'kl_coeff': 1.0, '__stdout_file__': None, '__stderr_file__': None, 'lambda': 0.95, 'input': 'sampler', 'callbacks': <class 'ray.rllib.algorithms.callbacks.DefaultCallbacks'>, 'create_env_on_driver': False, 'custom_eval_function': None, 'framework': 'torch', 'num_cpus_for_driver': 1, 'num_workers': 0}
╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
│ Trial name                            status       train_batch_size     num_sgd_iter     sgd_minibatch_size     iter     total time (s)     ts     reward     episode_reward_max     episode_reward_min     episode_len_mean     episodes_this_iter │
├─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤
│ SAC_CustomMultiAgentEnv_151d4_00000   RUNNING                 10000               30                   2048        1            8.95022    500   -3429.77               -3429.77               -3429.77                  500                      1 │
│ SAC_CustomMultiAgentEnv_151d4_00001   RUNNING                 40000               30                    512        1            8.73973    500   -3429.77               -3429.77               -3429.77                  500                      1 │
╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
Trial status: 2 RUNNING
Current time: 2024-04-24 20:31:57. Total running time: 1min 0s
Logical resource usage: 2.0/12 CPUs, 0/1 GPUs (0.0/1.0 accelerator_type:RTX)
Current best trial: 151d4_00000 with episode_reward_mean=-3402.975541878805 and params={'extra_python_environs_for_driver': {}, 'extra_python_environs_for_worker': {}, 'num_gpus': 0, 'num_cpus_per_worker': 1, 'num_gpus_per_worker': 0, '_fake_gpus': False, 'num_learner_workers': 0, 'num_gpus_per_learner_worker': 0, 'num_cpus_per_learner_worker': 1, 'local_gpu_idx': 0, 'custom_resources_per_worker': {}, 'placement_strategy': 'PACK', 'eager_tracing': True, 'eager_max_retraces': 20, 'tf_session_args': {'intra_op_parallelism_threads': 2, 'inter_op_parallelism_threads': 2, 'gpu_options': {'allow_growth': True}, 'log_device_placement': False, 'device_count': {'CPU': 1}, 
'allow_soft_placement': True}, 'local_tf_session_args': {'intra_op_parallelism_threads': 8, 'inter_op_parallelism_threads': 8}, 'torch_compile_learner': False, 'torch_compile_learner_what_to_compile': <TorchCompileWhatToCompile.FORWARD_TRAIN: 'forward_train'>, 'torch_compile_learner_dynamo_backend': 'inductor', 'torch_compile_learner_dynamo_mode': None, 'torch_compile_worker': False, 'torch_compile_worker_dynamo_backend': 'onnxrt', 'torch_compile_worker_dynamo_mode': None, 'env': <class 'RemusRayEnvSimple.CustomMultiAgentEnv'>, 'env_config': {}, 'observation_space': None, 'action_space': None, 'env_task_fn': None, 'render_env': False, 'clip_rewards': None, 'normalize_actions': True, 'clip_actions': False, 'disable_env_checking': False, 'auto_wrap_old_gym_envs': True, 'action_mask_key': 'action_mask', '_is_atari': None, 'env_runner_cls': None, 'num_envs_per_worker': 1, 'enable_connectors': True, '_env_to_module_connector': None, '_module_to_env_connector': None, 'add_default_connectors_to_env_to_module_pipeline': True, 'add_default_connectors_to_module_to_env_pipeline': True, 'episode_lookback_horizon': 
1, 'rollout_fragment_length': 'auto', 'batch_mode': 'complete_episodes', 'validate_workers_after_construction': True, 'compress_observations': False, 'sampler_perf_stats_ema_coef': None, 'sample_async': -1, 'remote_worker_envs': False, 'remote_env_batch_wait_ms': 0, 'enable_tf1_exec_eagerly': False, 'sample_collector': <class 'ray.rllib.evaluation.collectors.simple_list_collector.SimpleListCollector'>, 'preprocessor_pref': 'deepmind', 'observation_filter': 'NoFilter', 'update_worker_filter_stats': True, 'use_worker_filter_stats': True, 'gamma': 0.99, 'lr': 0.0001, 'grad_clip': None, 'grad_clip_by': 'global_norm', 'train_batch_size': 10000, 'train_batch_size_per_learner': None, 'model': {'_disable_preprocessor_api': False, '_disable_action_flattening': False, 'fcnet_hiddens': [256, 256], 'fcnet_activation': 'tanh', 'fcnet_weights_initializer': None, 'fcnet_weights_initializer_config': None, 'fcnet_bias_initializer': None, 'fcnet_bias_initializer_config': None, 'conv_filters': None, 'conv_activation': 'relu', 'conv_kernel_initializer': None, 'conv_kernel_initializer_config': None, 'conv_bias_initializer': None, 'conv_bias_initializer_config': None, 'conv_transpose_kernel_initializer': None, 'conv_transpose_kernel_initializer_config': None, 'conv_transpose_bias_initializer': None, 'conv_transpose_bias_initializer_config': None, 'post_fcnet_hiddens': [], 'post_fcnet_activation': 'relu', 'post_fcnet_weights_initializer': None, 'post_fcnet_weights_initializer_config': None, 'post_fcnet_bias_initializer': None, 'post_fcnet_bias_initializer_config': 
None, 'free_log_std': False, 'no_final_linear': False, 'vf_share_layers': True, 'use_lstm': False, 'max_seq_len': 20, 'lstm_cell_size': 256, 'lstm_use_prev_action': False, 'lstm_use_prev_reward': False, 'lstm_weights_initializer': None, 'lstm_weights_initializer_config': None, 'lstm_bias_initializer': None, 'lstm_bias_initializer_config': None, '_time_major': False, 'use_attention': False, 'attention_num_transformer_units': 1, 'attention_dim': 
64, 'attention_num_heads': 1, 'attention_head_dim': 32, 'attention_memory_inference': 50, 'attention_memory_training': 50, 'attention_position_wise_mlp_dim': 32, 'attention_init_gru_gate_bias': 2.0, 'attention_use_n_prev_actions': 0, 'attention_use_n_prev_rewards': 0, 'framestack': True, 'dim': 84, 'grayscale': False, 'zero_mean': True, 'custom_model': None, 'custom_model_config': {}, 'custom_action_dist': None, 'custom_preprocessor': None, 'encoder_latent_dim': None, 'always_check_shapes': False, 'lstm_use_prev_action_reward': -1, '_use_default_native_models': -1}, '_learner_connector': None, 'add_default_connectors_to_learner_pipeline': True, 'optimizer': {}, 'max_requests_in_flight_per_sampler_worker': 2, '_learner_class': None, 'explore': True, 'exploration_config': {'type': 'StochasticSampling'}, 'algorithm_config_overrides_per_module': {}, '_per_module_overrides': {}, 'count_steps_by': 'env_steps', 'policies': ['policy0', 'policy1'], 'policy_map_capacity': 100, 'policy_mapping_fn': <function <lambda> at 0x000002828D0E23B0>, 'policies_to_train': ['policy0', 'policy1'], 'policy_states_are_swappable': False, 'observation_fn': None, 'input_config': {}, 'actions_in_input_normalized': False, 'postprocess_inputs': False, 'shuffle_buffer_size': 0, 'output': None, 'output_config': {}, 'output_compress_columns': ['obs', 'new_obs'], 'output_max_file_size': 67108864, 'offline_sampling': False, 'evaluation_interval': None, 'evaluation_duration': 10, 'evaluation_duration_unit': 'episodes', 'evaluation_sample_timeout_s': 180.0, 'evaluation_parallel_to_training': False, 'evaluation_config': None, 'off_policy_estimation_methods': {}, 'ope_split_batch_by_episode': True, 'evaluation_num_workers': 0, 'custom_async_evaluation_function': None, 'always_attach_evaluation_results': False, 'enable_async_evaluation': False, 'in_evaluation': False, 'sync_filters_on_rollout_workers_timeout_s': 60.0, 'keep_per_episode_custom_metrics': False, 'metrics_episode_collection_timeout_s': 60.0, 'metrics_num_episodes_for_smoothing': 100, 'min_time_s_per_iteration': 1, 'min_train_timesteps_per_iteration': 0, 'min_sample_timesteps_per_iteration': 100, 'export_native_model_files': False, 'checkpoint_trainable_policies_only': False, 'logger_creator': None, 'logger_config': None, 'log_level': 'ERROR', 'log_sys_usage': True, 'fake_sampler': False, 'seed': 0, 'ignore_worker_failures': False, 'recreate_failed_workers': False, 'max_num_worker_restarts': 1000, 'delay_between_worker_restarts_s': 60.0, 'restart_failed_sub_environments': False, 'num_consecutive_worker_failures_tolerance': 100, 'worker_health_probe_timeout_s': 60, 'worker_restore_timeout_s': 1800, '_rl_module_spec': None, '_AlgorithmConfig__prior_exploration_config': None, '_enable_new_api_stack': False, '_tf_policy_handles_more_than_one_loss': False, '_disable_preprocessor_api': False, '_disable_action_flattening': False, '_disable_initialize_loss_from_dummy_batch': False, 'simple_optimizer': False, 'policy_map_cache': -1, 'worker_cls': -1, 'synchronize_filters': -1, 'replay_sequence_length': None, '_disable_execution_plan_api': -1, 'twin_q': True, 'q_model_config': {'fcnet_hiddens': [512, 512], 'fcnet_activation': 'relu', 'post_fcnet_hiddens': [], 'post_fcnet_activation': None, 'custom_model': None, 'custom_model_config': {}}, 'policy_model_config': {'fcnet_hiddens': [1024, 1024], 'fcnet_activation': 'relu', 'post_fcnet_hiddens': [], 'post_fcnet_activation': None, 'custom_model': None, 'custom_model_config': {}}, 'tau': 0.005, 'initial_alpha': 1.0, 'target_entropy': 
'auto', 'n_step': 1, 'replay_buffer_config': {'_enable_replay_buffer_api': True, 'type': 'MultiAgentReplayBuffer', 'capacity': 5000000, 'prioritized_replay': False, 'prioritized_replay_alpha': 0.6, 'prioritized_replay_beta': 0.4, 'prioritized_replay_eps': 1e-06, 'worker_side_prioritization': False}, 'store_buffer_in_checkpoints': False, 'training_intensity': None, 'optimization': {'actor_learning_rate': 0.0003, 'critic_learning_rate': 0.0003, 'entropy_learning_rate': 0.0003}, 'target_network_update_freq': 0, 'num_steps_sampled_before_learning_starts': 1500, '_deterministic_loss': False, '_use_beta_distribution': False, 'use_state_preprocessor': -1, 'worker_side_prioritization': -1, 'num_sgd_iter': 30, 'sgd_minibatch_size': 2048, 'clip_param': 0.2, 'kl_coeff': 1.0, '__stdout_file__': None, '__stderr_file__': None, 'lambda': 0.95, 'input': 'sampler', 'callbacks': <class 'ray.rllib.algorithms.callbacks.DefaultCallbacks'>, 'create_env_on_driver': False, 'custom_eval_function': None, 'framework': 'torch', 'num_cpus_for_driver': 1, 'num_workers': 0}
╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
│ Trial name                            status       train_batch_size     num_sgd_iter     sgd_minibatch_size     iter     total time (s)     ts     reward     episode_reward_max     episode_reward_min     episode_len_mean     episodes_this_iter │
├─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤
│ SAC_CustomMultiAgentEnv_151d4_00000   RUNNING                 10000               30                   2048        3            26.0897   1500   -3402.98               -3102.86               -3676.31                  500                      1 │
│ SAC_CustomMultiAgentEnv_151d4_00001   RUNNING                 40000               30                    512        3            26.0153   1500   -3402.98               -3102.86               -3676.31                  500                      1 │
╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
Trial status: 2 RUNNING