1. Severity of the issue: (select one)
None: I’m just curious or want clarification.
Low: Annoying but doesn’t hinder my work.
Medium: Significantly affects my productivity but can find a workaround.
High: Completely blocks me.
2. Environment:
- Ray version: 2.46
- Python version: 3.10
- OS: Ubuntu 24.04
- Cloud/Infrastructure: WSL2
- Other libs/tools (if relevant):
3. What happened vs. what you expected:
I’m trying to set up a reinforcement learning environment using Ray. Here are my PPO setup scripts and main program:
from configs.config_train import train_config
import ray
from pprint import pprint
import bioimitation  # presumably registers 'MuscleWalkingImitation3D-v0' on import — TODO confirm

if __name__ == '__main__':
    # Start a local Ray cluster, build the PPO algorithm from the shared
    # config, and run a few training iterations, printing each result dict.
    ray.init()
    ppo = train_config().build_algo()
    for _ in range(4):
        pprint(ppo.train())
from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig
from ray.rllib.connectors.env_to_module import MeanStdFilter
def train_config():
    """Return the PPO ``AlgorithmConfig`` for the 3-D muscle-walking imitation task.

    The returned config targets RLlib's new API stack (RLModule / Learner):
    the network architecture is supplied via ``.rl_module(model_config=...)``
    using ``DefaultModelConfig``, and observation normalization is done with a
    per-env-runner ``MeanStdFilter`` connector.
    """
    cfg = PPOConfig()
    cfg = cfg.environment(
        env='MuscleWalkingImitation3D-v0',
        env_config={
            'apply_perturbations': False,
            'horizon': 5,
            'log': False,
            'mode': 'train',
            'r_weights': [0.8, 0.2, 0.1],
            'use_GRF': True,
            'use_target_obs': True,
        },
    )
    cfg = cfg.training(
        lr=2e-4,
        gamma=0.995,
        kl_coeff=1.0,
        train_batch_size_per_learner=3200,
        minibatch_size=320,
        num_epochs=20,
    )
    # NOTE(review): on the new API stack this DefaultModelConfig is what the
    # learner actually uses (it shows up as `_model_config` in the report);
    # the legacy `model` dict with fcnet_hiddens=[256, 256] in the report is
    # only the old-stack default and presumably ignored — TODO confirm.
    cfg = cfg.rl_module(
        model_config=DefaultModelConfig(
            fcnet_hiddens=[512, 512],
            fcnet_activation='tanh',
            max_seq_len=0,
        )
    )
    # .resources(num_cpus_for_main_process=1) intentionally left disabled.
    cfg = cfg.env_runners(
        num_env_runners=0,  # sample on the local process only
        batch_mode='complete_episodes',
        num_cpus_per_env_runner=1,
        env_to_module_connector=lambda env: MeanStdFilter(multi_agent=False),
    )
    cfg = cfg.learners(
        num_learners=1,
        num_gpus_per_learner=1,
    )
    return cfg
First, I got a warning:
2025-09-04 18:28:26,285 WARNING deprecation.py:50 -- DeprecationWarning: `RLModule(config=[RLModuleConfig object])` has been deprecated. Use `RLModule(observation_space=.., action_space=.., inference_only=.., model_config=.., catalog_class=..)` instead. This will raise an error in the future!
After tracing it, I found that the warning originates at line 445 of RLlib's rl_module.py, but I don't understand what exactly is happening there:
# TODO (sven): Deprecate this. We keep it here for now in case users
# still have custom models (or subclasses of RLlib default models)
# into which they pass in a `config` argument.
self.config = RLModuleConfig(
observation_space=self.observation_space,
action_space=self.action_space,
inference_only=self.inference_only,
learner_only=self.learner_only,
model_config_dict=self.model_config,
catalog_class=catalog_class,
)
Second, in the training report the model architecture shown under `model` ([256, 256]) differs from the one I configured ([512, 512]). Is there something wrong with my configuration?
{'config': {'_disable_action_flattening': False,
'_disable_execution_plan_api': -1,
'_disable_initialize_loss_from_dummy_batch': False,
'_disable_preprocessor_api': False,
'_dont_auto_sync_env_runner_states': False,
'_enable_rl_module_api': -1,
'_env_to_module_connector': <function train_config.<locals>.<lambda> at 0x72a4144cecb0>,
'_fake_gpus': False,
'_is_atari': None,
'_is_online': True,
'_learner_class': None,
'_learner_connector': None,
'_model_config': DefaultModelConfig(fcnet_hiddens=[512, 512],
fcnet_activation='tanh',
fcnet_kernel_initializer=None,
fcnet_kernel_initializer_kwargs=None,
fcnet_bias_initializer=None,
fcnet_bias_initializer_kwargs=None,
conv_filters=None,
conv_activation='relu',
conv_kernel_initializer=None,
conv_kernel_initializer_kwargs=None,
conv_bias_initializer=None,
conv_bias_initializer_kwargs=None,
head_fcnet_hiddens=[],
head_fcnet_activation='relu',
head_fcnet_kernel_initializer=None,
head_fcnet_kernel_initializer_kwargs=None,
head_fcnet_bias_initializer=None,
head_fcnet_bias_initializer_kwargs=None,
free_log_std=False,
log_std_clip_param=20.0,
vf_share_layers=True,
use_lstm=False,
max_seq_len=0,
lstm_cell_size=256,
lstm_use_prev_action=False,
lstm_use_prev_reward=False,
lstm_kernel_initializer=None,
lstm_kernel_initializer_kwargs=None,
lstm_bias_initializer=None,
lstm_bias_initializer_kwargs=None),
'_module_to_env_connector': None,
'_per_module_overrides': {},
'_prior_exploration_config': {'type': 'StochasticSampling'},
'_rl_module_spec': None,
'_tf_policy_handles_more_than_one_loss': False,
'_torch_grad_scaler_class': None,
'_torch_lr_scheduler_classes': None,
'_train_batch_size_per_learner': 3200,
'_use_msgpack_checkpoints': False,
'_validate_config': True,
'action_mask_key': 'action_mask',
'action_space': None,
'actions_in_input_normalized': False,
'add_default_connectors_to_env_to_module_pipeline': True,
'add_default_connectors_to_learner_pipeline': True,
'add_default_connectors_to_module_to_env_pipeline': True,
'algorithm_config_overrides_per_module': {},
'always_attach_evaluation_results': -1,
'auto_wrap_old_gym_envs': -1,
'batch_mode': 'complete_episodes',
'broadcast_env_runner_states': True,
'broadcast_offline_eval_runner_states': False,
'callbacks': <class 'ray.rllib.callbacks.callbacks.RLlibCallback'>,
'callbacks_on_algorithm_init': None,
'callbacks_on_checkpoint_loaded': None,
'callbacks_on_env_runners_recreated': None,
'callbacks_on_environment_created': None,
'callbacks_on_episode_created': None,
'callbacks_on_episode_end': None,
'callbacks_on_episode_start': None,
'callbacks_on_episode_step': None,
'callbacks_on_evaluate_end': None,
'callbacks_on_evaluate_offline_end': None,
'callbacks_on_evaluate_offline_start': None,
'callbacks_on_evaluate_start': None,
'callbacks_on_offline_eval_runners_recreated': None,
'callbacks_on_sample_end': None,
'callbacks_on_train_result': None,
'checkpoint_trainable_policies_only': False,
'class': <class 'ray.rllib.algorithms.ppo.ppo.PPOConfig'>,
'clip_actions': False,
'clip_param': 0.3,
'clip_rewards': None,
'compress_observations': False,
'count_steps_by': 'env_steps',
'create_env_on_driver': False,
'create_local_env_runner': True,
'custom_async_evaluation_function': -1,
'custom_eval_function': None,
'custom_resources_per_env_runner': {},
'custom_resources_per_offline_eval_runner': {},
'dataset_num_iters_per_eval_runner': 1,
'dataset_num_iters_per_learner': None,
'delay_between_env_runner_restarts_s': 60.0,
'disable_env_checking': False,
'eager_max_retraces': 20,
'eager_tracing': True,
'enable_async_evaluation': -1,
'enable_connectors': -1,
'enable_env_runner_and_connector_v2': True,
'enable_rl_module_and_learner': True,
'enable_tf1_exec_eagerly': False,
'entropy_coeff': 0.0,
'entropy_coeff_schedule': None,
'env': 'MuscleWalkingImitation3D-v0',
'env_config': {'apply_perturbations': False,
'horizon': 5,
'log': False,
'mode': 'train',
'r_weights': [0.8, 0.2, 0.1],
'use_GRF': True,
'use_target_obs': True},
'env_runner_cls': None,
'env_runner_health_probe_timeout_s': 30.0,
'env_runner_restore_timeout_s': 1800.0,
'env_task_fn': -1,
'episode_lookback_horizon': 1,
'episodes_to_numpy': True,
'evaluation_auto_duration_max_env_steps_per_sample': 2000,
'evaluation_auto_duration_min_env_steps_per_sample': 100,
'evaluation_config': None,
'evaluation_duration': 10,
'evaluation_duration_unit': 'episodes',
'evaluation_force_reset_envs_before_iteration': True,
'evaluation_interval': None,
'evaluation_num_env_runners': 0,
'evaluation_parallel_to_training': False,
'evaluation_sample_timeout_s': 120.0,
'exploration_config': {},
'explore': True,
'export_native_model_files': False,
'extra_python_environs_for_driver': {},
'extra_python_environs_for_worker': {},
'fake_sampler': False,
'framework': 'torch',
'gamma': 0.995,
'grad_clip': None,
'grad_clip_by': 'global_norm',
'gym_env_vectorize_mode': 'SYNC',
'ignore_env_runner_failures': False,
'ignore_final_observation': False,
'ignore_offline_eval_runner_failures': False,
'in_evaluation': False,
'input': 'sampler',
'input_compress_columns': ['obs', 'new_obs'],
'input_config': {},
'input_filesystem': None,
'input_filesystem_kwargs': {},
'input_read_batch_size': None,
'input_read_episodes': False,
'input_read_method': 'read_parquet',
'input_read_method_kwargs': {},
'input_read_sample_batches': False,
'input_read_schema': {},
'input_spaces_jsonable': True,
'iter_batches_kwargs': {},
'keep_per_episode_custom_metrics': False,
'kl_coeff': 1.0,
'kl_target': 0.01,
'lambda': 1.0,
'learner_config_dict': {},
'local_gpu_idx': 0,
'local_tf_session_args': {'inter_op_parallelism_threads': 8,
'intra_op_parallelism_threads': 8},
'log_gradients': True,
'log_level': 'WARN',
'log_sys_usage': True,
'logger_config': None,
'logger_creator': None,
'lr': 0.0002,
'lr_schedule': None,
'map_batches_kwargs': {},
'materialize_data': False,
'materialize_mapped_data': True,
'max_num_env_runner_restarts': 1000,
'max_num_offline_eval_runner_restarts': 1000,
'max_requests_in_flight_per_aggregator_actor': 3,
'max_requests_in_flight_per_env_runner': 1,
'max_requests_in_flight_per_learner': 3,
'max_requests_in_flight_per_offline_eval_runner': 1,
'merge_env_runner_states': 'training_only',
'metrics_episode_collection_timeout_s': 60.0,
'metrics_num_episodes_for_smoothing': 100,
'min_sample_timesteps_per_iteration': 0,
'min_time_s_per_iteration': None,
'min_train_timesteps_per_iteration': 0,
'minibatch_size': 320,
'model': {'_disable_action_flattening': False,
'_disable_preprocessor_api': False,
'_time_major': False,
'_use_default_native_models': -1,
'always_check_shapes': False,
'attention_dim': 64,
'attention_head_dim': 32,
'attention_init_gru_gate_bias': 2.0,
'attention_memory_inference': 50,
'attention_memory_training': 50,
'attention_num_heads': 1,
'attention_num_transformer_units': 1,
'attention_position_wise_mlp_dim': 32,
'attention_use_n_prev_actions': 0,
'attention_use_n_prev_rewards': 0,
'conv_activation': 'relu',
'conv_bias_initializer': None,
'conv_bias_initializer_config': None,
'conv_filters': None,
'conv_kernel_initializer': None,
'conv_kernel_initializer_config': None,
'conv_transpose_bias_initializer': None,
'conv_transpose_bias_initializer_config': None,
'conv_transpose_kernel_initializer': None,
'conv_transpose_kernel_initializer_config': None,
'custom_action_dist': None,
'custom_model': None,
'custom_model_config': {},
'custom_preprocessor': None,
'dim': 84,
'encoder_latent_dim': None,
'fcnet_activation': 'tanh',
'fcnet_bias_initializer': None,
'fcnet_bias_initializer_config': None,
'fcnet_hiddens': [256, 256],
'fcnet_weights_initializer': None,
'fcnet_weights_initializer_config': None,
'framestack': True,
'free_log_std': False,
'grayscale': False,
'log_std_clip_param': 20.0,
'lstm_bias_initializer': None,
'lstm_bias_initializer_config': None,
'lstm_cell_size': 256,
'lstm_use_prev_action': False,
'lstm_use_prev_action_reward': -1,
'lstm_use_prev_reward': False,
'lstm_weights_initializer': None,
'lstm_weights_initializer_config': None,
'max_seq_len': 20,
'no_final_linear': False,
'post_fcnet_activation': 'relu',
'post_fcnet_bias_initializer': None,
'post_fcnet_bias_initializer_config': None,
'post_fcnet_hiddens': [],
'post_fcnet_weights_initializer': None,
'post_fcnet_weights_initializer_config': None,
'use_attention': False,
'use_lstm': False,
'vf_share_layers': False,
'zero_mean': True},
'normalize_actions': True,
'num_aggregator_actors_per_learner': 0,
'num_consecutive_env_runner_failures_tolerance': 100,
'num_cpus_for_main_process': 1,
'num_cpus_per_env_runner': 1,
'num_cpus_per_learner': 'auto',
'num_cpus_per_offline_eval_runner': 1,
'num_env_runners': 0,
'num_envs_per_env_runner': 1,
'num_epochs': 20,
'num_gpus': 0,
'num_gpus_per_env_runner': 0,
'num_gpus_per_learner': 1,
'num_gpus_per_offline_eval_runner': 0,
'num_learners': 1,
'num_offline_eval_runners': 0,
'observation_filter': 'NoFilter',
'observation_fn': None,
'observation_space': None,
'off_policy_estimation_methods': {},
'offline_data_class': None,
'offline_eval_batch_size_per_runner': 256,
'offline_eval_rl_module_inference_only': False,
'offline_eval_runner_class': None,
'offline_eval_runner_health_probe_timeout_s': 30.0,
'offline_eval_runner_restore_timeout_s': 1800.0,
'offline_evaluation_duration': 1,
'offline_evaluation_interval': None,
'offline_evaluation_parallel_to_training': False,
'offline_evaluation_timeout_s': 120.0,
'offline_evaluation_type': None,
'offline_loss_for_module_fn': None,
'offline_sampling': False,
'ope_split_batch_by_episode': True,
'optimizer': {},
'output': None,
'output_compress_columns': ['obs', 'new_obs'],
'output_config': {},
'output_filesystem': None,
'output_filesystem_kwargs': {},
'output_max_file_size': 67108864,
'output_max_rows_per_file': None,
'output_write_episodes': True,
'output_write_method': 'write_parquet',
'output_write_method_kwargs': {},
'output_write_remaining_data': False,
'placement_strategy': 'PACK',
'policies': {'default_policy': (None, None, None, None)},
'policies_to_train': None,
'policy_map_cache': -1,
'policy_map_capacity': 100,
'policy_mapping_fn': <function AlgorithmConfig.DEFAULT_POLICY_MAPPING_FN at 0x72a2c924c5e0>,
'policy_states_are_swappable': False,
'postprocess_inputs': False,
'prelearner_buffer_class': None,
'prelearner_buffer_kwargs': {},
'prelearner_class': None,
'prelearner_module_synch_period': 10,
'preprocessor_pref': 'deepmind',
'remote_env_batch_wait_ms': 0,
'remote_worker_envs': False,
'render_env': False,
'replay_sequence_length': None,
'restart_failed_env_runners': True,
'restart_failed_offline_eval_runners': True,
'restart_failed_sub_environments': False,
'rollout_fragment_length': 'auto',
'sample_collector': <class 'ray.rllib.evaluation.collectors.simple_list_collector.SimpleListCollector'>,
'sample_timeout_s': 60.0,
'sampler_perf_stats_ema_coef': None,
'seed': None,
'sgd_minibatch_size': -1,
'shuffle_batch_per_epoch': True,
'shuffle_buffer_size': 0,
'simple_optimizer': False,
'sync_filters_on_rollout_workers_timeout_s': 10.0,
'synchronize_filters': -1,
'tf_session_args': {'allow_soft_placement': True,
'device_count': {'CPU': 1},
'gpu_options': {'allow_growth': True},
'inter_op_parallelism_threads': 2,
'intra_op_parallelism_threads': 2,
'log_device_placement': False},
'torch_compile_learner': False,
'torch_compile_learner_dynamo_backend': 'inductor',
'torch_compile_learner_dynamo_mode': None,
'torch_compile_learner_what_to_compile': <TorchCompileWhatToCompile.FORWARD_TRAIN: 'forward_train'>,
'torch_compile_worker': False,
'torch_compile_worker_dynamo_backend': 'onnxrt',
'torch_compile_worker_dynamo_mode': None,
'torch_ddp_kwargs': {},
'torch_skip_nan_gradients': False,
'train_batch_size': 4000,
'update_worker_filter_stats': True,
'use_critic': True,
'use_gae': True,
'use_kl_loss': True,
'use_worker_filter_stats': True,
'validate_env_runners_after_construction': True,
'validate_offline_eval_runners_after_construction': True,
'vf_clip_param': 10.0,
'vf_loss_coeff': 1.0,
'vf_share_layers': -1,
'worker_cls': -1},
'date': '2025-09-04_18-30-01',
'done': False,
'env_runner_group': {'actor_manager_num_outstanding_async_reqs': 0},
'env_runners': {'agent_episode_return_mean': {'default_agent': 11.575779017750852},
'env_reset_timer': 0.006791267537448646,
'env_step_timer': 0.025193770310959537,
'env_to_module_connector': {'connector_pipeline_timer': 0.0003828619657284869,
'timers': {'connectors': {'add_observations_from_episodes_to_batch': 1.5781515926946097e-05,
'add_states_from_episodes_to_batch': 4.4934387626969004e-06,
'add_time_dim_to_batch_and_zero_pad': 9.876769348612596e-06,
'batch_individual_items': 2.892487388427993e-05,
'mean_std_filter': 0.00011615123369903275,
'numpy_to_tensor': 0.00010062233940553132}}},
'env_to_module_sum_episodes_length_in': 34.705966945785796,
'env_to_module_sum_episodes_length_out': 34.705966945785796,
'episode_duration_sec_mean': 1.0544752141836993,
'episode_len_max': 147,
'episode_len_mean': 39.2519512195122,
'episode_len_min': 13,
'episode_return_max': 36.83933212891187,
'episode_return_mean': 11.575779017750852,
'episode_return_min': 4.85910685503755,
'module_episode_return_mean': {'default_policy': 11.575779017750852},
'module_to_env_connector': {'connector_pipeline_timer': 0.0008400959755298729,
'timers': {'connectors': {'get_actions': 0.0004019895782968658,
'listify_data_for_vector_env': 4.6700409549720475e-05,
'normalize_and_clip_actions': 0.00010093106277396176,
'remove_single_ts_time_rank_from_batch': 4.684181375128792e-06,
'tensor_to_numpy': 0.00011767198249401356,
'un_batch_to_individual_items': 2.8199102250637626e-05}}},
'num_agent_steps_sampled': {'default_agent': 3268.0000000000005},
'num_agent_steps_sampled_lifetime': {'default_agent': 130509.00000000003},
'num_env_steps_sampled': 3268.0000000000005,
'num_env_steps_sampled_lifetime': 130509.00000000003,
'num_env_steps_sampled_lifetime_throughput': 37.33082226664046,
'num_episodes': 83.0,
'num_episodes_lifetime': 3486.0,
'num_module_steps_sampled': {'default_policy': 3268.0000000000005},
'num_module_steps_sampled_lifetime': {'default_policy': 130509.00000000003},
'rlmodule_inference_timer': 0.0008424967593561691,
'sample': 1.1010686393623417,
'time_between_sampling': 0.001946807894059591,
'weights_seq_no': 0.0},
'fault_tolerance': {'num_healthy_workers': 0, 'num_remote_worker_restarts': 0},
'hostname': 'DESKTOP-A9305JF',
'iterations_since_restore': 1,
'learners': {'__all_modules__': {'learner_connector': {'connector_pipeline_timer': 0.15810381900519133,
'timers': {'connectors': {'add_columns_from_episodes_to_train_batch': 0.025123965984676033,
'add_observations_from_episodes_to_batch': 0.0004855829756706953,
'add_one_ts_to_episodes_and_truncate': 0.004187669022940099,
'add_states_from_episodes_to_batch': 4.5000051613897085e-06,
'add_time_dim_to_batch_and_zero_pad': 2.524498268030584e-05,
'batch_individual_items': 0.01944736100267619,
'general_advantage_estimation': 0.10596923399134539,
'numpy_to_tensor': 0.0024374339845962822}}},
'learner_connector_sum_episodes_length_in': 3268,
'learner_connector_sum_episodes_length_out': 3351,
'num_env_steps_trained': 703710,
'num_env_steps_trained_lifetime': 703710,
'num_env_steps_trained_lifetime_throughput': 534374.507890259,
'num_module_steps_trained': 67200,
'num_module_steps_trained_lifetime': 67200,
'num_module_steps_trained_lifetime_throughput': 51025.1890487095,
'num_module_steps_trained_throughput': 51024.96975776463,
'num_non_trainable_parameters': 0,
'num_trainable_parameters': 389165},
'default_policy': {'curr_entropy_coeff': 0.0,
'curr_kl_coeff': 1.5,
'default_optimizer_learning_rate': 0.0002,
'diff_num_grad_updates_vs_sampler_policy': 1.0,
'entropy': 31.00502,
'gradients_default_optimizer_global_norm': 3.8752627,
'mean_kl_loss': 0.05342823,
'module_train_batch_size_mean': 320.0,
'num_module_steps_trained': 67200,
'num_module_steps_trained_lifetime': 67200,
'num_module_steps_trained_lifetime_throughput': 51022.92181404588,
'num_trainable_parameters': 389165,
'policy_loss': -0.26968503,
'total_loss': 4.9205713,
'vf_explained_var': 0.0001603961,
'vf_loss': 5.1368275,
'vf_loss_unclipped': 57.31934,
'weights_seq_no': 1.0}},
'node_ip': '10.64.87.213',
'num_env_steps_sampled_lifetime': 130509.00000000003,
'num_training_step_calls_per_iteration': 1,
'perf': {'cpu_util_percent': 16.13228346456693,
'ram_util_percent': 12.133070866141727},
'pid': 756711,
'time_since_restore': 90.64898371696472,
'time_this_iter_s': 90.64898371696472,
'time_total_s': 90.64898371696472,
'timers': {'env_runner_sampling_timer': 88.60684963400126,
'learner_update_timer': 2.0253292619891,
'restore_env_runners': 3.4513999707996845e-05,
'synch_weights': 0.007450081990100443,
'training_iteration': 90.64194697301718,
'training_step': 90.64162584897713},
'timestamp': 1756981801,
'training_iteration': 1,
'trial_id': 'default'}