I ran your code without an environment and the "missing" scores do show up; see the output below.
# The offline dataset used below was generated with:
# rllib train \
# --run=PG \
# --env=CartPole-v0 \
# --config='{"framework": "torch", "output": "/Users/jweinbe3/Documents/ray_example", "output_max_file_size": 5000000}' \
# --stop='{"timesteps_total": 100000}'
from gym import spaces
import numpy as np

from ray import air
from ray.rllib.algorithms.dqn import DQNConfig
from ray.rllib.offline.estimators import (
    ImportanceSampling,
    WeightedImportanceSampling,
    DirectMethod,
    DoublyRobust,
)
from ray.rllib.offline.estimators.fqe_torch_model import FQETorchModel
from ray.tune import Tuner

DEBUG = True

config = (
    DQNConfig()
    .resources(num_gpus=0 if DEBUG else 1)
    .debugging(seed=42 if DEBUG else None)
    # No live env: training is purely offline, so the action/observation
    # spaces (here CartPole-v0's) must be given explicitly.
    .environment(
        env=None,
        action_space=spaces.Discrete(2),
        observation_space=spaces.Box(
            np.array([-4.8000002e+00, -3.4028235e+38, -4.1887903e-01, -3.4028235e+38]),
            np.array([4.8000002e+00, 3.4028235e+38, 4.1887903e-01, 3.4028235e+38]),
            (4,),
            np.float32,
        ),
    )
    .framework("torch")
    .offline_data(input_="/Users/jweinbe3/Documents/ray_example")
    .evaluation(
        evaluation_interval=1,
        evaluation_duration=1000,
        evaluation_num_workers=1,
        evaluation_parallel_to_training=True,
        evaluation_duration_unit="episodes",
        # The evaluation workers also read from the offline dataset.
        evaluation_config={"input": "/Users/jweinbe3/Documents/ray_example"},
        off_policy_estimation_methods={
            "is": {"type": ImportanceSampling},
            "wis": {"type": WeightedImportanceSampling},
            "dm_fqe": {
                "type": DirectMethod,
                "q_model_config": {"type": FQETorchModel, "polyak_coef": 0.05},
            },
            "dr_fqe": {
                "type": DoublyRobust,
                "q_model_config": {"type": FQETorchModel, "polyak_coef": 0.05},
            },
        },
    )
)

stop = {"training_iteration": 1}

t = Tuner("DQN", param_space=config.to_dict(), run_config=air.RunConfig(stop=stop))
results = t.fit()
print(results.get_best_result().metrics)
Output (truncated):
{'evaluation': {'episode_reward_max': nan, 'episode_reward_min': nan, 'episode_reward_mean': nan,
                'episode_len_mean': nan, 'episode_media': {}, 'episodes_this_iter': 0,
                'policy_reward_min': {}, 'policy_reward_max': {}, 'policy_reward_mean': {},
                'custom_metrics': {}, 'hist_stats': {'episode_reward': [], 'episode_lengths': []},
                'sampler_perf': {}, 'num_faulty_episodes': 0,
                'num_agent_steps_sampled_this_iter': 200000, 'num_env_steps_sampled_this_iter': 200000,
                'timesteps_this_iter': 200000,
                'off_policy_estimator': {
                    'is': {'v_behavior': 71.56536521593185, 'v_target': 18.059243003719995,
                           'v_gain': 0.31237773674484337, 'v_behavior_std': 20.91077007684791,
                           'v_target_std': 46.671140611645654, 'v_gain_std': 0.7329956392103542},
                    'wis': {'v_behavior': 71.56536521593185, 'v_target': 68.42401369219736,
                            'v_gain': 1.0540700766850335, 'v_behavior_std': 20.91077007684791,
                            'v_target_std': 286.03094352690056, 'v_gain_std': 4.112121540270352},
                    'dm_fqe': {'v_behavior': 71.56536521593185, 'v_target': 0.01854718171184442,
                               'v_gain': 0.00035035406752660864, 'v_behavior_std': 20.91077007684791,
                               'v_target_std': 0.000986783183631341, 'v_gain_std': 0.0005032335443440929},
                    'dr_fqe': {'v_behavior': 71.56536521593186, 'v_target': 18.090232510308883,
                               'v_gain': 0.3128705622792621, 'v_behavior_std': 20.910770076847918,
                               'v_target_std': 46.70760504597745, 'v_gain_std': 0.7333744750405439}},
                'num_healthy_workers': 1, 'num_recreated_workers': 0},
 'custom_metrics': {}, 'episode_media': {}, 'num_recreated_workers': 0,
 'info': {'learner': {'default_policy': {
     'learner_stats': {'allreduce_latency': 0.0, 'grad_gnorm': 0.26329100131988525,
                       'mean_q': 3.2651255130767822, 'min_q': 0.17295058071613312,
                       'max_q': 9.400323867797852, 'cur_lr': 0.0005},
     'td_error': array([ 0.49190593, -2.372336  ,  0.15597558,  0.46284115,  0.27084064,
                        -0.8244903 , -0.16397142,  0.42400658, -0.5302119 , -0.58858585,
                        -2.811394  , -2.8838887 , -0.4350536 , -1.3666189 , -0.24729967,
                        -1.5492    , -1.6928301 , -1.4913654 ,  0.17717671, -1.9149044 ,
                        -1.1952596 , -2.4955556 , -2.0318499 , -0.850914  ,  0.9374056 ,
                         1.0244606 , -0.9451004 , -1.6365018 , -0.7415261 , -2.1879308 ,
                        -0.53181267,  0.5695535 ], dtype=float32),
     'mean_td_error': -0.8429511189460754, 'model': {}, 'custom_metrics': {},
     'num_agent_steps_trained': 32.0,
     'off_policy_estimation': {'is': {}, 'wis': {},
                               'dm_fqe': {'loss': 0.9612238324300886},
                               'dr_fqe': {'loss': 0.9596758003567933}}}}
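So the estimates aren't missing: they're under evaluation -> off_policy_estimator. If you only want those scores rather than the full result dict, here's a minimal sketch that continues from the script above (the key paths are taken straight from the printed output):

# Pull just the OPE scores out of the Tune result from the script above.
metrics = results.get_best_result().metrics
ope_scores = metrics["evaluation"]["off_policy_estimator"]
for name, scores in ope_scores.items():
    print(
        f"{name}: v_behavior={scores['v_behavior']:.2f}, "
        f"v_target={scores['v_target']:.2f}, v_gain={scores['v_gain']:.3f}"
    )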