Detailed info:
Failure # 1 (occurred at 2023-02-20_08-01-16)
Traceback (most recent call last):
  File "/opt/conda/envs/rl_decision/lib/python3.8/site-packages/ray/tune/execution/ray_trial_executor.py", line 989, in get_next_executor_event
    future_result = ray.get(ready_future)
  File "/opt/conda/envs/rl_decision/lib/python3.8/site-packages/ray/_private/client_mode_hook.py", line 105, in wrapper
    return func(*args, **kwargs)
  File "/opt/conda/envs/rl_decision/lib/python3.8/site-packages/ray/_private/worker.py", line 2277, in get
    raise value
ray.exceptions.RayActorError: The actor died because of an error raised in its creation task, ray::DDPPOTrainerAddAsyn.__init__() (pid=14724, ip=10.19.196.43, repr=DDPPOTrainerAddAsyn)
  File "/opt/conda/envs/rl_decision/lib/python3.8/site-packages/ray/rllib/algorithms/ddppo/ddppo.py", line 179, in __init__
    super().__init__(
  File "/opt/conda/envs/rl_decision/lib/python3.8/site-packages/ray/rllib/algorithms/algorithm.py", line 308, in __init__
    super().__init__(config=config, logger_creator=logger_creator, **kwargs)
  File "/opt/conda/envs/rl_decision/lib/python3.8/site-packages/ray/tune/trainable/trainable.py", line 157, in __init__
    self.setup(copy.deepcopy(self.config))
  File "/opt/conda/envs/rl_decision/lib/python3.8/site-packages/ray/rllib/algorithms/ddppo/ddppo.py", line 253, in setup
    super().setup(config)
  File "/opt/conda/envs/rl_decision/lib/python3.8/site-packages/ray/rllib/algorithms/algorithm.py", line 418, in setup
    self.workers = WorkerSet(
  File "/opt/conda/envs/rl_decision/lib/python3.8/site-packages/ray/rllib/evaluation/worker_set.py", line 171, in __init__
    self._local_worker = self._make_worker(
  File "/opt/conda/envs/rl_decision/lib/python3.8/site-packages/ray/rllib/evaluation/worker_set.py", line 661, in _make_worker
    worker = cls(
  File "/opt/conda/envs/rl_decision/lib/python3.8/site-packages/ray/rllib/evaluation/rollout_worker.py", line 613, in __init__
    self._build_policy_map(
  File "/opt/conda/envs/rl_decision/lib/python3.8/site-packages/ray/rllib/evaluation/rollout_worker.py", line 1789, in _build_policy_map
    self.policy_map.create_policy(
  File "/opt/conda/envs/rl_decision/lib/python3.8/site-packages/ray/rllib/policy/policy_map.py", line 123, in create_policy
    self[policy_id] = create_policy_for_framework(
  File "/opt/conda/envs/rl_decision/lib/python3.8/site-packages/ray/rllib/utils/policy.py", line 80, in create_policy_for_framework
    return policy_class(observation_space, action_space, merged_config)
  File "/opt/conda/envs/rl_decision/lib/python3.8/site-packages/ray/rllib/algorithms/ppo/ppo_torch_policy.py", line 50, in __init__
    TorchPolicyV2.__init__(
  File "/opt/conda/envs/rl_decision/lib/python3.8/site-packages/ray/rllib/policy/torch_policy_v2.py", line 81, in __init__
    model, dist_class = self._init_model_and_dist_class()
  File "/opt/conda/envs/rl_decision/lib/python3.8/site-packages/ray/rllib/policy/torch_policy_v2.py", line 446, in _init_model_and_dist_class
    model = ModelCatalog.get_model_v2(
  File "/opt/conda/envs/rl_decision/lib/python3.8/site-packages/ray/rllib/models/catalog.py", line 622, in get_model_v2
    instance = model_cls(
  File "ray_adapter/ray_runner.py", line 359, in __init__
  File "/opt/conda/envs/rl_decision/lib/python3.8/site-packages/torch/nn/modules/module.py", line 927, in to
    return self._apply(convert)
  File "/opt/conda/envs/rl_decision/lib/python3.8/site-packages/torch/nn/modules/module.py", line 579, in _apply
    module._apply(fn)
  File "/opt/conda/envs/rl_decision/lib/python3.8/site-packages/torch/nn/modules/module.py", line 579, in _apply
    module._apply(fn)
  File "/opt/conda/envs/rl_decision/lib/python3.8/site-packages/torch/nn/modules/module.py", line 579, in _apply
    module._apply(fn)
  File "/opt/conda/envs/rl_decision/lib/python3.8/site-packages/torch/nn/modules/module.py", line 602, in _apply
    param_applied = fn(param)
  File "/opt/conda/envs/rl_decision/lib/python3.8/site-packages/torch/nn/modules/module.py", line 925, in convert
    return t.to(device, dtype if t.is_floating_point() or t.is_complex() else None, non_blocking)
  File "/opt/conda/envs/rl_decision/lib/python3.8/site-packages/torch/cuda/__init__.py", line 217, in _lazy_init
    torch._C._cuda_init()
RuntimeError: No CUDA GPUs are available
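The traceback shows where it breaks: the custom model's constructor in ray_adapter/ray_runner.py (line 359) calls Module.to() onto a CUDA device while the policy map is being built for the trainer's local worker, and torch._C._cuda_init() fails because that actor has no GPU visible. One way to keep that worker on CPU is to guard the device choice instead of hard-coding "cuda". A minimal sketch, assuming the custom model is a plain TorchModelV2 (the class name, layers, and attribute names below are illustrative, not the actual ray_runner.py code):

import torch
import torch.nn as nn
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2


class MyModel(TorchModelV2, nn.Module):
    """Hypothetical stand-in for the custom model built in ray_runner.py."""

    def __init__(self, obs_space, action_space, num_outputs, model_config, name):
        TorchModelV2.__init__(
            self, obs_space, action_space, num_outputs, model_config, name
        )
        nn.Module.__init__(self)
        self.net = nn.Linear(int(obs_space.shape[0]), num_outputs)
        self.vf = nn.Linear(int(obs_space.shape[0]), 1)
        self._last_obs = None
        # Guarded device pick: on actors scheduled with no GPU (here the
        # DDPPO driver, since "num_gpus" is 0), an unconditional .to("cuda")
        # raises "RuntimeError: No CUDA GPUs are available".
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.to(device)

    def forward(self, input_dict, state, seq_lens):
        self._last_obs = input_dict["obs"].float()
        return self.net(self._last_obs), state

    def value_function(self):
        return self.vf(self._last_obs).squeeze(-1)

Since TorchPolicyV2 moves the model to the policy's own device anyway, another option may be simply to drop the explicit .to(...) call from the model constructor and let RLlib do the placement.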
My config:
config = {
    # Also try common gym envs like "CartPole-v0" or "Pendulum-v1".
    "recreate_failed_workers": True,
    "restart_failed_sub_environments": True,
    "disable_env_checking": True,
    "ignore_worker_failures": True,
    "horizon": 1000,
    # "vtrace": False,
    "num_gpus": 0,
    "model": {
        "custom_model": "my_model",
        "vf_share_layers": True,
        "custom_model_config": {},
    },
    "sgd_minibatch_size": 512,
    "train_batch_size": 5120,
    "num_workers": 2,
    "num_envs_per_worker": 1,
    "num_gpus_per_worker": 0.5,
    "num_cpus_per_worker": 2,
    "framework": "torch",
    "no_done_at_end": True,
    "sample_async": True,
    "placement_strategy": "SPREAD",  # SPREAD: bundles are placed as evenly as possible across nodes.
    ## DDPPO config
    "keep_local_weights_in_sync": True,
}
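For reference, this is how the pieces wire together: register the custom model, then launch the trial. Note that in DDPPO the GPU budget sits on the rollout workers ("num_gpus_per_worker": 0.5 above), while the driver-side local worker runs under "num_gpus": 0 and therefore sees no CUDA device, which is exactly where the traceback fires. A minimal launch sketch; the env name is an assumption, since the config above doesn't set one:

import ray
from ray import tune
from ray.rllib.models import ModelCatalog

# "my_model" must match config["model"]["custom_model"]; MyModel is the
# hypothetical guarded model sketched above.
ModelCatalog.register_custom_model("my_model", MyModel)

ray.init()
tune.run(
    "DDPPO",
    config={**config, "env": "CartPole-v0"},  # assumed env, not in the original config
    stop={"training_iteration": 10},
)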