After upgrading to ray[rllib] 1.7.0 on Ubuntu Linux 20.04, with the latest tensorflow and tensorflow-gpu Python packages installed, I’m seeing the following error while trying to create workers. What might be the problem?
The issue appears when I set the ‘num_gpus’ config option to any value above 0. Here is the script, followed by the full traceback.
# Reproduction script: starts 32 samples of the "PPO" trainable through Ray Tune.
from ray import tune

# Trainer configuration handed to tune.run() below.
env_cfg = {
    "env": "gym_backtest:backtest-v0",
    "env_config": {
        "port": 8125,
        "host": "127.0.0.1",
    },
    # Any value above 0 here triggers the error reported in this post.
    "num_gpus": 0.01,
    "num_workers": 1,
    "gamma": 0.9,
    "timesteps_per_iteration": 100000,
    "model": {
        "fcnet_hiddens": [64],
        "fcnet_activation": "relu",
    },
}

tune.run("PPO", num_samples=32, config=env_cfg)
File "/home/anton/miniconda3/envs/rllib/lib/python3.9/site-packages/ray/tune/trial_runner.py", line 812, in _process_trial
results = self.trial_executor.fetch_result(trial)
File "/home/anton/miniconda3/envs/rllib/lib/python3.9/site-packages/ray/tune/ray_trial_executor.py", line 767, in fetch_result
result = ray.get(trial_future[0], timeout=DEFAULT_GET_TIMEOUT)
File "/home/anton/miniconda3/envs/rllib/lib/python3.9/site-packages/ray/_private/client_mode_hook.py", line 89, in wrapper
return func(*args, **kwargs)
File "/home/anton/miniconda3/envs/rllib/lib/python3.9/site-packages/ray/worker.py", line 1623, in get
raise value
ray.exceptions.RayActorError: The actor died because of an error raised in its creation task, ray::PPO.__init__() (pid=1060557, ip=192.168.1.3)
File "/home/anton/miniconda3/envs/rllib/lib/python3.9/site-packages/ray/rllib/agents/trainer_template.py", line 137, in __init__
Trainer.__init__(self, config, env, logger_creator)
File "/home/anton/miniconda3/envs/rllib/lib/python3.9/site-packages/ray/rllib/agents/trainer.py", line 611, in __init__
super().__init__(config, logger_creator)
File "/home/anton/miniconda3/envs/rllib/lib/python3.9/site-packages/ray/tune/trainable.py", line 106, in __init__
self.setup(copy.deepcopy(self.config))
File "/home/anton/miniconda3/envs/rllib/lib/python3.9/site-packages/ray/rllib/agents/trainer_template.py", line 147, in setup
super().setup(config)
File "/home/anton/miniconda3/envs/rllib/lib/python3.9/site-packages/ray/rllib/agents/trainer.py", line 764, in setup
self._init(self.config, self.env_creator)
File "/home/anton/miniconda3/envs/rllib/lib/python3.9/site-packages/ray/rllib/agents/trainer_template.py", line 171, in _init
self.workers = self._make_workers(
File "/home/anton/miniconda3/envs/rllib/lib/python3.9/site-packages/ray/rllib/agents/trainer.py", line 846, in _make_workers
return WorkerSet(
File "/home/anton/miniconda3/envs/rllib/lib/python3.9/site-packages/ray/rllib/evaluation/worker_set.py", line 103, in __init__
self._local_worker = self._make_worker(
File "/home/anton/miniconda3/envs/rllib/lib/python3.9/site-packages/ray/rllib/evaluation/worker_set.py", line 399, in _make_worker
worker = cls(
File "/home/anton/miniconda3/envs/rllib/lib/python3.9/site-packages/ray/rllib/evaluation/rollout_worker.py", line 583, in __init__
self._build_policy_map(
File "/home/anton/miniconda3/envs/rllib/lib/python3.9/site-packages/ray/rllib/evaluation/rollout_worker.py", line 1382, in _build_policy_map
self.policy_map.create_policy(name, orig_cls, obs_space, act_space,
File "/home/anton/miniconda3/envs/rllib/lib/python3.9/site-packages/ray/rllib/policy/policy_map.py", line 123, in create_policy
sess = self.session_creator()
File "/home/anton/miniconda3/envs/rllib/lib/python3.9/site-packages/ray/rllib/evaluation/worker_set.py", line 316, in session_creator
return tf1.Session(
File "/home/anton/miniconda3/envs/rllib/lib/python3.9/site-packages/tensorflow/python/client/session.py", line 1601, in __init__
super(Session, self).__init__(target, graph, config=config)
File "/home/anton/miniconda3/envs/rllib/lib/python3.9/site-packages/tensorflow/python/client/session.py", line 711, in __init__
self._session = tf_session.TF_NewSessionRef(self._graph._c_graph, opts)
MemoryError: std::bad_alloc