After upgrading to ray[rllib] 1.7.0 on Ubuntu Linux 20.04, with the latest tensorflow and tensorflow-gpu Python packages installed, I’m seeing the following error while trying to create workers. What might be the problem?
The issue appears when I set the ‘num_gpus’ config option to any value above 0. Here is the script, followed by the traceback.
from ray import tune
# Trainer configuration handed to Tune for the "PPO" run below.
env_cfg = dict(
    # Environment id and the settings forwarded to it.
    env="gym_backtest:backtest-v0",
    env_config=dict(port=8125, host="127.0.0.1"),
    # Fractional GPU request; per the report, any value above 0 triggers
    # the failure shown in the traceback.
    num_gpus=0.01,
    num_workers=1,
    gamma=0.9,
    timesteps_per_iteration=100000,
    # Fully connected network: one hidden layer of 64 units, ReLU activation.
    model=dict(fcnet_hiddens=[64], fcnet_activation="relu"),
)

# Launch 32 samples of the PPO trainable with the configuration above.
tune.run("PPO", config=env_cfg, num_samples=32)
  File "/home/anton/miniconda3/envs/rllib/lib/python3.9/site-packages/ray/tune/trial_runner.py", line 812, in _process_trial
    results = self.trial_executor.fetch_result(trial)
  File "/home/anton/miniconda3/envs/rllib/lib/python3.9/site-packages/ray/tune/ray_trial_executor.py", line 767, in fetch_result
    result = ray.get(trial_future[0], timeout=DEFAULT_GET_TIMEOUT)
  File "/home/anton/miniconda3/envs/rllib/lib/python3.9/site-packages/ray/_private/client_mode_hook.py", line 89, in wrapper
    return func(*args, **kwargs)
  File "/home/anton/miniconda3/envs/rllib/lib/python3.9/site-packages/ray/worker.py", line 1623, in get
    raise value
ray.exceptions.RayActorError: The actor died because of an error raised in its creation task, ray::PPO.__init__() (pid=1060557, ip=192.168.1.3)
  File "/home/anton/miniconda3/envs/rllib/lib/python3.9/site-packages/ray/rllib/agents/trainer_template.py", line 137, in __init__
    Trainer.__init__(self, config, env, logger_creator)
  File "/home/anton/miniconda3/envs/rllib/lib/python3.9/site-packages/ray/rllib/agents/trainer.py", line 611, in __init__
    super().__init__(config, logger_creator)
  File "/home/anton/miniconda3/envs/rllib/lib/python3.9/site-packages/ray/tune/trainable.py", line 106, in __init__
    self.setup(copy.deepcopy(self.config))
  File "/home/anton/miniconda3/envs/rllib/lib/python3.9/site-packages/ray/rllib/agents/trainer_template.py", line 147, in setup
    super().setup(config)
  File "/home/anton/miniconda3/envs/rllib/lib/python3.9/site-packages/ray/rllib/agents/trainer.py", line 764, in setup
    self._init(self.config, self.env_creator)
  File "/home/anton/miniconda3/envs/rllib/lib/python3.9/site-packages/ray/rllib/agents/trainer_template.py", line 171, in _init
    self.workers = self._make_workers(
  File "/home/anton/miniconda3/envs/rllib/lib/python3.9/site-packages/ray/rllib/agents/trainer.py", line 846, in _make_workers
    return WorkerSet(
  File "/home/anton/miniconda3/envs/rllib/lib/python3.9/site-packages/ray/rllib/evaluation/worker_set.py", line 103, in __init__
    self._local_worker = self._make_worker(
  File "/home/anton/miniconda3/envs/rllib/lib/python3.9/site-packages/ray/rllib/evaluation/worker_set.py", line 399, in _make_worker
    worker = cls(
  File "/home/anton/miniconda3/envs/rllib/lib/python3.9/site-packages/ray/rllib/evaluation/rollout_worker.py", line 583, in __init__
    self._build_policy_map(
  File "/home/anton/miniconda3/envs/rllib/lib/python3.9/site-packages/ray/rllib/evaluation/rollout_worker.py", line 1382, in _build_policy_map
    self.policy_map.create_policy(name, orig_cls, obs_space, act_space,
  File "/home/anton/miniconda3/envs/rllib/lib/python3.9/site-packages/ray/rllib/policy/policy_map.py", line 123, in create_policy
    sess = self.session_creator()
  File "/home/anton/miniconda3/envs/rllib/lib/python3.9/site-packages/ray/rllib/evaluation/worker_set.py", line 316, in session_creator
    return tf1.Session(
  File "/home/anton/miniconda3/envs/rllib/lib/python3.9/site-packages/tensorflow/python/client/session.py", line 1601, in __init__
    super(Session, self).__init__(target, graph, config=config)
  File "/home/anton/miniconda3/envs/rllib/lib/python3.9/site-packages/tensorflow/python/client/session.py", line 711, in __init__
    self._session = tf_session.TF_NewSessionRef(self._graph._c_graph, opts)
MemoryError: std::bad_alloc