Ray not finding available GPU on Windows

While using Ray 1.5.2, I noticed my code was running very slowly, and it turned out that was because the trainers were not putting my models on the GPU. This was surprising because I pass num_gpus=1 to ray.init when I initialize Ray.

So I went digging in PPOTrainer to see where it was placing my models, and found that when I pass num_gpus: 1 in the PPOTrainer config, my program crashes with an index error into the gpu_ids that Ray looks up on my system via ray.get_gpu_ids(). This behavior seemed weird to me: get_gpu_ids() returns an empty list even though torch can see my GPU.
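
A minimal check that shows the mismatch (the printed values are what I see on my machine):

    import ray
    import torch

    ray.init(num_gpus=1)

    print(ray.get_gpu_ids())          # [] -- Ray reports no GPU IDs here
    print(torch.cuda.is_available())  # True -- torch can see the GPU

    ray.shutdown()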

Here is the full code I was running:

    import gym
    import os
    from datetime import datetime
    from griddly.util.rllib.environment.core import RLlibEnv
    from utils.register import Registrar
    from utils.loader import load_from_yaml
    import tempfile
    import ray
    from ray.tune.logger import UnifiedLogger
    from ray.tune.registry import register_env
    from ray.rllib.agents.ppo import PPOTrainer
    from ray.rllib.models import ModelCatalog
    import sys
    import torch  # needed for the cuda checks at the bottom of the script

    from models.AIIDE_network import AIIDEActor
    from models.PCGRL_network import PCGRLAdversarial
    # AlignedReward and Regret (used in make_env below) come from my project;
    # their imports are not shown here.

    os.chdir('..')

    sep = os.pathsep
    os.environ['PYTHONPATH'] = sep.join(sys.path)

    ray.init(num_gpus=1, ignore_reinit_error=True)  # , log_to_driver=False, local_mode=True)

    registry = Registrar(file_args=load_from_yaml('args.yaml'))

    config = registry.get_config_to_build_rllib_env
    config['board_shape'] = (15, 15)
    config['builder_max_steps'] = 50
    config['max_steps'] = 250


    def make_env(config):
        env = RLlibEnv(config)
        env = AlignedReward(env, config)
        h_env = Regret(env, config)
        return h_env


    def policy_mapping_fn(agent_id):
        if agent_id.startswith('antagonist'):
            return 'antagonist'
        elif agent_id.startswith('protagonist'):
            return 'protagonist'
        else:
            return 'builder'


    ModelCatalog.register_custom_model('AIIDE', AIIDEActor)
    ModelCatalog.register_custom_model('PCGRL', PCGRLAdversarial)
    register_env('h_zelda', make_env)

    h_env = make_env(config)
    # print(h_env.builder_env.action_space)
    _ = h_env.reset()
    config2 = {
        'env': 'h_zelda',
        'num_workers': 2,
        "num_envs_per_worker": 2,
        'env_config': config,
        # "callbacks": PairedTrainingCallback,
        'multiagent': {
            'policies': {
                'builder': (None, h_env.builder_env.observation_space,
                            h_env.builder_env.action_space, {'model': {'custom_model': 'PCGRL',
                                                                       'custom_model_config': {'cell_size': 2704}}}),
                'antagonist': (None, h_env.env.observation_space,
                               h_env.env.action_space, {'model': {'custom_model': 'AIIDE',
                                                                  'custom_model_config': {}}}),
                'protagonist': (None, h_env.env.observation_space,
                                h_env.env.action_space, {'model': {'custom_model': 'AIIDE',
                                                                   'custom_model_config': {}}})
            },
            'policy_mapping_fn': policy_mapping_fn
        },
        "framework": 'torch',
    }

    def custom_log_creator(custom_path, custom_str):

        timestr = datetime.today().strftime("%Y-%m-%d_%H-%M-%S")
        logdir_prefix = "{}_{}".format(custom_str, timestr)

        def logger_creator(config):
            if not os.path.exists(custom_path):
                os.makedirs(custom_path)
            logdir = tempfile.mkdtemp(prefix=logdir_prefix, dir=custom_path)
            return UnifiedLogger(config, logdir, loggers=None)

        return logger_creator

    # try:
    trainer = PPOTrainer(config=config2, env="h_zelda",
                         logger_creator=custom_log_creator(os.path.join('..', 'enigma_logs'),
                                                           'paired'))
    for i in range(10):
        print(i)
        result = trainer.train()
        trainer.log_result(result)
    print(f"cuda available: {torch.cuda.is_available()}") # True
    print(next(trainer.get_policy('builder').model.parameters()).device) # cpu

    ray.shutdown()

This is just a small test to make sure my multi-agent Regret env is working correctly and processing all three agents, but this execution is representative of what a full run of the algorithm would do.

In general, I use the trainer.train() API more than the tune API because that's more conducive to my research: I can add other logic around the training loop, e.g. managing a population of trainers and/or changing the task a given trainer is training on in an outer loop.
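
As a rough sketch of what I mean (population size and the task-swapping step are placeholders, not my actual code):

    # Sketch only: an outer loop that manages a small population of trainers.
    population = [PPOTrainer(config=config2, env='h_zelda') for _ in range(2)]
    for generation in range(5):
        for t in population:
            result = t.train()
        # placeholder: swap/mutate each trainer's task (e.g. its env_config) here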

When I add num_gpus: 1 to the trainer config, the constructor dies mid-build with the index error, because ray.get_gpu_ids() returns an empty list.
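
Concretely, the only change from the script above is:

    config2['num_gpus'] = 1  # ask RLlib to put the local worker's models on the GPU
    trainer = PPOTrainer(config=config2, env='h_zelda')
    # -> IndexError: list index out of range in torch_policy.py,
    #    because ray.get_gpu_ids() returned [] on my machine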

When I wrap my call in tune:

    # same as above until the trainer constructor
    # ...
    from ray import tune

    stop = {"timesteps_total": 50000}
    results = tune.run(PPOTrainer, config=config2, stop=stop)

I either get the following error (when I set num_gpus to 1 in the config):

Failure # 1 (occurred at 2021-09-01_19-28-18)
Traceback (most recent call last):
  File "D:\miniconda\envs\enigma\lib\site-packages\ray\tune\trial_runner.py", line 739, in _process_trial
    results = self.trial_executor.fetch_result(trial)
  File "D:\miniconda\envs\enigma\lib\site-packages\ray\tune\ray_trial_executor.py", line 729, in fetch_result
    result = ray.get(trial_future[0], timeout=DEFAULT_GET_TIMEOUT)
  File "D:\miniconda\envs\enigma\lib\site-packages\ray\_private\client_mode_hook.py", line 82, in wrapper
    return func(*args, **kwargs)
  File "D:\miniconda\envs\enigma\lib\site-packages\ray\worker.py", line 1566, in get
    raise value
ray.exceptions.RayActorError: The actor died because of an error raised in its creation task, ray::PPO.__init__() (pid=37420, ip=192.168.86.99)
  File "python\ray\_raylet.pyx", line 534, in ray._raylet.execute_task
  File "python\ray\_raylet.pyx", line 484, in ray._raylet.execute_task.function_executor
  File "D:\miniconda\envs\enigma\lib\site-packages\ray\_private\function_manager.py", line 563, in actor_method_executor
    return method(__ray_actor, *args, **kwargs)
  File "D:\miniconda\envs\enigma\lib\site-packages\ray\rllib\agents\trainer_template.py", line 123, in __init__
    Trainer.__init__(self, config, env, logger_creator)
  File "D:\miniconda\envs\enigma\lib\site-packages\ray\rllib\agents\trainer.py", line 584, in __init__
    super().__init__(config, logger_creator)
  File "D:\miniconda\envs\enigma\lib\site-packages\ray\tune\trainable.py", line 103, in __init__
    self.setup(copy.deepcopy(self.config))
  File "D:\miniconda\envs\enigma\lib\site-packages\ray\rllib\agents\trainer.py", line 731, in setup
    self._init(self.config, self.env_creator)
  File "D:\miniconda\envs\enigma\lib\site-packages\ray\rllib\agents\trainer_template.py", line 152, in _init
    num_workers=self.config["num_workers"])
  File "D:\miniconda\envs\enigma\lib\site-packages\ray\rllib\agents\trainer.py", line 819, in _make_workers
    logdir=self.logdir)
  File "D:\miniconda\envs\enigma\lib\site-packages\ray\rllib\evaluation\worker_set.py", line 86, in __init__
    lambda p, pid: (pid, p.observation_space, p.action_space)))
  File "D:\miniconda\envs\enigma\lib\site-packages\ray\_private\client_mode_hook.py", line 82, in wrapper
    return func(*args, **kwargs)
  File "D:\miniconda\envs\enigma\lib\site-packages\ray\worker.py", line 1566, in get
    raise value
ray.exceptions.RayActorError: The actor died because of an error raised in its creation task, ray::RolloutWorker.__init__() (pid=37272, ip=192.168.86.99)
  File "python\ray\_raylet.pyx", line 534, in ray._raylet.execute_task
  File "python\ray\_raylet.pyx", line 484, in ray._raylet.execute_task.function_executor
  File "D:\miniconda\envs\enigma\lib\site-packages\ray\_private\function_manager.py", line 563, in actor_method_executor
    return method(__ray_actor, *args, **kwargs)
  File "D:\miniconda\envs\enigma\lib\site-packages\ray\rllib\evaluation\rollout_worker.py", line 557, in __init__
    policy_dict, policy_config)
  File "D:\miniconda\envs\enigma\lib\site-packages\ray\rllib\evaluation\rollout_worker.py", line 1342, in _build_policy_map
    policy_map[name] = cls(obs_space, act_space, merged_conf)
  File "D:\miniconda\envs\enigma\lib\site-packages\ray\rllib\policy\policy_template.py", line 267, in __init__
    get_batch_divisibility_req=get_batch_divisibility_req,
  File "D:\miniconda\envs\enigma\lib\site-packages\ray\rllib\policy\torch_policy.py", line 157, in __init__
    self.device = self.devices[0]
IndexError: list index out of range

or, if I leave num_gpus out, it simply does not use my GPU.

@aadharna

This was fixed in this PR

I am not sure if that fix made it into the 1.6 release, but I would guess it did.
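
You can double-check which version the env is actually running with:

    import ray
    print(ray.__version__)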

I spun up a conda env with 1.6 installed, but the same error still persists, albeit in a different part of my code now (I have a Ray actor which houses a trainer object remotely):

The actor died because of an error raised in its creation task, ray::SingleAgentSolver.__init__() (pid=40716, ip=192.168.86.99)
  File "python\ray\_raylet.pyx", line 536, in ray._raylet.execute_task
  File "python\ray\_raylet.pyx", line 486, in ray._raylet.execute_task.function_executor
  File "D:\miniconda\envs\foo\lib\site-packages\ray\_private\function_manager.py", line 563, in actor_method_executor
    return method(__ray_actor, *args, **kwargs)
  File "D:\PycharmProjects\thesis\enigma\solvers\SingleAgentSolver.py", line 22, in __init__
    self.trainer = trainer_constructor(config=trainer_config, env=registered_gym_name)
  File "D:\miniconda\envs\foo\lib\site-packages\ray\rllib\agents\trainer_template.py", line 136, in __init__
    Trainer.__init__(self, config, env, logger_creator)
  File "D:\miniconda\envs\foo\lib\site-packages\ray\rllib\agents\trainer.py", line 592, in __init__
    super().__init__(config, logger_creator)
  File "D:\miniconda\envs\foo\lib\site-packages\ray\tune\trainable.py", line 103, in __init__
    self.setup(copy.deepcopy(self.config))
  File "D:\miniconda\envs\foo\lib\site-packages\ray\rllib\agents\trainer_template.py", line 146, in setup
    super().setup(config)
  File "D:\miniconda\envs\foo\lib\site-packages\ray\rllib\agents\trainer.py", line 739, in setup
    self._init(self.config, self.env_creator)
  File "D:\miniconda\envs\foo\lib\site-packages\ray\rllib\agents\trainer_template.py", line 170, in _init
    self.workers = self._make_workers(
  File "D:\miniconda\envs\foo\lib\site-packages\ray\rllib\agents\trainer.py", line 821, in _make_workers
    return WorkerSet(
  File "D:\miniconda\envs\foo\lib\site-packages\ray\rllib\evaluation\worker_set.py", line 103, in __init__
    self._local_worker = self._make_worker(
  File "D:\miniconda\envs\foo\lib\site-packages\ray\rllib\evaluation\worker_set.py", line 399, in _make_worker
    worker = cls(
  File "D:\miniconda\envs\foo\lib\site-packages\ray\rllib\evaluation\rollout_worker.py", line 580, in __init__
    self._build_policy_map(
  File "D:\miniconda\envs\foo\lib\site-packages\ray\rllib\evaluation\rollout_worker.py", line 1375, in _build_policy_map
    self.policy_map.create_policy(name, orig_cls, obs_space, act_space,
  File "D:\miniconda\envs\foo\lib\site-packages\ray\rllib\policy\policy_map.py", line 136, in create_policy
    self[policy_id] = class_(observation_space, action_space,
  File "D:\miniconda\envs\foo\lib\site-packages\ray\rllib\policy\policy_template.py", line 256, in __init__
    self.parent_cls.__init__(
  File "D:\miniconda\envs\foo\lib\site-packages\ray\rllib\policy\torch_policy.py", line 177, in __init__
    raise ValueError(
ValueError: TorchPolicy was not able to find enough GPU IDs! Found [], but num_gpus=0.1.

This persists regardless of the value of num_gpus (fractional or 1) passed through the trainer_constructor config; Ray itself is initialized with num_gpus=1.
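
For reference, a heavily simplified sketch of the actor (the constructor signature is inferred from the traceback above, so treat it as approximate rather than my exact code):

    # Simplified sketch of solvers/SingleAgentSolver.py; the line numbers in the
    # traceback refer to the real file, not this sketch.
    @ray.remote
    class SingleAgentSolver:
        def __init__(self, trainer_constructor, trainer_config, registered_gym_name):
            # trainer_config contains 'num_gpus': 0.1 (I also tried 1)
            self.trainer = trainer_constructor(config=trainer_config,
                                               env=registered_gym_name)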

If I remove num_gpus from the trainer config dict, everything runs, but it places my models on the CPU, which is not the behavior I want.