Torch does not find cuda inside the ray

Guilherme_Parreira_d · August 19, 2025, 6:34pm

1. Severity of the issue: (select one)
None: I’m just curious or want clarification.
Low: Annoying but doesn’t hinder my work.
Medium: Significantly affects my productivity but can find a workaround.
High: Completely blocks me.

2. Environment:

Ray version: 2.48.0
Python version: 3.11.11
OS: Ubuntu 24.03.3 LTS
Cloud/Infrastructure: Ubuntu Server
Other libs/tools (if relevant): TimeMoe e Cronos

3. What happened vs. what you expected:

Expected:
- I am migrating my pipeline to Ray. Before I used joblib Parallel.
- With Parallel I was able to run my code, however as processes starts independently, every foundation model (TimeMoe and Cronos) had to be loaded on each processor using Parallel.
- With Ray actors I want to load these large models only once, and then pass then to every process. Every process will just run the prediction.
Wishes:
- I want to load both models in a single GPU and make the predictions with them. Each takes 1Gb of memory and my GPU has 4 cards with 44Gb of memory each.
- I want to be able to run in local_mode = [True, False] because sometimes I am going to need to debug.
Actual:
- I have the following error: 2025-08-19 18:18:06,363 ERROR serialization.py:533 -- Attempting to deserialize object on CUDA device 0 but torch.cuda.device_count() is 0. Please use torch.load with map_location to map your storages to an existing device.
- I tried to fix with torch.load() but I received another error from trorch ;

4. My pipeline code:

import torch
import os
import torch
from chronos import BaseChronosPipeline
from core_forecast.model_forecast import PreTrainedModels
import ray
from model_hyper_parameters import model_params
from event_based_multi_modelos import process_series
from transformers import AutoModelForCausalLM
 
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

if not torch.cuda.is_available():
    raise RuntimeError("CUDA is not available")
else:
    print(f"CUDA version in Pytorch: {torch.version.cuda}")
    device = "cuda"

# --- Model Actor for Chronos ---
@ray.remote(num_gpus=0.25)
class ChronosModelActor:
    def __init__(self, model_path: str, device: str, torch_dtype):
        self.device = device if torch.cuda.is_available() else "cpu"
        self.model = BaseChronosPipeline.from_pretrained(model_path, device_map=self.device, torch_dtype=torch_dtype)

    def predict(self, context_tensor, forecast_horizon: int):
        context_tensor = context_tensor.to(self.device)
        with torch.no_grad():
            _, mean = self.model.predict_quantiles(context=context_tensor, prediction_length=forecast_horizon)
        return mean.detach().cpu()

# --- Model Actor for TimeMoE ---
@ray.remote(num_gpus=0.25)
class TimeMoEModelActor:
    def __init__(self, model_path: str, device: str, torch_dtype):
        self.device = device if torch.cuda.is_available() else "cpu"
        self.model = AutoModelForCausalLM.from_pretrained(
            pretrained_model_name_or_path=model_path,
            device_map=self.device,  ## MM, era "cpu"
            trust_remote_code=True,
            torch_dtype=torch_dtype,
            
        )

        self.model = self.model.to(self.device)

    def predict(self, context_tensor, forecast_horizon: int):
        context_tensor = context_tensor.to(self.device)
        # You can adjust this method based on TimeMoE inference specifics
        with torch.no_grad():  # aqui não calcula gradientes
            output = self.model.generate(context_tensor, max_new_tokens=forecast_horizon)
        
        forecast = output[:, -forecast_horizon:]
        return forecast.cpu()


# --- Main Parallel Execution ---
@ray.remote
def parallel_process_series(
    ray_actors, directories, path, yml_input, externas_categoricas, externas_numericas, scaler_y, kill_model_in_train, variaveis_usuario_nao_serao_usadas
):
    # Dispatch parallel tasks
    futures = []
    for directory in directories:
        futures.append(process_series(ray_actors, directory, path, yml_input, externas_categoricas, externas_numericas, scaler_y, kill_model_in_train, variaveis_usuario_nao_serao_usadas))

    results = ray.get(futures)
    return results

N_CORES = 10
ray.init(num_cpus=N_CORES, num_gpus=1, include_dashboard=False, local_mode=True)

# Load models as Ray Actors
chronos_actor = ChronosModelActor.remote(os.path.join(BASE_PATH_TRAINED_MODELS, model_params.get("BaseChronosPipeline")["model_name"][0]), device, torch.bfloat16)
timemoe_actor = TimeMoEModelActor.remote(os.path.join(BASE_PATH_TRAINED_MODELS, model_params.get("TimeMoE")["model_name"][0]), device, torch.bfloat16)
# Example usage:
results_ref = [parallel_process_series.remote(
    ray_actors={"chronos": chronos_actor, "timemoe": timemoe_actor},
    directories=directories,
    path=path,
    yml_input=yml_input,
    externas_categoricas=VARIAVEIS_EXTERNAS_CATEGORICAS,
    externas_numericas=VARIAVEIS_EXTERNAS_NUMERICAS,
    scaler_y=SCALER_Y,
    kill_model_in_train=KILL_MODEL_FIT_IN_TRAIN,
    variaveis_usuario_nao_serao_usadas=VARIAVEIS_USUARIO_NAO_SERAO_USADAS
)]

# Esse método é fail-first. Ou seja, se um erro acontecer, ele para imediatamente. Para deixar ele rodando, tem que adaptar o código, não basta mudar um argumento. Então, deixei ele fixo.
ray.get(results_ref)

5. A piece of my process_series function

def process_series(
    ray_actors,
    d,
    path,
    yml_input,
    externas_categoricas,
    externas_numericas,
    scaler_y,
    kill_model_in_train,
    variaveis_usuario_nao_serao_usadas,
):
# ...... code .... #
class PreTrainedModels:
    
    # Reproducibility
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
    os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"
    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8"
    os.environ["TF_CUDNN_DETERMINISTIC"] = "true"  # I include for TimeMoe
    os.environ["TF_DETERMINISTIC_OPS"] = "true"  # I include for TimeMoe

    random.seed(42)
    np.random.seed(42)
    torch.manual_seed(42)
    os.environ["PYTHONHASHSEED"] = str(42)
    # Fix seeds in torch
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(42)
    # transformers.set_seed(42, deterministic=True)

    # torch_dtype = torch.float32  # More precision
    torch_dtype = torch.bfloat16
    chunk_size = 600  # How many IDs should I pass at once to any PreTrained model to make prediction

    model = ray_actors.get("cronos")
    _, mean_ref = ray.get(model.predict.remote(
            context_tensor=context_tensor,
            forecast_horizon=forecast_horizon,
        ))
    mean = ray.get(mean_ref)

I tried to add:

 `torch.load(os.path.join(BASE_PATH_TRAINED_MODELS, model_params.get("BaseChronosPipeline")["model_name"][0]), map_location=device)`
 `torch.load(os.path.join(BASE_PATH_TRAINED_MODELS, model_params.get("TimeMoE")["model_name"][0]), map_location=device)`

But I got: IsADirectoryError: [Errno 21] Is a directory: '/mnt/data/gui/hub/models--amazon--chronos-bolt-base/snapshots/6f8ced46a499ae1dfd399981f551152d756cf4f6'.

Any clue what may be missing in my code?

Guilherme_Parreira_d · August 26, 2025, 4:26pm

I was able to solve it. So many issues around. I will post the answer:

num_cpus_actor = .01
num_gpus_actor = .03
# --- Model Actor for Chronos ---
@ray.remote(num_gpus=num_gpus_actor, num_cpus=num_cpus_actor)
class ChronosModelActor:
    # it will substitute the predict_chronos_from_2d_tensor()
    # I firstly load the model
    def __init__(self, model_path: str, device: str, torch_dtype):
        self.device = device if torch.cuda.is_available() else "cpu"
        print("ray seeing gpus", ray.get_gpu_ids())
        self.model = BaseChronosPipeline.from_pretrained(model_path, device_map=self.device, torch_dtype=torch_dtype)

    def predict(self, context_tensor, forecast_horizon: int):
        context_tensor = context_tensor.to(self.device)
        with torch.no_grad():
            _, mean = self.model.predict_quantiles(context=context_tensor, prediction_length=forecast_horizon)
        return mean

# --- Model Actor for TimeMoE ---
@ray.remote(num_gpus=num_gpus_actor, num_cpus=num_cpus_actor)
class TimeMoEModelActor:
    def __init__(self, model_path: str, device: str, torch_dtype):
        self.device = device if torch.cuda.is_available() else "cpu"
        self.model = AutoModelForCausalLM.from_pretrained(
            pretrained_model_name_or_path=model_path,
            device_map=self.device,  ## MM, era "cpu"
            # attn_implementation="flash_attention_2",  ## It is not available in Docker - https://github.com/Dao-AILab/flash-attention/issues/1220
            trust_remote_code=True,
            torch_dtype=torch_dtype,
            
        )
        self.model = self.model.to(self.device)

    def predict(self, context_tensor, forecast_horizon: int):
        context_tensor = context_tensor.to(self.device)
        # You can adjust this method based on TimeMoE inference specifics
        with torch.no_grad():  # aqui não calcula gradientes
            output = self.model.generate(context_tensor, max_new_tokens=forecast_horizon)
        
        forecast = output[:, -forecast_horizon:]
        return forecast.cpu()

# Over all setup of process
N_CORES = 40
ray.init(num_cpus=N_CORES, num_gpus=1, include_dashboard=True, local_mode=False)

# Load models as Ray Actors
chronos_actor = ChronosModelActor.remote(os.path.join(BASE_PATH_TRAINED_MODELS, model_params.get("BaseChronosPipeline")["model_name"][0]), 
                                         device, torch.bfloat16)
timemoe_actor = TimeMoEModelActor.remote(os.path.join(BASE_PATH_TRAINED_MODELS, model_params.get("TimeMoE")["model_name"][0]), device, torch.bfloat16)


futures = []
for directory in directories:
    futures.append(
        process_series.remote(
            ray_actors={"chronos": chronos_actor, "timemoe": timemoe_actor},
            d=directory,
            path=path,
            yml_input=yml_input,
            externas_categoricas=VARIAVEIS_EXTERNAS_CATEGORICAS,
            externas_numericas=VARIAVEIS_EXTERNAS_NUMERICAS,
            scaler_y=SCALER_Y,
            kill_model_in_train=KILL_MODEL_FIT_IN_TRAIN,
            variaveis_usuario_nao_serao_usadas=VARIAVEIS_USUARIO_NAO_SERAO_USADAS,
        )
    )

results = ray.get(futures)

@ray_debug
@ray.remote(num_cpus=.95, num_gpus=0)
def process_series(
    ray_actors,
    .... )

    @staticmethod
    def predict_chronos_from_2d_tensor_ray(context_tensor, forecast_horizon: int, ray_actors):
        model = ray_actors.get("chronos")
        # Make the predictions for all ids at once
        mean = ray.get(model.predict.remote(
            context_tensor=context_tensor,
            forecast_horizon=forecast_horizon,
        ))
        return mean

Topic		Replies	Views
CUDA error: all CUDA-capable devices are busy or unavailable Ray Tune	4	1832	February 11, 2022
[Ray Core] RuntimeError: No CUDA GPUs are available Ray Core	5	5011	October 15, 2022
Cannot checkpoint a simple model RLlib	4	123	June 6, 2025
Pytorch+ray train example not working Ray Train	4	807	November 9, 2023
Tensor parallel inference with deepspeed on ray	1	125	September 27, 2024

Torch does not find cuda inside the ray

Related topics