Pytorch+ray train example not working

How severe does this issue affect your experience of using Ray?

  • High: It blocks me to complete my task.

I was trying pytorch+ray train example [Get Started with PyTorch — Ray 2.8.0](ray torch train example)

import torch
torch.version
‘1.13.1+cu117’
import ray;ray.version
‘2.8.0’

import tempfile
import torch
from torchvision.models import resnet18
from torchvision.datasets import FashionMNIST
from torchvision.transforms import ToTensor, Normalize, Compose
from torch.utils.data import DataLoader
from torch.optim import Adam
from torch.nn import CrossEntropyLoss

# Model, Loss, Optimizer
model = resnet18(num_classes=10)
model.conv1 = torch.nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
criterion = CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=0.001)

# Data
transform = Compose([ToTensor(), Normalize((0.5,), (0.5,))])
train_data = FashionMNIST(root='./data', train=True, download=True, transform=transform)
train_loader = DataLoader(train_data, batch_size=128, shuffle=True)

# Training
for epoch in range(10):
    for images, labels in train_loader:
        outputs = model(images)
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    checkpoint_dir = tempfile.gettempdir()
    checkpoint_path = checkpoint_dir + "/model.checkpoint"
    torch.save(model.state_dict(), checkpoint_path)

it doesnot work throwing this error (tried with and without use_gpu=True) :

RuntimeError: Attempting to deserialize object on a CUDA device but torch.cuda.is_available() is False. If you are running on a CPU-only machine, please use torch.load with map_location=torch.device(‘cpu’) to map your storages to the CPU.
2023-11-07 15:07:19,866 ERROR tune_controller.py:1383 – Trial task failed for trial TorchTrainer_f3f79_00000
Traceback (most recent call last):

Can you share the code you’re running? Are you calling torch.load somewhere?

I am using above code taken from ray website and NO, I am not using torch.loadanywhere!

i am getting this error now @matthewdeng :

ERROR tune_controller.py:1383 – Trial task failed for trial TorchTrainer_ebf6a_00000
Traceback (most recent call last):
File “/home/mayag88/anaconda3/envs/ray_env/lib/python3.10/site-packages/ray/air/execution/_internal/event_manager.py”, line 110, in resolve_future
result = ray.get(future)
File “/home/mayag88/anaconda3/envs/ray_env/lib/python3.10/site-packages/ray/_private/auto_init_hook.py”, line 24, in auto_init_wrapper
return fn(*args, **kwargs)
File “/home/mayag88/anaconda3/envs/ray_env/lib/python3.10/site-packages/ray/_private/client_mode_hook.py”, line 103, in wrapper
return func(*args, **kwargs)
File “/home/mayag88/anaconda3/envs/ray_env/lib/python3.10/site-packages/ray/_private/worker.py”, line 2563, in get
raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(RuntimeError): ray::_Inner.train() (pid=37791, ip=87.77.183.165, actor_id=b7d92face11831bce637948301000000, repr=TorchTrainer)
File “/home/mayag88/anaconda3/envs/ray_env/lib/python3.10/site-packages/ray/tune/trainable/trainable.py”, line 342, in train
raise skipped from exception_cause(skipped)
File “/home/mayag88/anaconda3/envs/ray_env/lib/python3.10/site-packages/ray/train/_internal/utils.py”, line 43, in check_for_failure
ray.get(object_ref)
ray.exceptions.RayTaskError(RuntimeError): ray::_RayTrainWorker__execute.get_next() (pid=37866, ip=87.77.183.165, actor_id=55719d785f8d44e5a17e462b01000000, repr=<ray.train._internal.worker_group.RayTrainWorker object at 0x7f19b6089db0>)
File “/home/mayag88/anaconda3/envs/ray_env/lib/python3.10/site-packages/ray/train/_internal/worker_group.py”, line 33, in __execute
raise skipped from exception_cause(skipped)
File “/home/mayag88/anaconda3/envs/ray_env/lib/python3.10/site-packages/ray/train/_internal/utils.py”, line 118, in discard_return_wrapper
train_func(*args, **kwargs)
File “/home/mayag88/Documents/Code/Rep_learaning/flower/examples/quickstart-CiR/example_ray.py”, line 32, in train_func
train_data = FashionMNIST(
File “/home/mayag88/anaconda3/envs/ray_env/lib/python3.10/site-packages/torchvision/datasets/mnist.py”, line 99, in init
self.download()
File “/home/mayag88/anaconda3/envs/ray_env/lib/python3.10/site-packages/torchvision/datasets/mnist.py”, line 187, in download
download_and_extract_archive(url, download_root=self.raw_folder, filename=filename, md5=md5)
File “/home/mayag88/anaconda3/envs/ray_env/lib/python3.10/site-packages/torchvision/datasets/utils.py”, line 434, in download_and_extract_archive
download_url(url, download_root, filename, md5)
File “/home/mayag88/anaconda3/envs/ray_env/lib/python3.10/site-packages/torchvision/datasets/utils.py”, line 155, in download_url
raise RuntimeError(“File not found or corrupted.”)
RuntimeError: File not found or corrupted.

Training errored after 0 iterations at 2023-11-08 14:37:25. Total running time: 13s
Error file: /home/mayag88/ray_results/TorchTrainer_2023-11-08_14-37-09/TorchTrainer_ebf6a_00000_0_2023-11-08_14-37-12/error.txt

2023-11-08 14:37:25,293 ERROR tune.py:1043 – Trials did not complete: [TorchTrainer_ebf6a_00000]
ray.exceptions.RayTaskError(RuntimeError): ray::_Inner.train() (pid=37791, ip=87.77.183.165, actor_id=b7d92face11831bce637948301000000, repr=TorchTrainer)
File “/home/mayag88/anaconda3/envs/ray_env/lib/python3.10/site-packages/ray/tune/trainable/trainable.py”, line 342, in train
raise skipped from exception_cause(skipped)
File “/home/mayag88/anaconda3/envs/ray_env/lib/python3.10/site-packages/ray/train/_internal/utils.py”, line 43, in check_for_failure
ray.get(object_ref)
ray.exceptions.RayTaskError(RuntimeError): ray::_RayTrainWorker__execute.get_next() (pid=37866, ip=87.77.183.165, actor_id=55719d785f8d44e5a17e462b01000000, repr=<ray.train._internal.worker_group.RayTrainWorker object at 0x7f19b6089db0>)
File “/home/mayag88/anaconda3/envs/ray_env/lib/python3.10/site-packages/ray/train/_internal/worker_group.py”, line 33, in __execute
raise skipped from exception_cause(skipped)
File “/home/mayag88/anaconda3/envs/ray_env/lib/python3.10/site-packages/ray/train/_internal/utils.py”, line 118, in discard_return_wrapper
train_func(*args, **kwargs)
File “/home/mayag88/Documents/Code/Rep_learaning/flower/examples/quickstart-CiR/example_ray.py”, line 32, in train_func
train_data = FashionMNIST(
File “/home/mayag88/anaconda3/envs/ray_env/lib/python3.10/site-packages/torchvision/datasets/mnist.py”, line 99, in init
self.download()
File “/home/mayag88/anaconda3/envs/ray_env/lib/python3.10/site-packages/torchvision/datasets/mnist.py”, line 187, in download
download_and_extract_archive(url, download_root=self.raw_folder, filename=filename, md5=md5)
File “/home/mayag88/anaconda3/envs/ray_env/lib/python3.10/site-packages/torchvision/datasets/utils.py”, line 434, in download_and_extract_archive
download_url(url, download_root, filename, md5)
File “/home/mayag88/anaconda3/envs/ray_env/lib/python3.10/site-packages/torchvision/datasets/utils.py”, line 155, in download_url
raise RuntimeError(“File not found or corrupted.”)
RuntimeError: File not found or corrupted.

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
File “/home/mayag88/Documents/Code/Rep_learaning/flower/examples/quickstart-CiR/example_ray.py”, line 75, in
result = trainer.fit()
File “/home/mayag88/anaconda3/envs/ray_env/lib/python3.10/site-packages/ray/train/base_trainer.py”, line 618, in fit
raise TrainingFailedError(
ray.train.base_trainer.TrainingFailedError: The Ray Train run failed. Please inspect the previous error messages for a cause. After fixing the issue (assuming that the error is not caused by your own application logic, but rather an error such as OOM), you can restart the run from scratch or continue this run.
To continue this run, you can use: trainer = TorchTrainer.restore("/home/mayag88/ray_results/TorchTrainer_2023-11-08_14-37-09").
To start a new run that will retry on training failures, set train.RunConfig(failure_config=train.FailureConfig(max_failures)) in the Trainer’s run_config with max_failures > 0, or max_failures = -1 for unlimited retries.
(RayTrainWorker pid=37867) Extracting ./data/FashionMNIST/raw/train-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw [repeated 2x across cluster]
(RayTrainWorker pid=37866) [repeated 3x across cluster]
(RayTrainWorker pid=37867) Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz [repeated 2x across cluster]
(RayTrainWorker pid=37867) Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz to ./data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz
100%|██████████| 4422102/4422102 [00:01<00:00, 2716572.04it/s] [repeated 6x across cluster]
0%| | 0/4422102 [00:00<?, ?it/s] [repeated 2x across cluster]
29%|██▉ | 1277952/4422102 [00:00<00:00, 6289497.23it/s] [repeated 25x across cluster]

updated code to run CPU only

import tempfile
import torch
from torchvision.models import resnet18
from torchvision.datasets import FashionMNIST
from torchvision.transforms import ToTensor, Normalize, Compose
from torch.utils.data import DataLoader
from torch.optim import Adam
from torch.nn import CrossEntropyLoss

from ray.train.torch import TorchTrainer

from ray.train import ScalingConfig, Checkpoint
import ray


def train_func(config):
    # Model, Loss, Optimizer
    model = resnet18(num_classes=10)
    model.conv1 = torch.nn.Conv2d(
        1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False
    )

    # [1] Prepare model.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    criterion = CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr=0.001)

    # Data
    transform = Compose([ToTensor(), Normalize((0.5,), (0.5,))])
    train_data = FashionMNIST(
        root="./data", train=True, download=True, transform=transform
    )
    train_loader = DataLoader(train_data, batch_size=128, shuffle=True)

    # [2] Prepare dataloader.
    train_loader = ray.train.torch.prepare_data_loader(train_loader)

    # Training
    for epoch in range(10):
        for images, labels in train_loader:
            images = images.to(device)
            labels = labels.to(device)

            outputs = model(images)
            loss = criterion(outputs, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        checkpoint_dir = tempfile.gettempdir()
        checkpoint_path = checkpoint_dir + "/model.checkpoint"
        torch.save(model.state_dict(), checkpoint_path)

        # [3] Report metrics and checkpoint.
        ray.train.report(
            {"loss": loss.item()}, checkpoint=Checkpoint.from_directory(checkpoint_dir)
        )


# [4] Configure scaling and resource requirements.
scaling_config = ScalingConfig(
    num_workers=2,
    use_gpu=False,  # Use GPU
    resources_per_worker={
        "CPU": 4,  # Reduce CPU to allocate more GPU resources
        # "GPU": 0.25,  # Fraction of one GPU per worker
    },
)

# [5] Launch distributed training job.
trainer = TorchTrainer(train_func, scaling_config=scaling_config)

result = trainer.fit()

Oh this looks like an error in loading the dataset, maybe because there are two workers. Can you try this and see if it resolves the issue?

    from filelock import FileLock
    with FileLock("/tmp/dataset.lock"):
        train_data = FashionMNIST(
            root="./data", train=True, download=True, transform=transform
        )