Hello. I am a student from Korea who has just started studying Ray.
I used a translation tool, so my sentences may sound awkward. Please understand.
I tried running the “Getting Started with Distributed Model Training in Ray Train” example from the Ray documentation, but I encountered an error. I would like to know how to resolve it.
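In case the versions matter, this is how I checked my environment (a minimal check; I am pasting only the code, not my output):

import ray
import torch

# Print the installed Ray and PyTorch versions.
print(ray.__version__)
print(torch.__version__)

Here is the full script I ran: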
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor
def get_dataset():
    # Download FashionMNIST to ./data and convert the images to tensors.
    return datasets.FashionMNIST(
        root="./data",
        train=True,
        download=True,
        transform=ToTensor(),
    )
class NeuralNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(28 * 28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10),
        )

    def forward(self, inputs):
        inputs = self.flatten(inputs)
        logits = self.linear_relu_stack(inputs)
        return logits
from ray import train
def train_func_distributed():
    num_epochs = 10
    batch_size = 64

    dataset = get_dataset()
    dataloader = DataLoader(dataset, batch_size=batch_size)
    # Shard the dataloader across workers and move batches to the right device.
    dataloader = train.torch.prepare_data_loader(dataloader)

    model = NeuralNetwork()
    # Wrap the model in DistributedDataParallel and move it to the right device.
    model = train.torch.prepare_model(model)

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

    for epoch in range(num_epochs):
        for inputs, labels in dataloader:
            optimizer.zero_grad()
            pred = model(inputs)
            loss = criterion(pred, labels)
            loss.backward()
            optimizer.step()
        print(f"epoch: {epoch}, loss: {loss.item()}")
from ray.train.torch import TorchTrainer
from ray.air.config import ScalingConfig

# For GPU Training, set `use_gpu` to True.
use_gpu = False

trainer = TorchTrainer(
    train_func_distributed,
    scaling_config=ScalingConfig(num_workers=4, use_gpu=use_gpu),
)

trainer.fit()
This is the error message I get at the end of the run:
== Status ==
Current time: 2023-07-25 17:05:48 (running for 00:01:35.56)
Using FIFO scheduling algorithm.
Logical resource usage: 9.0/72 CPUs, 0/0 GPUs
Result logdir: /home/sykang/ray_results/TorchTrainer_2023-07-25_17-04-12
Number of trials: 1/1 (1 RUNNING)
+--------------------------+----------+-----------------------+
| Trial name | status | loc |
|--------------------------+----------+-----------------------|
| TorchTrainer_47823_00000 | RUNNING | 192.168.0.201:2502631 |
+--------------------------+----------+-----------------------+
Trial TorchTrainer_47823_00000 completed. Last result:
== Status ==
Current time: 2023-07-25 17:05:53 (running for 00:01:40.36)
Using FIFO scheduling algorithm.
Logical resource usage: 9.0/72 CPUs, 0/0 GPUs
Result logdir: /home/sykang/ray_results/TorchTrainer_2023-07-25_17-04-12
Number of trials: 1/1 (1 TERMINATED)
+--------------------------+------------+-----------------------+
| Trial name | status | loc |
|--------------------------+------------+-----------------------|
| TorchTrainer_47823_00000 | TERMINATED | 192.168.0.201:2502631 |
+--------------------------+------------+-----------------------+
2023-07-25 17:06:03,612 INFO tune.py:1111 -- Total run time: 110.68 seconds (100.36 seconds for the tuning loop).
2023-07-25 17:06:03,614 WARNING experiment_analysis.py:910 -- Failed to read the results for 1 trials:
- /home/sykang/ray_results/TorchTrainer_2023-07-25_17-04-12/TorchTrainer_47823_00000_0_2023-07-25_17-04-12
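While searching, I started to suspect that the warning "Failed to read the results for 1 trials" appears because my training function never reports any metrics back to Ray, so there is nothing for the result reader to load. This is only my guess. Below is a sketch of the change I am considering, assuming ray.air.session.report is the right reporting API for my Ray version (get_dataset and NeuralNetwork are the same as above):

from ray.air import session

def train_func_distributed():
    num_epochs = 10
    batch_size = 64

    dataloader = train.torch.prepare_data_loader(
        DataLoader(get_dataset(), batch_size=batch_size)
    )
    model = train.torch.prepare_model(NeuralNetwork())

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

    for epoch in range(num_epochs):
        for inputs, labels in dataloader:
            optimizer.zero_grad()
            loss = criterion(model(inputs), labels)
            loss.backward()
            optimizer.step()
        # Report per-epoch metrics so the trial produces a result Ray can read.
        session.report({"epoch": epoch, "loss": loss.item()})

Is reporting metrics like this the intended fix, or is the warning caused by something else?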