Hi I also had same error, and I wonder is it possible to use tune.with_parameters
just because I had problems with dataset size, but not necessarily because I want to tune hyperparams ?
for quick PoC if I can with large dataset, then I should also be able to do with smaller dataset using tune.with_parameters
so I try to apply that to fashion mnist dataset, base on this, I modify the data (truncated a bit)
import argparse
from typing import Dict
from ray.air import session
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor
import ray.train as train
from ray.train.torch import TorchTrainer
from ray.air.config import ScalingConfig
from ray.tune import Tuner
from ray import tune
# Download training data from open datasets.
training_data = datasets.FashionMNIST(
root="~/data",
train=True,
download=True,
transform=ToTensor(),
)
testing_data = datasets.FashionMNIST(...
# exactly same as docs
)
# Define model
class NeuralNetwork(nn.Module):
# exactly same as docs
def train_epoch(dataloader, model, loss_fn, optimizer):
# exactly same as docs
def validate_epoch(dataloader, model, loss_fn):
# exactly same as docs
def train_func_tune(config, train_data, test_data):
batch_size = config["batch_size"]
lr = config["lr"]
epochs = config["epochs"]
worker_batch_size = batch_size // session.get_world_size()
# Create data loaders.
train_dataloader = DataLoader(train_data, batch_size=worker_batch_size)
test_dataloader = DataLoader(test_data, batch_size=worker_batch_size)
train_dataloader = train.torch.prepare_data_loader(train_dataloader)
test_dataloader = train.torch.prepare_data_loader(test_dataloader)
# Create model.
model = NeuralNetwork()
model = train.torch.prepare_model(model)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
for _ in range(epochs):
train_epoch(train_dataloader, model, loss_fn, optimizer)
loss = validate_epoch(test_dataloader, model, loss_fn)
session.report(dict(loss=loss))
if __name__ == "__main__":
tuner = Tuner(
tune.with_parameters(train_func_tune,
config=config,
train_data=training_data, test_data=test_data)
)
results = tuner.fit()
results
Then it only says trials did not complete, it doesn’t specify which part is causing it
any help is appreciated
update: following some suggestion I cd into ray_results
dir and got this error.txt, why it complains about my config having multiple argument?
Failure # 1 (occurred at 2023-04-10_10-46-11)
ray::ImplicitFunc.train() (pid=863, ip=10.42.59.67, repr=train_func_tune)
File "/opt/conda/lib/python3.8/site-packages/ray/tune/trainable/trainable.py", line 368, in train
raise skipped from exception_cause(skipped)
File "/opt/conda/lib/python3.8/site-packages/ray/tune/trainable/function_trainable.py", line 337, in entrypoint
return self._trainable_func(
File "/opt/conda/lib/python3.8/site-packages/ray/tune/trainable/function_trainable.py", line 654, in _trainable_func
output = fn()
File "/opt/conda/lib/python3.8/site-packages/ray/tune/trainable/util.py", line 406, in _inner
return inner(config, checkpoint_dir=None)
File "/opt/conda/lib/python3.8/site-packages/ray/tune/trainable/util.py", line 398, in inner
return trainable(config, **fn_kwargs)
TypeError: train_func_tune() got multiple values for argument 'config'