How severely does this issue affect your experience of using Ray?
- High: It blocks me from completing my task.
I am trying to train a VAE to find a good latent-variable representation of a bunch of simulated data. The VAE suffers from posterior collapse, so I thought it would be useful to run a hyperparameter search. I use something along the lines of the code below:
```python
import math
import os

import numpy as np
from scipy.stats import random_correlation, loguniform
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from torch import nn

import ray
from ray import tune, train
from ray.tune.schedulers import ASHAScheduler
from ray.tune.search.hyperopt import HyperOptSearch
from ray.tune.tune import Trial


class MDVAE(nn.Module):
    # denoising auto-encoder; pass in noisy features, decode to noise-free features;
    # use latent variables for inference
    def __init__(
        self,
        n_data: int,
        n_latent=0.3,
        n_hidden=0.6,
        n_hidden_layers=1,
        bias=True,
        activation=nn.LeakyReLU(0.01),
    ):
        super().__init__()
        # floats are interpreted as fractions of the input dimension
        if isinstance(n_latent, float):
            n_latent = math.ceil(n_latent * n_data)
        if isinstance(n_hidden, float):
            n_hidden = math.ceil(n_hidden * n_data)
        if (n_latent > n_hidden) or (n_latent > n_data):
            raise ValueError
        encoder_layers = [nn.Linear(n_data, n_hidden, bias=bias), activation]
        for i in range(n_hidden_layers):
            encoder_layers.extend([nn.Linear(n_hidden, n_hidden, bias=bias), activation])
        self.encoder = nn.Sequential(*encoder_layers)
        self.mean_lay = nn.Linear(n_hidden, n_latent)
        self.log_var_lay = nn.Linear(n_hidden, n_latent)
        # mirror the encoder to build the decoder
        decoder_layers = []
        for layer in encoder_layers:
            if layer == activation:
                continue
            decoder_layers.extend([nn.Linear(layer.out_features, layer.in_features), activation])
        decoder_layers.extend([nn.Linear(n_latent, n_hidden)])
        self.decoder = nn.Sequential(*decoder_layers[::-1])

    def encode_and_sample(self, x):
        hidden = self.encoder(x)
        mean = self.mean_lay(hidden)
        log_var = self.log_var_lay(hidden)
        # reparametrization trick
        std = torch.exp(0.5 * log_var)
        epsilon = torch.randn_like(std)
        z = mean + std * epsilon
        return z, mean, log_var

    def forward(self, x):
        z, mean, log_var = self.encode_and_sample(x)
        return self.decoder(z), mean, log_var


class VAE_Dataset(Dataset):
    def __init__(self, data: torch.Tensor):
        self.data = data

    def __len__(self):
        return self.data.shape[0]

    def __getitem__(self, idx):
        return self.data[idx], self.data[idx]


_PRR = lambda x: x.to('cpu').data.numpy()


def ray_train(config, show_progress=False):
    _CWD = config.get('_CWD')
    n_data = int(config.get('n_data'))
    n_hidden = float(config.get('n_hidden', 0.6))
    n_latent = int(config.get('n_latent', 0.3))
    n_hidden_layers = int(config.get('n_hidden_layers', 2))
    batch_size = int(config.get('batch_size', 64))
    learning_rate = float(config.get('learning_rate', 1e-3))
    weight_decay = float(config.get('weight_decay', 1e-4))
    KL_factor = float(config.get('KL_factor', 1.0))
    n_epochs = int(config.get('n_epochs', 5))

    train_ds = torch.load(os.path.join(_CWD, 'train_ds.pt'))
    val_ds = torch.load(os.path.join(_CWD, 'val_ds.pt'))
    x_val, y_val = val_ds[:]
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)

    mdvae = MDVAE(
        n_data=n_data,
        n_hidden=n_hidden,
        n_latent=n_latent,
        n_hidden_layers=n_hidden_layers,
    )
    loss_f = nn.MSELoss()
    optimizer = torch.optim.Adam(mdvae.parameters(), lr=learning_rate, weight_decay=weight_decay)

    if show_progress:
        raise NotImplementedError

    for epoch in range(n_epochs):
        for i, (x, y) in enumerate(train_loader):
            x_hat, mean, log_var = mdvae.forward(x)
            reconstruct = loss_f(x_hat, y)
            KL_div = -0.5 * torch.sum(1 + log_var - mean.pow(2) - log_var.exp())
            loss = reconstruct + KL_factor * KL_div
            optimizer.zero_grad()
            # skip optimizer steps on non-finite losses
            if ~(torch.isnan(loss) | torch.isinf(loss)):
                loss.backward()
                optimizer.step()
        with torch.no_grad():
            x_val_hat, mean, log_var = mdvae.forward(x_val)
            reconstruct_val = loss_f(x_val_hat, y_val)
            KL_div_val = -0.5 * torch.sum(1 + log_var - mean.pow(2) - log_var.exp())
            val_loss = reconstruct_val + KL_factor * KL_div_val
        train.report({'val_loss': _PRR(val_loss)})


def short_dirname(trial: Trial):
    return "trial_" + str(trial.trial_id)


def ray_main(
    val_frac: float = 0.1,
    max_n_hidden: float = 0.8,
):
    rng = torch.Generator()
    rng.manual_seed(3)
    np_rng = np.random.RandomState(3)

    # simulate correlated, normalized data
    n_samples = 20000
    n_data = 15
    n_latent = 5
    eigs = np_rng.rand(n_data)
    eigs = eigs * (n_data / sum(eigs))
    corr = torch.from_numpy(
        random_correlation.rvs(eigs, random_state=np_rng)
    )
    chol = torch.linalg.cholesky(corr)
    data = torch.randn((n_samples, n_data), generator=rng, dtype=torch.float64)
    data = data @ chol  # correlated data
    data = (data - torch.mean(data, 0)) / torch.std(data, 0)  # normalized data
    dataset = VAE_Dataset(data.float())

    n_validate = math.ceil(val_frac * n_samples)
    # prepare datasets
    train_ds, val_ds = random_split(
        dataset,
        lengths=(len(dataset) - n_validate, n_validate),
    )
    _CWD = os.getcwd()
    torch.save(train_ds, os.path.join(_CWD, 'train_ds.pt'))
    torch.save(val_ds, os.path.join(_CWD, 'val_ds.pt'))

    # figure out dimensions of the VAE: n_data -> n_hidden -> n_latent == n_free_vars
    min_n_hidden = n_latent / n_data
    if min_n_hidden > max_n_hidden:
        raise ValueError
    config = {
        '_CWD': _CWD,
        'batch_size': tune.choice(list(2 ** np.arange(3, 9))),
        'n_hidden': tune.uniform(min_n_hidden, max_n_hidden),
        'n_data': n_data,
        'n_latent': n_latent,
        'n_hidden_layers': tune.randint(1, 4),
        'learning_rate': tune.loguniform(1e-5, 3e-1),
    }

    if not ray.is_initialized():
        ray.init(
            local_mode=True,
            num_cpus=2,
        )
    metric = 'val_loss'
    mode = 'min'
    hyperopt_search = HyperOptSearch(metric=metric, mode=mode)
    tuner = tune.Tuner(
        ray_train,
        param_space=config,
        tune_config=tune.TuneConfig(
            num_samples=40,
            search_alg=hyperopt_search,
            max_concurrent_trials=6,
            scheduler=ASHAScheduler(time_attr='epoch', metric=metric, mode=mode),
            trial_dirname_creator=short_dirname,
        ),
        run_config=train.RunConfig(
            storage_path=os.path.join(_CWD, 'ray_mdvae'),
            name="test_experiment",
        ),
    )
    results = tuner.fit()
    return results


if __name__ == "__main__":
    results = ray_main()
```
I am getting some very strange results. First, only 1 of the 2 allocated CPUs is used. I am running this job locally on my PC and would like to utilize all cores, but that does not seem to happen. Second, I get a `Windows fatal exception: access violation` that exits the program; this seems to have something to do with threading, but I have not been able to track it down. Third, there is this odd line in the log: `[...] logging to tensorboard: {'batch_size': ('__ref_ph', '97948da7')}`. I'm just selecting a random number with `tune.choice`, so why is it converted to a tuple with strange contents?
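For comparison, this is roughly what I expected `tune.choice` to produce (a minimal sketch outside of the Tuner, assuming the `.sample()` method on the domain objects; it is not part of the failing script):

```python
import numpy as np
from ray import tune

# Sampling the same search-space entry directly yields a plain integer
# such as 64, not a ('__ref_ph', ...) placeholder tuple.
batch_size_domain = tune.choice(list(2 ** np.arange(3, 9)))
print(batch_size_domain.sample())  # e.g. 64
```

The trial table in the output below does show a plain `batch_size` of 128, so the placeholder tuple seems to appear only in the TensorBoard logging step.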
Here is the full report:
```
2024-08-08 02:02:28,203 INFO worker.py:1743 -- Started a local Ray instance. View the dashboard at http://127.0.0.1:8265
2024-08-08 02:02:30,240 INFO tune.py:622 -- [output] This will use the new output engine with verbosity 1. To disable the new output and use the legacy output engine, set the environment variable RAY_AIR_NEW_OUTPUT=0. For more information, please see https://github.com/ray-project/ray/issues/36949
╭──────────────────────────────────────────────────────────╮
│ Configuration for experiment test_experiment │
├──────────────────────────────────────────────────────────┤
│ Search algorithm SearchGenerator │
│ Scheduler AsyncHyperBandScheduler │
│ Number of trials 40 │
╰──────────────────────────────────────────────────────────╯
View detailed results here: C:/python_projects/sbmfi/src/sbmfi/inference/ray_mdvae/test_experiment
To visualize your results with TensorBoard, run: `tensorboard --logdir C:/Users/diede/AppData/Local/Temp/ray/session_2024-08-08_02-02-24_098276_8/artifacts/2024-08-08_02-02-30/test_experiment/driver_artifacts`
:job_id:01000000
:task_name:bundle_reservation_check_func
:job_id:01000000
:task_name:bundle_reservation_check_func
:actor_name:ImplicitFunc
:actor_name:ray_train
:actor_name:ImplicitFunc
:actor_name:ray_train
Trial ray_train_c549ac92 started with configuration:
╭──────────────────────────────────────────────────────────╮
│ Trial ray_train_c549ac92 config │
├──────────────────────────────────────────────────────────┤
│ _CWD ...c\sbmfi\inference │
│ batch_size 128 │
│ learning_rate 0.00494 │
│ n_data 15 │
│ n_hidden 0.67626 │
│ n_hidden_layers 2 │
│ n_latent 5 │
╰──────────────────────────────────────────────────────────╯
Trial status: 1 RUNNING
Current time: 2024-08-08 02:02:31. Total running time: 1s
Logical resource usage: 1.0/2 CPUs, 0/1 GPUs (0.0/1.0 accelerator_type:G)
╭─────────────────────────────────────────────────────────────────────────────────────────────────╮
│ Trial name status batch_size n_hidden n_hidden_layers learning_rate │
├─────────────────────────────────────────────────────────────────────────────────────────────────┤
│ ray_train_c549ac92 RUNNING 128 0.676258 2 0.00494485 │
╰─────────────────────────────────────────────────────────────────────────────────────────────────╯
Trial ray_train_c549ac92 completed after 5 iterations at 2024-08-08 02:02:34. Total running time: 4s
╭─────────────────────────────────────────────╮
│ Trial ray_train_c549ac92 result │
├─────────────────────────────────────────────┤
│ checkpoint_dir_name │
│ time_this_iter_s 0.85798 │
│ time_total_s 4.37792 │
│ training_iteration 5 │
│ val_loss 1.00186 │
╰─────────────────────────────────────────────╯
2024-08-08 02:02:34,870 INFO tensorboardx.py:311 -- Removed the following hyperparameter values when logging to tensorboard: {'batch_size': ('__ref_ph', '97948da7')}
Windows fatal exception: access violation
Stack (most recent call first):
File "C:\Program Files\JetBrains\PyCharm 2024.1.4\plugins\python\helpers\pydev\pydevconsole.py", line 397 in do_exit
File "C:\Miniconda3\envs\sbmfi\lib\site-packages\ray\train\_internal\session.py", line 399 in _report_training_result
File "C:\Miniconda3\envs\sbmfi\lib\site-packages\ray\train\_internal\session.py", line 450 in report
File "C:\Miniconda3\envs\sbmfi\lib\site-packages\ray\train\_internal\session.py", line 743 in report
File "C:\Miniconda3\envs\sbmfi\lib\site-packages\ray\train\_internal\session.py", line 652 in wrapper
File "C:\Miniconda3\envs\sbmfi\lib\site-packages\ray\tune\trainable\function_trainable.py", line 267 in _trainable_func
File "C:\Miniconda3\envs\sbmfi\lib\site-packages\ray\util\tracing\tracing_helper.py", line 467 in _resume_span
File "C:\Miniconda3\envs\sbmfi\lib\site-packages\ray\tune\trainable\function_trainable.py", line 53 in <lambda>
File "C:\Miniconda3\envs\sbmfi\lib\site-packages\ray\air\_internal\util.py", line 88 in run
File "C:\Miniconda3\envs\sbmfi\lib\threading.py", line 932 in _bootstrap_inner
File "C:\Miniconda3\envs\sbmfi\lib\threading.py", line 890 in _bootstrap
Process finished with exit code -1073741819 (0xC0000005)
```