Not all CPUs used and Windows access violation error

How severe does this issue affect your experience of using Ray?

  • High: It blocks me from completing my task.

I am trying to train a VAE to find a good latent-variable representation of a bunch of simulated data. The VAE suffers from posterior collapse, so I thought it would be useful to run a hyperparameter search. I use something along the lines of the code below:
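For context, the objective the script below optimizes is the usual KL-weighted reconstruction loss, where KL_factor plays the role of the beta weight; when the KL term dominates, the approximate posterior collapses onto the prior, which is the failure mode I am trying to tune against:

    loss = MSE(x_hat, x) + KL_factor * KL_div,   with   KL_div = -1/2 * sum(1 + log_var - mean^2 - exp(log_var))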

import math
import os
import numpy as np
from scipy.stats import random_correlation, loguniform

import torch
from torch.utils.data import Dataset, DataLoader, random_split
from torch import nn

import ray
from ray import tune, train
from ray.tune.schedulers import ASHAScheduler
from ray.tune.search.hyperopt import HyperOptSearch
from ray.tune.tune import Trial


class MDVAE(nn.Module):
    # denoising auto-encoder; pass in noisy features, decode to noise-free features; use latent variables for inference
    def __init__(
            self,
            n_data: int,
            n_latent = 0.3,
            n_hidden = 0.6,
            n_hidden_layers = 1,
            bias = True,
            activation=nn.LeakyReLU(0.01),
    ):
        super().__init__()
        if isinstance(n_latent, float):
            n_latent = math.ceil(n_latent * n_data)
        if isinstance(n_hidden, float):
            n_hidden = math.ceil(n_hidden * n_data)

        if (n_latent > n_hidden) or (n_latent > n_data):
            raise ValueError("n_latent must not exceed n_hidden or n_data")

        # build the encoder: input -> hidden, plus n_hidden_layers extra hidden layers
        encoder_layers = [nn.Linear(n_data, n_hidden, bias=bias), activation]
        for i in range(n_hidden_layers):
            encoder_layers.extend([nn.Linear(n_hidden, n_hidden, bias=bias), activation])

        self.encoder = nn.Sequential(*encoder_layers)
        self.mean_lay = nn.Linear(n_hidden, n_latent)
        self.log_var_lay = nn.Linear(n_hidden, n_latent)

        # mirror the encoder to build the decoder: latent -> hidden -> ... -> data
        decoder_layers = []
        for layer in encoder_layers:
            if layer == activation:
                continue
            decoder_layers.extend([nn.Linear(layer.out_features, layer.in_features), activation])
        decoder_layers.extend([nn.Linear(n_latent, n_hidden)])
        self.decoder = nn.Sequential(*decoder_layers[::-1])

    def encode_and_sample(self, x):
        hidden = self.encoder(x)
        mean = self.mean_lay(hidden)
        log_var = self.log_var_lay(hidden)

        # reparametrization trick
        std = torch.exp(0.5 * log_var)
        epsilon = torch.randn_like(std)
        z = mean + std * epsilon
        return z, mean, log_var

    def forward(self, x):
        z, mean, log_var = self.encode_and_sample(x)
        return self.decoder(z), mean, log_var

class VAE_Dataset(Dataset):
    def __init__(self, data: torch.Tensor):
        self.data = data

    def __len__(self):
        return self.data.shape[0]

    def __getitem__(self, idx):
        return self.data[idx], self.data[idx]


_PRR = lambda x: x.detach().cpu().numpy()  # tensor -> plain numpy value for reporting
def ray_train(config, show_progress=False):
    _CWD = config.get('_CWD')
    n_data = int(config.get('n_data'))
    n_hidden = float(config.get('n_hidden', 0.6))
    n_latent = config.get('n_latent', 0.3)  # fraction (float) or absolute size (int); MDVAE handles both
    n_hidden_layers = int(config.get('n_hidden_layers', 2))
    batch_size = int(config.get('batch_size', 64))
    learning_rate = float(config.get('learning_rate', 1e-3))
    weight_decay = float(config.get('weight_decay', 1e-4))
    KL_factor = float(config.get('KL_factor', 1.0))
    n_epochs = int(config.get('n_epochs', 5))

    train_ds = torch.load(os.path.join(_CWD, 'train_ds.pt'))
    val_ds = torch.load(os.path.join(_CWD, 'val_ds.pt'))
    x_val, y_val = val_ds[:]
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)

    mdvae = MDVAE(
        n_data=n_data,
        n_hidden=n_hidden,
        n_latent=n_latent,
        n_hidden_layers=n_hidden_layers,
    )

    loss_f = nn.MSELoss()
    optimizer = torch.optim.Adam(mdvae.parameters(), lr=learning_rate, weight_decay=weight_decay)

    if show_progress:
        raise NotImplementedError

    for epoch in range(n_epochs):
        for i, (x, y) in enumerate(train_loader):
            x_hat, mean, log_var = mdvae.forward(x)
            reconstruct = loss_f(x_hat, y)
            KL_div = - 0.5 * torch.sum(1 + log_var - mean.pow(2) - log_var.exp())
            loss = reconstruct + KL_factor * KL_div
            optimizer.zero_grad()
            if torch.isfinite(loss):  # skip the update if the loss is NaN or inf
                loss.backward()
                optimizer.step()

        with torch.no_grad():
            x_val_hat, mean, log_var = mdvae.forward(x_val)
            reconstruct_val = loss_f(x_val_hat, y_val)
            KL_div_val = - 0.5 * torch.sum(1 + log_var - mean.pow(2) - log_var.exp())
            val_loss = reconstruct_val + KL_factor * KL_div_val

            train.report({'val_loss': _PRR(val_loss)})

def short_dirname(trial: Trial):
    return "trial_" + str(trial.trial_id)

def ray_main(
        val_frac: float = 0.1,
        max_n_hidden: float = 0.8,
):
    rng = torch.Generator()
    rng.manual_seed(3)
    np_rng = np.random.RandomState(3)

    n_samples = 20000
    n_data = 15
    n_latent = 5

    eigs = np_rng.rand(n_data)
    eigs = eigs * (n_data / sum(eigs))
    corr = torch.from_numpy(
        random_correlation.rvs(eigs, random_state=np_rng)
    )
    chol = torch.linalg.cholesky(corr)

    data = torch.randn((n_samples, n_data), generator=rng, dtype=torch.float64)
    data = data @ chol  # correlated data
    data = (data - torch.mean(data, 0)) / torch.std(data, 0)  # normalized data
    dataset = VAE_Dataset(data.float())

    n_validate = math.ceil(val_frac * n_samples)

    # prepare datasets
    train_ds, val_ds = random_split(
        dataset,
        lengths=(len(dataset) - n_validate, n_validate),
    )

    _CWD = os.getcwd()
    torch.save(train_ds, os.path.join(_CWD, 'train_ds.pt'))
    torch.save(val_ds, os.path.join(_CWD, 'val_ds.pt'))

    # figure out dimensions of the VAE n_data -> n_hidden -> n_latent == n_free_vars
    min_n_hidden = n_latent / n_data
    if min_n_hidden > max_n_hidden:
        raise ValueError("n_latent / n_data is larger than max_n_hidden")

    config = {
        '_CWD': _CWD,
        'batch_size': tune.choice(list(2 ** np.arange(3, 9))),
        'n_hidden': tune.uniform(min_n_hidden, max_n_hidden),
        'n_data': n_data,
        'n_latent': n_latent,
        'n_hidden_layers': tune.randint(1, 4),
        'learning_rate': tune.loguniform(1e-5, 3e-1),
    }
    if not ray.is_initialized():
        ray.init(
            local_mode=True,
            num_cpus=2,
        )
    metric = 'val_loss'
    mode = 'min'
    hyperopt_search = HyperOptSearch(metric=metric, mode=mode)
    tuner = tune.Tuner(
        ray_train,
        param_space=config,
        tune_config=tune.TuneConfig(
            num_samples=40,
            search_alg=hyperopt_search,
            max_concurrent_trials=6,
            scheduler=ASHAScheduler(time_attr='epoch', metric=metric, mode=mode),
            trial_dirname_creator=short_dirname
        ),
        run_config=train.RunConfig(storage_path=os.path.join(_CWD, 'ray_mdvae'), name="test_experiment")
    )
    results = tuner.fit()

    return results

if __name__ == "__main__":
    results = ray_main()

I am getting some very strange results. First, only 1 of the 2 allocated CPUs is used. I am running this job locally on my PC and would love to utilize all cores, but that does not seem to be happening. Second, I get a Windows fatal exception: access violation that exits the program; it seems to have something to do with threading, but I have not been able to track it down. Third, there is this odd line in the output: [...] logging to tensorboard: {'batch_size': ('__ref_ph', '97948da7')}. I'm just selecting a random number with tune.choice, so why is it converted to a tuple with these strange contents?
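To make the questions concrete, here is a minimal sketch of the two things I am unsure about. The assumptions here are mine and unconfirmed: that local_mode=True might be what serializes the trials, that tune.with_resources is the right knob for requesting one CPU per trial, and that the numpy int64 values produced by 2 ** np.arange(3, 9) might be what ends up as the ('__ref_ph', ...) placeholder.

import numpy as np
import ray
from ray import tune

# assumption: maybe local_mode=True runs everything in a single process;
# same 2-CPU budget, but without local mode
if not ray.is_initialized():
    ray.init(num_cpus=2)

# assumption: maybe the numpy int64 values from 2 ** np.arange(3, 9) are what
# gets swapped for the ('__ref_ph', ...) placeholder; same choices as plain ints
batch_sizes = [int(b) for b in 2 ** np.arange(3, 9)]  # [8, 16, 32, 64, 128, 256]
config_patch = {
    'batch_size': tune.choice(batch_sizes),
    # ... remaining keys exactly as in the config dict in ray_main above ...
}

# assumption: is this the right way to request 1 CPU per trial, so that two trials
# can run concurrently on the 2 CPUs? (ray_train is the trainable defined above)
trainable = tune.with_resources(ray_train, {'cpu': 1})

Would any of these be expected to change the three behaviors above, or am I looking in the wrong place?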

Here is the full report:

2024-08-08 02:02:28,203	INFO worker.py:1743 -- Started a local Ray instance. View the dashboard at http://127.0.0.1:8265 
2024-08-08 02:02:30,240	INFO tune.py:622 -- [output] This will use the new output engine with verbosity 1. To disable the new output and use the legacy output engine, set the environment variable RAY_AIR_NEW_OUTPUT=0. For more information, please see https://github.com/ray-project/ray/issues/36949
╭──────────────────────────────────────────────────────────╮
│ Configuration for experiment     test_experiment         │
├──────────────────────────────────────────────────────────┤
│ Search algorithm                 SearchGenerator         │
│ Scheduler                        AsyncHyperBandScheduler │
│ Number of trials                 40                      │
╰──────────────────────────────────────────────────────────╯
View detailed results here: C:/python_projects/sbmfi/src/sbmfi/inference/ray_mdvae/test_experiment
To visualize your results with TensorBoard, run: `tensorboard --logdir C:/Users/diede/AppData/Local/Temp/ray/session_2024-08-08_02-02-24_098276_8/artifacts/2024-08-08_02-02-30/test_experiment/driver_artifacts`
:job_id:01000000
:task_name:bundle_reservation_check_func
:job_id:01000000
:task_name:bundle_reservation_check_func
:actor_name:ImplicitFunc
:actor_name:ray_train
:actor_name:ImplicitFunc
:actor_name:ray_train
Trial ray_train_c549ac92 started with configuration:
╭──────────────────────────────────────────────────────────╮
│ Trial ray_train_c549ac92 config                          │
├──────────────────────────────────────────────────────────┤
│ _CWD                                ...c\sbmfi\inference │
│ batch_size                                           128 │
│ learning_rate                                    0.00494 │
│ n_data                                                15 │
│ n_hidden                                         0.67626 │
│ n_hidden_layers                                        2 │
│ n_latent                                               5 │
╰──────────────────────────────────────────────────────────╯
Trial status: 1 RUNNING
Current time: 2024-08-08 02:02:31. Total running time: 1s
Logical resource usage: 1.0/2 CPUs, 0/1 GPUs (0.0/1.0 accelerator_type:G)
╭─────────────────────────────────────────────────────────────────────────────────────────────────╮
│ Trial name           status       batch_size     n_hidden     n_hidden_layers     learning_rate │
├─────────────────────────────────────────────────────────────────────────────────────────────────┤
│ ray_train_c549ac92   RUNNING             128     0.676258                   2        0.00494485 │
╰─────────────────────────────────────────────────────────────────────────────────────────────────╯
Trial ray_train_c549ac92 completed after 5 iterations at 2024-08-08 02:02:34. Total running time: 4s
╭─────────────────────────────────────────────╮
│ Trial ray_train_c549ac92 result             │
├─────────────────────────────────────────────┤
│ checkpoint_dir_name                         │
│ time_this_iter_s                    0.85798 │
│ time_total_s                        4.37792 │
│ training_iteration                        5 │
│ val_loss                            1.00186 │
╰─────────────────────────────────────────────╯
2024-08-08 02:02:34,870	INFO tensorboardx.py:311 -- Removed the following hyperparameter values when logging to tensorboard: {'batch_size': ('__ref_ph', '97948da7')}
Windows fatal exception: access violation
Stack (most recent call first):
  File "C:\Program Files\JetBrains\PyCharm 2024.1.4\plugins\python\helpers\pydev\pydevconsole.py", line 397 in do_exit
  File "C:\Miniconda3\envs\sbmfi\lib\site-packages\ray\train\_internal\session.py", line 399 in _report_training_result
  File "C:\Miniconda3\envs\sbmfi\lib\site-packages\ray\train\_internal\session.py", line 450 in report
  File "C:\Miniconda3\envs\sbmfi\lib\site-packages\ray\train\_internal\session.py", line 743 in report
  File "C:\Miniconda3\envs\sbmfi\lib\site-packages\ray\train\_internal\session.py", line 652 in wrapper
  File "C:\Miniconda3\envs\sbmfi\lib\site-packages\ray\tune\trainable\function_trainable.py", line 267 in _trainable_func
  File "C:\Miniconda3\envs\sbmfi\lib\site-packages\ray\util\tracing\tracing_helper.py", line 467 in _resume_span
  File "C:\Miniconda3\envs\sbmfi\lib\site-packages\ray\tune\trainable\function_trainable.py", line 53 in <lambda>
  File "C:\Miniconda3\envs\sbmfi\lib\site-packages\ray\air\_internal\util.py", line 88 in run
  File "C:\Miniconda3\envs\sbmfi\lib\threading.py", line 932 in _bootstrap_inner
  File "C:\Miniconda3\envs\sbmfi\lib\threading.py", line 890 in _bootstrap
Process finished with exit code -1073741819 (0xC0000005)