I am using Ray Tune with PyTorch Lightning and wandb.
ray==2.7.0
pytorch-lightning==2.0.8
wandb==0.15.9
My code is pasted below. Checkpointing has become a blocker: every sync attempt fails with the two errors shown below, both of which boil down to the same pyarrow path mismatch (a small stand-alone illustration of what I think is happening follows the second traceback). I believe this relates to a known Windows checkpointing issue identified in this Ray GitHub issue. It was supposed to be fixed in Ray 2.7, but it appears to persist. Any idea how to fix this?
ERROR 1:
2023-09-26 13:17:35,381 WARNING experiment_state.py:311 -- The previous sync of the experiment directory to the cloud failed with the error: GetFileInfo() yielded path 'C:/Users/hseely/ray_results/check_effect_of_normals_ocnn_lenet', which is outside base dir 'C:/Users/hseely/ray_results\check_effect_of_normals_ocnn_lenet'
Syncing will be retried.
ERROR 2:
2023-09-26 13:20:07,106 WARNING syncer.py:458 -- Last sync command failed with the following error:
Traceback (most recent call last):
File "C:\Users\hseely\AppData\Local\miniconda3\envs\rq2\lib\site-packages\ray\train\_internal\syncer.py", line 456, in _launch_sync_process
self.wait()
File "C:\Users\hseely\AppData\Local\miniconda3\envs\rq2\lib\site-packages\ray\train\_internal\syncer.py", line 530, in wait
raise e
File "C:\Users\hseely\AppData\Local\miniconda3\envs\rq2\lib\site-packages\ray\train\_internal\syncer.py", line 528, in wait
self._sync_process.wait(timeout=self.sync_timeout)
File "C:\Users\hseely\AppData\Local\miniconda3\envs\rq2\lib\site-packages\ray\train\_internal\syncer.py", line 203, in wait
raise exception
File "C:\Users\hseely\AppData\Local\miniconda3\envs\rq2\lib\site-packages\ray\train\_internal\syncer.py", line 166, in entrypoint
result = self._fn(*args, **kwargs)
File "C:\Users\hseely\AppData\Local\miniconda3\envs\rq2\lib\site-packages\ray\train\_internal\storage.py", line 221, in _upload_to_fs_path
_upload_to_uri_with_exclude_fsspec(
File "C:\Users\hseely\AppData\Local\miniconda3\envs\rq2\lib\site-packages\ray\train\_internal\storage.py", line 234, in _upload_to_uri_with_exclude_fsspec
_pyarrow_fs_copy_files(
File "C:\Users\hseely\AppData\Local\miniconda3\envs\rq2\lib\site-packages\ray\train\_internal\storage.py", line 115, in _pyarrow_fs_copy_files
return pyarrow.fs.copy_files(
File "C:\Users\hseely\AppData\Local\miniconda3\envs\rq2\lib\site-packages\pyarrow\fs.py", line 244, in copy_files
_copy_files_selector(source_fs, source_sel,
File "pyarrow\_fs.pyx", line 1229, in pyarrow._fs._copy_files_selector
File "pyarrow\error.pxi", line 99, in pyarrow.lib.check_status
pyarrow.lib.ArrowInvalid: GetFileInfo() yielded path 'C:/Users/hseely/ray_results/check_effect_of_normals_ocnn_lenet', which is outside base dir 'C:/Users/hseely/ray_results\check_effect_of_normals_ocnn_lenet' ```
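The way I read both errors, the trial path is reported with forward slashes only, while the "base dir" that pyarrow compares it against mixes a forward-slash prefix with a single backslash, which looks like the result of an os.path.join on Windows. A minimal stand-alone illustration of that mismatch (paths copied from the message above; nothing here touches ray, so this is just my interpretation):

```
import os

# On Windows, os.path.join inserts a backslash, reproducing the mixed-separator
# "base dir" string from the error message.
base = os.path.join("C:/Users/hseely/ray_results", "check_effect_of_normals_ocnn_lenet")
print(base)  # C:/Users/hseely/ray_results\check_effect_of_normals_ocnn_lenet

# The forward-slash-only trial path is then not a string prefix of the base dir,
# which mirrors the "outside base dir" ArrowInvalid above.
trial = "C:/Users/hseely/ray_results/check_effect_of_normals_ocnn_lenet"
print(trial.startswith(base))  # False on Windows
```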
MY CODE:
# Ray code based on this tutorial: https://docs.ray.io/en/latest/tune/examples/tune-pytorch-lightning.html
import yaml
import os
import pytorch_lightning as pl
from ray import tune
from ray.train.torch import TorchTrainer, TorchConfig
from ray.train import ScalingConfig, RunConfig, CheckpointConfig, SyncConfig
from ray.air.integrations.wandb import WandbLoggerCallback
from ray.train.lightning import (RayDDPStrategy, RayLightningEnvironment, RayTrainReportCallback, prepare_trainer)
from utils.dataset import BiomassDataModule
from utils.trainer import LitModel

def train_func(tune_cfg):
    current_directory = os.getcwd()
    print("Current ray working directory:", current_directory)

    # Load hps from the YAML config and overwrite them with the ray tune sample for this trial
    with open(r'D:/Sync/RQ2/Analysis/config.yaml', "r") as yamlfile:
        cfg = yaml.load(yamlfile, Loader=yaml.FullLoader)
    cfg.update(tune_cfg)

    # Create a Lightning model
    model = LitModel(cfg)

    # Enable gradient clipping if specified
    if cfg['gradient_clip']:
        gradient_clip_val = 0.5
        gradient_clip_algorithm = "norm"
    else:
        gradient_clip_val = None
        gradient_clip_algorithm = None

    # Create a Lightning Trainer
    trainer = pl.Trainer(
        max_epochs=cfg['num_epochs'],
        accelerator="gpu",
        devices=cfg['n_devices'],
        num_nodes=cfg['n_nodes'],
        precision=cfg['precision'],
        strategy=RayDDPStrategy(),
        plugins=[RayLightningEnvironment()],
        callbacks=[RayTrainReportCallback()],
        enable_progress_bar=False,
        # Gradient clipping: https://lightning.ai/docs/pytorch/latest/advanced/training_tricks.html
        gradient_clip_val=gradient_clip_val,
        gradient_clip_algorithm=gradient_clip_algorithm,
    )

    # Validate the lightning trainer configuration for ray
    trainer = prepare_trainer(trainer)

    # Build the datasets on each worker
    data_module = BiomassDataModule(cfg=cfg)

    # Train model
    trainer.fit(model, datamodule=data_module)
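
(Aside, to make the data flow explicit: param_space={"train_loop_config": search_space} in tune_model below means each trial calls train_func with only the sampled keys, which then overwrite the matching values loaded from config.yaml. A toy illustration of that merge, with made-up values:)

```
# Toy illustration only, not part of the training script; values are made up.
tune_cfg = {"use_normals": True}                 # one sample from the search space
cfg = {"use_normals": False, "num_epochs": 50}   # stand-in for the YAML config
cfg.update(tune_cfg)
print(cfg)  # {'use_normals': True, 'num_epochs': 50} -> the tuned value wins
```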

def tune_model(experiment_name, wandb_project, n_trials, search_space, target_metric="val_loss", direction="min",
               time_budget_s=None, search_method=None, trial_scheduler=None):
    """
    Performs hyperparameter tuning with the ray tune library using either bayesian optimization or random search.
    Results are logged to wandb.

    :param experiment_name: name of the experiment to log to wandb
    :param wandb_project: name of the wandb project to log to
    :param n_trials: number of trials to run. If -1, runs until time_budget_s is reached
    :param search_space: dictionary of hyperparameters to tune, built with ray tune search functions
    :param target_metric: name of the metric to optimize
    :param direction: direction of optimization ("min" or "max")
    :param time_budget_s: duration of the experiment in seconds
    :param search_method: ray tune search algorithm to use. If None, random search is used
    :param trial_scheduler: ray tune trial scheduler to use. If None, the default scheduler is used
    :return: None
    """

    # Turn off all other loggers so that only wandb is used
    os.environ["TUNE_DISABLE_AUTO_CALLBACK_LOGGERS"] = "1"

    # Read in the config and pass the hyperparameters that stay static throughout tuning to wandb
    with open(r'D:/Sync/RQ2/Analysis/config.yaml', "r") as yamlfile:
        static_cfg = yaml.load(yamlfile, Loader=yaml.FullLoader)
    tuned_hp_keys = search_space.keys()
    static_cfg = {key: value for key, value in static_cfg.items() if key not in tuned_hp_keys}

    ray_trainer = TorchTrainer(
        train_func,
        torch_config=TorchConfig(backend='gloo'),
        scaling_config=ScalingConfig(num_workers=1, use_gpu=True, resources_per_worker={"CPU": 1, "GPU": 2}),
    )

    tuner = tune.Tuner(
        ray_trainer,
        param_space={"train_loop_config": search_space},
        tune_config=tune.TuneConfig(
            metric=target_metric,
            mode=direction,
            num_samples=n_trials,
            time_budget_s=time_budget_s,
            search_alg=search_method,
            scheduler=trial_scheduler,
        ),
        run_config=RunConfig(
            name=experiment_name,
            callbacks=[WandbLoggerCallback(project=wandb_project, config=static_cfg)],
            storage_path=os.path.expanduser("~/ray_logs"),
            checkpoint_config=CheckpointConfig(
                num_to_keep=1,
                checkpoint_score_attribute="val_loss",
                checkpoint_score_order="min"),
            sync_config=SyncConfig(sync_artifacts=False, sync_artifacts_on_checkpoint=False),
        ),
    )

    tuner.fit()

    print("\n****\n***\nTUNING COMPLETE!")

if __name__ == "__main__":
    search_space = {
        "use_normals": tune.grid_search([False, True]),
    }

    tune_model(experiment_name="check_effect_of_normals_ocnn_lenet", wandb_project="check_effect_of_normals",
               search_space=search_space, n_trials=4,  # time_budget_s=60 * 60,
               search_method=None)
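
The only workaround I have come up with so far is forcing the storage path to contain forward slashes only, on the guess that the failure comes from the mixed separators. This is just a sketch of that variation (same RunConfig parameters I already use, wandb callback omitted for brevity; I have not confirmed it avoids the pyarrow error):

```
from pathlib import Path
from ray.train import RunConfig, CheckpointConfig, SyncConfig

run_config = RunConfig(
    name="check_effect_of_normals_ocnn_lenet",
    # Forward-slash-only path instead of os.path.expanduser("~/ray_logs")
    storage_path=Path("~/ray_logs").expanduser().as_posix(),
    checkpoint_config=CheckpointConfig(num_to_keep=1,
                                       checkpoint_score_attribute="val_loss",
                                       checkpoint_score_order="min"),
    sync_config=SyncConfig(sync_artifacts=False, sync_artifacts_on_checkpoint=False),
)
```

Is something like this the right direction, or is there a proper fix or patched version I should use instead?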