1. Severity of the issue: (select one)
None: I’m just curious or want clarification.
Low: Annoying but doesn’t hinder my work.
Medium: Significantly affects my productivity but can find a workaround.
High: Completely blocks me.
2. Environment:
- Ray version: 2.44.1
- Python version: 3.10
- OS: Linux
- Other libs/tools (if relevant): PyTorch + PyTorch Lightning
3. What happened vs. what you expected:
- Expected: The training run reaches the TERMINATED state and the next trial begins training.
- Actual: The run stays stuck in this resource-lock loop:
I followed this documentation page:
Hyperparameter Tuning with Ray Tune — Ray 2.44.1
The only things I changed were adding a PyTorch Lightning module for my own model and adding my own DataModule. I also run the script with RAY_TRAIN_V2_ENABLED=1 set (see the snippet below).
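For reference, this is roughly how I enable the flag (the script name train_tune.py is just a placeholder; setting it in the script before Ray is imported should, as far as I understand, be equivalent to setting it on the command line):

# Equivalent to launching with: RAY_TRAIN_V2_ENABLED=1 python train_tune.py
import os

os.environ["RAY_TRAIN_V2_ENABLED"] = "1"  # set before importing ray, to be safe

import ray
from ray import tune
from ray.train.torch import TorchTrainer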
Trainer Setup:
def train_driver(config, train_config):
    metric = train_config["metric"]
    mode = train_config["mode"]
    batch_size = train_config["batch_size"]
    num_workers = train_config["num_workers"]
    cpus_per_worker = train_config["cpu_per_worker"]
    epochs = train_config["epochs"]
    save_epoch = train_config["save_epoch"]
    encoder_name = train_config["encoder_name"]
    num_classes = train_config["classes"]
    lr = config["lr"]
    data_path_dict = train_config["data_path_dict"]
    label_map = getLabelMap(train_config["label_maps"])
    seed = train_config["seed"]

    ray_trainer = TorchTrainer(
        train_func,
        train_loop_config={
            "lr": lr,
            "batch_size": batch_size,
            "cpus_per_worker": cpus_per_worker,
            "epochs": epochs,
            "save_epoch": save_epoch,
            "encoder_name": encoder_name,
            "classes": num_classes,
            "seed": seed,
            "label_map": label_map,
            "data_path_dict": data_path_dict,
        },
        scaling_config=ray.train.ScalingConfig(
            num_workers=num_workers,
            use_gpu=True,
            resources_per_worker={
                "CPU": cpus_per_worker,
                "GPU": 1,
            },
        ),
        run_config=ray.train.RunConfig(
            name=f"ray_train_{ray.tune.get_context().get_trial_name()}",
            storage_path="/workspace/ray_train_results",
            checkpoint_config=ray.train.CheckpointConfig(
                num_to_keep=5,
                checkpoint_score_attribute=metric,
                checkpoint_score_order=mode,
            ),
            callbacks=[TuneReportCallback()],
        ),
    )
    ray_trainer.fit()
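The train_func referenced above is not shown in the Trainer Setup; it follows the Lightning pattern from the docs, roughly like the sketch below (MyLightningModule and MyDataModule are placeholders for my own model and data module):

import lightning.pytorch as pl
from ray.train.lightning import (
    RayDDPStrategy,
    RayLightningEnvironment,
    RayTrainReportCallback,
    prepare_trainer,
)

def train_func(config):
    # Placeholders for my own LightningModule / DataModule
    model = MyLightningModule(
        lr=config["lr"],
        encoder_name=config["encoder_name"],
        num_classes=config["classes"],
    )
    datamodule = MyDataModule(
        data_path_dict=config["data_path_dict"],
        batch_size=config["batch_size"],
        label_map=config["label_map"],
    )

    trainer = pl.Trainer(
        max_epochs=config["epochs"],
        accelerator="auto",
        devices="auto",
        strategy=RayDDPStrategy(),
        plugins=[RayLightningEnvironment()],
        callbacks=[RayTrainReportCallback()],  # reports metrics/checkpoints back to Ray Train
        enable_progress_bar=False,
    )
    trainer = prepare_trainer(trainer)
    trainer.fit(model, datamodule=datamodule)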
Tuner Setup:
ray.init()

search_space = {
    "lr": tune.uniform(1e-5, 1e-2),
}

train_config = {
    "batch_size": 16,
    "num_workers": 3,
    "epochs": 10,
    "save_epoch": 2,
    "encoder_name": "resnet50",
    "metric": "F1 Score",
    "mode": "max",
    "classes": 4,
    "cpu_per_worker": 4,
    "data_path_dict": {
        "stereo_train": ["/data/OI_Data/Stereo_Data/Training_Dataset_V5_0/678dd434ac665a006b73b647_OI4SampleSelector_train_960_576/"],
        "stereo_val": None,
        "mono_train": ["/data/OI_Data/Mono_Data/Train_sets/679104459a27c6c13547b9d8_OIv7_21SampleSelector_train_960_576/"],
        "mono_val": None,
    },
    "label_maps": label_maps,
    "seed": 42,
}

scheduler = ASHAScheduler(
    metric=train_config["metric"],
    mode=train_config["mode"],
    max_t=train_config["epochs"],
    grace_period=1,
    reduction_factor=2,
)

tuner = tune.Tuner(
    tune.with_parameters(
        train_driver,
        train_config=train_config,
    ),
    param_space=search_space,
    tune_config=tune.TuneConfig(
        num_samples=2,
        scheduler=scheduler,
        max_concurrent_trials=1,
    ),
    run_config=ray.tune.RunConfig(
        name=f"ray_tune_SegFormer_{train_config['encoder_name']}",
        storage_path="/workspace/ray_tune_results",
    ),
)

result = tuner.fit()
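For completeness, when a run does finish I read the results back with the standard ResultGrid API, e.g.:

best = result.get_best_result(metric=train_config["metric"], mode=train_config["mode"])
print(best.config)   # best hyperparameters found
print(best.metrics)  # last reported metrics for that trial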