Hello,
I seem to be having an issue with saving checkpoints and loading them back up.
Around the end of a trial I get multiple "No such file or directory" errors and the trial stops.
Here are the error logs:
Failure # 1 (occurred at 2024-06-26_01-17-18)
ray::ImplicitFunc.train() (pid=106336, ip=127.0.0.1, actor_id=66972fd4a7dc27ce3bfde81701000000, repr=train_stock_model)
File "python\ray\_raylet.pyx", line 1893, in ray._raylet.execute_task
File "python\ray\_raylet.pyx", line 1834, in ray._raylet.execute_task.function_executor
File "C:\ProgramData\anaconda3\envs\tf-gpu\lib\site-packages\ray\_private\function_manager.py", line 691, in actor_method_executor
return method(__ray_actor, *args, **kwargs)
File "C:\ProgramData\anaconda3\envs\tf-gpu\lib\site-packages\ray\util\tracing\tracing_helper.py", line 467, in _resume_span
return method(self, *_args, **_kwargs)
File "C:\ProgramData\anaconda3\envs\tf-gpu\lib\site-packages\ray\tune\trainable\trainable.py", line 331, in train
raise skipped from exception_cause(skipped)
File "C:\ProgramData\anaconda3\envs\tf-gpu\lib\site-packages\ray\air\_internal\util.py", line 98, in run
self._ret = self._target(*self._args, **self._kwargs)
File "C:\ProgramData\anaconda3\envs\tf-gpu\lib\site-packages\ray\tune\trainable\function_trainable.py", line 174, in <lambda>
training_func=lambda: self._trainable_func(self.config),
File "C:\ProgramData\anaconda3\envs\tf-gpu\lib\site-packages\ray\util\tracing\tracing_helper.py", line 467, in _resume_span
return method(self, *_args, **_kwargs)
File "C:\ProgramData\anaconda3\envs\tf-gpu\lib\site-packages\ray\tune\trainable\function_trainable.py", line 248, in _trainable_func
output = fn()
File "C:\ProgramData\anaconda3\envs\tf-gpu\lib\site-packages\ray\tune\trainable\util.py", line 130, in inner
return trainable(config, **fn_kwargs)
File "c:\users\administrator\downloads\python\temp.py", line 84, in train_stock_model
model.summary()
File "C:\ProgramData\anaconda3\envs\tf-gpu\lib\site-packages\keras\engine\training.py", line 3219, in summary
layer_utils.print_summary(
File "C:\ProgramData\anaconda3\envs\tf-gpu\lib\site-packages\keras\utils\layer_utils.py", line 320, in print_summary
print_fn('Model: "{}"'.format(model.name))
File "C:\ProgramData\anaconda3\envs\tf-gpu\lib\site-packages\keras\utils\io_utils.py", line 77, in print_msg
sys.stdout.write(message + "\n")
File "C:\ProgramData\anaconda3\envs\tf-gpu\lib\site-packages\ray\_private\utils.py", line 412, in write
self.stream.write(data)
FileNotFoundError: [Errno 2] No such file or directory
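Looking at the traceback, the write that actually fails is Ray's redirected sys.stdout inside model.summary(), not the checkpoint file itself. As a workaround I am considering routing the summary through a print_fn instead of the worker's stdout; this is just a rough sketch of what I mean (the logger name is made up):

import logging

logger = logging.getLogger("train_stock_model")  # placeholder logger name

# Send the summary lines to logging instead of the worker's redirected
# sys.stdout stream, which is what raises the FileNotFoundError above.
model.summary(print_fn=logger.info)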
Here are some other warnings I am getting as well:
2024-06-26 01:17:12,051 WARNING util.py:201 -- The `on_step_begin` operation took 4.172 s, which may be a performance bottleneck.
This is my current config for Ray:
ray.init(configure_logging=True, log_to_driver=True, num_gpus=4, ignore_reinit_error=True)  # logging_level=logging.DEBUG
algo = TuneBOHB()
scheduler = HyperBandForBOHB(
    time_attr="training_iteration",
    max_t=100,
    stop_last_trials=True,
)
X_train20, X_test20, y_train20, y_test20 = gendata_lstm.GenerateData.GenerateData(20, 20, None)
data = {
    "X_t20": X_train20,
    "X_tt20": X_test20,
    "y_t20": y_train20,
    "y_tt20": y_test20,
}
trainable_with_resources = tune.with_resources(
    train_stock_model, {"cpu": 3, "gpu": 0.1, "accelerator_type:A100": 0.025}
)
tuner = tune.Tuner(
    tune.with_parameters(trainable_with_resources, data=data),
    tune_config=tune.TuneConfig(
        metric="val_acc",
        mode="max",
        search_alg=algo,
        scheduler=scheduler,
        num_samples=3000,
        reuse_actors=False,
    ),
    run_config=train.RunConfig(
        name="1k_1_5k",
        storage_path="Z:\\Models",
    ),
    param_space={
        "seq_length": tune.choice([20]),
        "lr": tune.loguniform(0.0005, 0.005),
        "l1": tune.choice([1024, 1536]),
        "l2": tune.choice([1024, 1536]),
        "l3": tune.choice([1024, 1536]),
        "l4": tune.choice([1024, 1536]),
        "l1_dropout": tune.uniform(0.1, 0.2),
        "l2_dropout": tune.uniform(0.1, 0.2),
        "l3_dropout": tune.uniform(0.1, 0.2),
        "l4_dropout": tune.uniform(0.1, 0.2),
        "decay": tune.loguniform(0.00005, 0.001),
        "alpha": tune.loguniform(0.0005, 0.005),
        "batch_size": tune.choice([8]),
        "conv1d_filters": tune.choice([16]),
        "conv1d_kernel": tune.choice([3]),
        "num_conv_layers": tune.choice([2]),
        "max_pool_size": 1,
    },
)
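Not sure whether it is related, but I have also been thinking about adding a FailureConfig / CheckpointConfig to the RunConfig so that a trial that dies on one of these errors gets retried and fewer checkpoints pile up on Z:\. A sketch of what I have in mind, assuming I am reading the ray.train docs correctly:

from ray import train

run_config = train.RunConfig(
    name="1k_1_5k",
    storage_path="Z:\\Models",
    # Retry a crashed trial a few times instead of failing it outright.
    failure_config=train.FailureConfig(max_failures=3),
    # Keep only the most recent checkpoints to cut down disk traffic on Z:\.
    checkpoint_config=train.CheckpointConfig(num_to_keep=2),
)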
And here is the model.fit call where I am checkpointing:
model.fit(
    train_data, batch_size=config["batch_size"], epochs=100,
    validation_data=val_data, verbose=False,
    callbacks=[ReportCheckpointCallback(metrics={
        "mean_accuracy": "accuracy", "mean_loss": "loss",
        "val_loss": "val_loss", "val_acc": "val_accuracy"})],
)
I am not sure why this happens; when I check, the file is actually there. It almost seems like there is a race condition somewhere for a specific trial: before it finishes writing the checkpoint, it tries to load it again. If someone could help or point me to what might be happening, please let me know!
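To rule out a race in the callback itself, one thing I might try is reporting the checkpoint manually: write the model to a local temp directory first and only hand the finished directory to Ray. This is just a sketch based on my reading of train.report / Checkpoint.from_directory (ManualReportCallback is a made-up name, not something I have validated), which would replace the ReportCheckpointCallback above with callbacks=[ManualReportCallback()]:

import os
import tempfile

from ray import train
from tensorflow import keras


class ManualReportCallback(keras.callbacks.Callback):
    """Hypothetical stand-in for ReportCheckpointCallback: save the model
    locally first, then report the completed directory to Ray."""

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        with tempfile.TemporaryDirectory() as tmpdir:
            # The checkpoint is fully written to local disk before Ray sees it.
            self.model.save(os.path.join(tmpdir, "model.h5"))
            train.report(
                {
                    "mean_accuracy": logs.get("accuracy"),
                    "mean_loss": logs.get("loss"),
                    "val_loss": logs.get("val_loss"),
                    "val_acc": logs.get("val_accuracy"),
                },
                checkpoint=train.Checkpoint.from_directory(tmpdir),
            )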