Hello everyone, I am using Ray Tune PBT to tune my model, but I can’t find the saved model in the checkpoint directory. At the end of training I can get the best config, but I can’t get the best model. I also get errors like this:
(pid=27000) 2021-01-11 17:16:43.197133: W tensorflow/core/framework/op_kernel.cc:1767] OP_REQUIRES failed at save_restore_v2_ops.cc:109
: Not found: Failed to create a NewWriteableFile: D:\probe\pbt_checkpoint\pbt_test\MLPmodel_39793_00000_0_af_0=0,af_1=2,af_2=1,af_3=0,af_4=2,af_5=2,af_6=2,af_7=1,af_output=3,batchsize=849,num_layers=5,units_0=498,_2021-01-11_17-15-41\variables\variables_temp_62b4641374534df4bd63c5ecfd5991b3/part-00000-of-00001.data-00000-of-00001.tempstate15293471527192116649 : 系统找不到指定的路径。 [The system cannot find the path specified.]
(pid=27000) ; No such process
2021-01-11 17:16:43,978 ERROR worker.py:980 – Possible unhandled error from worker: ray::MLPmodel.save_to_object() (pid=27000, ip=172.16.1.32)
File “python\ray_raylet.pyx”, line 463, in ray._raylet.execute_task
File “python\ray_raylet.pyx”, line 415, in ray._raylet.execute_task.function_executor
File “D:\anaconda3\envs\BA_37\lib\site-packages\ray\function_manager.py”, line 556, in actor_method_executor
return method(__ray_actor, *args, **kwargs)
File “D:\anaconda3\envs\BA_37\lib\site-packages\ray\tune\trainable.py”, line 295, in save_to_object
checkpoint_path = self.save(tmpdir)
File “D:\anaconda3\envs\BA_37\lib\site-packages\ray\tune\trainable.py”, line 278, in save
checkpoint = self.save_checkpoint(checkpoint_dir)
File “d:/Probe/PBT/PBT_probe.py”, line 94, in save_checkpoint
self.model.save(file_path)
File “D:\anaconda3\envs\BA_37\lib\site-packages\tensorflow\python\keras\engine\training.py”, line 1979, in save
signatures, options)
File “D:\anaconda3\envs\BA_37\lib\site-packages\tensorflow\python\keras\saving\save.py”, line 134, in save_model
signatures, options)
File “D:\anaconda3\envs\BA_37\lib\site-packages\tensorflow\python\keras\saving\saved_model\save.py”, line 80, in save
save_lib.save(model, filepath, signatures, options)
File “D:\anaconda3\envs\BA_37\lib\site-packages\tensorflow\python\saved_model\save.py”, line 985, in save
options=ckpt_options)
File “D:\anaconda3\envs\BA_37\lib\site-packages\tensorflow\python\training\tracking\util.py”, line 1200, in save
file_prefix_tensor, object_graph_tensor, options)
File “D:\anaconda3\envs\BA_37\lib\site-packages\tensorflow\python\training\tracking\util.py”, line 1145, in _save_cached_when_graph_building
save_op = saver.save(file_prefix, options=options)
File “D:\anaconda3\envs\BA_37\lib\site-packages\tensorflow\python\training\saving\functional_saver.py”, line 295, in save
return save_fn()
File “D:\anaconda3\envs\BA_37\lib\site-packages\tensorflow\python\training\saving\functional_saver.py”, line 269, in save_fn
sharded_saves.append(saver.save(shard_prefix, options))
File “D:\anaconda3\envs\BA_37\lib\site-packages\tensorflow\python\training\saving\functional_saver.py”, line 78, in save
return io_ops.save_v2(file_prefix, tensor_names, tensor_slices, tensors)
File “D:\anaconda3\envs\BA_37\lib\site-packages\tensorflow\python\ops\gen_io_ops.py”, line 1731, in save_v2
ctx=_ctx)
File “D:\anaconda3\envs\BA_37\lib\site-packages\tensorflow\python\ops\gen_io_ops.py”, line 1751, in save_v2_eager_fallback
ctx=ctx, name=name)
File “D:\anaconda3\envs\BA_37\lib\site-packages\tensorflow\python\eager\execute.py”, line 60, in quick_execute
inputs, attrs, num_outputs)
tensorflow.python.framework.errors_impl.NotFoundError: Failed to create a directory: D:\probe\pbt_checkpoint\pbt_test\MLPmodel_39793_00000_0_af_0=0,af_1=2,af_2=1,af_3=0,af_4=2,af_5=2,af_6=2,af_7=1,af_output=3,batchsize=849,num_layers=5,units_0=498,_2021-01-11_17-15-41\tmpulblmjabsave_to_object\checkpoint_2/model\variables\variables_temp_c123b60a74554ae3b4da1f882fa4089b; No such file or directory [Op:SaveV2]
2021-01-11 17:16:49,136 ERROR worker.py:980 – Possible unhandled error from worker: ray::MLPmodel.stop() (pid=27000, ip=172.16.1.32)
File “python\ray_raylet.pyx”, line 463, in ray._raylet.execute_task
File “python\ray_raylet.pyx”, line 415, in ray._raylet.execute_task.function_executor
File “D:\anaconda3\envs\BA_37\lib\site-packages\ray\function_manager.py”, line 556, in actor_method_executor
return method(__ray_actor, *args, **kwargs)
File “D:\anaconda3\envs\BA_37\lib\site-packages\ray\tune\trainable.py”, line 512, in stop
self.cleanup()
File “d:/Probe/PBT/PBT_probe.py”, line 102, in cleanup
saved_path = self.model.save(self.logdir)
File “D:\anaconda3\envs\BA_37\lib\site-packages\tensorflow\python\keras\engine\training.py”, line 1979, in save
signatures, options)
File “D:\anaconda3\envs\BA_37\lib\site-packages\tensorflow\python\keras\saving\save.py”, line 134, in save_model
signatures, options)
File “D:\anaconda3\envs\BA_37\lib\site-packages\tensorflow\python\keras\saving\saved_model\save.py”, line 80, in save
save_lib.save(model, filepath, signatures, options)
File “D:\anaconda3\envs\BA_37\lib\site-packages\tensorflow\python\saved_model\save.py”, line 985, in save
options=ckpt_options)
File “D:\anaconda3\envs\BA_37\lib\site-packages\tensorflow\python\training\tracking\util.py”, line 1200, in save
file_prefix_tensor, object_graph_tensor, options)
File “D:\anaconda3\envs\BA_37\lib\site-packages\tensorflow\python\training\tracking\util.py”, line 1145, in _save_cached_when_graph_building
save_op = saver.save(file_prefix, options=options)
File “D:\anaconda3\envs\BA_37\lib\site-packages\tensorflow\python\training\saving\functional_saver.py”, line 295, in save
return save_fn()
File “D:\anaconda3\envs\BA_37\lib\site-packages\tensorflow\python\training\saving\functional_saver.py”, line 269, in save_fn
sharded_saves.append(saver.save(shard_prefix, options))
File “D:\anaconda3\envs\BA_37\lib\site-packages\tensorflow\python\training\saving\functional_saver.py”, line 78, in save
return io_ops.save_v2(file_prefix, tensor_names, tensor_slices, tensors)
File “D:\anaconda3\envs\BA_37\lib\site-packages\tensorflow\python\ops\gen_io_ops.py”, line 1731, in save_v2
ctx=_ctx)
File “D:\anaconda3\envs\BA_37\lib\site-packages\tensorflow\python\ops\gen_io_ops.py”, line 1751, in save_v2_eager_fallback
ctx=ctx, name=name)
File “D:\anaconda3\envs\BA_37\lib\site-packages\tensorflow\python\eager\execute.py”, line 60, in quick_execute
inputs, attrs, num_outputs)
UnicodeDecodeError: ‘utf-8’ codec can’t decode byte 0xd5 in position 355: invalid continuation byte
And I used this code to save and load checkpoints:
def save_checkpoint(self, checkpoint_dir):
    """Save ``self.model`` under ``checkpoint_dir`` and return the saved path.

    The target path is built by appending ``"/model"`` to ``checkpoint_dir``.

    NOTE(review): the failing paths in the traceback above mix ``/`` and
    ``\\`` and are very long; presumably the Windows path-length limit is
    involved -- TODO confirm.
    """
    file_path = checkpoint_dir + "/model"
    self.model.save(file_path)
    return file_path

def load_checkpoint(self, path):
    """Replace ``self.model`` with the model loaded from ``path``.

    ``path`` is presumably the value returned by ``save_checkpoint`` --
    verify against the caller.
    """
    # Drop the current model before loading the replacement.
    del self.model
    self.model = load_model(path)
Here is my tune.run call:
# PBT scheduler configured on the "training_iteration" attribute with a
# perturbation interval of 2; mutations are drawn from `mutationspace`
# (defined elsewhere, not shown here).
pbt = PopulationBasedTraining(
    time_attr="training_iteration",
    perturbation_interval=2,
    hyperparam_mutations=mutationspace)
# Run 2 samples of MLPmodel under the PBT scheduler, minimizing "msle",
# stopping each trial at training_iteration 4. Results are written below
# D:/probe/pbt_checkpoint/ under the experiment name "pbt_test".
results = tune.run(
    MLPmodel,
    name="pbt_test",
    local_dir=os.path.normpath('D:/probe/pbt_checkpoint/'),
    scheduler=pbt,
    metric="msle",
    mode="min",
    reuse_actors=True,
    resources_per_trial={
        "cpu": 3,
        "gpu": 1
    },
    stop={"training_iteration": 4},
    num_samples=2,
    config=searchspace,
)
Any suggestions as to why this might happen and how to fix it?
Thank you