Similar to the recent “redis connection resets” thread, I’ve been unable to pass my large dataset to the models. I get the following traceback:
Traceback (most recent call last):
File "/home/michael/anaconda3/envs/pytorch-cuda-11-tune-nightly/lib/python3.8/site-packages/redis/connection.py", line 700, in send_packed_command
sendall(self._sock, item)
File "/home/michael/anaconda3/envs/pytorch-cuda-11-tune-nightly/lib/python3.8/site-packages/redis/_compat.py", line 8, in sendall
return sock.sendall(*args, **kwargs)
BrokenPipeError: [Errno 32] Broken pipe
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "hparam_search.py", line 151, in <module>
train_infomax_asha(config, Dataset, ASHA)
File "hparam_search.py", line 79, in train_infomax_asha
analysis = tune.run(
File "/home/michael/anaconda3/envs/pytorch-cuda-11-tune-nightly/lib/python3.8/site-packages/ray/tune/tune.py", line 299, in run
experiments[i] = Experiment(
File "/home/michael/anaconda3/envs/pytorch-cuda-11-tune-nightly/lib/python3.8/site-packages/ray/tune/experiment.py", line 138, in __init__
self._run_identifier = Experiment.register_if_needed(run)
File "/home/michael/anaconda3/envs/pytorch-cuda-11-tune-nightly/lib/python3.8/site-packages/ray/tune/experiment.py", line 276, in register_if_needed
register_trainable(name, run_object)
File "/home/michael/anaconda3/envs/pytorch-cuda-11-tune-nightly/lib/python3.8/site-packages/ray/tune/registry.py", line 71, in register_trainable
_global_registry.register(TRAINABLE_CLASS, name, trainable)
File "/home/michael/anaconda3/envs/pytorch-cuda-11-tune-nightly/lib/python3.8/site-packages/ray/tune/registry.py", line 124, in register
self.flush_values()
File "/home/michael/anaconda3/envs/pytorch-cuda-11-tune-nightly/lib/python3.8/site-packages/ray/tune/registry.py", line 146, in flush_values
_internal_kv_put(_make_key(category, key), value, overwrite=True)
File "/home/michael/anaconda3/envs/pytorch-cuda-11-tune-nightly/lib/python3.8/site-packages/ray/experimental/internal_kv.py", line 27, in _internal_kv_put
updated = worker.redis_client.hset(key, "value", value)
File "/home/michael/anaconda3/envs/pytorch-cuda-11-tune-nightly/lib/python3.8/site-packages/redis/client.py", line 3004, in hset
return self.execute_command('HSET', name, key, value)
File "/home/michael/anaconda3/envs/pytorch-cuda-11-tune-nightly/lib/python3.8/site-packages/redis/client.py", line 877, in execute_command
conn.send_command(*args)
File "/home/michael/anaconda3/envs/pytorch-cuda-11-tune-nightly/lib/python3.8/site-packages/redis/connection.py", line 720, in send_command
self.send_packed_command(self.pack_command(*args),
File "/home/michael/anaconda3/envs/pytorch-cuda-11-tune-nightly/lib/python3.8/site-packages/redis/connection.py", line 712, in send_packed_command
raise ConnectionError("Error %s while writing to socket. %s." %
redis.exceptions.ConnectionError: Error 32 while writing to socket. Broken pipe.
My training function is as follows:
def train_model(config, data=None, checkpoint_dir=None):
    """Ray Tune trainable: fit one Attention_Infomax model for a trial.

    Args:
        config: Hyperparameter dict sampled by the search algorithm;
            expected to contain a "Trainer kwargs" sub-dict that is
            splatted into ``pl.Trainer``.
        data: Dataset injected via ``tune.with_parameters``. It travels
            through the Ray object store instead of being pickled into
            the trainable itself.
        checkpoint_dir: Supplied by Tune when restoring from a
            checkpoint (currently unused).
    """
    # Report the PL metric "avg_val_loss" to Tune as "loss" after each
    # validation epoch.
    callback = TuneReportCallback({"loss": "avg_val_loss"}, on="validation_end")
    print(config)
    trainer = pl.Trainer(
        gpus=1,
        callbacks=[basic_callbacks(), callback],
        **config["Trainer kwargs"],
        auto_select_gpus=True,
        precision=16,
    )
    # BUG FIX: use the `data` argument provided by tune.with_parameters()
    # instead of closing over the module-level `Dataset`. Capturing the
    # global makes Ray pickle the entire dataset into the trainable's
    # registration payload and push it through redis, which is what blows
    # up with "Error 32 ... Broken pipe" in the traceback above.
    model = Attention_Infomax(config, data)
    trainer.fit(model)
My run function is:
# Launch the search. `tune.with_parameters` wraps the trainable so the
# dataset is shipped through the Ray object store rather than pickled
# into the registered function.
analysis = tune.run(
    tune.with_parameters(train_model, data=Dataset),
    name="nevergrad",
    # Optimization target: minimize the "loss" metric reported by the
    # TuneReportCallback inside the trainable.
    metric="loss",
    mode="min",
    config=config,
    num_samples=10,
    search_alg=search_algorithm,
    scheduler=scheduler,
    resources_per_trial=resources_per_trial,
    progress_reporter=reporter,
    # Fail fast per trial, but let the overall run finish even if some
    # trials error out.
    max_failures=0,
    raise_on_failed_trial=False,
)