How severely does this issue affect your experience of using Ray?
- High: It blocks me from completing my task.
This is my data-processing code in the DataProcessor class, together with the PyTorch Dataset that uses it.
def process(self, to_drop=['dateGMT', 'timeGMT', 'unixTime']):
    Xs, Ys = [], []
    file_list = self._get_file_list()
    # first pass: fit the scaler incrementally over every file
    for file in file_list:
        self._partial_fit_scaler(self.folder_path + '/' + file, to_drop)
    # second pass: transform each file and collect the arrays
    for file in file_list:
        print(file)
        X, Y = self._process_data(self.folder_path + '/' + file, to_drop)
        Xs.append(X), Ys.append(Y)
    return np.concatenate(Xs, axis=0), np.concatenate(Ys, axis=0)
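For context on the error reported below: if `_get_file_list()` comes back empty (its body is not shown here, so this is an assumption, e.g. a directory listing of a folder that does not exist relative to the current working directory), both loops are skipped, `Xs` stays empty, and `np.concatenate` raises exactly the error in the traceback. A minimal sketch:

```python
import numpy as np

# Hypothetical illustration: concatenating an empty list of arrays
# reproduces the error message shown further down.
Xs = []  # what Xs ends up as when file_list is empty
np.concatenate(Xs, axis=0)
# ValueError: need at least one array to concatenate
```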
class BetterDataset(Dataset):
    def __init__(self, data_folder_path, scaler, dataset='train', direction='forward',
                 valid_rate=0.2, test_rate=0.2, max_distance_interval=0.05,
                 input_window=15, to_drop=['dateGMT', 'timeGMT', 'unixTime']):
        processor = DataProcessor(data_folder_path, scaler, dataset, direction,
                                  valid_rate, test_rate, max_distance_interval,
                                  input_window)
        self.x, self.y = processor.process(to_drop)

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        return torch.FloatTensor(self.x[idx]), \
               torch.FloatTensor(self.y[idx])
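For reference, this is roughly how the dataset is built in the plain (non-Tune) training script that works; the path and arguments are copied from `load_data` and `hp` below, so treat it as a sketch rather than the exact original script:

```python
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import DataLoader

# Sketch of the plain-training usage (arguments mirror load_data / hp below).
scaler = MinMaxScaler()
train_dataset = BetterDataset(data_folder_path='data/GET_DATA/preprocessing/txt/ASW',
                              scaler=scaler, dataset='train', direction='forward',
                              valid_rate=0.2, test_rate=0.2,
                              max_distance_interval=0.05, input_window=15)
print(len(train_dataset), train_dataset.x.shape)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=False)
```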
The following is my configuration for param_space; I intentionally fixed the data-processing values so I could check them.
hp = {
    # (50m ~ 1000m)
    'max_distance_interval': 0.05,
    'input_window': 15,
    # 2^5 ~ 2^9
    'hidden_size': tune.sample_from(lambda _: 2**np.random.randint(5, 10)),
    'attention_dim': tune.sample_from(lambda _: 2**np.random.randint(5, 10)),
    # suggestion from the Ray Tune FAQ
    'learning_rate': tune.loguniform(1e-5, 1e-1),
    'batch_size': tune.sample_from(lambda _: 2**np.random.randint(5, 10)),
    'num_epoch': 200,
    'patience': 10,
    'early_stop': False,
    'to_drop': ['dateGMT', 'timeGMT', 'unixTime'],
    # ['datGMT' 'timeGMT'],
}
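The `tune.sample_from` entries each draw a power of two between 2^5 = 32 and 2^9 = 512. Written with `tune.choice`, the same search space would look like this (just an equivalent spelling for clarity, not what I actually run):

```python
from ray import tune

# Equivalent, more explicit form of the power-of-two samplers above.
hp_alt = {
    'hidden_size': tune.choice([2**i for i in range(5, 10)]),    # 32 ... 512
    'attention_dim': tune.choice([2**i for i in range(5, 10)]),
    'batch_size': tune.choice([2**i for i in range(5, 10)]),
    'learning_rate': tune.loguniform(1e-5, 1e-1),
}
```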
This is the trainable function used for training:
def load_data(config, dataset, direction):
    scaler = MinMaxScaler()
    return BetterDataset(data_folder_path='data/GET_DATA/preprocessing/txt/ASW',
                         scaler=scaler, dataset=dataset, direction=direction,
                         valid_rate=0.2, test_rate=0.2,
                         max_distance_interval=config['max_distance_interval'],
                         input_window=config['input_window'])
...
def train_fn(config):
    device = set_device()
    train_dataset = load_data(config, 'train', 'forward')
    valid_dataset = load_data(config, 'valid', 'forward')
    num_input_feature = train_dataset.x.shape[-1]  # lat, lon, ...

    model = LuongFLP(
        input_size=num_input_feature,
        hidden_size=hp['hidden_size'],
        attention_dim=hp['attention_dim'],
        output_size=2,
        device=device
    )
    loss_fn = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=hp['learning_rate'])

    train_dataloader = DataLoader(train_dataset,
                                  batch_size=hp['batch_size'],
                                  shuffle=False)
    valid_dataloader = DataLoader(valid_dataset,
                                  batch_size=hp['batch_size'],
                                  shuffle=False)

    loss_graph = []
    val_loss_graph = []
    best_val = 987654321
    counter = 0

    model.to(device)
    progress_bar = tqdm(range(config['num_epoch']), desc="Epoch", leave=True)
    for epoch in progress_bar:
        train_loss = train_epoch(model, device, loss_fn, optimizer, train_dataloader)
        val_loss, _ = evaluate(model, device, loss_fn, valid_dataloader, return_pred=False)
        loss_graph.append(train_loss)
        val_loss_graph.append(val_loss)
        progress_bar.set_postfix(
            {
                'Train Loss': f'{train_loss:.4e}',
                'Validation Loss': f'{val_loss:.4e}'
            }
        )
        if config['early_stop']:
            if val_loss < best_val:
                best_val = val_loss
                counter = 0
            else:
                counter += 1
                if config['patience'] < counter:
                    print(f"Early Stopped on epoch {epoch}")
                    return
        train.report({"train loss": train_loss, "valid loss": val_loss})
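For completeness, the tuner is launched roughly like this; I have not pasted the exact call, so the metric name and num_samples below are placeholders:

```python
from ray import tune

# Reconstructed Tuner setup (exact call not shown above; num_samples and the
# metric/mode are assumptions).
tuner = tune.Tuner(
    train_fn,
    param_space=hp,
    tune_config=tune.TuneConfig(
        metric="valid loss",
        mode="min",
        num_samples=10,            # placeholder
        max_concurrent_trials=1,   # as mentioned below, this does not help
    ),
)
results = tuner.fit()
```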
The following is the error output:
ValueError: need at least one array to concatenate
2024-05-16 00:41:18,041 WARNING experiment_state.py:205 -- Experiment state snapshotting has been triggered multiple times in the last 5.0 seconds. A snapshot is forced if `CheckpointConfig(num_to_keep)` is set, and a trial has checkpointed >= `num_to_keep` times since the last snapshot.
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).
2024-05-16 00:41:18,043 INFO tune.py:1007 -- Wrote the latest version of all result files and experiment state to '/Users/junheejang/projects/Future_Location_Prediction/ray_results/train_fn_2024-05-16_00-41-13' in 0.0049s.
(train_fn pid=35486) mps is available
2024-05-16 00:41:18,046 ERROR tune.py:1035 -- Trials did not complete: [train_fn_90a47_00000]
2024-05-16 00:41:18,046 INFO tune.py:1039 -- Total run time: 2.46 seconds (2.43 seconds for the tuning loop).
In my normal training code, with the same values (input_window, max_distance_interval, to_drop), the DataProcessor works perfectly well.
But in the tuning process I always get the error 'ValueError: need at least one array to concatenate'.
Why does this happen? It still happens even when I disable parallel tuning with max_concurrent_trials = 1 in TuneConfig.
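One check I can still add (a debugging sketch, based on the assumption that the relative data path might resolve differently inside a trial, since Ray Tune trials may run with their own working directory) is to verify the folder at the top of train_fn:

```python
import os

# Debugging sketch for the top of train_fn: check whether the relative data
# path actually resolves from inside the trial process.
data_path = 'data/GET_DATA/preprocessing/txt/ASW'
print("cwd inside trial:", os.getcwd())
print("data folder found:", os.path.isdir(data_path))
if os.path.isdir(data_path):
    print("num files:", len(os.listdir(data_path)))
```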
Thank you sincerely.