Getting an error when using Ray Tune

How severe does this issue affect your experience of using Ray?

  • High: It blocks me to complete my task.

This is my data-processing code (the process method of the DataProcessor class), together with the PyTorch Dataset that uses the DataProcessor:

  def process(self, to_drop = ['dateGMT', 'timeGMT', 'unixTime']):
    Xs, Ys = [], []
    file_list = self._get_file_list()
    # First pass: partially fit the scaler on every file.
    for file in file_list:
      self._partial_fit_scaler(self.folder_path + '/' + file, to_drop)
    # Second pass: transform each file into input/target arrays.
    for file in file_list:
      print(file)
      X, Y = self._process_data(self.folder_path + '/' + file, to_drop)
      Xs.append(X)
      Ys.append(Y)

    return np.concatenate(Xs, axis = 0), np.concatenate(Ys, axis = 0)

class BetterDataset(Dataset):
  def __init__(self, data_folder_path, scaler, dataset = 'train', direction = 'forward',  
               valid_rate = 0.2, test_rate = 0.2, max_distance_interval = 0.05,
               input_window = 15, to_drop = ['dateGMT', 'timeGMT', 'unixTime']):
    processor = DataProcessor(data_folder_path, scaler, dataset, direction, 
                              valid_rate, test_rate, max_distance_interval,
                              input_window)
    self.x, self.y = processor.process(to_drop)
  
  def __len__(self):
    return len(self.x)
  
  def __getitem__(self, idx):
    return torch.FloatTensor(self.x[idx]), \
           torch.FloatTensor(self.y[idx])
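
For reference, in my normal (non-Tune) training script I build the dataset directly with the same values, roughly like this (just a sketch, not my exact code; I'm assuming MinMaxScaler is the scikit-learn one, as in load_data below):

from sklearn.preprocessing import MinMaxScaler

# Sketch of the standalone usage that works fine outside Ray Tune
# (same arguments that load_data passes further down).
scaler = MinMaxScaler()
train_dataset = BetterDataset(
    data_folder_path='data/GET_DATA/preprocessing/txt/ASW',
    scaler=scaler, dataset='train', direction='forward',
    valid_rate=0.2, test_rate=0.2,
    max_distance_interval=0.05, input_window=15,
)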

And the following is my configuration for param_space. I intentionally fixed the values related to data processing so I can check them:

hp = {
  # (50m~1000m)
  'max_distance_interval' : 0.05,
  'input_window' : 15,
  # 2^5~2^9
  'hidden_size' : tune.sample_from(lambda _: 2**np.random.randint(5, 10)),
  'attention_dim' : tune.sample_from(lambda _: 2**np.random.randint(5, 10)),
  # Suggestion From Ray Tune FAQ
  'learning_rate' : tune.loguniform(1e-5, 1e-1),
  'batch_size' : tune.sample_from(lambda _: 2**np.random.randint(5, 10)),
  'num_epoch' : 200,
  'patience' : 10,
  'early_stop' : False,
  'to_drop' : ['dateGMT', 'timeGMT', 'unixTime'], 
              # ['datGMT' 'timeGMT'],
}
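
For context, I pass this dict to the Tuner roughly like this (a trimmed sketch, not my exact call; num_samples here is just a placeholder):

from ray import tune

# Rough sketch of how param_space and TuneConfig are wired up.
tuner = tune.Tuner(
    train_fn,
    param_space=hp,
    tune_config=tune.TuneConfig(
        num_samples=10,            # placeholder value
        max_concurrent_trials=1,   # the setting mentioned at the end of this post
    ),
)
results = tuner.fit()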

And this is the trainable function used for training:

def load_data(config, dataset, direction):
  scaler = MinMaxScaler()
  return BetterDataset(data_folder_path='data/GET_DATA/preprocessing/txt/ASW',
                        scaler = scaler, dataset=dataset, direction=direction,
                        valid_rate=0.2, test_rate=0.2,
                        max_distance_interval=config['max_distance_interval'], 
                        input_window = config['input_window'])
  
...
def train_fn(config):
  device = set_device()
  
  train_dataset = load_data(config, 'train', 'forward')
  valid_dataset = load_data(config, 'valid', 'forward')
    
  num_input_feature = train_dataset.x.shape[-1] # lot, lan, ...
  
  model = LuongFLP(
    input_size = num_input_feature, 
    hidden_size = hp['hidden_size'],
    attention_dim = hp['attention_dim'],
    output_size = 2,
    device = device
  )

  loss_fn = nn.MSELoss()
  optimizer = optim.Adam(model.parameters(), lr = hp['learning_rate'])
  
  train_dataloader = DataLoader(train_dataset, 
                                batch_size = hp['batch_size'],
                                shuffle= False)
  valid_dataloader = DataLoader(valid_dataset, 
                                batch_size = hp['batch_size'],
                                shuffle= False)
  
  loss_graph = []
  val_loss_graph = []
  best_val = 987654321
  counter = 0
  
  model.to(device)
  progress_bar = tqdm(range(config['num_epoch']), desc="Epoch", leave= True)
  for epoch in progress_bar:
    train_loss = train_epoch(model, device, loss_fn, optimizer, train_dataloader)
    val_loss, _ = evaluate(model, device, loss_fn, valid_dataloader, return_pred = False)
  
    loss_graph.append(train_loss)
    val_loss_graph.append(val_loss)
    
    progress_bar.set_postfix(
      {
        'Train Loss': f'{train_loss:.4e}',
        'Validation Loss': f'{val_loss:.4e}'
       }
    )

    if config['early_stop'] == True: 
      if val_loss < best_val:
        best_val = val_loss
        counter = 0
      else:
        counter += 1
      if config['patience'] < counter:
        print(f"Early Stopped on epoch {epoch}")
        return 
      
    train.report({"train loss": train_loss, "valid loss": val_loss})

The following is the error output:

ValueError: need at least one array to concatenate
2024-05-16 00:41:18,041	WARNING experiment_state.py:205 -- Experiment state snapshotting has been triggered multiple times in the last 5.0 seconds. A snapshot is forced if `CheckpointConfig(num_to_keep)` is set, and a trial has checkpointed >= `num_to_keep` times since the last snapshot.
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).
2024-05-16 00:41:18,043	INFO tune.py:1007 -- Wrote the latest version of all result files and experiment state to '/Users/junheejang/projects/Future_Location_Prediction/ray_results/train_fn_2024-05-16_00-41-13' in 0.0049s.
(train_fn pid=35486) mps is available
2024-05-16 00:41:18,046	ERROR tune.py:1035 -- Trials did not complete: [train_fn_90a47_00000]
2024-05-16 00:41:18,046	INFO tune.py:1039 -- Total run time: 2.46 seconds (2.43 seconds for the tuning loop).

In my normal training code, the DataProcessor works perfectly well with exactly the same values (input_window, max_distance_interval, to_drop).

But during tuning I always get the error 'ValueError: need at least one array to concatenate', which suggests that Xs and Ys stay empty, i.e. _get_file_list() finds no files inside the trial.

Why does this happen? It still happens even when I disable parallel tuning with max_concurrent_trials = 1 in TuneConfig.
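
To narrow this down, I'm thinking of adding a quick check like this at the top of train_fn (just a diagnostic sketch; the path literal is the same one hard-coded in load_data):

import os

# Diagnostic only: check where the trial actually runs and whether the
# relative data path still resolves to existing files from there.
data_path = 'data/GET_DATA/preprocessing/txt/ASW'
print('cwd inside trial:', os.getcwd())
print('data path exists:', os.path.isdir(data_path))
if os.path.isdir(data_path):
    print('num files:', len(os.listdir(data_path)))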

Thank you sincerely.