How can I get map_batches to take advantage of all the GPUs on a single node? No matter
what settings I try, nvidia-smi only ever shows one GPU in use.
# Run batch inference with one actor per GPU (4 actors -> 4 GPUs on the node).
# NOTE(review): with an ActorPoolStrategy, map_batches expects the callable
# *class* itself plus fn_constructor_kwargs — not an already-constructed
# instance like TorchPredictor(model=model). Passing an instance is the usual
# cause of work landing on a single worker/GPU.
predictions = test_ds_tokenized.map_batches(
    TorchPredictor,                              # class; Ray constructs one per actor
    fn_constructor_kwargs={"model": model},      # forwarded to TorchPredictor.__init__
    num_gpus=1,                                  # GPUs reserved *per actor*, not in total
    batch_size=32,
    compute=ray.data.ActorPoolStrategy(size=4),  # pool size = number of GPUs to use
).materialize()
My callable class, following the guide "End-to-end: Offline Batch Inference" (Ray 2.7.0 docs):
class TorchPredictor:
    """Callable class for Ray Data batch inference.

    One instance is constructed per actor. Ray sets ``CUDA_VISIBLE_DEVICES``
    for each actor, so ``.cuda()`` places the model on that actor's
    assigned GPU.
    """

    def __init__(self, model, tokenizer=None):
        """
        Args:
            model: model exposing ``.cuda()``, ``.eval()`` and ``.generate()``
                (e.g. a HuggingFace model); moved to the actor's GPU here.
            tokenizer: optional tokenizer used for decoding. When omitted,
                falls back to the module-level ``tokenizer`` global, which is
                what the original code relied on.
        """
        self.model = model.cuda()
        self.model.eval()
        # Keeping a per-instance tokenizer avoids the hidden dependency on a
        # global; None preserves the original global-lookup behavior.
        self.tokenizer = tokenizer

    def __call__(self, batch):
        """Generate predictions for one batch.

        Args:
            batch: dict with ``input_ids`` and ``attention_mask`` arrays
                (assumed already padded to a common length — TODO confirm
                upstream tokenization pads).

        Returns:
            dict with a single ``y_pred`` key holding decoded strings.
        """
        # Build GPU tensors locally instead of mutating the caller's dict.
        input_ids = torch.as_tensor(batch["input_ids"], dtype=torch.int64, device="cuda")
        attention_mask = torch.as_tensor(batch["attention_mask"], dtype=torch.int64, device="cuda")
        # inference_mode: like no_grad, with additional autograd bookkeeping disabled.
        with torch.inference_mode():
            out = self.model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=750,
                do_sample=False,  # greedy decoding
            )
        tok = self.tokenizer if self.tokenizer is not None else tokenizer
        return {
            "y_pred": tok.batch_decode(out, skip_special_tokens=True),
        }