RayTaskError(OutOfMemoryError) when using an LLM

Hi, I'm new to Ray and I'm applying it to text-generation tasks. When I execute my code, which you can find below, on Google Colab I get the following error:

RayTaskError(OutOfMemoryError): ray::generate_text_remote() (pid=1397404, ip=192.168.1.2)

The error is raised at the line `result = ray.get(result_id)`.

# Start (or reattach to) Ray. ignore_reinit_error prevents a crash when
# ray.init() runs twice, e.g. when a notebook cell is re-executed.
ray.init(ignore_reinit_error=True)

# LOAD LLM
# Map the entire model onto GPU 0 (the empty-string key means "whole model").
device_map = {"": 0}
# Load the fine-tuned model; load_in_8bit quantizes weights to reduce GPU
# memory use, and low_cpu_mem_usage streams weights to avoid a full CPU copy.
model_name  = "ndenico/llama-2-7b-all_beauty"
base_model = AutoModelForCausalLM.from_pretrained(
model_name,
low_cpu_mem_usage=True,
return_dict=True,
load_in_8bit=True,
#torch_dtype=torch.float16,
device_map=device_map,
)
# Load tokenizer; trust_remote_code permits custom tokenizer code from the
# hub repository. Pad with EOS since LLaMA-style tokenizers define no pad token.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"


@ray.remote(num_gpus=1, max_calls=1)
def generate_text_remote(prompt, summarizer_obj):
    """Run the text-generation pipeline on one prompt as a Ray GPU task.

    max_calls=1 makes Ray start a fresh worker process per invocation.
    NOTE(review): the pipeline object is shipped to that fresh worker each
    call, so the model is materialized again on the worker — presumably
    this duplication is what triggers the OutOfMemoryError; confirm by
    loading the model inside the task (or in an actor) instead.
    """
    return summarizer_obj(prompt)

def summarize_llm(review_dict):
    """Summarize every review in review_dict with the LLM via Ray tasks.

    Parameters
    ----------
    review_dict : dict
        Maps a key (e.g. a product id) to the review text to summarize.

    Returns
    -------
    dict
        The same keys mapped to the generated summary strings.

    NOTE(review): ray.put() serializes the GPU-resident pipeline into the
    object store, and each task (max_calls=1 → fresh worker per call)
    deserializes a second copy of the model — the likely source of the
    OutOfMemoryError. Consider loading the model inside the remote
    function or hosting it in a long-lived Ray actor instead.
    """
    summarize_review = {}

    # Put the pipeline in the object store once, outside the loop, so the
    # same object reference is shared by every task submission.
    summarizer = ray.put(pipeline(task="text-generation", model=base_model, tokenizer=tokenizer, max_length=max_length_summary,
                       penalty_alpha=0.6, top_k=4, max_new_tokens=max_length_summary))

    for key in review_dict.keys():
        text = review_dict[key]
        prompt = f"<s>[INST] Find the consensus information of the following reviews. This is the most common ideas among the reviews. Summarize it in less than {max_length_summary} words. The reviews are the following ones: {text} [/INST]"
        result_id = generate_text_remote.remote(prompt, summarizer)
        result = ray.get(result_id)
        summarize_review[key] = result[0]['generated_text']

    # BUG FIX: the original function built the dict but never returned it,
    # so callers received None.
    return summarize_review


# Summarize the merged review dictionary with the LLM.
real_dict_summarized_llm = summarize_llm(real_dict_merged)

Have I implemented Ray incorrectly? Or what should I do to avoid the memory error?

Thanks!