Certainly! Here’s a simple example of manual orchestration: you check the available resources, then decide whether to run both actors (models) in parallel or to serialize them to minimize model loading/offloading. Ray does not apply this pattern automatically; you must implement the logic yourself (see the linked example and the Ray resource scheduling documentation).
import ray
import time
# Initialise Ray before any actor is created; no arguments are passed, so
# Ray's default settings are used.
ray.init()
# num_cpus=1 is requested explicitly. By default a Ray actor needs one CPU
# only to be scheduled and holds zero CPUs while it runs, so a capacity
# check against ray.available_resources()["CPU"] would not reflect how many
# of these actors can really run side by side. With an explicit request each
# live actor reserves one CPU until it is killed.
@ray.remote(num_cpus=1)
class ModelActor:
    """Ray actor that owns one (simulated) model and runs tasks against it."""

    def __init__(self, model_name):
        """Record *model_name* and simulate the cost of loading the model.

        Args:
            model_name: Label of the model; it is echoed in every result
                produced by :meth:`run_task`.
        """
        print(f"Loading model {model_name}")
        self.model_name = model_name
        time.sleep(2)  # Simulate model loading

    def run_task(self, x):
        """Return a string stating that this actor's model processed *x*."""
        return f"{self.model_name} processed {x}"
def run_tasks_serial():
    """Run the M1 tasks, then the M2 tasks, with one actor alive at a time.

    Each model's tasks are batched together: an actor is created, all of its
    tasks are submitted and awaited, and the actor is killed before the next
    one is created.

    Returns:
        list: The results for inputs 0-4 (model "M1") followed by the
        results for inputs 5-9 (model "M2").
    """
    batches = (
        ("M1", range(5)),      # all A tasks
        ("M2", range(5, 10)),  # all B tasks
    )
    results = []
    for model_name, inputs in batches:
        actor = ModelActor.remote(model_name)
        try:
            results.extend(
                ray.get([actor.run_task.remote(i) for i in inputs])
            )
        finally:
            # Kill the actor even when ray.get() raises; otherwise a failed
            # task would leave the actor (and its model) alive.
            ray.kill(actor)
    return results
def run_tasks_parallel():
    """Run the M1 and M2 tasks with both actors alive at the same time.

    Both actors are created first, then every task is submitted before any
    result is awaited, so the two actors can work concurrently.

    Returns:
        list: The results for inputs 0-4 (model "M1") followed by the
        results for inputs 5-9 (model "M2").
    """
    actors = []
    try:
        for model_name in ("M1", "M2"):
            actors.append(ModelActor.remote(model_name))
        actor_a, actor_b = actors

        pending = [actor_a.run_task.remote(i) for i in range(5)]
        pending += [actor_b.run_task.remote(i) for i in range(5, 10)]
        return ray.get(pending)
    finally:
        # Kill whichever actors were created, including on failure, so an
        # error in creation, submission or ray.get() does not leak them.
        for actor in actors:
            ray.kill(actor)
# Example driver: inspect the currently available resources and pick the
# execution strategy from the number of free CPUs.
resources = ray.available_resources()
has_two_cpus = resources.get("CPU", 0) >= 2

if not has_two_cpus:
    # Fewer than two CPUs reported: run one model after the other.
    print("Running serially")
    results = run_tasks_serial()
else:
    # At least two CPUs reported: run both models side by side.
    print("Running in parallel")
    results = run_tasks_parallel()

print(results)
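This code checks the available CPUs and chooses whether to run both actors in parallel or to run them one after another. In the serial case each model's tasks are batched together, so every model is loaded only once and only one model is loaded at a time when resources are limited.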
Sources:
Hint: Mention @RunLLM in the post for followups.