Hi, thanks for responding.
I believe so; the policy server wraps a neural network, moves it to the GPU, and then another worker calls inference on its data remotely.
At the moment I'm really only working on a skeleton of the full setup, so I can post the exact code:
```python
import threading
from pathlib import Path
from typing import Dict, Tuple, Union

import numpy as np
import ray
import torch as t
from torch import Tensor


# net is my model class, defined elsewhere in the project
@ray.remote(num_gpus=0.25)
class Policy_Server:
    def __init__(self, device):
        self.device = device
        self.lock = threading.Lock()  # serializes parameter updates against inference
        self.model = net().to(self.device)
        self.model.eval()

    def update_model_parameters(self, state_dict: Dict[str, Tensor]) -> str:
        with self.lock:
            self.model.load_state_dict(state_dict=state_dict)
        return "Model Parameters updated..."

    async def inference(
        self, observation: Union[Tensor, np.ndarray]
    ) -> Tuple[np.ndarray, np.ndarray]:
        # print(self, observation.shape)
        batch = t.tensor(observation).float().to(self.device)
        with t.no_grad():
            p, v = self.model(batch)
        return p.detach().cpu().numpy(), v.detach().cpu().numpy().flatten()

    def model_init(self, path: Path = Path("./Checkpoints/best_model.pth")):
        self.update_model_parameters(t.load(path, weights_only=True))
```
and the self-play worker:
```python
import asyncio

import numpy as np
import ray


@ray.remote
class self_play_worker:
    def __init__(self, policy_server, buffer, num_boards, num_reads, index):
        self.policy_server = policy_server
        self.buffer = buffer
        self.num_boards = num_boards
        self.num_reads = num_reads
        self.running = True
        self.index = index

    async def self_play(self):
        """Below is dummy code."""
        while self.running:
            data = []
            range_of_index = np.random.randint(5, 20)
            for _ in range(range_of_index):
                # random stand-in for a real observation batch
                s = np.random.randint(0, 2, size=(111, 8, 8))
                ref = self.policy_server.inference.remote(s)
                fut: asyncio.Future = asyncio.wrap_future(ref.future())
                p, v = await fut
                data.append(
                    np.hstack(
                        (s.flatten(), np.argmax(p, axis=1).flatten(), v.flatten())
                    )
                )
            result = np.vstack(data)
            await self.buffer.add.remote(result, self.index)
            # sleep without blocking the actor's event loop
            await asyncio.sleep(np.random.randint(5, 20))
            # return False

    def stop_play(self):
        self.running = False

    def start_play(self):
        self.running = True
```
The only thing I can think of is the future wrap in self_play_worker, but I don't see why that would matter.
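For what it's worth, my understanding is that an ObjectRef can also be awaited directly inside an async actor method, so the wrap should just be a convenience. Roughly what I mean, using the same names as above (a sketch, not something I've verified changes the behaviour):

```python
# alternative to the asyncio.wrap_future(...) line inside self_play:
ref = self.policy_server.inference.remote(s)
p, v = await ref  # Ray ObjectRefs are awaitable inside an asyncio actor
```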
device is defined in main and is passed to both the Policy_Server and the Trainer. The Trainer has GPU memory allocated, but the Policy_Server doesn't, it seems.
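For completeness, the wiring in main looks roughly like this (a minimal sketch; Trainer, ReplayBuffer and the worker arguments are placeholders standing in for my real classes and values):

```python
import ray
import torch as t

ray.init()

# device is created once in the driver and handed to both actors
device = t.device("cuda" if t.cuda.is_available() else "cpu")

policy_server = Policy_Server.remote(device)
trainer = Trainer.remote(device)      # training actor, not shown above
buffer = ReplayBuffer.remote()        # shared buffer actor, name is a placeholder

workers = [
    self_play_worker.remote(policy_server, buffer, num_boards=8, num_reads=100, index=i)
    for i in range(4)
]

ray.get(policy_server.model_init.remote())
for w in workers:
    w.self_play.remote()
```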