I am trying out various paradigms to parallelize a k-NN algorithm; among them I tried Ray. Below is my simple code:
import numpy as np
import ray
from tqdm import tqdm
@ray.remote
def _predict(X_train, y_train, x, k):
    """Classify one sample by majority vote among its k nearest training points.

    Runs as a Ray task; X_train/y_train may be ObjectRefs placed via ray.put,
    which Ray resolves to the underlying arrays before the body runs.
    """
    # Euclidean distance from x to every training sample.
    dists = np.linalg.norm(X_train - x, axis=1)
    # Indices of the k closest points (argsort gives a stable tie order).
    nearest = np.argsort(dists)[:k]
    # Labels of those neighbors (assumes integer class labels — bincount
    # below requires non-negative ints).
    neighbor_labels = [y_train[i] for i in nearest]
    # Majority vote: the most frequent label wins.
    return np.bincount(neighbor_labels).argmax()
def predict(X_train, y_train, X_test, k):
    """Predict a label for every row of X_test with parallel k-NN Ray tasks.

    Args:
        X_train: training features (array or Ray ObjectRef to one).
        y_train: training labels (array or Ray ObjectRef to one).
        X_test: iterable of test samples; one Ray task is launched per sample.
        k: number of neighbors for the majority vote.

    Returns:
        List of predicted labels, one per test sample, in input order.

    NOTE(review): launching one tiny task per test point means Ray's per-task
    scheduling/serialization overhead dominates the small k-NN kernel — this
    is the most likely reason the code is much slower than the MPI version.
    Batching many test points per task would amortize that overhead.
    """
    # Bug fix: the original assigned the result to y_pred but never
    # returned it, so callers always received None.
    return ray.get([_predict.remote(X_train, y_train, x, k) for x in X_test])
class KNNClassifier:
    """k-nearest-neighbors classifier whose per-sample predictions run as Ray tasks."""

    def __init__(self, k=3):
        # Number of neighbors consulted in each majority vote.
        self.k = k

    def fit(self, X, y):
        # Put the training data into Ray's object store once, so every
        # remote task receives a shared ObjectRef instead of re-serializing
        # the full arrays per task.
        self.X_train = ray.put(X)
        self.y_train = ray.put(y)

    def predict(self, X):
        # Delegate to the module-level parallel predictor.
        # (Removed the dead commented-out sequential variant.)
        return predict(self.X_train, self.y_train, X, self.k)
This code performs far worse than a very similar MPI solution — can anybody guess why?