Thursday, April 26, 2012

K-Nearest Neighbour Classifier

The Nearest Neighbour Classifier is one of the most straightforward classifiers in the arsenal of machine learning techniques. It performs the classification by identifying the nearest neighbours to a query pattern and using those neighbours to determine the label of the query. The idea behind the algorithm is simple: assign the query pattern to the class that occurs most often among its k nearest neighbours. In this post we'll use the function knn_search(...) that we have seen in the last post to implement a K-Nearest Neighbour Classifier. The implementation of the classifier is as follows:
from numpy import random, argsort, argmax, bincount, int_, array, vstack, round
from pylab import plot, scatter, show

def knn_classifier(x, D, labels, K):
    """ Assign a class to the query vector x by majority vote
        among its K nearest neighbours.
        D - data matrix (each row is a pattern).
        labels - class of each pattern.
        K - number of neighbours to use.
        Returns the winning class label and the neighbour indexes.
    """
    nearest = knn_search(x, D, K)        # indexes of the K closest patterns
    votes = bincount(labels[nearest])    # tally the neighbours' labels
    return argmax(votes), nearest        # most frequent label wins
Let's test the classifier on some random data:
 # generating a random dataset with random labels
data = random.rand(2,150) # random points
labels = int_(round(random.rand(150)*1)) # random labels 0 or 1
x = random.rand(2,1) # random test point

# label assignment using k=5
result,neig_idx = knn_classifier(x,data,labels,5)
print 'Label assignment:', result

# plotting the data and the input pattern
# class 1, red points, class 0 blue points
scatter(data[0,:],data[1,:], c=labels,alpha=0.8)
scatter(x[0],x[1],marker='o',c='g',s=40)
# highlighting the neighbours
plot(data[0,neig_idx],data[1,neig_idx],'o',
  markerfacecolor='None',markersize=15,markeredgewidth=1)
show()
The script will show the following graph:



The query vector is represented with a green point and we can see that 3 out of 5 nearest neighbours are red points (label 1) while the remaining 2 are blue (label 0).
The result of the classification will be printed on the console:
Label assignment: 1
As we expected, the green point has been assigned to the class with red markers.

1 comment:

  1. """Thanks a lot :D I made a small change so that the user can classify several points with one call of knn_classifier(x, D, labels, K)"""

    from numpy import random,argsort,argmax,bincount,int_,array,vstack,round,sqrt,ones
    from pylab import scatter,show,plot

    def knn_search(x, D, K):
        """ Find the K nearest neighbours in D of every query point in x.
            x - query points, shape (nqueries, 1, ndim).
            D - data matrix, one pattern per row, shape (ndata, ndim).
            K - number of neighbours to return (capped at ndata).
            Returns an (nqueries, K) array holding, for each query,
            the indexes of its K closest rows of D.
        """
        ndata = D.shape[0]
        # never ask for more neighbours than there are patterns
        K = K if K < ndata else ndata
        # broadcasting D (ndata, ndim) against x (nqueries, 1, ndim)
        # gives one difference vector per (query, pattern) pair
        diff = D - x
        # euclidean distance of every query to every pattern: (nqueries, ndata)
        sqd = sqrt((diff ** 2).sum(axis=2))
        # sort each row by distance and keep the K closest indexes
        idx = argsort(sqd)
        return idx[:, :K]

    def knn_classifier(x, D, labels, K):
        """ Classify every query point in x by majority vote among
            its K nearest neighbours.
            x - array of query points (one per row, see knn_search).
            D - data matrix (each row is a pattern).
            labels - class of each pattern in D.
            K - number of neighbours to use.
            Returns an array with the winning label of each query
            and the (nqueries, K) array of neighbour indexes.
        """
        neig_idx = knn_search(x, D, K)
        # pad every histogram to the same length (minlength): otherwise a
        # query whose neighbours miss some label would yield a shorter
        # bincount, making the array ragged and the axis argmax fail
        nbins = labels.max() + 1
        counts = array([bincount(labels[row], minlength=nbins)
                        for row in neig_idx])  # one vote count per query
        return argmax(counts, axis=-1), neig_idx

    #Let's test the classifier on some random data:
    # generating a random dataset with random labels
    data = random.rand(150,2)
    labels = int_(round(random.rand(150)*1)) # random labels 0 or 1

    x = array([[[0.4,0.4]],[[0.6,0.8]],[[0.9,0.2]],[[0.2,0.9]]])

    # label assignment using k=5
    result,neig_idx = knn_classifier(x,data,labels,5)
    print 'Label assignment:', result

    # plotting the data and the input pattern
    # class 1, red points, class 0 blue points
    scatter(data[:,0],data[:,1], c=labels,alpha=0.8)
    scatter(x.T[0],x.T[1],marker='o',c='g',s=40)
    # highlighting the neighbours
    plot(data[neig_idx,0],data[neig_idx,1],'o', markerfacecolor='None',markersize=15,markeredgewidth=1)
    show()

    ReplyDelete