Thursday, April 26, 2012

K-Nearest Neighbour Classifier

The Nearest Neighbour Classifier is one of the most straightforward classifiers in the arsenal of machine learning techniques. It performs the classification by identifying the nearest neighbours to a query pattern and using those neighbours to determine the label of the query. The idea behind the algorithm is simple: assign the query pattern to the class which occurs the most among its k nearest neighbours. In this post we'll use the function knn_search(...) that we have seen in the last post to implement a K-Nearest Neighbour Classifier. The implementation of the classifier is as follows:
from numpy import random,argsort,argmax,bincount,int_,array,vstack,round
from pylab import scatter,show,plot

def knn_classifier(x, D, labels, K):
    """Assign x the majority label among its K nearest neighbours in D.

    D - data matrix (each row is a pattern).
    labels - class of each pattern.
    K - number of neighbours to use.
    Returns (predicted class label, indexes of the neighbours).
    """
    nearest = knn_search(x, D, K)
    # Tally how often each class occurs among the neighbours ...
    vote_counts = bincount(labels[nearest])
    # ... and let the most frequent class win the vote.
    winner = argmax(vote_counts)
    return winner, nearest
Let's test the classifier on some random data:
# Demo: classify one random query point against 150 random 2-D points.
# generating a random dataset with random labels
data = random.rand(2,150) # random points, stored one per column — TODO confirm knn_search expects this layout
labels = int_(round(random.rand(150)*1)) # random labels 0 or 1
x = random.rand(2,1) # random test point (column vector)

# label assignment using k=5
# NOTE(review): relies on knn_search from the previous post being in scope.
result,neig_idx = knn_classifier(x,data,labels,5)
print 'Label assignment:', result

# plotting the data and the input pattern
# class 1, red points, class 0 blue points
scatter(data[0,:],data[1,:], c=labels,alpha=0.8)
scatter(x[0],x[1],marker='o',c='g',s=40)
# highlighting the neighbours
# NOTE(review): plot is not imported above — add it to the pylab import line.
plot(data[0,neig_idx],data[1,neig_idx],'o',
  markerfacecolor='None',markersize=15,markeredgewidth=1)
show()
The script will show the following graph:



The query vector is represented with a green point and we can see that 3 out of the 5 nearest neighbors are red points (label 1) while the remaining 2 are blue (label 0).
The result of the classification will be printed on the console:
Label assignment: 1
As we expected, the green point has been assigned to the class with red markers.

1 comment:

  1. """Thanks a lot :D I made a small change so that the user can classify several points with one call of knn_classifier(x, D, labels, K)"""

    from numpy import random,argsort,argmax,bincount,int_,array,vstack,round,sqrt,ones
    from pylab import scatter,show,plot

    def knn_search(x, D, K):
        """Find the K nearest rows of D for every query point in x.

        x - queries, one per entry along the first axis (each a (1, dim) row).
        D - data matrix, one pattern per row.
        K - number of neighbours to return (clamped to the dataset size).
        Returns an (n_queries, K) array of row indexes into D.
        """
        ndata = D.shape[0]
        if K > ndata:
            K = ndata  # never ask for more neighbours than there are points
        # Broadcasting D (ndata, dim) against x (n_queries, 1, dim) gives all
        # pairwise differences at once, shape (n_queries, ndata, dim).
        diff = D - x[:, :ndata]
        dists = sqrt((diff ** 2).sum(axis=2))  # euclidean distance per pair
        # Sort each query's distances and keep the K closest indexes.
        order = argsort(dists)
        return order[:, :K]

    def knn_classifier(x, D, labels, K):
        """Classify each query point in x by majority vote of its K nearest
        neighbours in D.

        x - queries, one per entry along the first axis.
        D - data matrix (each row is a pattern).
        labels - class of each pattern (non-negative integers).
        K - number of neighbours to use.
        Returns (array of predicted labels, (n_queries, K) neighbour indexes).
        """
        neig_idx = knn_search(x, D, K)
        # bincount's result length depends on the largest label seen; pad every
        # row to the same length with minlength, otherwise queries whose
        # neighbours all share one class produce ragged vote vectors and
        # array(counts) fails (or misbehaves) instead of forming a 2-D array.
        n_classes = int(labels.max()) + 1
        counts = [bincount(labels[row], minlength=n_classes)
                  for row in neig_idx]  # one vote tally per query
        return argmax(array(counts), axis=-1), neig_idx

    # Let's test the multi-query classifier on some random data.
    # generating a random dataset with random labels
    data = random.rand(150,2) # 150 random points, one per row
    labels = int_(round(random.rand(150)*1)) # random labels 0 or 1

    # four query points, each kept as a (1,2) row so knn_search can
    # broadcast them against the whole dataset at once
    x = array([[[0.4,0.4]],[[0.6,0.8]],[[0.9,0.2]],[[0.2,0.9]]])

    # label assignment using k=5
    result,neig_idx = knn_classifier(x,data,labels,5)
    print 'Label assignment:', result

    # plotting the data and the input pattern
    # class 1, red points, class 0 blue points
    scatter(data[:,0],data[:,1], c=labels,alpha=0.8)
    scatter(x.T[0],x.T[1],marker='o',c='g',s=40)
    # highlighting the neighbours (hollow circles around each one)
    plot(data[neig_idx,0],data[neig_idx,1],'o', markerfacecolor='None',markersize=15,markeredgewidth=1)
    show()

    ReplyDelete

Note: Only a member of this blog may post a comment.