from numpy import mean, cov, cumsum, dot, linalg, size, flipud, argsort

def princomp(A, numpc=0):
    # computing eigenvalues and eigenvectors of the covariance matrix
    M = (A - mean(A.T, axis=1)).T   # subtract the mean (along columns)
    [latent, coeff] = linalg.eig(cov(M))
    p = size(coeff, axis=1)
    idx = argsort(latent)  # sorting the eigenvalues
    idx = idx[::-1]        # in descending order
    # sorting eigenvectors according to the sorted eigenvalues
    coeff = coeff[:, idx]
    latent = latent[idx]   # sorting eigenvalues
    if numpc < p and numpc >= 0:
        coeff = coeff[:, range(numpc)]  # cutting some PCs if needed
    score = dot(coeff.T, M)  # projection of the data in the new space
    return coeff, score, latent

The following code uses the new version of princomp to compute the PCA of a matrix that represents a gray-scale image. The PCA is computed ten times with an increasing number of principal components. The script shows the images reconstructed using less than 50 principal components (out of 200).
from pylab import imread, subplot, imshow, title, gray, figure, show, NullLocator

A = imread('shakira.jpg')   # load an image
A = mean(A, 2)              # to get a 2-D array
full_pc = size(A, axis=1)   # number of all the principal components
i = 1
dist = []
for numpc in range(0, full_pc+10, 10):  # 0 10 20 ... full_pc
    coeff, score, latent = princomp(A, numpc)
    Ar = dot(coeff, score).T + mean(A, axis=0)  # image reconstruction
    # difference in Frobenius norm
    dist.append(linalg.norm(A - Ar, 'fro'))
    # showing the pics reconstructed with less than 50 PCs
    if numpc <= 50:
        ax = subplot(2, 3, i, frame_on=False)
        ax.xaxis.set_major_locator(NullLocator())  # remove ticks
        ax.yaxis.set_major_locator(NullLocator())
        i += 1
        imshow(flipud(Ar))
        title('PCs # ' + str(numpc))
        gray()

figure()
imshow(flipud(A))
title('numpc FULL')
gray()
show()

The resulting images:
We can see that 40 principal components are enough to obtain a good approximation of the original image.
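To put a number on that visual impression, the relative reconstruction error in Frobenius norm can be computed for a single choice of components. A minimal sketch reusing princomp and the image matrix A from the script above (the choice of 40 components simply mirrors the observation):

coeff, score, latent = princomp(A, 40)          # keep only 40 principal components
Ar = dot(coeff, score).T + mean(A, axis=0)      # reconstruction, as in the loop above
rel_err = linalg.norm(A - Ar, 'fro') / linalg.norm(A, 'fro')
print('relative Frobenius error with 40 PCs:', rel_err)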
At the end of this experiment, we can plot the distance of the reconstructed images from the original image in Frobenius norm (red curve) and the cumulative sum of the eigenvalues (blue curve). Recall that the cumulative sum of the eigenvalues, normalized by their total, shows the fraction of the total variance accounted for by the corresponding eigenvectors. On the x axis there is the number of eigenvalues/eigenvectors used.
from pylab import plot, axis
from numpy import array

figure()
perc = cumsum(latent) / sum(latent)
dist = array(dist) / max(dist)  # dist is a Python list: convert it before normalizing
plot(range(len(perc)), perc, 'b', range(0, full_pc+10, 10), dist, 'r')
axis([0, full_pc, 0, 1.1])
show()
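The same perc array can also be used to pick the smallest number of components that retains a given fraction of the variance. A minimal sketch, where the 0.95 threshold is only an illustrative choice:

from numpy import argmax

# perc is the cumulative fraction of variance computed above (assumed real-valued)
k = argmax(perc >= 0.95) + 1   # first position where the cumulative fraction reaches 95%
print('components needed for 95% of the variance:', k)

As a side note, since the covariance matrix is symmetric, linalg.eigh could also be used inside princomp: it returns real eigenvalues already sorted in ascending order.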