import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.metrics import pairwise_distances
from sklearn.cluster import KMeans

# Uniform random points: a reference dataset with no cluster structure
reference = np.random.rand(100, 2)

plt.figure(figsize=(12, 3))
for k in range(1, 6):
    kmeans = KMeans(n_clusters=k)
    a = kmeans.fit_predict(reference)
    plt.subplot(1, 5, k)
    plt.scatter(reference[:, 0], reference[:, 1], c=a)
    plt.xlabel('k=' + str(k))
plt.tight_layout()
plt.show()
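As a quick numeric check (a minimal sketch, not part of the original walkthrough, using the inertia_ attribute that scikit-learn's KMeans exposes after fitting), we can confirm that the within-cluster sum of squares simply decreases smoothly as K grows on this structureless data, so inertia alone cannot suggest a value of K here:

for k in range(1, 6):
    # inertia_ is the within-cluster sum of squared distances to the centroids
    km = KMeans(n_clusters=k).fit(reference)
    print(k, round(km.inertia_, 2))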
From the figure above we can see that the algorithm evenly splits the points into K clusters even though there is no separation between them. Let's now do the same on a target dataset with 3 natural clusters:
# Target dataset: 3 Gaussian blobs (the true labels are discarded)
X = make_blobs(n_samples=100, n_features=2, centers=3, cluster_std=.8)[0]

plt.figure(figsize=(12, 3))
for k in range(1, 6):
    kmeans = KMeans(n_clusters=k)
    a = kmeans.fit_predict(X)
    plt.subplot(1, 5, k)
    plt.scatter(X[:, 0], X[:, 1], c=a)
    plt.xlabel('k=' + str(k))
plt.tight_layout()
plt.show()
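To put a number on the visual impression (a sketch that is not in the original post: here we regenerate the blobs keeping the true labels, under the hypothetical names X_chk and y_chk), we can score each K against the ground truth with scikit-learn's adjusted Rand index; it should be highest at K=3:

from sklearn.metrics import adjusted_rand_score

X_chk, y_chk = make_blobs(n_samples=100, n_features=2, centers=3, cluster_std=.8)
for k in range(1, 6):
    labels = KMeans(n_clusters=k).fit_predict(X_chk)
    # 1.0 means a perfect match with the true blob labels
    print(k, round(adjusted_rand_score(y_chk, labels), 2))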
Looking at the K-Means figure, we note that with K=2 the algorithm correctly isolates one of the clusters while grouping the other two together; with K=3 it correctly identifies the natural clusters; with K=4 and K=5, however, some of the natural clusters are split in two. If we plot the inertia in both cases we'll see something interesting:
def compute_inertia(a, X):
    # Dispersion of a clustering: mean intra-cluster pairwise distance,
    # averaged over the clusters
    W = [np.mean(pairwise_distances(X[a == c, :])) for c in np.unique(a)]
    return np.mean(W)

def compute_gap(clustering, data, k_max=5, n_references=5):
    if len(data.shape) == 1:
        data = data.reshape(-1, 1)
    reference_inertia = []
    for k in range(1, k_max + 1):
        local_inertia = []
        for _ in range(n_references):
            # Draw a fresh uniform reference sample for each run, so the
            # average is taken over n_references reference datasets
            reference = np.random.rand(*data.shape)
            clustering.n_clusters = k
            assignments = clustering.fit_predict(reference)
            local_inertia.append(compute_inertia(assignments, reference))
        reference_inertia.append(np.mean(local_inertia))

    ondata_inertia = []
    for k in range(1, k_max + 1):
        clustering.n_clusters = k
        assignments = clustering.fit_predict(data)
        ondata_inertia.append(compute_inertia(assignments, data))

    gap = np.log(reference_inertia) - np.log(ondata_inertia)
    return gap, np.log(reference_inertia), np.log(ondata_inertia)

k_max = 5
gap, reference_inertia, ondata_inertia = compute_gap(KMeans(), X, k_max)

plt.plot(range(1, k_max + 1), reference_inertia, '-o', label='reference')
plt.plot(range(1, k_max + 1), ondata_inertia, '-o', label='data')
plt.xlabel('k')
plt.ylabel('log(inertia)')
plt.legend()
plt.show()
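As a side note, compute_gap also accepts 1-D input thanks to the reshape at the top. A quick usage sketch (hypothetical data under the made-up name x1d, not from the original post) with two well-separated modes:

x1d = np.concatenate([np.random.normal(0, 1, 50),
                      np.random.normal(6, 1, 50)])
gap1d, _, _ = compute_gap(KMeans(), x1d, k_max=5)
print('best k:', int(np.argmax(gap1d)) + 1)  # should print 2 for two clear modes

Now back to the two curves we just plotted for the blob dataset.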
On the reference dataset the inertia decreases very slowly, while on the target dataset it takes the shape of an elbow! We can now compute the Gap Statistic for each K as the difference between the two curves shown above.
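In the notation of the paper cited at the end of this post, this difference is

$$\mathrm{Gap}(k) = \mathbb{E}^{*}\left[\log W_k\right] - \log W_k$$

where $W_k$ is the within-cluster dispersion for $k$ clusters and the expectation $\mathbb{E}^{*}$ is estimated by averaging $\log W_k$ over the uniform reference samples (the reference_inertia term in the code above). Plotting the gap for each K: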
plt.plot(range(1, k_max + 1), gap, '-o')
plt.ylabel('gap')
plt.xlabel('k')
plt.show()
It's easy to see that the Gap is maximal at K=3, exactly the right choice for our target dataset.
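The same choice can be made programmatically; here a minimal sketch using a simple argmax rule (the paper cited below also proposes a stricter selection rule based on the standard error of the reference runs, which is not implemented here):

best_k = int(np.argmax(gap)) + 1  # +1 because k runs from 1 to k_max
print('best k:', best_k)  # expected: 3 on this dataset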
For a more formal introduction you can check out the following paper: Tibshirani R, Walther G, Hastie T. Estimating the number of clusters in a data set via the gap statistic. Journal of the Royal Statistical Society: Series B. 2001;63(2):411-423.