import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.metrics import pairwise_distances
from sklearn.cluster import KMeans

# 100 points drawn uniformly at random: a reference with no cluster structure
reference = np.random.rand(100, 2)

plt.figure(figsize=(12, 3))
for k in range(1, 6):
    kmeans = KMeans(n_clusters=k)
    a = kmeans.fit_predict(reference)
    plt.subplot(1, 5, k)
    plt.scatter(reference[:, 0], reference[:, 1], c=a)
    plt.xlabel('k=' + str(k))
plt.tight_layout()
plt.show()
From the figure above we can see that the algorithm evenly splits the points into K clusters even though there is no separation between them. Let's now do the same on a target dataset with 3 natural clusters:
# 100 points grouped in 3 well-separated blobs
X = make_blobs(n_samples=100, n_features=2,
               centers=3, cluster_std=.8)[0]

plt.figure(figsize=(12, 3))
for k in range(1, 6):
    kmeans = KMeans(n_clusters=k)
    a = kmeans.fit_predict(X)
    plt.subplot(1, 5, k)
    plt.scatter(X[:, 0], X[:, 1], c=a)
    plt.xlabel('k=' + str(k))
plt.tight_layout()
plt.show()
Here we note that with K=2 the algorithm correctly isolates one of the clusters while grouping the other two together; with K=3 it correctly identifies the natural clusters; but with K=4 and K=5 some of the natural clusters are split in two. If we plot the inertia in both cases we'll see something interesting:
def compute_inertia(a, X):
    # within-cluster dispersion: mean pairwise distance within each
    # cluster, averaged over the clusters
    W = [np.mean(pairwise_distances(X[a == c, :])) for c in np.unique(a)]
    return np.mean(W)
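To get a feel for this dispersion measure, here is a quick sanity check (my addition, not part of the original recipe): on the blob data generated above, lumping everything into a single cluster should give a much larger value than using the three natural groups:

labels_1 = np.zeros(len(X), dtype=int)          # everything in one cluster
labels_3 = KMeans(n_clusters=3).fit_predict(X)  # the natural grouping
print(compute_inertia(labels_1, X))  # large: distances span the whole dataset
print(compute_inertia(labels_3, X))  # much smaller: distances within tight blobs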
def compute_gap(clustering, data, k_max=5, n_references=5):
    if len(data.shape) == 1:
        data = data.reshape(-1, 1)
    # expected dispersion under the null hypothesis, estimated by
    # clustering uniform random references with the same shape as the data
    reference_inertia = []
    for k in range(1, k_max + 1):
        local_inertia = []
        for _ in range(n_references):
            # draw a fresh reference for each run: the gap statistic
            # calls for several independent null datasets
            reference = np.random.rand(*data.shape)
            clustering.n_clusters = k
            assignments = clustering.fit_predict(reference)
            local_inertia.append(compute_inertia(assignments, reference))
        reference_inertia.append(np.mean(local_inertia))
    # dispersion observed on the actual data
    ondata_inertia = []
    for k in range(1, k_max + 1):
        clustering.n_clusters = k
        assignments = clustering.fit_predict(data)
        ondata_inertia.append(compute_inertia(assignments, data))
    # Gap(k) = log(expected dispersion) - log(observed dispersion)
    gap = np.log(reference_inertia) - np.log(ondata_inertia)
    return gap, np.log(reference_inertia), np.log(ondata_inertia)
k_max = 5
gap, reference_inertia, ondata_inertia = compute_gap(KMeans(), X, k_max)

plt.plot(range(1, k_max + 1), reference_inertia,
         '-o', label='reference')
plt.plot(range(1, k_max + 1), ondata_inertia,
         '-o', label='data')
plt.xlabel('k')
plt.ylabel('log(inertia)')
plt.legend()
plt.show()
On the reference dataset the inertia decreases very slowly, while on the target dataset it takes the shape of an elbow! We can now compute the Gap Statistic for each K as the difference between the two curves shown above:
plt.plot(range(1, k_max + 1), gap, '-o')
plt.ylabel('gap')
plt.xlabel('k')
plt.show()
It's easy to see that the Gap reaches its maximum at K=3, just the right choice for our target dataset.
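If we prefer to pick K programmatically instead of reading it off the plot, a minimal sketch (my addition, assuming the maximum of the gap curve is the criterion we want) is:

best_k = np.argmax(gap) + 1  # gap[i] corresponds to K = i + 1
print('best K according to the gap statistic:', best_k)  # 3 here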
For a more formal introduction you can check out the following paper: Tibshirani R, Walther G, Hastie T. Estimating the number of clusters in a data set via the gap statistic. Journal of the Royal Statistical Society: Series B. 2001.
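The paper also proposes a more conservative selection rule: pick the smallest K such that Gap(K) >= Gap(K+1) - s(K+1), where s(k) accounts for the simulation error of the reference runs. Here is a sketch of that rule (the name gap_with_error is mine, and for simplicity it recomputes the inertias rather than reusing compute_gap; data is assumed to be 2-D):

def gap_with_error(clustering, data, k_max=5, n_references=5):
    # gap statistic plus the error term s_k = sd_k * sqrt(1 + 1/B)
    # from Tibshirani, Walther and Hastie (2001)
    gaps, s_k = [], []
    for k in range(1, k_max + 1):
        clustering.n_clusters = k
        log_ref = []
        for _ in range(n_references):
            reference = np.random.rand(*data.shape)
            log_ref.append(np.log(compute_inertia(
                clustering.fit_predict(reference), reference)))
        log_data = np.log(compute_inertia(clustering.fit_predict(data), data))
        gaps.append(np.mean(log_ref) - log_data)
        s_k.append(np.std(log_ref) * np.sqrt(1 + 1 / n_references))
    # smallest K with Gap(K) >= Gap(K+1) - s_{K+1}
    for k in range(k_max - 1):
        if gaps[k] >= gaps[k + 1] - s_k[k + 1]:
            return k + 1
    return k_max

print(gap_with_error(KMeans(), X))  # expected to pick 3 on the blob data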