Constraint KMeans#

A simple example showing how to cluster data while keeping approximately the same number of points in every cluster.

Data#

from collections import Counter

import matplotlib.pyplot as plt
import numpy
from mlinsights.mlmodel import ConstraintKMeans
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

# Toy dataset made of two groups of blobs: `n_samples` points whose centers
# are drawn in the box (-10, 0) and half as many whose centers are drawn in
# the box (0, 10). Every other setting is shared by both groups.
n_samples = 100
blob_settings = dict(
    n_features=2,
    centers=2,
    cluster_std=1.0,
    shuffle=True,
    random_state=2,
)

# Only the first item returned by make_blobs is kept.
X1 = make_blobs(
    n_samples=n_samples, center_box=(-10.0, 0.0), **blob_settings
)[0]
X2 = make_blobs(
    n_samples=n_samples // 2, center_box=(0.0, 10.0), **blob_settings
)[0]

# Stack both groups row-wise into one array.
X = numpy.vstack((X1, X2))
X.shape
(150, 2)

Plots.

# Scatter the raw points, first column against second column.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(4, 4))
first_coord, second_coord = X[:, 0], X[:, 1]
ax.plot(first_coord, second_coord, ".")
ax.set_title("4 clusters")
4 clusters
Text(0.5, 1.0, '4 clusters')

Standard KMeans#

# Baseline: a plain KMeans with 4 clusters. `n_init` is set explicitly to the
# value that was used implicitly so far; this silences the FutureWarning
# about the changing default and keeps the behavior stable across
# scikit-learn versions.
km = KMeans(n_clusters=4, n_init=10)
km.fit(X)
cl = km.predict(X)
# Number of points assigned to each cluster, displayed in the plot title.
hist = Counter(cl)

# One matplotlib color letter per cluster index.
colors = "brgy"
fig, ax = plt.subplots(1, 1, figsize=(4, 4))
for i in range(0, max(cl) + 1):
    members = cl == i
    ax.plot(X[members, 0], X[members, 1], colors[i] + ".", label=f"cl{i}")
    # The cluster center is drawn as a "+" in the same color as its points.
    center_x = km.cluster_centers_[i, 0]
    center_y = km.cluster_centers_[i, 1]
    ax.plot([center_x], [center_y], colors[i] + "+")
ax.set_title(f"KMeans 4 clusters\n{hist!r}")
ax.legend()
KMeans 4 clusters Counter({0: 50, 2: 50, 1: 27, 3: 23})
/home/xadupre/install/scikit-learn/sklearn/cluster/_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)

<matplotlib.legend.Legend object at 0x7fed7b8c4850>

Constraint KMeans#

# Both models share every setting except the strategy.
balanced_settings = {"n_clusters": 4, "balanced_predictions": True}

km1 = ConstraintKMeans(strategy="gain", **balanced_settings)
km1.fit(X)

km2 = ConstraintKMeans(strategy="distance", **balanced_settings)
km2.fit(X)
ConstraintKMeans(balanced_predictions=True, n_clusters=4, strategy='distance')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.


This algorithm tries to exchange points between clusters.

# Cluster assignments and cluster sizes for both strategies. They are needed
# below to color the points and to fill in the titles.
cl1 = km1.predict(X)
hist1 = Counter(cl1)

cl2 = km2.predict(X)
hist2 = Counter(cl2)

fig, ax = plt.subplots(1, 2, figsize=(10, 4))
for i in range(0, max(cl1) + 1):
    # Left panel shows strategy "gain" (km1), right panel "distance" (km2).
    for axis, model, labels in ((ax[0], km1, cl1), (ax[1], km2, cl2)):
        members = labels == i
        axis.plot(X[members, 0], X[members, 1], colors[i] + ".", label=f"cl{i}")
        # The cluster center is drawn as a "+" in the cluster color.
        center_x = model.cluster_centers_[i, 0]
        center_y = model.cluster_centers_[i, 1]
        axis.plot([center_x], [center_y], colors[i] + "+")
ax[0].set_title(f"ConstraintKMeans 4 clusters (gains)\n{hist1!r}")
ax[0].legend()
ax[1].set_title(f"ConstraintKMeans 4 clusters (distances)\n{hist2!r}")
ax[1].legend()
ConstraintKMeans 4 clusters (gains) Counter({3: 39, 0: 37, 2: 37, 1: 37}), ConstraintKMeans 4 clusters (distances) Counter({1: 38, 2: 38, 0: 37, 3: 37})
<matplotlib.legend.Legend object at 0x7fed8cb7e500>

Another algorithm tries to extend the area of attraction of each cluster.

# Third strategy, "weights". history=True is requested because the centers
# recorded during training (cluster_centers_iter_) are plotted later.
weights_settings = {
    "n_clusters": 4,
    "strategy": "weights",
    "max_iter": 1000,
    "history": True,
}
km = ConstraintKMeans(**weights_settings)
km.fit(X)

# Assignment of every point and the resulting cluster sizes.
cl = km.predict(X)
hist = Counter(cl)

Let’s plot Delaunay edges as well.

def plot_delaunay(ax, edges, points):
    """Draw every edge as a dashed grey segment on *ax*.

    :param ax: object exposing a matplotlib-like ``plot`` method
    :param edges: iterable of index pairs ``(a, b)`` into *points*
    :param points: array indexed as ``points[[a, b], column]``; column 0
        is used as abscissa and column 1 as ordinate
    """
    for edge in edges:
        start, end = edge
        endpoints = [start, end]
        xs = points[endpoints, 0]
        ys = points[endpoints, 1]
        ax.plot(xs, ys, "--", color="#555555")


# Edges returned by the fitted model; they are drawn between the final
# cluster centers on the right panel.
edges = km.cluster_edges()


fig, ax = plt.subplots(1, 2, figsize=(10, 4))

# Left panel: final assignment of every point and the cluster centers.
for i in range(0, max(cl) + 1):
    members = cl == i
    ax[0].plot(X[members, 0], X[members, 1], colors[i] + ".", label=f"cl{i}")
    x = [km.cluster_centers_[i, 0], km.cluster_centers_[i, 0]]
    y = [km.cluster_centers_[i, 1], km.cluster_centers_[i, 1]]
    ax[0].plot(x, y, colors[i] + "+")
ax[0].set_title(f"ConstraintKMeans 4 clusters\nstrategy='weights'\n{hist!r}")
ax[0].legend()

# Right panel: centers recorded during training over the points in grey.
# NOTE(review): the last axis of cluster_centers_iter_ presumably indexes the
# iterations -- confirm against the mlinsights documentation.
cls = km.cluster_centers_iter_
ax[1].plot(X[:, 0], X[:, 1], ".", label="X", color="#AAAAAA", ms=3)
# Marker sizes grow linearly along the last axis of `cls`. They are the same
# for every cluster, so they are computed once, outside the loop.
n_steps = cls.shape[-1]
ms = numpy.arange(n_steps).astype(numpy.float64) / n_steps * 50 + 1
for i in range(0, max(cl) + 1):
    ax[1].scatter(cls[i, 0, :], cls[i, 1, :], color=colors[i], s=ms, label=f"cl{i}")
# The edges do not depend on the cluster index: drawing them once is enough.
plot_delaunay(ax[1], edges, km.cluster_centers_)
ax[1].set_title("Centers movement")
ConstraintKMeans 4 clusters strategy='weights' Counter({2: 49, 3: 49, 0: 48, 1: 4}), Centers movement
Text(0.5, 1.0, 'Centers movement')

Total running time of the script: (0 minutes 1.667 seconds)

Gallery generated by Sphinx-Gallery