Note

Go to the end to download the full example code.

Decision Tree and Logistic Regression¶

The notebook demonstrates the model DecisionTreeLogisticRegression which replaces the decision based on one variable by a logistic regression.

Iris dataset and logistic regression¶

The following code shows the border defined by two machine learning models on the Iris dataset.

import numpy
from scipy.spatial.distance import cdist
import matplotlib.pyplot as plt
from pandas import DataFrame
from tqdm import tqdm
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from mlinsights.mlmodel import DecisionTreeLogisticRegression
from mlinsights.mltree import predict_leaves


def plot_classifier_decision_zone(clf, X, y, title=None, ax=None):
    if ax is None:
        ax = plt.gca()

    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    dhx = (x_max - x_min) / 100
    dhy = (y_max - y_min) / 100
    xx, yy = numpy.meshgrid(
        numpy.arange(x_min, x_max, dhx), numpy.arange(y_min, y_max, dhy)
    )

    Z = clf.predict(numpy.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    ax.contourf(xx, yy, Z, alpha=0.5)
    ax.scatter(X[:, 0], X[:, 1], c=y, s=20, edgecolor="k", lw=0.5)
    if title is not None:
        ax.set_title(title)


iris = load_iris()
X = iris.data[:, [0, 2]]
y = iris.target
y = y % 2
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.6, shuffle=True)

lr = LogisticRegression()
lr.fit(X_train, y_train)

LogisticRegression()

In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

dt = DecisionTreeClassifier(criterion="entropy")
dt.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy')

fig, ax = plt.subplots(1, 2, figsize=(10, 4))
plot_classifier_decision_zone(lr, X_test, y_test, ax=ax[0], title="LogisticRegression")
plot_classifier_decision_zone(
    dt, X_test, y_test, ax=ax[1], title="DecisionTreeClassifier"
)

LogisticRegression, DecisionTreeClassifier

The logistic regression is not very stable on this sort of problem. No linear separator can work on this dataset. Let’s dig into it.

DecisionTreeLogisticRegression¶

dtlr = DecisionTreeLogisticRegression(
    estimator=LogisticRegression(solver="liblinear"),
    min_samples_leaf=10,
    min_samples_split=10,
    max_depth=1,
    fit_improve_algo="none",
)
dtlr.fit(X_train, y_train)

DecisionTreeLogisticRegression(estimator=LogisticRegression(solver='liblinear'),
                               fit_improve_algo='none', max_depth=1,
                               min_samples_leaf=10, min_samples_split=10)

DecisionTreeLogisticRegression

iFitted

Parameters

	estimator	LogisticRegre...r='liblinear')
	max_depth	1
	min_samples_split	10
	min_samples_leaf	10
	min_weight_fraction_leaf	0.0
	fit_improve_algo	'none'
	p1p2	0.09
	gamma	1.0
	verbose	0
	strategy	'parallel'

estimator: LogisticRegression

LogisticRegression(solver='liblinear')

LogisticRegression

?Documentation for LogisticRegression

Parameters

	penalty	'l2'
	dual	False
	tol	0.0001
	C	1.0
	fit_intercept	True
	intercept_scaling	1
	class_weight	None
	random_state	None
	solver	'liblinear'
	max_iter	100
	multi_class	'deprecated'
	verbose	0
	warm_start	False
	n_jobs	None
	l1_ratio	None

dtlr2 = DecisionTreeLogisticRegression(
    estimator=LogisticRegression(solver="liblinear"),
    min_samples_leaf=4,
    min_samples_split=4,
    max_depth=10,
    fit_improve_algo="intercept_sort_always",
)
dtlr2.fit(X_train, y_train)

fig, ax = plt.subplots(2, 2, figsize=(10, 8))
plot_classifier_decision_zone(
    dtlr,
    X_train,
    y_train,
    ax=ax[0, 0],
    title="DecisionTreeLogisticRegression\ndepth=%d - train" % dtlr.tree_depth_,
)
plot_classifier_decision_zone(
    dtlr2,
    X_train,
    y_train,
    ax=ax[0, 1],
    title="DecisionTreeLogisticRegression\ndepth=%d - train" % dtlr2.tree_depth_,
)
plot_classifier_decision_zone(
    dtlr,
    X_test,
    y_test,
    ax=ax[1, 0],
    title="DecisionTreeLogisticRegression\ndepth=%d - test" % dtlr.tree_depth_,
)
plot_classifier_decision_zone(
    dtlr2,
    X_test,
    y_test,
    ax=ax[1, 1],
    title="DecisionTreeLogisticRegression\ndepth=%d - test" % dtlr2.tree_depth_,
)

DecisionTreeLogisticRegression depth=1 - train, DecisionTreeLogisticRegression depth=5 - train, DecisionTreeLogisticRegression depth=1 - test, DecisionTreeLogisticRegression depth=5 - test

rows = []
for model in [lr, dt, dtlr, dtlr2]:
    val = (" - depth=%d" % model.tree_depth_) if hasattr(model, "tree_depth_") else ""
    obs = dict(
        name="%s%s" % (model.__class__.__name__, val), score=model.score(X_test, y_test)
    )
    rows.append(obs)

DataFrame(rows)

	name	score
0	LogisticRegression	0.644444
1	DecisionTreeClassifier	0.877778
2	DecisionTreeLogisticRegression - depth=1	0.600000
3	DecisionTreeLogisticRegression - depth=5	0.900000

A first example¶

def random_set_simple(n):
    X = numpy.random.rand(n, 2)
    y = ((X[:, 0] ** 2 + X[:, 1] ** 2) <= 1).astype(numpy.int32).ravel()
    return X, y


X, y = random_set_simple(2000)
X_train, X_test, y_train, y_test = train_test_split(X, y)
dt = DecisionTreeClassifier(max_depth=3)
dt.fit(X_train, y_train)
dt8 = DecisionTreeClassifier(max_depth=10)
dt8.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=10)

fig, ax = plt.subplots(1, 2, figsize=(10, 4), sharey=True)
plot_classifier_decision_zone(
    dt,
    X_test,
    y_test,
    ax=ax[0],
    title="DecisionTree - max_depth=%d\nacc=%1.2f"
    % (dt.max_depth, dt.score(X_test, y_test)),
)
plot_classifier_decision_zone(
    dt8,
    X_test,
    y_test,
    ax=ax[1],
    title="DecisionTree - max_depth=%d\nacc=%1.2f"
    % (dt8.max_depth, dt8.score(X_test, y_test)),
)
ax[0].set_xlim([0, 1])
ax[1].set_xlim([0, 1])
ax[0].set_ylim([0, 1])

dtlr = DecisionTreeLogisticRegression(
    max_depth=3, fit_improve_algo="intercept_sort_always", verbose=1
)
dtlr.fit(X_train, y_train)
dtlr8 = DecisionTreeLogisticRegression(
    max_depth=10, min_samples_split=4, fit_improve_algo="intercept_sort_always"
)
dtlr8.fit(X_train, y_train)

fig, ax = plt.subplots(1, 2, figsize=(10, 4), sharey=True)
plot_classifier_decision_zone(
    dtlr,
    X_test,
    y_test,
    ax=ax[0],
    title="DecisionTreeLogReg - depth=%d\nacc=%1.2f"
    % (dtlr.tree_depth_, dtlr.score(X_test, y_test)),
)
plot_classifier_decision_zone(
    dtlr8,
    X_test,
    y_test,
    ax=ax[1],
    title="DecisionTreeLogReg - depth=%d\nacc=%1.2f"
    % (dtlr8.tree_depth_, dtlr8.score(X_test, y_test)),
)
ax[0].set_xlim([0, 1])
ax[1].set_xlim([0, 1])
ax[0].set_ylim([0, 1])

[DTLR ]   trained acc 0.97 N=1500
[DTLRI]   change intercept 11.377595 --> 10.331268 in [0.393661, 16.182086]
[DTLR*]  above: n_class=2 N=1500 - 1106/1500
[DTLR ]    trained acc 0.99 N=1106
[DTLRI]    change intercept 5.835260 --> 1.473286 in [0.084415, 2.110296]
[DTLR*]   above: n_class=2 N=1106 - 723/1500
[DTLR ]     trained acc 1.00 N=723
[DTLRI]     change intercept 6.063263 --> 0.721621 in [-0.261547, 1.442163]
[DTLR*]   below: n_class=2 N=1106 - 383/1500
[DTLR ]     trained acc 0.98 N=383
[DTLRI]     change intercept 4.503035 --> 0.848029 in [0.172467, 1.172151]
[DTLR*]  below: n_class=2 N=1500 - 394/1500
[DTLR ]    trained acc 0.74 N=394
[DTLRI]    change intercept 6.099474 --> 6.900553 in [6.024330, 9.557904]
[DTLR*]   above: n_class=2 N=394 - 172/1500
[DTLR ]     trained acc 0.62 N=172
[DTLRI]     change intercept 1.698347 --> 1.222500 in [1.035121, 1.398987]
[DTLR*]   below: n_class=1 N=394 - 222/1500

(0.0, 1.0)

def draw_border(
    clr,
    X,
    y,
    fct=None,
    incx=0.1,
    incy=0.1,
    figsize=None,
    border=True,
    ax=None,
    s=10.0,
    linewidths=0.1,
):
    h = 0.02
    x_min, x_max = X[:, 0].min() - incx, X[:, 0].max() + incx
    y_min, y_max = X[:, 1].min() - incy, X[:, 1].max() + incy
    xx, yy = numpy.meshgrid(
        numpy.arange(x_min, x_max, h), numpy.arange(y_min, y_max, h)
    )
    if fct is None:
        Z = clr.predict(numpy.c_[xx.ravel(), yy.ravel()])
    else:
        Z = fct(clr, numpy.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    cmap = plt.cm.tab20
    Z = Z.reshape(xx.shape)
    if ax is None:
        fig, ax = plt.subplots(1, 1, figsize=figsize or (4, 3))
    ax.pcolormesh(xx, yy, Z, cmap=cmap)

    # Plot also the training points
    ax.scatter(
        X[:, 0], X[:, 1], c=y, edgecolors="k", cmap=cmap, s=s, linewidths=linewidths
    )

    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    return ax


fig, ax = plt.subplots(1, 2, figsize=(14, 4))
draw_border(dt, X_test, y_test, border=False, ax=ax[0])
ax[0].set_title("Iris")
draw_border(dt, X, y, border=False, ax=ax[1], fct=lambda m, x: predict_leaves(m, x))
ax[1].set_title("DecisionTree")

Text(0.5, 1.0, 'DecisionTree')

fig, ax = plt.subplots(6, 4, figsize=(12, 16))
for i, depth in tqdm(enumerate((1, 2, 3, 4, 5, 6))):
    dtl = DecisionTreeLogisticRegression(
        max_depth=depth, fit_improve_algo="intercept_sort_always", min_samples_leaf=2
    )
    dtl.fit(X_train, y_train)
    draw_border(dtl, X_test, y_test, border=False, ax=ax[i, 0], s=4.0)
    draw_border(
        dtl,
        X,
        y,
        border=False,
        ax=ax[i, 1],
        fct=lambda m, x: predict_leaves(m, x),
        s=4.0,
    )
    ax[i, 0].set_title(
        "Depth=%d nodes=%d score=%1.2f"
        % (dtl.tree_depth_, dtl.n_nodes_, dtl.score(X_test, y_test))
    )
    ax[i, 1].set_title("DTLR Leaves zones")

    dtl = DecisionTreeClassifier(max_depth=depth)
    dtl.fit(X_train, y_train)
    draw_border(dtl, X_test, y_test, border=False, ax=ax[i, 2], s=4.0)
    draw_border(
        dtl,
        X,
        y,
        border=False,
        ax=ax[i, 3],
        fct=lambda m, x: predict_leaves(m, x),
        s=4.0,
    )
    ax[i, 2].set_title(
        "Depth=%d nodes=%d score=%1.2f"
        % (dtl.max_depth, dtl.tree_.node_count, dtl.score(X_test, y_test))
    )
    ax[i, 3].set_title("DT Leaves zones")

    for k in range(ax.shape[1]):
        ax[i, k].get_xaxis().set_visible(False)

Depth=1 nodes=1 score=0.93, DTLR Leaves zones, Depth=1 nodes=3 score=0.80, DT Leaves zones, Depth=2 nodes=3 score=0.74, DTLR Leaves zones, Depth=2 nodes=7 score=0.90, DT Leaves zones, Depth=3 nodes=7 score=0.69, DTLR Leaves zones, Depth=3 nodes=15 score=0.93, DT Leaves zones, Depth=4 nodes=13 score=0.81, DTLR Leaves zones, Depth=4 nodes=29 score=0.96, DT Leaves zones, Depth=5 nodes=23 score=0.91, DTLR Leaves zones, Depth=5 nodes=43 score=0.97, DT Leaves zones, Depth=6 nodes=35 score=0.94, DTLR Leaves zones, Depth=6 nodes=53 score=0.99, DT Leaves zones

0it [00:00, ?it/s]
1it [00:00,  6.38it/s]
2it [00:00,  4.79it/s]
3it [00:00,  4.30it/s]
4it [00:00,  4.09it/s]
5it [00:01,  3.65it/s]
6it [00:01,  3.31it/s]
6it [00:01,  3.73it/s]

Another example designed to fail¶

Designed to be difficult with a regular decision tree.

def random_set(n):
    X = numpy.random.rand(n, 2)
    y = (
        (cdist(X, numpy.array([[0.5, 0.5]]), metric="minkowski", p=1) <= 0.5)
        .astype(numpy.int32)
        .ravel()
    )
    return X, y


X, y = random_set(2000)
X_train, X_test, y_train, y_test = train_test_split(X, y)
dt = DecisionTreeClassifier(max_depth=3)
dt.fit(X_train, y_train)
dt8 = DecisionTreeClassifier(max_depth=10)
dt8.fit(X_train, y_train)

fig, ax = plt.subplots(1, 2, figsize=(10, 4), sharey=True)
plot_classifier_decision_zone(
    dt,
    X_test,
    y_test,
    ax=ax[0],
    title="DecisionTree - max_depth=%d\nacc=%1.2f"
    % (dt.max_depth, dt.score(X_test, y_test)),
)
plot_classifier_decision_zone(
    dt8,
    X_test,
    y_test,
    ax=ax[1],
    title="DecisionTree - max_depth=%d\nacc=%1.2f"
    % (dt8.max_depth, dt8.score(X_test, y_test)),
)
ax[0].set_xlim([0, 1])
ax[1].set_xlim([0, 1])
ax[0].set_ylim([0, 1])

DecisionTree - max_depth=3 acc=0.77, DecisionTree - max_depth=10 acc=0.94

(0.0, 1.0)

The example is a square rotated by 45 degrees. Every sample in the square is a positive sample, every sample outside is a negative one. The tree approximates the border with horizontal and vertical lines.

dtlr = DecisionTreeLogisticRegression(
    max_depth=3, fit_improve_algo="intercept_sort_always", verbose=1
)
dtlr.fit(X_train, y_train)
dtlr8 = DecisionTreeLogisticRegression(
    max_depth=10, min_samples_split=4, fit_improve_algo="intercept_sort_always"
)
dtlr8.fit(X_train, y_train)

fig, ax = plt.subplots(1, 2, figsize=(10, 4), sharey=True)
plot_classifier_decision_zone(
    dtlr,
    X_test,
    y_test,
    ax=ax[0],
    title="DecisionTreeLogReg - depth=%d\nacc=%1.2f"
    % (dtlr.tree_depth_, dtlr.score(X_test, y_test)),
)
plot_classifier_decision_zone(
    dtlr8,
    X_test,
    y_test,
    ax=ax[1],
    title="DecisionTreeLogReg - depth=%d\nacc=%1.2f"
    % (dtlr8.tree_depth_, dtlr8.score(X_test, y_test)),
)
ax[0].set_xlim([0, 1])
ax[1].set_xlim([0, 1])
ax[0].set_ylim([0, 1])

DecisionTreeLogReg - depth=3 acc=0.61, DecisionTreeLogReg - depth=10 acc=0.96

[DTLR ]   trained acc 0.51 N=1500
[DTLRI]   change intercept -0.181642 --> -0.079865 in [-0.197704, 0.033901]
[DTLR*]  above: n_class=2 N=1500 - 748/1500
[DTLR ]    trained acc 0.77 N=748
[DTLRI]    change intercept 5.452602 --> 5.458000 in [2.941666, 7.381419]
[DTLR*]   above: n_class=2 N=748 - 370/1500
[DTLR ]     trained acc 0.77 N=370
[DTLRI]     change intercept 2.774303 --> 1.598424 in [1.029508, 1.906619]
[DTLR*]   below: n_class=2 N=748 - 378/1500
[DTLR ]     trained acc 0.77 N=378
[DTLRI]     change intercept 1.517726 --> 2.694421 in [2.252316, 3.265961]
[DTLR*]  below: n_class=2 N=1500 - 752/1500
[DTLR ]    trained acc 0.75 N=752
[DTLRI]    change intercept -1.973951 --> -1.843201 in [-4.059739, -0.015238]
[DTLR*]   above: n_class=2 N=752 - 354/1500
[DTLR ]     trained acc 0.72 N=354
[DTLRI]     change intercept 0.173448 --> -0.740789 in [-1.155369, -0.407925]
[DTLR*]   below: n_class=2 N=752 - 398/1500
[DTLR ]     trained acc 0.79 N=398
[DTLRI]     change intercept -1.820105 --> -0.515863 in [-0.969611, -0.009603]

(0.0, 1.0)

Leave zones¶

# We use method *decision_path* to understand which leaf is responsible
# for which zone.


fig, ax = plt.subplots(1, 2, figsize=(14, 4))
draw_border(dtlr, X_test, y_test, border=False, ax=ax[0])
ax[0].set_title("Iris")
draw_border(dtlr, X, y, border=False, ax=ax[1], fct=lambda m, x: predict_leaves(m, x))
ax[1].set_title("DecisionTreeLogisticRegression")

Text(0.5, 1.0, 'DecisionTreeLogisticRegression')

fig, ax = plt.subplots(6, 4, figsize=(12, 16))
for i, depth in tqdm(enumerate((1, 2, 3, 4, 5, 6))):
    dtl = DecisionTreeLogisticRegression(
        max_depth=depth, fit_improve_algo="intercept_sort_always", min_samples_leaf=2
    )
    dtl.fit(X_train, y_train)
    draw_border(dtl, X_test, y_test, border=False, ax=ax[i, 0], s=4.0)
    draw_border(
        dtl,
        X,
        y,
        border=False,
        ax=ax[i, 1],
        fct=lambda m, x: predict_leaves(m, x),
        s=4.0,
    )
    ax[i, 0].set_title(
        "Depth=%d nodes=%d score=%1.2f"
        % (dtl.tree_depth_, dtl.n_nodes_, dtl.score(X_test, y_test))
    )
    ax[i, 1].set_title("DTLR Leaves zones")

    dtl = DecisionTreeClassifier(max_depth=depth)
    dtl.fit(X_train, y_train)
    draw_border(dtl, X_test, y_test, border=False, ax=ax[i, 2], s=4.0)
    draw_border(
        dtl,
        X,
        y,
        border=False,
        ax=ax[i, 3],
        fct=lambda m, x: predict_leaves(m, x),
        s=4.0,
    )
    ax[i, 2].set_title(
        "Depth=%d nodes=%d score=%1.2f"
        % (dtl.max_depth, dtl.tree_.node_count, dtl.score(X_test, y_test))
    )
    ax[i, 3].set_title("DT Leaves zones")

Depth=1 nodes=1 score=0.52, DTLR Leaves zones, Depth=1 nodes=3 score=0.63, DT Leaves zones, Depth=2 nodes=3 score=0.73, DTLR Leaves zones, Depth=2 nodes=7 score=0.73, DT Leaves zones, Depth=3 nodes=7 score=0.61, DTLR Leaves zones, Depth=3 nodes=13 score=0.77, DT Leaves zones, Depth=4 nodes=15 score=0.62, DTLR Leaves zones, Depth=4 nodes=23 score=0.85, DT Leaves zones, Depth=5 nodes=31 score=0.80, DTLR Leaves zones, Depth=5 nodes=39 score=0.86, DT Leaves zones, Depth=6 nodes=63 score=0.82, DTLR Leaves zones, Depth=6 nodes=63 score=0.90, DT Leaves zones

0it [00:00, ?it/s]
1it [00:00,  7.18it/s]
2it [00:00,  5.94it/s]
3it [00:00,  4.73it/s]
4it [00:00,  3.73it/s]
5it [00:01,  3.05it/s]
6it [00:01,  2.70it/s]
6it [00:01,  3.27it/s]

Total running time of the script: (0 minutes 10.396 seconds)

Gallery generated by Sphinx-Gallery

	criterion	'entropy'
	splitter	'best'
	max_depth	None
	min_samples_split	2
	min_samples_leaf	1
	min_weight_fraction_leaf	0.0
	max_features	None
	random_state	None
	max_leaf_nodes	None
	min_impurity_decrease	0.0
	class_weight	None
	ccp_alpha	0.0
	monotonic_cst	None

	criterion	'gini'
	splitter	'best'
	max_depth	10
	min_samples_split	2
	min_samples_leaf	1
	min_weight_fraction_leaf	0.0
	max_features	None
	random_state	None
	max_leaf_nodes	None
	min_impurity_decrease	0.0
	class_weight	None
	ccp_alpha	0.0
	monotonic_cst	None