Visualize a scikit-learn pipeline¶
Pipelines can get big with scikit-learn; let’s dig into a visual way to look at them.
Simple model¶
Let’s visualize a simple pipeline, a single model which is not even trained.
from numpy.random import randn
import pandas
from PIL import Image
from sphinx_runpython.runpython import run_cmd
from sklearn import datasets
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import (
OneHotEncoder,
StandardScaler,
MinMaxScaler,
PolynomialFeatures,
)
from mlinsights.helpers.pipeline import (
alter_pipeline_for_debugging,
enumerate_pipeline_models,
)
from mlinsights.plotting import pipeline2dot, pipeline2str
iris = datasets.load_iris()
X = iris.data[:, :4]
df = pandas.DataFrame(X)
df.columns = ["X1", "X2", "X3", "X4"]
clf = LogisticRegression()
clf
The trick consists in converting the pipeline into a graph expressed in the DOT language.
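The call that produced the listing below is not shown on this page; most likely it is pipeline2dot applied to the model and the dataframe, as in the later examples. A minimal sketch, assuming that is the case:

# Hedged guess at the missing call: convert the estimator to DOT text.
dot = pipeline2dot(clf, df)
print(dot)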
digraph{
orientation=portrait;
nodesep=0.05;
ranksep=0.25;
sch0[label="<f0> X1|<f1> X2|<f2> X3|<f3> X4",shape=record,fontsize=8];
node1[label="union",shape=box,style="filled,rounded",color=cyan,fontsize=12];
sch0:f0 -> node1;
sch0:f1 -> node1;
sch0:f2 -> node1;
sch0:f3 -> node1;
sch1[label="<f0> -v-0",shape=record,fontsize=8];
node1 -> sch1:f0;
node2[label="LogisticRegression",shape=box,style="filled,rounded",color=yellow,fontsize=12];
sch1:f0 -> node2;
sch2[label="<f0> PredictedLabel|<f1> Probabilities",shape=record,fontsize=8];
node2 -> sch2:f0;
node2 -> sch2:f1;
}
It is a lot better with an image.
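The rendering step is also missing here; it presumably mirrors the FeatureUnion example further below: write the DOT text to a file, call Graphviz through run_cmd, and open the resulting PNG with PIL. A sketch under that assumption (the file name graph.dot is hypothetical):

dot_file = "graph.dot"  # hypothetical file name
with open(dot_file, "w", encoding="utf-8") as f:
    f.write(dot)
cmd = "dot -G=300 -Tpng {0} -o{0}.png".format(dot_file)  # requires Graphviz
run_cmd(cmd, wait=True)
img = Image.open("graph.dot.png")
img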
<PIL.PngImagePlugin.PngImageFile image mode=RGBA size=200x351 at 0x79FB8293CAD0>
Complex pipeline¶
scikit-learn introduced a couple of transforms to play with features in a single pipeline. The following example is taken from Column Transformer with Mixed Types.
columns = [
"pclass",
"name",
"sex",
"age",
"sibsp",
"parch",
"ticket",
"fare",
"cabin",
"embarked",
"boat",
"body",
"home.dest",
]
numeric_features = ["age", "fare"]
numeric_transformer = Pipeline(
steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
)
categorical_features = ["embarked", "sex", "pclass"]
categorical_transformer = Pipeline(
steps=[
("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
("onehot", OneHotEncoder(handle_unknown="ignore")),
]
)
preprocessor = ColumnTransformer(
transformers=[
("num", numeric_transformer, numeric_features),
("cat", categorical_transformer, categorical_features),
]
)
clf = Pipeline(
steps=[
("preprocessor", preprocessor),
("classifier", LogisticRegression(solver="lbfgs")),
]
)
clf
Let’s first look at it as simplified text.
print(pipeline2str(clf))
Pipeline
ColumnTransformer
Pipeline(age,fare)
SimpleImputer
StandardScaler
Pipeline(embarked,sex,pclass)
SimpleImputer
OneHotEncoder
LogisticRegression
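The image below was rendered with the same recipe: convert the full pipeline to DOT, run Graphviz, open the PNG. A sketch, assuming the hypothetical file name graph2.dot:

dot = pipeline2dot(clf, columns)
dot_file = "graph2.dot"  # hypothetical file name
with open(dot_file, "w", encoding="utf-8") as f:
    f.write(dot)
cmd = "dot -G=300 -Tpng {0} -o{0}.png".format(dot_file)  # requires Graphviz
run_cmd(cmd, wait=True)
img = Image.open("graph2.dot.png")
img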
<PIL.PngImagePlugin.PngImageFile image mode=RGBA size=591x787 at 0x79FB837969C0>
Example with FeatureUnion¶
model = Pipeline(
[
("poly", PolynomialFeatures()),
(
"union",
FeatureUnion([("scaler2", MinMaxScaler()), ("scaler3", StandardScaler())]),
),
]
)
dot = pipeline2dot(model, columns)
dot_file = "graph3.dot"
with open(dot_file, "w", encoding="utf-8") as f:
f.write(dot)
cmd = "dot -G=300 -Tpng {0} -o{0}.png".format(dot_file)
run_cmd(cmd, wait=True)
img = Image.open("graph3.dot.png")
img
<PIL.PngImagePlugin.PngImageFile image mode=RGB size=306x569 at 0x79FB83FFE9F0>
Compute intermediate outputs¶
It is difficult to access intermediate outputs with scikit-learn but it may be interesting to do so. The function alter_pipeline_for_debugging modifies the pipeline to intercept intermediate outputs.
model = Pipeline(
[
("scaler1", StandardScaler()),
(
"union",
FeatureUnion([("scaler2", StandardScaler()), ("scaler3", MinMaxScaler())]),
),
("lr", LinearRegression()),
]
)
X = randn(4, 5)
y = randn(4)
model.fit(X, y)
print(pipeline2str(model))
Pipeline
StandardScaler
FeatureUnion
StandardScaler
MinMaxScaler
LinearRegression
Let’s now modify the pipeline to get the intermediate outputs.
alter_pipeline_for_debugging(model)
The function adds a member _debug which stores inputs and outputs in
every piece of the pipeline.
model.steps[0][1]._debug
BaseEstimatorDebugInformation(StandardScaler)
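The array just below is most likely the result of calling predict on the pipeline, which is also what fills the debug information; a hedged reconstruction of the missing call:

model.predict(X)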
array([-1.26141698, 0.00345816, -0.13232125, 0.454529 ])
The member was populated with inputs and outputs.
model.steps[0][1]._debug
BaseEstimatorDebugInformation(StandardScaler)
transform(
shape=(4, 5) type=<class 'numpy.ndarray'>
[[ 0.28018206 -1.00718116 0.84747923 0.64667883 0.36314504]
[ 0.93337 -1.72129241 1.18326982 -0.57110584 -1.78215473]
[ 1.66587367 0.12213741 -0.86504416 0.07605191 -0.1783714 ]
[ 0.02121704 -0.2879426 -0.41906187 -1.24244392 -0.92573229]]
) -> (
shape=(4, 5) type=<class 'numpy.ndarray'>
[[-0.69882086 -0.40301849 0.77570501 1.30139781 1.23157427]
[ 0.32698426 -1.41778717 1.16987444 -0.42239009 -1.42667488]
[ 1.47735151 1.20176944 -1.23454898 0.49366898 0.56057917]
[-1.10551491 0.61903621 -0.71103048 -1.3726767 -0.36547856]]
)
Every piece behaves the same way.
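The dump that follows pairs each submodel's coordinates with its _debug member. It was presumably produced by walking the pipeline with enumerate_pipeline_models, which is imported above; a sketch under that assumption (the exact loop is a guess):

# Hedged guess: iterate over every submodel, print its coordinates and its debug info.
for coor, m, _ in enumerate_pipeline_models(model):
    print(coor)
    print(m._debug)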
(0,)
BaseEstimatorDebugInformation(Pipeline)
predict(
shape=(4, 5) type=<class 'numpy.ndarray'>
[[ 0.28018206 -1.00718116 0.84747923 0.64667883 0.36314504]
[ 0.93337 -1.72129241 1.18326982 -0.57110584 -1.78215473]
[ 1.66587367 0.12213741 -0.86504416 0.07605191 -0.1783714 ]
[ 0.02121704 -0.2879426 -0.41906187 -1.24244392 -0.92573229]]
) -> (
shape=(4,) type=<class 'numpy.ndarray'>
[-1.26141698 0.00345816 -0.13232125 0.454529 ]
)
(0, 0)
BaseEstimatorDebugInformation(StandardScaler)
transform(
shape=(4, 5) type=<class 'numpy.ndarray'>
[[ 0.28018206 -1.00718116 0.84747923 0.64667883 0.36314504]
[ 0.93337 -1.72129241 1.18326982 -0.57110584 -1.78215473]
[ 1.66587367 0.12213741 -0.86504416 0.07605191 -0.1783714 ]
[ 0.02121704 -0.2879426 -0.41906187 -1.24244392 -0.92573229]]
) -> (
shape=(4, 5) type=<class 'numpy.ndarray'>
[[-0.69882086 -0.40301849 0.77570501 1.30139781 1.23157427]
[ 0.32698426 -1.41778717 1.16987444 -0.42239009 -1.42667488]
[ 1.47735151 1.20176944 -1.23454898 0.49366898 0.56057917]
[-1.10551491 0.61903621 -0.71103048 -1.3726767 -0.36547856]]
)
(0, 1)
BaseEstimatorDebugInformation(FeatureUnion)
transform(
shape=(4, 5) type=<class 'numpy.ndarray'>
[[-0.69882086 -0.40301849 0.77570501 1.30139781 1.23157427]
[ 0.32698426 -1.41778717 1.16987444 -0.42239009 -1.42667488]
[ 1.47735151 1.20176944 -1.23454898 0.49366898 0.56057917]
[-1.10551491 0.61903621 -0.71103048 -1.3726767 -0.36547856]]
) -> (
shape=(4, 10) type=<class 'numpy.ndarray'>
[[-0.69882086 -0.40301849 0.77570501 1.30139781 1.23157427 0.15745841
0.38738185 0.83606488 1. 1. ]
[ 0.32698426 -1.41778717 1.16987444 -0.42239009 -1.42667488 0.55461605
0. 1. 0.35537028 0. ]
[ 1.47735151 1.20176944 -1.23454898 0.49366898 0.56057917 1.
...
)
(0, 1, 0)
BaseEstimatorDebugInformation(StandardScaler)
transform(
shape=(4, 5) type=<class 'numpy.ndarray'>
[[-0.69882086 -0.40301849 0.77570501 1.30139781 1.23157427]
[ 0.32698426 -1.41778717 1.16987444 -0.42239009 -1.42667488]
[ 1.47735151 1.20176944 -1.23454898 0.49366898 0.56057917]
[-1.10551491 0.61903621 -0.71103048 -1.3726767 -0.36547856]]
) -> (
shape=(4, 5) type=<class 'numpy.ndarray'>
[[-0.69882086 -0.40301849 0.77570501 1.30139781 1.23157427]
[ 0.32698426 -1.41778717 1.16987444 -0.42239009 -1.42667488]
[ 1.47735151 1.20176944 -1.23454898 0.49366898 0.56057917]
[-1.10551491 0.61903621 -0.71103048 -1.3726767 -0.36547856]]
)
(0, 1, 1)
BaseEstimatorDebugInformation(MinMaxScaler)
transform(
shape=(4, 5) type=<class 'numpy.ndarray'>
[[-0.69882086 -0.40301849 0.77570501 1.30139781 1.23157427]
[ 0.32698426 -1.41778717 1.16987444 -0.42239009 -1.42667488]
[ 1.47735151 1.20176944 -1.23454898 0.49366898 0.56057917]
[-1.10551491 0.61903621 -0.71103048 -1.3726767 -0.36547856]]
) -> (
shape=(4, 5) type=<class 'numpy.ndarray'>
[[0.15745841 0.38738185 0.83606488 1. 1. ]
[0.55461605 0. 1. 0.35537028 0. ]
[1. 1. 0. 0.69794079 0.74758006]
[0. 0.77754509 0.21773141 0. 0.39920875]]
)
(0, 2)
BaseEstimatorDebugInformation(LinearRegression)
predict(
shape=(4, 10) type=<class 'numpy.ndarray'>
[[-0.69882086 -0.40301849 0.77570501 1.30139781 1.23157427 0.15745841
0.38738185 0.83606488 1. 1. ]
[ 0.32698426 -1.41778717 1.16987444 -0.42239009 -1.42667488 0.55461605
0. 1. 0.35537028 0. ]
[ 1.47735151 1.20176944 -1.23454898 0.49366898 0.56057917 1.
...
) -> (
shape=(4,) type=<class 'numpy.ndarray'>
[-1.26141698 0.00345816 -0.13232125 0.454529 ]
)
Total running time of the script: (0 minutes 0.282 seconds)