Visualize a scikit-learn pipeline#
Pipelines can be big with scikit-learn; let’s dig into a visual way to look at them.
Simple model#
Let’s visualize a simple pipeline, a single model, not even trained.
from numpy.random import randn
import pandas
from PIL import Image
from sphinx_runpython.runpython import run_cmd
from sklearn import datasets
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import (
OneHotEncoder,
StandardScaler,
MinMaxScaler,
PolynomialFeatures,
)
from mlinsights.helpers.pipeline import (
alter_pipeline_for_debugging,
enumerate_pipeline_models,
)
from mlinsights.plotting import pipeline2dot, pipeline2str
iris = datasets.load_iris()
X = iris.data[:, :4]
df = pandas.DataFrame(X)
df.columns = ["X1", "X2", "X3", "X4"]
clf = LogisticRegression()
clf
The trick consists in converting the pipeline into a graph expressed in the DOT language.
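The conversion relies on pipeline2dot, imported above; a minimal sketch, assuming it takes the estimator and the input dataframe as in the FeatureUnion example further below, and producing the DOT text that follows:
dot = pipeline2dot(clf, df)
print(dot)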
digraph{
orientation=portrait;
nodesep=0.05;
ranksep=0.25;
sch0[label="<f0> X1|<f1> X2|<f2> X3|<f3> X4",shape=record,fontsize=8];
node1[label="union",shape=box,style="filled,rounded",color=cyan,fontsize=12];
sch0:f0 -> node1;
sch0:f1 -> node1;
sch0:f2 -> node1;
sch0:f3 -> node1;
sch1[label="<f0> -v-0",shape=record,fontsize=8];
node1 -> sch1:f0;
node2[label="LogisticRegression",shape=box,style="filled,rounded",color=yellow,fontsize=12];
sch1:f0 -> node2;
sch2[label="<f0> PredictedLabel|<f1> Probabilities",shape=record,fontsize=8];
node2 -> sch2:f0;
node2 -> sch2:f1;
}
It is a lot better as an image.
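A sketch of the rendering step, following the same pattern used later in this example with run_cmd and PIL; the file name graph1.dot is an assumption.
dot_file = "graph1.dot"  # hypothetical file name
with open(dot_file, "w", encoding="utf-8") as f:
    f.write(dot)
# render the DOT file with Graphviz and load the resulting PNG
run_cmd("dot -G=300 -Tpng {0} -o{0}.png".format(dot_file), wait=True)
img = Image.open("graph1.dot.png")
img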
<PIL.PngImagePlugin.PngImageFile image mode=RGBA size=200x351 at 0x7FED7ACEB550>
Complex pipeline#
scikit-learn introduced a couple of transformers to manipulate features within a single pipeline. The following example is taken from Column Transformer with Mixed Types.
columns = [
"pclass",
"name",
"sex",
"age",
"sibsp",
"parch",
"ticket",
"fare",
"cabin",
"embarked",
"boat",
"body",
"home.dest",
]
numeric_features = ["age", "fare"]
numeric_transformer = Pipeline(
steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
)
categorical_features = ["embarked", "sex", "pclass"]
categorical_transformer = Pipeline(
steps=[
("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
("onehot", OneHotEncoder(handle_unknown="ignore")),
]
)
preprocessor = ColumnTransformer(
transformers=[
("num", numeric_transformer, numeric_features),
("cat", categorical_transformer, categorical_features),
]
)
clf = Pipeline(
steps=[
("preprocessor", preprocessor),
("classifier", LogisticRegression(solver="lbfgs")),
]
)
clf
Let’s first look at it as simplified text.
print(pipeline2str(clf))
Pipeline
ColumnTransformer
Pipeline(age,fare)
SimpleImputer
StandardScaler
Pipeline(embarked,sex,pclass)
SimpleImputer
OneHotEncoder
LogisticRegression
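The same rendering pattern can be applied to this larger pipeline to obtain the image below; the file name graph2.dot is an assumption.
dot = pipeline2dot(clf, columns)
dot_file = "graph2.dot"  # hypothetical file name
with open(dot_file, "w", encoding="utf-8") as f:
    f.write(dot)
run_cmd("dot -G=300 -Tpng {0} -o{0}.png".format(dot_file), wait=True)
Image.open("graph2.dot.png")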
<PIL.PngImagePlugin.PngImageFile image mode=RGBA size=591x787 at 0x7FED7ACE8A60>
Example with FeatureUnion#
model = Pipeline(
[
("poly", PolynomialFeatures()),
(
"union",
FeatureUnion([("scaler2", MinMaxScaler()), ("scaler3", StandardScaler())]),
),
]
)
dot = pipeline2dot(model, columns)
dot_file = "graph3.dot"
with open(dot_file, "w", encoding="utf-8") as f:
f.write(dot)
cmd = "dot -G=300 -Tpng {0} -o{0}.png".format(dot_file)
run_cmd(cmd, wait=True)
img = Image.open("graph3.dot.png")
img
<PIL.PngImagePlugin.PngImageFile image mode=RGB size=306x569 at 0x7FED7ACEAD10>
Compute intermediate outputs#
It is difficult to access intermediate outputs with scikit-learn but it may be interesting to do so. The method `alter_pipeline_for_debugging <find://alter_pipeline_for_debugging>`_ modifies the pipeline to intercept intermediate outputs.
model = Pipeline(
[
("scaler1", StandardScaler()),
(
"union",
FeatureUnion([("scaler2", StandardScaler()), ("scaler3", MinMaxScaler())]),
),
("lr", LinearRegression()),
]
)
X = randn(4, 5)
y = randn(4)
model.fit(X, y)
print(pipeline2str(model))
Pipeline
StandardScaler
FeatureUnion
StandardScaler
MinMaxScaler
LinearRegression
Let’s now modify the pipeline to get the intermediate outputs.
alter_pipeline_for_debugging(model)
The function adds a member _debug which stores inputs and outputs in every piece of the pipeline.
model.steps[0][1]._debug
BaseEstimatorDebugInformation(StandardScaler)
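The debug information is only filled once the pipeline actually runs; calling predict populates it (the array below is the output of that call).
model.predict(X)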
array([-0.94327916, 1.83009631, -0.28141013, 0.12719898])
The member was populated with inputs and outputs.
model.steps[0][1]._debug
BaseEstimatorDebugInformation(StandardScaler)
transform(
shape=(4, 5) type=<class 'numpy.ndarray'>
[[ 1.2762825 1.40392169 -0.44070616 -1.73411973 -0.20499451]
[-1.21217604 -0.33528123 -0.66585633 -0.24197224 0.11465946]
[-0.17971264 -0.80838722 1.22547855 1.9416219 1.58399645]
[ 2.22639797 -0.97041703 1.1955906 -0.10650412 -1.54463857]]
) -> (
shape=(4, 5) type=<class 'numpy.ndarray'>
[[ 0.5669488 1.67810961 -0.86875663 -1.29897394 -0.1728293 ]
[-1.31771283 -0.16738017 -1.12300381 -0.15806637 0.11453355]
[-0.53576524 -0.6693988 1.01275546 1.5115267 1.43543961]
[ 1.28652927 -0.84133064 0.97900498 -0.05448639 -1.37714387]]
)
Every piece behaves the same way.
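The dump below can be obtained by walking the pipeline with enumerate_pipeline_models, imported above; a minimal sketch, assuming it yields the coordinates of each piece together with the corresponding fitted model:
for coor, m, _vars in enumerate_pipeline_models(model):
    print(coor)
    print(m._debug)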
(0,)
BaseEstimatorDebugInformation(Pipeline)
predict(
shape=(4, 5) type=<class 'numpy.ndarray'>
[[ 1.2762825 1.40392169 -0.44070616 -1.73411973 -0.20499451]
[-1.21217604 -0.33528123 -0.66585633 -0.24197224 0.11465946]
[-0.17971264 -0.80838722 1.22547855 1.9416219 1.58399645]
[ 2.22639797 -0.97041703 1.1955906 -0.10650412 -1.54463857]]
) -> (
shape=(4,) type=<class 'numpy.ndarray'>
[-0.94327916 1.83009631 -0.28141013 0.12719898]
)
(0, 0)
BaseEstimatorDebugInformation(StandardScaler)
transform(
shape=(4, 5) type=<class 'numpy.ndarray'>
[[ 1.2762825 1.40392169 -0.44070616 -1.73411973 -0.20499451]
[-1.21217604 -0.33528123 -0.66585633 -0.24197224 0.11465946]
[-0.17971264 -0.80838722 1.22547855 1.9416219 1.58399645]
[ 2.22639797 -0.97041703 1.1955906 -0.10650412 -1.54463857]]
) -> (
shape=(4, 5) type=<class 'numpy.ndarray'>
[[ 0.5669488 1.67810961 -0.86875663 -1.29897394 -0.1728293 ]
[-1.31771283 -0.16738017 -1.12300381 -0.15806637 0.11453355]
[-0.53576524 -0.6693988 1.01275546 1.5115267 1.43543961]
[ 1.28652927 -0.84133064 0.97900498 -0.05448639 -1.37714387]]
)
(0, 1)
BaseEstimatorDebugInformation(FeatureUnion)
transform(
shape=(4, 5) type=<class 'numpy.ndarray'>
[[ 0.5669488 1.67810961 -0.86875663 -1.29897394 -0.1728293 ]
[-1.31771283 -0.16738017 -1.12300381 -0.15806637 0.11453355]
[-0.53576524 -0.6693988 1.01275546 1.5115267 1.43543961]
[ 1.28652927 -0.84133064 0.97900498 -0.05448639 -1.37714387]]
) -> (
shape=(4, 10) type=<class 'numpy.ndarray'>
[[ 0.5669488 1.67810961 -0.86875663 -1.29897394 -0.1728293 0.7236891
1. 0.119043 0. 0.42818803]
[-1.31771283 -0.16738017 -1.12300381 -0.15806637 0.11453355 0.
0.26750008 0. 0.40594461 0.53035845]
[-0.53576524 -0.6693988 1.01275546 1.5115267 1.43543961 0.30025918
...
)
(0, 1, 0)
BaseEstimatorDebugInformation(StandardScaler)
transform(
shape=(4, 5) type=<class 'numpy.ndarray'>
[[ 0.5669488 1.67810961 -0.86875663 -1.29897394 -0.1728293 ]
[-1.31771283 -0.16738017 -1.12300381 -0.15806637 0.11453355]
[-0.53576524 -0.6693988 1.01275546 1.5115267 1.43543961]
[ 1.28652927 -0.84133064 0.97900498 -0.05448639 -1.37714387]]
) -> (
shape=(4, 5) type=<class 'numpy.ndarray'>
[[ 0.5669488 1.67810961 -0.86875663 -1.29897394 -0.1728293 ]
[-1.31771283 -0.16738017 -1.12300381 -0.15806637 0.11453355]
[-0.53576524 -0.6693988 1.01275546 1.5115267 1.43543961]
[ 1.28652927 -0.84133064 0.97900498 -0.05448639 -1.37714387]]
)
(0, 1, 1)
BaseEstimatorDebugInformation(MinMaxScaler)
transform(
shape=(4, 5) type=<class 'numpy.ndarray'>
[[ 0.5669488 1.67810961 -0.86875663 -1.29897394 -0.1728293 ]
[-1.31771283 -0.16738017 -1.12300381 -0.15806637 0.11453355]
[-0.53576524 -0.6693988 1.01275546 1.5115267 1.43543961]
[ 1.28652927 -0.84133064 0.97900498 -0.05448639 -1.37714387]]
) -> (
shape=(4, 5) type=<class 'numpy.ndarray'>
[[0.7236891 1. 0.119043 0. 0.42818803]
[0. 0.26750008 0. 0.40594461 0.53035845]
[0.30025918 0.06824208 1. 1. 1. ]
[1. 0. 0.98419743 0.44279924 0. ]]
)
(0, 2)
BaseEstimatorDebugInformation(LinearRegression)
predict(
shape=(4, 10) type=<class 'numpy.ndarray'>
[[ 0.5669488 1.67810961 -0.86875663 -1.29897394 -0.1728293 0.7236891
1. 0.119043 0. 0.42818803]
[-1.31771283 -0.16738017 -1.12300381 -0.15806637 0.11453355 0.
0.26750008 0. 0.40594461 0.53035845]
[-0.53576524 -0.6693988 1.01275546 1.5115267 1.43543961 0.30025918
...
) -> (
shape=(4,) type=<class 'numpy.ndarray'>
[-0.94327916 1.83009631 -0.28141013 0.12719898]
)
Total running time of the script: (0 minutes 0.203 seconds)