Note
Go to the end to download the full example code.
Visualize a scikit-learn pipeline¶
Pipelines can be big with scikit-learn; let's dig into a visual way to look at them.
Simple model¶
Let's visualize a simple pipeline: a single model, not even trained.
from numpy.random import randn
import pandas
from PIL import Image
from sphinx_runpython.runpython import run_cmd
from sklearn import datasets
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import (
OneHotEncoder,
StandardScaler,
MinMaxScaler,
PolynomialFeatures,
)
from mlinsights.helpers.pipeline import (
alter_pipeline_for_debugging,
enumerate_pipeline_models,
)
from mlinsights.plotting import pipeline2dot, pipeline2str
# Load the Iris dataset, keep its four features in a dataframe with
# explicit column names, and build an untrained logistic regression.
iris = datasets.load_iris()
X = iris.data[:, :4]
df = pandas.DataFrame(X, columns=["X1", "X2", "X3", "X4"])
clf = LogisticRegression()
clf
The trick consists in converting the pipeline in a graph through the DOT language.
digraph{
orientation=portrait;
nodesep=0.05;
ranksep=0.25;
sch0[label="<f0> X1|<f1> X2|<f2> X3|<f3> X4",shape=record,fontsize=8];
node1[label="union",shape=box,style="filled,rounded",color=cyan,fontsize=12];
sch0:f0 -> node1;
sch0:f1 -> node1;
sch0:f2 -> node1;
sch0:f3 -> node1;
sch1[label="<f0> -v-0",shape=record,fontsize=8];
node1 -> sch1:f0;
node2[label="LogisticRegression",shape=box,style="filled,rounded",color=yellow,fontsize=12];
sch1:f0 -> node2;
sch2[label="<f0> PredictedLabel|<f1> Probabilities",shape=record,fontsize=8];
node2 -> sch2:f0;
node2 -> sch2:f1;
}
It is a lot better with an image.
<PIL.PngImagePlugin.PngImageFile image mode=RGBA size=200x351 at 0x7F84B203D090>
Complex pipeline¶
scikit-learn introduced a couple of transforms to play with features in a single pipeline. The following example is taken from Column Transformer with Mixed Types.
# Column names of the Titanic dataset used throughout this example.
columns = [
    "pclass", "name", "sex", "age", "sibsp", "parch",
    "ticket", "fare", "cabin", "embarked", "boat", "body", "home.dest",
]

# Numerical columns: fill missing values with the median, then standardize.
numeric_features = ["age", "fare"]
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical columns: replace missing values by a constant, then one-hot
# encode; unknown categories at prediction time are ignored.
categorical_features = ["embarked", "sex", "pclass"]
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each group of columns to its dedicated preprocessing pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Full model: preprocessing followed by a logistic regression classifier.
clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", LogisticRegression(solver="lbfgs")),
    ]
)
clf
Let’s see it first as a simplified text.
print(pipeline2str(clf))
Pipeline
ColumnTransformer
Pipeline(age,fare)
SimpleImputer
StandardScaler
Pipeline(embarked,sex,pclass)
SimpleImputer
OneHotEncoder
LogisticRegression
<PIL.PngImagePlugin.PngImageFile image mode=RGBA size=591x787 at 0x7F84B203F100>
Example with FeatureUnion¶
# Pipeline with a FeatureUnion: polynomial feature expansion, then two
# scalers applied in parallel whose outputs are concatenated.
model = Pipeline(
    [
        ("poly", PolynomialFeatures()),
        (
            "union",
            FeatureUnion([("scaler2", MinMaxScaler()), ("scaler3", StandardScaler())]),
        ),
    ]
)
# Convert the pipeline into a DOT graph and render it with graphviz.
dot = pipeline2dot(model, columns)
dot_file = "graph3.dot"
with open(dot_file, "w", encoding="utf-8") as f:
    f.write(dot)
# Graphviz graph attributes must be passed as -Gname=value; the previous
# "-G=300" had no attribute name and was silently invalid. Use -Gdpi=300
# to request a 300-dpi rendering.
cmd = "dot -Gdpi=300 -Tpng {0} -o{0}.png".format(dot_file)
run_cmd(cmd, wait=True)
img = Image.open("graph3.dot.png")
img
<PIL.PngImagePlugin.PngImageFile image mode=RGB size=306x569 at 0x7F84B203E860>
Compute intermediate outputs¶
# It is difficult to access intermediate outputs with *scikit-learn* but
# it may be interesting to do so. The method
# `alter_pipeline_for_debugging <find://alter_pipeline_for_debugging>`_
# modifies the pipeline to intercept intermediate outputs.
model = Pipeline(
    [
        ("scaler1", StandardScaler()),
        ("union", FeatureUnion(
            [("scaler2", StandardScaler()), ("scaler3", MinMaxScaler())]
        )),
        ("lr", LinearRegression()),
    ]
)

# Fit the pipeline on a small random regression problem.
X, y = randn(4, 5), randn(4)
model.fit(X, y)

# Show the fitted pipeline structure as text.
print(pipeline2str(model))
Pipeline
StandardScaler
FeatureUnion
StandardScaler
MinMaxScaler
LinearRegression
Let’s now modify the pipeline to get the intermediate outputs.
alter_pipeline_for_debugging(model)
The function adds a member _debug
which stores inputs and outputs in
every piece of the pipeline.
model.steps[0][1]._debug
BaseEstimatorDebugInformation(StandardScaler)
array([ 0.57886408, 0.02901042, -1.80801004, -0.13899759])
The member was populated with inputs and outputs.
model.steps[0][1]._debug
BaseEstimatorDebugInformation(StandardScaler)
transform(
shape=(4, 5) type=<class 'numpy.ndarray'>
[[-0.11278923 0.49633964 0.89431531 -0.58705583 -0.89982345]
[ 0.30571764 0.50186418 -0.57305771 0.77656605 0.60803094]
[-0.70304868 0.32967023 -0.34664322 -0.13516745 -0.76585543]
[-1.3830715 -2.00956839 -0.79207642 -0.25670281 0.36958478]]
) -> (
shape=(4, 5) type=<class 'numpy.ndarray'>
[[ 0.56695655 0.62660822 1.68101311 -1.06151875 -1.08975908]
[ 1.22512431 0.63180005 -0.56410962 1.63671501 1.16797756]
[-0.3613191 0.46997627 -0.21768897 -0.16735556 -0.88916644]
[-1.43076176 -1.72838454 -0.89921453 -0.40784071 0.81094796]]
)
Every piece behaves the same way.
(0,)
BaseEstimatorDebugInformation(Pipeline)
predict(
shape=(4, 5) type=<class 'numpy.ndarray'>
[[-0.11278923 0.49633964 0.89431531 -0.58705583 -0.89982345]
[ 0.30571764 0.50186418 -0.57305771 0.77656605 0.60803094]
[-0.70304868 0.32967023 -0.34664322 -0.13516745 -0.76585543]
[-1.3830715 -2.00956839 -0.79207642 -0.25670281 0.36958478]]
) -> (
shape=(4,) type=<class 'numpy.ndarray'>
[ 0.57886408 0.02901042 -1.80801004 -0.13899759]
)
(0, 0)
BaseEstimatorDebugInformation(StandardScaler)
transform(
shape=(4, 5) type=<class 'numpy.ndarray'>
[[-0.11278923 0.49633964 0.89431531 -0.58705583 -0.89982345]
[ 0.30571764 0.50186418 -0.57305771 0.77656605 0.60803094]
[-0.70304868 0.32967023 -0.34664322 -0.13516745 -0.76585543]
[-1.3830715 -2.00956839 -0.79207642 -0.25670281 0.36958478]]
) -> (
shape=(4, 5) type=<class 'numpy.ndarray'>
[[ 0.56695655 0.62660822 1.68101311 -1.06151875 -1.08975908]
[ 1.22512431 0.63180005 -0.56410962 1.63671501 1.16797756]
[-0.3613191 0.46997627 -0.21768897 -0.16735556 -0.88916644]
[-1.43076176 -1.72838454 -0.89921453 -0.40784071 0.81094796]]
)
(0, 1)
BaseEstimatorDebugInformation(FeatureUnion)
transform(
shape=(4, 5) type=<class 'numpy.ndarray'>
[[ 0.56695655 0.62660822 1.68101311 -1.06151875 -1.08975908]
[ 1.22512431 0.63180005 -0.56410962 1.63671501 1.16797756]
[-0.3613191 0.46997627 -0.21768897 -0.16735556 -0.88916644]
[-1.43076176 -1.72838454 -0.89921453 -0.40784071 0.81094796]]
) -> (
shape=(4, 10) type=<class 'numpy.ndarray'>
[[ 0.56695655 0.62660822 1.68101311 -1.06151875 -1.08975908 0.75218524
0.99780024 1. 0. 0. ]
[ 1.22512431 0.63180005 -0.56410962 1.63671501 1.16797756 1.
1. 0.12987416 1. 1. ]
[-0.3613191 0.46997627 -0.21768897 -0.16735556 -0.88916644 0.40266888
...
)
(0, 1, 0)
BaseEstimatorDebugInformation(StandardScaler)
transform(
shape=(4, 5) type=<class 'numpy.ndarray'>
[[ 0.56695655 0.62660822 1.68101311 -1.06151875 -1.08975908]
[ 1.22512431 0.63180005 -0.56410962 1.63671501 1.16797756]
[-0.3613191 0.46997627 -0.21768897 -0.16735556 -0.88916644]
[-1.43076176 -1.72838454 -0.89921453 -0.40784071 0.81094796]]
) -> (
shape=(4, 5) type=<class 'numpy.ndarray'>
[[ 0.56695655 0.62660822 1.68101311 -1.06151875 -1.08975908]
[ 1.22512431 0.63180005 -0.56410962 1.63671501 1.16797756]
[-0.3613191 0.46997627 -0.21768897 -0.16735556 -0.88916644]
[-1.43076176 -1.72838454 -0.89921453 -0.40784071 0.81094796]]
)
(0, 1, 1)
BaseEstimatorDebugInformation(MinMaxScaler)
transform(
shape=(4, 5) type=<class 'numpy.ndarray'>
[[ 0.56695655 0.62660822 1.68101311 -1.06151875 -1.08975908]
[ 1.22512431 0.63180005 -0.56410962 1.63671501 1.16797756]
[-0.3613191 0.46997627 -0.21768897 -0.16735556 -0.88916644]
[-1.43076176 -1.72838454 -0.89921453 -0.40784071 0.81094796]]
) -> (
shape=(4, 5) type=<class 'numpy.ndarray'>
[[0.75218524 0.99780024 1. 0. 0. ]
[1. 1. 0.12987416 1. 1. ]
[0.40266888 0.93143596 0.26413389 0.33138833 0.08884678]
[0. 0. 0. 0.24226146 0.84186393]]
)
(0, 2)
BaseEstimatorDebugInformation(LinearRegression)
predict(
shape=(4, 10) type=<class 'numpy.ndarray'>
[[ 0.56695655 0.62660822 1.68101311 -1.06151875 -1.08975908 0.75218524
0.99780024 1. 0. 0. ]
[ 1.22512431 0.63180005 -0.56410962 1.63671501 1.16797756 1.
1. 0.12987416 1. 1. ]
[-0.3613191 0.46997627 -0.21768897 -0.16735556 -0.88916644 0.40266888
...
) -> (
shape=(4,) type=<class 'numpy.ndarray'>
[ 0.57886408 0.02901042 -1.80801004 -0.13899759]
)
Total running time of the script: (0 minutes 0.170 seconds)