Visualize a scikit-learn pipeline#

Pipelines can be big with scikit-learn; let’s dig into a visual way to look at them.

Simple model#

Let’s visualize a simple pipeline: a single model, not even trained.

from numpy.random import randn
import pandas
from PIL import Image
from sphinx_runpython.runpython import run_cmd
from sklearn import datasets
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import (
    OneHotEncoder,
    StandardScaler,
    MinMaxScaler,
    PolynomialFeatures,
)
from mlinsights.helpers.pipeline import (
    alter_pipeline_for_debugging,
    enumerate_pipeline_models,
)
from mlinsights.plotting import pipeline2dot, pipeline2str


iris = datasets.load_iris()
X = iris.data[:, :4]
df = pandas.DataFrame(X)
df.columns = ["X1", "X2", "X3", "X4"]
clf = LogisticRegression()
clf
LogisticRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.


The trick consists in converting the pipeline into a graph described in the DOT language.

dot = pipeline2dot(clf, df)
print(dot)
digraph{
  orientation=portrait;
  nodesep=0.05;
  ranksep=0.25;
  sch0[label="<f0> X1|<f1> X2|<f2> X3|<f3> X4",shape=record,fontsize=8];

  node1[label="union",shape=box,style="filled,rounded",color=cyan,fontsize=12];
  sch0:f0 -> node1;
  sch0:f1 -> node1;
  sch0:f2 -> node1;
  sch0:f3 -> node1;
  sch1[label="<f0> -v-0",shape=record,fontsize=8];
  node1 -> sch1:f0;

  node2[label="LogisticRegression",shape=box,style="filled,rounded",color=yellow,fontsize=12];
  sch1:f0 -> node2;
  sch2[label="<f0> PredictedLabel|<f1> Probabilities",shape=record,fontsize=8];
  node2 -> sch2:f0;
  node2 -> sch2:f1;
}

It is a lot better with an image.

dot_file = "graph.dot"
with open(dot_file, "w", encoding="utf-8") as f:
    f.write(dot)
cmd = "dot -G=300 -Tpng {0} -o{0}.png".format(dot_file)
run_cmd(cmd, wait=True)


img = Image.open("graph.dot.png")
img
<PIL.PngImagePlugin.PngImageFile image mode=RGBA size=200x351 at 0x7FED7ACEB550>

Complex pipeline#

scikit-learn introduced a couple of transformers to play with features in a single pipeline. The following example is taken from Column Transformer with Mixed Types.

columns = [
    "pclass",
    "name",
    "sex",
    "age",
    "sibsp",
    "parch",
    "ticket",
    "fare",
    "cabin",
    "embarked",
    "boat",
    "body",
    "home.dest",
]

numeric_features = ["age", "fare"]
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
)

categorical_features = ["embarked", "sex", "pclass"]
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", LogisticRegression(solver="lbfgs")),
    ]
)
clf
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['age', 'fare']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value='missing',
                                                                                 strategy='constant')),
                                                                  ('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['embarked', 'sex',
                                                   'pclass'])])),
                ('classifier', LogisticRegression())])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.


Let’s first see it as simplified text.

print(pipeline2str(clf))
Pipeline
   ColumnTransformer
      Pipeline(age,fare)
         SimpleImputer
         StandardScaler
      Pipeline(embarked,sex,pclass)
         SimpleImputer
         OneHotEncoder
   LogisticRegression
dot = pipeline2dot(clf, columns)

dot_file = "graph2.dot"
with open(dot_file, "w", encoding="utf-8") as f:
    f.write(dot)

cmd = "dot -G=300 -Tpng {0} -o{0}.png".format(dot_file)
run_cmd(cmd, wait=True)

img = Image.open("graph2.dot.png")
img
<PIL.PngImagePlugin.PngImageFile image mode=RGBA size=591x787 at 0x7FED7ACE8A60>

Example with FeatureUnion#

model = Pipeline(
    [
        ("poly", PolynomialFeatures()),
        (
            "union",
            FeatureUnion([("scaler2", MinMaxScaler()), ("scaler3", StandardScaler())]),
        ),
    ]
)
dot = pipeline2dot(model, columns)

dot_file = "graph3.dot"
with open(dot_file, "w", encoding="utf-8") as f:
    f.write(dot)

cmd = "dot -G=300 -Tpng {0} -o{0}.png".format(dot_file)
run_cmd(cmd, wait=True)

img = Image.open("graph3.dot.png")
img
<PIL.PngImagePlugin.PngImageFile image mode=RGB size=306x569 at 0x7FED7ACEAD10>

Compute intermediate outputs#

# It is difficult to access intermediate outputs with *scikit-learn* but
# it may be interesting to do so. The function
# ``alter_pipeline_for_debugging`` (from ``mlinsights.helpers.pipeline``)
# modifies the pipeline to intercept intermediate outputs.


model = Pipeline(
    [
        ("scaler1", StandardScaler()),
        (
            "union",
            FeatureUnion([("scaler2", StandardScaler()), ("scaler3", MinMaxScaler())]),
        ),
        ("lr", LinearRegression()),
    ]
)

X = randn(4, 5)
y = randn(4)
model.fit(X, y)
Pipeline(steps=[('scaler1', StandardScaler()),
                ('union',
                 FeatureUnion(transformer_list=[('scaler2', StandardScaler()),
                                                ('scaler3', MinMaxScaler())])),
                ('lr', LinearRegression())])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.


print(pipeline2str(model))
Pipeline
   StandardScaler
   FeatureUnion
      StandardScaler
      MinMaxScaler
   LinearRegression

Let’s now modify the pipeline to get the intermediate outputs.

alter_pipeline_for_debugging(model)

The function adds a member _debug which stores inputs and outputs in every piece of the pipeline.

model.steps[0][1]._debug
BaseEstimatorDebugInformation(StandardScaler)
array([-0.94327916,  1.83009631, -0.28141013,  0.12719898])

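Once the pipeline has been run on some data (the array printed above is the result of a prediction such as model.predict(X)), the member is populated with inputs and outputs.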

model.steps[0][1]._debug
BaseEstimatorDebugInformation(StandardScaler)
  transform(
   shape=(4, 5) type=<class 'numpy.ndarray'>
   [[ 1.2762825   1.40392169 -0.44070616 -1.73411973 -0.20499451]
    [-1.21217604 -0.33528123 -0.66585633 -0.24197224  0.11465946]
    [-0.17971264 -0.80838722  1.22547855  1.9416219   1.58399645]
    [ 2.22639797 -0.97041703  1.1955906  -0.10650412 -1.54463857]]
  ) -> (
   shape=(4, 5) type=<class 'numpy.ndarray'>
   [[ 0.5669488   1.67810961 -0.86875663 -1.29897394 -0.1728293 ]
    [-1.31771283 -0.16738017 -1.12300381 -0.15806637  0.11453355]
    [-0.53576524 -0.6693988   1.01275546  1.5115267   1.43543961]
    [ 1.28652927 -0.84133064  0.97900498 -0.05448639 -1.37714387]]
  )

Every piece behaves the same way.

# Walk every estimator nested in the pipeline and show what it recorded.
# ``coor`` is the position of the estimator in the pipeline tree, printed
# as a tuple such as (0,), (0, 1) or (0, 1, 0).
# The loop names are deliberately distinct from ``model`` and ``vars``:
# reusing ``model`` would rebind the fitted pipeline to its last
# sub-estimator, and ``vars`` would shadow the builtin of the same name.
# The third item of each tuple is not needed here.
for coor, sub_model, _ in enumerate_pipeline_models(model):
    print(coor)
    print(sub_model._debug)
(0,)
BaseEstimatorDebugInformation(Pipeline)
  predict(
   shape=(4, 5) type=<class 'numpy.ndarray'>
   [[ 1.2762825   1.40392169 -0.44070616 -1.73411973 -0.20499451]
    [-1.21217604 -0.33528123 -0.66585633 -0.24197224  0.11465946]
    [-0.17971264 -0.80838722  1.22547855  1.9416219   1.58399645]
    [ 2.22639797 -0.97041703  1.1955906  -0.10650412 -1.54463857]]
  ) -> (
   shape=(4,) type=<class 'numpy.ndarray'>
   [-0.94327916  1.83009631 -0.28141013  0.12719898]
  )
(0, 0)
BaseEstimatorDebugInformation(StandardScaler)
  transform(
   shape=(4, 5) type=<class 'numpy.ndarray'>
   [[ 1.2762825   1.40392169 -0.44070616 -1.73411973 -0.20499451]
    [-1.21217604 -0.33528123 -0.66585633 -0.24197224  0.11465946]
    [-0.17971264 -0.80838722  1.22547855  1.9416219   1.58399645]
    [ 2.22639797 -0.97041703  1.1955906  -0.10650412 -1.54463857]]
  ) -> (
   shape=(4, 5) type=<class 'numpy.ndarray'>
   [[ 0.5669488   1.67810961 -0.86875663 -1.29897394 -0.1728293 ]
    [-1.31771283 -0.16738017 -1.12300381 -0.15806637  0.11453355]
    [-0.53576524 -0.6693988   1.01275546  1.5115267   1.43543961]
    [ 1.28652927 -0.84133064  0.97900498 -0.05448639 -1.37714387]]
  )
(0, 1)
BaseEstimatorDebugInformation(FeatureUnion)
  transform(
   shape=(4, 5) type=<class 'numpy.ndarray'>
   [[ 0.5669488   1.67810961 -0.86875663 -1.29897394 -0.1728293 ]
    [-1.31771283 -0.16738017 -1.12300381 -0.15806637  0.11453355]
    [-0.53576524 -0.6693988   1.01275546  1.5115267   1.43543961]
    [ 1.28652927 -0.84133064  0.97900498 -0.05448639 -1.37714387]]
  ) -> (
   shape=(4, 10) type=<class 'numpy.ndarray'>
   [[ 0.5669488   1.67810961 -0.86875663 -1.29897394 -0.1728293   0.7236891
      1.          0.119043    0.          0.42818803]
    [-1.31771283 -0.16738017 -1.12300381 -0.15806637  0.11453355  0.
      0.26750008  0.          0.40594461  0.53035845]
    [-0.53576524 -0.6693988   1.01275546  1.5115267   1.43543961  0.30025918
   ...
  )
(0, 1, 0)
BaseEstimatorDebugInformation(StandardScaler)
  transform(
   shape=(4, 5) type=<class 'numpy.ndarray'>
   [[ 0.5669488   1.67810961 -0.86875663 -1.29897394 -0.1728293 ]
    [-1.31771283 -0.16738017 -1.12300381 -0.15806637  0.11453355]
    [-0.53576524 -0.6693988   1.01275546  1.5115267   1.43543961]
    [ 1.28652927 -0.84133064  0.97900498 -0.05448639 -1.37714387]]
  ) -> (
   shape=(4, 5) type=<class 'numpy.ndarray'>
   [[ 0.5669488   1.67810961 -0.86875663 -1.29897394 -0.1728293 ]
    [-1.31771283 -0.16738017 -1.12300381 -0.15806637  0.11453355]
    [-0.53576524 -0.6693988   1.01275546  1.5115267   1.43543961]
    [ 1.28652927 -0.84133064  0.97900498 -0.05448639 -1.37714387]]
  )
(0, 1, 1)
BaseEstimatorDebugInformation(MinMaxScaler)
  transform(
   shape=(4, 5) type=<class 'numpy.ndarray'>
   [[ 0.5669488   1.67810961 -0.86875663 -1.29897394 -0.1728293 ]
    [-1.31771283 -0.16738017 -1.12300381 -0.15806637  0.11453355]
    [-0.53576524 -0.6693988   1.01275546  1.5115267   1.43543961]
    [ 1.28652927 -0.84133064  0.97900498 -0.05448639 -1.37714387]]
  ) -> (
   shape=(4, 5) type=<class 'numpy.ndarray'>
   [[0.7236891  1.         0.119043   0.         0.42818803]
    [0.         0.26750008 0.         0.40594461 0.53035845]
    [0.30025918 0.06824208 1.         1.         1.        ]
    [1.         0.         0.98419743 0.44279924 0.        ]]
  )
(0, 2)
BaseEstimatorDebugInformation(LinearRegression)
  predict(
   shape=(4, 10) type=<class 'numpy.ndarray'>
   [[ 0.5669488   1.67810961 -0.86875663 -1.29897394 -0.1728293   0.7236891
      1.          0.119043    0.          0.42818803]
    [-1.31771283 -0.16738017 -1.12300381 -0.15806637  0.11453355  0.
      0.26750008  0.          0.40594461  0.53035845]
    [-0.53576524 -0.6693988   1.01275546  1.5115267   1.43543961  0.30025918
   ...
  ) -> (
   shape=(4,) type=<class 'numpy.ndarray'>
   [-0.94327916  1.83009631 -0.28141013  0.12719898]
  )

Total running time of the script: (0 minutes 0.203 seconds)

Gallery generated by Sphinx-Gallery