Visualize a scikit-learn pipeline¶
Pipelines can get big with scikit-learn; let’s dig into a visual way to look at them.
Simple model¶
Let’s visualize a simple pipeline, a single model which is not even trained.
from numpy.random import randn
import pandas
from PIL import Image
from sphinx_runpython.runpython import run_cmd
from sklearn import datasets
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import (
OneHotEncoder,
StandardScaler,
MinMaxScaler,
PolynomialFeatures,
)
from mlinsights.helpers.pipeline import (
alter_pipeline_for_debugging,
enumerate_pipeline_models,
)
from mlinsights.plotting import pipeline2dot, pipeline2str
iris = datasets.load_iris()
X = iris.data[:, :4]
df = pandas.DataFrame(X)
df.columns = ["X1", "X2", "X3", "X4"]
clf = LogisticRegression()
clf
The trick consists in converting the pipeline into a graph expressed in the DOT language.
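The call that produced the listing below is not shown on this page; most likely it is pipeline2dot applied to the model and the dataframe, as in the later examples. A minimal sketch, assuming that is the case:

# Hedged guess at the missing call: convert the estimator to DOT text.
dot = pipeline2dot(clf, df)
print(dot)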
digraph{
orientation=portrait;
nodesep=0.05;
ranksep=0.25;
sch0[label="<f0> X1|<f1> X2|<f2> X3|<f3> X4",shape=record,fontsize=8];
node1[label="union",shape=box,style="filled,rounded",color=cyan,fontsize=12];
sch0:f0 -> node1;
sch0:f1 -> node1;
sch0:f2 -> node1;
sch0:f3 -> node1;
sch1[label="<f0> -v-0",shape=record,fontsize=8];
node1 -> sch1:f0;
node2[label="LogisticRegression",shape=box,style="filled,rounded",color=yellow,fontsize=12];
sch1:f0 -> node2;
sch2[label="<f0> PredictedLabel|<f1> Probabilities",shape=record,fontsize=8];
node2 -> sch2:f0;
node2 -> sch2:f1;
}
It is a lot better with an image.
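The rendering step is also missing here; it presumably mirrors the FeatureUnion example further below: write the DOT text to a file, call Graphviz through run_cmd, and open the resulting PNG with PIL. A sketch under that assumption (the file name graph.dot is hypothetical):

dot_file = "graph.dot"  # hypothetical file name
with open(dot_file, "w", encoding="utf-8") as f:
    f.write(dot)
cmd = "dot -G=300 -Tpng {0} -o{0}.png".format(dot_file)  # requires Graphviz
run_cmd(cmd, wait=True)
img = Image.open("graph.dot.png")
img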
<PIL.PngImagePlugin.PngImageFile image mode=RGBA size=200x351 at 0x79FB8293CAD0>
Complex pipeline¶
scikit-learn introduced a couple of transforms to play with features in a single pipeline. The following example is taken from Column Transformer with Mixed Types.
columns = [
"pclass",
"name",
"sex",
"age",
"sibsp",
"parch",
"ticket",
"fare",
"cabin",
"embarked",
"boat",
"body",
"home.dest",
]
numeric_features = ["age", "fare"]
numeric_transformer = Pipeline(
steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
)
categorical_features = ["embarked", "sex", "pclass"]
categorical_transformer = Pipeline(
steps=[
("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
("onehot", OneHotEncoder(handle_unknown="ignore")),
]
)
preprocessor = ColumnTransformer(
transformers=[
("num", numeric_transformer, numeric_features),
("cat", categorical_transformer, categorical_features),
]
)
clf = Pipeline(
steps=[
("preprocessor", preprocessor),
("classifier", LogisticRegression(solver="lbfgs")),
]
)
clf
Let’s first look at it as simplified text.
print(pipeline2str(clf))
Pipeline
ColumnTransformer
Pipeline(age,fare)
SimpleImputer
StandardScaler
Pipeline(embarked,sex,pclass)
SimpleImputer
OneHotEncoder
LogisticRegression
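The image below was rendered with the same recipe: convert the full pipeline to DOT, run Graphviz, open the PNG. A sketch, assuming the hypothetical file name graph2.dot:

dot = pipeline2dot(clf, columns)
dot_file = "graph2.dot"  # hypothetical file name
with open(dot_file, "w", encoding="utf-8") as f:
    f.write(dot)
cmd = "dot -G=300 -Tpng {0} -o{0}.png".format(dot_file)  # requires Graphviz
run_cmd(cmd, wait=True)
img = Image.open("graph2.dot.png")
img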
<PIL.PngImagePlugin.PngImageFile image mode=RGBA size=591x787 at 0x79FB837969C0>
Example with FeatureUnion¶
model = Pipeline(
[
("poly", PolynomialFeatures()),
(
"union",
FeatureUnion([("scaler2", MinMaxScaler()), ("scaler3", StandardScaler())]),
),
]
)
dot = pipeline2dot(model, columns)
dot_file = "graph3.dot"
with open(dot_file, "w", encoding="utf-8") as f:
f.write(dot)
cmd = "dot -G=300 -Tpng {0} -o{0}.png".format(dot_file)
run_cmd(cmd, wait=True)
img = Image.open("graph3.dot.png")
img
<PIL.PngImagePlugin.PngImageFile image mode=RGB size=306x569 at 0x79FB83FFE9F0>
Compute intermediate outputs¶
It is difficult to access intermediate outputs with scikit-learn but it may be interesting to do so. The function alter_pipeline_for_debugging modifies the pipeline to intercept intermediate outputs.
model = Pipeline(
[
("scaler1", StandardScaler()),
(
"union",
FeatureUnion([("scaler2", StandardScaler()), ("scaler3", MinMaxScaler())]),
),
("lr", LinearRegression()),
]
)
X = randn(4, 5)
y = randn(4)
model.fit(X, y)
print(pipeline2str(model))
Pipeline
StandardScaler
FeatureUnion
StandardScaler
MinMaxScaler
LinearRegression
Let’s now modify the pipeline to get the intermediate outputs.
alter_pipeline_for_debugging(model)
The function adds a member _debug which stores inputs and outputs in
every piece of the pipeline.
model.steps[0][1]._debug
BaseEstimatorDebugInformation(StandardScaler)
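The array just below is most likely the result of calling predict on the pipeline, which is also what fills the debug information; a hedged reconstruction of the missing call:

model.predict(X)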
array([-1.26141698, 0.00345816, -0.13232125, 0.454529 ])
The member was populated with inputs and outputs.
model.steps[0][1]._debug
BaseEstimatorDebugInformation(StandardScaler)
transform(
shape=(4, 5) type=<class 'numpy.ndarray'>
[[ 0.28018206 -1.00718116 0.84747923 0.64667883 0.36314504]
[ 0.93337 -1.72129241 1.18326982 -0.57110584 -1.78215473]
[ 1.66587367 0.12213741 -0.86504416 0.07605191 -0.1783714 ]
[ 0.02121704 -0.2879426 -0.41906187 -1.24244392 -0.92573229]]
) -> (
shape=(4, 5) type=<class 'numpy.ndarray'>
[[-0.69882086 -0.40301849 0.77570501 1.30139781 1.23157427]
[ 0.32698426 -1.41778717 1.16987444 -0.42239009 -1.42667488]
[ 1.47735151 1.20176944 -1.23454898 0.49366898 0.56057917]
[-1.10551491 0.61903621 -0.71103048 -1.3726767 -0.36547856]]
)
Every piece behaves the same way.
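The dump that follows pairs each submodel's coordinates with its _debug member. It was presumably produced by walking the pipeline with enumerate_pipeline_models, which is imported above; a sketch under that assumption (the exact loop is a guess):

# Hedged guess: iterate over every submodel, print its coordinates and its debug info.
for coor, m, _ in enumerate_pipeline_models(model):
    print(coor)
    print(m._debug)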
(0,)
BaseEstimatorDebugInformation(Pipeline)
predict(
shape=(4, 5) type=<class 'numpy.ndarray'>
[[ 0.28018206 -1.00718116 0.84747923 0.64667883 0.36314504]
[ 0.93337 -1.72129241 1.18326982 -0.57110584 -1.78215473]
[ 1.66587367 0.12213741 -0.86504416 0.07605191 -0.1783714 ]
[ 0.02121704 -0.2879426 -0.41906187 -1.24244392 -0.92573229]]
) -> (
shape=(4,) type=<class 'numpy.ndarray'>
[-1.26141698 0.00345816 -0.13232125 0.454529 ]
)
(0, 0)
BaseEstimatorDebugInformation(StandardScaler)
transform(
shape=(4, 5) type=<class 'numpy.ndarray'>
[[ 0.28018206 -1.00718116 0.84747923 0.64667883 0.36314504]
[ 0.93337 -1.72129241 1.18326982 -0.57110584 -1.78215473]
[ 1.66587367 0.12213741 -0.86504416 0.07605191 -0.1783714 ]
[ 0.02121704 -0.2879426 -0.41906187 -1.24244392 -0.92573229]]
) -> (
shape=(4, 5) type=<class 'numpy.ndarray'>
[[-0.69882086 -0.40301849 0.77570501 1.30139781 1.23157427]
[ 0.32698426 -1.41778717 1.16987444 -0.42239009 -1.42667488]
[ 1.47735151 1.20176944 -1.23454898 0.49366898 0.56057917]
[-1.10551491 0.61903621 -0.71103048 -1.3726767 -0.36547856]]
)
(0, 1)
BaseEstimatorDebugInformation(FeatureUnion)
transform(
shape=(4, 5) type=<class 'numpy.ndarray'>
[[-0.69882086 -0.40301849 0.77570501 1.30139781 1.23157427]
[ 0.32698426 -1.41778717 1.16987444 -0.42239009 -1.42667488]
[ 1.47735151 1.20176944 -1.23454898 0.49366898 0.56057917]
[-1.10551491 0.61903621 -0.71103048 -1.3726767 -0.36547856]]
) -> (
shape=(4, 10) type=<class 'numpy.ndarray'>
[[-0.69882086 -0.40301849 0.77570501 1.30139781 1.23157427 0.15745841
0.38738185 0.83606488 1. 1. ]
[ 0.32698426 -1.41778717 1.16987444 -0.42239009 -1.42667488 0.55461605
0. 1. 0.35537028 0. ]
[ 1.47735151 1.20176944 -1.23454898 0.49366898 0.56057917 1.
...
)
(0, 1, 0)
BaseEstimatorDebugInformation(StandardScaler)
transform(
shape=(4, 5) type=<class 'numpy.ndarray'>
[[-0.69882086 -0.40301849 0.77570501 1.30139781 1.23157427]
[ 0.32698426 -1.41778717 1.16987444 -0.42239009 -1.42667488]
[ 1.47735151 1.20176944 -1.23454898 0.49366898 0.56057917]
[-1.10551491 0.61903621 -0.71103048 -1.3726767 -0.36547856]]
) -> (
shape=(4, 5) type=<class 'numpy.ndarray'>
[[-0.69882086 -0.40301849 0.77570501 1.30139781 1.23157427]
[ 0.32698426 -1.41778717 1.16987444 -0.42239009 -1.42667488]
[ 1.47735151 1.20176944 -1.23454898 0.49366898 0.56057917]
[-1.10551491 0.61903621 -0.71103048 -1.3726767 -0.36547856]]
)
(0, 1, 1)
BaseEstimatorDebugInformation(MinMaxScaler)
transform(
shape=(4, 5) type=<class 'numpy.ndarray'>
[[-0.69882086 -0.40301849 0.77570501 1.30139781 1.23157427]
[ 0.32698426 -1.41778717 1.16987444 -0.42239009 -1.42667488]
[ 1.47735151 1.20176944 -1.23454898 0.49366898 0.56057917]
[-1.10551491 0.61903621 -0.71103048 -1.3726767 -0.36547856]]
) -> (
shape=(4, 5) type=<class 'numpy.ndarray'>
[[0.15745841 0.38738185 0.83606488 1. 1. ]
[0.55461605 0. 1. 0.35537028 0. ]
[1. 1. 0. 0.69794079 0.74758006]
[0. 0.77754509 0.21773141 0. 0.39920875]]
)
(0, 2)
BaseEstimatorDebugInformation(LinearRegression)
predict(
shape=(4, 10) type=<class 'numpy.ndarray'>
[[-0.69882086 -0.40301849 0.77570501 1.30139781 1.23157427 0.15745841
0.38738185 0.83606488 1. 1. ]
[ 0.32698426 -1.41778717 1.16987444 -0.42239009 -1.42667488 0.55461605
0. 1. 0.35537028 0. ]
[ 1.47735151 1.20176944 -1.23454898 0.49366898 0.56057917 1.
...
) -> (
shape=(4,) type=<class 'numpy.ndarray'>
[-1.26141698 0.00345816 -0.13232125 0.454529 ]
)
Total running time of the script: (0 minutes 0.282 seconds)