Compare multiple versions of onnxruntime

One important task is to check that onnxruntime does not run slower with a new version. The following tools were developed for that purpose.

Step 1: save a test

We first need to save the model and the inputs onnxruntime must be evaluated on. This is done with the function save_for_benchmark_or_test.

<<<

import os
import numpy as np
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from skl2onnx import to_onnx
from onnx_extended.tools.run_onnx import save_for_benchmark_or_test
from onnx_extended.args import get_parsed_args

# The dimensions of the problem (they can be changed on the command line).

args = get_parsed_args(
    "create_bench",
    **dict(
        batch_size=(10, "batch size"),
        n_features=(10, "number of features"),
        n_trees=(10, "number of trees"),
        max_depth=(3, "max depth"),
    ),
)

batch_size = args.batch_size
n_features = args.n_features
n_trees = args.n_trees
max_depth = args.max_depth

# Let's create a model.
X, y = make_regression(
    batch_size + 2**max_depth * 2, n_features=n_features, n_targets=1
)
X, y = X.astype(np.float32), y.astype(np.float32)

print(
    f"train RandomForestRegressor n_trees={n_trees} "
    f"n_features={n_features} batch_size={batch_size} "
    f"max_depth={max_depth}"
)
model = RandomForestRegressor(n_trees, max_depth=max_depth, n_jobs=-1, verbose=1)
model.fit(X[:-batch_size], y[:-batch_size])

# target_opset is used to select an opset an old version of onnxruntime can process.
print("conversion to onnx")
onx = to_onnx(model, X[:1], target_opset=17)

print(f"size: {len(onx.SerializeToString())}")

# Let's save the model and the inputs on disk.
folder = f"test_ort_version-F{n_features}-T{n_trees}-D{max_depth}-B{batch_size}"
if not os.path.exists(folder):
    os.mkdir(folder)

print("create the benchmark")
inputs = [X[:batch_size]]
save_for_benchmark_or_test(folder, "rf", onx, inputs)

print("end")
# Let's see what was saved.
for r, d, f in os.walk(folder):
    for name in f:
        full_name = os.path.join(r, name)
        print(f"{os.stat(full_name).st_size / 2 ** 10:1.1f} Kb: {full_name}")

>>>

    train RandomForestRegressor n_trees=10 n_features=10 batch_size=10 max_depth=3
    conversion to onnx
    size: 4446
    create the benchmark
    end
    4.3 Kb: test_ort_version-F10-T10-D3-B10/rf/model.onnx
    0.4 Kb: test_ort_version-F10-T10-D3-B10/rf/test_data_set_0/input_0.pb
    0.1 Kb: test_ort_version-F10-T10-D3-B10/rf/test_data_set_0/output_0.pb
    [runpythonerror]
    [Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
    [Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    0.0s remaining:    0.0s
    [Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished

The outputs are not used to measure the performance but they can be used to evaluate the discrepancies.
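
The saved folder follows the layout listed above: a model.onnx file next to a test_data_set_0 directory containing the inputs and outputs serialized as TensorProto. The following is a minimal sketch, not part of the library, showing how the discrepancies could be checked by hand against whichever onnxruntime is currently installed:

    import numpy as np
    from onnx import TensorProto
    from onnx.numpy_helper import to_array
    from onnxruntime import InferenceSession

    folder = "test_ort_version-F10-T10-D3-B10/rf"

    def load_tensor(path):
        # Inputs and outputs are stored as serialized TensorProto files.
        tensor = TensorProto()
        with open(path, "rb") as f:
            tensor.ParseFromString(f.read())
        return to_array(tensor)

    x = load_tensor(f"{folder}/test_data_set_0/input_0.pb")
    expected = load_tensor(f"{folder}/test_data_set_0/output_0.pb")

    # Run the saved model with the installed version of onnxruntime.
    sess = InferenceSession(
        f"{folder}/model.onnx", providers=["CPUExecutionProvider"]
    )
    got = sess.run(None, {sess.get_inputs()[0].name: x})[0]

    # Compare the new output with the saved one.
    print("max absolute difference:", np.abs(got.ravel() - expected.ravel()).max())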

Step 2: evaluate multiple versions of onnxruntime

This step relies on the function bench_virtual. It installs every listed combination of modules in a virtual environment and runs the saved test with each of them.

import os
import platform
import psutil
from onnx_extended.tools.run_onnx import bench_virtual
from onnx_extended.args import get_parsed_args

args = get_parsed_args(
    "run_bench",
    **dict(
        test_name=(
            "test_ort_version-F10-T10-D3-B10",
            "folder containing the benchmark to run",
        ),
    ),
)

name = args.test_name
folder = os.path.abspath(f"{name}/rf")
if not os.path.exists(folder):
    raise FileNotFoundError(f"Unable to find {folder!r}.")
virtual_env = os.path.abspath("venv")

runtimes = ["onnxruntime"]
modules = [
    {"onnx-extended": "0.2.3", "onnx": "1.15.0", "onnxruntime": "1.17.3"},
    {"onnx-extended": "0.2.3", "onnx": "1.15.0", "onnxruntime": "1.16.3"},
    {"onnx-extended": "0.2.3", "onnx": "1.15.0", "onnxruntime": "1.15.1"},
    {"onnx-extended": "0.2.3", "onnx": "1.15.0", "onnxruntime": "1.14.1"},
    {"onnx-extended": "0.2.3", "onnx": "1.15.0", "onnxruntime": "1.13.1"},
    {"onnx-extended": "0.2.3", "onnx": "1.15.0", "onnxruntime": "1.12.1"},
]

print("--------------------------")
print(platform.machine(), platform.version(), platform.platform())
print(platform.processor())
print(f"RAM: {psutil.virtual_memory().total / (1024.0 **3):1.3f} GB")
print("Physical cores:", psutil.cpu_count(logical=False))
print("Total cores:", psutil.cpu_count(logical=True))
print("--------------------------")
print(name)
for t in range(3):
    print("--------------------------")
    df = bench_virtual(
        folder,
        virtual_env,
        verbose=1,
        modules=modules,
        runtimes=runtimes,
        warmup=5,
        repeat=10,
        save_as_dataframe=f"result-{name}.t{t}.csv",
        filter_fct=lambda rt, modules: True,
    )

    columns = ["runtime", "b_avg_time", "runtime", "v_onnxruntime"]
    df[columns].to_csv(f"summary-{name}.t{t}.csv")
    print(df[columns])
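
In this script, filter_fct keeps every combination. It receives the runtime name and the module configuration and can return False to skip some of them. A hypothetical filter, assuming the second argument is one of the dictionaries listed in modules:

    def keep_recent_only(rt, conf):
        # Hypothetical filter: only benchmark onnxruntime 1.15 or newer.
        # Assumes `conf` is one of the dictionaries from `modules` above.
        if rt != "onnxruntime":
            return False
        major, minor = conf.get("onnxruntime", "0.0").split(".")[:2]
        return (int(major), int(minor)) >= (1, 15)

It would then be passed as filter_fct=keep_recent_only.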

The output of this script would look like:

    [bench_virtual] 1/5 18:01:02 onnx==1.14.1 onnx-extended==0.2.1 onnxruntime==1.16.0
    [bench_virtual] 2/5 18:01:06 onnx==1.14.1 onnx-extended==0.2.1 onnxruntime==1.15.1
    [bench_virtual] 3/5 18:01:09 onnx==1.14.1 onnx-extended==0.2.1 onnxruntime==1.14.1
    [bench_virtual] 4/5 18:01:12 onnx==1.14.1 onnx-extended==0.2.1 onnxruntime==1.13.1
    [bench_virtual] 5/5 18:01:15 onnx==1.14.1 onnx-extended==0.2.1 onnxruntime==1.12.1
                   runtime  b_avg_time              runtime  v_onnxruntime
    0   ReferenceEvaluator    0.001879   ReferenceEvaluator         1.16.0
    1  CReferenceEvaluator    0.000042  CReferenceEvaluator         1.16.0
    2          onnxruntime    0.000013          onnxruntime         1.16.0
    3          onnxruntime    0.000012          onnxruntime         1.15.1
    4          onnxruntime    0.000017          onnxruntime         1.14.1
    5          onnxruntime    0.000012          onnxruntime         1.13.1
    6          onnxruntime    0.000011          onnxruntime         1.12.1

The differences are not significant on such a small model, except for the python runtime (ReferenceEvaluator).
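
The benchmark also writes one summary file per repetition (summary-{name}.t{t}.csv). A minimal sketch to average the measured times over the three repetitions, assuming the files produced by the script above are in the current directory:

    import glob
    import pandas

    name = "test_ort_version-F10-T10-D3-B10"
    # Gather the summary files written by the benchmark script.
    frames = [pandas.read_csv(f) for f in glob.glob(f"summary-{name}.t*.csv")]
    df = pandas.concat(frames)

    # Average inference time per onnxruntime version over the repetitions.
    print(df.groupby("v_onnxruntime")["b_avg_time"].mean())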