Measuring onnxruntime performance against a Cython binding

The following code compares the performance of onnxruntime's Python binding with a Cython binding. The time spent in the binding itself is not significant when the computation is heavy, but it can matter for small matrices.

import numpy
from pandas import DataFrame
import matplotlib.pyplot as plt
from tqdm import tqdm
from onnx import numpy_helper, TensorProto
from onnx.helper import (
    make_model,
    make_node,
    make_graph,
    make_tensor_value_info,
    make_opsetid,
)
from onnx.checker import check_model
from onnxruntime import InferenceSession
from onnx_extended.ortcy.wrap.ortinf import OrtSession
from onnx_extended.args import get_parsed_args
from onnx_extended.ext_test_case import measure_time, unit_test_going


script_args = get_parsed_args(
    "plot_bench_cypy_ort",
    description=__doc__,
    dims=(
        "1,10" if unit_test_going() else "1,10,100,1000",
        "square matrix dimensions to try, comma separated values",
    ),
    expose="repeat,number",
)

A simple onnx model

A = numpy_helper.from_array(numpy.array([1], dtype=numpy.float32), name="A")
X = make_tensor_value_info("X", TensorProto.FLOAT, [None, None])
Y = make_tensor_value_info("Y", TensorProto.FLOAT, [None, None])
node1 = make_node("Add", ["X", "A"], ["Y"])
graph = make_graph([node1], "+1", [X], [Y], [A])
onnx_model = make_model(graph, opset_imports=[make_opsetid("", 18)], ir_version=8)
check_model(onnx_model)
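
Before benchmarking anything, the model can be sanity-checked with the pure Python reference evaluator shipped with onnx (a quick verification sketch, assuming a recent onnx version where ReferenceEvaluator is available):

from onnx.reference import ReferenceEvaluator

# Evaluate the single Add node without onnxruntime: the output must be X + 1.
ref = ReferenceEvaluator(onnx_model)
t = numpy.random.randn(2, 3).astype(numpy.float32)
assert numpy.allclose(ref.run(None, {"X": t})[0], t + 1)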

Two Python bindings on CPU

# Standard Python binding shipped with onnxruntime.
sess_ort = InferenceSession(
    onnx_model.SerializeToString(), providers=["CPUExecutionProvider"]
)
# Cython binding implemented in onnx-extended.
sess_ext = OrtSession(onnx_model.SerializeToString())

x = numpy.random.randn(10, 10).astype(numpy.float32)
y = x + 1

y_ort = sess_ort.run(None, {"X": x})[0]
y_ext = sess_ext.run([x])[0]

d_ort = numpy.abs(y_ort - y).sum()
d_ext = numpy.abs(y_ext - y).sum()
print(f"Discrepancies: d_ort={d_ort}, d_ext={d_ext}")
Discrepancies: d_ort=0.0, d_ext=0.0

Time measurement

run_1_1 is a specialized implementation for the case of exactly one input and one output.
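
To make the calling conventions concrete: run takes and returns lists, while run_1_1 takes the single input array and returns the single output array directly (a short sketch reusing x and sess_ext defined above):

out_list = sess_ext.run([x])      # list containing the single output array
out_single = sess_ext.run_1_1(x)  # the output array itself
assert numpy.allclose(out_list[0], out_single)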

t_ort = measure_time(lambda: sess_ort.run(None, {"X": x})[0], number=200, repeat=100)
print(f"t_ort={t_ort}")

t_ext = measure_time(lambda: sess_ext.run([x])[0], number=200, repeat=100)
print(f"t_ext={t_ext}")

t_ext2 = measure_time(lambda: sess_ext.run_1_1(x), number=200, repeat=100)
print(f"t_ext2={t_ext2}")
t_ort={'average': np.float64(5.4070025496912415e-06), 'deviation': np.float64(2.1811158847588845e-06), 'min_exec': np.float64(4.663300005631754e-06), 'max_exec': np.float64(2.5058569999600877e-05), 'repeat': 100, 'number': 200, 'ttime': np.float64(0.0005407002549691241), 'context_size': 64, 'warmup_time': 0.0001342309988103807}
t_ext={'average': np.float64(5.1815271996019875e-06), 'deviation': np.float64(4.304988726654559e-07), 'min_exec': np.float64(4.901085003439221e-06), 'max_exec': np.float64(7.735379995210678e-06), 'repeat': 100, 'number': 200, 'ttime': np.float64(0.0005181527199601988), 'context_size': 64, 'warmup_time': 8.35049995657755e-05}
t_ext2={'average': np.float64(4.72687039982702e-06), 'deviation': np.float64(5.308661812130688e-07), 'min_exec': np.float64(4.307794997657766e-06), 'max_exec': np.float64(7.513645005019498e-06), 'repeat': 100, 'number': 200, 'ttime': np.float64(0.00047268703998270205), 'context_size': 64, 'warmup_time': 2.5773000743356533e-05}

Benchmark

dims = [int(i) for i in script_args.dims.split(",")]

data = []
for dim in tqdm(dims):
    if dim < 1000:
        number, repeat = script_args.number, script_args.repeat
    else:
        number, repeat = script_args.number * 5, script_args.repeat * 5
    x = numpy.random.randn(dim, dim).astype(numpy.float32)
    # The default argument x=x binds the current array to the lambda.
    t_ort = measure_time(
        lambda x=x: sess_ort.run(None, {"X": x})[0], number=number, repeat=50
    )
    t_ort["name"] = "ort"
    t_ort["dim"] = dim
    data.append(t_ort)

    t_ext = measure_time(lambda x=x: sess_ext.run([x])[0], number=number, repeat=repeat)
    t_ext["name"] = "ext"
    t_ext["dim"] = dim
    data.append(t_ext)

    t_ext2 = measure_time(lambda x=x: sess_ext.run_1_1(x), number=number, repeat=repeat)
    t_ext2["name"] = "ext_1_1"
    t_ext2["dim"] = dim
    data.append(t_ext2)

    if unit_test_going() and dim >= 10:
        break


df = DataFrame(data)
df
  0%|          | 0/4 [00:00<?, ?it/s]
100%|██████████| 4/4 [00:01<00:00,  2.38it/s]
     average     deviation  min_exec  max_exec  repeat  number     ttime  context_size  warmup_time     name   dim
 0  0.000005  3.113291e-07  0.000005  0.000007      50      10  0.000254            64     0.000072      ort     1
 1  0.000006  1.095015e-06  0.000005  0.000009      10      10  0.000056            64     0.000068      ext     1
 2  0.000007  4.366461e-06  0.000004  0.000018      10      10  0.000071            64     0.000016  ext_1_1     1
 3  0.000006  3.298409e-06  0.000005  0.000022      50      10  0.000288            64     0.000042      ort    10
 4  0.000005  3.370656e-07  0.000005  0.000006      10      10  0.000053            64     0.000059      ext    10
 5  0.000006  5.555920e-06  0.000004  0.000023      10      10  0.000064            64     0.000015  ext_1_1    10
 6  0.000006  1.399302e-06  0.000006  0.000013      50      10  0.000313            64     0.000038      ort   100
 7  0.000007  2.568428e-07  0.000007  0.000008      10      10  0.000071            64     0.000042      ext   100
 8  0.000006  7.985796e-08  0.000006  0.000007      10      10  0.000064            64     0.000016  ext_1_1   100
 9  0.000062  2.124362e-05  0.000035  0.000132      50      50  0.003124            64     0.005340      ort  1000
10  0.000294  5.240357e-05  0.000247  0.000469      50      50  0.014722            64     0.003543      ext  1000
11  0.000297  7.914858e-05  0.000241  0.000518      50      50  0.014847            64     0.000593  ext_1_1  1000


Plots

piv = df.pivot(index="dim", columns="name", values="average")

fig, ax = plt.subplots(1, 1)
piv.plot(ax=ax, title="Binding Comparison", logy=True, logx=True)
fig.tight_layout()
fig.savefig("plot_bench_ort.png")
[Figure: Binding Comparison, average execution time of each binding versus the matrix dimension (log-log scale), saved as plot_bench_ort.png]
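
To quantify the gap rather than read it off the log-log plot, the pivoted table can also be expressed as ratios relative to the plain onnxruntime binding (a small follow-up sketch built on the piv DataFrame above):

# Average time of each binding divided by the onnxruntime binding;
# values below 1 mean that binding is faster for that dimension.
ratios = piv.div(piv["ort"], axis=0)
print(ratios)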

Total running time of the script: (0 minutes 2.464 seconds)
