Measuring Gemm performance#

Different element types, different engines, different providers: this example benchmarks the same Gemm-like model across all these configurations.

ONNX Model#

import platform
from itertools import product
import numpy
from tqdm import tqdm
import matplotlib.pyplot as plt
from pandas import DataFrame, pivot_table
from onnx import TensorProto
from onnx.helper import (
    make_model,
    make_node,
    make_graph,
    make_tensor_value_info,
    make_opsetid,
)
from onnx.checker import check_model
from onnx.numpy_helper import from_array
from onnxruntime import InferenceSession, get_available_providers
from onnxruntime.capi._pybind_state import (
    OrtValue as C_OrtValue,
    OrtDevice as C_OrtDevice,
)
from onnxruntime.capi.onnxruntime_pybind11_state import (
    NotImplemented,
    InvalidGraph,
    InvalidArgument,
)
from onnx_extended.reference import CReferenceEvaluator
from onnx_extended.ext_test_case import unit_test_going, measure_time


def create_model(mat_type=TensorProto.FLOAT):
    I1 = from_array(numpy.array([1], dtype=numpy.float32), name="I")
    A = make_tensor_value_info("A", mat_type, [None, None])
    B = make_tensor_value_info("B", mat_type, [None, None])
    C = make_tensor_value_info("C", mat_type, [None, None])
    # the model computes C = A@B + (A+1)@B + (A+2)@B + (A+3)@B
    nodes = [
        make_node("CastLike", ["I", "A"], ["Ic"]),
        make_node("Add", ["A", "Ic"], ["A1"]),
        make_node("Add", ["A1", "Ic"], ["A2"]),
        make_node("Add", ["A2", "Ic"], ["A3"]),
        make_node("MatMul", ["A", "B"], ["M0"]),
        make_node("MatMul", ["A1", "B"], ["M1"]),
        make_node("MatMul", ["A2", "B"], ["M2"]),
        make_node("MatMul", ["A3", "B"], ["M3"]),
        make_node("Add", ["M0", "M1"], ["M12"]),
        make_node("Add", ["M2", "M3"], ["M23"]),
        make_node("Add", ["M12", "M23"], ["C"]),
    ]
    graph = make_graph(nodes, "a", [A, B], [C], [I1])
    if mat_type < 16:
        # regular types (TensorProto values below 16)
        opset, ir = 18, 8
    else:
        # bfloat16 and the float 8 types require opset 19 and ir 9
        opset, ir = 19, 9
    onnx_model = make_model(
        graph, opset_imports=[make_opsetid("", opset)], ir_version=ir
    )
    check_model(onnx_model)
    return onnx_model


create_model()
ir_version: 8
graph {
  node {
    input: "I"
    input: "A"
    output: "Ic"
    op_type: "CastLike"
  }
  node {
    input: "A"
    input: "Ic"
    output: "A1"
    op_type: "Add"
  }
  node {
    input: "A1"
    input: "Ic"
    output: "A2"
    op_type: "Add"
  }
  node {
    input: "A2"
    input: "Ic"
    output: "A3"
    op_type: "Add"
  }
  node {
    input: "A"
    input: "B"
    output: "M0"
    op_type: "MatMul"
  }
  node {
    input: "A1"
    input: "B"
    output: "M1"
    op_type: "MatMul"
  }
  node {
    input: "A2"
    input: "B"
    output: "M2"
    op_type: "MatMul"
  }
  node {
    input: "A3"
    input: "B"
    output: "M3"
    op_type: "MatMul"
  }
  node {
    input: "M0"
    input: "M1"
    output: "M12"
    op_type: "Add"
  }
  node {
    input: "M2"
    input: "M3"
    output: "M23"
    op_type: "Add"
  }
  node {
    input: "M12"
    input: "M23"
    output: "C"
    op_type: "Add"
  }
  name: "a"
  initializer {
    dims: 1
    data_type: 1
    name: "I"
    raw_data: "\000\000\200?"
  }
  input {
    name: "A"
    type {
      tensor_type {
        elem_type: 1
        shape {
          dim {
          }
          dim {
          }
        }
      }
    }
  }
  input {
    name: "B"
    type {
      tensor_type {
        elem_type: 1
        shape {
          dim {
          }
          dim {
          }
        }
      }
    }
  }
  output {
    name: "C"
    type {
      tensor_type {
        elem_type: 1
        shape {
          dim {
          }
          dim {
          }
        }
      }
    }
  }
}
opset_import {
  domain: ""
  version: 18
}
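
As a sanity check, the model can be compared against the equivalent numpy expression, since C = A@B + (A+1)@B + (A+2)@B + (A+3)@B. A minimal sketch, run with onnxruntime on CPU:

a0 = numpy.arange(4).reshape((2, 2)).astype(numpy.float32)
b0 = numpy.ones((2, 2), dtype=numpy.float32)
# C = A@B + (A+1)@B + (A+2)@B + (A+3)@B
expected = sum((a0 + i) @ b0 for i in range(4))
check = InferenceSession(
    create_model().SerializeToString(), providers=["CPUExecutionProvider"]
)
got = check.run(None, {"A": a0, "B": b0})[0]
assert numpy.allclose(expected, got)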

A model to cast a float32 tensor into the benchmarked type.

def create_cast(to):
    A = make_tensor_value_info("A", TensorProto.FLOAT, [None, None])
    C = make_tensor_value_info("C", to, [None, None])
    node1 = make_node("Cast", ["A"], ["C"], to=to)
    graph = make_graph([node1], "a", [A], [C])
    if to < 16:
        # regular types (TensorProto values below 16)
        opset, ir = 18, 8
    else:
        # bfloat16 and the float 8 types require opset 19 and ir 9
        opset, ir = 19, 9
    onnx_model = make_model(
        graph, opset_imports=[make_opsetid("", opset)], ir_version=ir
    )
    check_model(onnx_model)
    return onnx_model


create_cast(TensorProto.FLOAT16)
ir_version: 8
graph {
  node {
    input: "A"
    output: "C"
    op_type: "Cast"
    attribute {
      name: "to"
      i: 10
      type: INT
    }
  }
  name: "a"
  input {
    name: "A"
    type {
      tensor_type {
        elem_type: 1
        shape {
          dim {
          }
          dim {
          }
        }
      }
    }
  }
  output {
    name: "C"
    type {
      tensor_type {
        elem_type: 10
        shape {
          dim {
          }
          dim {
          }
        }
      }
    }
  }
}
opset_import {
  domain: ""
  version: 18
}
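
For example, running this model through onnxruntime converts a float32 array into float16; a small sketch assuming a CPU build:

cast16 = InferenceSession(
    create_cast(TensorProto.FLOAT16).SerializeToString(),
    providers=["CPUExecutionProvider"],
)
x32 = numpy.random.randn(2, 3).astype(numpy.float32)
x16 = cast16.run(None, {"A": x32})[0]
assert x16.dtype == numpy.float16 and x16.shape == (2, 3)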

Performance#

The benchmark will run the following configurations.

types = [
    TensorProto.FLOAT,
    TensorProto.UINT32,
    TensorProto.INT32,
    TensorProto.INT16,
    TensorProto.INT8,
    TensorProto.FLOAT16,
    TensorProto.BFLOAT16,
    TensorProto.FLOAT8E4M3FN,
    TensorProto.FLOAT8E5M2,
]
engines = [CReferenceEvaluator, InferenceSession]
providers = [
    ["CPUExecutionProvider"],
    ["CUDAExecutionProvider", "CPUExecutionProvider"],
]
# M, N, K
dims = [
    (10, 10, 10),
    (61, 62, 63),
    (64, 64, 64),
    (65, 66, 67),
    (100, 100, 100),
    (128, 128, 128),
    # (256, 256, 256),
    # (400, 400, 400),
    # (512, 512, 512),
]


map_type = {TensorProto.FLOAT: numpy.float32, TensorProto.FLOAT16: numpy.float16}
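
The grid is the cartesian product of these lists. A quick count of the configurations the benchmark loop below iterates over:

n_configs = len(types) * len(engines) * len(providers) * len(dims)
print(n_configs)  # 9 types x 2 engines x 2 providers x 6 dims = 216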

Let’s cache the matrices involved. Every matrix is created in float32 and cast to the target type with the model above; types the installed onnxruntime cannot cast to are skipped.

def to_ort_value(m):
    # wrap a numpy array into an OrtValue stored on CPU
    device = C_OrtDevice(C_OrtDevice.cpu(), C_OrtDevice.default_memory(), 0)
    ort_value = C_OrtValue.ortvalue_from_numpy(m, device)
    return ort_value


matrices = {}
for m, n, k in dims:
    for tt in types:
        for i, j in [(m, k), (k, n)]:
            try:
                sess = InferenceSession(
                    create_cast(tt).SerializeToString(),
                    providers=["CPUExecutionProvider"],
                )
            except (InvalidGraph, InvalidArgument):
                # not supported by this version of onnxruntime
                continue
            vect = (numpy.random.randn(i, j) * 10).astype(numpy.float32)
            ov = to_ort_value(vect)
            ovtt = sess._sess.run_with_ort_values({"A": ov}, ["C"], None)[0]
            matrices[tt, i, j] = ovtt

print(f"{len(matrices)} matrices were created.")
72 matrices were created.
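
Each cached entry is an OrtValue holding the matrix in the requested type. A quick sketch to inspect one of them through the .numpy() accessor used below:

ov = matrices[TensorProto.FLOAT, 10, 10]
print(ov.numpy().dtype, ov.numpy().shape)  # float32 (10, 10)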

Let’s run the benchmark. Configurations a backend does not support are skipped and the corresponding errors are collected.

data = []
errors = []
pbar = tqdm(list(product(types, engines, providers, dims)))
for tt, engine, provider, dim in pbar:
    if max(dim) <= 200:
        repeat, number = 50, 25
    elif max(dim) <= 256:
        repeat, number = 25, 10
    else:
        repeat, number = 10, 4

    onx = create_model(tt)
    with open(f"plot_bench_gemm_{tt}.onnx", "wb") as f:
        f.write(onx.SerializeToString())
    k1 = (tt, dim[0], dim[2])
    k2 = (tt, dim[2], dim[1])
    if k1 not in matrices:
        errors.append(f"Key k1={k1!r} not in matrices.")
        continue
    if k2 not in matrices:
        errors.append(f"Key k2={k2!r} not in matrices.")
        continue

    if engine == CReferenceEvaluator:
        if tt == TensorProto.FLOAT16 and max(dim) > 50:
            # the reference evaluator is slow on float16, reduce the measures
            repeat, number = 2, 2
        if provider != ["CPUExecutionProvider"]:
            continue
        if tt not in [TensorProto.FLOAT, TensorProto.FLOAT16]:
            continue

        pbar.set_description(
            f"t={tt} e={engine.__name__} p={provider[0][:4]} dim={dim}"
        )

        feeds = {"A": matrices[k1].numpy(), "B": matrices[k2].numpy()}
        sess = engine(onx)
        sess.run(None, feeds)  # warm-up run before measuring
        obs = measure_time(lambda: sess.run(None, feeds), repeat=repeat, number=number)

    elif engine == InferenceSession:
        if provider[0] not in get_available_providers():
            continue
        pbar.set_description(
            f"t={tt} e={engine.__name__} p={provider[0][:4]} dim={dim}"
        )
        feeds = {"A": matrices[k1], "B": matrices[k2]}
        try:
            sess = engine(onx.SerializeToString(), providers=provider)
        except (NotImplemented, InvalidGraph) as e:
            # not implemented
            errors.append(e)
            continue

        if provider == ["CPUExecutionProvider"]:
            the_feeds = feeds
        else:
            # moving values to CUDA
            device = C_OrtDevice(C_OrtDevice.cuda(), C_OrtDevice.default_memory(), 0)
            try:
                the_feeds = {
                    k: C_OrtValue.ortvalue_from_numpy(v.numpy(), device)
                    for k, v in feeds.items()
                }
            except RuntimeError as e:
                errors.append(f"issue with cuda and type {tt} - {e}")
                continue

        # warm-up run before measuring
        sess._sess.run_with_ort_values(the_feeds, ["C"], None)[0]
        obs = measure_time(
            lambda: sess._sess.run_with_ort_values(the_feeds, ["C"], None)[0],
            repeat=repeat,
            number=number,
        )

    else:
        continue

    obs.update(
        dict(
            engine={"InferenceSession": "ort", "CReferenceEvaluator": "np"}[
                engine.__name__
            ],
            type={
                TensorProto.FLOAT: "f32",
                TensorProto.FLOAT16: "f16",
                TensorProto.INT8: "i8",
                TensorProto.INT16: "i16",
                TensorProto.INT32: "i32",
                TensorProto.UINT32: "u32",
            }[tt],
            M=dim[0],
            N=dim[1],
            K=dim[2],
            # proxy for the amount of computation: 4 MatMuls of size MxKxN
            cost=numpy.prod(dim) * 4,
            cost_s=f"{numpy.prod(dim) * 4}-{dim[0]}x{dim[1]}x{dim[2]}",
            repeat=repeat,
            number=number,
            provider={"CPUExecutionProvider": "cpu", "CUDAExecutionProvider": "cuda"}[
                provider[0]
            ],
            platform=platform.processor(),
        )
    )
    data.append(obs)
    if unit_test_going() and len(data) >= 2:
        break


df = DataFrame(data)
df.to_excel("plot_bench_gemm.xlsx")
df.to_csv("plot_bench_gemm.csv")
df.drop(["min_exec", "max_exec"], axis=1).to_csv("plot_bench_gemm_.csv")
df
t=19 e=InferenceSession p=CPUE dim=(128, 128, 128): 100%|##########| 216/216 [00:15<00:00, 14.17it/s]
average deviation min_exec max_exec repeat number ttime context_size engine type M N K cost cost_s provider platform
0 0.000098 0.000017 0.000086 0.000159 50 25 0.004923 64 np f32 10 10 10 4000 4000-10x10x10 cpu x86_64
1 0.000135 0.000010 0.000129 0.000181 50 25 0.006764 64 np f32 61 62 63 953064 953064-61x62x63 cpu x86_64
2 0.000133 0.000010 0.000127 0.000193 50 25 0.006672 64 np f32 64 64 64 1048576 1048576-64x64x64 cpu x86_64
3 0.001358 0.004381 0.000415 0.031639 50 25 0.067895 64 np f32 65 66 67 1149720 1149720-65x66x67 cpu x86_64
4 0.001290 0.000601 0.000726 0.003400 50 25 0.064503 64 np f32 100 100 100 4000000 4000000-100x100x100 cpu x86_64
5 0.000941 0.000348 0.000689 0.002080 50 25 0.047035 64 np f32 128 128 128 8388608 8388608-128x128x128 cpu x86_64
6 0.000070 0.000126 0.000029 0.000710 50 25 0.003508 64 ort f32 10 10 10 4000 4000-10x10x10 cpu x86_64
7 0.000071 0.000014 0.000046 0.000091 50 25 0.003528 64 ort f32 61 62 63 953064 953064-61x62x63 cpu x86_64
8 0.000065 0.000004 0.000061 0.000076 50 25 0.003248 64 ort f32 64 64 64 1048576 1048576-64x64x64 cpu x86_64
9 0.000070 0.000018 0.000050 0.000109 50 25 0.003511 64 ort f32 65 66 67 1149720 1149720-65x66x67 cpu x86_64
10 0.000117 0.000014 0.000093 0.000169 50 25 0.005870 64 ort f32 100 100 100 4000000 4000000-100x100x100 cpu x86_64
11 0.000225 0.000025 0.000181 0.000309 50 25 0.011246 64 ort f32 128 128 128 8388608 8388608-128x128x128 cpu x86_64
12 0.000030 0.000013 0.000020 0.000054 50 25 0.001505 64 ort i32 10 10 10 4000 4000-10x10x10 cpu x86_64
13 0.000338 0.000051 0.000288 0.000547 50 25 0.016887 64 ort i32 61 62 63 953064 953064-61x62x63 cpu x86_64
14 0.000344 0.000039 0.000315 0.000544 50 25 0.017199 64 ort i32 64 64 64 1048576 1048576-64x64x64 cpu x86_64
15 0.000388 0.000046 0.000347 0.000573 50 25 0.019423 64 ort i32 65 66 67 1149720 1149720-65x66x67 cpu x86_64
16 0.001401 0.000324 0.001140 0.002769 50 25 0.070059 64 ort i32 100 100 100 4000000 4000000-100x100x100 cpu x86_64
17 0.003157 0.000791 0.002371 0.005002 50 25 0.157834 64 ort i32 128 128 128 8388608 8388608-128x128x128 cpu x86_64
18 0.000152 0.000030 0.000126 0.000307 50 25 0.007576 64 np f16 10 10 10 4000 4000-10x10x10 cpu x86_64
19 0.007516 0.000252 0.007263 0.007768 2 2 0.015031 64 np f16 61 62 63 953064 953064-61x62x63 cpu x86_64
20 0.008402 0.000310 0.008092 0.008712 2 2 0.016804 64 np f16 64 64 64 1048576 1048576-64x64x64 cpu x86_64
21 0.008053 0.000081 0.007972 0.008134 2 2 0.016106 64 np f16 65 66 67 1149720 1149720-65x66x67 cpu x86_64
22 0.029565 0.000179 0.029386 0.029744 2 2 0.059130 64 np f16 100 100 100 4000000 4000000-100x100x100 cpu x86_64
23 0.058163 0.000116 0.058046 0.058279 2 2 0.116326 64 np f16 128 128 128 8388608 8388608-128x128x128 cpu x86_64
24 0.000023 0.000008 0.000017 0.000043 50 25 0.001164 64 ort f16 10 10 10 4000 4000-10x10x10 cpu x86_64
25 0.000105 0.000015 0.000069 0.000137 50 25 0.005261 64 ort f16 61 62 63 953064 953064-61x62x63 cpu x86_64
26 0.000113 0.000027 0.000077 0.000217 50 25 0.005674 64 ort f16 64 64 64 1048576 1048576-64x64x64 cpu x86_64
27 0.000110 0.000024 0.000082 0.000168 50 25 0.005515 64 ort f16 65 66 67 1149720 1149720-65x66x67 cpu x86_64
28 0.000229 0.000028 0.000168 0.000298 50 25 0.011461 64 ort f16 100 100 100 4000000 4000000-100x100x100 cpu x86_64
29 0.000334 0.000050 0.000237 0.000469 50 25 0.016705 64 ort f16 128 128 128 8388608 8388608-128x128x128 cpu x86_64


The errors show which configurations onnxruntime rejected.

for e in list(sorted(set(map(str, errors)))):
    print(e)
[ONNXRuntimeError] : 10 : INVALID_GRAPH : This is an invalid model. Type Error: Type 'tensor(float8e4m3fn)' of input parameter (A) of operator (MatMul) in node () is invalid.
[ONNXRuntimeError] : 10 : INVALID_GRAPH : This is an invalid model. Type Error: Type 'tensor(float8e5m2)' of input parameter (A) of operator (MatMul) in node () is invalid.
[ONNXRuntimeError] : 10 : INVALID_GRAPH : This is an invalid model. Type Error: Type 'tensor(int16)' of input parameter (A) of operator (MatMul) in node () is invalid.
[ONNXRuntimeError] : 10 : INVALID_GRAPH : This is an invalid model. Type Error: Type 'tensor(int8)' of input parameter (A) of operator (MatMul) in node () is invalid.
[ONNXRuntimeError] : 9 : NOT_IMPLEMENTED : Could not find an implementation for Add(14) node with name ''
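
The types which did produce measurements can be listed directly from the dataframe:

print(sorted(set(df["type"])))  # ['f16', 'f32', 'i32'] on this run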

Plots#

piv = pivot_table(
    df, index=["cost"], columns=["engine", "type", "provider"], values="average"
)
piv.reset_index(drop=False).to_excel("plot_bench_gemm_summary.xlsx")
piv.reset_index(drop=False).to_csv("plot_bench_gemm_summary.csv")
print(piv)
piv
engine          np                 ort
type           f16       f32       f16       f32       i32
provider       cpu       cpu       cpu       cpu       cpu
cost
4000      0.000152  0.000098  0.000023  0.000070  0.000030
953064    0.007516  0.000135  0.000105  0.000071  0.000338
1048576   0.008402  0.000133  0.000113  0.000065  0.000344
1149720   0.008053  0.001358  0.000110  0.000070  0.000388
4000000   0.029565  0.001290  0.000229  0.000117  0.001401
8388608   0.058163  0.000941  0.000334  0.000225  0.003157


The same summary with the dimensions spelled out.

pivs = pivot_table(
    df, index=["cost_s"], columns=["engine", "type", "provider"], values="average"
)
print(pivs)
engine                     np                 ort
type                      f16       f32       f16       f32       i32
provider                  cpu       cpu       cpu       cpu       cpu
cost_s
1048576-64x64x64     0.008402  0.000133  0.000113  0.000065  0.000344
1149720-65x66x67     0.008053  0.001358  0.000110  0.000070  0.000388
4000-10x10x10        0.000152  0.000098  0.000023  0.000070  0.000030
4000000-100x100x100  0.029565  0.001290  0.000229  0.000117  0.001401
8388608-128x128x128  0.058163  0.000941  0.000334  0.000225  0.003157
953064-61x62x63      0.007516  0.000135  0.000105  0.000071  0.000338
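
The pivot also makes it easy to quantify the gap between two configurations, for instance how much faster onnxruntime is than the reference evaluator on float16 (a sketch assuming both columns are present):

ratio = pivs[("np", "f16", "cpu")] / pivs[("ort", "f16", "cpu")]
print(ratio)  # values > 1 mean onnxruntime is faster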

Let’s plot the results.

dfi = df[
    df.type.isin({"f32", "f16", "bf16", "f8e4m3", "f8e5m2"}) & df.engine.isin({"ort"})
]
pivi = pivot_table(
    dfi, index=["cost"], columns=["engine", "type", "provider"], values="average"
)

fig, ax = plt.subplots(1, 2, figsize=(12, 6))
piv.plot(ax=ax[0], title="Gemm performance\nlower is better", logx=True, logy=True)
if pivi.shape[0] > 0:
    pivi.plot(
        ax=ax[1],
        title=f"Gemm performance ORT\n{platform.processor()}",
        logx=True,
        logy=True,
    )
fig.tight_layout()
fig.savefig("plot_bench_gemm.png")
[Figure: left, "Gemm performance, lower is better"; right, "Gemm performance ORT" on x86_64]

Total running time of the script: ( 0 minutes 16.194 seconds)
