Measuring Gemm performance#

Different element types, different engines, different providers: this example benchmarks the same Gemm-like model across all these configurations.

ONNX Model#

import platform
from itertools import product
import numpy
from tqdm import tqdm
import matplotlib.pyplot as plt
from pandas import DataFrame, pivot_table
from onnx import TensorProto
from onnx.helper import (
    make_model,
    make_node,
    make_graph,
    make_tensor_value_info,
    make_opsetid,
)
from onnx.checker import check_model
from onnx.numpy_helper import from_array
from onnxruntime import InferenceSession, get_available_providers
from onnxruntime.capi._pybind_state import (
    OrtValue as C_OrtValue,
    OrtDevice as C_OrtDevice,
)
from onnxruntime.capi.onnxruntime_pybind11_state import (
    NotImplemented,
    InvalidGraph,
    InvalidArgument,
)
from onnx_extended.reference import CReferenceEvaluator
from onnx_extended.ext_test_case import unit_test_going, measure_time


def create_model(mat_type=TensorProto.FLOAT):
    I1 = from_array(numpy.array([1], dtype=numpy.float32), name="I")
    A = make_tensor_value_info("A", mat_type, [None, None])
    B = make_tensor_value_info("B", mat_type, [None, None])
    C = make_tensor_value_info("C", mat_type, [None, None])
    # the model computes C = A@B + (A+1)@B + (A+2)@B + (A+3)@B
    nodes = [
        make_node("CastLike", ["I", "A"], ["Ic"]),
        make_node("Add", ["A", "Ic"], ["A1"]),
        make_node("Add", ["A1", "Ic"], ["A2"]),
        make_node("Add", ["A2", "Ic"], ["A3"]),
        make_node("MatMul", ["A", "B"], ["M0"]),
        make_node("MatMul", ["A1", "B"], ["M1"]),
        make_node("MatMul", ["A2", "B"], ["M2"]),
        make_node("MatMul", ["A3", "B"], ["M3"]),
        make_node("Add", ["M0", "M1"], ["M12"]),
        make_node("Add", ["M2", "M3"], ["M23"]),
        make_node("Add", ["M12", "M23"], ["C"]),
    ]
    graph = make_graph(nodes, "a", [A, B], [C], [I1])
    if mat_type < 16:
        # regular types (TensorProto values below 16)
        opset, ir = 18, 8
    else:
        # bfloat16 and the float 8 types require opset 19 and ir 9
        opset, ir = 19, 9
    onnx_model = make_model(
        graph, opset_imports=[make_opsetid("", opset)], ir_version=ir
    )
    check_model(onnx_model)
    return onnx_model


create_model()
ir_version: 8
graph {
  node {
    input: "I"
    input: "A"
    output: "Ic"
    op_type: "CastLike"
  }
  node {
    input: "A"
    input: "Ic"
    output: "A1"
    op_type: "Add"
  }
  node {
    input: "A1"
    input: "Ic"
    output: "A2"
    op_type: "Add"
  }
  node {
    input: "A2"
    input: "Ic"
    output: "A3"
    op_type: "Add"
  }
  node {
    input: "A"
    input: "B"
    output: "M0"
    op_type: "MatMul"
  }
  node {
    input: "A1"
    input: "B"
    output: "M1"
    op_type: "MatMul"
  }
  node {
    input: "A2"
    input: "B"
    output: "M2"
    op_type: "MatMul"
  }
  node {
    input: "A3"
    input: "B"
    output: "M3"
    op_type: "MatMul"
  }
  node {
    input: "M0"
    input: "M1"
    output: "M12"
    op_type: "Add"
  }
  node {
    input: "M2"
    input: "M3"
    output: "M23"
    op_type: "Add"
  }
  node {
    input: "M12"
    input: "M23"
    output: "C"
    op_type: "Add"
  }
  name: "a"
  initializer {
    dims: 1
    data_type: 1
    name: "I"
    raw_data: "\000\000\200?"
  }
  input {
    name: "A"
    type {
      tensor_type {
        elem_type: 1
        shape {
          dim {
          }
          dim {
          }
        }
      }
    }
  }
  input {
    name: "B"
    type {
      tensor_type {
        elem_type: 1
        shape {
          dim {
          }
          dim {
          }
        }
      }
    }
  }
  output {
    name: "C"
    type {
      tensor_type {
        elem_type: 1
        shape {
          dim {
          }
          dim {
          }
        }
      }
    }
  }
}
opset_import {
  domain: ""
  version: 18
}
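
As a sanity check, the model can be compared against the equivalent numpy expression, since C = A@B + (A+1)@B + (A+2)@B + (A+3)@B. A minimal sketch, run with onnxruntime on CPU:

a0 = numpy.arange(4).reshape((2, 2)).astype(numpy.float32)
b0 = numpy.ones((2, 2), dtype=numpy.float32)
# C = A@B + (A+1)@B + (A+2)@B + (A+3)@B
expected = sum((a0 + i) @ b0 for i in range(4))
check = InferenceSession(
    create_model().SerializeToString(), providers=["CPUExecutionProvider"]
)
got = check.run(None, {"A": a0, "B": b0})[0]
assert numpy.allclose(expected, got)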

A model to cast a float32 tensor into the benchmarked type.

def create_cast(to):
    A = make_tensor_value_info("A", TensorProto.FLOAT, [None, None])
    C = make_tensor_value_info("C", to, [None, None])
    node1 = make_node("Cast", ["A"], ["C"], to=to)
    graph = make_graph([node1], "a", [A], [C])
    if to < 16:
        # regular types (TensorProto values below 16)
        opset, ir = 18, 8
    else:
        # bfloat16 and the float 8 types require opset 19 and ir 9
        opset, ir = 19, 9
    onnx_model = make_model(
        graph, opset_imports=[make_opsetid("", opset)], ir_version=ir
    )
    check_model(onnx_model)
    return onnx_model


create_cast(TensorProto.FLOAT16)
ir_version: 8
graph {
  node {
    input: "A"
    output: "C"
    op_type: "Cast"
    attribute {
      name: "to"
      i: 10
      type: INT
    }
  }
  name: "a"
  input {
    name: "A"
    type {
      tensor_type {
        elem_type: 1
        shape {
          dim {
          }
          dim {
          }
        }
      }
    }
  }
  output {
    name: "C"
    type {
      tensor_type {
        elem_type: 10
        shape {
          dim {
          }
          dim {
          }
        }
      }
    }
  }
}
opset_import {
  domain: ""
  version: 18
}
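
For example, running this model through onnxruntime converts a float32 array into float16; a small sketch assuming a CPU build:

cast16 = InferenceSession(
    create_cast(TensorProto.FLOAT16).SerializeToString(),
    providers=["CPUExecutionProvider"],
)
x32 = numpy.random.randn(2, 3).astype(numpy.float32)
x16 = cast16.run(None, {"A": x32})[0]
assert x16.dtype == numpy.float16 and x16.shape == (2, 3)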

Performance#

The benchmark will run the following configurations.

types = [
    TensorProto.FLOAT,
    TensorProto.UINT32,
    TensorProto.INT32,
    TensorProto.INT16,
    TensorProto.INT8,
    TensorProto.FLOAT16,
    TensorProto.BFLOAT16,
    TensorProto.FLOAT8E4M3FN,
    TensorProto.FLOAT8E5M2,
]
engines = [CReferenceEvaluator, InferenceSession]
providers = [
    ["CPUExecutionProvider"],
    ["CUDAExecutionProvider", "CPUExecutionProvider"],
]
# M, N, K
dims = [
    (10, 10, 10),
    (61, 62, 63),
    (64, 64, 64),
    (65, 66, 67),
    (100, 100, 100),
    (128, 128, 128),
    # (256, 256, 256),
    # (400, 400, 400),
    # (512, 512, 512),
]


map_type = {TensorProto.FLOAT: numpy.float32, TensorProto.FLOAT16: numpy.float16}
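
The grid is the cartesian product of these lists. A quick count of the configurations the benchmark loop below iterates over:

n_configs = len(types) * len(engines) * len(providers) * len(dims)
print(n_configs)  # 9 types x 2 engines x 2 providers x 6 dims = 216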

Let’s cache the matrices involved. Every matrix is created in float32 and cast to the target type with the model above; types the installed onnxruntime cannot cast to are skipped.

def to_ort_value(m):
    # wrap a numpy array into an OrtValue stored on CPU
    device = C_OrtDevice(C_OrtDevice.cpu(), C_OrtDevice.default_memory(), 0)
    ort_value = C_OrtValue.ortvalue_from_numpy(m, device)
    return ort_value


matrices = {}
for m, n, k in dims:
    for tt in types:
        for i, j in [(m, k), (k, n)]:
            try:
                sess = InferenceSession(
                    create_cast(tt).SerializeToString(),
                    providers=["CPUExecutionProvider"],
                )
            except (InvalidGraph, InvalidArgument):
                # not supported by this version of onnxruntime
                continue
            vect = (numpy.random.randn(i, j) * 10).astype(numpy.float32)
            ov = to_ort_value(vect)
            ovtt = sess._sess.run_with_ort_values({"A": ov}, ["C"], None)[0]
            matrices[tt, i, j] = ovtt

print(f"{len(matrices)} matrices were created.")
72 matrices were created.
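
Each cached entry is an OrtValue holding the matrix in the requested type. A quick sketch to inspect one of them through the .numpy() accessor used below:

ov = matrices[TensorProto.FLOAT, 10, 10]
print(ov.numpy().dtype, ov.numpy().shape)  # float32 (10, 10)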

Let’s run the benchmark. Configurations a backend does not support are skipped and the corresponding errors are collected.

data = []
errors = []
pbar = tqdm(list(product(types, engines, providers, dims)))
for tt, engine, provider, dim in pbar:
    if max(dim) <= 200:
        repeat, number = 50, 25
    elif max(dim) <= 256:
        repeat, number = 25, 10
    else:
        repeat, number = 10, 4

    onx = create_model(tt)
    with open(f"plot_bench_gemm_{tt}.onnx", "wb") as f:
        f.write(onx.SerializeToString())
    k1 = (tt, dim[0], dim[2])
    k2 = (tt, dim[2], dim[1])
    if k1 not in matrices:
        errors.append(f"Key k1={k1!r} not in matrices.")
        continue
    if k2 not in matrices:
        errors.append(f"Key k2={k2!r} not in matrices.")
        continue

    if engine == CReferenceEvaluator:
        if tt == TensorProto.FLOAT16 and max(dim) > 50:
            # the reference evaluator is slow on float16, reduce the measures
            repeat, number = 2, 2
        if provider != ["CPUExecutionProvider"]:
            continue
        if tt not in [TensorProto.FLOAT, TensorProto.FLOAT16]:
            continue

        pbar.set_description(
            f"t={tt} e={engine.__name__} p={provider[0][:4]} dim={dim}"
        )

        feeds = {"A": matrices[k1].numpy(), "B": matrices[k2].numpy()}
        sess = engine(onx)
        sess.run(None, feeds)  # warm-up run before measuring
        obs = measure_time(lambda: sess.run(None, feeds), repeat=repeat, number=number)

    elif engine == InferenceSession:
        if provider[0] not in get_available_providers():
            continue
        pbar.set_description(
            f"t={tt} e={engine.__name__} p={provider[0][:4]} dim={dim}"
        )
        feeds = {"A": matrices[k1], "B": matrices[k2]}
        try:
            sess = engine(onx.SerializeToString(), providers=provider)
        except (NotImplemented, InvalidGraph) as e:
            # not implemented
            errors.append(e)
            continue

        if provider == ["CPUExecutionProvider"]:
            the_feeds = feeds
        else:
            # moving values to CUDA
            device = C_OrtDevice(C_OrtDevice.cuda(), C_OrtDevice.default_memory(), 0)
            try:
                the_feeds = {
                    k: C_OrtValue.ortvalue_from_numpy(v.numpy(), device)
                    for k, v in feeds.items()
                }
            except RuntimeError as e:
                errors.append(f"issue with cuda and type {tt} - {e}")
                continue

        # warm-up run before measuring
        sess._sess.run_with_ort_values(the_feeds, ["C"], None)[0]
        obs = measure_time(
            lambda: sess._sess.run_with_ort_values(the_feeds, ["C"], None)[0],
            repeat=repeat,
            number=number,
        )

    else:
        continue

    obs.update(
        dict(
            engine={"InferenceSession": "ort", "CReferenceEvaluator": "np"}[
                engine.__name__
            ],
            type={
                TensorProto.FLOAT: "f32",
                TensorProto.FLOAT16: "f16",
                TensorProto.INT8: "i8",
                TensorProto.INT16: "i16",
                TensorProto.INT32: "i32",
                TensorProto.UINT32: "u32",
            }[tt],
            M=dim[0],
            N=dim[1],
            K=dim[2],
            # proxy for the amount of computation: 4 MatMuls of size MxKxN
            cost=numpy.prod(dim) * 4,
            cost_s=f"{numpy.prod(dim) * 4}-{dim[0]}x{dim[1]}x{dim[2]}",
            repeat=repeat,
            number=number,
            provider={"CPUExecutionProvider": "cpu", "CUDAExecutionProvider": "cuda"}[
                provider[0]
            ],
            platform=platform.processor(),
        )
    )
    data.append(obs)
    if unit_test_going() and len(data) >= 2:
        break


df = DataFrame(data)
df.to_excel("plot_bench_gemm.xlsx")
df.to_csv("plot_bench_gemm.csv")
df.drop(["min_exec", "max_exec"], axis=1).to_csv("plot_bench_gemm_.csv")
df
t=19 e=InferenceSession p=CPUE dim=(128, 128, 128): 100%|##########| 216/216 [00:15<00:00, 14.17it/s]
average deviation min_exec max_exec repeat number ttime context_size engine type M N K cost cost_s provider platform
0 0.000098 0.000017 0.000086 0.000159 50 25 0.004923 64 np f32 10 10 10 4000 4000-10x10x10 cpu x86_64
1 0.000135 0.000010 0.000129 0.000181 50 25 0.006764 64 np f32 61 62 63 953064 953064-61x62x63 cpu x86_64
2 0.000133 0.000010 0.000127 0.000193 50 25 0.006672 64 np f32 64 64 64 1048576 1048576-64x64x64 cpu x86_64
3 0.001358 0.004381 0.000415 0.031639 50 25 0.067895 64 np f32 65 66 67 1149720 1149720-65x66x67 cpu x86_64
4 0.001290 0.000601 0.000726 0.003400 50 25 0.064503 64 np f32 100 100 100 4000000 4000000-100x100x100 cpu x86_64
5 0.000941 0.000348 0.000689 0.002080 50 25 0.047035 64 np f32 128 128 128 8388608 8388608-128x128x128 cpu x86_64
6 0.000070 0.000126 0.000029 0.000710 50 25 0.003508 64 ort f32 10 10 10 4000 4000-10x10x10 cpu x86_64
7 0.000071 0.000014 0.000046 0.000091 50 25 0.003528 64 ort f32 61 62 63 953064 953064-61x62x63 cpu x86_64
8 0.000065 0.000004 0.000061 0.000076 50 25 0.003248 64 ort f32 64 64 64 1048576 1048576-64x64x64 cpu x86_64
9 0.000070 0.000018 0.000050 0.000109 50 25 0.003511 64 ort f32 65 66 67 1149720 1149720-65x66x67 cpu x86_64
10 0.000117 0.000014 0.000093 0.000169 50 25 0.005870 64 ort f32 100 100 100 4000000 4000000-100x100x100 cpu x86_64
11 0.000225 0.000025 0.000181 0.000309 50 25 0.011246 64 ort f32 128 128 128 8388608 8388608-128x128x128 cpu x86_64
12 0.000030 0.000013 0.000020 0.000054 50 25 0.001505 64 ort i32 10 10 10 4000 4000-10x10x10 cpu x86_64
13 0.000338 0.000051 0.000288 0.000547 50 25 0.016887 64 ort i32 61 62 63 953064 953064-61x62x63 cpu x86_64
14 0.000344 0.000039 0.000315 0.000544 50 25 0.017199 64 ort i32 64 64 64 1048576 1048576-64x64x64 cpu x86_64
15 0.000388 0.000046 0.000347 0.000573 50 25 0.019423 64 ort i32 65 66 67 1149720 1149720-65x66x67 cpu x86_64
16 0.001401 0.000324 0.001140 0.002769 50 25 0.070059 64 ort i32 100 100 100 4000000 4000000-100x100x100 cpu x86_64
17 0.003157 0.000791 0.002371 0.005002 50 25 0.157834 64 ort i32 128 128 128 8388608 8388608-128x128x128 cpu x86_64
18 0.000152 0.000030 0.000126 0.000307 50 25 0.007576 64 np f16 10 10 10 4000 4000-10x10x10 cpu x86_64
19 0.007516 0.000252 0.007263 0.007768 2 2 0.015031 64 np f16 61 62 63 953064 953064-61x62x63 cpu x86_64
20 0.008402 0.000310 0.008092 0.008712 2 2 0.016804 64 np f16 64 64 64 1048576 1048576-64x64x64 cpu x86_64
21 0.008053 0.000081 0.007972 0.008134 2 2 0.016106 64 np f16 65 66 67 1149720 1149720-65x66x67 cpu x86_64
22 0.029565 0.000179 0.029386 0.029744 2 2 0.059130 64 np f16 100 100 100 4000000 4000000-100x100x100 cpu x86_64
23 0.058163 0.000116 0.058046 0.058279 2 2 0.116326 64 np f16 128 128 128 8388608 8388608-128x128x128 cpu x86_64
24 0.000023 0.000008 0.000017 0.000043 50 25 0.001164 64 ort f16 10 10 10 4000 4000-10x10x10 cpu x86_64
25 0.000105 0.000015 0.000069 0.000137 50 25 0.005261 64 ort f16 61 62 63 953064 953064-61x62x63 cpu x86_64
26 0.000113 0.000027 0.000077 0.000217 50 25 0.005674 64 ort f16 64 64 64 1048576 1048576-64x64x64 cpu x86_64
27 0.000110 0.000024 0.000082 0.000168 50 25 0.005515 64 ort f16 65 66 67 1149720 1149720-65x66x67 cpu x86_64
28 0.000229 0.000028 0.000168 0.000298 50 25 0.011461 64 ort f16 100 100 100 4000000 4000000-100x100x100 cpu x86_64
29 0.000334 0.000050 0.000237 0.000469 50 25 0.016705 64 ort f16 128 128 128 8388608 8388608-128x128x128 cpu x86_64


The errors show which configurations onnxruntime rejected.

for e in list(sorted(set(map(str, errors)))):
    print(e)
[ONNXRuntimeError] : 10 : INVALID_GRAPH : This is an invalid model. Type Error: Type 'tensor(float8e4m3fn)' of input parameter (A) of operator (MatMul) in node () is invalid.
[ONNXRuntimeError] : 10 : INVALID_GRAPH : This is an invalid model. Type Error: Type 'tensor(float8e5m2)' of input parameter (A) of operator (MatMul) in node () is invalid.
[ONNXRuntimeError] : 10 : INVALID_GRAPH : This is an invalid model. Type Error: Type 'tensor(int16)' of input parameter (A) of operator (MatMul) in node () is invalid.
[ONNXRuntimeError] : 10 : INVALID_GRAPH : This is an invalid model. Type Error: Type 'tensor(int8)' of input parameter (A) of operator (MatMul) in node () is invalid.
[ONNXRuntimeError] : 9 : NOT_IMPLEMENTED : Could not find an implementation for Add(14) node with name ''
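
The types which did produce measurements can be listed directly from the dataframe:

print(sorted(set(df["type"])))  # ['f16', 'f32', 'i32'] on this run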

Plots#

piv = pivot_table(
    df, index=["cost"], columns=["engine", "type", "provider"], values="average"
)
piv.reset_index(drop=False).to_excel("plot_bench_gemm_summary.xlsx")
piv.reset_index(drop=False).to_csv("plot_bench_gemm_summary.csv")
print(piv)
piv
engine          np                 ort
type           f16       f32       f16       f32       i32
provider       cpu       cpu       cpu       cpu       cpu
cost
4000      0.000152  0.000098  0.000023  0.000070  0.000030
953064    0.007516  0.000135  0.000105  0.000071  0.000338
1048576   0.008402  0.000133  0.000113  0.000065  0.000344
1149720   0.008053  0.001358  0.000110  0.000070  0.000388
4000000   0.029565  0.001290  0.000229  0.000117  0.001401
8388608   0.058163  0.000941  0.000334  0.000225  0.003157


The same summary with the dimensions spelled out.

pivs = pivot_table(
    df, index=["cost_s"], columns=["engine", "type", "provider"], values="average"
)
print(pivs)
engine                     np                 ort
type                      f16       f32       f16       f32       i32
provider                  cpu       cpu       cpu       cpu       cpu
cost_s
1048576-64x64x64     0.008402  0.000133  0.000113  0.000065  0.000344
1149720-65x66x67     0.008053  0.001358  0.000110  0.000070  0.000388
4000-10x10x10        0.000152  0.000098  0.000023  0.000070  0.000030
4000000-100x100x100  0.029565  0.001290  0.000229  0.000117  0.001401
8388608-128x128x128  0.058163  0.000941  0.000334  0.000225  0.003157
953064-61x62x63      0.007516  0.000135  0.000105  0.000071  0.000338
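
The pivot also makes it easy to quantify the gap between two configurations, for instance how much faster onnxruntime is than the reference evaluator on float16 (a sketch assuming both columns are present):

ratio = pivs[("np", "f16", "cpu")] / pivs[("ort", "f16", "cpu")]
print(ratio)  # values > 1 mean onnxruntime is faster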

Let’s plot the results.

dfi = df[
    df.type.isin({"f32", "f16", "bf16", "f8e4m3", "f8e5m2"}) & df.engine.isin({"ort"})
]
pivi = pivot_table(
    dfi, index=["cost"], columns=["engine", "type", "provider"], values="average"
)

fig, ax = plt.subplots(1, 2, figsize=(12, 6))
piv.plot(ax=ax[0], title="Gemm performance\nlower is better", logx=True, logy=True)
if pivi.shape[0] > 0:
    pivi.plot(
        ax=ax[1],
        title=f"Gemm performance ORT\n{platform.processor()}",
        logx=True,
        logy=True,
    )
fig.tight_layout()
fig.savefig("plot_bench_gemm.png")
[Figure: left, "Gemm performance, lower is better"; right, "Gemm performance ORT" on x86_64]

Total running time of the script: ( 0 minutes 16.194 seconds)
