Measuring Gemm performance with onnxruntime

The benchmark measures the performance of Gemm for different element types and configurations. It includes a custom operator, available only on CUDA, that calls the function cublasLtMatmul. This function offers many options.

import pprint
import platform
from itertools import product
import numpy
from tqdm import tqdm
import matplotlib.pyplot as plt
from pandas import DataFrame, pivot_table
from onnx import TensorProto
from onnx.helper import (
    make_model,
    make_node,
    make_graph,
    make_tensor_value_info,
    make_opsetid,
)
from onnx.checker import check_model
from onnx.numpy_helper import from_array
from onnx.reference import ReferenceEvaluator
from onnxruntime import InferenceSession, SessionOptions, get_available_providers
from onnxruntime.capi._pybind_state import (
    OrtValue as C_OrtValue,
    OrtDevice as C_OrtDevice,
)
from onnxruntime.capi.onnxruntime_pybind11_state import (
    Fail,
    NotImplemented,
    InvalidGraph,
    InvalidArgument,
)

try:
    from onnx_array_api.plotting.text_plot import onnx_simple_text_plot
except ImportError:
    onnx_simple_text_plot = str
try:
    from onnx_extended.reference import CReferenceEvaluator
except ImportError:
    CReferenceEvaluator = ReferenceEvaluator
from onnx_extended.args import get_parsed_args
from onnx_extended.ext_test_case import unit_test_going, measure_time

try:
    from onnx_extended.validation.cuda.cuda_example_py import get_device_prop
    from onnx_extended.ortops.tutorial.cuda import get_ort_ext_libs

    has_cuda = True
except ImportError:

    def get_device_prop():
        return {"name": "CPU"}

    def get_ort_ext_libs():
        return None

    has_cuda = False

default_dims = (
    "32,32,32;64,64,64;128,128,128;256,256,256;"
    "400,400,400;512,512,512;1024,1024,1024"
)
if has_cuda:
    prop = get_device_prop()
    if prop.get("major", 0) >= 7:
        default_dims += ";2048,2048,2048;4096,4096,4096"
    if prop.get("major", 0) >= 9:
        default_dims += ";16384,16384,16384"


script_args = get_parsed_args(
    "plot_bench_gemm_ort",
    description=__doc__,
    dims=(
        "32,32,32;64,64,64" if unit_test_going() else default_dims,
        "square matrix dimensions to try, comma separated values",
    ),
    types=(
        "FLOAT" if unit_test_going() else "FLOAT8E4M3FN,FLOAT,FLOAT16,BFLOAT16",
        "element type to teest",
    ),
    number=2 if unit_test_going() else 4,
    repeat=2 if unit_test_going() else 10,
    warmup=2 if unit_test_going() else 5,
    expose="repeat,number,warmup",
)

Device properties

if has_cuda:
    properties = get_device_prop()
    pprint.pprint(properties)
else:
    properties = {"major": 0}
{'clockRate': 1569000,
 'computeMode': 0,
 'concurrentKernels': 1,
 'isMultiGpuBoard': 0,
 'major': 6,
 'maxThreadsPerBlock': 1024,
 'minor': 1,
 'multiProcessorCount': 10,
 'name': 'NVIDIA GeForce GTX 1060',
 'sharedMemPerBlock': 49152,
 'totalConstMem': 65536,
 'totalGlobalMem': 6442319872}
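
The compute capability (major, minor) reported above matters: the benchmark later skips float 8 configurations when major is below 9. A minimal sketch of that check:

if has_cuda and properties.get("major", 0) >= 9:
    print("float 8 is available on this device")
else:
    print("float 8 is not available, those configurations will be skipped")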

Model to benchmark

It includes one Gemm. The operator changes: it can be the regular Gemm, a custom GemmFloat8 from domain com.microsoft, or a custom implementation from domain onnx_extended.ortops.tutorial.cuda.

def create_model(
    mat_type=TensorProto.FLOAT, provider="CUDAExecutionProvider", domain="com.microsoft"
):
    A = make_tensor_value_info("A", mat_type, [None, None])
    B = make_tensor_value_info("B", mat_type, [None, None])
    outputs = [make_tensor_value_info("C", mat_type, [None, None])]
    inits = []
    if domain != "":
        if provider != "CUDAExecutionProvider":
            return None
        f8 = False
        if domain == "com.microsoft":
            op_name = "GemmFloat8"
            computeType = "CUBLAS_COMPUTE_32F"
            node_output = ["C"]
        elif mat_type == TensorProto.FLOAT:
            op_name = "CustomGemmFloat"
            computeType = "CUBLAS_COMPUTE_32F_FAST_TF32"
            node_output = ["C"]
        elif mat_type == TensorProto.FLOAT16:
            op_name = "CustomGemmFloat16"
            computeType = "CUBLAS_COMPUTE_16F"
            node_output = ["C"]
        elif mat_type in (TensorProto.FLOAT8E4M3FN, TensorProto.FLOAT8E5M2):
            f8 = True
            op_name = "CustomGemmFloat8E4M3FN"
            computeType = "CUBLAS_COMPUTE_32F"
            node_output = ["C"]
            outputs = [
                make_tensor_value_info("C", TensorProto.FLOAT16, [None, None]),
            ]
            inits.append(from_array(numpy.array([1], dtype=numpy.float32), name="I"))
        else:
            return None
        node_kw = dict(
            alpha=1.0,
            transB=1,
            domain=domain,
            computeType=computeType,
            fastAccumulationMode=1,
            rowMajor=0 if op_name.startswith("CustomGemmFloat") else 1,
        )
        node_kw["name"] = (
            f"{mat_type}.{len(node_output)}.{len(outputs)}."
            f"{domain}..{node_kw['rowMajor']}.."
            f"{node_kw['fastAccumulationMode']}..{node_kw['computeType']}.."
            f"{f8}"
        )
        node_inputs = ["A", "B"]
        if f8:
            node_inputs.append("")
            node_inputs.extend(["I"] * 3)
        nodes = [make_node(op_name, node_inputs, node_output, **node_kw)]
    else:
        nodes = [
            make_node("Gemm", ["A", "B"], ["C"], transA=1, beta=0.0),
        ]
    graph = make_graph(nodes, "a", [A, B], outputs, inits)
    if mat_type < 16:
        # regular type
        opset, ir = 18, 8
    else:
        opset, ir = 19, 9
    onnx_model = make_model(
        graph,
        opset_imports=[
            make_opsetid("", opset),
            make_opsetid("com.microsoft", 1),
            make_opsetid("onnx_extended.ortops.tutorial.cuda", 1),
        ],
        ir_version=ir,
    )
    check_model(onnx_model)
    return onnx_model


print(onnx_simple_text_plot(create_model()))
opset: domain='' version=18
opset: domain='com.microsoft' version=1
opset: domain='onnx_extended.ortops.tutorial.cuda' version=1
input: name='A' type=dtype('float32') shape=['', '']
input: name='B' type=dtype('float32') shape=['', '']
GemmFloat8[com.microsoft](A, B, alpha=1.00, computeType=b'CUBLAS_COMPUTE_32F', fastAccumulationMode=1, rowMajor=1, transB=1) -> C
output: name='C' type=dtype('float32') shape=['', '']
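
As a quick sanity check, here is a minimal sketch (not part of the benchmark) verifying the plain Gemm model with the reference evaluator. Since transA=1, A is expected with shape (K, M).

onx_gemm = create_model(TensorProto.FLOAT, provider="CPUExecutionProvider", domain="")
ref = CReferenceEvaluator(onx_gemm)
a = numpy.random.randn(4, 3).astype(numpy.float32)  # A has shape (K, M)
b = numpy.random.randn(4, 5).astype(numpy.float32)  # B has shape (K, N)
got = ref.run(None, {"A": a, "B": b})[0]
assert numpy.allclose(got, a.T @ b, atol=1e-5)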

A model to cast into any type. numpy does not support float 8, so onnxruntime is used to cast a float array into any other type. It must be called with tensors of type OrtValue.

def create_cast(to, cuda=False):
    A = make_tensor_value_info("A", TensorProto.FLOAT, [None, None])
    C = make_tensor_value_info("C", to, [None, None])
    if cuda:
        nodes = [
            make_node("Cast", ["A"], ["Cc"], to=to),
            make_node("MemcpyFromHost", ["Cc"], ["C"]),
        ]
    else:
        nodes = [make_node("Cast", ["A"], ["C"], to=to)]
    graph = make_graph(nodes, "a", [A], [C])
    if to < 16:
        # regular type
        opset, ir = 18, 8
    else:
        opset, ir = 19, 9
    onnx_model = make_model(
        graph, opset_imports=[make_opsetid("", opset)], ir_version=ir
    )
    if not cuda:
        # check_model does not recognize the OpType MemcpyFromHost,
        # so the model is only checked in the CPU case
        check_model(onnx_model)
    return onnx_model


print(onnx_simple_text_plot(create_cast(TensorProto.FLOAT16)))
opset: domain='' version=18
input: name='A' type=dtype('float32') shape=['', '']
Cast(A, to=10) -> C
output: name='C' type=dtype('float16') shape=['', '']
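
A minimal usage sketch, assuming a CPU-only run: wrap a numpy array into an OrtValue and feed it to the cast model.

sess_cast = InferenceSession(
    create_cast(TensorProto.FLOAT16).SerializeToString(),
    providers=["CPUExecutionProvider"],
)
a = numpy.random.randn(4, 4).astype(numpy.float32)
device = C_OrtDevice(C_OrtDevice.cpu(), C_OrtDevice.default_memory(), 0)
ov = C_OrtValue.ortvalue_from_numpy(a, device)
c16 = sess_cast._sess.run_with_ort_values({"A": ov}, ["C"], None)[0]
print(c16.numpy().dtype)  # float16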

Performance

The benchmark will run the following configurations.

types = list(getattr(TensorProto, a) for a in script_args.types.split(","))
engine = [InferenceSession, CReferenceEvaluator]
providers = [
    ["CUDAExecutionProvider", "CPUExecutionProvider"],
    ["CPUExecutionProvider"],
]
# M, N, K
# we use multiples of 8, otherwise float8 does not work.
dims = [list(int(i) for i in line.split(",")) for line in script_args.dims.split(";")]
domains = ["onnx_extended.ortops.tutorial.cuda", "", "com.microsoft"]
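
The benchmark loop below iterates over the cartesian product of these lists. A quick count of the configurations to try:

n_confs = len(types) * len(engine) * len(providers) * len(dims) * len(domains)
print(f"{n_confs} configurations")  # 336 with the default arguments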

Let’s cache the matrices involved.

def to_ort_value(m):
    device = C_OrtDevice(C_OrtDevice.cpu(), C_OrtDevice.default_memory(), 0)
    ort_value = C_OrtValue.ortvalue_from_numpy(m, device)
    return ort_value


def cached_inputs(dims, types):
    matrices = {}
    matrices_cuda = {}
    pbar = tqdm(list(product(dims, types)))
    for dim, tt in pbar:
        m, n, k = dim
        pbar.set_description(f"t={tt} dim={dim}")
        for i, j in [(m, k), (k, n), (k, m)]:
            if (tt, i, j) in matrices:
                continue
            # CPU
            try:
                sess = InferenceSession(
                    create_cast(tt).SerializeToString(),
                    providers=["CPUExecutionProvider"],
                )
                cpu = True
            except (InvalidGraph, InvalidArgument, NotImplemented):
                # not support by this version of onnxruntime
                cpu = False

            if cpu:
                vect = (numpy.random.randn(i, j) * 10).astype(numpy.float32)
                ov = to_ort_value(vect)
                ovtt = sess._sess.run_with_ort_values({"A": ov}, ["C"], None)[0]
                matrices[tt, i, j] = ovtt
            else:
                continue

            # CUDA
            if "CUDAExecutionProvider" not in get_available_providers():
                # No CUDA
                continue
            sess = InferenceSession(
                create_cast(tt, cuda=True).SerializeToString(),
                providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
            )
            vect = (numpy.random.randn(i, j) * 10).astype(numpy.float32)
            ov = to_ort_value(vect)
            ovtt = sess._sess.run_with_ort_values({"A": ov}, ["C"], None)[0]
            matrices_cuda[tt, i, j] = ovtt
    return matrices, matrices_cuda


matrices, matrices_cuda = cached_inputs(dims, types)
print(f"{len(matrices)} matrices were created.")
t=16 dim=[1024, 1024, 1024]: 100%|██████████| 28/28 [00:07<00:00,  3.72it/s]
28 matrices were created.
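
Each cached entry is indexed by (element type, rows, columns). A minimal sketch inspecting one of them:

key = (TensorProto.FLOAT, 32, 32)
if key in matrices:
    print(type(matrices[key]), matrices[key].numpy().shape)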

Let’s run the benchmark.

def rendering_obs(obs, dim, number, repeat, domain, provider, internal_time):
    # tt and engine are read from the enclosing benchmark loop
    stype = {
        TensorProto.FLOAT: "f32",
        TensorProto.FLOAT16: "f16",
        TensorProto.BFLOAT16: "bf16",
        TensorProto.INT8: "i8",
        TensorProto.INT16: "i16",
        TensorProto.INT32: "i32",
        TensorProto.UINT32: "u32",
        TensorProto.FLOAT8E4M3FN: "e4m3fn",
        TensorProto.FLOAT8E5M2: "e5m2",
    }[tt]
    obs.update(
        dict(
            engine={"InferenceSession": "ort", "CReferenceEvaluator": "np"}[
                engine.__name__
            ],
            stype=stype,
            type=f"{stype}",
            M=dim[0],
            N=dim[1],
            K=dim[2],
            cost=numpy.prod(dim) * 4,
            cost_s=f"{numpy.prod(dim) * 4}-{dim[0]}x{dim[1]}x{dim[2]}",
            repeat=repeat,
            number=number,
            domain={
                "": "ORT",
                "com.microsoft": "COM",
                "onnx_extended.ortops.tutorial.cuda": "EXT",
            }[domain],
            provider={
                "CPUExecutionProvider": "cpu",
                "CUDAExecutionProvider": "cuda",
            }[provider[0]],
            platform=platform.processor(),
            intime=internal_time,
        )
    )
    return obs


opts = SessionOptions()
r = get_ort_ext_libs()
if r is not None:
    # register the custom operators implemented in onnx_extended
    opts.register_custom_ops_library(r[0])


data = []
errors = []
pbar = tqdm(list(product(types, engine, providers, dims, domains)))
for tt, engine, provider, dim, domain in pbar:
    if (
        tt in {TensorProto.FLOAT8E4M3FN, TensorProto.FLOAT8E5M2}
        and properties.get("major", 0) < 9
    ):
        # f8 not available
        if provider[0] == "CPUExecutionProvider":
            continue
        errors.append(
            f"f8 not available, major={properties.get('major', 0)}, "
            f"tt={tt}, provider={provider!r}, domain={domain!r}."
        )
        continue
    elif provider[0] == "CPUExecutionProvider" and max(dim) > 2000:
        # too long
        continue
    if max(dim) <= 200:
        repeat, number = script_args.repeat * 4, script_args.number * 4
    elif max(dim) <= 256:
        repeat, number = script_args.repeat * 2, script_args.number * 2
    else:
        repeat, number = script_args.repeat, script_args.number

    onx = create_model(tt, provider=provider[0], domain=domain)
    if onx is None:
        if provider[0] == "CPUExecutionProvider":
            continue
        errors.append(
            f"No model for tt={tt}, provider={provider!r}, domain={domain!r}."
        )
        continue
    with open(f"plot_bench_gemm_ort_{tt}_{domain}.onnx", "wb") as f:
        f.write(onx.SerializeToString())
    k1 = (tt, dim[2], dim[0])
    k2 = (tt, dim[2], dim[1])
    if k1 not in matrices:
        errors.append(f"Key k1={k1!r} not in matrices.")
        continue
    if k2 not in matrices:
        errors.append(f"Key k2={k2!r} not in matrices.")
        continue

    pbar.set_description(f"t={tt} e={engine.__name__} p={provider[0][:4]} dim={dim}")

    if engine == CReferenceEvaluator:
        if (
            domain != ""
            or max(dim) > 256
            or provider != ["CPUExecutionProvider"]
            or tt not in [TensorProto.FLOAT, TensorProto.FLOAT16]
        ):
            # All impossible or slow cases.
            continue
        if tt == TensorProto.FLOAT16 and max(dim) > 50:
            repeat, number = 2, 2

        feeds = {"A": matrices[k1].numpy(), "B": matrices[k2].numpy()}
        sess = engine(onx)
        sess.run(None, feeds)
        obs = measure_time(lambda: sess.run(None, feeds), repeat=repeat, number=number)
        # the reference evaluator does not report an internal time
        internal_time = None

    elif engine == InferenceSession:
        if provider[0] not in get_available_providers():
            errors.append(f"provider={provider[0]} is missing")
            continue
        try:
            sess = engine(onx.SerializeToString(), opts, providers=provider)
        except (NotImplemented, InvalidGraph, Fail) as e:
            # not implemented
            errors.append((tt, engine.__class__.__name__, provider, domain, e))
            continue

        the_feeds = (
            {"A": matrices[k1], "B": matrices[k2]}
            if provider == ["CPUExecutionProvider"]
            else {"A": matrices_cuda[k1], "B": matrices_cuda[k2]}
        )
        out_names = ["C"]

        # warmup
        for i in range(script_args.warmup):
            sess._sess.run_with_ort_values(the_feeds, out_names, None)[0]

        # benchmark
        times = []

        def fct_benchmarked():
            got = sess._sess.run_with_ort_values(the_feeds, out_names, None)
            if len(got) > 1:
                times.append(got[1])

        obs = measure_time(fct_benchmarked, repeat=repeat, number=number)
        internal_time = None
        if times:
            np_times = [t.numpy() for t in times]
            internal_time = (sum(np_times) / len(times))[0]

    else:
        errors.append(f"unknown engine={engine}")
        continue

    # improves the rendering
    obs = rendering_obs(obs, dim, number, repeat, domain, provider, internal_time)
    data.append(obs)
    if unit_test_going() and len(data) >= 2:
        break
t=16 e=CReferenceEvaluator p=CPUE dim=[1024, 1024, 1024]: 100%|██████████| 336/336 [01:39<00:00,  3.39it/s]

Results

df = DataFrame(data)
df.to_excel("plot_bench_gemm_ort.xlsx")
# drop redundant columns before writing the csv
df.drop(["min_exec", "max_exec", "cost_s", "cost"], axis=1).to_csv(
    "plot_bench_gemm_ort.csv", index=False
)
print(df.head().T)
df
                            0  ...                    4
average              0.004862  ...             0.005088
deviation            0.000222  ...             0.000096
min_exec              0.00444  ...             0.004951
max_exec             0.005432  ...             0.005407
repeat                     40  ...                   40
number                     16  ...                   16
ttime                0.194488  ...             0.203515
context_size               64  ...                   64
warmup_time          0.005957  ...             0.005067
engine                    ort  ...                  ort
stype                     f32  ...                  f32
type                      f32  ...                  f32
M                          32  ...                  128
N                          32  ...                  128
K                          32  ...                  128
cost                   131072  ...              8388608
cost_s        131072-32x32x32  ...  8388608-128x128x128
domain                    EXT  ...                  EXT
provider                 cuda  ...                 cuda
platform               x86_64  ...               x86_64
intime                   None  ...                 None

[21 rows x 5 columns]
average deviation min_exec max_exec repeat number ttime context_size warmup_time engine stype type M N K cost cost_s domain provider platform intime
0 0.004862 0.000222 0.004440 0.005432 40 16 0.194488 64 0.005957 ort f32 f32 32 32 32 131072 131072-32x32x32 EXT cuda x86_64 None
1 0.000202 0.000043 0.000163 0.000330 40 16 0.008098 64 0.000348 ort f32 f32 32 32 32 131072 131072-32x32x32 ORT cuda x86_64 None
2 0.004908 0.000142 0.004561 0.005192 40 16 0.196323 64 0.005162 ort f32 f32 64 64 64 1048576 1048576-64x64x64 EXT cuda x86_64 None
3 0.000314 0.000061 0.000226 0.000434 40 16 0.012554 64 0.000471 ort f32 f32 64 64 64 1048576 1048576-64x64x64 ORT cuda x86_64 None
4 0.005088 0.000096 0.004951 0.005407 40 16 0.203515 64 0.005067 ort f32 f32 128 128 128 8388608 8388608-128x128x128 EXT cuda x86_64 None
5 0.000498 0.000040 0.000449 0.000606 40 16 0.019934 64 0.000611 ort f32 f32 128 128 128 8388608 8388608-128x128x128 ORT cuda x86_64 None
6 0.006411 0.000148 0.006153 0.006726 20 8 0.128221 64 0.006471 ort f32 f32 256 256 256 67108864 67108864-256x256x256 EXT cuda x86_64 None
7 0.001798 0.000051 0.001740 0.001974 20 8 0.035954 64 0.001767 ort f32 f32 256 256 256 67108864 67108864-256x256x256 ORT cuda x86_64 None
8 0.008772 0.000087 0.008654 0.008911 10 4 0.087717 64 0.009133 ort f32 f32 400 400 400 256000000 256000000-400x400x400 EXT cuda x86_64 None
9 0.004143 0.000059 0.004052 0.004261 10 4 0.041428 64 0.004039 ort f32 f32 400 400 400 256000000 256000000-400x400x400 ORT cuda x86_64 None
10 0.011558 0.000274 0.011144 0.011863 10 4 0.115578 64 0.011260 ort f32 f32 512 512 512 536870912 536870912-512x512x512 EXT cuda x86_64 None
11 0.006459 0.000040 0.006398 0.006557 10 4 0.064586 64 0.006457 ort f32 f32 512 512 512 536870912 536870912-512x512x512 ORT cuda x86_64 None
12 0.035488 0.001814 0.034709 0.040913 10 4 0.354881 64 0.034331 ort f32 f32 1024 1024 1024 4294967296 4294967296-1024x1024x1024 EXT cuda x86_64 None
13 0.028698 0.000056 0.028598 0.028784 10 4 0.286978 64 0.028714 ort f32 f32 1024 1024 1024 4294967296 4294967296-1024x1024x1024 ORT cuda x86_64 None
14 0.000015 0.000003 0.000014 0.000029 40 16 0.000619 64 0.000036 ort f32 f32 32 32 32 131072 131072-32x32x32 ORT cpu x86_64 None
15 0.000024 0.000016 0.000018 0.000121 40 16 0.000953 64 0.000035 ort f32 f32 64 64 64 1048576 1048576-64x64x64 ORT cpu x86_64 None
16 0.000052 0.000010 0.000042 0.000098 40 16 0.002088 64 0.000069 ort f32 f32 128 128 128 8388608 8388608-128x128x128 ORT cpu x86_64 None
17 0.000235 0.000072 0.000141 0.000404 20 8 0.004704 64 0.000140 ort f32 f32 256 256 256 67108864 67108864-256x256x256 ORT cpu x86_64 None
18 0.000613 0.000127 0.000542 0.000981 10 4 0.006130 64 0.000562 ort f32 f32 400 400 400 256000000 256000000-400x400x400 ORT cpu x86_64 None
19 0.001731 0.000308 0.001110 0.002101 10 4 0.017311 64 0.001904 ort f32 f32 512 512 512 536870912 536870912-512x512x512 ORT cpu x86_64 None
20 0.014278 0.001286 0.011839 0.016021 10 4 0.142777 64 0.014883 ort f32 f32 1024 1024 1024 4294967296 4294967296-1024x1024x1024 ORT cpu x86_64 None
21 0.000030 0.000004 0.000027 0.000046 40 16 0.001203 64 0.000092 np f32 f32 32 32 32 131072 131072-32x32x32 ORT cpu x86_64 None
22 0.000042 0.000005 0.000037 0.000060 40 16 0.001694 64 0.000073 np f32 f32 64 64 64 1048576 1048576-64x64x64 ORT cpu x86_64 None
23 0.000159 0.000110 0.000098 0.000571 40 16 0.006344 64 0.000184 np f32 f32 128 128 128 8388608 8388608-128x128x128 ORT cpu x86_64 None
24 0.000437 0.000273 0.000297 0.001481 20 8 0.008741 64 0.000323 np f32 f32 256 256 256 67108864 67108864-256x256x256 ORT cpu x86_64 None
25 0.007856 0.000251 0.007565 0.008824 40 16 0.314227 64 0.009715 ort f16 f16 32 32 32 131072 131072-32x32x32 EXT cuda x86_64 None
26 0.000222 0.000020 0.000211 0.000305 40 16 0.008865 64 0.000409 ort f16 f16 32 32 32 131072 131072-32x32x32 ORT cuda x86_64 None
27 0.011283 0.000376 0.010994 0.013262 40 16 0.451320 64 0.011636 ort f16 f16 64 64 64 1048576 1048576-64x64x64 EXT cuda x86_64 None
28 0.000329 0.000064 0.000247 0.000506 40 16 0.013162 64 0.000812 ort f16 f16 64 64 64 1048576 1048576-64x64x64 ORT cuda x86_64 None
29 0.014920 0.001437 0.012459 0.018327 40 16 0.596787 64 0.011739 ort f16 f16 128 128 128 8388608 8388608-128x128x128 EXT cuda x86_64 None
30 0.002651 0.000723 0.001910 0.004929 40 16 0.106051 64 0.000982 ort f16 f16 128 128 128 8388608 8388608-128x128x128 ORT cuda x86_64 None
31 0.018532 0.000847 0.017634 0.020636 20 8 0.370649 64 0.019084 ort f16 f16 256 256 256 67108864 67108864-256x256x256 EXT cuda x86_64 None
32 0.001357 0.000066 0.001231 0.001524 20 8 0.027148 64 0.001631 ort f16 f16 256 256 256 67108864 67108864-256x256x256 ORT cuda x86_64 None
33 0.072559 0.002372 0.070388 0.078199 10 4 0.725589 64 0.083750 ort f16 f16 400 400 400 256000000 256000000-400x400x400 EXT cuda x86_64 None
34 0.002719 0.000087 0.002610 0.002920 10 4 0.027190 64 0.003392 ort f16 f16 400 400 400 256000000 256000000-400x400x400 ORT cuda x86_64 None
35 0.117292 0.004891 0.109007 0.125841 10 4 1.172916 64 0.106625 ort f16 f16 512 512 512 536870912 536870912-512x512x512 EXT cuda x86_64 None
36 0.006512 0.001233 0.005292 0.008907 10 4 0.065117 64 0.005234 ort f16 f16 512 512 512 536870912 536870912-512x512x512 ORT cuda x86_64 None
37 0.740776 0.027270 0.705770 0.777931 10 4 7.407756 64 0.778124 ort f16 f16 1024 1024 1024 4294967296 4294967296-1024x1024x1024 EXT cuda x86_64 None
38 0.022766 0.001268 0.021628 0.025612 10 4 0.227664 64 0.022530 ort f16 f16 1024 1024 1024 4294967296 4294967296-1024x1024x1024 ORT cuda x86_64 None
39 0.000081 0.000043 0.000037 0.000273 40 16 0.003258 64 0.000071 ort f16 f16 32 32 32 131072 131072-32x32x32 ORT cpu x86_64 None
40 0.000103 0.000048 0.000047 0.000245 40 16 0.004136 64 0.000129 ort f16 f16 64 64 64 1048576 1048576-64x64x64 ORT cpu x86_64 None
41 0.000409 0.000269 0.000143 0.001698 40 16 0.016347 64 0.000231 ort f16 f16 128 128 128 8388608 8388608-128x128x128 ORT cpu x86_64 None
42 0.001660 0.000446 0.000975 0.003041 20 8 0.033190 64 0.001496 ort f16 f16 256 256 256 67108864 67108864-256x256x256 ORT cpu x86_64 None
43 0.003915 0.000918 0.002479 0.005700 10 4 0.039147 64 0.002230 ort f16 f16 400 400 400 256000000 256000000-400x400x400 ORT cpu x86_64 None
44 0.008509 0.003019 0.005928 0.016634 10 4 0.085091 64 0.003873 ort f16 f16 512 512 512 536870912 536870912-512x512x512 ORT cpu x86_64 None
45 0.040063 0.006035 0.033173 0.054376 10 4 0.400627 64 0.032412 ort f16 f16 1024 1024 1024 4294967296 4294967296-1024x1024x1024 ORT cpu x86_64 None
46 0.000427 0.000098 0.000290 0.000644 40 16 0.017086 64 0.000427 np f16 f16 32 32 32 131072 131072-32x32x32 ORT cpu x86_64 None
47 0.002866 0.000183 0.002683 0.003048 2 2 0.005732 64 0.003207 np f16 f16 64 64 64 1048576 1048576-64x64x64 ORT cpu x86_64 None
48 0.023623 0.000980 0.022643 0.024603 2 2 0.047247 64 0.027861 np f16 f16 128 128 128 8388608 8388608-128x128x128 ORT cpu x86_64 None
49 0.170814 0.017384 0.153430 0.188198 2 2 0.341627 64 0.159457 np f16 f16 256 256 256 67108864 67108864-256x256x256 ORT cpu x86_64 None
50 0.001095 0.000372 0.000492 0.002160 40 16 0.043810 64 0.000921 ort bf16 bf16 32 32 32 131072 131072-32x32x32 ORT cuda x86_64 None
51 0.001069 0.000304 0.000591 0.002340 40 16 0.042763 64 0.001148 ort bf16 bf16 64 64 64 1048576 1048576-64x64x64 ORT cuda x86_64 None
52 0.001162 0.000224 0.000668 0.001856 40 16 0.046492 64 0.001012 ort bf16 bf16 128 128 128 8388608 8388608-128x128x128 ORT cuda x86_64 None
53 0.002023 0.000234 0.001561 0.002577 20 8 0.040469 64 0.001477 ort bf16 bf16 256 256 256 67108864 67108864-256x256x256 ORT cuda x86_64 None
54 0.004269 0.000247 0.003815 0.004595 10 4 0.042688 64 0.004854 ort bf16 bf16 400 400 400 256000000 256000000-400x400x400 ORT cuda x86_64 None
55 0.006163 0.000434 0.005740 0.006852 10 4 0.061632 64 0.006123 ort bf16 bf16 512 512 512 536870912 536870912-512x512x512 ORT cuda x86_64 None
56 0.027960 0.000485 0.027326 0.028606 10 4 0.279605 64 0.027493 ort bf16 bf16 1024 1024 1024 4294967296 4294967296-1024x1024x1024 ORT cuda x86_64 None
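
The full script goes on to plot these results. As a sketch, the pivot_table imported above can summarize the average time per configuration (the exact layout is an assumption, not the original script's):

piv = pivot_table(
    df, index=["cost_s"], columns=["type", "domain", "provider"], values="average"
)
print(piv)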


The errors

for i, e in enumerate(errors):
    print(f"{i+1}/{len(errors)}-{e}")
1/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extended.ortops.tutorial.cuda'.
2/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain=''.
3/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='com.microsoft'.
4/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extended.ortops.tutorial.cuda'.
5/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain=''.
6/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='com.microsoft'.
7/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extended.ortops.tutorial.cuda'.
8/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain=''.
9/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='com.microsoft'.
10/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extended.ortops.tutorial.cuda'.
11/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain=''.
12/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='com.microsoft'.
13/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extended.ortops.tutorial.cuda'.
14/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain=''.
15/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='com.microsoft'.
16/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extended.ortops.tutorial.cuda'.
17/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain=''.
18/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='com.microsoft'.
19/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extended.ortops.tutorial.cuda'.
20/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain=''.
21/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='com.microsoft'.
22/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extended.ortops.tutorial.cuda'.
23/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain=''.
24/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='com.microsoft'.
25/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extended.ortops.tutorial.cuda'.
26/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain=''.
27/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='com.microsoft'.
28/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extended.ortops.tutorial.cuda'.
29/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain=''.
30/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='com.microsoft'.
31/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extended.ortops.tutorial.cuda'.
32/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain=''.
33/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='com.microsoft'.
34/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extended.ortops.tutorial.cuda'.
35/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain=''.
36/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='com.microsoft'.
37/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extended.ortops.tutorial.cuda'.
38/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain=''.
39/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='com.microsoft'.
40/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extended.ortops.tutorial.cuda'.
41/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain=''.
42/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='com.microsoft'.
43/84-(1, 'type', ['CUDAExecutionProvider', 'CPUExecutionProvider'], 'com.microsoft', InvalidGraph('[ONNXRuntimeError] : 10 : INVALID_GRAPH : This is an invalid model. In Node, ("1.1.1.com.microsoft..1..1..CUBLAS_COMPUTE_32F..False", GemmFloat8, "com.microsoft", -1) : ("A": tensor(float),"B": tensor(float),) -> ("C": tensor(float),) , Error Unrecognized attribute: rowMajor for operator GemmFloat8'))
44/84-(1, 'type', ['CUDAExecutionProvider', 'CPUExecutionProvider'], 'com.microsoft', InvalidGraph('[ONNXRuntimeError] : 10 : INVALID_GRAPH : This is an invalid model. In Node, ("1.1.1.com.microsoft..1..1..CUBLAS_COMPUTE_32F..False", GemmFloat8, "com.microsoft", -1) : ("A": tensor(float),"B": tensor(float),) -> ("C": tensor(float),) , Error Unrecognized attribute: rowMajor for operator GemmFloat8'))
[the same InvalidGraph error repeats for 45/84-49/84 with tensor(float) and for 50/84-56/84 with tensor(float16)]
57/84-No model for tt=16, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extended.ortops.tutorial.cuda'.
58/84-(16, 'type', ['CUDAExecutionProvider', 'CPUExecutionProvider'], 'com.microsoft', InvalidGraph('[ONNXRuntimeError] : 10 : INVALID_GRAPH : This is an invalid model. In Node, ("16.1.1.com.microsoft..1..1..CUBLAS_COMPUTE_32F..False", GemmFloat8, "com.microsoft", -1) : ("A": tensor(bfloat16),"B": tensor(bfloat16),) -> ("C": tensor(bfloat16),) , Error Unrecognized attribute: rowMajor for operator GemmFloat8'))
[the two messages above alternate for 59/84-70/84]
71/84-(16, 'type', ['CPUExecutionProvider'], '', NotImplemented("[ONNXRuntimeError] : 9 : NOT_IMPLEMENTED : Could not find an implementation for Gemm(13) node with name ''"))
[the same NotImplemented error repeats for 72/84-77/84]
78/84-No model for tt=16, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extended.ortops.tutorial.cuda'.
[the same message repeats for 79/84-84/84]
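
Most of these failures are expected: this onnxruntime build's GemmFloat8 (com.microsoft) does not recognize the rowMajor attribute, the standard Gemm has no bfloat16 kernel on CPU, and no model is generated for bfloat16 with the onnx_extended custom operator. The benchmark keeps running because session creation is guarded. A minimal sketch of such a guard, reusing the error classes imported at the top (an illustration, not the script's exact loop):

from onnxruntime import InferenceSession, SessionOptions
from onnxruntime.capi.onnxruntime_pybind11_state import (
    Fail,
    NotImplemented,
    InvalidGraph,
    InvalidArgument,
)


def try_create_session(model_bytes, providers):
    # Returns (session, None) on success and (None, error) when the model
    # cannot be loaded, e.g. an unrecognized attribute or a missing kernel.
    try:
        sess = InferenceSession(model_bytes, SessionOptions(), providers=providers)
    except (Fail, NotImplemented, InvalidGraph, InvalidArgument) as e:
        return None, e
    return sess, None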

Summary

piv = pivot_table(
    df,
    index=["cost"],  # one row per computation cost
    columns=["provider", "type", "domain", "engine"],  # one column per configuration
    values=["average", "intime"],
)
piv.reset_index(drop=False).to_excel("plot_bench_gemm_ort_summary.xlsx")
piv.reset_index(drop=False).to_csv("plot_bench_gemm_ort_summary.csv")


print("summary")
print(piv)
piv
summary
             average                      ...
provider         cpu                      ...      cuda
type             f16                 f32  ...       f16       f32
domain           ORT                 ORT  ...       ORT       EXT       ORT
engine            np       ort        np  ...       ort       ort       ort
cost                                      ...
131072      0.000427  0.000081  0.000030  ...  0.000222  0.004862  0.000202
1048576     0.002866  0.000103  0.000042  ...  0.000329  0.004908  0.000314
8388608     0.023623  0.000409  0.000159  ...  0.002651  0.005088  0.000498
67108864    0.170814  0.001660  0.000437  ...  0.001357  0.006411  0.001798
256000000        NaN  0.003915       NaN  ...  0.002719  0.008772  0.004143
536870912        NaN  0.008509       NaN  ...  0.006512  0.011558  0.006459
4294967296       NaN  0.040063       NaN  ...  0.022766  0.035488  0.028698

[7 rows x 9 columns]
The same table, rendered untruncated for the trailing piv expression, shows all nine columns:

             average
provider         cpu                                    cuda
type             f16                 f32                bf16       f16                 f32
domain           ORT                 ORT                 ORT       EXT       ORT       EXT       ORT
engine            np       ort        np       ort       ort       ort       ort       ort       ort
cost
131072      0.000427  0.000081  0.000030  0.000015  0.001095  0.007856  0.000222  0.004862  0.000202
1048576     0.002866  0.000103  0.000042  0.000024  0.001069  0.011283  0.000329  0.004908  0.000314
8388608     0.023623  0.000409  0.000159  0.000052  0.001162  0.014920  0.002651  0.005088  0.000498
67108864    0.170814  0.001660  0.000437  0.000235  0.002023  0.018532  0.001357  0.006411  0.001798
256000000        NaN  0.003915       NaN  0.000613  0.004269  0.072559  0.002719  0.008772  0.004143
536870912        NaN  0.008509       NaN  0.001731  0.006163  0.117292  0.006512  0.011558  0.006459
4294967296       NaN  0.040063       NaN  0.014278  0.027960  0.740776  0.022766  0.035488  0.028698
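
The average column is the time of one Gemm, so a throughput view can be derived from the same pivot. A minimal sketch, assuming the cost index is proportional to the number of scalar operations (see the note after the next table):

avg = piv["average"]
# operations per second = cost / average time, assuming cost counts operations
ops_per_s = (1.0 / avg).mul(avg.index.to_series(), axis=0)
print((ops_per_s / 1e9).round(1))  # rough Gop/s per configuration, higher is better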


The same summary, with the matrix dimensions appended to the cost index.

pivs = pivot_table(
    df,
    index=["cost_s"],  # the cost followed by the MxNxK dimensions
    columns=["provider", "type", "domain", "engine"],
    values=["average", "intime"],
)
print(pivs)
                            average            ...
provider                        cpu            ...      cuda
type                            f16            ...       f32
domain                          ORT            ...       EXT       ORT
engine                           np       ort  ...       ort       ort
cost_s                                         ...
1048576-64x64x64           0.002866  0.000103  ...  0.004908  0.000314
131072-32x32x32            0.000427  0.000081  ...  0.004862  0.000202
256000000-400x400x400           NaN  0.003915  ...  0.008772  0.004143
4294967296-1024x1024x1024       NaN  0.040063  ...  0.035488  0.028698
536870912-512x512x512           NaN  0.008509  ...  0.011558  0.006459
67108864-256x256x256       0.170814  0.001660  ...  0.006411  0.001798
8388608-128x128x128        0.023623  0.000409  ...  0.005088  0.000498

[7 rows x 9 columns]
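
The cost_s labels expose how the cost index relates to the dimensions: every row is consistent with cost = 4 * M * N * K for an MxK times KxN product. This is an inferred reading, checked against the rows above rather than taken from the benchmark source:

# Hypothetical formula, verified against the cost_s labels above.
for m, n, k in [(32, 32, 32), (400, 400, 400), (1024, 1024, 1024)]:
    print(f"{4 * m * n * k}-{m}x{n}x{k}")
# 131072-32x32x32   256000000-400x400x400   4294967296-1024x1024x1024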

Plot

# Keep only the onnxruntime measurements (engine "ort") for the element types
# of interest, including the float 8 types when they could run.
dfi = df[
    df.type.isin({"f32", "f16", "bf16", "e4m3fn", "e5m2"}) & df.engine.isin({"ort"})
]
pivi = pivot_table(
    dfi,
    index=["cost"],
    columns=["type", "domain", "provider", "engine"],
    values="average",
)

fig, ax = plt.subplots(1, 2, figsize=(12, 6))
# Left: all engines (numpy and onnxruntime); right: onnxruntime only.
piv.plot(ax=ax[0], title="Gemm performance\nlower is better", logx=True, logy=True)
if pivi.shape[0] > 0:  # empty when no selected configuration could run
    pivi.plot(
        ax=ax[1],
        title=f"Gemm performance ORT\n{platform.processor()}",
        logx=True,
        logy=True,
    )
fig.tight_layout()
fig.savefig("plot_bench_gemm_ort.png")
[Figure: two log-log panels, "Gemm performance, lower is better" (all engines) and "Gemm performance ORT, x86_64" (onnxruntime only)]
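
The same pivot also supports quick ratios beyond the plot. A hypothetical follow-up (the column tuples follow the (type, domain, provider, engine) layout of pivi and assume both configurations ran):

cpu_col = ("f32", "ORT", "cpu", "ort")
cuda_col = ("f32", "ORT", "cuda", "ort")
if cpu_col in pivi.columns and cuda_col in pivi.columns:
    # speed-up of CUDA over CPU for float32 with the standard onnxruntime Gemm
    print((pivi[cpu_col] / pivi[cuda_col]).round(1))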

Total running time of the script: 1 minute 54.521 seconds

Gallery generated by Sphinx-Gallery