Measuring the performance of Gemm with onnxruntime#

The benchmark measures the performance of Gemm for different types and configurations. That includes a custom operator, only available on CUDA, which calls the function cublasLtMatmul. This function offers many options.

import pprint
import platform
from itertools import product
import numpy
from tqdm import tqdm
import matplotlib.pyplot as plt
from pandas import DataFrame, pivot_table
from onnx import TensorProto
from onnx.helper import (
    make_model,
    make_node,
    make_graph,
    make_tensor_value_info,
    make_opsetid,
)
from onnx.checker import check_model
from onnx.numpy_helper import from_array
from onnx.reference import ReferenceEvaluator
from onnxruntime import InferenceSession, SessionOptions, get_available_providers
from onnxruntime.capi._pybind_state import (
    OrtValue as C_OrtValue,
    OrtDevice as C_OrtDevice,
)
from onnxruntime.capi.onnxruntime_pybind11_state import (
    Fail,
    NotImplemented,
    InvalidGraph,
    InvalidArgument,
)

try:
    from onnx_array_api.plotting.text_plot import onnx_simple_text_plot
except ImportError:
    onnx_simple_text_plot = str
try:
    from onnx_extended.reference import CReferenceEvaluator
except ImportError:
    CReferenceEvaluator = ReferenceEvaluator
from onnx_extended.args import get_parsed_args
from onnx_extended.ext_test_case import unit_test_going, measure_time

try:
    # Optional CUDA helpers shipped with onnx_extended. Both imports must
    # succeed for ``has_cuda`` to be set to True.
    from onnx_extended.validation.cuda.cuda_example_py import get_device_prop
    from onnx_extended.ortops.tutorial.cuda import get_ort_ext_libs

    has_cuda = True
except ImportError:
    # Fallbacks with the same names so that the rest of the script can call
    # them without checking ``has_cuda`` first.

    def get_device_prop():
        """Returns a placeholder device description naming the CPU."""
        return {"name": "CPU"}

    def get_ort_ext_libs():
        """Returns None, no custom operator library is available."""
        return None

    has_cuda = False

# Default problem sizes: a ``;``-separated list of ``M,N,K`` triplets.
default_dims = (
    "32,32,32;64,64,64;128,128,128;256,256,256;"
    "400,400,400;512,512,512;1024,1024,1024"
)
if has_cuda:
    # Bigger problems are only appended when the device reports a high enough
    # ``major`` value (a missing key counts as 0).
    prop = get_device_prop()
    if prop.get("major", 0) >= 7:
        default_dims += ";2048,2048,2048;4096,4096,4096"
    if prop.get("major", 0) >= 9:
        default_dims += ";16384,16384,16384"


# Command line arguments of the benchmark. Every value is given as
# ``(default, help)`` or as a plain default; smaller defaults are used when
# ``unit_test_going()`` is true so that the script finishes quickly.
script_args = get_parsed_args(
    "plot_bench_gemm_ort",
    description=__doc__,
    dims=(
        "32,32,32;64,64,64" if unit_test_going() else default_dims,
        # The value is split on ';' then on ',' further below.
        "matrix dimensions to try, semicolon separated list of M,N,K triplets",
    ),
    types=(
        "FLOAT" if unit_test_going() else "FLOAT8E4M3FN,FLOAT,FLOAT16,BFLOAT16",
        "element types to test, comma separated values",
    ),
    number=2 if unit_test_going() else 4,
    repeat=2 if unit_test_going() else 10,
    warmup=2 if unit_test_going() else 5,
    expose="repeat,number,warmup",
)

Device properties#

# Print the device properties when the CUDA helpers were imported; otherwise
# fall back to a minimal dictionary so that later ``properties.get("major", 0)``
# lookups return 0.
if has_cuda:
    properties = get_device_prop()
    pprint.pprint(properties)
else:
    properties = {"major": 0}
{'clockRate': 1569000,
 'computeMode': 0,
 'concurrentKernels': 1,
 'isMultiGpuBoard': 0,
 'major': 6,
 'maxThreadsPerBlock': 1024,
 'minor': 1,
 'multiProcessorCount': 10,
 'name': 'NVIDIA GeForce GTX 1060',
 'sharedMemPerBlock': 49152,
 'totalConstMem': 65536,
 'totalGlobalMem': 6442319872}

Model to benchmark#

It includes one Gemm. The operator changes: it can be the regular Gemm, a custom Gemm from domain com.microsoft, or a custom implementation from domain onnx_extented.ortops.tutorial.cuda.

def create_model(
    mat_type=TensorProto.FLOAT, provider="CUDAExecutionProvider", domain="com.microsoft"
):
    """
    Builds an ONNX model made of a single Gemm-like node.

    :param mat_type: element type (a ``TensorProto`` type code) of
        inputs ``A`` and ``B``
    :param provider: execution provider the model is meant for, the custom
        domains are only built for ``"CUDAExecutionProvider"``
    :param domain: operator domain, ``""`` selects the standard ``Gemm``,
        ``"com.microsoft"`` selects ``GemmFloat8``, any other value selects
        one of the ``CustomGemm*`` operators based on *mat_type*
    :return: the checked model, or None when the combination of
        *mat_type*, *provider* and *domain* is not supported
    """
    input_a = make_tensor_value_info("A", mat_type, [None, None])
    input_b = make_tensor_value_info("B", mat_type, [None, None])
    graph_outputs = [make_tensor_value_info("C", mat_type, [None, None])]
    initializers = []

    if domain == "":
        # Standard ONNX operator.
        nodes = [make_node("Gemm", ["A", "B"], ["C"], transA=1, beta=0.0)]
    else:
        # Custom operators are only built for CUDA.
        if provider != "CUDAExecutionProvider":
            return None

        node_output = ["C"]
        uses_f8 = False
        if domain == "com.microsoft":
            op_name, compute_type = "GemmFloat8", "CUBLAS_COMPUTE_32F"
        else:
            # Operator name and compute type for the regular float types.
            regular = {
                TensorProto.FLOAT: (
                    "CustomGemmFloat",
                    "CUBLAS_COMPUTE_32F_FAST_TF32",
                ),
                TensorProto.FLOAT16: ("CustomGemmFloat16", "CUBLAS_COMPUTE_16F"),
            }
            if mat_type in regular:
                op_name, compute_type = regular[mat_type]
            elif mat_type in (TensorProto.FLOAT8E4M3FN, TensorProto.FLOAT8E5M2):
                uses_f8 = True
                op_name, compute_type = "CustomGemmFloat8E4M3FN", "CUBLAS_COMPUTE_32F"
                # The float 8 operator outputs float 16 and takes the
                # initializer "I" as additional inputs (see node inputs below).
                graph_outputs = [
                    make_tensor_value_info("C", TensorProto.FLOAT16, [None, None]),
                ]
                initializers.append(
                    from_array(numpy.array([1], dtype=numpy.float32), name="I")
                )
            else:
                return None

        row_major = 0 if op_name.startswith("CustomGemmFloat") else 1
        fast_accumulation = 1
        # The node name encodes the whole configuration.
        node_name = (
            f"{mat_type}.{len(node_output)}.{len(graph_outputs)}."
            f"{domain}..{row_major}.."
            f"{fast_accumulation}..{compute_type}.."
            f"{uses_f8}"
        )
        if uses_f8:
            # Third input left empty, followed by "I" three times.
            node_inputs = ["A", "B", "", "I", "I", "I"]
        else:
            node_inputs = ["A", "B"]
        nodes = [
            make_node(
                op_name,
                node_inputs,
                node_output,
                alpha=1.0,
                transB=1,
                domain=domain,
                computeType=compute_type,
                fastAccumulationMode=fast_accumulation,
                rowMajor=row_major,
                name=node_name,
            )
        ]

    graph = make_graph(nodes, "a", [input_a, input_b], graph_outputs, initializers)
    # Type codes below 16 are handled with opset 18 / IR 8, the others
    # with opset 19 / IR 9.
    opset, ir = (18, 8) if mat_type < 16 else (19, 9)
    onnx_model = make_model(
        graph,
        opset_imports=[
            make_opsetid("", opset),
            make_opsetid("com.microsoft", 1),
            make_opsetid("onnx_extented.ortops.tutorial.cuda", 1),
        ],
        ir_version=ir,
    )
    check_model(onnx_model)
    return onnx_model


# Display the model built with the default arguments of create_model.
print(onnx_simple_text_plot(create_model()))
opset: domain='' version=18
opset: domain='com.microsoft' version=1
opset: domain='onnx_extented.ortops.tutorial.cuda' version=1
input: name='A' type=dtype('float32') shape=['', '']
input: name='B' type=dtype('float32') shape=['', '']
GemmFloat8[com.microsoft](A, B, alpha=1.00, computeType=b'CUBLAS_COMPUTE_32F', fastAccumulationMode=1, rowMajor=1, transB=1) -> C
output: name='C' type=dtype('float32') shape=['', '']

A model to cast into any type. numpy does not support float 8, so onnxruntime is used to cast a float array into any type. It must be called with tensors of type OrtValue.

def create_cast(to, cuda=False):
    """
    Builds a model casting a float32 matrix ``A`` into type *to*.

    :param to: destination element type (a ``TensorProto`` type code)
    :param cuda: if True, a ``MemcpyFromHost`` node is appended after
        the ``Cast`` node and the model is not checked
    :return: the model
    """
    source = make_tensor_value_info("A", TensorProto.FLOAT, [None, None])
    target = make_tensor_value_info("C", to, [None, None])

    # With cuda, Cast writes into an intermediate result "Cc".
    cast_output = "Cc" if cuda else "C"
    nodes = [make_node("Cast", ["A"], [cast_output], to=to)]
    if cuda:
        nodes.append(make_node("MemcpyFromHost", ["Cc"], ["C"]))

    graph = make_graph(nodes, "a", [source], [target])
    # Type codes below 16 are handled with opset 18 / IR 8, the others
    # with opset 19 / IR 9.
    opset, ir = (18, 8) if to < 16 else (19, 9)
    onnx_model = make_model(
        graph, opset_imports=[make_opsetid("", opset)], ir_version=ir
    )
    if cuda:
        # The checker is skipped because of OpType MemcpyFromHost
        # (presumably not a standard ONNX operator).
        return onnx_model
    check_model(onnx_model)
    return onnx_model


# Display the model (without MemcpyFromHost) casting float32 into float16.
print(onnx_simple_text_plot(create_cast(TensorProto.FLOAT16)))
opset: domain='' version=18
input: name='A' type=dtype('float32') shape=['', '']
Cast(A, to=10) -> C
output: name='C' type=dtype('float16') shape=['', '']

Performance#

The benchmark will run the following configurations.

# Element types, resolved from their names in TensorProto.
types = [getattr(TensorProto, name) for name in script_args.types.split(",")]
# Runtimes to compare.
engine = [InferenceSession, CReferenceEvaluator]
# Lists of providers given to onnxruntime, the first one names the configuration.
providers = [
    ["CUDAExecutionProvider", "CPUExecutionProvider"],
    ["CPUExecutionProvider"],
]
# One [M, N, K] triplet per item.
# Multiples of 8 are used, otherwise float8 does not work.
dims = [
    [int(value) for value in triplet.split(",")]
    for triplet in script_args.dims.split(";")
]
# Operator domains, "" is the standard ONNX domain.
domains = ["onnx_extented.ortops.tutorial.cuda", "", "com.microsoft"]

Let’s cache the matrices involved.

def to_ort_value(m):
    """
    Wraps a numpy array into a ``C_OrtValue`` created for the CPU device
    (default memory, device id 0).

    :param m: numpy array
    :return: the OrtValue
    """
    cpu_device = C_OrtDevice(C_OrtDevice.cpu(), C_OrtDevice.default_memory(), 0)
    return C_OrtValue.ortvalue_from_numpy(m, cpu_device)


def cached_inputs(dims, types):
    """
    Creates the random matrices used by the benchmark, once per
    ``(type, rows, columns)`` key.

    For every ``(m, n, k)`` in *dims* and every type, matrices of shapes
    ``(m, k)``, ``(k, n)`` and ``(k, m)`` are drawn as float32 and cast
    with onnxruntime. The cast sessions only depend on the element type,
    they are created once per type instead of once per matrix.

    :param dims: list of ``(m, n, k)`` triplets
    :param types: list of element types (``TensorProto`` type codes)
    :return: two dictionaries ``{(type, rows, cols): OrtValue}``, the first
        one filled with the CPU cast model, the second one with the
        CUDA cast model (empty if CUDAExecutionProvider is not available)
    """
    matrices = {}
    matrices_cuda = {}
    # Loop invariant: the list of providers does not change.
    with_cuda = "CUDAExecutionProvider" in get_available_providers()
    # Cast sessions indexed by type, None means the CPU session
    # could not be created.
    cpu_sessions = {}
    cuda_sessions = {}

    def _random_cast(sess, rows, cols):
        # Draws a float32 matrix and casts it with the given session.
        vect = (numpy.random.randn(rows, cols) * 10).astype(numpy.float32)
        ov = to_ort_value(vect)
        return sess._sess.run_with_ort_values({"A": ov}, ["C"], None)[0]

    pbar = tqdm(list(product(dims, types)))
    for dim, tt in pbar:
        m, n, k = dim
        pbar.set_description(f"t={tt} dim={dim}")
        for i, j in [(m, k), (k, n), (k, m)]:
            if (tt, i, j) in matrices:
                continue

            # CPU
            if tt not in cpu_sessions:
                try:
                    cpu_sessions[tt] = InferenceSession(
                        create_cast(tt).SerializeToString(),
                        providers=["CPUExecutionProvider"],
                    )
                except (InvalidGraph, InvalidArgument, NotImplemented):
                    # not supported by this version of onnxruntime
                    cpu_sessions[tt] = None
            sess = cpu_sessions[tt]
            if sess is None:
                continue
            matrices[tt, i, j] = _random_cast(sess, i, j)

            # CUDA
            if not with_cuda:
                # No CUDA
                continue
            if tt not in cuda_sessions:
                cuda_sessions[tt] = InferenceSession(
                    create_cast(tt, cuda=True).SerializeToString(),
                    providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
                )
            # A second, independent random matrix is drawn for CUDA.
            matrices_cuda[tt, i, j] = _random_cast(cuda_sessions[tt], i, j)
    return matrices, matrices_cuda


# Build the cached inputs once for every requested dimension and type.
matrices, matrices_cuda = cached_inputs(dims, types)
print(f"{len(matrices)} matrices were created.")
  0%|          | 0/28 [00:00<?, ?it/s]
t=17 dim=[32, 32, 32]:   0%|          | 0/28 [00:00<?, ?it/s]
t=1 dim=[32, 32, 32]:   0%|          | 0/28 [00:00<?, ?it/s]
t=10 dim=[32, 32, 32]:   0%|          | 0/28 [00:00<?, ?it/s]
t=10 dim=[32, 32, 32]:  11%|█         | 3/28 [00:00<00:01, 21.29it/s]
t=16 dim=[32, 32, 32]:  11%|█         | 3/28 [00:00<00:01, 21.29it/s]
t=17 dim=[64, 64, 64]:  11%|█         | 3/28 [00:00<00:01, 21.29it/s]
t=1 dim=[64, 64, 64]:  11%|█         | 3/28 [00:00<00:01, 21.29it/s]
t=1 dim=[64, 64, 64]:  21%|██▏       | 6/28 [00:00<00:01, 18.30it/s]
t=10 dim=[64, 64, 64]:  21%|██▏       | 6/28 [00:00<00:01, 18.30it/s]
t=16 dim=[64, 64, 64]:  21%|██▏       | 6/28 [00:00<00:01, 18.30it/s]
t=16 dim=[64, 64, 64]:  29%|██▊       | 8/28 [00:00<00:01, 17.77it/s]
t=17 dim=[128, 128, 128]:  29%|██▊       | 8/28 [00:00<00:01, 17.77it/s]
t=1 dim=[128, 128, 128]:  29%|██▊       | 8/28 [00:00<00:01, 17.77it/s]
t=1 dim=[128, 128, 128]:  36%|███▌      | 10/28 [00:00<00:01, 15.60it/s]
t=10 dim=[128, 128, 128]:  36%|███▌      | 10/28 [00:00<00:01, 15.60it/s]
t=16 dim=[128, 128, 128]:  36%|███▌      | 10/28 [00:00<00:01, 15.60it/s]
t=16 dim=[128, 128, 128]:  43%|████▎     | 12/28 [00:00<00:01, 15.74it/s]
t=17 dim=[256, 256, 256]:  43%|████▎     | 12/28 [00:00<00:01, 15.74it/s]
t=1 dim=[256, 256, 256]:  43%|████▎     | 12/28 [00:00<00:01, 15.74it/s]
t=1 dim=[256, 256, 256]:  50%|█████     | 14/28 [00:00<00:00, 14.86it/s]
t=10 dim=[256, 256, 256]:  50%|█████     | 14/28 [00:00<00:00, 14.86it/s]
t=16 dim=[256, 256, 256]:  50%|█████     | 14/28 [00:00<00:00, 14.86it/s]
t=16 dim=[256, 256, 256]:  57%|█████▋    | 16/28 [00:00<00:00, 15.32it/s]
t=17 dim=[400, 400, 400]:  57%|█████▋    | 16/28 [00:00<00:00, 15.32it/s]
t=1 dim=[400, 400, 400]:  57%|█████▋    | 16/28 [00:01<00:00, 15.32it/s]
t=1 dim=[400, 400, 400]:  64%|██████▍   | 18/28 [00:01<00:00, 14.27it/s]
t=10 dim=[400, 400, 400]:  64%|██████▍   | 18/28 [00:01<00:00, 14.27it/s]
t=16 dim=[400, 400, 400]:  64%|██████▍   | 18/28 [00:01<00:00, 14.27it/s]
t=16 dim=[400, 400, 400]:  71%|███████▏  | 20/28 [00:01<00:00, 13.50it/s]
t=17 dim=[512, 512, 512]:  71%|███████▏  | 20/28 [00:01<00:00, 13.50it/s]
t=1 dim=[512, 512, 512]:  71%|███████▏  | 20/28 [00:01<00:00, 13.50it/s]
t=1 dim=[512, 512, 512]:  79%|███████▊  | 22/28 [00:01<00:00, 13.12it/s]
t=10 dim=[512, 512, 512]:  79%|███████▊  | 22/28 [00:01<00:00, 13.12it/s]
t=16 dim=[512, 512, 512]:  79%|███████▊  | 22/28 [00:01<00:00, 13.12it/s]
t=16 dim=[512, 512, 512]:  86%|████████▌ | 24/28 [00:01<00:00, 12.95it/s]
t=17 dim=[1024, 1024, 1024]:  86%|████████▌ | 24/28 [00:01<00:00, 12.95it/s]
t=1 dim=[1024, 1024, 1024]:  86%|████████▌ | 24/28 [00:01<00:00, 12.95it/s]
t=1 dim=[1024, 1024, 1024]:  93%|█████████▎| 26/28 [00:02<00:00,  8.00it/s]
t=10 dim=[1024, 1024, 1024]:  93%|█████████▎| 26/28 [00:02<00:00,  8.00it/s]
t=16 dim=[1024, 1024, 1024]:  93%|█████████▎| 26/28 [00:02<00:00,  8.00it/s]
t=16 dim=[1024, 1024, 1024]: 100%|██████████| 28/28 [00:02<00:00,  6.44it/s]
t=16 dim=[1024, 1024, 1024]: 100%|██████████| 28/28 [00:02<00:00, 10.91it/s]
28 matrices were created.

Let’s run the benchmark

def rendering_obs(
    obs,
    dim,
    number,
    repeat,
    domain,
    provider,
    internal_time,
    elem_type=None,
    engine_cls=None,
):
    """
    Completes an observation with readable labels describing the run.

    :param obs: dictionary to update (modified in place and returned)
    :param dim: ``(M, N, K)`` triplet
    :param number: number of calls per measure
    :param repeat: number of measures
    :param domain: operator domain, one of ``""``, ``"com.microsoft"``,
        ``"onnx_extented.ortops.tutorial.cuda"``
    :param provider: list of providers, only the first one is reported
    :param internal_time: stored as ``intime``
    :param elem_type: element type (a ``TensorProto`` type code), if None,
        the module-level variable ``tt`` is used (the behaviour of the
        previous version of this function)
    :param engine_cls: runtime class, if None, the module-level variable
        ``engine`` is used (the behaviour of the previous version)
    :return: *obs*
    :raises KeyError: if the type, the engine name, the domain or the
        provider is not one of the known values
    """
    # The benchmark loop exposes ``tt`` and ``engine`` as module globals,
    # they are only read when the caller does not give them explicitly.
    if elem_type is None:
        elem_type = tt
    if engine_cls is None:
        engine_cls = engine

    stype = {
        TensorProto.FLOAT: "f32",
        TensorProto.FLOAT16: "f16",
        TensorProto.BFLOAT16: "bf16",
        TensorProto.INT8: "i8",
        TensorProto.INT16: "i16",
        TensorProto.INT32: "i32",
        TensorProto.UINT32: "u32",
        TensorProto.FLOAT8E4M3FN: "e4m3fn",
        TensorProto.FLOAT8E5M2: "e5m2",
    }[elem_type]
    engine_label = {
        "InferenceSession": "ort",
        "CReferenceEvaluator": "np",
        # CReferenceEvaluator may be replaced by ReferenceEvaluator
        # when onnx_extended.reference cannot be imported.
        "ReferenceEvaluator": "np",
    }[engine_cls.__name__]
    cost = numpy.prod(dim) * 4
    obs.update(
        dict(
            engine=engine_label,
            stype=stype,
            type=f"{stype}",
            M=dim[0],
            N=dim[1],
            K=dim[2],
            cost=cost,
            cost_s=f"{cost}-{dim[0]}x{dim[1]}x{dim[2]}",
            repeat=repeat,
            number=number,
            domain={
                "": "ORT",
                "com.microsoft": "COM",
                "onnx_extented.ortops.tutorial.cuda": "EXT",
            }[domain],
            provider={
                "CPUExecutionProvider": "cpu",
                "CUDAExecutionProvider": "cuda",
            }[provider[0]],
            platform=platform.processor(),
            intime=internal_time,
        )
    )
    return obs


# Session options given to every InferenceSession created by the benchmark.
opts = SessionOptions()
# get_ort_ext_libs returns None when the CUDA extension could not be
# imported, otherwise its first item is registered as a custom ops library.
r = get_ort_ext_libs()
if r is not None:
    opts.register_custom_ops_library(r[0])


# ``data`` receives one observation per measured configuration,
# ``errors`` receives the reason why a configuration was skipped.
data = []
errors = []
pbar = tqdm(list(product(types, engine, providers, dims, domains)))
# NOTE: ``rendering_obs`` reads the loop variables ``tt`` and ``engine``
# as module globals, they must keep these names.
for tt, engine, provider, dim, domain in pbar:
    if (
        tt in {TensorProto.FLOAT8E4M3FN, TensorProto.FLOAT8E5M2}
        and properties.get("major", 0) < 9
    ):
        # f8 not available
        if provider[0] == "CPUExecutionProvider":
            continue
        errors.append(
            f"f8 not available, major={properties.get('major', 0)}, "
            f"tt={tt}, provider={provider!r}, domain={domain!r}."
        )
        continue
    elif provider[0] == "CPUExecutionProvider" and max(dim) > 2000:
        # too long
        continue

    # Small problems are measured more times.
    if max(dim) <= 200:
        repeat, number = script_args.repeat * 4, script_args.number * 4
    elif max(dim) <= 256:
        repeat, number = script_args.repeat * 2, script_args.number * 2
    else:
        repeat, number = script_args.repeat, script_args.number

    onx = create_model(tt, provider=provider[0], domain=domain)
    if onx is None:
        if provider[0] == "CPUExecutionProvider":
            continue
        errors.append(
            f"No model for tt={tt}, provider={provider!r}, domain={domain!r}."
        )
        continue
    with open(f"plot_bench_gemm_ort_{tt}_{domain}.onnx", "wb") as f:
        f.write(onx.SerializeToString())
    # Both inputs are stored with K as first dimension.
    k1 = (tt, dim[2], dim[0])
    k2 = (tt, dim[2], dim[1])
    if k1 not in matrices:
        errors.append(f"Key k1={k1!r} not in matrices.")
        continue
    if k2 not in matrices:
        errors.append(f"Key k2={k2!r} not in matrices.")
        continue

    pbar.set_description(f"t={tt} e={engine.__name__} p={provider[0][:4]} dim={dim}")

    # Reset for every configuration: only the InferenceSession branch may
    # fill it, the reference evaluator must not report a value left by a
    # previous iteration.
    internal_time = None

    if engine == CReferenceEvaluator:
        if (
            domain != ""
            or max(dim) > 256
            or provider != ["CPUExecutionProvider"]
            or tt not in [TensorProto.FLOAT, TensorProto.FLOAT16]
        ):
            # All impossible or slow cases.
            continue
        if tt == TensorProto.FLOAT16 and max(dim) > 50:
            repeat, number = 2, 2

        feeds = {"A": matrices[k1].numpy(), "B": matrices[k2].numpy()}
        sess = engine(onx)
        sess.run(None, feeds)
        obs = measure_time(lambda: sess.run(None, feeds), repeat=repeat, number=number)

    elif engine == InferenceSession:
        if provider[0] not in get_available_providers():
            errors.append(f"provider={provider[0]} is missing")
            continue
        try:
            sess = engine(onx.SerializeToString(), opts, providers=provider)
        except (NotImplemented, InvalidGraph, Fail) as e:
            # not implemented
            # ``engine`` is a class: its name is ``engine.__name__``,
            # ``engine.__class__`` would be its metaclass.
            errors.append((tt, engine.__name__, provider, domain, e))
            continue

        the_feeds = (
            {"A": matrices[k1], "B": matrices[k2]}
            if provider == ["CPUExecutionProvider"]
            else {"A": matrices_cuda[k1], "B": matrices_cuda[k2]}
        )
        out_names = ["C"]

        # warmup
        for _ in range(script_args.warmup):
            sess._sess.run_with_ort_values(the_feeds, out_names, None)[0]

        # benchmark
        times = []

        def fct_benchmarked():
            got = sess._sess.run_with_ort_values(the_feeds, out_names, None)
            # A second output, if any, is collected as an internal time.
            if len(got) > 1:
                times.append(got[1])

        obs = measure_time(fct_benchmarked, repeat=repeat, number=number)
        if times:
            np_times = [t.numpy() for t in times]
            internal_time = (sum(np_times) / len(times))[0]

    else:
        errors.append(f"unknown engine={engine}")
        continue

    # improves the rendering
    obs = rendering_obs(obs, dim, number, repeat, domain, provider, internal_time)
    data.append(obs)
    if unit_test_going() and len(data) >= 2:
        break
  0%|          | 0/336 [00:00<?, ?it/s]
t=1 e=InferenceSession p=CUDA dim=[32, 32, 32]:   0%|          | 0/336 [00:00<?, ?it/s]
t=1 e=InferenceSession p=CUDA dim=[32, 32, 32]:  25%|██▌       | 85/336 [00:02<00:08, 28.39it/s]
t=1 e=InferenceSession p=CUDA dim=[32, 32, 32]:  25%|██▌       | 85/336 [00:02<00:08, 28.39it/s]
t=1 e=InferenceSession p=CUDA dim=[32, 32, 32]:  25%|██▌       | 85/336 [00:03<00:08, 28.39it/s]
t=1 e=InferenceSession p=CUDA dim=[64, 64, 64]:  25%|██▌       | 85/336 [00:03<00:08, 28.39it/s]
t=1 e=InferenceSession p=CUDA dim=[64, 64, 64]:  26%|██▌       | 88/336 [00:06<00:21, 11.75it/s]
t=1 e=InferenceSession p=CUDA dim=[64, 64, 64]:  26%|██▌       | 88/336 [00:06<00:21, 11.75it/s]
t=1 e=InferenceSession p=CUDA dim=[64, 64, 64]:  26%|██▌       | 88/336 [00:06<00:21, 11.75it/s]
t=1 e=InferenceSession p=CUDA dim=[64, 64, 64]:  27%|██▋       | 90/336 [00:06<00:21, 11.43it/s]
t=1 e=InferenceSession p=CUDA dim=[128, 128, 128]:  27%|██▋       | 90/336 [00:06<00:21, 11.43it/s]
t=1 e=InferenceSession p=CUDA dim=[128, 128, 128]:  27%|██▋       | 91/336 [00:09<00:41,  5.95it/s]
t=1 e=InferenceSession p=CUDA dim=[128, 128, 128]:  27%|██▋       | 91/336 [00:09<00:41,  5.95it/s]
t=1 e=InferenceSession p=CUDA dim=[128, 128, 128]:  27%|██▋       | 92/336 [00:09<00:41,  5.86it/s]
t=1 e=InferenceSession p=CUDA dim=[128, 128, 128]:  27%|██▋       | 92/336 [00:09<00:41,  5.86it/s]
t=1 e=InferenceSession p=CUDA dim=[256, 256, 256]:  27%|██▋       | 92/336 [00:09<00:41,  5.86it/s]
t=1 e=InferenceSession p=CUDA dim=[256, 256, 256]:  28%|██▊       | 94/336 [00:10<00:46,  5.20it/s]
t=1 e=InferenceSession p=CUDA dim=[256, 256, 256]:  28%|██▊       | 94/336 [00:10<00:46,  5.20it/s]
t=1 e=InferenceSession p=CUDA dim=[256, 256, 256]:  28%|██▊       | 95/336 [00:10<00:45,  5.36it/s]
t=1 e=InferenceSession p=CUDA dim=[256, 256, 256]:  28%|██▊       | 95/336 [00:10<00:45,  5.36it/s]
t=1 e=InferenceSession p=CUDA dim=[400, 400, 400]:  28%|██▊       | 95/336 [00:10<00:45,  5.36it/s]
t=1 e=InferenceSession p=CUDA dim=[400, 400, 400]:  29%|██▉       | 97/336 [00:10<00:41,  5.70it/s]
t=1 e=InferenceSession p=CUDA dim=[400, 400, 400]:  29%|██▉       | 97/336 [00:10<00:41,  5.70it/s]
t=1 e=InferenceSession p=CUDA dim=[400, 400, 400]:  29%|██▉       | 97/336 [00:10<00:41,  5.70it/s]
t=1 e=InferenceSession p=CUDA dim=[512, 512, 512]:  29%|██▉       | 97/336 [00:10<00:41,  5.70it/s]
t=1 e=InferenceSession p=CUDA dim=[512, 512, 512]:  30%|██▉       | 100/336 [00:10<00:37,  6.23it/s]
t=1 e=InferenceSession p=CUDA dim=[512, 512, 512]:  30%|██▉       | 100/336 [00:10<00:37,  6.23it/s]
t=1 e=InferenceSession p=CUDA dim=[512, 512, 512]:  30%|██▉       | 100/336 [00:11<00:37,  6.23it/s]
t=1 e=InferenceSession p=CUDA dim=[1024, 1024, 1024]:  30%|██▉       | 100/336 [00:11<00:37,  6.23it/s]
t=1 e=InferenceSession p=CUDA dim=[1024, 1024, 1024]:  31%|███       | 103/336 [00:11<00:39,  5.97it/s]
t=1 e=InferenceSession p=CUDA dim=[1024, 1024, 1024]:  31%|███       | 103/336 [00:11<00:39,  5.97it/s]
t=1 e=InferenceSession p=CUDA dim=[1024, 1024, 1024]:  31%|███       | 104/336 [00:11<00:41,  5.59it/s]
t=1 e=InferenceSession p=CUDA dim=[1024, 1024, 1024]:  31%|███       | 104/336 [00:11<00:41,  5.59it/s]
t=1 e=InferenceSession p=CPUE dim=[32, 32, 32]:  31%|███       | 104/336 [00:11<00:41,  5.59it/s]
t=1 e=InferenceSession p=CPUE dim=[64, 64, 64]:  31%|███       | 104/336 [00:11<00:41,  5.59it/s]
t=1 e=InferenceSession p=CPUE dim=[128, 128, 128]:  31%|███       | 104/336 [00:11<00:41,  5.59it/s]
t=1 e=InferenceSession p=CPUE dim=[128, 128, 128]:  34%|███▎      | 113/336 [00:11<00:17, 12.65it/s]
t=1 e=InferenceSession p=CPUE dim=[256, 256, 256]:  34%|███▎      | 113/336 [00:11<00:17, 12.65it/s]
t=1 e=InferenceSession p=CPUE dim=[400, 400, 400]:  34%|███▎      | 113/336 [00:11<00:17, 12.65it/s]
t=1 e=InferenceSession p=CPUE dim=[400, 400, 400]:  35%|███▌      | 119/336 [00:12<00:12, 17.01it/s]
t=1 e=InferenceSession p=CPUE dim=[512, 512, 512]:  35%|███▌      | 119/336 [00:12<00:12, 17.01it/s]
t=1 e=InferenceSession p=CPUE dim=[512, 512, 512]:  37%|███▋      | 123/336 [00:12<00:11, 18.70it/s]
t=1 e=InferenceSession p=CPUE dim=[1024, 1024, 1024]:  37%|███▋      | 123/336 [00:12<00:11, 18.70it/s]
t=1 e=CReferenceEvaluator p=CUDA dim=[32, 32, 32]:  37%|███▋      | 123/336 [00:13<00:11, 18.70it/s]
t=1 e=CReferenceEvaluator p=CUDA dim=[32, 32, 32]:  38%|███▊      | 127/336 [00:13<00:25,  8.30it/s]
t=1 e=CReferenceEvaluator p=CUDA dim=[32, 32, 32]:  38%|███▊      | 127/336 [00:13<00:25,  8.30it/s]
t=1 e=CReferenceEvaluator p=CUDA dim=[32, 32, 32]:  38%|███▊      | 127/336 [00:13<00:25,  8.30it/s]
t=1 e=CReferenceEvaluator p=CUDA dim=[64, 64, 64]:  38%|███▊      | 127/336 [00:13<00:25,  8.30it/s]
t=1 e=CReferenceEvaluator p=CUDA dim=[64, 64, 64]:  38%|███▊      | 127/336 [00:13<00:25,  8.30it/s]
t=1 e=CReferenceEvaluator p=CUDA dim=[64, 64, 64]:  38%|███▊      | 127/336 [00:13<00:25,  8.30it/s]
t=1 e=CReferenceEvaluator p=CUDA dim=[128, 128, 128]:  38%|███▊      | 127/336 [00:13<00:25,  8.30it/s]
t=1 e=CReferenceEvaluator p=CUDA dim=[128, 128, 128]:  38%|███▊      | 127/336 [00:13<00:25,  8.30it/s]
t=1 e=CReferenceEvaluator p=CUDA dim=[128, 128, 128]:  38%|███▊      | 127/336 [00:13<00:25,  8.30it/s]
t=1 e=CReferenceEvaluator p=CUDA dim=[256, 256, 256]:  38%|███▊      | 127/336 [00:13<00:25,  8.30it/s]
t=1 e=CReferenceEvaluator p=CUDA dim=[256, 256, 256]:  38%|███▊      | 127/336 [00:13<00:25,  8.30it/s]
t=1 e=CReferenceEvaluator p=CUDA dim=[256, 256, 256]:  38%|███▊      | 127/336 [00:13<00:25,  8.30it/s]
t=1 e=CReferenceEvaluator p=CUDA dim=[400, 400, 400]:  38%|███▊      | 127/336 [00:13<00:25,  8.30it/s]
t=1 e=CReferenceEvaluator p=CUDA dim=[400, 400, 400]:  38%|███▊      | 127/336 [00:13<00:25,  8.30it/s]
t=1 e=CReferenceEvaluator p=CUDA dim=[400, 400, 400]:  38%|███▊      | 127/336 [00:13<00:25,  8.30it/s]
t=1 e=CReferenceEvaluator p=CUDA dim=[512, 512, 512]:  38%|███▊      | 127/336 [00:13<00:25,  8.30it/s]
t=1 e=CReferenceEvaluator p=CUDA dim=[512, 512, 512]:  38%|███▊      | 127/336 [00:13<00:25,  8.30it/s]
t=1 e=CReferenceEvaluator p=CUDA dim=[512, 512, 512]:  38%|███▊      | 127/336 [00:13<00:25,  8.30it/s]
t=1 e=CReferenceEvaluator p=CUDA dim=[1024, 1024, 1024]:  38%|███▊      | 127/336 [00:13<00:25,  8.30it/s]
t=1 e=CReferenceEvaluator p=CUDA dim=[1024, 1024, 1024]:  38%|███▊      | 127/336 [00:13<00:25,  8.30it/s]
t=1 e=CReferenceEvaluator p=CUDA dim=[1024, 1024, 1024]:  38%|███▊      | 127/336 [00:13<00:25,  8.30it/s]
t=1 e=CReferenceEvaluator p=CPUE dim=[32, 32, 32]:  38%|███▊      | 127/336 [00:13<00:25,  8.30it/s]
t=1 e=CReferenceEvaluator p=CPUE dim=[64, 64, 64]:  38%|███▊      | 127/336 [00:13<00:25,  8.30it/s]
t=1 e=CReferenceEvaluator p=CPUE dim=[64, 64, 64]:  45%|████▌     | 152/336 [00:13<00:07, 25.52it/s]
t=1 e=CReferenceEvaluator p=CPUE dim=[128, 128, 128]:  45%|████▌     | 152/336 [00:13<00:07, 25.52it/s]
t=1 e=CReferenceEvaluator p=CPUE dim=[256, 256, 256]:  45%|████▌     | 152/336 [00:20<00:07, 25.52it/s]
t=1 e=CReferenceEvaluator p=CPUE dim=[256, 256, 256]:  48%|████▊     | 160/336 [00:22<00:51,  3.41it/s]
t=1 e=CReferenceEvaluator p=CPUE dim=[400, 400, 400]:  48%|████▊     | 160/336 [00:22<00:51,  3.41it/s]
t=1 e=CReferenceEvaluator p=CPUE dim=[512, 512, 512]:  48%|████▊     | 160/336 [00:22<00:51,  3.41it/s]
t=1 e=CReferenceEvaluator p=CPUE dim=[1024, 1024, 1024]:  48%|████▊     | 160/336 [00:22<00:51,  3.41it/s]
t=10 e=InferenceSession p=CUDA dim=[32, 32, 32]:  48%|████▊     | 160/336 [00:22<00:51,  3.41it/s]
t=10 e=InferenceSession p=CUDA dim=[32, 32, 32]:  50%|█████     | 169/336 [00:25<00:51,  3.22it/s]
t=10 e=InferenceSession p=CUDA dim=[32, 32, 32]:  50%|█████     | 169/336 [00:25<00:51,  3.22it/s]
t=10 e=InferenceSession p=CUDA dim=[32, 32, 32]:  50%|█████     | 169/336 [00:25<00:51,  3.22it/s]
t=10 e=InferenceSession p=CUDA dim=[64, 64, 64]:  50%|█████     | 169/336 [00:25<00:51,  3.22it/s]
t=10 e=InferenceSession p=CUDA dim=[64, 64, 64]:  50%|█████     | 169/336 [00:28<00:51,  3.22it/s]
t=10 e=InferenceSession p=CUDA dim=[64, 64, 64]:  51%|█████▏    | 173/336 [00:28<01:01,  2.67it/s]
t=10 e=InferenceSession p=CUDA dim=[64, 64, 64]:  51%|█████▏    | 173/336 [00:28<01:01,  2.67it/s]
t=10 e=InferenceSession p=CUDA dim=[128, 128, 128]:  51%|█████▏    | 173/336 [00:28<01:01,  2.67it/s]
t=10 e=InferenceSession p=CUDA dim=[128, 128, 128]:  51%|█████▏    | 173/336 [00:30<01:01,  2.67it/s]
t=10 e=InferenceSession p=CUDA dim=[128, 128, 128]:  52%|█████▏    | 176/336 [00:30<01:09,  2.30it/s]
t=10 e=InferenceSession p=CUDA dim=[128, 128, 128]:  52%|█████▏    | 176/336 [00:30<01:09,  2.30it/s]
t=10 e=InferenceSession p=CUDA dim=[256, 256, 256]:  52%|█████▏    | 176/336 [00:30<01:09,  2.30it/s]
t=10 e=InferenceSession p=CUDA dim=[256, 256, 256]:  53%|█████▎    | 178/336 [00:31<01:06,  2.37it/s]
t=10 e=InferenceSession p=CUDA dim=[256, 256, 256]:  53%|█████▎    | 178/336 [00:31<01:06,  2.37it/s]
t=10 e=InferenceSession p=CUDA dim=[256, 256, 256]:  53%|█████▎    | 178/336 [00:31<01:06,  2.37it/s]
t=10 e=InferenceSession p=CUDA dim=[400, 400, 400]:  53%|█████▎    | 178/336 [00:31<01:06,  2.37it/s]
t=10 e=InferenceSession p=CUDA dim=[400, 400, 400]:  54%|█████▍    | 181/336 [00:31<00:56,  2.74it/s]
t=10 e=InferenceSession p=CUDA dim=[400, 400, 400]:  54%|█████▍    | 181/336 [00:31<00:56,  2.74it/s]
t=10 e=InferenceSession p=CUDA dim=[400, 400, 400]:  54%|█████▍    | 181/336 [00:31<00:56,  2.74it/s]
t=10 e=InferenceSession p=CUDA dim=[512, 512, 512]:  54%|█████▍    | 181/336 [00:31<00:56,  2.74it/s]
t=10 e=InferenceSession p=CUDA dim=[512, 512, 512]:  55%|█████▍    | 184/336 [00:32<00:48,  3.13it/s]
t=10 e=InferenceSession p=CUDA dim=[512, 512, 512]:  55%|█████▍    | 184/336 [00:32<00:48,  3.13it/s]
t=10 e=InferenceSession p=CUDA dim=[512, 512, 512]:  55%|█████▍    | 184/336 [00:32<00:48,  3.13it/s]
t=10 e=InferenceSession p=CUDA dim=[1024, 1024, 1024]:  55%|█████▍    | 184/336 [00:32<00:48,  3.13it/s]
t=10 e=InferenceSession p=CUDA dim=[1024, 1024, 1024]:  56%|█████▌    | 187/336 [00:34<01:00,  2.47it/s]
t=10 e=InferenceSession p=CUDA dim=[1024, 1024, 1024]:  56%|█████▌    | 187/336 [00:34<01:00,  2.47it/s]
t=10 e=InferenceSession p=CUDA dim=[1024, 1024, 1024]:  56%|█████▌    | 188/336 [00:34<00:56,  2.62it/s]
t=10 e=InferenceSession p=CUDA dim=[1024, 1024, 1024]:  56%|█████▌    | 188/336 [00:34<00:56,  2.62it/s]
t=10 e=InferenceSession p=CPUE dim=[32, 32, 32]:  56%|█████▌    | 188/336 [00:34<00:56,  2.62it/s]
t=10 e=InferenceSession p=CPUE dim=[64, 64, 64]:  56%|█████▌    | 188/336 [00:34<00:56,  2.62it/s]
t=10 e=InferenceSession p=CPUE dim=[128, 128, 128]:  56%|█████▌    | 188/336 [00:34<00:56,  2.62it/s]
t=10 e=InferenceSession p=CPUE dim=[128, 128, 128]:  59%|█████▊    | 197/336 [00:34<00:26,  5.31it/s]
t=10 e=InferenceSession p=CPUE dim=[256, 256, 256]:  59%|█████▊    | 197/336 [00:34<00:26,  5.31it/s]
t=10 e=InferenceSession p=CPUE dim=[256, 256, 256]:  60%|█████▉    | 200/336 [00:35<00:22,  6.10it/s]
t=10 e=InferenceSession p=CPUE dim=[400, 400, 400]:  60%|█████▉    | 200/336 [00:35<00:22,  6.10it/s]
t=10 e=InferenceSession p=CPUE dim=[400, 400, 400]:  60%|██████    | 203/336 [00:35<00:17,  7.49it/s]
t=10 e=InferenceSession p=CPUE dim=[512, 512, 512]:  60%|██████    | 203/336 [00:35<00:17,  7.49it/s]
t=10 e=InferenceSession p=CPUE dim=[512, 512, 512]:  61%|██████▏   | 206/336 [00:35<00:15,  8.45it/s]
t=10 e=InferenceSession p=CPUE dim=[1024, 1024, 1024]:  61%|██████▏   | 206/336 [00:35<00:15,  8.45it/s]
t=10 e=InferenceSession p=CPUE dim=[1024, 1024, 1024]:  62%|██████▏   | 209/336 [00:36<00:24,  5.23it/s]
t=10 e=CReferenceEvaluator p=CUDA dim=[32, 32, 32]:  62%|██████▏   | 209/336 [00:36<00:24,  5.23it/s]
t=10 e=CReferenceEvaluator p=CUDA dim=[32, 32, 32]:  62%|██████▏   | 209/336 [00:36<00:24,  5.23it/s]
t=10 e=CReferenceEvaluator p=CUDA dim=[32, 32, 32]:  62%|██████▏   | 209/336 [00:36<00:24,  5.23it/s]
t=10 e=CReferenceEvaluator p=CUDA dim=[64, 64, 64]:  62%|██████▏   | 209/336 [00:36<00:24,  5.23it/s]
t=10 e=CReferenceEvaluator p=CUDA dim=[64, 64, 64]:  62%|██████▏   | 209/336 [00:36<00:24,  5.23it/s]
t=10 e=CReferenceEvaluator p=CUDA dim=[64, 64, 64]:  62%|██████▏   | 209/336 [00:36<00:24,  5.23it/s]
t=10 e=CReferenceEvaluator p=CUDA dim=[128, 128, 128]:  62%|██████▏   | 209/336 [00:36<00:24,  5.23it/s]
t=10 e=CReferenceEvaluator p=CUDA dim=[128, 128, 128]:  62%|██████▏   | 209/336 [00:36<00:24,  5.23it/s]
t=10 e=CReferenceEvaluator p=CUDA dim=[128, 128, 128]:  62%|██████▏   | 209/336 [00:36<00:24,  5.23it/s]
t=10 e=CReferenceEvaluator p=CUDA dim=[256, 256, 256]:  62%|██████▏   | 209/336 [00:36<00:24,  5.23it/s]
t=10 e=CReferenceEvaluator p=CUDA dim=[256, 256, 256]:  62%|██████▏   | 209/336 [00:36<00:24,  5.23it/s]
t=10 e=CReferenceEvaluator p=CUDA dim=[256, 256, 256]:  62%|██████▏   | 209/336 [00:36<00:24,  5.23it/s]
t=10 e=CReferenceEvaluator p=CUDA dim=[400, 400, 400]:  62%|██████▏   | 209/336 [00:36<00:24,  5.23it/s]
t=10 e=CReferenceEvaluator p=CUDA dim=[400, 400, 400]:  62%|██████▏   | 209/336 [00:36<00:24,  5.23it/s]
t=10 e=CReferenceEvaluator p=CUDA dim=[400, 400, 400]:  62%|██████▏   | 209/336 [00:36<00:24,  5.23it/s]
t=10 e=CReferenceEvaluator p=CUDA dim=[512, 512, 512]:  62%|██████▏   | 209/336 [00:36<00:24,  5.23it/s]
t=10 e=CReferenceEvaluator p=CUDA dim=[512, 512, 512]:  62%|██████▏   | 209/336 [00:36<00:24,  5.23it/s]
t=10 e=CReferenceEvaluator p=CUDA dim=[512, 512, 512]:  62%|██████▏   | 209/336 [00:36<00:24,  5.23it/s]
t=10 e=CReferenceEvaluator p=CUDA dim=[1024, 1024, 1024]:  62%|██████▏   | 209/336 [00:36<00:24,  5.23it/s]
t=10 e=CReferenceEvaluator p=CUDA dim=[1024, 1024, 1024]:  62%|██████▏   | 209/336 [00:36<00:24,  5.23it/s]
t=10 e=CReferenceEvaluator p=CUDA dim=[1024, 1024, 1024]:  62%|██████▏   | 209/336 [00:36<00:24,  5.23it/s]
t=10 e=CReferenceEvaluator p=CPUE dim=[32, 32, 32]:  62%|██████▏   | 209/336 [00:36<00:24,  5.23it/s]
t=10 e=CReferenceEvaluator p=CPUE dim=[32, 32, 32]:  69%|██████▉   | 233/336 [00:36<00:05, 18.05it/s]
t=10 e=CReferenceEvaluator p=CPUE dim=[64, 64, 64]:  69%|██████▉   | 233/336 [00:36<00:05, 18.05it/s]
t=10 e=CReferenceEvaluator p=CPUE dim=[128, 128, 128]:  69%|██████▉   | 233/336 [00:36<00:05, 18.05it/s]
t=10 e=CReferenceEvaluator p=CPUE dim=[128, 128, 128]:  71%|███████   | 239/336 [00:36<00:04, 20.96it/s]
t=10 e=CReferenceEvaluator p=CPUE dim=[256, 256, 256]:  71%|███████   | 239/336 [00:36<00:04, 20.96it/s]
t=10 e=CReferenceEvaluator p=CPUE dim=[256, 256, 256]:  73%|███████▎  | 244/336 [00:37<00:05, 15.52it/s]
t=10 e=CReferenceEvaluator p=CPUE dim=[400, 400, 400]:  73%|███████▎  | 244/336 [00:37<00:05, 15.52it/s]
t=10 e=CReferenceEvaluator p=CPUE dim=[512, 512, 512]:  73%|███████▎  | 244/336 [00:37<00:05, 15.52it/s]
t=10 e=CReferenceEvaluator p=CPUE dim=[1024, 1024, 1024]:  73%|███████▎  | 244/336 [00:37<00:05, 15.52it/s]
t=16 e=InferenceSession p=CUDA dim=[32, 32, 32]:  73%|███████▎  | 244/336 [00:37<00:05, 15.52it/s]
t=16 e=InferenceSession p=CUDA dim=[32, 32, 32]:  76%|███████▌  | 254/336 [00:37<00:03, 21.63it/s]
t=16 e=InferenceSession p=CUDA dim=[32, 32, 32]:  76%|███████▌  | 254/336 [00:37<00:03, 21.63it/s]
t=16 e=InferenceSession p=CUDA dim=[64, 64, 64]:  76%|███████▌  | 254/336 [00:37<00:03, 21.63it/s]
t=16 e=InferenceSession p=CUDA dim=[64, 64, 64]:  76%|███████▌  | 254/336 [00:37<00:03, 21.63it/s]
t=16 e=InferenceSession p=CUDA dim=[64, 64, 64]:  77%|███████▋  | 259/336 [00:37<00:03, 22.53it/s]
t=16 e=InferenceSession p=CUDA dim=[128, 128, 128]:  77%|███████▋  | 259/336 [00:37<00:03, 22.53it/s]
t=16 e=InferenceSession p=CUDA dim=[128, 128, 128]:  77%|███████▋  | 259/336 [00:38<00:03, 22.53it/s]
t=16 e=InferenceSession p=CUDA dim=[256, 256, 256]:  77%|███████▋  | 259/336 [00:38<00:03, 22.53it/s]
t=16 e=InferenceSession p=CUDA dim=[256, 256, 256]:  78%|███████▊  | 263/336 [00:38<00:03, 19.60it/s]
t=16 e=InferenceSession p=CUDA dim=[256, 256, 256]:  78%|███████▊  | 263/336 [00:38<00:03, 19.60it/s]
t=16 e=InferenceSession p=CUDA dim=[400, 400, 400]:  78%|███████▊  | 263/336 [00:38<00:03, 19.60it/s]
t=16 e=InferenceSession p=CUDA dim=[400, 400, 400]:  78%|███████▊  | 263/336 [00:38<00:03, 19.60it/s]
t=16 e=InferenceSession p=CUDA dim=[512, 512, 512]:  78%|███████▊  | 263/336 [00:38<00:03, 19.60it/s]
t=16 e=InferenceSession p=CUDA dim=[512, 512, 512]:  80%|████████  | 269/336 [00:38<00:02, 22.72it/s]
t=16 e=InferenceSession p=CUDA dim=[512, 512, 512]:  80%|████████  | 269/336 [00:38<00:02, 22.72it/s]
t=16 e=InferenceSession p=CUDA dim=[1024, 1024, 1024]:  80%|████████  | 269/336 [00:38<00:02, 22.72it/s]
t=16 e=InferenceSession p=CUDA dim=[1024, 1024, 1024]:  80%|████████  | 269/336 [00:38<00:02, 22.72it/s]
t=16 e=InferenceSession p=CUDA dim=[1024, 1024, 1024]:  81%|████████▏ | 273/336 [00:38<00:03, 19.46it/s]
t=16 e=InferenceSession p=CPUE dim=[32, 32, 32]:  81%|████████▏ | 273/336 [00:38<00:03, 19.46it/s]
t=16 e=InferenceSession p=CPUE dim=[64, 64, 64]:  81%|████████▏ | 273/336 [00:38<00:03, 19.46it/s]
t=16 e=InferenceSession p=CPUE dim=[128, 128, 128]:  81%|████████▏ | 273/336 [00:38<00:03, 19.46it/s]
t=16 e=InferenceSession p=CPUE dim=[256, 256, 256]:  81%|████████▏ | 273/336 [00:38<00:03, 19.46it/s]
t=16 e=InferenceSession p=CPUE dim=[400, 400, 400]:  81%|████████▏ | 273/336 [00:38<00:03, 19.46it/s]
t=16 e=InferenceSession p=CPUE dim=[512, 512, 512]:  81%|████████▏ | 273/336 [00:38<00:03, 19.46it/s]
t=16 e=InferenceSession p=CPUE dim=[1024, 1024, 1024]:  81%|████████▏ | 273/336 [00:38<00:03, 19.46it/s]
t=16 e=CReferenceEvaluator p=CUDA dim=[32, 32, 32]:  81%|████████▏ | 273/336 [00:38<00:03, 19.46it/s]
t=16 e=CReferenceEvaluator p=CUDA dim=[32, 32, 32]:  81%|████████▏ | 273/336 [00:38<00:03, 19.46it/s]
t=16 e=CReferenceEvaluator p=CUDA dim=[64, 64, 64]:  81%|████████▏ | 273/336 [00:38<00:03, 19.46it/s]
t=16 e=CReferenceEvaluator p=CUDA dim=[64, 64, 64]:  81%|████████▏ | 273/336 [00:38<00:03, 19.46it/s]
t=16 e=CReferenceEvaluator p=CUDA dim=[128, 128, 128]:  81%|████████▏ | 273/336 [00:38<00:03, 19.46it/s]
t=16 e=CReferenceEvaluator p=CUDA dim=[128, 128, 128]:  81%|████████▏ | 273/336 [00:38<00:03, 19.46it/s]
t=16 e=CReferenceEvaluator p=CUDA dim=[256, 256, 256]:  81%|████████▏ | 273/336 [00:38<00:03, 19.46it/s]
t=16 e=CReferenceEvaluator p=CUDA dim=[256, 256, 256]:  81%|████████▏ | 273/336 [00:38<00:03, 19.46it/s]
t=16 e=CReferenceEvaluator p=CUDA dim=[400, 400, 400]:  81%|████████▏ | 273/336 [00:38<00:03, 19.46it/s]
t=16 e=CReferenceEvaluator p=CUDA dim=[400, 400, 400]:  81%|████████▏ | 273/336 [00:38<00:03, 19.46it/s]
t=16 e=CReferenceEvaluator p=CUDA dim=[512, 512, 512]:  81%|████████▏ | 273/336 [00:38<00:03, 19.46it/s]
t=16 e=CReferenceEvaluator p=CUDA dim=[512, 512, 512]:  81%|████████▏ | 273/336 [00:38<00:03, 19.46it/s]
t=16 e=CReferenceEvaluator p=CUDA dim=[1024, 1024, 1024]:  81%|████████▏ | 273/336 [00:38<00:03, 19.46it/s]
t=16 e=CReferenceEvaluator p=CUDA dim=[1024, 1024, 1024]:  81%|████████▏ | 273/336 [00:38<00:03, 19.46it/s]
t=16 e=CReferenceEvaluator p=CPUE dim=[32, 32, 32]:  81%|████████▏ | 273/336 [00:38<00:03, 19.46it/s]
t=16 e=CReferenceEvaluator p=CPUE dim=[64, 64, 64]:  81%|████████▏ | 273/336 [00:38<00:03, 19.46it/s]
t=16 e=CReferenceEvaluator p=CPUE dim=[128, 128, 128]:  81%|████████▏ | 273/336 [00:38<00:03, 19.46it/s]
t=16 e=CReferenceEvaluator p=CPUE dim=[256, 256, 256]:  81%|████████▏ | 273/336 [00:38<00:03, 19.46it/s]
t=16 e=CReferenceEvaluator p=CPUE dim=[400, 400, 400]:  81%|████████▏ | 273/336 [00:38<00:03, 19.46it/s]
t=16 e=CReferenceEvaluator p=CPUE dim=[512, 512, 512]:  81%|████████▏ | 273/336 [00:38<00:03, 19.46it/s]
t=16 e=CReferenceEvaluator p=CPUE dim=[1024, 1024, 1024]:  81%|████████▏ | 273/336 [00:38<00:03, 19.46it/s]
t=16 e=CReferenceEvaluator p=CPUE dim=[1024, 1024, 1024]: 100%|██████████| 336/336 [00:38<00:00,  8.67it/s]

Results#

# Gather all collected measurements into a single DataFrame.
df = DataFrame(data)
# Full dump (every column, index included) for offline inspection.
df.to_excel("plot_bench_gemm_ort.xlsx")
# The CSV keeps a reduced set of columns and no index. An earlier
# unconditional df.to_csv(...) on this same path was removed: this call
# overwrote its output immediately, so it was a wasted write.
df.drop(["min_exec", "max_exec", "cost_s", "cost"], axis=1).to_csv(
    "plot_bench_gemm_ort.csv", index=False
)
# Transposed head: one column per measurement, easier to read in a console.
print(df.head().T)
# Kept as a bare trailing expression; the page shows the full table after
# the printed head.
df
                            0                1                 2                 3                    4
average              0.004122         0.000284          0.004653          0.000396              0.00421
deviation            0.000578         0.000109          0.000309          0.000153             0.000347
min_exec             0.003197         0.000182          0.004109          0.000242             0.003513
max_exec             0.005962         0.000624          0.005381          0.000805              0.00502
repeat                     40               40                40                40                   40
number                     16               16                16                16                   16
ttime                0.164877         0.011356          0.186115          0.015823              0.16838
context_size               64               64                64                64                   64
warmup_time          0.004435         0.000249          0.004432          0.001938             0.004779
engine                    ort              ort               ort               ort                  ort
stype                     f32              f32               f32               f32                  f32
type                      f32              f32               f32               f32                  f32
M                          32               32                64                64                  128
N                          32               32                64                64                  128
K                          32               32                64                64                  128
cost                   131072           131072           1048576           1048576              8388608
cost_s        131072-32x32x32  131072-32x32x32  1048576-64x64x64  1048576-64x64x64  8388608-128x128x128
domain                    EXT              ORT               EXT               ORT                  EXT
provider                 cuda             cuda              cuda              cuda                 cuda
platform               x86_64           x86_64            x86_64            x86_64               x86_64
intime                   None             None              None              None                 None
average deviation min_exec max_exec repeat number ttime context_size warmup_time engine stype type M N K cost cost_s domain provider platform intime
0 0.004122 0.000578 0.003197 0.005962 40 16 0.164877 64 0.004435 ort f32 f32 32 32 32 131072 131072-32x32x32 EXT cuda x86_64 None
1 0.000284 0.000109 0.000182 0.000624 40 16 0.011356 64 0.000249 ort f32 f32 32 32 32 131072 131072-32x32x32 ORT cuda x86_64 None
2 0.004653 0.000309 0.004109 0.005381 40 16 0.186115 64 0.004432 ort f32 f32 64 64 64 1048576 1048576-64x64x64 EXT cuda x86_64 None
3 0.000396 0.000153 0.000242 0.000805 40 16 0.015823 64 0.001938 ort f32 f32 64 64 64 1048576 1048576-64x64x64 ORT cuda x86_64 None
4 0.004210 0.000347 0.003513 0.005020 40 16 0.168380 64 0.004779 ort f32 f32 128 128 128 8388608 8388608-128x128x128 EXT cuda x86_64 None
5 0.000344 0.000101 0.000256 0.000593 40 16 0.013747 64 0.000579 ort f32 f32 128 128 128 8388608 8388608-128x128x128 ORT cuda x86_64 None
6 0.004425 0.000512 0.003926 0.006452 20 8 0.088494 64 0.004685 ort f32 f32 256 256 256 67108864 67108864-256x256x256 EXT cuda x86_64 None
7 0.000601 0.000073 0.000534 0.000808 20 8 0.012018 64 0.000724 ort f32 f32 256 256 256 67108864 67108864-256x256x256 ORT cuda x86_64 None
8 0.004817 0.000283 0.004376 0.005421 10 4 0.048173 64 0.004500 ort f32 f32 400 400 400 256000000 256000000-400x400x400 EXT cuda x86_64 None
9 0.001246 0.000080 0.001101 0.001412 10 4 0.012464 64 0.001223 ort f32 f32 400 400 400 256000000 256000000-400x400x400 ORT cuda x86_64 None
10 0.005571 0.000256 0.005223 0.006130 10 4 0.055710 64 0.006282 ort f32 f32 512 512 512 536870912 536870912-512x512x512 EXT cuda x86_64 None
11 0.001722 0.000050 0.001610 0.001771 10 4 0.017221 64 0.001733 ort f32 f32 512 512 512 536870912 536870912-512x512x512 ORT cuda x86_64 None
12 0.009730 0.000635 0.008817 0.011191 10 4 0.097297 64 0.012024 ort f32 f32 1024 1024 1024 4294967296 4294967296-1024x1024x1024 EXT cuda x86_64 None
13 0.005438 0.000074 0.005334 0.005576 10 4 0.054380 64 0.005687 ort f32 f32 1024 1024 1024 4294967296 4294967296-1024x1024x1024 ORT cuda x86_64 None
14 0.000021 0.000007 0.000013 0.000050 40 16 0.000842 64 0.000046 ort f32 f32 32 32 32 131072 131072-32x32x32 ORT cpu x86_64 None
15 0.000027 0.000003 0.000025 0.000040 40 16 0.001075 64 0.000050 ort f32 f32 64 64 64 1048576 1048576-64x64x64 ORT cpu x86_64 None
16 0.000067 0.000008 0.000051 0.000103 40 16 0.002692 64 0.000086 ort f32 f32 128 128 128 8388608 8388608-128x128x128 ORT cpu x86_64 None
17 0.000359 0.000025 0.000327 0.000415 20 8 0.007174 64 0.000390 ort f32 f32 256 256 256 67108864 67108864-256x256x256 ORT cpu x86_64 None
18 0.001319 0.000038 0.001229 0.001375 10 4 0.013193 64 0.001518 ort f32 f32 400 400 400 256000000 256000000-400x400x400 ORT cpu x86_64 None
19 0.003123 0.000393 0.002809 0.004208 10 4 0.031229 64 0.002939 ort f32 f32 512 512 512 536870912 536870912-512x512x512 ORT cpu x86_64 None
20 0.025415 0.002176 0.023589 0.030808 10 4 0.254145 64 0.021144 ort f32 f32 1024 1024 1024 4294967296 4294967296-1024x1024x1024 ORT cpu x86_64 None
21 0.000063 0.000027 0.000044 0.000147 40 16 0.002539 64 0.000118 np f32 f32 32 32 32 131072 131072-32x32x32 ORT cpu x86_64 None
22 0.000093 0.000026 0.000062 0.000147 40 16 0.003701 64 0.000162 np f32 f32 64 64 64 1048576 1048576-64x64x64 ORT cpu x86_64 None
23 0.010302 0.003517 0.002135 0.020524 40 16 0.412060 64 0.006489 np f32 f32 128 128 128 8388608 8388608-128x128x128 ORT cpu x86_64 None
24 0.012554 0.004106 0.003891 0.020194 20 8 0.251079 64 0.008421 np f32 f32 256 256 256 67108864 67108864-256x256x256 ORT cpu x86_64 None
25 0.004686 0.001919 0.003153 0.010099 40 16 0.187434 64 0.006409 ort f16 f16 32 32 32 131072 131072-32x32x32 EXT cuda x86_64 None
26 0.000230 0.000072 0.000163 0.000445 40 16 0.009192 64 0.000258 ort f16 f16 32 32 32 131072 131072-32x32x32 ORT cuda x86_64 None
27 0.004055 0.000301 0.003511 0.005197 40 16 0.162212 64 0.003966 ort f16 f16 64 64 64 1048576 1048576-64x64x64 EXT cuda x86_64 None
28 0.000198 0.000033 0.000148 0.000309 40 16 0.007937 64 0.000213 ort f16 f16 64 64 64 1048576 1048576-64x64x64 ORT cuda x86_64 None
29 0.003477 0.000289 0.002972 0.004156 40 16 0.139098 64 0.003759 ort f16 f16 128 128 128 8388608 8388608-128x128x128 EXT cuda x86_64 None
30 0.000246 0.000036 0.000186 0.000325 40 16 0.009837 64 0.000261 ort f16 f16 128 128 128 8388608 8388608-128x128x128 ORT cuda x86_64 None
31 0.003870 0.000276 0.003438 0.004474 20 8 0.077403 64 0.004728 ort f16 f16 256 256 256 67108864 67108864-256x256x256 EXT cuda x86_64 None
32 0.000379 0.000068 0.000330 0.000664 20 8 0.007587 64 0.000377 ort f16 f16 256 256 256 67108864 67108864-256x256x256 ORT cuda x86_64 None
33 0.006882 0.000220 0.006523 0.007184 10 4 0.068824 64 0.006844 ort f16 f16 400 400 400 256000000 256000000-400x400x400 EXT cuda x86_64 None
34 0.000638 0.000037 0.000583 0.000698 10 4 0.006383 64 0.000637 ort f16 f16 400 400 400 256000000 256000000-400x400x400 ORT cuda x86_64 None
35 0.009471 0.000303 0.008982 0.009824 10 4 0.094706 64 0.008904 ort f16 f16 512 512 512 536870912 536870912-512x512x512 EXT cuda x86_64 None
36 0.001042 0.000068 0.000920 0.001182 10 4 0.010416 64 0.001072 ort f16 f16 512 512 512 536870912 536870912-512x512x512 ORT cuda x86_64 None
37 0.042594 0.000164 0.042252 0.042910 10 4 0.425938 64 0.042064 ort f16 f16 1024 1024 1024 4294967296 4294967296-1024x1024x1024 EXT cuda x86_64 None
38 0.003411 0.000140 0.003192 0.003616 10 4 0.034109 64 0.003368 ort f16 f16 1024 1024 1024 4294967296 4294967296-1024x1024x1024 ORT cuda x86_64 None
39 0.000041 0.000024 0.000021 0.000166 40 16 0.001637 64 0.000078 ort f16 f16 32 32 32 131072 131072-32x32x32 ORT cpu x86_64 None
40 0.000056 0.000029 0.000043 0.000215 40 16 0.002251 64 0.000106 ort f16 f16 64 64 64 1048576 1048576-64x64x64 ORT cpu x86_64 None
41 0.000431 0.000237 0.000175 0.001358 40 16 0.017229 64 0.000311 ort f16 f16 128 128 128 8388608 8388608-128x128x128 ORT cpu x86_64 None
42 0.001192 0.000112 0.000872 0.001378 20 8 0.023834 64 0.001916 ort f16 f16 256 256 256 67108864 67108864-256x256x256 ORT cpu x86_64 None
43 0.002023 0.000433 0.001702 0.003095 10 4 0.020235 64 0.002813 ort f16 f16 400 400 400 256000000 256000000-400x400x400 ORT cpu x86_64 None
44 0.004482 0.000501 0.003942 0.005915 10 4 0.044815 64 0.004937 ort f16 f16 512 512 512 536870912 536870912-512x512x512 ORT cpu x86_64 None
45 0.025175 0.003117 0.021383 0.031425 10 4 0.251748 64 0.027772 ort f16 f16 1024 1024 1024 4294967296 4294967296-1024x1024x1024 ORT cpu x86_64 None
46 0.000329 0.000023 0.000302 0.000420 40 16 0.013148 64 0.000382 np f16 f16 32 32 32 131072 131072-32x32x32 ORT cpu x86_64 None
47 0.002264 0.000371 0.001892 0.002635 2 2 0.004528 64 0.002859 np f16 f16 64 64 64 1048576 1048576-64x64x64 ORT cpu x86_64 None
48 0.016651 0.000124 0.016527 0.016776 2 2 0.033303 64 0.014219 np f16 f16 128 128 128 8388608 8388608-128x128x128 ORT cpu x86_64 None
49 0.105103 0.006122 0.098980 0.111225 2 2 0.210205 64 0.112516 np f16 f16 256 256 256 67108864 67108864-256x256x256 ORT cpu x86_64 None
50 0.000213 0.000040 0.000167 0.000326 40 16 0.008521 64 0.000278 ort bf16 bf16 32 32 32 131072 131072-32x32x32 ORT cuda x86_64 None
51 0.000265 0.000058 0.000190 0.000411 40 16 0.010607 64 0.000350 ort bf16 bf16 64 64 64 1048576 1048576-64x64x64 ORT cuda x86_64 None
52 0.000340 0.000088 0.000244 0.000601 40 16 0.013611 64 0.000331 ort bf16 bf16 128 128 128 8388608 8388608-128x128x128 ORT cuda x86_64 None
53 0.000442 0.000044 0.000386 0.000540 20 8 0.008845 64 0.000473 ort bf16 bf16 256 256 256 67108864 67108864-256x256x256 ORT cuda x86_64 None
54 0.000965 0.000049 0.000884 0.001049 10 4 0.009645 64 0.001136 ort bf16 bf16 400 400 400 256000000 256000000-400x400x400 ORT cuda x86_64 None
55 0.001387 0.000085 0.001258 0.001537 10 4 0.013867 64 0.001349 ort bf16 bf16 512 512 512 536870912 536870912-512x512x512 ORT cuda x86_64 None
56 0.005542 0.000873 0.004499 0.007643 10 4 0.055419 64 0.006246 ort bf16 bf16 1024 1024 1024 4294967296 4294967296-1024x1024x1024 ORT cuda x86_64 None


The errors#

# List every collected error, numbered from 1 and prefixed with "rank/total-".
for rank, err in enumerate(errors, start=1):
    print(f"{rank}/{len(errors)}-{err}")
1/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extented.ortops.tutorial.cuda'.
2/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain=''.
3/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='com.microsoft'.
4/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extented.ortops.tutorial.cuda'.
5/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain=''.
6/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='com.microsoft'.
7/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extented.ortops.tutorial.cuda'.
8/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain=''.
9/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='com.microsoft'.
10/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extented.ortops.tutorial.cuda'.
11/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain=''.
12/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='com.microsoft'.
13/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extented.ortops.tutorial.cuda'.
14/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain=''.
15/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='com.microsoft'.
16/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extented.ortops.tutorial.cuda'.
17/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain=''.
18/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='com.microsoft'.
19/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extented.ortops.tutorial.cuda'.
20/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain=''.
21/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='com.microsoft'.
22/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extented.ortops.tutorial.cuda'.
23/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain=''.
24/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='com.microsoft'.
25/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extented.ortops.tutorial.cuda'.
26/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain=''.
27/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='com.microsoft'.
28/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extented.ortops.tutorial.cuda'.
29/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain=''.
30/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='com.microsoft'.
31/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extented.ortops.tutorial.cuda'.
32/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain=''.
33/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='com.microsoft'.
34/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extented.ortops.tutorial.cuda'.
35/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain=''.
36/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='com.microsoft'.
37/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extented.ortops.tutorial.cuda'.
38/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain=''.
39/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='com.microsoft'.
40/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extented.ortops.tutorial.cuda'.
41/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain=''.
42/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='com.microsoft'.
43/84-(1, 'type', ['CUDAExecutionProvider', 'CPUExecutionProvider'], 'com.microsoft', InvalidGraph('[ONNXRuntimeError] : 10 : INVALID_GRAPH : This is an invalid model. In Node, ("1.1.1.com.microsoft..1..1..CUBLAS_COMPUTE_32F..False", GemmFloat8, "com.microsoft", -1) : ("A": tensor(float),"B": tensor(float),) -> ("C": tensor(float),) , Error Unrecognized attribute: rowMajor for operator GemmFloat8'))
44/84-(1, 'type', ['CUDAExecutionProvider', 'CPUExecutionProvider'], 'com.microsoft', InvalidGraph('[ONNXRuntimeError] : 10 : INVALID_GRAPH : This is an invalid model. In Node, ("1.1.1.com.microsoft..1..1..CUBLAS_COMPUTE_32F..False", GemmFloat8, "com.microsoft", -1) : ("A": tensor(float),"B": tensor(float),) -> ("C": tensor(float),) , Error Unrecognized attribute: rowMajor for operator GemmFloat8'))
45/84-(1, 'type', ['CUDAExecutionProvider', 'CPUExecutionProvider'], 'com.microsoft', InvalidGraph('[ONNXRuntimeError] : 10 : INVALID_GRAPH : This is an invalid model. In Node, ("1.1.1.com.microsoft..1..1..CUBLAS_COMPUTE_32F..False", GemmFloat8, "com.microsoft", -1) : ("A": tensor(float),"B": tensor(float),) -> ("C": tensor(float),) , Error Unrecognized attribute: rowMajor for operator GemmFloat8'))
46/84-(1, 'type', ['CUDAExecutionProvider', 'CPUExecutionProvider'], 'com.microsoft', InvalidGraph('[ONNXRuntimeError] : 10 : INVALID_GRAPH : This is an invalid model. In Node, ("1.1.1.com.microsoft..1..1..CUBLAS_COMPUTE_32F..False", GemmFloat8, "com.microsoft", -1) : ("A": tensor(float),"B": tensor(float),) -> ("C": tensor(float),) , Error Unrecognized attribute: rowMajor for operator GemmFloat8'))
47/84-(1, 'type', ['CUDAExecutionProvider', 'CPUExecutionProvider'], 'com.microsoft', InvalidGraph('[ONNXRuntimeError] : 10 : INVALID_GRAPH : This is an invalid model. In Node, ("1.1.1.com.microsoft..1..1..CUBLAS_COMPUTE_32F..False", GemmFloat8, "com.microsoft", -1) : ("A": tensor(float),"B": tensor(float),) -> ("C": tensor(float),) , Error Unrecognized attribute: rowMajor for operator GemmFloat8'))
48/84-(1, 'type', ['CUDAExecutionProvider', 'CPUExecutionProvider'], 'com.microsoft', InvalidGraph('[ONNXRuntimeError] : 10 : INVALID_GRAPH : This is an invalid model. In Node, ("1.1.1.com.microsoft..1..1..CUBLAS_COMPUTE_32F..False", GemmFloat8, "com.microsoft", -1) : ("A": tensor(float),"B": tensor(float),) -> ("C": tensor(float),) , Error Unrecognized attribute: rowMajor for operator GemmFloat8'))
49/84-(1, 'type', ['CUDAExecutionProvider', 'CPUExecutionProvider'], 'com.microsoft', InvalidGraph('[ONNXRuntimeError] : 10 : INVALID_GRAPH : This is an invalid model. In Node, ("1.1.1.com.microsoft..1..1..CUBLAS_COMPUTE_32F..False", GemmFloat8, "com.microsoft", -1) : ("A": tensor(float),"B": tensor(float),) -> ("C": tensor(float),) , Error Unrecognized attribute: rowMajor for operator GemmFloat8'))
50/84-(10, 'type', ['CUDAExecutionProvider', 'CPUExecutionProvider'], 'com.microsoft', InvalidGraph('[ONNXRuntimeError] : 10 : INVALID_GRAPH : This is an invalid model. In Node, ("10.1.1.com.microsoft..1..1..CUBLAS_COMPUTE_32F..False", GemmFloat8, "com.microsoft", -1) : ("A": tensor(float16),"B": tensor(float16),) -> ("C": tensor(float16),) , Error Unrecognized attribute: rowMajor for operator GemmFloat8'))
51/84-(10, 'type', ['CUDAExecutionProvider', 'CPUExecutionProvider'], 'com.microsoft', InvalidGraph('[ONNXRuntimeError] : 10 : INVALID_GRAPH : This is an invalid model. In Node, ("10.1.1.com.microsoft..1..1..CUBLAS_COMPUTE_32F..False", GemmFloat8, "com.microsoft", -1) : ("A": tensor(float16),"B": tensor(float16),) -> ("C": tensor(float16),) , Error Unrecognized attribute: rowMajor for operator GemmFloat8'))
52/84-(10, 'type', ['CUDAExecutionProvider', 'CPUExecutionProvider'], 'com.microsoft', InvalidGraph('[ONNXRuntimeError] : 10 : INVALID_GRAPH : This is an invalid model. In Node, ("10.1.1.com.microsoft..1..1..CUBLAS_COMPUTE_32F..False", GemmFloat8, "com.microsoft", -1) : ("A": tensor(float16),"B": tensor(float16),) -> ("C": tensor(float16),) , Error Unrecognized attribute: rowMajor for operator GemmFloat8'))
53/84-(10, 'type', ['CUDAExecutionProvider', 'CPUExecutionProvider'], 'com.microsoft', InvalidGraph('[ONNXRuntimeError] : 10 : INVALID_GRAPH : This is an invalid model. In Node, ("10.1.1.com.microsoft..1..1..CUBLAS_COMPUTE_32F..False", GemmFloat8, "com.microsoft", -1) : ("A": tensor(float16),"B": tensor(float16),) -> ("C": tensor(float16),) , Error Unrecognized attribute: rowMajor for operator GemmFloat8'))
54/84-(10, 'type', ['CUDAExecutionProvider', 'CPUExecutionProvider'], 'com.microsoft', InvalidGraph('[ONNXRuntimeError] : 10 : INVALID_GRAPH : This is an invalid model. In Node, ("10.1.1.com.microsoft..1..1..CUBLAS_COMPUTE_32F..False", GemmFloat8, "com.microsoft", -1) : ("A": tensor(float16),"B": tensor(float16),) -> ("C": tensor(float16),) , Error Unrecognized attribute: rowMajor for operator GemmFloat8'))
55/84-(10, 'type', ['CUDAExecutionProvider', 'CPUExecutionProvider'], 'com.microsoft', InvalidGraph('[ONNXRuntimeError] : 10 : INVALID_GRAPH : This is an invalid model. In Node, ("10.1.1.com.microsoft..1..1..CUBLAS_COMPUTE_32F..False", GemmFloat8, "com.microsoft", -1) : ("A": tensor(float16),"B": tensor(float16),) -> ("C": tensor(float16),) , Error Unrecognized attribute: rowMajor for operator GemmFloat8'))
56/84-(10, 'type', ['CUDAExecutionProvider', 'CPUExecutionProvider'], 'com.microsoft', InvalidGraph('[ONNXRuntimeError] : 10 : INVALID_GRAPH : This is an invalid model. In Node, ("10.1.1.com.microsoft..1..1..CUBLAS_COMPUTE_32F..False", GemmFloat8, "com.microsoft", -1) : ("A": tensor(float16),"B": tensor(float16),) -> ("C": tensor(float16),) , Error Unrecognized attribute: rowMajor for operator GemmFloat8'))
57/84-No model for tt=16, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extented.ortops.tutorial.cuda'.
58/84-(16, 'type', ['CUDAExecutionProvider', 'CPUExecutionProvider'], 'com.microsoft', InvalidGraph('[ONNXRuntimeError] : 10 : INVALID_GRAPH : This is an invalid model. In Node, ("16.1.1.com.microsoft..1..1..CUBLAS_COMPUTE_32F..False", GemmFloat8, "com.microsoft", -1) : ("A": tensor(bfloat16),"B": tensor(bfloat16),) -> ("C": tensor(bfloat16),) , Error Unrecognized attribute: rowMajor for operator GemmFloat8'))
59/84-No model for tt=16, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extented.ortops.tutorial.cuda'.
60/84-(16, 'type', ['CUDAExecutionProvider', 'CPUExecutionProvider'], 'com.microsoft', InvalidGraph('[ONNXRuntimeError] : 10 : INVALID_GRAPH : This is an invalid model. In Node, ("16.1.1.com.microsoft..1..1..CUBLAS_COMPUTE_32F..False", GemmFloat8, "com.microsoft", -1) : ("A": tensor(bfloat16),"B": tensor(bfloat16),) -> ("C": tensor(bfloat16),) , Error Unrecognized attribute: rowMajor for operator GemmFloat8'))
61/84-No model for tt=16, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extented.ortops.tutorial.cuda'.
62/84-(16, 'type', ['CUDAExecutionProvider', 'CPUExecutionProvider'], 'com.microsoft', InvalidGraph('[ONNXRuntimeError] : 10 : INVALID_GRAPH : This is an invalid model. In Node, ("16.1.1.com.microsoft..1..1..CUBLAS_COMPUTE_32F..False", GemmFloat8, "com.microsoft", -1) : ("A": tensor(bfloat16),"B": tensor(bfloat16),) -> ("C": tensor(bfloat16),) , Error Unrecognized attribute: rowMajor for operator GemmFloat8'))
63/84-No model for tt=16, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extented.ortops.tutorial.cuda'.
64/84-(16, 'type', ['CUDAExecutionProvider', 'CPUExecutionProvider'], 'com.microsoft', InvalidGraph('[ONNXRuntimeError] : 10 : INVALID_GRAPH : This is an invalid model. In Node, ("16.1.1.com.microsoft..1..1..CUBLAS_COMPUTE_32F..False", GemmFloat8, "com.microsoft", -1) : ("A": tensor(bfloat16),"B": tensor(bfloat16),) -> ("C": tensor(bfloat16),) , Error Unrecognized attribute: rowMajor for operator GemmFloat8'))
65/84-No model for tt=16, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extented.ortops.tutorial.cuda'.
66/84-(16, 'type', ['CUDAExecutionProvider', 'CPUExecutionProvider'], 'com.microsoft', InvalidGraph('[ONNXRuntimeError] : 10 : INVALID_GRAPH : This is an invalid model. In Node, ("16.1.1.com.microsoft..1..1..CUBLAS_COMPUTE_32F..False", GemmFloat8, "com.microsoft", -1) : ("A": tensor(bfloat16),"B": tensor(bfloat16),) -> ("C": tensor(bfloat16),) , Error Unrecognized attribute: rowMajor for operator GemmFloat8'))
67/84-No model for tt=16, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extented.ortops.tutorial.cuda'.
68/84-(16, 'type', ['CUDAExecutionProvider', 'CPUExecutionProvider'], 'com.microsoft', InvalidGraph('[ONNXRuntimeError] : 10 : INVALID_GRAPH : This is an invalid model. In Node, ("16.1.1.com.microsoft..1..1..CUBLAS_COMPUTE_32F..False", GemmFloat8, "com.microsoft", -1) : ("A": tensor(bfloat16),"B": tensor(bfloat16),) -> ("C": tensor(bfloat16),) , Error Unrecognized attribute: rowMajor for operator GemmFloat8'))
69/84-No model for tt=16, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extented.ortops.tutorial.cuda'.
70/84-(16, 'type', ['CUDAExecutionProvider', 'CPUExecutionProvider'], 'com.microsoft', InvalidGraph('[ONNXRuntimeError] : 10 : INVALID_GRAPH : This is an invalid model. In Node, ("16.1.1.com.microsoft..1..1..CUBLAS_COMPUTE_32F..False", GemmFloat8, "com.microsoft", -1) : ("A": tensor(bfloat16),"B": tensor(bfloat16),) -> ("C": tensor(bfloat16),) , Error Unrecognized attribute: rowMajor for operator GemmFloat8'))
71/84-(16, 'type', ['CPUExecutionProvider'], '', NotImplemented("[ONNXRuntimeError] : 9 : NOT_IMPLEMENTED : Could not find an implementation for Gemm(13) node with name ''"))
72/84-(16, 'type', ['CPUExecutionProvider'], '', NotImplemented("[ONNXRuntimeError] : 9 : NOT_IMPLEMENTED : Could not find an implementation for Gemm(13) node with name ''"))
73/84-(16, 'type', ['CPUExecutionProvider'], '', NotImplemented("[ONNXRuntimeError] : 9 : NOT_IMPLEMENTED : Could not find an implementation for Gemm(13) node with name ''"))
74/84-(16, 'type', ['CPUExecutionProvider'], '', NotImplemented("[ONNXRuntimeError] : 9 : NOT_IMPLEMENTED : Could not find an implementation for Gemm(13) node with name ''"))
75/84-(16, 'type', ['CPUExecutionProvider'], '', NotImplemented("[ONNXRuntimeError] : 9 : NOT_IMPLEMENTED : Could not find an implementation for Gemm(13) node with name ''"))
76/84-(16, 'type', ['CPUExecutionProvider'], '', NotImplemented("[ONNXRuntimeError] : 9 : NOT_IMPLEMENTED : Could not find an implementation for Gemm(13) node with name ''"))
77/84-(16, 'type', ['CPUExecutionProvider'], '', NotImplemented("[ONNXRuntimeError] : 9 : NOT_IMPLEMENTED : Could not find an implementation for Gemm(13) node with name ''"))
78/84-No model for tt=16, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extented.ortops.tutorial.cuda'.
79/84-No model for tt=16, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extented.ortops.tutorial.cuda'.
80/84-No model for tt=16, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extented.ortops.tutorial.cuda'.
81/84-No model for tt=16, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extented.ortops.tutorial.cuda'.
82/84-No model for tt=16, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extented.ortops.tutorial.cuda'.
83/84-No model for tt=16, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extented.ortops.tutorial.cuda'.
84/84-No model for tt=16, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extented.ortops.tutorial.cuda'.

Summary#

# One row per cost value, one column per
# (provider, type, domain, engine) combination.
piv = df.pivot_table(
    values=["average", "intime"],
    index=["cost"],
    columns=["provider", "type", "domain", "engine"],
)
# Both exports use the same flattened table, where "cost" is a regular column.
flat_summary = piv.reset_index(drop=False)
flat_summary.to_excel("plot_bench_gemm_ort_summary.xlsx")
flat_summary.to_csv("plot_bench_gemm_ort_summary.csv")


print("summary")
print(piv)
# Kept as a bare trailing expression; the page shows the table again after
# the printed one.
piv
summary
             average
provider         cpu                                    cuda
type             f16                 f32                bf16       f16                 f32
domain           ORT                 ORT                 ORT       EXT       ORT       EXT       ORT
engine            np       ort        np       ort       ort       ort       ort       ort       ort
cost
131072      0.000329  0.000041  0.000063  0.000021  0.000213  0.004686  0.000230  0.004122  0.000284
1048576     0.002264  0.000056  0.000093  0.000027  0.000265  0.004055  0.000198  0.004653  0.000396
8388608     0.016651  0.000431  0.010302  0.000067  0.000340  0.003477  0.000246  0.004210  0.000344
67108864    0.105103  0.001192  0.012554  0.000359  0.000442  0.003870  0.000379  0.004425  0.000601
256000000        NaN  0.002023       NaN  0.001319  0.000965  0.006882  0.000638  0.004817  0.001246
536870912        NaN  0.004482       NaN  0.003123  0.001387  0.009471  0.001042  0.005571  0.001722
4294967296       NaN  0.025175       NaN  0.025415  0.005542  0.042594  0.003411  0.009730  0.005438
average
provider cpu cuda
type f16 f32 bf16 f16 f32
domain ORT ORT ORT EXT ORT EXT ORT
engine np ort np ort ort ort ort ort ort
cost
131072 0.000329 0.000041 0.000063 0.000021 0.000213 0.004686 0.000230 0.004122 0.000284
1048576 0.002264 0.000056 0.000093 0.000027 0.000265 0.004055 0.000198 0.004653 0.000396
8388608 0.016651 0.000431 0.010302 0.000067 0.000340 0.003477 0.000246 0.004210 0.000344
67108864 0.105103 0.001192 0.012554 0.000359 0.000442 0.003870 0.000379 0.004425 0.000601
256000000 NaN 0.002023 NaN 0.001319 0.000965 0.006882 0.000638 0.004817 0.001246
536870912 NaN 0.004482 NaN 0.003123 0.001387 0.009471 0.001042 0.005571 0.001722
4294967296 NaN 0.025175 NaN 0.025415 0.005542 0.042594 0.003411 0.009730 0.005438


The same summary, with each row labelled by its cost and the matrix dimensions.

# Same pivot as the summary, but keyed by the "cost_s" label, which combines
# the cost with the matrix dimensions (e.g. "131072-32x32x32").
pivs = pivot_table(
    df,
    index=["cost_s"],
    columns=["provider", "type", "domain", "engine"],
    values=["average", "intime"],
)
# "cost_s" is a string, so the default row order is lexicographic
# ("1048576-..." comes before "131072-..."). Sort on the numeric cost prefix
# instead so the rows read from the smallest to the largest problem.
# int64 is spelled out because the largest costs do not fit in 32 bits.
pivs = pivs.sort_index(
    key=lambda labels: labels.str.split("-").str[0].astype("int64")
)
print(pivs)
                            average
provider                        cpu                                    cuda
type                            f16                 f32                bf16       f16                 f32
domain                          ORT                 ORT                 ORT       EXT       ORT       EXT       ORT
engine                           np       ort        np       ort       ort       ort       ort       ort       ort
cost_s
1048576-64x64x64           0.002264  0.000056  0.000093  0.000027  0.000265  0.004055  0.000198  0.004653  0.000396
131072-32x32x32            0.000329  0.000041  0.000063  0.000021  0.000213  0.004686  0.000230  0.004122  0.000284
256000000-400x400x400           NaN  0.002023       NaN  0.001319  0.000965  0.006882  0.000638  0.004817  0.001246
4294967296-1024x1024x1024       NaN  0.025175       NaN  0.025415  0.005542  0.042594  0.003411  0.009730  0.005438
536870912-512x512x512           NaN  0.004482       NaN  0.003123  0.001387  0.009471  0.001042  0.005571  0.001722
67108864-256x256x256       0.105103  0.001192  0.012554  0.000359  0.000442  0.003870  0.000379  0.004425  0.000601
8388608-128x128x128        0.016651  0.000431  0.010302  0.000067  0.000340  0.003477  0.000246  0.004210  0.000344

Plot

# Keep only the rows whose engine is "ort" and whose type is one of the
# listed element types, then pivot their average time by cost.
selected_types = ["f32", "f16", "bf16", "e4m3fn", "e5m2"]
row_mask = df["type"].isin(selected_types) & df["engine"].isin(["ort"])
dfi = df[row_mask]
pivi = pivot_table(
    dfi,
    values="average",
    index=["cost"],
    columns=["type", "domain", "provider", "engine"],
)

# Two side-by-side panels, both with logarithmic axes:
# the full summary pivot on the left, the filtered pivot on the right.
log_scales = dict(logx=True, logy=True)
fig, ax = plt.subplots(1, 2, figsize=(12, 6))
piv.plot(ax=ax[0], title="Gemm performance\nlower is better", **log_scales)
# The right panel is drawn only if the filter kept at least one row.
if len(pivi) > 0:
    ort_title = f"Gemm performance ORT\n{platform.processor()}"
    pivi.plot(ax=ax[1], title=ort_title, **log_scales)
fig.tight_layout()
fig.savefig("plot_bench_gemm_ort.png")
Gemm performance lower is better, Gemm performance ORT x86_64

Total running time of the script: (0 minutes 44.327 seconds)

Gallery generated by Sphinx-Gallery