Measuring performance of Gemm with onnxruntime

The benchmark measures the performance of Gemm for different types and configurations. That includes a custom operator, only available on CUDA, which calls the function cublasLtMatmul. This function offers many options.

import pprint
import platform
from itertools import product
import numpy
from tqdm import tqdm
import matplotlib.pyplot as plt
from pandas import DataFrame, pivot_table
from onnx import TensorProto
from onnx.helper import (
    make_model,
    make_node,
    make_graph,
    make_tensor_value_info,
    make_opsetid,
)
from onnx.checker import check_model
from onnx.numpy_helper import from_array
from onnx.reference import ReferenceEvaluator
from onnxruntime import InferenceSession, SessionOptions, get_available_providers
from onnxruntime.capi._pybind_state import (
    OrtValue as C_OrtValue,
    OrtDevice as C_OrtDevice,
)
from onnxruntime.capi.onnxruntime_pybind11_state import (
    Fail,
    NotImplemented,
    InvalidGraph,
    InvalidArgument,
)

try:
    from onnx_array_api.plotting.text_plot import onnx_simple_text_plot
except ImportError:
    onnx_simple_text_plot = str
try:
    from onnx_extended.reference import CReferenceEvaluator
except ImportError:
    CReferenceEvaluator = ReferenceEvaluator
from onnx_extended.args import get_parsed_args
from onnx_extended.ext_test_case import unit_test_going, measure_time

# The CUDA helpers are optional. When they cannot be imported, CPU-only
# stand-ins with the same names are defined so the rest of the script can
# call them unconditionally, and ``has_cuda`` records which case applies.
try:
    from onnx_extended.validation.cuda.cuda_example_py import get_device_prop
    from onnx_extended.ortops.tutorial.cuda import get_ort_ext_libs
except ImportError:
    has_cuda = False

    def get_device_prop():
        """Stand-in used without the CUDA helpers: reports a CPU device."""
        return {"name": "CPU"}

    def get_ort_ext_libs():
        """Stand-in used without the CUDA helpers: no custom library."""
        return None

else:
    has_cuda = True

# Default problem sizes: ';'-separated triples of comma-separated dimensions.
default_dims = ";".join(
    [
        "32,32,32",
        "64,64,64",
        "128,128,128",
        "256,256,256",
        "400,400,400",
        "512,512,512",
        "1024,1024,1024",
    ]
)
if has_cuda:
    prop = get_device_prop()
    # Larger sizes are only appended when the reported "major" value
    # reaches the threshold of each row.
    for min_major, extra_dims in (
        (7, ";2048,2048,2048;4096,4096,4096"),
        (9, ";16384,16384,16384"),
    ):
        if prop.get("major", 0) >= min_major:
            default_dims += extra_dims


# Command line options of the benchmark. Every default is reduced when
# ``unit_test_going()`` is true so that the script finishes quickly.
# ``dims`` is later split on ';' then on ',' and ``types`` is split on ',';
# the help strings describe exactly that format.
script_args = get_parsed_args(
    "plot_bench_gemm_ort",
    description=__doc__,
    dims=(
        "32,32,32;64,64,64" if unit_test_going() else default_dims,
        "matrix dimensions to try, triples M,N,K of comma separated values, "
        "separated by ';'",
    ),
    types=(
        "FLOAT" if unit_test_going() else "FLOAT8E4M3FN,FLOAT,FLOAT16,BFLOAT16",
        "element types to test, comma separated values",
    ),
    # number / repeat are forwarded to measure_time, warmup is the number of
    # untimed runs done before measuring.
    number=2 if unit_test_going() else 4,
    repeat=2 if unit_test_going() else 10,
    warmup=2 if unit_test_going() else 5,
    # NOTE(review): presumably the list of options exposed on the command
    # line -- confirm against get_parsed_args.
    expose="repeat,number,warmup",
)

Device properties

# Device description used later to decide whether float 8 can be benchmarked.
# Without CUDA, a placeholder with "major" set to 0 is used instead.
properties = get_device_prop() if has_cuda else {"major": 0}
if has_cuda:
    pprint.pprint(properties)

{'clockRate': 1569000,
 'computeMode': 0,
 'concurrentKernels': 1,
 'isMultiGpuBoard': 0,
 'major': 6,
 'maxThreadsPerBlock': 1024,
 'minor': 1,
 'multiProcessorCount': 10,
 'name': 'NVIDIA GeForce GTX 1060',
 'sharedMemPerBlock': 49152,
 'totalConstMem': 65536,
 'totalGlobalMem': 6442319872}

Model to benchmark

It includes one Gemm. The operator changes: it can be the regular Gemm, a custom Gemm from domain com.microsoft, or a custom implementation from domain onnx_extented.ortops.tutorial.cuda.

def create_model(
    mat_type=TensorProto.FLOAT, provider="CUDAExecutionProvider", domain="com.microsoft"
):
    """
    Creates a model with a single matrix multiplication node.

    The model takes two 2D inputs ``A`` and ``B`` of type *mat_type* and
    returns ``C``.

    :param mat_type: element type of the inputs (a ``TensorProto`` value)
    :param provider: execution provider the model is meant for, custom
        domains are only built for ``"CUDAExecutionProvider"``
    :param domain: ``""`` for the standard ``Gemm``, ``"com.microsoft"`` for
        ``GemmFloat8``, any other value selects one of the ``CustomGemm*``
        operators based on *mat_type*
    :return: the checked ModelProto, or None when the combination
        (provider, domain, mat_type) is not handled
    """
    input_a = make_tensor_value_info("A", mat_type, [None, None])
    input_b = make_tensor_value_info("B", mat_type, [None, None])
    outputs = [make_tensor_value_info("C", mat_type, [None, None])]
    inits = []

    if domain == "":
        # Standard operator.
        nodes = [make_node("Gemm", ["A", "B"], ["C"], transA=1, beta=0.0)]
    else:
        # Custom operators are only built for CUDA.
        if provider != "CUDAExecutionProvider":
            return None

        f8 = False
        node_output = ["C"]
        if domain == "com.microsoft":
            op_name, computeType = "GemmFloat8", "CUBLAS_COMPUTE_32F"
        elif mat_type == TensorProto.FLOAT:
            op_name, computeType = "CustomGemmFloat", "CUBLAS_COMPUTE_32F_FAST_TF32"
        elif mat_type == TensorProto.FLOAT16:
            op_name, computeType = "CustomGemmFloat16", "CUBLAS_COMPUTE_16F"
        elif mat_type in (TensorProto.FLOAT8E4M3FN, TensorProto.FLOAT8E5M2):
            f8 = True
            op_name, computeType = "CustomGemmFloat8E4M3FN", "CUBLAS_COMPUTE_32F"
            # The float 8 variant outputs float 16 and needs the extra
            # initializer "I" (a float32 array holding 1).
            outputs = [
                make_tensor_value_info("C", TensorProto.FLOAT16, [None, None]),
            ]
            inits.append(from_array(numpy.array([1], dtype=numpy.float32), name="I"))
        else:
            return None

        row_major = 0 if op_name.startswith("CustomGemmFloat") else 1
        fast_accumulation = 1
        # The node name encodes the configuration.
        node_name = (
            f"{mat_type}.{len(node_output)}.{len(outputs)}."
            f"{domain}..{row_major}.."
            f"{fast_accumulation}..{computeType}.."
            f"{f8}"
        )

        node_inputs = ["A", "B"]
        if f8:
            # One empty (skipped) input followed by three times "I".
            node_inputs += ["", "I", "I", "I"]

        nodes = [
            make_node(
                op_name,
                node_inputs,
                node_output,
                alpha=1.0,
                transB=1,
                domain=domain,
                computeType=computeType,
                fastAccumulationMode=fast_accumulation,
                rowMajor=row_major,
                name=node_name,
            )
        ]

    graph = make_graph(nodes, "a", [input_a, input_b], outputs, inits)
    # Types below 16 are the regular ones; the others get opset 19 / IR 9.
    opset, ir = (18, 8) if mat_type < 16 else (19, 9)
    onnx_model = make_model(
        graph,
        opset_imports=[
            make_opsetid("", opset),
            make_opsetid("com.microsoft", 1),
            make_opsetid("onnx_extented.ortops.tutorial.cuda", 1),
        ],
        ir_version=ir,
    )
    check_model(onnx_model)
    return onnx_model


# Shows the model built with the default arguments of create_model.
# onnx_simple_text_plot falls back to ``str`` if onnx_array_api is missing.
print(onnx_simple_text_plot(create_model()))

opset: domain='' version=18
opset: domain='com.microsoft' version=1
opset: domain='onnx_extented.ortops.tutorial.cuda' version=1
input: name='A' type=dtype('float32') shape=['', '']
input: name='B' type=dtype('float32') shape=['', '']
GemmFloat8[com.microsoft](A, B, alpha=1.00, computeType=b'CUBLAS_COMPUTE_32F', fastAccumulationMode=1, rowMajor=1, transB=1) -> C
output: name='C' type=dtype('float32') shape=['', '']

A model to cast into any type. numpy does not support float 8, so onnxruntime is used to cast a float array into any type. It must be called with tensors of type OrtValue.

def create_cast(to, cuda=False):
    """
    Creates a model casting a 2D float32 input ``A`` into type *to*.

    :param to: destination element type (a ``TensorProto`` value)
    :param cuda: if True, the cast result ``Cc`` goes through an additional
        ``MemcpyFromHost`` node producing ``C``
    :return: the ModelProto, validated with ``check_model`` only when *cuda*
        is False
    """
    src = make_tensor_value_info("A", TensorProto.FLOAT, [None, None])
    dst = make_tensor_value_info("C", to, [None, None])

    cast_result = "Cc" if cuda else "C"
    nodes = [make_node("Cast", ["A"], [cast_result], to=to)]
    if cuda:
        nodes.append(make_node("MemcpyFromHost", ["Cc"], ["C"]))

    graph = make_graph(nodes, "a", [src], [dst])
    # Types below 16 are the regular ones; the others get opset 19 / IR 9.
    opset, ir = (18, 8) if to < 16 else (19, 9)
    onnx_model = make_model(
        graph, opset_imports=[make_opsetid("", opset)], ir_version=ir
    )
    if cuda:
        # The checker is skipped because of OpType MemcpyFromHost
        # (presumably unknown to the ONNX checker -- to confirm).
        return onnx_model
    check_model(onnx_model)
    return onnx_model


# Shows the CPU cast model (no MemcpyFromHost node) for float 16.
print(onnx_simple_text_plot(create_cast(TensorProto.FLOAT16)))

opset: domain='' version=18
input: name='A' type=dtype('float32') shape=['', '']
Cast(A, to=10) -> C
output: name='C' type=dtype('float16') shape=['', '']

Performance

The benchmark will run the following configurations.

# Element types, resolved from their names in TensorProto.
types = [getattr(TensorProto, a) for a in script_args.types.split(",")]
# Runtimes to compare.
engine = [InferenceSession, CReferenceEvaluator]
# Each entry is the provider list given to the session.
providers = [
    ["CUDAExecutionProvider", "CPUExecutionProvider"],
    ["CPUExecutionProvider"],
]
# One [M, N, K] list per problem size.
# Dimensions are multiples of 8, otherwise float8 does not work.
dims = [[int(i) for i in line.split(",")] for line in script_args.dims.split(";")]
# Operator domains: custom extension, standard ONNX (""), com.microsoft.
domains = ["onnx_extented.ortops.tutorial.cuda", "", "com.microsoft"]

Let’s cache the matrices involved.

def to_ort_value(m):
    """
    Wraps a numpy array into a C ``OrtValue`` attached to the CPU device
    (default memory, device id 0).

    :param m: numpy array
    :return: the OrtValue built by ``ortvalue_from_numpy``
    """
    cpu_device = C_OrtDevice(C_OrtDevice.cpu(), C_OrtDevice.default_memory(), 0)
    return C_OrtValue.ortvalue_from_numpy(m, cpu_device)


def cached_inputs(dims, types):
    """
    Creates the random matrices used by the benchmark, for every type and
    every shape ``(m, k)``, ``(k, n)``, ``(k, m)`` derived from *dims*.

    A float32 random matrix is converted into the requested type by a
    ``Cast`` model run with onnxruntime. The cast sessions are created once
    per element type and reused for all shapes, and the list of available
    providers is queried only once.

    :param dims: list of triples ``(m, n, k)``
    :param types: list of element types (``TensorProto`` values)
    :return: two dictionaries ``{(type, rows, cols): OrtValue}``. The first
        holds the matrices produced by the CPU cast model, the second those
        produced by the CUDA cast model (left empty when
        ``CUDAExecutionProvider`` is not available). The two are drawn
        independently, they do not hold the same values. Types the CPU
        session cannot cast are skipped in both.
    """
    matrices = {}
    matrices_cuda = {}
    has_cuda_provider = "CUDAExecutionProvider" in get_available_providers()
    # Cast sessions indexed by element type. A None value in cpu_sessions
    # marks a type this version of onnxruntime does not support.
    cpu_sessions = {}
    cuda_sessions = {}

    pbar = tqdm(list(product(dims, types)))
    for dim, tt in pbar:
        m, n, k = dim
        pbar.set_description(f"t={tt} dim={dim}")
        for i, j in [(m, k), (k, n), (k, m)]:
            if (tt, i, j) in matrices:
                continue

            # CPU
            if tt not in cpu_sessions:
                try:
                    cpu_sessions[tt] = InferenceSession(
                        create_cast(tt).SerializeToString(),
                        providers=["CPUExecutionProvider"],
                    )
                except (InvalidGraph, InvalidArgument, NotImplemented):
                    # not supported by this version of onnxruntime
                    cpu_sessions[tt] = None
            sess = cpu_sessions[tt]
            if sess is None:
                continue

            vect = (numpy.random.randn(i, j) * 10).astype(numpy.float32)
            ov = to_ort_value(vect)
            matrices[tt, i, j] = sess._sess.run_with_ort_values(
                {"A": ov}, ["C"], None
            )[0]

            # CUDA
            if not has_cuda_provider:
                continue
            if tt not in cuda_sessions:
                cuda_sessions[tt] = InferenceSession(
                    create_cast(tt, cuda=True).SerializeToString(),
                    providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
                )
            sess = cuda_sessions[tt]
            # A new random matrix is drawn for the CUDA copy.
            vect = (numpy.random.randn(i, j) * 10).astype(numpy.float32)
            ov = to_ort_value(vect)
            matrices_cuda[tt, i, j] = sess._sess.run_with_ort_values(
                {"A": ov}, ["C"], None
            )[0]
    return matrices, matrices_cuda


# Builds all the inputs once; both dictionaries are indexed by
# (type, rows, columns).
matrices, matrices_cuda = cached_inputs(dims, types)
print(f"{len(matrices)} matrices were created.")

  0%|          | 0/28 [00:00<?, ?it/s]
t=17 dim=[32, 32, 32]:   0%|          | 0/28 [00:00<?, ?it/s]
t=1 dim=[32, 32, 32]:   0%|          | 0/28 [00:00<?, ?it/s]
t=10 dim=[32, 32, 32]:   0%|          | 0/28 [00:00<?, ?it/s]
t=10 dim=[32, 32, 32]:  11%|█         | 3/28 [00:00<00:01, 21.29it/s]
t=16 dim=[32, 32, 32]:  11%|█         | 3/28 [00:00<00:01, 21.29it/s]
t=17 dim=[64, 64, 64]:  11%|█         | 3/28 [00:00<00:01, 21.29it/s]
t=1 dim=[64, 64, 64]:  11%|█         | 3/28 [00:00<00:01, 21.29it/s]
t=1 dim=[64, 64, 64]:  21%|██▏       | 6/28 [00:00<00:01, 18.30it/s]
t=10 dim=[64, 64, 64]:  21%|██▏       | 6/28 [00:00<00:01, 18.30it/s]
t=16 dim=[64, 64, 64]:  21%|██▏       | 6/28 [00:00<00:01, 18.30it/s]
t=16 dim=[64, 64, 64]:  29%|██▊       | 8/28 [00:00<00:01, 17.77it/s]
t=17 dim=[128, 128, 128]:  29%|██▊       | 8/28 [00:00<00:01, 17.77it/s]
t=1 dim=[128, 128, 128]:  29%|██▊       | 8/28 [00:00<00:01, 17.77it/s]
t=1 dim=[128, 128, 128]:  36%|███▌      | 10/28 [00:00<00:01, 15.60it/s]
t=10 dim=[128, 128, 128]:  36%|███▌      | 10/28 [00:00<00:01, 15.60it/s]
t=16 dim=[128, 128, 128]:  36%|███▌      | 10/28 [00:00<00:01, 15.60it/s]
t=16 dim=[128, 128, 128]:  43%|████▎     | 12/28 [00:00<00:01, 15.74it/s]
t=17 dim=[256, 256, 256]:  43%|████▎     | 12/28 [00:00<00:01, 15.74it/s]
t=1 dim=[256, 256, 256]:  43%|████▎     | 12/28 [00:00<00:01, 15.74it/s]
t=1 dim=[256, 256, 256]:  50%|█████     | 14/28 [00:00<00:00, 14.86it/s]
t=10 dim=[256, 256, 256]:  50%|█████     | 14/28 [00:00<00:00, 14.86it/s]
t=16 dim=[256, 256, 256]:  50%|█████     | 14/28 [00:00<00:00, 14.86it/s]
t=16 dim=[256, 256, 256]:  57%|█████▋    | 16/28 [00:00<00:00, 15.32it/s]
t=17 dim=[400, 400, 400]:  57%|█████▋    | 16/28 [00:00<00:00, 15.32it/s]
t=1 dim=[400, 400, 400]:  57%|█████▋    | 16/28 [00:01<00:00, 15.32it/s]
t=1 dim=[400, 400, 400]:  64%|██████▍   | 18/28 [00:01<00:00, 14.27it/s]
t=10 dim=[400, 400, 400]:  64%|██████▍   | 18/28 [00:01<00:00, 14.27it/s]
t=16 dim=[400, 400, 400]:  64%|██████▍   | 18/28 [00:01<00:00, 14.27it/s]
t=16 dim=[400, 400, 400]:  71%|███████▏  | 20/28 [00:01<00:00, 13.50it/s]
t=17 dim=[512, 512, 512]:  71%|███████▏  | 20/28 [00:01<00:00, 13.50it/s]
t=1 dim=[512, 512, 512]:  71%|███████▏  | 20/28 [00:01<00:00, 13.50it/s]
t=1 dim=[512, 512, 512]:  79%|███████▊  | 22/28 [00:01<00:00, 13.12it/s]
t=10 dim=[512, 512, 512]:  79%|███████▊  | 22/28 [00:01<00:00, 13.12it/s]
t=16 dim=[512, 512, 512]:  79%|███████▊  | 22/28 [00:01<00:00, 13.12it/s]
t=16 dim=[512, 512, 512]:  86%|████████▌ | 24/28 [00:01<00:00, 12.95it/s]
t=17 dim=[1024, 1024, 1024]:  86%|████████▌ | 24/28 [00:01<00:00, 12.95it/s]
t=1 dim=[1024, 1024, 1024]:  86%|████████▌ | 24/28 [00:01<00:00, 12.95it/s]
t=1 dim=[1024, 1024, 1024]:  93%|█████████▎| 26/28 [00:02<00:00,  8.00it/s]
t=10 dim=[1024, 1024, 1024]:  93%|█████████▎| 26/28 [00:02<00:00,  8.00it/s]
t=16 dim=[1024, 1024, 1024]:  93%|█████████▎| 26/28 [00:02<00:00,  8.00it/s]
t=16 dim=[1024, 1024, 1024]: 100%|██████████| 28/28 [00:02<00:00,  6.44it/s]
t=16 dim=[1024, 1024, 1024]: 100%|██████████| 28/28 [00:02<00:00, 10.91it/s]
28 matrices were created.

Let’s run the benchmark

def rendering_obs(
    obs,
    dim,
    number,
    repeat,
    domain,
    provider,
    internal_time,
    elem_type=None,
    runtime=None,
):
    """
    Completes a measure with the description of the benchmarked
    configuration. *obs* is updated in place and returned.

    :param obs: dictionary returned by ``measure_time``
    :param dim: problem size ``(M, N, K)``
    :param number: value of *number* given to ``measure_time``
    :param repeat: value of *repeat* given to ``measure_time``
    :param domain: operator domain, one of ``""``, ``"com.microsoft"``,
        ``"onnx_extented.ortops.tutorial.cuda"``
    :param provider: list of execution providers, only the first one is kept
    :param internal_time: stored as is in column ``intime``
    :param elem_type: element type (a ``TensorProto`` value); if None, the
        module-level variable ``tt`` set by the benchmark loop is used
    :param runtime: runtime class; if None, the module-level variable
        ``engine`` set by the benchmark loop is used
    :return: *obs*
    :raises KeyError: if the type, the runtime, the domain or the provider
        is not one of the expected values
    """
    # Backward compatibility: existing callers rely on the loop variables.
    if elem_type is None:
        elem_type = tt
    if runtime is None:
        runtime = engine

    stype = {
        TensorProto.FLOAT: "f32",
        TensorProto.FLOAT16: "f16",
        TensorProto.BFLOAT16: "bf16",
        TensorProto.INT8: "i8",
        TensorProto.INT16: "i16",
        TensorProto.INT32: "i32",
        TensorProto.UINT32: "u32",
        TensorProto.FLOAT8E4M3FN: "e4m3fn",
        TensorProto.FLOAT8E5M2: "e5m2",
    }[elem_type]
    # int64 is forced: the product of large dimensions does not fit in
    # 32 bits, which is numpy's default integer on some platforms.
    cost = numpy.prod(dim, dtype=numpy.int64) * 4
    obs.update(
        dict(
            engine={
                "InferenceSession": "ort",
                "CReferenceEvaluator": "np",
                # name seen when CReferenceEvaluator is the fallback class
                "ReferenceEvaluator": "np",
            }[runtime.__name__],
            stype=stype,
            type=f"{stype}",
            M=dim[0],
            N=dim[1],
            K=dim[2],
            cost=cost,
            cost_s=f"{cost}-{dim[0]}x{dim[1]}x{dim[2]}",
            repeat=repeat,
            number=number,
            domain={
                "": "ORT",
                "com.microsoft": "COM",
                "onnx_extented.ortops.tutorial.cuda": "EXT",
            }[domain],
            provider={
                "CPUExecutionProvider": "cpu",
                "CUDAExecutionProvider": "cuda",
            }[provider[0]],
            platform=platform.processor(),
            intime=internal_time,
        )
    )
    return obs


# Session options shared by every InferenceSession of the benchmark.
opts = SessionOptions()
# get_ort_ext_libs returns None when the CUDA extension could not be
# imported (see the fallback definition); otherwise its first item is
# registered as a custom operator library.
r = get_ort_ext_libs()
if r is not None:
    opts.register_custom_ops_library(r[0])


# ``data`` receives one observation per benchmarked configuration,
# ``errors`` the reason why a configuration was skipped.
data = []
errors = []
# The product is materialized before the loop starts: the loop variable
# ``engine`` rebinds the name of the list of engines defined above.
pbar = tqdm(list(product(types, engine, providers, dims, domains)))
for tt, engine, provider, dim, domain in pbar:
    # Only the InferenceSession branch may measure it. It is reset at every
    # iteration so that no value leaks from a previous configuration.
    internal_time = None

    if (
        tt in {TensorProto.FLOAT8E4M3FN, TensorProto.FLOAT8E5M2}
        and properties.get("major", 0) < 9
    ):
        # f8 not available
        if provider[0] == "CPUExecutionProvider":
            continue
        errors.append(
            f"f8 not available, major={properties.get('major', 0)}, "
            f"tt={tt}, provider={provider!r}, domain={domain!r}."
        )
        continue
    elif provider[0] == "CPUExecutionProvider" and max(dim) > 2000:
        # too long
        continue

    # Small problems are fast: they are measured more times.
    if max(dim) <= 200:
        repeat, number = script_args.repeat * 4, script_args.number * 4
    elif max(dim) <= 256:
        repeat, number = script_args.repeat * 2, script_args.number * 2
    else:
        repeat, number = script_args.repeat, script_args.number

    onx = create_model(tt, provider=provider[0], domain=domain)
    if onx is None:
        if provider[0] == "CPUExecutionProvider":
            continue
        errors.append(
            f"No model for tt={tt}, provider={provider!r}, domain={domain!r}."
        )
        continue
    with open(f"plot_bench_gemm_ort_{tt}_{domain}.onnx", "wb") as f:
        f.write(onx.SerializeToString())

    # Keys of the cached inputs: A has shape (K, M), B has shape (K, N).
    k1 = (tt, dim[2], dim[0])
    k2 = (tt, dim[2], dim[1])
    if k1 not in matrices:
        errors.append(f"Key k1={k1!r} not in matrices.")
        continue
    if k2 not in matrices:
        errors.append(f"Key k2={k2!r} not in matrices.")
        continue

    pbar.set_description(f"t={tt} e={engine.__name__} p={provider[0][:4]} dim={dim}")

    if engine == CReferenceEvaluator:
        if (
            domain != ""
            or max(dim) > 256
            or provider != ["CPUExecutionProvider"]
            or tt not in [TensorProto.FLOAT, TensorProto.FLOAT16]
        ):
            # All impossible or slow cases.
            continue
        if tt == TensorProto.FLOAT16 and max(dim) > 50:
            repeat, number = 2, 2

        feeds = {"A": matrices[k1].numpy(), "B": matrices[k2].numpy()}
        sess = engine(onx)
        sess.run(None, feeds)
        obs = measure_time(lambda: sess.run(None, feeds), repeat=repeat, number=number)

    elif engine == InferenceSession:
        if provider[0] not in get_available_providers():
            errors.append(f"provider={provider[0]} is missing")
            continue
        try:
            sess = engine(onx.SerializeToString(), opts, providers=provider)
        except (NotImplemented, InvalidGraph, Fail) as e:
            # not implemented
            # ``engine`` is a class: its own name is recorded, not the name
            # of its metaclass.
            errors.append((tt, engine.__name__, provider, domain, e))
            continue

        the_feeds = (
            {"A": matrices[k1], "B": matrices[k2]}
            if provider == ["CPUExecutionProvider"]
            else {"A": matrices_cuda[k1], "B": matrices_cuda[k2]}
        )
        out_names = ["C"]

        # warmup
        for _ in range(script_args.warmup):
            sess._sess.run_with_ort_values(the_feeds, out_names, None)

        # benchmark
        times = []

        def fct_benchmarked():
            got = sess._sess.run_with_ort_values(the_feeds, out_names, None)
            # A second output, when the model returns one, is collected and
            # averaged below into ``internal_time``.
            if len(got) > 1:
                times.append(got[1])

        obs = measure_time(fct_benchmarked, repeat=repeat, number=number)
        if times:
            np_times = [t.numpy() for t in times]
            internal_time = (sum(np_times) / len(times))[0]

    else:
        errors.append(f"unknown engine={engine}")
        continue

    # improves the rendering
    obs = rendering_obs(obs, dim, number, repeat, domain, provider, internal_time)
    data.append(obs)
    if unit_test_going() and len(data) >= 2:
        break

  0%|          | 0/336 [00:00<?, ?it/s]
t=1 e=InferenceSession p=CUDA dim=[32, 32, 32]:   0%|          | 0/336 [00:00<?, ?it/s]
t=1 e=InferenceSession p=CUDA dim=[32, 32, 32]:  25%|██▌       | 85/336 [00:02<00:08, 28.39it/s]
t=1 e=InferenceSession p=CUDA dim=[32, 32, 32]:  25%|██▌       | 85/336 [00:02<00:08, 28.39it/s]
t=1 e=InferenceSession p=CUDA dim=[32, 32, 32]:  25%|██▌       | 85/336 [00:03<00:08, 28.39it/s]
t=1 e=InferenceSession p=CUDA dim=[64, 64, 64]:  25%|██▌       | 85/336 [00:03<00:08, 28.39it/s]
t=1 e=InferenceSession p=CUDA dim=[64, 64, 64]:  26%|██▌       | 88/336 [00:06<00:21, 11.75it/s]
t=1 e=InferenceSession p=CUDA dim=[64, 64, 64]:  26%|██▌       | 88/336 [00:06<00:21, 11.75it/s]
t=1 e=InferenceSession p=CUDA dim=[64, 64, 64]:  26%|██▌       | 88/336 [00:06<00:21, 11.75it/s]
t=1 e=InferenceSession p=CUDA dim=[64, 64, 64]:  27%|██▋       | 90/336 [00:06<00:21, 11.43it/s]
t=1 e=InferenceSession p=CUDA dim=[128, 128, 128]:  27%|██▋       | 90/336 [00:06<00:21, 11.43it/s]
t=1 e=InferenceSession p=CUDA dim=[128, 128, 128]:  27%|██▋       | 91/336 [00:09<00:41,  5.95it/s]
t=1 e=InferenceSession p=CUDA dim=[128, 128, 128]:  27%|██▋       | 91/336 [00:09<00:41,  5.95it/s]
t=1 e=InferenceSession p=CUDA dim=[128, 128, 128]:  27%|██▋       | 92/336 [00:09<00:41,  5.86it/s]
t=1 e=InferenceSession p=CUDA dim=[128, 128, 128]:  27%|██▋       | 92/336 [00:09<00:41,  5.86it/s]
t=1 e=InferenceSession p=CUDA dim=[256, 256, 256]:  27%|██▋       | 92/336 [00:09<00:41,  5.86it/s]
t=1 e=InferenceSession p=CUDA dim=[256, 256, 256]:  28%|██▊       | 94/336 [00:10<00:46,  5.20it/s]
t=1 e=InferenceSession p=CUDA dim=[256, 256, 256]:  28%|██▊       | 94/336 [00:10<00:46,  5.20it/s]
t=1 e=InferenceSession p=CUDA dim=[256, 256, 256]:  28%|██▊       | 95/336 [00:10<00:45,  5.36it/s]
t=1 e=InferenceSession p=CUDA dim=[256, 256, 256]:  28%|██▊       | 95/336 [00:10<00:45,  5.36it/s]
t=1 e=InferenceSession p=CUDA dim=[400, 400, 400]:  28%|██▊       | 95/336 [00:10<00:45,  5.36it/s]
t=1 e=InferenceSession p=CUDA dim=[400, 400, 400]:  29%|██▉       | 97/336 [00:10<00:41,  5.70it/s]
t=1 e=InferenceSession p=CUDA dim=[400, 400, 400]:  29%|██▉       | 97/336 [00:10<00:41,  5.70it/s]
t=1 e=InferenceSession p=CUDA dim=[400, 400, 400]:  29%|██▉       | 97/336 [00:10<00:41,  5.70it/s]
t=1 e=InferenceSession p=CUDA dim=[512, 512, 512]:  29%|██▉       | 97/336 [00:10<00:41,  5.70it/s]
t=1 e=InferenceSession p=CUDA dim=[512, 512, 512]:  30%|██▉       | 100/336 [00:10<00:37,  6.23it/s]
t=1 e=InferenceSession p=CUDA dim=[512, 512, 512]:  30%|██▉       | 100/336 [00:10<00:37,  6.23it/s]
t=1 e=InferenceSession p=CUDA dim=[512, 512, 512]:  30%|██▉       | 100/336 [00:11<00:37,  6.23it/s]
t=1 e=InferenceSession p=CUDA dim=[1024, 1024, 1024]:  30%|██▉       | 100/336 [00:11<00:37,  6.23it/s]
t=1 e=InferenceSession p=CUDA dim=[1024, 1024, 1024]:  31%|███       | 103/336 [00:11<00:39,  5.97it/s]
t=1 e=InferenceSession p=CUDA dim=[1024, 1024, 1024]:  31%|███       | 103/336 [00:11<00:39,  5.97it/s]
t=1 e=InferenceSession p=CUDA dim=[1024, 1024, 1024]:  31%|███       | 104/336 [00:11<00:41,  5.59it/s]
t=1 e=InferenceSession p=CUDA dim=[1024, 1024, 1024]:  31%|███       | 104/336 [00:11<00:41,  5.59it/s]
t=1 e=InferenceSession p=CPUE dim=[32, 32, 32]:  31%|███       | 104/336 [00:11<00:41,  5.59it/s]
t=1 e=InferenceSession p=CPUE dim=[64, 64, 64]:  31%|███       | 104/336 [00:11<00:41,  5.59it/s]
t=1 e=InferenceSession p=CPUE dim=[128, 128, 128]:  31%|███       | 104/336 [00:11<00:41,  5.59it/s]
t=1 e=InferenceSession p=CPUE dim=[128, 128, 128]:  34%|███▎      | 113/336 [00:11<00:17, 12.65it/s]
t=1 e=InferenceSession p=CPUE dim=[256, 256, 256]:  34%|███▎      | 113/336 [00:11<00:17, 12.65it/s]
t=1 e=InferenceSession p=CPUE dim=[400, 400, 400]:  34%|███▎      | 113/336 [00:11<00:17, 12.65it/s]
t=1 e=InferenceSession p=CPUE dim=[400, 400, 400]:  35%|███▌      | 119/336 [00:12<00:12, 17.01it/s]
t=1 e=InferenceSession p=CPUE dim=[512, 512, 512]:  35%|███▌      | 119/336 [00:12<00:12, 17.01it/s]
t=1 e=InferenceSession p=CPUE dim=[512, 512, 512]:  37%|███▋      | 123/336 [00:12<00:11, 18.70it/s]
t=1 e=InferenceSession p=CPUE dim=[1024, 1024, 1024]:  37%|███▋      | 123/336 [00:12<00:11, 18.70it/s]
t=1 e=CReferenceEvaluator p=CUDA dim=[32, 32, 32]:  37%|███▋      | 123/336 [00:13<00:11, 18.70it/s]
t=1 e=CReferenceEvaluator p=CUDA dim=[32, 32, 32]:  38%|███▊      | 127/336 [00:13<00:25,  8.30it/s]
t=1 e=CReferenceEvaluator p=CUDA dim=[32, 32, 32]:  38%|███▊      | 127/336 [00:13<00:25,  8.30it/s]
t=1 e=CReferenceEvaluator p=CUDA dim=[32, 32, 32]:  38%|███▊      | 127/336 [00:13<00:25,  8.30it/s]
t=1 e=CReferenceEvaluator p=CUDA dim=[64, 64, 64]:  38%|███▊      | 127/336 [00:13<00:25,  8.30it/s]
t=1 e=CReferenceEvaluator p=CUDA dim=[64, 64, 64]:  38%|███▊      | 127/336 [00:13<00:25,  8.30it/s]
t=1 e=CReferenceEvaluator p=CUDA dim=[64, 64, 64]:  38%|███▊      | 127/336 [00:13<00:25,  8.30it/s]
t=1 e=CReferenceEvaluator p=CUDA dim=[128, 128, 128]:  38%|███▊      | 127/336 [00:13<00:25,  8.30it/s]
t=1 e=CReferenceEvaluator p=CUDA dim=[128, 128, 128]:  38%|███▊      | 127/336 [00:13<00:25,  8.30it/s]
t=1 e=CReferenceEvaluator p=CUDA dim=[128, 128, 128]:  38%|███▊      | 127/336 [00:13<00:25,  8.30it/s]
t=1 e=CReferenceEvaluator p=CUDA dim=[256, 256, 256]:  38%|███▊      | 127/336 [00:13<00:25,  8.30it/s]
t=1 e=CReferenceEvaluator p=CUDA dim=[256, 256, 256]:  38%|███▊      | 127/336 [00:13<00:25,  8.30it/s]
t=1 e=CReferenceEvaluator p=CUDA dim=[256, 256, 256]:  38%|███▊      | 127/336 [00:13<00:25,  8.30it/s]
t=1 e=CReferenceEvaluator p=CUDA dim=[400, 400, 400]:  38%|███▊      | 127/336 [00:13<00:25,  8.30it/s]
t=1 e=CReferenceEvaluator p=CUDA dim=[400, 400, 400]:  38%|███▊      | 127/336 [00:13<00:25,  8.30it/s]
t=1 e=CReferenceEvaluator p=CUDA dim=[400, 400, 400]:  38%|███▊      | 127/336 [00:13<00:25,  8.30it/s]
t=1 e=CReferenceEvaluator p=CUDA dim=[512, 512, 512]:  38%|███▊      | 127/336 [00:13<00:25,  8.30it/s]
t=1 e=CReferenceEvaluator p=CUDA dim=[512, 512, 512]:  38%|███▊      | 127/336 [00:13<00:25,  8.30it/s]
t=1 e=CReferenceEvaluator p=CUDA dim=[512, 512, 512]:  38%|███▊      | 127/336 [00:13<00:25,  8.30it/s]
t=1 e=CReferenceEvaluator p=CUDA dim=[1024, 1024, 1024]:  38%|███▊      | 127/336 [00:13<00:25,  8.30it/s]
t=1 e=CReferenceEvaluator p=CUDA dim=[1024, 1024, 1024]:  38%|███▊      | 127/336 [00:13<00:25,  8.30it/s]
t=1 e=CReferenceEvaluator p=CUDA dim=[1024, 1024, 1024]:  38%|███▊      | 127/336 [00:13<00:25,  8.30it/s]
t=1 e=CReferenceEvaluator p=CPUE dim=[32, 32, 32]:  38%|███▊      | 127/336 [00:13<00:25,  8.30it/s]
t=1 e=CReferenceEvaluator p=CPUE dim=[64, 64, 64]:  38%|███▊      | 127/336 [00:13<00:25,  8.30it/s]
t=1 e=CReferenceEvaluator p=CPUE dim=[64, 64, 64]:  45%|████▌     | 152/336 [00:13<00:07, 25.52it/s]
t=1 e=CReferenceEvaluator p=CPUE dim=[128, 128, 128]:  45%|████▌     | 152/336 [00:13<00:07, 25.52it/s]
t=1 e=CReferenceEvaluator p=CPUE dim=[256, 256, 256]:  45%|████▌     | 152/336 [00:20<00:07, 25.52it/s]
t=1 e=CReferenceEvaluator p=CPUE dim=[256, 256, 256]:  48%|████▊     | 160/336 [00:22<00:51,  3.41it/s]
t=1 e=CReferenceEvaluator p=CPUE dim=[400, 400, 400]:  48%|████▊     | 160/336 [00:22<00:51,  3.41it/s]
t=1 e=CReferenceEvaluator p=CPUE dim=[512, 512, 512]:  48%|████▊     | 160/336 [00:22<00:51,  3.41it/s]
t=1 e=CReferenceEvaluator p=CPUE dim=[1024, 1024, 1024]:  48%|████▊     | 160/336 [00:22<00:51,  3.41it/s]
t=10 e=InferenceSession p=CUDA dim=[32, 32, 32]:  48%|████▊     | 160/336 [00:22<00:51,  3.41it/s]
t=10 e=InferenceSession p=CUDA dim=[32, 32, 32]:  50%|█████     | 169/336 [00:25<00:51,  3.22it/s]
t=10 e=InferenceSession p=CUDA dim=[32, 32, 32]:  50%|█████     | 169/336 [00:25<00:51,  3.22it/s]
t=10 e=InferenceSession p=CUDA dim=[32, 32, 32]:  50%|█████     | 169/336 [00:25<00:51,  3.22it/s]
t=10 e=InferenceSession p=CUDA dim=[64, 64, 64]:  50%|█████     | 169/336 [00:25<00:51,  3.22it/s]
t=10 e=InferenceSession p=CUDA dim=[64, 64, 64]:  50%|█████     | 169/336 [00:28<00:51,  3.22it/s]
t=10 e=InferenceSession p=CUDA dim=[64, 64, 64]:  51%|█████▏    | 173/336 [00:28<01:01,  2.67it/s]
t=10 e=InferenceSession p=CUDA dim=[64, 64, 64]:  51%|█████▏    | 173/336 [00:28<01:01,  2.67it/s]
t=10 e=InferenceSession p=CUDA dim=[128, 128, 128]:  51%|█████▏    | 173/336 [00:28<01:01,  2.67it/s]
t=10 e=InferenceSession p=CUDA dim=[128, 128, 128]:  51%|█████▏    | 173/336 [00:30<01:01,  2.67it/s]
t=10 e=InferenceSession p=CUDA dim=[128, 128, 128]:  52%|█████▏    | 176/336 [00:30<01:09,  2.30it/s]
t=10 e=InferenceSession p=CUDA dim=[128, 128, 128]:  52%|█████▏    | 176/336 [00:30<01:09,  2.30it/s]
t=10 e=InferenceSession p=CUDA dim=[256, 256, 256]:  52%|█████▏    | 176/336 [00:30<01:09,  2.30it/s]
t=10 e=InferenceSession p=CUDA dim=[256, 256, 256]:  53%|█████▎    | 178/336 [00:31<01:06,  2.37it/s]
t=10 e=InferenceSession p=CUDA dim=[256, 256, 256]:  53%|█████▎    | 178/336 [00:31<01:06,  2.37it/s]
t=10 e=InferenceSession p=CUDA dim=[256, 256, 256]:  53%|█████▎    | 178/336 [00:31<01:06,  2.37it/s]
t=10 e=InferenceSession p=CUDA dim=[400, 400, 400]:  53%|█████▎    | 178/336 [00:31<01:06,  2.37it/s]
t=10 e=InferenceSession p=CUDA dim=[400, 400, 400]:  54%|█████▍    | 181/336 [00:31<00:56,  2.74it/s]
t=10 e=InferenceSession p=CUDA dim=[400, 400, 400]:  54%|█████▍    | 181/336 [00:31<00:56,  2.74it/s]
t=10 e=InferenceSession p=CUDA dim=[400, 400, 400]:  54%|█████▍    | 181/336 [00:31<00:56,  2.74it/s]
t=10 e=InferenceSession p=CUDA dim=[512, 512, 512]:  54%|█████▍    | 181/336 [00:31<00:56,  2.74it/s]
t=10 e=InferenceSession p=CUDA dim=[512, 512, 512]:  55%|█████▍    | 184/336 [00:32<00:48,  3.13it/s]
t=10 e=InferenceSession p=CUDA dim=[512, 512, 512]:  55%|█████▍    | 184/336 [00:32<00:48,  3.13it/s]
t=10 e=InferenceSession p=CUDA dim=[512, 512, 512]:  55%|█████▍    | 184/336 [00:32<00:48,  3.13it/s]
t=10 e=InferenceSession p=CUDA dim=[1024, 1024, 1024]:  55%|█████▍    | 184/336 [00:32<00:48,  3.13it/s]
t=10 e=InferenceSession p=CUDA dim=[1024, 1024, 1024]:  56%|█████▌    | 187/336 [00:34<01:00,  2.47it/s]
t=10 e=InferenceSession p=CUDA dim=[1024, 1024, 1024]:  56%|█████▌    | 187/336 [00:34<01:00,  2.47it/s]
t=10 e=InferenceSession p=CUDA dim=[1024, 1024, 1024]:  56%|█████▌    | 188/336 [00:34<00:56,  2.62it/s]
t=10 e=InferenceSession p=CUDA dim=[1024, 1024, 1024]:  56%|█████▌    | 188/336 [00:34<00:56,  2.62it/s]
t=10 e=InferenceSession p=CPUE dim=[32, 32, 32]:  56%|█████▌    | 188/336 [00:34<00:56,  2.62it/s]
t=10 e=InferenceSession p=CPUE dim=[64, 64, 64]:  56%|█████▌    | 188/336 [00:34<00:56,  2.62it/s]
t=10 e=InferenceSession p=CPUE dim=[128, 128, 128]:  56%|█████▌    | 188/336 [00:34<00:56,  2.62it/s]
t=10 e=InferenceSession p=CPUE dim=[128, 128, 128]:  59%|█████▊    | 197/336 [00:34<00:26,  5.31it/s]
t=10 e=InferenceSession p=CPUE dim=[256, 256, 256]:  59%|█████▊    | 197/336 [00:34<00:26,  5.31it/s]
t=10 e=InferenceSession p=CPUE dim=[256, 256, 256]:  60%|█████▉    | 200/336 [00:35<00:22,  6.10it/s]
t=10 e=InferenceSession p=CPUE dim=[400, 400, 400]:  60%|█████▉    | 200/336 [00:35<00:22,  6.10it/s]
t=10 e=InferenceSession p=CPUE dim=[400, 400, 400]:  60%|██████    | 203/336 [00:35<00:17,  7.49it/s]
t=10 e=InferenceSession p=CPUE dim=[512, 512, 512]:  60%|██████    | 203/336 [00:35<00:17,  7.49it/s]
t=10 e=InferenceSession p=CPUE dim=[512, 512, 512]:  61%|██████▏   | 206/336 [00:35<00:15,  8.45it/s]
t=10 e=InferenceSession p=CPUE dim=[1024, 1024, 1024]:  61%|██████▏   | 206/336 [00:35<00:15,  8.45it/s]
t=10 e=InferenceSession p=CPUE dim=[1024, 1024, 1024]:  62%|██████▏   | 209/336 [00:36<00:24,  5.23it/s]
t=10 e=CReferenceEvaluator p=CUDA dim=[32, 32, 32]:  62%|██████▏   | 209/336 [00:36<00:24,  5.23it/s]
t=10 e=CReferenceEvaluator p=CUDA dim=[32, 32, 32]:  62%|██████▏   | 209/336 [00:36<00:24,  5.23it/s]
t=10 e=CReferenceEvaluator p=CUDA dim=[32, 32, 32]:  62%|██████▏   | 209/336 [00:36<00:24,  5.23it/s]
t=10 e=CReferenceEvaluator p=CUDA dim=[64, 64, 64]:  62%|██████▏   | 209/336 [00:36<00:24,  5.23it/s]
t=10 e=CReferenceEvaluator p=CUDA dim=[64, 64, 64]:  62%|██████▏   | 209/336 [00:36<00:24,  5.23it/s]
t=10 e=CReferenceEvaluator p=CUDA dim=[64, 64, 64]:  62%|██████▏   | 209/336 [00:36<00:24,  5.23it/s]
t=10 e=CReferenceEvaluator p=CUDA dim=[128, 128, 128]:  62%|██████▏   | 209/336 [00:36<00:24,  5.23it/s]
t=10 e=CReferenceEvaluator p=CUDA dim=[128, 128, 128]:  62%|██████▏   | 209/336 [00:36<00:24,  5.23it/s]
t=10 e=CReferenceEvaluator p=CUDA dim=[128, 128, 128]:  62%|██████▏   | 209/336 [00:36<00:24,  5.23it/s]
t=10 e=CReferenceEvaluator p=CUDA dim=[256, 256, 256]:  62%|██████▏   | 209/336 [00:36<00:24,  5.23it/s]
t=10 e=CReferenceEvaluator p=CUDA dim=[256, 256, 256]:  62%|██████▏   | 209/336 [00:36<00:24,  5.23it/s]
t=10 e=CReferenceEvaluator p=CUDA dim=[256, 256, 256]:  62%|██████▏   | 209/336 [00:36<00:24,  5.23it/s]
t=10 e=CReferenceEvaluator p=CUDA dim=[400, 400, 400]:  62%|██████▏   | 209/336 [00:36<00:24,  5.23it/s]
t=10 e=CReferenceEvaluator p=CUDA dim=[400, 400, 400]:  62%|██████▏   | 209/336 [00:36<00:24,  5.23it/s]
t=10 e=CReferenceEvaluator p=CUDA dim=[400, 400, 400]:  62%|██████▏   | 209/336 [00:36<00:24,  5.23it/s]
t=10 e=CReferenceEvaluator p=CUDA dim=[512, 512, 512]:  62%|██████▏   | 209/336 [00:36<00:24,  5.23it/s]
t=10 e=CReferenceEvaluator p=CUDA dim=[512, 512, 512]:  62%|██████▏   | 209/336 [00:36<00:24,  5.23it/s]
t=10 e=CReferenceEvaluator p=CUDA dim=[512, 512, 512]:  62%|██████▏   | 209/336 [00:36<00:24,  5.23it/s]
t=10 e=CReferenceEvaluator p=CUDA dim=[1024, 1024, 1024]:  62%|██████▏   | 209/336 [00:36<00:24,  5.23it/s]
t=10 e=CReferenceEvaluator p=CUDA dim=[1024, 1024, 1024]:  62%|██████▏   | 209/336 [00:36<00:24,  5.23it/s]
t=10 e=CReferenceEvaluator p=CUDA dim=[1024, 1024, 1024]:  62%|██████▏   | 209/336 [00:36<00:24,  5.23it/s]
t=10 e=CReferenceEvaluator p=CPUE dim=[32, 32, 32]:  62%|██████▏   | 209/336 [00:36<00:24,  5.23it/s]
t=10 e=CReferenceEvaluator p=CPUE dim=[32, 32, 32]:  69%|██████▉   | 233/336 [00:36<00:05, 18.05it/s]
t=10 e=CReferenceEvaluator p=CPUE dim=[64, 64, 64]:  69%|██████▉   | 233/336 [00:36<00:05, 18.05it/s]
t=10 e=CReferenceEvaluator p=CPUE dim=[128, 128, 128]:  69%|██████▉   | 233/336 [00:36<00:05, 18.05it/s]
t=10 e=CReferenceEvaluator p=CPUE dim=[128, 128, 128]:  71%|███████   | 239/336 [00:36<00:04, 20.96it/s]
t=10 e=CReferenceEvaluator p=CPUE dim=[256, 256, 256]:  71%|███████   | 239/336 [00:36<00:04, 20.96it/s]
t=10 e=CReferenceEvaluator p=CPUE dim=[256, 256, 256]:  73%|███████▎  | 244/336 [00:37<00:05, 15.52it/s]
t=10 e=CReferenceEvaluator p=CPUE dim=[400, 400, 400]:  73%|███████▎  | 244/336 [00:37<00:05, 15.52it/s]
t=10 e=CReferenceEvaluator p=CPUE dim=[512, 512, 512]:  73%|███████▎  | 244/336 [00:37<00:05, 15.52it/s]
t=10 e=CReferenceEvaluator p=CPUE dim=[1024, 1024, 1024]:  73%|███████▎  | 244/336 [00:37<00:05, 15.52it/s]
t=16 e=InferenceSession p=CUDA dim=[32, 32, 32]:  73%|███████▎  | 244/336 [00:37<00:05, 15.52it/s]
t=16 e=InferenceSession p=CUDA dim=[32, 32, 32]:  76%|███████▌  | 254/336 [00:37<00:03, 21.63it/s]
t=16 e=InferenceSession p=CUDA dim=[32, 32, 32]:  76%|███████▌  | 254/336 [00:37<00:03, 21.63it/s]
t=16 e=InferenceSession p=CUDA dim=[64, 64, 64]:  76%|███████▌  | 254/336 [00:37<00:03, 21.63it/s]
t=16 e=InferenceSession p=CUDA dim=[64, 64, 64]:  76%|███████▌  | 254/336 [00:37<00:03, 21.63it/s]
t=16 e=InferenceSession p=CUDA dim=[64, 64, 64]:  77%|███████▋  | 259/336 [00:37<00:03, 22.53it/s]
t=16 e=InferenceSession p=CUDA dim=[128, 128, 128]:  77%|███████▋  | 259/336 [00:37<00:03, 22.53it/s]
t=16 e=InferenceSession p=CUDA dim=[128, 128, 128]:  77%|███████▋  | 259/336 [00:38<00:03, 22.53it/s]
t=16 e=InferenceSession p=CUDA dim=[256, 256, 256]:  77%|███████▋  | 259/336 [00:38<00:03, 22.53it/s]
t=16 e=InferenceSession p=CUDA dim=[256, 256, 256]:  78%|███████▊  | 263/336 [00:38<00:03, 19.60it/s]
t=16 e=InferenceSession p=CUDA dim=[256, 256, 256]:  78%|███████▊  | 263/336 [00:38<00:03, 19.60it/s]
t=16 e=InferenceSession p=CUDA dim=[400, 400, 400]:  78%|███████▊  | 263/336 [00:38<00:03, 19.60it/s]
t=16 e=InferenceSession p=CUDA dim=[400, 400, 400]:  78%|███████▊  | 263/336 [00:38<00:03, 19.60it/s]
t=16 e=InferenceSession p=CUDA dim=[512, 512, 512]:  78%|███████▊  | 263/336 [00:38<00:03, 19.60it/s]
t=16 e=InferenceSession p=CUDA dim=[512, 512, 512]:  80%|████████  | 269/336 [00:38<00:02, 22.72it/s]
t=16 e=InferenceSession p=CUDA dim=[512, 512, 512]:  80%|████████  | 269/336 [00:38<00:02, 22.72it/s]
t=16 e=InferenceSession p=CUDA dim=[1024, 1024, 1024]:  80%|████████  | 269/336 [00:38<00:02, 22.72it/s]
t=16 e=InferenceSession p=CUDA dim=[1024, 1024, 1024]:  80%|████████  | 269/336 [00:38<00:02, 22.72it/s]
t=16 e=InferenceSession p=CUDA dim=[1024, 1024, 1024]:  81%|████████▏ | 273/336 [00:38<00:03, 19.46it/s]
t=16 e=InferenceSession p=CPUE dim=[32, 32, 32]:  81%|████████▏ | 273/336 [00:38<00:03, 19.46it/s]
t=16 e=InferenceSession p=CPUE dim=[64, 64, 64]:  81%|████████▏ | 273/336 [00:38<00:03, 19.46it/s]
t=16 e=InferenceSession p=CPUE dim=[128, 128, 128]:  81%|████████▏ | 273/336 [00:38<00:03, 19.46it/s]
t=16 e=InferenceSession p=CPUE dim=[256, 256, 256]:  81%|████████▏ | 273/336 [00:38<00:03, 19.46it/s]
t=16 e=InferenceSession p=CPUE dim=[400, 400, 400]:  81%|████████▏ | 273/336 [00:38<00:03, 19.46it/s]
t=16 e=InferenceSession p=CPUE dim=[512, 512, 512]:  81%|████████▏ | 273/336 [00:38<00:03, 19.46it/s]
t=16 e=InferenceSession p=CPUE dim=[1024, 1024, 1024]:  81%|████████▏ | 273/336 [00:38<00:03, 19.46it/s]
t=16 e=CReferenceEvaluator p=CUDA dim=[32, 32, 32]:  81%|████████▏ | 273/336 [00:38<00:03, 19.46it/s]
t=16 e=CReferenceEvaluator p=CUDA dim=[32, 32, 32]:  81%|████████▏ | 273/336 [00:38<00:03, 19.46it/s]
t=16 e=CReferenceEvaluator p=CUDA dim=[64, 64, 64]:  81%|████████▏ | 273/336 [00:38<00:03, 19.46it/s]
t=16 e=CReferenceEvaluator p=CUDA dim=[64, 64, 64]:  81%|████████▏ | 273/336 [00:38<00:03, 19.46it/s]
t=16 e=CReferenceEvaluator p=CUDA dim=[128, 128, 128]:  81%|████████▏ | 273/336 [00:38<00:03, 19.46it/s]
t=16 e=CReferenceEvaluator p=CUDA dim=[128, 128, 128]:  81%|████████▏ | 273/336 [00:38<00:03, 19.46it/s]
t=16 e=CReferenceEvaluator p=CUDA dim=[256, 256, 256]:  81%|████████▏ | 273/336 [00:38<00:03, 19.46it/s]
t=16 e=CReferenceEvaluator p=CUDA dim=[256, 256, 256]:  81%|████████▏ | 273/336 [00:38<00:03, 19.46it/s]
t=16 e=CReferenceEvaluator p=CUDA dim=[400, 400, 400]:  81%|████████▏ | 273/336 [00:38<00:03, 19.46it/s]
t=16 e=CReferenceEvaluator p=CUDA dim=[400, 400, 400]:  81%|████████▏ | 273/336 [00:38<00:03, 19.46it/s]
t=16 e=CReferenceEvaluator p=CUDA dim=[512, 512, 512]:  81%|████████▏ | 273/336 [00:38<00:03, 19.46it/s]
t=16 e=CReferenceEvaluator p=CUDA dim=[512, 512, 512]:  81%|████████▏ | 273/336 [00:38<00:03, 19.46it/s]
t=16 e=CReferenceEvaluator p=CUDA dim=[1024, 1024, 1024]:  81%|████████▏ | 273/336 [00:38<00:03, 19.46it/s]
t=16 e=CReferenceEvaluator p=CUDA dim=[1024, 1024, 1024]:  81%|████████▏ | 273/336 [00:38<00:03, 19.46it/s]
t=16 e=CReferenceEvaluator p=CPUE dim=[32, 32, 32]:  81%|████████▏ | 273/336 [00:38<00:03, 19.46it/s]
t=16 e=CReferenceEvaluator p=CPUE dim=[64, 64, 64]:  81%|████████▏ | 273/336 [00:38<00:03, 19.46it/s]
t=16 e=CReferenceEvaluator p=CPUE dim=[128, 128, 128]:  81%|████████▏ | 273/336 [00:38<00:03, 19.46it/s]
t=16 e=CReferenceEvaluator p=CPUE dim=[256, 256, 256]:  81%|████████▏ | 273/336 [00:38<00:03, 19.46it/s]
t=16 e=CReferenceEvaluator p=CPUE dim=[400, 400, 400]:  81%|████████▏ | 273/336 [00:38<00:03, 19.46it/s]
t=16 e=CReferenceEvaluator p=CPUE dim=[512, 512, 512]:  81%|████████▏ | 273/336 [00:38<00:03, 19.46it/s]
t=16 e=CReferenceEvaluator p=CPUE dim=[1024, 1024, 1024]:  81%|████████▏ | 273/336 [00:38<00:03, 19.46it/s]
t=16 e=CReferenceEvaluator p=CPUE dim=[1024, 1024, 1024]: 100%|██████████| 336/336 [00:38<00:00,  8.67it/s]

Results#

# Gather every measurement accumulated in `data` into a single table.
df = DataFrame(data)
# Full table, every column, for inspection in a spreadsheet.
df.to_excel("plot_bench_gemm_ort.xlsx")
# The CSV is written exactly once: a lighter version without the min/max
# timings and the cost columns, and without the row index.
df.drop(["min_exec", "max_exec", "cost_s", "cost"], axis=1).to_csv(
    "plot_bench_gemm_ort.csv", index=False
)
# Transposed preview of the first rows: one column per run, one row per field.
print(df.head().T)
df

                            0                1                 2                 3                    4
average              0.004122         0.000284          0.004653          0.000396              0.00421
deviation            0.000578         0.000109          0.000309          0.000153             0.000347
min_exec             0.003197         0.000182          0.004109          0.000242             0.003513
max_exec             0.005962         0.000624          0.005381          0.000805              0.00502
repeat                     40               40                40                40                   40
number                     16               16                16                16                   16
ttime                0.164877         0.011356          0.186115          0.015823              0.16838
context_size               64               64                64                64                   64
warmup_time          0.004435         0.000249          0.004432          0.001938             0.004779
engine                    ort              ort               ort               ort                  ort
stype                     f32              f32               f32               f32                  f32
type                      f32              f32               f32               f32                  f32
M                          32               32                64                64                  128
N                          32               32                64                64                  128
K                          32               32                64                64                  128
cost                   131072           131072           1048576           1048576              8388608
cost_s        131072-32x32x32  131072-32x32x32  1048576-64x64x64  1048576-64x64x64  8388608-128x128x128
domain                    EXT              ORT               EXT               ORT                  EXT
provider                 cuda             cuda              cuda              cuda                 cuda
platform               x86_64           x86_64            x86_64            x86_64               x86_64
intime                   None             None              None              None                 None

	average	deviation	min_exec	max_exec	repeat	number	ttime	context_size	warmup_time	engine	stype	type	M	N	K	cost	cost_s	domain	provider	platform	intime
0	0.004122	0.000578	0.003197	0.005962	40	16	0.164877	64	0.004435	ort	f32	f32	32	32	32	131072	131072-32x32x32	EXT	cuda	x86_64	None
1	0.000284	0.000109	0.000182	0.000624	40	16	0.011356	64	0.000249	ort	f32	f32	32	32	32	131072	131072-32x32x32	ORT	cuda	x86_64	None
2	0.004653	0.000309	0.004109	0.005381	40	16	0.186115	64	0.004432	ort	f32	f32	64	64	64	1048576	1048576-64x64x64	EXT	cuda	x86_64	None
3	0.000396	0.000153	0.000242	0.000805	40	16	0.015823	64	0.001938	ort	f32	f32	64	64	64	1048576	1048576-64x64x64	ORT	cuda	x86_64	None
4	0.004210	0.000347	0.003513	0.005020	40	16	0.168380	64	0.004779	ort	f32	f32	128	128	128	8388608	8388608-128x128x128	EXT	cuda	x86_64	None
5	0.000344	0.000101	0.000256	0.000593	40	16	0.013747	64	0.000579	ort	f32	f32	128	128	128	8388608	8388608-128x128x128	ORT	cuda	x86_64	None
6	0.004425	0.000512	0.003926	0.006452	20	8	0.088494	64	0.004685	ort	f32	f32	256	256	256	67108864	67108864-256x256x256	EXT	cuda	x86_64	None
7	0.000601	0.000073	0.000534	0.000808	20	8	0.012018	64	0.000724	ort	f32	f32	256	256	256	67108864	67108864-256x256x256	ORT	cuda	x86_64	None
8	0.004817	0.000283	0.004376	0.005421	10	4	0.048173	64	0.004500	ort	f32	f32	400	400	400	256000000	256000000-400x400x400	EXT	cuda	x86_64	None
9	0.001246	0.000080	0.001101	0.001412	10	4	0.012464	64	0.001223	ort	f32	f32	400	400	400	256000000	256000000-400x400x400	ORT	cuda	x86_64	None
10	0.005571	0.000256	0.005223	0.006130	10	4	0.055710	64	0.006282	ort	f32	f32	512	512	512	536870912	536870912-512x512x512	EXT	cuda	x86_64	None
11	0.001722	0.000050	0.001610	0.001771	10	4	0.017221	64	0.001733	ort	f32	f32	512	512	512	536870912	536870912-512x512x512	ORT	cuda	x86_64	None
12	0.009730	0.000635	0.008817	0.011191	10	4	0.097297	64	0.012024	ort	f32	f32	1024	1024	1024	4294967296	4294967296-1024x1024x1024	EXT	cuda	x86_64	None
13	0.005438	0.000074	0.005334	0.005576	10	4	0.054380	64	0.005687	ort	f32	f32	1024	1024	1024	4294967296	4294967296-1024x1024x1024	ORT	cuda	x86_64	None
14	0.000021	0.000007	0.000013	0.000050	40	16	0.000842	64	0.000046	ort	f32	f32	32	32	32	131072	131072-32x32x32	ORT	cpu	x86_64	None
15	0.000027	0.000003	0.000025	0.000040	40	16	0.001075	64	0.000050	ort	f32	f32	64	64	64	1048576	1048576-64x64x64	ORT	cpu	x86_64	None
16	0.000067	0.000008	0.000051	0.000103	40	16	0.002692	64	0.000086	ort	f32	f32	128	128	128	8388608	8388608-128x128x128	ORT	cpu	x86_64	None
17	0.000359	0.000025	0.000327	0.000415	20	8	0.007174	64	0.000390	ort	f32	f32	256	256	256	67108864	67108864-256x256x256	ORT	cpu	x86_64	None
18	0.001319	0.000038	0.001229	0.001375	10	4	0.013193	64	0.001518	ort	f32	f32	400	400	400	256000000	256000000-400x400x400	ORT	cpu	x86_64	None
19	0.003123	0.000393	0.002809	0.004208	10	4	0.031229	64	0.002939	ort	f32	f32	512	512	512	536870912	536870912-512x512x512	ORT	cpu	x86_64	None
20	0.025415	0.002176	0.023589	0.030808	10	4	0.254145	64	0.021144	ort	f32	f32	1024	1024	1024	4294967296	4294967296-1024x1024x1024	ORT	cpu	x86_64	None
21	0.000063	0.000027	0.000044	0.000147	40	16	0.002539	64	0.000118	np	f32	f32	32	32	32	131072	131072-32x32x32	ORT	cpu	x86_64	None
22	0.000093	0.000026	0.000062	0.000147	40	16	0.003701	64	0.000162	np	f32	f32	64	64	64	1048576	1048576-64x64x64	ORT	cpu	x86_64	None
23	0.010302	0.003517	0.002135	0.020524	40	16	0.412060	64	0.006489	np	f32	f32	128	128	128	8388608	8388608-128x128x128	ORT	cpu	x86_64	None
24	0.012554	0.004106	0.003891	0.020194	20	8	0.251079	64	0.008421	np	f32	f32	256	256	256	67108864	67108864-256x256x256	ORT	cpu	x86_64	None
25	0.004686	0.001919	0.003153	0.010099	40	16	0.187434	64	0.006409	ort	f16	f16	32	32	32	131072	131072-32x32x32	EXT	cuda	x86_64	None
26	0.000230	0.000072	0.000163	0.000445	40	16	0.009192	64	0.000258	ort	f16	f16	32	32	32	131072	131072-32x32x32	ORT	cuda	x86_64	None
27	0.004055	0.000301	0.003511	0.005197	40	16	0.162212	64	0.003966	ort	f16	f16	64	64	64	1048576	1048576-64x64x64	EXT	cuda	x86_64	None
28	0.000198	0.000033	0.000148	0.000309	40	16	0.007937	64	0.000213	ort	f16	f16	64	64	64	1048576	1048576-64x64x64	ORT	cuda	x86_64	None
29	0.003477	0.000289	0.002972	0.004156	40	16	0.139098	64	0.003759	ort	f16	f16	128	128	128	8388608	8388608-128x128x128	EXT	cuda	x86_64	None
30	0.000246	0.000036	0.000186	0.000325	40	16	0.009837	64	0.000261	ort	f16	f16	128	128	128	8388608	8388608-128x128x128	ORT	cuda	x86_64	None
31	0.003870	0.000276	0.003438	0.004474	20	8	0.077403	64	0.004728	ort	f16	f16	256	256	256	67108864	67108864-256x256x256	EXT	cuda	x86_64	None
32	0.000379	0.000068	0.000330	0.000664	20	8	0.007587	64	0.000377	ort	f16	f16	256	256	256	67108864	67108864-256x256x256	ORT	cuda	x86_64	None
33	0.006882	0.000220	0.006523	0.007184	10	4	0.068824	64	0.006844	ort	f16	f16	400	400	400	256000000	256000000-400x400x400	EXT	cuda	x86_64	None
34	0.000638	0.000037	0.000583	0.000698	10	4	0.006383	64	0.000637	ort	f16	f16	400	400	400	256000000	256000000-400x400x400	ORT	cuda	x86_64	None
35	0.009471	0.000303	0.008982	0.009824	10	4	0.094706	64	0.008904	ort	f16	f16	512	512	512	536870912	536870912-512x512x512	EXT	cuda	x86_64	None
36	0.001042	0.000068	0.000920	0.001182	10	4	0.010416	64	0.001072	ort	f16	f16	512	512	512	536870912	536870912-512x512x512	ORT	cuda	x86_64	None
37	0.042594	0.000164	0.042252	0.042910	10	4	0.425938	64	0.042064	ort	f16	f16	1024	1024	1024	4294967296	4294967296-1024x1024x1024	EXT	cuda	x86_64	None
38	0.003411	0.000140	0.003192	0.003616	10	4	0.034109	64	0.003368	ort	f16	f16	1024	1024	1024	4294967296	4294967296-1024x1024x1024	ORT	cuda	x86_64	None
39	0.000041	0.000024	0.000021	0.000166	40	16	0.001637	64	0.000078	ort	f16	f16	32	32	32	131072	131072-32x32x32	ORT	cpu	x86_64	None
40	0.000056	0.000029	0.000043	0.000215	40	16	0.002251	64	0.000106	ort	f16	f16	64	64	64	1048576	1048576-64x64x64	ORT	cpu	x86_64	None
41	0.000431	0.000237	0.000175	0.001358	40	16	0.017229	64	0.000311	ort	f16	f16	128	128	128	8388608	8388608-128x128x128	ORT	cpu	x86_64	None
42	0.001192	0.000112	0.000872	0.001378	20	8	0.023834	64	0.001916	ort	f16	f16	256	256	256	67108864	67108864-256x256x256	ORT	cpu	x86_64	None
43	0.002023	0.000433	0.001702	0.003095	10	4	0.020235	64	0.002813	ort	f16	f16	400	400	400	256000000	256000000-400x400x400	ORT	cpu	x86_64	None
44	0.004482	0.000501	0.003942	0.005915	10	4	0.044815	64	0.004937	ort	f16	f16	512	512	512	536870912	536870912-512x512x512	ORT	cpu	x86_64	None
45	0.025175	0.003117	0.021383	0.031425	10	4	0.251748	64	0.027772	ort	f16	f16	1024	1024	1024	4294967296	4294967296-1024x1024x1024	ORT	cpu	x86_64	None
46	0.000329	0.000023	0.000302	0.000420	40	16	0.013148	64	0.000382	np	f16	f16	32	32	32	131072	131072-32x32x32	ORT	cpu	x86_64	None
47	0.002264	0.000371	0.001892	0.002635	2	2	0.004528	64	0.002859	np	f16	f16	64	64	64	1048576	1048576-64x64x64	ORT	cpu	x86_64	None
48	0.016651	0.000124	0.016527	0.016776	2	2	0.033303	64	0.014219	np	f16	f16	128	128	128	8388608	8388608-128x128x128	ORT	cpu	x86_64	None
49	0.105103	0.006122	0.098980	0.111225	2	2	0.210205	64	0.112516	np	f16	f16	256	256	256	67108864	67108864-256x256x256	ORT	cpu	x86_64	None
50	0.000213	0.000040	0.000167	0.000326	40	16	0.008521	64	0.000278	ort	bf16	bf16	32	32	32	131072	131072-32x32x32	ORT	cuda	x86_64	None
51	0.000265	0.000058	0.000190	0.000411	40	16	0.010607	64	0.000350	ort	bf16	bf16	64	64	64	1048576	1048576-64x64x64	ORT	cuda	x86_64	None
52	0.000340	0.000088	0.000244	0.000601	40	16	0.013611	64	0.000331	ort	bf16	bf16	128	128	128	8388608	8388608-128x128x128	ORT	cuda	x86_64	None
53	0.000442	0.000044	0.000386	0.000540	20	8	0.008845	64	0.000473	ort	bf16	bf16	256	256	256	67108864	67108864-256x256x256	ORT	cuda	x86_64	None
54	0.000965	0.000049	0.000884	0.001049	10	4	0.009645	64	0.001136	ort	bf16	bf16	400	400	400	256000000	256000000-400x400x400	ORT	cuda	x86_64	None
55	0.001387	0.000085	0.001258	0.001537	10	4	0.013867	64	0.001349	ort	bf16	bf16	512	512	512	536870912	536870912-512x512x512	ORT	cuda	x86_64	None
56	0.005542	0.000873	0.004499	0.007643	10	4	0.055419	64	0.006246	ort	bf16	bf16	1024	1024	1024	4294967296	4294967296-1024x1024x1024	ORT	cuda	x86_64	None

The errors#

# Print every failure collected during the benchmark as "rank/total-message",
# with ranks starting at 1.
n_errors = len(errors)
for rank, err in enumerate(errors, start=1):
    print(f"{rank}/{n_errors}-{err}")

1/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extented.ortops.tutorial.cuda'.
2/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain=''.
3/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='com.microsoft'.
4/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extented.ortops.tutorial.cuda'.
5/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain=''.
6/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='com.microsoft'.
7/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extented.ortops.tutorial.cuda'.
8/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain=''.
9/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='com.microsoft'.
10/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extented.ortops.tutorial.cuda'.
11/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain=''.
12/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='com.microsoft'.
13/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extented.ortops.tutorial.cuda'.
14/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain=''.
15/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='com.microsoft'.
16/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extented.ortops.tutorial.cuda'.
17/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain=''.
18/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='com.microsoft'.
19/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extented.ortops.tutorial.cuda'.
20/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain=''.
21/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='com.microsoft'.
22/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extented.ortops.tutorial.cuda'.
23/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain=''.
24/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='com.microsoft'.
25/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extented.ortops.tutorial.cuda'.
26/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain=''.
27/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='com.microsoft'.
28/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extented.ortops.tutorial.cuda'.
29/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain=''.
30/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='com.microsoft'.
31/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extented.ortops.tutorial.cuda'.
32/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain=''.
33/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='com.microsoft'.
34/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extented.ortops.tutorial.cuda'.
35/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain=''.
36/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='com.microsoft'.
37/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extented.ortops.tutorial.cuda'.
38/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain=''.
39/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='com.microsoft'.
40/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extented.ortops.tutorial.cuda'.
41/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain=''.
42/84-f8 not available, major=6, tt=17, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='com.microsoft'.
43/84-(1, 'type', ['CUDAExecutionProvider', 'CPUExecutionProvider'], 'com.microsoft', InvalidGraph('[ONNXRuntimeError] : 10 : INVALID_GRAPH : This is an invalid model. In Node, ("1.1.1.com.microsoft..1..1..CUBLAS_COMPUTE_32F..False", GemmFloat8, "com.microsoft", -1) : ("A": tensor(float),"B": tensor(float),) -> ("C": tensor(float),) , Error Unrecognized attribute: rowMajor for operator GemmFloat8'))
44/84-(1, 'type', ['CUDAExecutionProvider', 'CPUExecutionProvider'], 'com.microsoft', InvalidGraph('[ONNXRuntimeError] : 10 : INVALID_GRAPH : This is an invalid model. In Node, ("1.1.1.com.microsoft..1..1..CUBLAS_COMPUTE_32F..False", GemmFloat8, "com.microsoft", -1) : ("A": tensor(float),"B": tensor(float),) -> ("C": tensor(float),) , Error Unrecognized attribute: rowMajor for operator GemmFloat8'))
45/84-(1, 'type', ['CUDAExecutionProvider', 'CPUExecutionProvider'], 'com.microsoft', InvalidGraph('[ONNXRuntimeError] : 10 : INVALID_GRAPH : This is an invalid model. In Node, ("1.1.1.com.microsoft..1..1..CUBLAS_COMPUTE_32F..False", GemmFloat8, "com.microsoft", -1) : ("A": tensor(float),"B": tensor(float),) -> ("C": tensor(float),) , Error Unrecognized attribute: rowMajor for operator GemmFloat8'))
46/84-(1, 'type', ['CUDAExecutionProvider', 'CPUExecutionProvider'], 'com.microsoft', InvalidGraph('[ONNXRuntimeError] : 10 : INVALID_GRAPH : This is an invalid model. In Node, ("1.1.1.com.microsoft..1..1..CUBLAS_COMPUTE_32F..False", GemmFloat8, "com.microsoft", -1) : ("A": tensor(float),"B": tensor(float),) -> ("C": tensor(float),) , Error Unrecognized attribute: rowMajor for operator GemmFloat8'))
47/84-(1, 'type', ['CUDAExecutionProvider', 'CPUExecutionProvider'], 'com.microsoft', InvalidGraph('[ONNXRuntimeError] : 10 : INVALID_GRAPH : This is an invalid model. In Node, ("1.1.1.com.microsoft..1..1..CUBLAS_COMPUTE_32F..False", GemmFloat8, "com.microsoft", -1) : ("A": tensor(float),"B": tensor(float),) -> ("C": tensor(float),) , Error Unrecognized attribute: rowMajor for operator GemmFloat8'))
48/84-(1, 'type', ['CUDAExecutionProvider', 'CPUExecutionProvider'], 'com.microsoft', InvalidGraph('[ONNXRuntimeError] : 10 : INVALID_GRAPH : This is an invalid model. In Node, ("1.1.1.com.microsoft..1..1..CUBLAS_COMPUTE_32F..False", GemmFloat8, "com.microsoft", -1) : ("A": tensor(float),"B": tensor(float),) -> ("C": tensor(float),) , Error Unrecognized attribute: rowMajor for operator GemmFloat8'))
49/84-(1, 'type', ['CUDAExecutionProvider', 'CPUExecutionProvider'], 'com.microsoft', InvalidGraph('[ONNXRuntimeError] : 10 : INVALID_GRAPH : This is an invalid model. In Node, ("1.1.1.com.microsoft..1..1..CUBLAS_COMPUTE_32F..False", GemmFloat8, "com.microsoft", -1) : ("A": tensor(float),"B": tensor(float),) -> ("C": tensor(float),) , Error Unrecognized attribute: rowMajor for operator GemmFloat8'))
50/84-(10, 'type', ['CUDAExecutionProvider', 'CPUExecutionProvider'], 'com.microsoft', InvalidGraph('[ONNXRuntimeError] : 10 : INVALID_GRAPH : This is an invalid model. In Node, ("10.1.1.com.microsoft..1..1..CUBLAS_COMPUTE_32F..False", GemmFloat8, "com.microsoft", -1) : ("A": tensor(float16),"B": tensor(float16),) -> ("C": tensor(float16),) , Error Unrecognized attribute: rowMajor for operator GemmFloat8'))
51/84-(10, 'type', ['CUDAExecutionProvider', 'CPUExecutionProvider'], 'com.microsoft', InvalidGraph('[ONNXRuntimeError] : 10 : INVALID_GRAPH : This is an invalid model. In Node, ("10.1.1.com.microsoft..1..1..CUBLAS_COMPUTE_32F..False", GemmFloat8, "com.microsoft", -1) : ("A": tensor(float16),"B": tensor(float16),) -> ("C": tensor(float16),) , Error Unrecognized attribute: rowMajor for operator GemmFloat8'))
52/84-(10, 'type', ['CUDAExecutionProvider', 'CPUExecutionProvider'], 'com.microsoft', InvalidGraph('[ONNXRuntimeError] : 10 : INVALID_GRAPH : This is an invalid model. In Node, ("10.1.1.com.microsoft..1..1..CUBLAS_COMPUTE_32F..False", GemmFloat8, "com.microsoft", -1) : ("A": tensor(float16),"B": tensor(float16),) -> ("C": tensor(float16),) , Error Unrecognized attribute: rowMajor for operator GemmFloat8'))
53/84-(10, 'type', ['CUDAExecutionProvider', 'CPUExecutionProvider'], 'com.microsoft', InvalidGraph('[ONNXRuntimeError] : 10 : INVALID_GRAPH : This is an invalid model. In Node, ("10.1.1.com.microsoft..1..1..CUBLAS_COMPUTE_32F..False", GemmFloat8, "com.microsoft", -1) : ("A": tensor(float16),"B": tensor(float16),) -> ("C": tensor(float16),) , Error Unrecognized attribute: rowMajor for operator GemmFloat8'))
54/84-(10, 'type', ['CUDAExecutionProvider', 'CPUExecutionProvider'], 'com.microsoft', InvalidGraph('[ONNXRuntimeError] : 10 : INVALID_GRAPH : This is an invalid model. In Node, ("10.1.1.com.microsoft..1..1..CUBLAS_COMPUTE_32F..False", GemmFloat8, "com.microsoft", -1) : ("A": tensor(float16),"B": tensor(float16),) -> ("C": tensor(float16),) , Error Unrecognized attribute: rowMajor for operator GemmFloat8'))
55/84-(10, 'type', ['CUDAExecutionProvider', 'CPUExecutionProvider'], 'com.microsoft', InvalidGraph('[ONNXRuntimeError] : 10 : INVALID_GRAPH : This is an invalid model. In Node, ("10.1.1.com.microsoft..1..1..CUBLAS_COMPUTE_32F..False", GemmFloat8, "com.microsoft", -1) : ("A": tensor(float16),"B": tensor(float16),) -> ("C": tensor(float16),) , Error Unrecognized attribute: rowMajor for operator GemmFloat8'))
56/84-(10, 'type', ['CUDAExecutionProvider', 'CPUExecutionProvider'], 'com.microsoft', InvalidGraph('[ONNXRuntimeError] : 10 : INVALID_GRAPH : This is an invalid model. In Node, ("10.1.1.com.microsoft..1..1..CUBLAS_COMPUTE_32F..False", GemmFloat8, "com.microsoft", -1) : ("A": tensor(float16),"B": tensor(float16),) -> ("C": tensor(float16),) , Error Unrecognized attribute: rowMajor for operator GemmFloat8'))
57/84-No model for tt=16, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extented.ortops.tutorial.cuda'.
58/84-(16, 'type', ['CUDAExecutionProvider', 'CPUExecutionProvider'], 'com.microsoft', InvalidGraph('[ONNXRuntimeError] : 10 : INVALID_GRAPH : This is an invalid model. In Node, ("16.1.1.com.microsoft..1..1..CUBLAS_COMPUTE_32F..False", GemmFloat8, "com.microsoft", -1) : ("A": tensor(bfloat16),"B": tensor(bfloat16),) -> ("C": tensor(bfloat16),) , Error Unrecognized attribute: rowMajor for operator GemmFloat8'))
59/84-No model for tt=16, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extented.ortops.tutorial.cuda'.
60/84-(16, 'type', ['CUDAExecutionProvider', 'CPUExecutionProvider'], 'com.microsoft', InvalidGraph('[ONNXRuntimeError] : 10 : INVALID_GRAPH : This is an invalid model. In Node, ("16.1.1.com.microsoft..1..1..CUBLAS_COMPUTE_32F..False", GemmFloat8, "com.microsoft", -1) : ("A": tensor(bfloat16),"B": tensor(bfloat16),) -> ("C": tensor(bfloat16),) , Error Unrecognized attribute: rowMajor for operator GemmFloat8'))
61/84-No model for tt=16, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extented.ortops.tutorial.cuda'.
62/84-(16, 'type', ['CUDAExecutionProvider', 'CPUExecutionProvider'], 'com.microsoft', InvalidGraph('[ONNXRuntimeError] : 10 : INVALID_GRAPH : This is an invalid model. In Node, ("16.1.1.com.microsoft..1..1..CUBLAS_COMPUTE_32F..False", GemmFloat8, "com.microsoft", -1) : ("A": tensor(bfloat16),"B": tensor(bfloat16),) -> ("C": tensor(bfloat16),) , Error Unrecognized attribute: rowMajor for operator GemmFloat8'))
63/84-No model for tt=16, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extented.ortops.tutorial.cuda'.
64/84-(16, 'type', ['CUDAExecutionProvider', 'CPUExecutionProvider'], 'com.microsoft', InvalidGraph('[ONNXRuntimeError] : 10 : INVALID_GRAPH : This is an invalid model. In Node, ("16.1.1.com.microsoft..1..1..CUBLAS_COMPUTE_32F..False", GemmFloat8, "com.microsoft", -1) : ("A": tensor(bfloat16),"B": tensor(bfloat16),) -> ("C": tensor(bfloat16),) , Error Unrecognized attribute: rowMajor for operator GemmFloat8'))
65/84-No model for tt=16, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extented.ortops.tutorial.cuda'.
66/84-(16, 'type', ['CUDAExecutionProvider', 'CPUExecutionProvider'], 'com.microsoft', InvalidGraph('[ONNXRuntimeError] : 10 : INVALID_GRAPH : This is an invalid model. In Node, ("16.1.1.com.microsoft..1..1..CUBLAS_COMPUTE_32F..False", GemmFloat8, "com.microsoft", -1) : ("A": tensor(bfloat16),"B": tensor(bfloat16),) -> ("C": tensor(bfloat16),) , Error Unrecognized attribute: rowMajor for operator GemmFloat8'))
67/84-No model for tt=16, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extented.ortops.tutorial.cuda'.
68/84-(16, 'type', ['CUDAExecutionProvider', 'CPUExecutionProvider'], 'com.microsoft', InvalidGraph('[ONNXRuntimeError] : 10 : INVALID_GRAPH : This is an invalid model. In Node, ("16.1.1.com.microsoft..1..1..CUBLAS_COMPUTE_32F..False", GemmFloat8, "com.microsoft", -1) : ("A": tensor(bfloat16),"B": tensor(bfloat16),) -> ("C": tensor(bfloat16),) , Error Unrecognized attribute: rowMajor for operator GemmFloat8'))
69/84-No model for tt=16, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extented.ortops.tutorial.cuda'.
70/84-(16, 'type', ['CUDAExecutionProvider', 'CPUExecutionProvider'], 'com.microsoft', InvalidGraph('[ONNXRuntimeError] : 10 : INVALID_GRAPH : This is an invalid model. In Node, ("16.1.1.com.microsoft..1..1..CUBLAS_COMPUTE_32F..False", GemmFloat8, "com.microsoft", -1) : ("A": tensor(bfloat16),"B": tensor(bfloat16),) -> ("C": tensor(bfloat16),) , Error Unrecognized attribute: rowMajor for operator GemmFloat8'))
71/84-(16, 'type', ['CPUExecutionProvider'], '', NotImplemented("[ONNXRuntimeError] : 9 : NOT_IMPLEMENTED : Could not find an implementation for Gemm(13) node with name ''"))
72/84-(16, 'type', ['CPUExecutionProvider'], '', NotImplemented("[ONNXRuntimeError] : 9 : NOT_IMPLEMENTED : Could not find an implementation for Gemm(13) node with name ''"))
73/84-(16, 'type', ['CPUExecutionProvider'], '', NotImplemented("[ONNXRuntimeError] : 9 : NOT_IMPLEMENTED : Could not find an implementation for Gemm(13) node with name ''"))
74/84-(16, 'type', ['CPUExecutionProvider'], '', NotImplemented("[ONNXRuntimeError] : 9 : NOT_IMPLEMENTED : Could not find an implementation for Gemm(13) node with name ''"))
75/84-(16, 'type', ['CPUExecutionProvider'], '', NotImplemented("[ONNXRuntimeError] : 9 : NOT_IMPLEMENTED : Could not find an implementation for Gemm(13) node with name ''"))
76/84-(16, 'type', ['CPUExecutionProvider'], '', NotImplemented("[ONNXRuntimeError] : 9 : NOT_IMPLEMENTED : Could not find an implementation for Gemm(13) node with name ''"))
77/84-(16, 'type', ['CPUExecutionProvider'], '', NotImplemented("[ONNXRuntimeError] : 9 : NOT_IMPLEMENTED : Could not find an implementation for Gemm(13) node with name ''"))
78/84-No model for tt=16, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extented.ortops.tutorial.cuda'.
79/84-No model for tt=16, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extented.ortops.tutorial.cuda'.
80/84-No model for tt=16, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extented.ortops.tutorial.cuda'.
81/84-No model for tt=16, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extented.ortops.tutorial.cuda'.
82/84-No model for tt=16, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extented.ortops.tutorial.cuda'.
83/84-No model for tt=16, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extented.ortops.tutorial.cuda'.
84/84-No model for tt=16, provider=['CUDAExecutionProvider', 'CPUExecutionProvider'], domain='onnx_extented.ortops.tutorial.cuda'.

Summary#

# One row per cost value, one column per (provider, type, domain, engine)
# combination; cells are aggregated with pivot_table's default function.
piv = df.pivot_table(
    values=["average", "intime"],
    index=["cost"],
    columns=["provider", "type", "domain", "engine"],
)
# Export with "cost" as a regular column instead of as the index.
flat_piv = piv.reset_index(drop=False)
flat_piv.to_excel("plot_bench_gemm_ort_summary.xlsx")
flat_piv.to_csv("plot_bench_gemm_ort_summary.csv")


print("summary")
print(piv)
piv

summary
             average
provider         cpu                                    cuda
type             f16                 f32                bf16       f16                 f32
domain           ORT                 ORT                 ORT       EXT       ORT       EXT       ORT
engine            np       ort        np       ort       ort       ort       ort       ort       ort
cost
131072      0.000329  0.000041  0.000063  0.000021  0.000213  0.004686  0.000230  0.004122  0.000284
1048576     0.002264  0.000056  0.000093  0.000027  0.000265  0.004055  0.000198  0.004653  0.000396
8388608     0.016651  0.000431  0.010302  0.000067  0.000340  0.003477  0.000246  0.004210  0.000344
67108864    0.105103  0.001192  0.012554  0.000359  0.000442  0.003870  0.000379  0.004425  0.000601
256000000        NaN  0.002023       NaN  0.001319  0.000965  0.006882  0.000638  0.004817  0.001246
536870912        NaN  0.004482       NaN  0.003123  0.001387  0.009471  0.001042  0.005571  0.001722
4294967296       NaN  0.025175       NaN  0.025415  0.005542  0.042594  0.003411  0.009730  0.005438

	average
provider	cpu				cuda
type	f16		f32		bf16	f16		f32
domain	ORT		ORT		ORT	EXT	ORT	EXT	ORT
engine	np	ort	np	ort	ort	ort	ort	ort	ort
cost
131072	0.000329	0.000041	0.000063	0.000021	0.000213	0.004686	0.000230	0.004122	0.000284
1048576	0.002264	0.000056	0.000093	0.000027	0.000265	0.004055	0.000198	0.004653	0.000396
8388608	0.016651	0.000431	0.010302	0.000067	0.000340	0.003477	0.000246	0.004210	0.000344
67108864	0.105103	0.001192	0.012554	0.000359	0.000442	0.003870	0.000379	0.004425	0.000601
256000000	NaN	0.002023	NaN	0.001319	0.000965	0.006882	0.000638	0.004817	0.001246
536870912	NaN	0.004482	NaN	0.003123	0.001387	0.009471	0.001042	0.005571	0.001722
4294967296	NaN	0.025175	NaN	0.025415	0.005542	0.042594	0.003411	0.009730	0.005438

The same summary, with row labels that also show the three dimensions.

# Pivot the measurements using the "cost_s" label as the row index.
summary_columns = ["provider", "type", "domain", "engine"]
pivs = pivot_table(
    data=df,
    values=["average", "intime"],
    index=["cost_s"],
    columns=summary_columns,
)
print(pivs)

                            average
provider                        cpu                                    cuda
type                            f16                 f32                bf16       f16                 f32
domain                          ORT                 ORT                 ORT       EXT       ORT       EXT       ORT
engine                           np       ort        np       ort       ort       ort       ort       ort       ort
cost_s
1048576-64x64x64           0.002264  0.000056  0.000093  0.000027  0.000265  0.004055  0.000198  0.004653  0.000396
131072-32x32x32            0.000329  0.000041  0.000063  0.000021  0.000213  0.004686  0.000230  0.004122  0.000284
256000000-400x400x400           NaN  0.002023       NaN  0.001319  0.000965  0.006882  0.000638  0.004817  0.001246
4294967296-1024x1024x1024       NaN  0.025175       NaN  0.025415  0.005542  0.042594  0.003411  0.009730  0.005438
536870912-512x512x512           NaN  0.004482       NaN  0.003123  0.001387  0.009471  0.001042  0.005571  0.001722
67108864-256x256x256       0.105103  0.001192  0.012554  0.000359  0.000442  0.003870  0.000379  0.004425  0.000601
8388608-128x128x128        0.016651  0.000431  0.010302  0.000067  0.000340  0.003477  0.000246  0.004210  0.000344

Plot of the results.

# Restrict the data to the listed types and to rows whose engine is "ort".
type_mask = df.type.isin({"f32", "f16", "bf16", "e4m3fn", "e5m2"})
engine_mask = df.engine.isin({"ort"})
dfi = df[type_mask & engine_mask]
pivi = pivot_table(
    data=dfi,
    values="average",
    index=["cost"],
    columns=["type", "domain", "provider", "engine"],
)

# Both graphs use logarithmic scales on the two axes.
log_scales = dict(logx=True, logy=True)

fig, ax = plt.subplots(1, 2, figsize=(12, 6))
# Left: the full summary table.
piv.plot(ax=ax[0], title="Gemm performance\nlower is better", **log_scales)
# Right: the filtered table, drawn only when it has at least one row.
if len(pivi) > 0:
    pivi.plot(
        ax=ax[1],
        title=f"Gemm performance ORT\n{platform.processor()}",
        **log_scales,
    )
fig.tight_layout()
fig.savefig("plot_bench_gemm_ort.png")

Gemm performance lower is better, Gemm performance ORT x86_64

Total running time of the script: (0 minutes 44.327 seconds)

Gallery generated by Sphinx-Gallery