"""
Profiles an execution
=====================
The datas should be saved with script ``dort_bench.py`` and option ``--export <something>``.
::
python -m experimental_experiment.torch_bench.dort_bench_profile --help
Example, run llama model with onnxrt backend on cuda.
::
python -m experimental_experiment.torch_bench.dort_bench_profile \\
--model model.onnx --inputs model.onnx.pkl
"""
def main():
    """
    Main function for command line
    ``python -m experimental_experiment.torch_bench.dort_bench_profile``.
    """
    from experimental_experiment.args import get_parsed_args

    args = get_parsed_args(
        "experimental_experiment.torch_bench.dort_bench_profile",
        description=__doc__,
        model=("model.onnx", "model to load"),
        inputs=("model.onnx.pkl", "inputs for the model"),
        profile=(0, "runs the profiling"),
        rewrite=(0, "rewrites the model again"),
        debug=(0, "runs a backend to debug"),
        verbose=(0, "verbosity level"),
        repeat=5,
        warmup=5,
        expose="model,inputs,warmup,repeat,profile,rewrite,debug,verbose",
    )
    import pickle
    import time
    import numpy as np
    import onnx
    import matplotlib.pyplot as plt
    import torch
    from torch._C import _from_dlpack
    from onnxruntime import InferenceSession, SessionOptions, RunOptions
    from onnxruntime.capi import _pybind_state as ORTC
    from experimental_experiment.torch_dynamo.fast_backend import (
        _run_onnx_session_with_ortvaluevector,
    )
    from experimental_experiment.convert.convert_helper import optimize_model_proto_oxs
    from experimental_experiment.torch_dynamo.backend_helper import get_dimensions
print(f"-- loading inputs {args.model}")
with open(args.inputs, "rb") as f:
inputs = pickle.load(f)
print("-- done")
    assert isinstance(inputs, list), f"Unexpected type {type(inputs)} for {args.inputs}"
    assert len(inputs) == 3, f"Unexpected length {len(inputs)} for {args.inputs}"
    print(f"input_names={inputs[0]}")
    print(f"output_names={inputs[2]}")
    max_device = -1
    for i, t in enumerate(inputs[1]):
        if isinstance(t, torch.Tensor):
            print(f"input {i}: device={t.get_device()} dtype={t.dtype} shape={t.shape}")
            max_device = max(t.get_device(), max_device)
        else:
            print(f"input {i}: type={type(t)}")
    providers = (
        [("CUDAExecutionProvider", {"device_id": max_device}), ("CPUExecutionProvider", {})]
        if max_device >= 0
        else ["CPUExecutionProvider"]
    )
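
    # Profiling is done by onnxruntime itself and enabled through the session options.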
    sess_options = SessionOptions()
    if args.profile in (1, "1"):
        sess_options.enable_profiling = True
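
    # The script calls torch.cuda.synchronize itself around every run,
    # so onnxruntime does not need to synchronize the execution providers.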
    run_options = RunOptions()
    run_options.add_run_config_entry("disable_synchronize_execution_providers", "1")
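
    # With --rewrite 1, the model is optimized again with
    # ``optimize_model_proto_oxs`` and saved as ``<model>.rewrite.onnx``
    # before being benchmarked.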
    if args.rewrite in (1, "1"):
        model_model = args.model.replace(".onnx", ".rewrite.onnx")
        print(f"-- optimize again into {model_model}")
        proto = onnx.load(args.model)
        new_proto = optimize_model_proto_oxs(proto, verbose=args.verbose)
        onnx.save(new_proto, model_model)
        print("-- done")
    else:
        model_model = args.model
print(f"-- loading model {model_model}")
onnx_model = onnx.load(model_model)
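    # ``get_dimensions`` marks which inputs and outputs are dimensions
    # rather than tensors.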
    is_dimension_in, is_dimension_out = get_dimensions(onnx_model)
    sess = InferenceSession(model_model, sess_options, providers=providers)
    print("-- done")
    TORCH_DTYPE_TO_NUMPY_DTYPE = {
        torch.float16: np.float16,
        torch.float32: np.float32,
        torch.float64: np.float64,
        torch.uint8: np.uint8,
        torch.int8: np.int8,
        torch.int16: np.int16,
        torch.int32: np.int32,
        torch.int64: np.longlong,
        torch.bool: np.bool_,
    }
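    # One OrtDevice for CPU (key -1) and one for every visible CUDA device.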
    DEVICES = {-1: ORTC.OrtDevice(ORTC.OrtDevice.cpu(), ORTC.OrtDevice.default_memory(), 0)}
    for i in range(torch.cuda.device_count()):
        DEVICES[i] = ORTC.OrtDevice(ORTC.OrtDevice.cuda(), ORTC.OrtDevice.default_memory(), i)

    input_names, output_names = inputs[0], inputs[2]
    inputs = inputs[1]
    is_cuda = max_device >= 0
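
    # With --debug 1, the model first runs on the python reference evaluator
    # with a high verbosity to spot a failing node.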
    if args.debug:
        print("-- debugging")
        from experimental_experiment.reference import ExtendedReferenceEvaluator

        ref = ExtendedReferenceEvaluator(model_model, verbose=10)
        feeds = dict(
            zip(
                input_names,
                [
                    (
                        t.detach().cpu().numpy()
                        if isinstance(t, torch.Tensor)
                        else np.array([int(t)], dtype=np.int64)
                    )
                    for t in inputs
                ],
            )
        )
        ref.run(None, feeds)
        print("-- end debugging")
print(f"-- warmup: {args.warmup}")
begin = time.perf_counter()
for i in range(args.warmup):
if is_cuda:
torch.cuda.synchronize()
res = _run_onnx_session_with_ortvaluevector(
ORTC.OrtValueVector,
_from_dlpack,
TORCH_DTYPE_TO_NUMPY_DTYPE,
DEVICES,
run_options,
sess,
input_names,
inputs,
output_names,
is_dimension_in=is_dimension_in,
is_dimension_out=is_dimension_out,
)
if is_cuda:
torch.cuda.synchronize()
if i == 0:
for ti, t in enumerate(res):
if isinstance(t, torch.Tensor):
print(
f" output {ti}: device={t.get_device()} "
f"dtype={t.dtype} - shape={t.shape}"
)
elif isinstance(t, torch.SymInt):
print(f" output {ti}: dimension {t}")
elif isinstance(t, torch.SymFloat):
print(f" output {ti}: dimensiof {t}")
warmup_time = time.perf_counter() - begin
print(f"-- done: warmup time {warmup_time}")
print(f"-- measure: {args.repeat}")
times = []
for _ in range(args.repeat):
if is_cuda:
torch.cuda.synchronize()
begin = time.perf_counter()
res = _run_onnx_session_with_ortvaluevector(
ORTC.OrtValueVector,
_from_dlpack,
TORCH_DTYPE_TO_NUMPY_DTYPE,
DEVICES,
run_options,
sess,
input_names,
inputs,
output_names,
is_dimension_in=is_dimension_in,
is_dimension_out=is_dimension_out,
)
if is_cuda:
torch.cuda.synchronize()
d = time.perf_counter()
times.append(d - begin)
print(f"-- times: {np.mean(times)} - {times}")
    if args.profile in (1, "1"):
        from onnx_extended.tools.js_profile import (
            js_profile_to_dataframe,
            plot_ort_profile,
            plot_ort_profile_timeline,
        )

        def _align(s, n):
            if len(s) >= n:
                return s[:n]
            return s + " " * (n - len(s))

        prof = sess.end_profiling()
        print(f"-- profiling name {prof}")
        onx = onnx.load(model_model)
        n_nodes = len(onx.graph.node)
        n_unique_nodes = len(set(n.name for n in onx.graph.node))
        print(
            "\n".join(
                f"{_align(n.op_type, 16)} - {n.input} -> {n.output}" for n in onx.graph.node
            )
        )

        # first graph: aggregated profile
        df = js_profile_to_dataframe(prof, first_it_out=True)
        df.to_csv(f"{model_model}.csv", errors="ignore")
        df.to_excel(f"{model_model}.xlsx")
        assert set(df["it==0"]) == {0, 1}
        for v in set(df["it==0"]):
            dfv = df[df["it==0"] == v]
            vs = "after" if v == 0 else "warmup"
            fig, ax = plt.subplots(1, 2, figsize=(10, max(5, n_unique_nodes // 12)))
            plot_ort_profile(
                dfv, ax[0], ax[1], f"profiling {vs} {n_nodes} nodes\n{model_model}"
            )
            fig.tight_layout()
            fig.savefig(f"{model_model}_{vs}.png")

        # second graph: timeline
        fig, ax = plt.subplots(1, 1, figsize=(5, max(5, n_nodes)))
        iteration = args.repeat - 2
        plot_ort_profile_timeline(
            df,
            ax,
            iteration=iteration,
            title=f"profiling it={iteration} {n_nodes} nodes\n{model_model}",
        )
        fig.tight_layout()
        fig.savefig(f"{model_model}_{iteration}_timeline.png")
if __name__ == "__main__":
main()