Note

Go to the end to download the full example code.

201: Evaluate different ways to export a torch model to ONNX¶

The example evaluates the performance of onnxruntime of a simple torch model after it was converted into ONNX through different processes:

TorchScript-based ONNX Exporter, let’s call it script
TorchDynamo-based ONNX Exporter, let’s call it dynamo
if available, the previous model but optimized, dynopt
a custom exporter cus_p0, this exporter supports a very limited set of models; like dynamo, it relies on torch.fx, but the design is closer to what tensorflow-onnx does.
the same exporter but unused nodes were removed and constants were folded, cus_p2

To run the script:

python _doc/examples/plot_torch_export --help

The script takes around 12 minutes with the larger models.

Some helpers¶

from experimental_experiment.args import get_parsed_args


# Arguments of the example: the scenario (size of the model), the number of
# warmup and repeat iterations, and the maximum measuring time.
script_args = get_parsed_args(
    "plot_torch_export",
    description=__doc__,
    # Available scenarios with their description.
    scenarios={
        "small": "small model to test",
        "middle": "55Mb model",
        "large": "1Gb model",
    },
    warmup=5,
    repeat=5,
    # (default value, help message)
    maxtime=(
        2,
        "maximum time to run a model to measure the computation time, "
        "it is 0.1 when scenario is small",
    ),
    # NOTE(review): presumably the list of options exposed on the command
    # line -- confirm against get_parsed_args.
    expose="scenarios,repeat,warmup",
)


import contextlib
import itertools
import os
import platform
import pprint
import multiprocessing
import time
import cProfile
import pstats
import io
import warnings
import logging
from pstats import SortKey

# onnxruntime is mandatory for this example: when it cannot be imported,
# the script prints a message and exits with status 0 instead of failing.
try:
    with warnings.catch_warnings():
        # Warnings raised while importing onnxruntime are ignored.
        warnings.simplefilter("ignore")
        import onnxruntime

        # True when onnxruntime lists the CUDA execution provider.
        has_cuda = "CUDAExecutionProvider" in onnxruntime.get_available_providers()
except ImportError:
    print("onnxruntime not available.")
    import sys

    sys.exit(0)

import numpy as np
import matplotlib.pyplot as plt
import pandas
import onnx
from onnx_array_api.profiling import profile2graph
import torch
from torch import nn
import torch.nn.functional as F
import experimental_experiment
from experimental_experiment.torch_interpreter import to_onnx
from experimental_experiment.xbuilder import OptimizationOptions
from experimental_experiment.plotting.memory import memory_peak_plot
from experimental_experiment.ext_test_case import measure_time, get_figure
from experimental_experiment.memory_peak import start_spying_on
from experimental_experiment.ext_test_case import unit_test_going
from experimental_experiment.helpers import pretty_onnx
from tqdm import tqdm

# CUDA is used only if onnxruntime supports it and torch sees at least one device.
has_cuda = has_cuda and torch.cuda.device_count() > 0
# Disables every logging call of severity ERROR and below.
logging.disable(logging.ERROR)


def system_info():
    """Describe the machine running the example.

    Returns a dictionary holding the processor name and the number of cores.
    CUDA information (availability, device count, name and capability of the
    current device) is added as long as torch is able to return it.
    """
    info = {
        "processor": platform.processor(),
        "cores": multiprocessing.cpu_count(),
    }
    try:
        n_devices = torch.cuda.device_count()
        info["cuda"] = int(n_devices > 0)
        info["cuda_count"] = n_devices
        info["cuda_name"] = torch.cuda.get_device_name()
        info["cuda_capa"] = torch.cuda.get_device_capability()
    except (RuntimeError, AssertionError):
        # CUDA is not available: keep whatever was collected so far.
        pass
    return info


# Displays the description of the machine running the example.
pprint.pprint(system_info())

{'cores': 20,
 'cuda': 1,
 'cuda_capa': (8, 9),
 'cuda_count': 1,
 'cuda_name': 'NVIDIA GeForce RTX 4060 Laptop GPU',
 'processor': 'x86_64'}

Script arguments

# Unit tests always run the cheapest configuration. Otherwise, only the
# small scenario (the default one) shortens the measuring time.
if unit_test_going():
    script_args.warmup = 1
    script_args.repeat = 1
    script_args.maxtime = 0.1
    script_args.scenario = "small"
elif script_args.scenario in (None, "small"):
    script_args.maxtime = 0.1

print(f"scenario={script_args.scenario or 'small'}")
for option in ("warmup", "repeat", "maxtime"):
    print(f"{option}={getattr(script_args, option)}")

scenario=small
warmup=5
repeat=5
maxtime=0.1

The model¶

A simple model to convert.

class MyModelClass(nn.Module):
    """Small convolutional network whose size depends on a scenario.

    Two convolutions followed by max pooling feed a stack of linear layers
    ending with 10 outputs. Supported scenarios:

    * ``None`` or ``"small"``: small model to test,
    * ``"middle"``: wider convolution and first linear layer,
    * ``"large"``: same as middle plus 13 hidden ``4096 x 4096`` linear
      layers declared one by one.

    :param scenario: name of the scenario, ``None`` means ``"small"``
    :raises ValueError: if the scenario is not one of the supported values
    """

    def __init__(self, scenario=script_args.scenario):
        super().__init__()
        if scenario == "middle":
            self.large = False
            self.conv1 = nn.Conv2d(1, 128, 5)
            self.conv2 = nn.Conv2d(128, 16, 5)
            # 13456 = 16 * 29 * 29, flattened output for a 128x128 input
            self.fc1 = nn.Linear(13456, 1024)
            # not read by forward, kept to leave the attributes unchanged
            self.fcs = []
            self.fc2 = nn.Linear(1024, 128)
            self.fc3 = nn.Linear(128, 10)
        elif scenario in (None, "small"):
            self.large = False
            self.conv1 = nn.Conv2d(1, 16, 5)
            self.conv2 = nn.Conv2d(16, 16, 5)
            # 16 = 16 * 1 * 1, flattened output for a 16x16 input
            self.fc1 = nn.Linear(16, 512)
            # not read by forward, kept to leave the attributes unchanged
            self.fcs = []
            self.fc2 = nn.Linear(512, 128)
            self.fc3 = nn.Linear(128, 10)
        elif scenario == "large":
            # None is already handled by the small scenario above, this
            # branch only ever matches "large".
            self.large = True
            self.conv1 = nn.Conv2d(1, 128, 5)
            self.conv2 = nn.Conv2d(128, 16, 5)
            self.fc1 = nn.Linear(13456, 4096)
            # torch script does not support loops.
            self.fca = nn.Linear(4096, 4096)
            self.fcb = nn.Linear(4096, 4096)
            self.fcc = nn.Linear(4096, 4096)
            self.fcd = nn.Linear(4096, 4096)
            self.fce = nn.Linear(4096, 4096)
            self.fcf = nn.Linear(4096, 4096)
            self.fcg = nn.Linear(4096, 4096)
            self.fch = nn.Linear(4096, 4096)
            self.fci = nn.Linear(4096, 4096)
            self.fck = nn.Linear(4096, 4096)
            self.fcl = nn.Linear(4096, 4096)
            self.fcm = nn.Linear(4096, 4096)
            self.fcn = nn.Linear(4096, 4096)
            # end of the unfolded loop.
            self.fc2 = nn.Linear(4096, 128)
            self.fc3 = nn.Linear(128, 10)
        else:
            raise ValueError(f"Unsupported scenario={scenario!r}.")

    def forward(self, x):
        """Run the network on ``x`` and return the output of the last layer.

        No activation is applied after the last linear layer.
        """
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        # flattens every dimension but the first one
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        if self.large:
            # loop
            x = F.relu(self.fca(x))
            x = F.relu(self.fcb(x))
            x = F.relu(self.fcc(x))
            x = F.relu(self.fcd(x))
            x = F.relu(self.fce(x))
            x = F.relu(self.fcf(x))
            x = F.relu(self.fcg(x))
            x = F.relu(self.fch(x))
            x = F.relu(self.fci(x))
            x = F.relu(self.fck(x))
            x = F.relu(self.fcl(x))
            x = F.relu(self.fcm(x))
            x = F.relu(self.fcn(x))
            # end of the loop
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


def create_model_and_input(scenario=script_args.scenario):
    """Build the model of a scenario together with a random input.

    The small scenario (or ``None``) uses a ``1x1x16x16`` input, the middle
    and large scenarios a ``1x1x128x128`` input. The model is run once on
    the input before both are returned.

    :param scenario: ``None``, ``"small"``, ``"middle"`` or ``"large"``
    :return: tuple ``(model, input_tensor)``
    :raises ValueError: if the scenario is not supported
    """
    if scenario in (None, "small"):
        side = 16
    elif scenario in ("middle", "large"):
        side = 128
    else:
        raise ValueError(f"Unsupported scenario={scenario!r}.")
    input_tensor = torch.rand(1, 1, side, side).to(torch.float32)
    model = MyModelClass(scenario=scenario)
    # checks the model accepts the input
    assert model(input_tensor) is not None
    return model, input_tensor


def torch_model_size(model):
    """Return the number of bytes taken by the parameters of a model.

    The size of every parameter is its number of elements multiplied by the
    size in bytes of one element. ``element_size`` is defined for every
    dtype whereas ``torch.finfo`` only accepts floating point types and
    raises ``TypeError`` for the others.

    :param model: object exposing ``parameters()`` such as ``torch.nn.Module``
    :return: size in bytes, 0 if the model has no parameter
    """
    return sum(param.numel() * param.element_size() for param in model.parameters())


# The model and its input for the scenario selected on the command line.
model, input_tensor = create_model_and_input()
model_size = torch_model_size(model)
# 2 ** 20 converts bytes into Mb.
print(f"model size={model_size / 2 ** 20} Mb")

model size=0.31467437744140625 Mb

The exporters¶

def export_script(filename, model, *args):
    """Export a model to ONNX with the TorchScript-based exporter.

    Standard output and warnings are silenced during the export.

    :param filename: name of the ONNX file to write
    :param model: torch model to export
    :param args: inputs of the model
    """
    with contextlib.redirect_stdout(io.StringIO()), warnings.catch_warnings():
        warnings.simplefilter("ignore")
        # The inputs are given as one tuple, as the dynamo exporters do:
        # unpacking them would move any second input into the slot of the
        # output file. ``dynamo=False`` explicitly selects the TorchScript
        # exporter whatever the default value of this parameter is.
        torch.onnx.export(model, args, filename, input_names=["input"], dynamo=False)


def export_dynamo(filename, model, *args):
    """Export a model to ONNX with the TorchDynamo-based exporter.

    Standard output and warnings are silenced during the export, the
    result is saved into ``filename``.
    """
    with contextlib.redirect_stdout(io.StringIO()), warnings.catch_warnings():
        warnings.simplefilter("ignore")
        torch.onnx.export(model, args, dynamo=True).save(filename)


def export_dynopt(filename, model, *args):
    """Export a model with the TorchDynamo-based exporter, then optimize it.

    The exported ModelProto goes through ``optimize_model_proto_oxs`` before
    being written into ``filename``. Standard output and warnings are
    silenced during the whole process.
    """
    with contextlib.redirect_stdout(io.StringIO()), warnings.catch_warnings():
        warnings.simplefilter("ignore")
        proto = torch.onnx.export(model, args, dynamo=True).model_proto

        # imported after the export, only when this exporter is used
        from experimental_experiment.convert.convert_helper import (
            optimize_model_proto_oxs,
        )

        optimized = optimize_model_proto_oxs(proto)

        with open(filename, "wb") as stream:
            stream.write(optimized.SerializeToString())


def export_cus_p0(filename, model, *args):
    """Export a model with the custom exporter and its default options.

    The model returned by ``to_onnx`` is serialized into ``filename``.
    """
    inputs = tuple(args)
    proto = to_onnx(model, inputs, input_names=["input"])
    with open(filename, "wb") as stream:
        stream.write(proto.SerializeToString())


def export_cus_p2(filename, model, *args):
    """Export a model with the custom exporter and explicit optimizations.

    Unused nodes are removed and constants are folded. The model returned
    by ``to_onnx`` is serialized into ``filename``.
    """
    optimizations = OptimizationOptions(remove_unused=True, constant_folding=True)
    proto = to_onnx(
        model,
        tuple(args),
        input_names=["input"],
        options=optimizations,
    )
    with open(filename, "wb") as stream:
        stream.write(proto.SerializeToString())

Let’s check they are working.

# Every exporter is tried once. An exporter raising an exception is reported
# and left out of the measures made in the next sections.
export_functions = [
    export_script,
    export_dynamo,
    export_dynopt,
    export_cus_p0,
    export_cus_p2,
]

# short name (function name without "export_") -> function
exporters = {}
for fct in export_functions:
    exporters[fct.__name__.replace("export_", "")] = fct

supported_exporters = {}
for name, exporter in exporters.items():
    print(f"run exporter {name}")
    filename = f"plot_torch_export_{name}.onnx"
    try:
        exporter(filename, model, input_tensor)
    except Exception as e:
        print(f"skipped due to {str(e)[:1000]}")
    else:
        supported_exporters[name] = exporter
        print(f"done. size={os.stat(filename).st_size / 2 ** 20:1.0f} Mb")

run exporter script
done. size=0 Mb
run exporter dynamo
done. size=0 Mb
run exporter dynopt
done. size=0 Mb
run exporter cus_p0
done. size=0 Mb
run exporter cus_p2
done. size=0 Mb

Exporter memory¶

def flatten(ps):
    """Merge CPU and GPU memory statistics into a single flat dictionary.

    ``ps["cpu"]`` and every item of the optional ``ps["gpus"]`` must expose
    ``to_dict(unit=...)``, called with ``unit=2**20``. CPU keys are kept
    unchanged, the keys of the i-th GPU are prefixed with ``gpu{i}_``.
    """
    flat = ps["cpu"].to_dict(unit=2**20)
    if "gpus" not in ps:
        return flat
    for index, gpu in enumerate(ps["gpus"]):
        gpu_stats = gpu.to_dict(unit=2**20)
        flat.update({f"gpu{index}_{key}": value for key, value in gpu_stats.items()})
    return flat


# One row of memory statistics per exporter.
data = []

for k, v in supported_exporters.items():
    print(f"run exporter for memory {k}")
    filename = f"plot_torch_export_{k}.onnx"
    if has_cuda:
        torch.cuda.set_device(0)
    # The memory is observed between start_spying_on and stat.stop(),
    # only the export happens in between.
    stat = start_spying_on(cuda=1 if has_cuda else 0)
    v(filename, model, input_tensor)
    obs = flatten(stat.stop())
    print("done.")
    # The exported file is loaded again to count its nodes.
    onx = onnx.load(filename)
    obs.update(dict(nodes=len(onx.graph.node), export=k))
    data.append(obs)

# Same measure for torch.export.export alone. No ONNX model is produced,
# the row has no number of nodes.
stat = start_spying_on(cuda=1 if has_cuda else 0)
exported_mod = torch.export.export(model, (input_tensor,))
obs = flatten(stat.stop())
obs.update(dict(export="torch.fx"))
data.append(obs)

run exporter for memory script
done.
run exporter for memory dynamo
done.
run exporter for memory dynopt
done.
run exporter for memory cus_p0
done.
run exporter for memory cus_p2
done.

The result.

# The memory measures are saved as csv and excel files, then displayed.
df1 = pandas.DataFrame(data)
df1.to_csv("plot_torch_export_memory.csv", index=False)
df1.to_excel("plot_torch_export_memory.xlsx", index=False)
print(df1)

ax = memory_peak_plot(
    data,
    # one, two, three and four times the size of the model in Mb
    bars=[model_size * i / 2**20 for i in range(1, 5)],
    suptitle=f"Memory Consumption of the Export\nmodel size={model_size / 2**20:1.0f} Mb",
)
get_figure(ax).savefig("plot_torch_export_memory.png")

Memory Consumption of the Export model size=0 Mb, Memory peak (Mb), Memory peak - memory begin (Mb), Memory average - memory begin (Mb), GPU Memory peak (Mb), GPU Memory peak - memory begin (Mb), GPU Memory average - memory begin (Mb)

          peak         mean    n        begin          end   gpu0_peak   gpu0_mean  gpu0_n  gpu0_begin    gpu0_end  nodes    export
1991.621094  1990.958984   12  1991.617188  1991.621094  449.617188  449.617188      12  449.617188  449.617188   12.0    script
1991.687500  1991.652860  106  1991.621094  1991.687500  449.617188  449.617188     106  449.617188  449.617188   13.0    dynamo
1991.687500  1991.687500  149  1991.687500  1991.687500  449.617188  449.617188     149  449.617188  449.617188   13.0    dynopt
1991.695312  1991.688519   23  1991.687500  1991.695312  449.617188  449.617188      23  449.617188  449.617188   15.0    cus_p0
1991.699219  1989.976412   26  1991.695312  1957.074219  449.617188  449.617188      26  449.617188  449.617188   12.0    cus_p2
1957.253906  1957.249844   25  1957.246094  1957.253906  449.617188  449.617188      25  449.617188  449.617188    NaN  torch.fx

Exporter speed¶

# One row of timing statistics per exporter, every export is repeated
# ``script_args.repeat`` times.
data = []

for k, v in supported_exporters.items():
    print(f"run exporter {k}")
    filename = f"plot_torch_export_{k}.onnx"
    times = []
    for _ in range(script_args.repeat):
        begin = time.perf_counter()
        v(filename, model, input_tensor)
        times.append(time.perf_counter() - begin)
    # The last exported file is loaded to count its nodes.
    onx = onnx.load(filename)
    print("done.")
    row = {
        "export": k,
        "time": np.mean(times),
        "min": min(times),
        "max": max(times),
        "first": times[0],
        "last": times[-1],
        "std": np.std(times),
        "nodes": len(onx.graph.node),
    }
    data.append(row)

run exporter script
done.
run exporter dynamo
done.
run exporter dynopt
done.
run exporter cus_p0
done.
run exporter cus_p2
done.

The last measure is the time torch spends exporting the model with torch.export.export, a step every exporter except the first one (script) must complete before it can begin the translation.

# Time spent by torch.export.export alone, repeated ``script_args.repeat`` times.
times = []
for _ in range(script_args.repeat):
    begin = time.perf_counter()
    exported_mod = torch.export.export(model, (input_tensor,))
    duration = time.perf_counter() - begin
    times.append(duration)
# torch.export.export does not produce an ONNX model: this row has no number
# of nodes (NaN in the table), like the same row of the memory measures.
# Reading ``onx`` here would report the node count of the last ONNX exporter.
data.append(
    dict(
        export="torch.fx",
        time=np.mean(times),
        min=min(times),
        max=max(times),
        first=times[0],
        last=times[-1],
        std=np.std(times),
    )
)

The result.

# The timing measures are saved as csv and excel files, then displayed.
df1 = pandas.DataFrame(data)
df1.to_csv("plot_torch_export_time.csv", index=False)
df1.to_excel("plot_torch_export_time.xlsx", index=False)
print(df1)

# Bar plot of the average export time, the standard deviation is the error bar.
fig, ax = plt.subplots(1, 1)
dfi = df1[["export", "time", "std"]].set_index("export")
dfi["time"].plot.bar(ax=ax, title="Export time", yerr=dfi["std"], rot=30)
fig.tight_layout()
fig.savefig("plot_torch_export_time.png")

     export      time       min       max     first      last       std  nodes
  script  0.071847  0.032638  0.175475  0.175475  0.035229  0.055204     12
  dynamo  1.299576  1.013765  1.656289  1.656289  1.105589  0.226074     13
  dynopt  1.265269  1.048148  1.540096  1.261092  1.048148  0.190792     13
  cus_p0  0.260758  0.149327  0.392067  0.149327  0.392067  0.090509     15
  cus_p2  0.203443  0.160868  0.250119  0.250119  0.176179  0.032620     12
torch.fx  0.129058  0.119741  0.146721  0.121158  0.146721  0.010517     12

Exporter Profiling¶

def clean_text(text):
    """Shorten the paths found in a profiling report.

    The parent folder of the packages torch, onnx and
    experimental_experiment is removed from the text, then every occurrence
    of ``experimental_experiment`` is written in upper case.
    """
    for package in (torch, onnx, experimental_experiment):
        parent = os.path.join(os.path.dirname(package.__file__), "..")
        text = text.replace(os.path.abspath(os.path.normpath(parent)), "")
    name = "experimental_experiment"
    return text.replace(name, name.upper())


def profile_function(name, export_function, verbose=False):
    """Profile an exporter and save the reports into text files.

    The exporter is called ``script_args.repeat`` times on the global model
    and input. Two files are written:

    * ``plot_torch_export_profile_{name}.txt``: statistics sorted by
      cumulative time,
    * ``plot_torch_export_profile_{name}_h.txt``: text produced by
      ``profile2graph`` from the same statistics.

    :param name: suffix used to name the reports
    :param export_function: function ``f(filename, model, *args)``
    :param verbose: prints the first 200 lines of the first report
    """
    print(f"profile {name}: {export_function}")
    profiler = cProfile.Profile()
    profiler.enable()
    for _ in range(script_args.repeat):
        export_function("dummyc.onnx", model, input_tensor)
    profiler.disable()

    buffer = io.StringIO()
    ps = pstats.Stats(profiler, stream=buffer).sort_stats(SortKey.CUMULATIVE)
    ps.print_stats()
    report = buffer.getvalue()

    if verbose:
        # only the beginning of the report is displayed
        print("\n".join(report.split("\n")[:200]))
    with open(f"plot_torch_export_profile_{name}.txt", "w") as f:
        f.write(report)

    root, _nodes = profile2graph(ps, clean_text=clean_text)
    with open(f"plot_torch_export_profile_{name}_h.txt", "w") as f:
        f.write(root.to_text())
    print("done.")


# Profiles both custom exporters, the report is only displayed for the first one.
profile_function("custom0", export_cus_p0, True)
profile_function("custom2", export_cus_p2)

profile custom0: <function export_cus_p0 at 0x7f50d07a0220>
         1041763 function calls (1012599 primitive calls) in 2.044 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
       60    0.002    0.000    1.959    0.033 ~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/symbolic_convert.py:966(call_function)
       25    0.002    0.000    1.845    0.074 ~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/variables/nn_module.py:371(call_function)
 1080/690    0.004    0.000    0.516    0.001 ~/vv/this312/lib/python3.12/site-packages/torch/utils/_stats.py:22(wrapper)
    55/10    0.004    0.000    0.414    0.041 ~/vv/this312/lib/python3.12/site-packages/torch/fx/experimental/symbolic_shapes.py:7081(run_node)
       65    0.001    0.000    0.409    0.006 ~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/variables/builder.py:2209(wrap_fx_proxy)
       65    0.000    0.000    0.408    0.006 ~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/variables/builder.py:2282(wrap_fx_proxy_cls)
       60    0.002    0.000    0.401    0.007 ~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/variables/builder.py:2366(_wrap_fx_proxy)
       90    0.002    0.000    0.384    0.004 ~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/utils.py:2578(wrap_fake_exception)
      870    0.007    0.000    0.382    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/_subclasses/fake_tensor.py:1257(__torch_dispatch__)
       60    0.003    0.000    0.382    0.006 ~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/utils.py:2993(get_fake_value)
      870    0.018    0.000    0.374    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/_subclasses/fake_tensor.py:1782(dispatch)
      485    0.007    0.000    0.350    0.001 ~/vv/this312/lib/python3.12/site-packages/torch/_subclasses/fake_tensor.py:1342(_cached_dispatch_impl)
       50    0.002    0.000    0.310    0.006 ~/vv/this312/lib/python3.12/site-packages/torch/nn/parameter.py:63(__deepcopy__)
   280/53    0.003    0.000    0.291    0.005 ~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/symbolic_convert.py:1034(step)
  160/110    0.080    0.001    0.289    0.003 {method 'clone' of 'torch._C.TensorBase' objects}
    55/11    0.001    0.000    0.279    0.025 ~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/symbolic_convert.py:741(wrapper)
    55/11    0.002    0.000    0.278    0.025 ~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/symbolic_convert.py:2488(CALL)
    55/11    0.001    0.000    0.277    0.025 ~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/symbolic_convert.py:2447(_call)
     1315    0.005    0.000    0.262    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/fx/experimental/proxy_tensor.py:1229(__torch_function__)
        5    0.001    0.000    0.258    0.052 ~/github/experimental-experiment/experimental_experiment/xbuilder/graph_builder.py:5046(to_onnx)
  435/325    0.001    0.000    0.257    0.001 ~/vv/this312/lib/python3.12/site-packages/torch/_ops.py:755(__call__)
5160/2130    0.018    0.000    0.256    0.000 /usr/lib/python3.12/copy.py:118(deepcopy)
       25    0.001    0.000    0.256    0.010 ~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/utils.py:2589(deepcopy_to_fake_tensor)
     1315    0.003    0.000    0.252    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/fx/experimental/proxy_tensor.py:1258(__torch_function__)
   135/26    0.002    0.000    0.250    0.010 ~/vv/this312/lib/python3.12/site-packages/torch/fx/interpreter.py:218(run_node)
  595/235    0.004    0.000    0.247    0.001 /usr/lib/python3.12/copy.py:247(_reconstruct)
       60    0.002    0.000    0.245    0.004 ~/vv/this312/lib/python3.12/site-packages/torch/_ops.py:863(handler)
       60    0.012    0.000    0.240    0.004 ~/vv/this312/lib/python3.12/site-packages/torch/_library/utils.py:281(handle_dispatch_mode)
  255/120    0.019    0.000    0.238    0.002 /usr/lib/python3.12/copy.py:217(_deepcopy_dict)
       25    0.000    0.000    0.236    0.009 ~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/utils.py:2591(<lambda>)
       60    0.001    0.000    0.226    0.004 ~/vv/this312/lib/python3.12/site-packages/torch/fx/experimental/proxy_tensor.py:1327(__torch_dispatch__)
       60    0.005    0.000    0.224    0.004 ~/vv/this312/lib/python3.12/site-packages/torch/fx/experimental/proxy_tensor.py:760(proxy_call)
     20/4    0.001    0.000    0.201    0.050 ~/vv/this312/lib/python3.12/site-packages/torch/fx/interpreter.py:342(call_module)
       60    0.002    0.000    0.200    0.003 ~/vv/this312/lib/python3.12/site-packages/torch/fx/graph_module.py:800(recompile)
      250    0.003    0.000    0.190    0.001 ~/vv/this312/lib/python3.12/site-packages/torch/_subclasses/fake_tensor.py:2719(__torch_function__)
      485    0.004    0.000    0.183    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/_subclasses/fake_tensor.py:1716(_output_from_cache_entry)
      515    0.021    0.000    0.179    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/_subclasses/fake_tensor.py:1650(_get_output_tensor_from_cache_entry)
       65    0.001    0.000    0.175    0.003 ~/vv/this312/lib/python3.12/site-packages/torch/fx/graph.py:1562(python_code)
        5    0.001    0.000    0.154    0.031 ~/github/experimental-experiment/experimental_experiment/xbuilder/graph_builder.py:5579(optimize)
      485    0.006    0.000    0.153    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/_subclasses/fake_tensor.py:1385(_cache_key)
        5    0.000    0.000    0.148    0.030 ~/github/experimental-experiment/experimental_experiment/xbuilder/graph_builder.py:5901(optimize_with_patterns)
       50    0.001    0.000    0.148    0.003 ~/vv/this312/lib/python3.12/site-packages/torch/fx/_symbolic_trace.py:480(call_module)
7485/7195    0.011    0.000    0.147    0.000 {built-in method builtins.next}
        5    0.010    0.002    0.147    0.029 ~/github/experimental-experiment/experimental_experiment/xoptim/graph_builder_optim.py:1065(optimize)
       25    0.000    0.000    0.146    0.006 ~/vv/this312/lib/python3.12/site-packages/torch/fx/_symbolic_trace.py:801(module_call_wrapper)
       45    0.000    0.000    0.146    0.003 ~/vv/this312/lib/python3.12/site-packages/torch/nn/modules/linear.py:124(forward)
       25    0.000    0.000    0.146    0.006 ~/vv/this312/lib/python3.12/site-packages/torch/fx/experimental/proxy_tensor.py:1739(call_module)
    75/45    0.013    0.000    0.146    0.003 {built-in method torch._C._nn.linear}
       65    0.002    0.000    0.142    0.002 ~/vv/this312/lib/python3.12/site-packages/torch/fx/graph.py:1639(_python_code)
       65    0.017    0.000    0.140    0.002 ~/vv/this312/lib/python3.12/site-packages/torch/fx/graph.py:397(_gen_python_code)
 2070/535    0.018    0.000    0.140    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/_subclasses/fake_tensor.py:1461(_prep_args_for_hash)
       25    0.000    0.000    0.138    0.006 ~/vv/this312/lib/python3.12/site-packages/torch/fx/_symbolic_trace.py:803(forward)
   100/60    0.001    0.000    0.137    0.002 ~/vv/this312/lib/python3.12/site-packages/torch/nn/functional.py:1693(relu)
       60    0.008    0.000    0.135    0.002 {built-in method torch.relu}
      150    0.003    0.000    0.127    0.001 ~/vv/this312/lib/python3.12/site-packages/torch/_subclasses/fake_tensor.py:758(__torch_dispatch__)
       35    0.000    0.000    0.126    0.004 ~/vv/this312/lib/python3.12/site-packages/torch/fx/interpreter.py:296(call_function)
       60    0.000    0.000    0.115    0.002 ~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/utils.py:3054(<lambda>)
       60    0.001    0.000    0.115    0.002 ~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/utils.py:3144(run_node)
        5    0.000    0.000    0.113    0.023 ~/vv/this312/lib/python3.12/site-packages/torch/_functorch/functional_call.py:11(functional_call)
        5    0.000    0.000    0.113    0.023 ~/vv/this312/lib/python3.12/site-packages/torch/nn/utils/stateless.py:246(_functional_call)
        5    0.000    0.000    0.110    0.022 ~/vv/this312/lib/python3.12/site-packages/torch/fx/_lazy_graph_module.py:115(_lazy_forward)
       35    0.002    0.000    0.107    0.003 ~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/variables/torch.py:970(call_function)
    60/30    0.000    0.000    0.106    0.004 ~/vv/this312/lib/python3.12/site-packages/torch/overrides.py:1670(handle_torch_function)
     1115    0.029    0.000    0.103    0.000 ~/github/experimental-experiment/experimental_experiment/xoptim/patterns_api.py:128(enumerate_matches)
       50    0.003    0.000    0.096    0.002 ~/vv/this312/lib/python3.12/site-packages/torch/nn/parameter.py:40(__new__)
        5    0.001    0.000    0.094    0.019 ~/vv/this312/lib/python3.12/site-packages/torch/export/_trace.py:437(_produce_aten_artifact)
  175/125    0.004    0.000    0.092    0.001 {method 'detach' of 'torch._C.TensorBase' objects}
3575/3430    0.004    0.000    0.092    0.000 /usr/lib/python3.12/contextlib.py:132(__enter__)
        5    0.000    0.000    0.091    0.018 ~/vv/this312/lib/python3.12/site-packages/torch/fx/graph_module.py:829(call_wrapped)
        5    0.000    0.000    0.091    0.018 ~/vv/this312/lib/python3.12/site-packages/torch/fx/graph_module.py:388(__call__)
      105    0.002    0.000    0.091    0.001 ~/vv/this312/lib/python3.12/site-packages/torch/_subclasses/meta_utils.py:847(meta_tensor)
       30    0.000    0.000    0.087    0.003 ~/vv/this312/lib/python3.12/site-packages/torch/nn/modules/conv.py:553(forward)
       30    0.000    0.000    0.086    0.003 ~/vv/this312/lib/python3.12/site-packages/torch/nn/modules/conv.py:536(_conv_forward)
    50/30    0.007    0.000    0.086    0.003 {built-in method torch.conv2d}
        5    0.001    0.000    0.085    0.017 ~/github/experimental-experiment/experimental_experiment/xbuilder/graph_builder.py:4342(_build_initializers)
       50    0.002    0.000    0.082    0.002 ~/github/experimental-experiment/experimental_experiment/mini_onnx_builder.py:108(proto_from_array)
      105    0.005    0.000    0.080    0.001 ~/vv/this312/lib/python3.12/site-packages/torch/autograd/grad_mode.py:273(__exit__)
      240    0.002    0.000    0.079    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/fx/proxy.py:211(create_proxy)
  590/520    0.005    0.000    0.078    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/nn/modules/module.py:1944(__setattr__)
       25    0.001    0.000    0.076    0.003 ~/vv/this312/lib/python3.12/site-packages/torch/fx/graph_module.py:443(__init__)
193215/190695    0.063    0.000    0.075    0.000 {built-in method builtins.isinstance}
      110    0.001    0.000    0.072    0.001 ~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/variables/base.py:484(build)
        5    0.000    0.000    0.071    0.014 ~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1191(rewrite_signature)
        5    0.000    0.000    0.070    0.014 ~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/guards.py:1870(SHAPE_ENV)
        5    0.064    0.013    0.070    0.014 ~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/guards.py:1280(add_python_lambda_leaf_guard_to_root)
      110    0.001    0.000    0.070    0.001 ~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/variables/builder.py:397(__call__)
        5    0.001    0.000    0.069    0.014 ~/github/experimental-experiment/experimental_experiment/xbuilder/graph_builder.py:4780(process)
       25    0.000    0.000    0.068    0.003 ~/vv/this312/lib/python3.12/site-packages/torch/fx/graph_module.py:556(graph)
      120    0.002    0.000    0.066    0.001 ~/github/experimental-experiment/experimental_experiment/torch_interpreter/interpreter.py:177(run_node)
       65    0.004    0.000    0.066    0.001 ~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/variables/builder.py:534(_wrap)
      735    0.022    0.000    0.066    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/_subclasses/fake_tensor.py:671(__new__)
3575/3430    0.005    0.000    0.063    0.000 /usr/lib/python3.12/contextlib.py:141(__exit__)
     1260    0.010    0.000    0.063    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/fx/graph.py:627(emit_node)
        5    0.001    0.000    0.061    0.012 ~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/guards.py:2751(build_guard_function)
      635    0.014    0.000    0.060    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/_subclasses/fake_tensor.py:968(extract_tensor_metadata)
      250    0.004    0.000    0.059    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/fx/proxy.py:145(create_node)
    50/30    0.000    0.000    0.059    0.002 ~/vv/this312/lib/python3.12/site-packages/torch/_jit_internal.py:613(fn)
    50/30    0.000    0.000    0.059    0.002 ~/vv/this312/lib/python3.12/site-packages/torch/nn/functional.py:807(_max_pool2d)
      635    0.018    0.000    0.058    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/_subclasses/fake_tensor.py:948(_flatten_into)
       30    0.004    0.000    0.057    0.002 {built-in method torch.max_pool2d}
       65    0.000    0.000    0.057    0.001 ~/vv/this312/lib/python3.12/site-packages/torch/fx/experimental/proxy_tensor.py:592(track_tensor_tree)
   120/65    0.001    0.000    0.055    0.001 ~/vv/this312/lib/python3.12/site-packages/torch/fx/experimental/proxy_tensor.py:614(wrap_with_proxy)
       60    0.002    0.000    0.053    0.001 ~/github/experimental-experiment/experimental_experiment/torch_interpreter/interpreter.py:1365(call_function)
        5    0.000    0.000    0.051    0.010 ~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/symbolic_convert.py:2840(__init__)
      260    0.004    0.000    0.050    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/fx/graph.py:1104(create_node)
 2920/135    0.008    0.000    0.050    0.000 /usr/lib/python3.12/ast.py:403(visit)
       10    0.000    0.000    0.050    0.005 ~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/variables/lazy.py:64(realize)
      105    0.010    0.000    0.049    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/autograd/grad_mode.py:269(__enter__)
     55/5    0.001    0.000    0.047    0.009 ~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/variables/lazy.py:107(realize_all)
      190    0.002    0.000    0.046    0.000 ~/github/experimental-experiment/experimental_experiment/xbuilder/graph_builder_opset.py:115(make_node)
        5    0.000    0.000    0.046    0.009 ~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1128(transform)
        5    0.001    0.000    0.046    0.009 ~/vv/this312/lib/python3.12/site-packages/torch/fx/interpreter.py:571(transform)
     4770    0.018    0.000    0.045    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/fx/node.py:873(__setattr__)
      210    0.007    0.000    0.045    0.000 ~/github/experimental-experiment/experimental_experiment/xbuilder/graph_builder.py:3609(make_node)
       65    0.000    0.000    0.044    0.001 ~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/symbolic_convert.py:1934(LOAD_ATTR)
       10    0.000    0.000    0.044    0.004 ~/vv/this312/lib/python3.12/site-packages/torch/_export/passes/replace_with_hop_pass_util.py:157(_replace_with_hop_pass_helper)
       65    0.001    0.000    0.043    0.001 ~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/symbolic_convert.py:1927(_load_attr)
       15    0.000    0.000    0.042    0.003 ~/vv/this312/lib/python3.12/site-packages/torch/fx/_lazy_graph_module.py:57(_make_graph_module)
        5    0.000    0.000    0.041    0.008 ~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/symbolic_convert.py:3213(RETURN_VALUE)
        5    0.000    0.000    0.041    0.008 ~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/symbolic_convert.py:3180(_return)
      115    0.001    0.000    0.041    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/fx/experimental/proxy_tensor.py:485(set_meta)
       60    0.000    0.000    0.041    0.001 ~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/symbolic_convert.py:1904(LOAD_METHOD)
        5    0.000    0.000    0.041    0.008 ~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/output_graph.py:977(compile_subgraph)
     4365    0.012    0.000    0.040    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/_subclasses/meta_utils.py:172(is_sparse_any)
       65    0.001    0.000    0.040    0.001 ~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/variables/builtin.py:1064(call_function)
       65    0.000    0.000    0.039    0.001 ~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/variables/builtin.py:917(builtin_dispatch)
       65    0.000    0.000    0.039    0.001 ~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/variables/builtin.py:837(call_self_handler)
8260/4895    0.022    0.000    0.039    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/fx/node.py:903(map_aggregate)
       65    0.001    0.000    0.039    0.001 ~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/variables/builtin.py:1731(call_getattr)
       10    0.000    0.000    0.038    0.004 ~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/variables/lazy.py:22(realize)
       65    0.005    0.000    0.037    0.001 ~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/output_graph.py:2002(create_proxy)
       60    0.000    0.000    0.037    0.001 ~/vv/this312/lib/python3.12/site-packages/torch/fx/graph_module.py:91(_forward_from_src)
     4340    0.005    0.000    0.037    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/fx/node.py:894(map_arg)
       60    0.001    0.000    0.037    0.001 ~/vv/this312/lib/python3.12/site-packages/torch/fx/graph_module.py:97(_method_from_src)
      150    0.036    0.000    0.036    0.000 {built-in method builtins.compile}
       60    0.000    0.000    0.036    0.001 ~/vv/this312/lib/python3.12/site-packages/torch/fx/graph_module.py:86(_exec_with_source)
      360    0.001    0.000    0.036    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/guards.py:456(_ast_unparse)
        5    0.000    0.000    0.036    0.007 ~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/metrics_context.py:52(__exit__)
        5    0.001    0.000    0.036    0.007 ~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/utils.py:1425(record_compilation_metrics)
    35475    0.026    0.000    0.035    0.000 {built-in method builtins.getattr}
       60    0.001    0.000    0.035    0.001 ~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/output_graph.py:596(create_proxy)
        5    0.001    0.000    0.035    0.007 ~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/output_graph.py:1321(compile_and_call_fx_graph)
      105    0.000    0.000    0.035    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/guards.py:1277(get_guard_manager)
      360    0.001    0.000    0.034    0.000 /usr/lib/python3.12/ast.py:1789(unparse)
     1570    0.004    0.000    0.034    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/fx/experimental/symbolic_shapes.py:3076(_suppress_guards)
      120    0.001    0.000    0.034    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/fx/experimental/proxy_tensor.py:1775(create_node)
      360    0.001    0.000    0.033    0.000 /usr/lib/python3.12/ast.py:855(visit)
1655/1585    0.011    0.000    0.033    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/fx/experimental/recording.py:238(wrapper)
 1845/360    0.002    0.000    0.032    0.000 /usr/lib/python3.12/ast.py:845(traverse)
  210/105    0.006    0.000    0.031    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/guards.py:927(get_guard_manager_from_source)
       45    0.000    0.000    0.031    0.001 ~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/guards.py:2757(replace)
       45    0.000    0.000    0.031    0.001 ~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/guards.py:2355(replace)
      105    0.008    0.000    0.031    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/_subclasses/meta_utils.py:255(describe_tensor)
      960    0.002    0.000    0.030    0.000 ~/github/experimental-experiment/experimental_experiment/xoptim/patterns_api.py:968(match)
      620    0.029    0.000    0.029    0.000 {built-in method torch.empty_strided}
    13770    0.027    0.000    0.028    0.000 {built-in method builtins.setattr}
        1    0.000    0.000    0.028    0.028 <eval_with_key>.684:4(forward)
     4330    0.015    0.000    0.028    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/fx/graph.py:144(create_name)
     3675    0.011    0.000    0.028    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/_subclasses/fake_tensor.py:589(__set__)
       30    0.001    0.000    0.028    0.001 ~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/variables/nn_module.py:275(var_getattr)
      960    0.002    0.000    0.028    0.000 ~/github/experimental-experiment/experimental_experiment/xoptim/patterns_api.py:355(_get_match_pattern)
        5    0.000    0.000    0.027    0.005 ~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/variables/builder.py:1593(wrap_tensor)
       10    0.000    0.000    0.027    0.003 ~/vv/this312/lib/python3.12/site-packages/torch/utils/_config_module.py:623(get_config_copy)
      115    0.000    0.000    0.027    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/fx/experimental/proxy_tensor.py:385(extract_val)
       10    0.004    0.000    0.027    0.003 ~/vv/this312/lib/python3.12/site-packages/torch/utils/_config_module.py:463(_get_dict)
      115    0.001    0.000    0.026    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/fx/experimental/proxy_tensor.py:359(snapshot_fake)
      130    0.003    0.000    0.026    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/fx/graph.py:1616(override_node_repr)
     6110    0.005    0.000    0.025    0.000 {built-in method builtins.repr}
      115    0.003    0.000    0.025    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/_subclasses/fake_impls.py:1010(fast_detach)
       10    0.001    0.000    0.025    0.003 ~/github/experimental-experiment/experimental_experiment/xoptim/patterns_api.py:305(_build_pattern)
       85    0.010    0.000    0.024    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/guards.py:776(getitem_on_dict_mgr)
        1    0.000    0.000    0.024    0.024 <eval_with_key>.660:4(forward)
        5    0.000    0.000    0.024    0.005 ~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/guards.py:2346(count)
        5    0.002    0.000    0.024    0.005 ~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/utils.py:1382(_scrubbed_inductor_config_for_logging)
       30    0.001    0.000    0.024    0.001 ~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/variables/builder.py:1471(wrap_module)
    25/15    0.002    0.000    0.023    0.002 {built-in method torch.flatten}
        5    0.001    0.000    0.023    0.005 ~/vv/this312/lib/python3.12/site-packages/torch/_export/utils.py:580(apply_runtime_assertion_pass)
   455/45    0.002    0.000    0.023    0.001 ~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/guards.py:2309(visit)
       75    0.000    0.000    0.023    0.000 /usr/lib/python3.12/inspect.py:3343(signature)
   415/45    0.002    0.000    0.023    0.001 /usr/lib/python3.12/ast.py:477(generic_visit)
       70    0.000    0.000    0.023    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py:1108(run_node)
       75    0.000    0.000    0.023    0.000 /usr/lib/python3.12/inspect.py:3081(from_callable)
        5    0.000    0.000    0.022    0.004 ~/vv/this312/lib/python3.12/site-packages/torch/_export/passes/replace_autocast_with_hop_pass.py:178(replace_autocast_with_hop_pass)
   145/75    0.002    0.000    0.022    0.000 /usr/lib/python3.12/inspect.py:2501(_signature_from_callable)
   705/45    0.002    0.000    0.022    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/guards.py:2293(visit)
   705/45    0.002    0.000    0.022    0.000 /usr/lib/python3.12/ast.py:409(generic_visit)
        5    0.001    0.000    0.021    0.004 ~/vv/this312/lib/python3.12/site-packages/torch/_export/utils.py:820(placeholder_naming_pass)
4340/4240    0.006    0.000    0.021    0.000 {method 'join' of 'str' objects}
       65    0.003    0.000    0.021    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/variables/builder.py:2390(handle_traced_output)
        5    0.000    0.000    0.021    0.004 ~/vv/this312/lib/python3.12/site-packages/torch/_export/passes/replace_set_grad_with_hop_pass.py:110(replace_set_grad_with_hop_pass)
      260    0.003    0.000    0.021    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/fx/node.py:377(prepend)
        5    0.000    0.000    0.021    0.004 ~/github/experimental-experiment/experimental_experiment/torch_interpreter/_aten_functions.py:2924(aten_flatten_using_ints)
  415/335    0.002    0.000    0.021    0.000 /usr/lib/python3.12/ast.py:1573(visit_Subscript)
      780    0.002    0.000    0.020    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/fx/graph.py:548(_format_args)
     4125    0.004    0.000    0.020    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/fx/node.py:602(__repr__)
done.
profile custom2: <function export_cus_p2 at 0x7f50d07a20c0>
done.

The same profiling is now run for the dynamo exporter and, when it is available, for its optimized variant dynopt.

# Profile the TorchDynamo-based exporter. In the captured output the full
# cProfile table is printed for this call only, so verbose=True presumably
# enables that printing -- TODO confirm against profile_function.
profile_function("dynamo", export_dynamo, verbose=True)
# The optimized dynamo variant is profiled only when it is listed among the
# supported exporters; without verbose, the captured output shows just the
# "profile dynopt: ..." header followed by "done.".
if "dynopt" in supported_exporters:
    profile_function("dynopt", export_dynopt)

profile dynamo: <function export_dynamo at 0x7f50d07a18a0>
         10152076 function calls (9995653 primitive calls) in 11.901 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        5    0.051    0.010    5.712    1.142 ~/vv/this312/lib/python3.12/site-packages/torch/onnx/_internal/exporter/_registration.py:115(from_torchlib)
        5    0.093    0.019    4.183    0.837 ~/github/onnxscript/onnxscript/_framework_apis/torch_2_5.py:82(get_torchlib_ops)
     2215    0.038    0.000    4.072    0.002 ~/github/onnxscript/onnxscript/values.py:640(function_ir)
202120/201090    0.082    0.000    1.993    0.000 {built-in method builtins.next}
       10    0.002    0.000    1.911    0.191 ~/vv/this312/lib/python3.12/site-packages/torch/_export/utils.py:1134(_collect_all_valid_cia_ops)
      270    0.024    0.000    1.909    0.007 ~/vv/this312/lib/python3.12/site-packages/torch/_export/utils.py:1117(_collect_all_valid_cia_ops_for_namespace)
4680/4170    0.005    0.000    1.827    0.000 /usr/lib/python3.12/contextlib.py:132(__enter__)
       20    0.195    0.010    1.735    0.087 ~/vv/this312/lib/python3.12/site-packages/torch/export/exported_program.py:188(_override_composite_implicit_decomp)
      270    0.597    0.002    1.732    0.006 ~/vv/this312/lib/python3.12/site-packages/torch/_export/utils.py:1052(_materialize_cpp_cia_ops)
     2215    0.024    0.000    1.648    0.001 ~/github/onnxscript/onnxscript/_internal/ast_utils.py:16(get_src_and_ast)
     2895    0.117    0.000    1.396    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/onnx/_internal/exporter/_schemas.py:432(from_function)
     2215    0.008    0.000    1.299    0.001 ~/github/onnxscript/onnxscript/converter.py:1466(translate_function_signature)
     2215    0.090    0.000    1.282    0.001 ~/github/onnxscript/onnxscript/converter.py:1381(_translate_function_signature_common)
     2215    0.006    0.000    1.146    0.001 /usr/lib/python3.12/inspect.py:1279(getsource)
     2215    0.121    0.000    1.135    0.001 /usr/lib/python3.12/inspect.py:1258(getsourcelines)
    64735    1.030    0.000    1.113    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/_ops.py:120(inner)
        5    0.012    0.002    1.112    0.222 ~/vv/this312/lib/python3.12/site-packages/torch/onnx/_internal/exporter/_decomp.py:42(create_onnx_friendly_decomposition_table)
     35/5    0.003    0.000    1.031    0.206 ~/vv/this312/lib/python3.12/site-packages/torch/_subclasses/fake_tensor.py:2597(from_tensor)
    100/5    0.003    0.000    1.031    0.206 ~/vv/this312/lib/python3.12/site-packages/torch/_subclasses/fake_tensor.py:323(from_real_tensor)
    105/5    0.004    0.000    1.028    0.206 ~/vv/this312/lib/python3.12/site-packages/torch/_subclasses/meta_utils.py:1809(__call__)
        5    0.000    0.000    1.025    0.205 ~/vv/this312/lib/python3.12/site-packages/torch/export/decomp_utils.py:125(items)
        5    0.000    0.000    1.024    0.205 ~/vv/this312/lib/python3.12/site-packages/torch/export/decomp_utils.py:142(_materialize_if_needed)
        5    0.003    0.001    1.024    0.205 ~/vv/this312/lib/python3.12/site-packages/torch/export/decomp_utils.py:129(materialize)
     2215    0.085    0.000    1.017    0.000 /usr/lib/python3.12/inspect.py:1606(getclosurevars)
        5    0.011    0.002    1.017    0.203 ~/vv/this312/lib/python3.12/site-packages/torch/export/exported_program.py:295(_split_decomp_table_to_cia_and_python_decomp)
    65905    0.340    0.000    0.852    0.000 /usr/lib/python3.12/dis.py:434(_get_instructions_bytes)
    24130    0.808    0.000    0.808    0.000 {built-in method builtins.compile}
     2215    0.227    0.000    0.784    0.000 /usr/lib/python3.12/inspect.py:1239(getblock)
75165/15510    0.148    0.000    0.745    0.000 ~/github/onnxscript/onnxscript/type_annotation.py:131(is_value_type)
      660    0.192    0.000    0.667    0.001 ~/vv/this312/lib/python3.12/site-packages/torch/_subclasses/functional_tensor.py:352(__torch_dispatch__)
   851980    0.624    0.000    0.631    0.000 {built-in method builtins.getattr}
    80/20    0.001    0.000    0.614    0.031 ~/vv/this312/lib/python3.12/site-packages/torch/nn/functional.py:1693(relu)
     3450    0.012    0.000    0.548    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/fx/experimental/proxy_tensor.py:1229(__torch_function__)
1578290/1570705    0.410    0.000    0.521    0.000 {built-in method builtins.isinstance}
     8980    0.009    0.000    0.495    0.000 ~/github/onnxscript/onnxscript/type_annotation.py:172(is_valid_type)
     2895    0.067    0.000    0.494    0.000 /usr/lib/python3.12/typing.py:2215(get_type_hints)
   228115    0.266    0.000    0.485    0.000 /usr/lib/python3.12/tokenize.py:569(_generate_tokens_from_c_tokenizer)
27655/5170    0.133    0.000    0.440    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/onnx/_internal/exporter/_schemas.py:268(_get_allowed_types_from_type_annotation)
     2215    0.006    0.000    0.401    0.000 /usr/lib/python3.12/ast.py:34(parse)
      625    0.005    0.000    0.391    0.001 ~/vv/this312/lib/python3.12/site-packages/torch/fx/experimental/proxy_tensor.py:1327(__torch_dispatch__)
     1795    0.023    0.000    0.366    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/_subclasses/fake_tensor.py:1782(dispatch)
      120    0.011    0.000    0.356    0.003 ~/vv/this312/lib/python3.12/site-packages/torch/fx/experimental/proxy_tensor.py:760(proxy_call)
    64735    0.057    0.000    0.354    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/_ops.py:111(py_impl)
      495    0.007    0.000    0.332    0.001 ~/vv/this312/lib/python3.12/site-packages/torch/_subclasses/fake_tensor.py:1342(_cached_dispatch_impl)
     35/5    0.001    0.000    0.329    0.066 ~/vv/this312/lib/python3.12/site-packages/torch/nn/modules/module.py:1755(_call_impl)
       90    0.003    0.000    0.329    0.004 ~/vv/this312/lib/python3.12/site-packages/torch/fx/graph_module.py:800(recompile)
        5    0.000    0.000    0.327    0.065 ~/vv/this312/lib/python3.12/site-packages/torch/export/_trace.py:1775(forward)
        5    0.000    0.000    0.308    0.062 ~/github/experimental-experiment/_doc/examples/plot_torch_export_201.py:191(forward)
   132200    0.181    0.000    0.308    0.000 /usr/lib/python3.12/typing.py:1546(__getitem__)
      270    0.300    0.001    0.300    0.001 {built-in method torch._C._dispatch_get_registrations_for_dispatch_key}
    75165    0.077    0.000    0.300    0.000 ~/github/onnxscript/onnxscript/type_annotation.py:123(_is_tensor_type)
    11040    0.043    0.000    0.289    0.000 ~/github/onnxscript/onnxscript/converter.py:451(_eval_constant_expr)
     6530    0.005    0.000    0.264    0.000 ~/github/onnxscript/onnxscript/type_annotation.py:168(is_attr_type)
       90    0.002    0.000    0.253    0.003 ~/vv/this312/lib/python3.12/site-packages/torch/fx/graph.py:1562(python_code)
     1315    0.004    0.000    0.246    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/fx/experimental/proxy_tensor.py:1258(__torch_function__)
     1725    0.005    0.000    0.240    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/_export/non_strict_utils.py:636(__torch_function__)
     2935    0.004    0.000    0.231    0.000 /usr/lib/python3.12/inspect.py:3343(signature)
       60    0.004    0.000    0.228    0.004 ~/vv/this312/lib/python3.12/site-packages/torch/_ops.py:863(handler)
     2935    0.005    0.000    0.227    0.000 /usr/lib/python3.12/inspect.py:3081(from_callable)
     7285    0.007    0.000    0.226    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/_export/utils.py:1038(_is_preservable_cia_op)
2965/2935    0.036    0.000    0.222    0.000 /usr/lib/python3.12/inspect.py:2501(_signature_from_callable)
       60    0.014    0.000    0.221    0.004 ~/vv/this312/lib/python3.12/site-packages/torch/_library/utils.py:281(handle_dispatch_mode)
     1365    0.006    0.000    0.220    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/utils/_pytree.py:1267(tree_map_only)
   225900    0.118    0.000    0.219    0.000 /usr/lib/python3.12/collections/__init__.py:447(_make)
    78040    0.062    0.000    0.217    0.000 ~/github/onnxscript/onnxscript/type_annotation.py:70(_remove_annotation)
       70    0.001    0.000    0.216    0.003 ~/vv/this312/lib/python3.12/site-packages/torch/_higher_order_ops/utils.py:23(autograd_not_implemented_inner)
   503540    0.211    0.000    0.211    0.000 {method 'split' of 'str' objects}
     2215    0.032    0.000    0.211    0.000 /usr/lib/python3.12/inspect.py:1070(findsource)
       90    0.002    0.000    0.200    0.002 ~/vv/this312/lib/python3.12/site-packages/torch/fx/graph.py:1639(_python_code)
       90    0.022    0.000    0.198    0.002 ~/vv/this312/lib/python3.12/site-packages/torch/fx/graph.py:397(_gen_python_code)
    40/10    0.001    0.000    0.196    0.020 ~/vv/this312/lib/python3.12/site-packages/torch/_jit_internal.py:613(fn)
   117475    0.107    0.000    0.190    0.000 /usr/lib/python3.12/typing.py:2340(get_origin)
   131810    0.160    0.000    0.189    0.000 /usr/lib/python3.12/dis.py:623(_unpack_opargs)
    10785    0.019    0.000    0.187    0.000 /usr/lib/python3.12/typing.py:892(__init__)
53360/53310    0.043    0.000    0.187    0.000 {built-in method builtins.repr}
     7285    0.100    0.000    0.186    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/_export/utils.py:1086(_check_valid_to_preserve)
24170/10785    0.031    0.000    0.177    0.000 /usr/lib/python3.12/typing.py:407(_eval_type)
1058405/1057915    0.171    0.000    0.172    0.000 {built-in method builtins.len}
    10785    0.036    0.000    0.163    0.000 /usr/lib/python3.12/typing.py:916(_evaluate)
    34220    0.028    0.000    0.159    0.000 ~/github/onnxscript/onnxscript/ir/_core.py:1475(__hash__)
     2215    0.050    0.000    0.156    0.000 /usr/lib/python3.12/dis.py:647(findlabels)
     2935    0.053    0.000    0.151    0.000 /usr/lib/python3.12/inspect.py:2397(_signature_from_function)
        5    0.038    0.008    0.150    0.030 ~/vv/this312/lib/python3.12/site-packages/torch/onnx/_internal/exporter/_decomp.py:15(get_onnx_implemented_overloads)
     20/5    0.000    0.000    0.147    0.029 {built-in method torch.flatten}
        5    0.001    0.000    0.141    0.028 ~/vv/this312/lib/python3.12/site-packages/torch/onnx/_internal/exporter/_fx_passes.py:22(insert_type_promotion_nodes)
      170    0.004    0.000    0.131    0.001 ~/vv/this312/lib/python3.12/site-packages/torch/_subclasses/meta_utils.py:847(meta_tensor)
    125/5    0.003    0.000    0.131    0.026 ~/vv/this312/lib/python3.12/site-packages/torch/onnx/_internal/diagnostics/infra/decorator.py:66(wrapper)
      420    0.003    0.000    0.131    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/_subclasses/fake_tensor.py:1716(_output_from_cache_entry)
        5    0.000    0.000    0.130    0.026 ~/vv/this312/lib/python3.12/site-packages/torch/onnx/_internal/fx/_pass.py:240(run)
        5    0.000    0.000    0.130    0.026 ~/vv/this312/lib/python3.12/site-packages/torch/onnx/_internal/fx/passes/type_promotion.py:1691(_run)
      440    0.014    0.000    0.128    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/_subclasses/fake_tensor.py:1650(_get_output_tensor_from_cache_entry)
       30    0.002    0.000    0.127    0.004 ~/vv/this312/lib/python3.12/site-packages/torch/fx/graph_module.py:443(__init__)
  675/575    0.005    0.000    0.126    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/nn/modules/module.py:1944(__setattr__)
      495    0.006    0.000    0.121    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/_subclasses/fake_tensor.py:1385(_cache_key)
      230    0.002    0.000    0.120    0.001 ~/vv/this312/lib/python3.12/site-packages/torch/fx/proxy.py:211(create_proxy)
       10    0.001    0.000    0.119    0.012 ~/vv/this312/lib/python3.12/site-packages/torch/export/_trace.py:437(_produce_aten_artifact)
      120    0.001    0.000    0.117    0.001 ~/vv/this312/lib/python3.12/site-packages/torch/onnx/_internal/fx/passes/type_promotion.py:1596(run_node)
       30    0.000    0.000    0.116    0.004 ~/vv/this312/lib/python3.12/site-packages/torch/fx/graph_module.py:556(graph)
    90/30    0.001    0.000    0.113    0.004 ~/vv/this312/lib/python3.12/site-packages/torch/overrides.py:1670(handle_torch_function)
 1875/495    0.016    0.000    0.110    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/_subclasses/fake_tensor.py:1461(_prep_args_for_hash)
        5    0.000    0.000    0.109    0.022 ~/vv/this312/lib/python3.12/site-packages/torch/export/exported_program.py:1314(module)
        5    0.000    0.000    0.109    0.022 ~/vv/this312/lib/python3.12/site-packages/torch/export/_unlift.py:383(_unlift_exported_program_lifted_states)
   357250    0.108    0.000    0.108    0.000 {built-in method __new__ of type object at 0xa20960}
    37640    0.047    0.000    0.107    0.000 ~/github/onnxscript/onnxscript/ir/_core.py:1483(__repr__)
4680/4170    0.006    0.000    0.106    0.000 /usr/lib/python3.12/contextlib.py:141(__exit__)
      130    0.001    0.000    0.105    0.001 ~/vv/this312/lib/python3.12/site-packages/torch/fx/experimental/proxy_tensor.py:592(track_tensor_tree)
   219216    0.084    0.000    0.105    0.000 {built-in method builtins.hasattr}
      240    0.006    0.000    0.102    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/fx/experimental/proxy_tensor.py:1775(create_node)
  240/130    0.003    0.000    0.102    0.001 ~/vv/this312/lib/python3.12/site-packages/torch/fx/experimental/proxy_tensor.py:614(wrap_with_proxy)
      170    0.007    0.000    0.101    0.001 ~/vv/this312/lib/python3.12/site-packages/torch/autograd/grad_mode.py:273(__exit__)
    21845    0.078    0.000    0.101    0.000 {built-in method builtins.eval}
     1995    0.005    0.000    0.098    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/utils/_pytree.py:985(tree_flatten)
6985/1995    0.025    0.000    0.093    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/utils/_pytree.py:993(helper)
        5    0.001    0.000    0.093    0.019 ~/vv/this312/lib/python3.12/site-packages/torch/onnx/_internal/exporter/_core.py:933(_exported_program_to_onnx_program)
     1575    0.002    0.000    0.092    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/utils/_pytree.py:1212(wrapped)
     2160    0.012    0.000    0.088    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/fx/graph.py:627(emit_node)
    63865    0.041    0.000    0.087    0.000 {built-in method builtins.issubclass}
      145    0.005    0.000    0.084    0.001 ~/github/onnxscript/onnxscript/optimizer/_constant_folding.py:923(process_node)
        5    0.001    0.000    0.081    0.016 ~/vv/this312/lib/python3.12/site-packages/torch/onnx/_internal/exporter/_core.py:662(_translate_fx_graph)
       60    0.002    0.000    0.078    0.001 ~/vv/this312/lib/python3.12/site-packages/torch/onnx/_internal/exporter/_core.py:449(_handle_call_function_node_with_lowering)
   146545    0.050    0.000    0.077    0.000 /usr/lib/python3.12/inspect.py:302(isclass)
    75/15    0.001    0.000    0.076    0.005 {built-in method torch._to_functional_tensor}
      230    0.005    0.000    0.075    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/fx/experimental/proxy_tensor.py:485(set_meta)
     2215    0.015    0.000    0.074    0.000 /usr/lib/python3.12/textwrap.py:419(dedent)
     2215    0.023    0.000    0.074    0.000 /usr/lib/python3.12/inspect.py:951(getsourcefile)
    15135    0.011    0.000    0.072    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/onnx/_internal/exporter/_registration.py:261(is_registered)
    83575    0.033    0.000    0.072    0.000 <frozen abc>:117(__instancecheck__)
       20    0.001    0.000    0.072    0.004 {built-in method torch.relu}
   225900    0.072    0.000    0.072    0.000 /usr/lib/python3.12/inspect.py:1196(tokeneater)
     8405    0.009    0.000    0.070    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/fx/node.py:894(map_arg)
    95045    0.043    0.000    0.070    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/_ops.py:763(__hash__)
   134610    0.069    0.000    0.070    0.000 /usr/lib/python3.12/typing.py:392(inner)
      145    0.003    0.000    0.069    0.000 ~/github/onnxscript/onnxscript/optimizer/_constant_folding.py:823(_do_inference)
       10    0.000    0.000    0.068    0.007 ~/vv/this312/lib/python3.12/site-packages/torch/export/exported_program.py:928(__init__)
       75    0.001    0.000    0.068    0.001 ~/vv/this312/lib/python3.12/site-packages/torch/_subclasses/fake_tensor.py:1968(_dispatch_impl)
       90    0.000    0.000    0.066    0.001 ~/vv/this312/lib/python3.12/site-packages/torch/fx/graph_module.py:91(_forward_from_src)
15805/9585    0.030    0.000    0.066    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/fx/node.py:903(map_aggregate)
       90    0.000    0.000    0.066    0.001 ~/vv/this312/lib/python3.12/site-packages/torch/fx/graph_module.py:97(_method_from_src)
       90    0.001    0.000    0.065    0.001 ~/vv/this312/lib/python3.12/site-packages/torch/fx/graph_module.py:86(_exec_with_source)
     2215    0.027    0.000    0.065    0.000 /usr/lib/python3.12/dis.py:342(get_instructions)
       55    0.000    0.000    0.064    0.001 ~/vv/this312/lib/python3.12/site-packages/torch/_functorch/_aot_autograd/dispatch_and_compile_graph.py:66(_detach_and_copy_item_memo)
      105    0.005    0.000    0.064    0.001 {method 'detach' of 'torch._C.TensorBase' objects}
      840    0.019    0.000    0.062    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/_subclasses/fake_tensor.py:671(__new__)
    15195    0.037    0.000    0.062    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/onnx/_internal/exporter/_registration.py:239(get_decomps)
     8085    0.020    0.000    0.062    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/fx/node.py:873(__setattr__)
       15    0.000    0.000    0.061    0.004 ~/vv/this312/lib/python3.12/site-packages/torch/nn/modules/linear.py:124(forward)
    60/15    0.001    0.000    0.061    0.004 {built-in method torch._C._nn.linear}
      170    0.010    0.000    0.060    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/autograd/grad_mode.py:269(__enter__)
   122945    0.053    0.000    0.057    0.000 {method 'get' of 'dict' objects}
    70/60    0.000    0.000    0.056    0.001 ~/github/onnxscript/onnxscript/values.py:634(__call__)
        5    0.000    0.000    0.056    0.011 ~/vv/this312/lib/python3.12/site-packages/torch/export/exported_program.py:339(default_decompositions)
        5    0.003    0.001    0.056    0.011 ~/vv/this312/lib/python3.12/site-packages/torch/export/decomp_utils.py:33(__init__)
    10785    0.034    0.000    0.056    0.000 /usr/lib/python3.12/typing.py:175(_type_check)
3530/1190    0.005    0.000    0.054    0.000 ~/github/onnxscript/onnxscript/ir/serde.py:95(wrapper)
       10    0.000    0.000    0.054    0.005 ~/vv/this312/lib/python3.12/site-packages/torch/export/exported_program.py:1587(_create_graph_module_for_export)
 1260/775    0.025    0.000    0.054    0.000 {built-in method torch._ops.prim.}
155250/153420    0.048    0.000    0.053    0.000 {built-in method builtins.hash}
    37640    0.022    0.000    0.053    0.000 ~/github/onnxscript/onnxscript/ir/_enums.py:95(__repr__)
    63690    0.033    0.000    0.052    0.000 <string>:1(<lambda>)
     2300    0.008    0.000    0.051    0.000 /usr/lib/python3.12/linecache.py:52(checkcache)
    70/65    0.001    0.000    0.051    0.001 ~/github/onnxscript/onnxscript/values.py:295(__call__)
      145    0.017    0.000    0.051    0.000 ~/github/onnx/onnx/shape_inference.py:99(infer_node_outputs)
   251250    0.050    0.000    0.050    0.000 /usr/lib/python3.12/dis.py:195(_deoptop)
    70/65    0.001    0.000    0.049    0.001 ~/vv/this312/lib/python3.12/site-packages/torch/onnx/_internal/exporter/_building.py:570(eval)
      415    0.006    0.000    0.049    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/fx/graph.py:1104(create_node)
      620    0.015    0.000    0.048    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/_subclasses/fake_tensor.py:948(_flatten_into)
13385/10785    0.022    0.000    0.047    0.000 /usr/lib/python3.12/typing.py:2315(_strip_annotations)
    62065    0.022    0.000    0.047    0.000 <frozen abc>:121(__subclasscheck__)
       10    0.000    0.000    0.046    0.005 ~/vv/this312/lib/python3.12/site-packages/torch/nn/modules/conv.py:553(forward)
       10    0.000    0.000    0.046    0.005 ~/vv/this312/lib/python3.12/site-packages/torch/nn/modules/conv.py:536(_conv_forward)
    40/10    0.001    0.000    0.046    0.005 {built-in method torch.conv2d}
       60    0.002    0.000    0.046    0.001 ~/vv/this312/lib/python3.12/site-packages/torch/utils/_traceback.py:171(summary)
     8300    0.045    0.000    0.045    0.000 {method 'copy' of 'dict' objects}
      230    0.001    0.000    0.045    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/fx/experimental/proxy_tensor.py:385(extract_val)
       10    0.001    0.000    0.045    0.004 ~/vv/this312/lib/python3.12/site-packages/torch/_export/utils.py:580(apply_runtime_assertion_pass)
      240    0.003    0.000    0.044    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/fx/proxy.py:145(create_node)
      620    0.012    0.000    0.044    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/_subclasses/fake_tensor.py:968(extract_tensor_metadata)
      230    0.001    0.000    0.043    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/fx/experimental/proxy_tensor.py:359(snapshot_fake)
  2395/16    0.008    0.000    0.043    0.003 ~/vv/this312/lib/python3.12/site-packages/torch/utils/_stats.py:22(wrapper)
      170    0.010    0.000    0.043    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/_subclasses/meta_utils.py:255(describe_tensor)
      180    0.005    0.000    0.043    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/fx/graph.py:1616(override_node_repr)
       10    0.002    0.000    0.043    0.004 ~/vv/this312/lib/python3.12/site-packages/torch/_export/utils.py:820(placeholder_naming_pass)
     7405    0.037    0.000    0.042    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/_decomp/__init__.py:56(_should_decompose_because_unsafe_op)
     2295    0.042    0.000    0.042    0.000 {built-in method posix.stat}
      230    0.009    0.000    0.042    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/_subclasses/fake_impls.py:1010(fast_detach)
        5    0.000    0.000    0.042    0.008 ~/vv/this312/lib/python3.12/site-packages/torch/export/_unlift.py:178(_unlift)
     5215    0.008    0.000    0.042    0.000 ~/vv/this312/lib/python3.12/site-packages/torch/_subclasses/meta_utils.py:172(is_sparse_any)
    17470    0.024    0.000    0.042    0.000 /usr/lib/python3.12/typing.py:2370(get_args)
       10    0.000    0.000    0.042    0.004 ~/vv/this312/lib/python3.12/site-packages/torch/_export/passes/replace_with_hop_pass_util.py:157(_replace_with_hop_pass_helper)
     5180    0.021    0.000    0.041    0.000 /usr/lib/python3.12/inspect.py:754(unwrap)
     8075    0.022    0.000    0.040    0.000 /usr/lib/python3.12/inspect.py:2743(__init__)
    83575    0.040    0.000    0.040    0.000 {built-in method _abc._abc_instancecheck}
       10    0.000    0.000    0.039    0.004 ~/github/onnxscript/onnxscript/rewriter/__init__.py:26(rewrite)
 3135/130    0.008    0.000    0.038    0.000 /usr/lib/python3.12/copy.py:118(deepcopy)
    40/10    0.000    0.000    0.038    0.004 ~/vv/this312/lib/python3.12/site-packages/torch/nn/functional.py:807(_max_pool2d)
done.
profile dynopt: <function export_dynopt at 0x7f50d07a1620>
done.

Benchmark exported models with ORT¶

def benchmark(shape):
    """Benchmark the exported ONNX models found in the current directory.

    Every file of the current directory whose name starts with
    ``plot_torch`` and has the ``.onnx`` extension is run with onnxruntime
    for every combination of execution providers (CPU, and CUDA when
    ``has_cuda`` is true) and of the session option
    ``session.disable_aot_function_inlining`` (``"0"`` and ``"1"``).

    For each configuration the function:

    * creates a first session which is asked to save the optimized model
      as ``ort-<name>-<cpu|cuda>-aot<0|1>.onnx``,
    * records what ``start_spying_on`` reports while the session is
      created, during the first run and during the warmup runs,
    * measures the inference time with ``measure_time``,
    * measures the time of session creation followed by one run.

    A configuration which fails is kept in the first dataframe with the
    columns ``error`` and ``step`` (``"load"`` if the session could not be
    created, ``"run"`` if the first inference failed).

    All results are also saved as ``plot_torch_export_ort_*.csv`` and
    ``plot_torch_export_ort_*.xlsx`` files.

    :param shape: shape of the random float32 tensor fed to the first
        input of every model
    :return: five dataframes: inference time, session creation + first run
        time, memory while loading, memory during the first run,
        memory during the warmup runs
    """
    from onnxruntime import InferenceSession, SessionOptions, GraphOptimizationLevel

    providers = [["CPUExecutionProvider"]]
    if has_cuda:
        providers.append(["CUDAExecutionProvider", "CPUExecutionProvider"])

    data = []
    data1 = []
    data_mem_load = []
    data_mem_first_run = []
    data_mem_run = []
    confs = list(
        itertools.product(
            [f for f in os.listdir(".") if ".onnx" in f and f.startswith("plot_torch")],
            providers,
            ["0", "1"],
        )
    )
    loop = tqdm(confs)
    print(f"number of experiments: {len(loop)}")
    for name, ps, aot in loop:
        # ".onnx" may appear anywhere in the name, only keep real .onnx files
        root = os.path.split(name)[-1]
        _, ext = os.path.splitext(root)
        if ext != ".onnx":
            continue

        obs = {}  # system_info()
        obs["name"] = name
        obs["providers"] = ",".join(ps)
        p = "CUDA" if "CUDA" in obs["providers"] else "CPU"
        obs["compute"] = p
        # aot holds the value of "disable_aot_function_inlining":
        # "0" means the inlining is enabled, reported as aot=1
        obs["aot"] = 1 if aot == "0" else 0
        obs["export"] = name.replace("plot_torch_export_", "").replace(".onnx", "")

        if not has_cuda and p == "CUDA":
            continue

        onx = onnx.load(name)
        obs["n_nodes"] = len(onx.graph.node)
        obs["n_function"] = len(onx.functions or [])
        obs["n_sub"] = len([n for n in onx.graph.node if n.op_type == "Sub"])
        obs1 = obs.copy()
        short_obs = dict(
            name=obs["name"],
            aot=obs["aot"],
            providers=obs["providers"],
            export=obs["export"],
            compute=obs["compute"],
        )

        # first session: checks the model loads and saves the optimized model
        opts = SessionOptions()
        opts.add_session_config_entry("session.disable_aot_function_inlining", aot)
        opts.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
        opts.optimized_model_filepath = (
            f"ort-{name.replace('.onnx', '')}-{p.lower()}-aot{1 if aot == '0' else 0}.onnx"
        )

        try:
            InferenceSession(name, opts, providers=ps)
        except Exception as e:
            loop.set_description(f"ERROR-load: {name} {e}")
            # the failure happened while creating the session
            obs.update({"error": e, "step": "load"})
            data.append(obs)
            continue

        # second session: same options without saving the optimized model,
        # this is the one being measured
        opts = SessionOptions()
        opts.add_session_config_entry("session.disable_aot_function_inlining", aot)
        opts.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
        stat = start_spying_on(cuda=1 if has_cuda else 0)
        sess = InferenceSession(name, opts, providers=ps)
        memobs = flatten(stat.stop())
        memobs.update(short_obs)
        data_mem_load.append(memobs)

        input_name = sess.get_inputs()[0].name
        feeds = {input_name: np.random.rand(*shape).astype(np.float32)}

        stat = start_spying_on(cuda=1 if has_cuda else 0)
        try:
            sess.run(None, feeds)
        except Exception as e:
            loop.set_description(f"ERROR-run: {name} {e}")
            # the failure happened during the first inference
            obs.update({"error": e, "step": "run"})
            data.append(obs)
            stat.stop()
            continue
        memobs = flatten(stat.stop())
        memobs.update(short_obs)
        data_mem_first_run.append(memobs)

        # memory consumption
        stat = start_spying_on(cuda=1 if has_cuda else 0)
        for _ in range(script_args.warmup):
            sess.run(None, feeds)
        memobs = flatten(stat.stop())
        memobs.update(short_obs)
        data_mem_run.append(memobs)

        obs.update(
            measure_time(
                # default arguments bind the current session and inputs
                lambda sess=sess, feeds=feeds: sess.run(None, feeds),
                max_time=script_args.maxtime,
                repeat=script_args.repeat,
                number=1,
            )
        )

        loop.set_description(f"{obs['average']} {name} {ps}")
        data.append(obs)

        # check first run
        obs1.update(
            measure_time(
                lambda name=name, opts=opts, ps=ps, feeds=feeds: InferenceSession(
                    name, opts, providers=ps
                ).run(None, feeds),
                max_time=script_args.maxtime,
                repeat=max(1, script_args.repeat // 2),
                number=1,
            )
        )
        data1.append(obs1)

    df = pandas.DataFrame(data)
    df.to_csv("plot_torch_export_ort_time.csv", index=False)
    df.to_excel("plot_torch_export_ort_time.xlsx", index=False)
    df1 = pandas.DataFrame(data1)
    df1.to_csv("plot_torch_export_ort_time1_init.csv", index=False)
    df1.to_excel("plot_torch_export_ort_time1_init.xlsx", index=False)
    dfmem = pandas.DataFrame(data_mem_load)
    dfmem.to_csv("plot_torch_export_ort_load_mem.csv", index=False)
    dfmem.to_excel("plot_torch_export_ort_load_mem.xlsx", index=False)
    dfmemr = pandas.DataFrame(data_mem_run)
    dfmemr.to_csv("plot_torch_export_ort_run_mem.csv", index=False)
    dfmemr.to_excel("plot_torch_export_ort_run_mem.xlsx", index=False)
    dfmemfr = pandas.DataFrame(data_mem_first_run)
    dfmemfr.to_csv("plot_torch_export_ort_first_run_mem.csv", index=False)
    dfmemfr.to_excel("plot_torch_export_ort_first_run_mem.xlsx", index=False)
    return df, df1, dfmem, dfmemfr, dfmemr


# Runs the benchmark with the shape of input_tensor and displays the
# dataframe holding the inference times.
df, df_init, dfmem, dfmemfr, dfmemr = benchmark(list(input_tensor.shape))
print(df)

  0%|          | 0/20 [00:00<?, ?it/s]number of experiments: 20

774530103809392e-05 plot_torch_export_cus_p2.onnx ['CPUExecutionProvider']:   0%|          | 0/20 [00:01<?, ?it/s]
774530103809392e-05 plot_torch_export_cus_p2.onnx ['CPUExecutionProvider']:   5%|▌         | 1/20 [00:01<00:27,  1.45s/it]
85442191134514e-05 plot_torch_export_cus_p2.onnx ['CPUExecutionProvider']:   5%|▌         | 1/20 [00:02<00:27,  1.45s/it]
85442191134514e-05 plot_torch_export_cus_p2.onnx ['CPUExecutionProvider']:  10%|█         | 2/20 [00:02<00:19,  1.06s/it]
0011334410512887174 plot_torch_export_cus_p2.onnx ['CUDAExecutionProvider', 'CPUExecutionProvider']:  10%|█         | 2/20 [00:03<00:19,  1.06s/it]
0011334410512887174 plot_torch_export_cus_p2.onnx ['CUDAExecutionProvider', 'CPUExecutionProvider']:  15%|█▌        | 3/20 [00:03<00:17,  1.06s/it]
0015487084047522547 plot_torch_export_cus_p2.onnx ['CUDAExecutionProvider', 'CPUExecutionProvider']:  15%|█▌        | 3/20 [00:03<00:17,  1.06s/it]
0015487084047522547 plot_torch_export_cus_p2.onnx ['CUDAExecutionProvider', 'CPUExecutionProvider']:  20%|██        | 4/20 [00:04<00:15,  1.04it/s]
094345524299926e-05 plot_torch_export_dynopt.onnx ['CPUExecutionProvider']:  20%|██        | 4/20 [00:04<00:15,  1.04it/s]
094345524299926e-05 plot_torch_export_dynopt.onnx ['CPUExecutionProvider']:  25%|██▌       | 5/20 [00:04<00:12,  1.18it/s]
639918181899357e-05 plot_torch_export_dynopt.onnx ['CPUExecutionProvider']:  25%|██▌       | 5/20 [00:05<00:12,  1.18it/s]
639918181899357e-05 plot_torch_export_dynopt.onnx ['CPUExecutionProvider']:  30%|███       | 6/20 [00:05<00:10,  1.30it/s]
0009676481709343184 plot_torch_export_dynopt.onnx ['CUDAExecutionProvider', 'CPUExecutionProvider']:  30%|███       | 6/20 [00:05<00:10,  1.30it/s]
0009676481709343184 plot_torch_export_dynopt.onnx ['CUDAExecutionProvider', 'CPUExecutionProvider']:  35%|███▌      | 7/20 [00:06<00:09,  1.33it/s]
0013565394561466523 plot_torch_export_dynopt.onnx ['CUDAExecutionProvider', 'CPUExecutionProvider']:  35%|███▌      | 7/20 [00:06<00:09,  1.33it/s]
0013565394561466523 plot_torch_export_dynopt.onnx ['CUDAExecutionProvider', 'CPUExecutionProvider']:  40%|████      | 8/20 [00:07<00:09,  1.21it/s]
677814703928677e-05 plot_torch_export_dynamo.onnx ['CPUExecutionProvider']:  40%|████      | 8/20 [00:07<00:09,  1.21it/s]
677814703928677e-05 plot_torch_export_dynamo.onnx ['CPUExecutionProvider']:  45%|████▌     | 9/20 [00:07<00:08,  1.28it/s]
941147486441518e-05 plot_torch_export_dynamo.onnx ['CPUExecutionProvider']:  45%|████▌     | 9/20 [00:08<00:08,  1.28it/s]
941147486441518e-05 plot_torch_export_dynamo.onnx ['CPUExecutionProvider']:  50%|█████     | 10/20 [00:08<00:07,  1.36it/s]
0007144002052966894 plot_torch_export_dynamo.onnx ['CUDAExecutionProvider', 'CPUExecutionProvider']:  50%|█████     | 10/20 [00:08<00:07,  1.36it/s]
0007144002052966894 plot_torch_export_dynamo.onnx ['CUDAExecutionProvider', 'CPUExecutionProvider']:  55%|█████▌    | 11/20 [00:09<00:06,  1.43it/s]
0008318764285788694 plot_torch_export_dynamo.onnx ['CUDAExecutionProvider', 'CPUExecutionProvider']:  55%|█████▌    | 11/20 [00:09<00:06,  1.43it/s]
0008318764285788694 plot_torch_export_dynamo.onnx ['CUDAExecutionProvider', 'CPUExecutionProvider']:  60%|██████    | 12/20 [00:09<00:05,  1.44it/s]
236133662293357e-05 plot_torch_export_script.onnx ['CPUExecutionProvider']:  60%|██████    | 12/20 [00:10<00:05,  1.44it/s]
236133662293357e-05 plot_torch_export_script.onnx ['CPUExecutionProvider']:  65%|██████▌   | 13/20 [00:10<00:04,  1.45it/s]
37282538627513e-05 plot_torch_export_script.onnx ['CPUExecutionProvider']:  65%|██████▌   | 13/20 [00:10<00:04,  1.45it/s]
37282538627513e-05 plot_torch_export_script.onnx ['CPUExecutionProvider']:  70%|███████   | 14/20 [00:10<00:04,  1.50it/s]
0007548896853152725 plot_torch_export_script.onnx ['CUDAExecutionProvider', 'CPUExecutionProvider']:  70%|███████   | 14/20 [00:11<00:04,  1.50it/s]
0007548896853152725 plot_torch_export_script.onnx ['CUDAExecutionProvider', 'CPUExecutionProvider']:  75%|███████▌  | 15/20 [00:11<00:03,  1.56it/s]
0007309904576326989 plot_torch_export_script.onnx ['CUDAExecutionProvider', 'CPUExecutionProvider']:  75%|███████▌  | 15/20 [00:12<00:03,  1.56it/s]
0007309904576326989 plot_torch_export_script.onnx ['CUDAExecutionProvider', 'CPUExecutionProvider']:  80%|████████  | 16/20 [00:12<00:02,  1.59it/s]
436829438438482e-05 plot_torch_export_cus_p0.onnx ['CPUExecutionProvider']:  80%|████████  | 16/20 [00:12<00:02,  1.59it/s]
436829438438482e-05 plot_torch_export_cus_p0.onnx ['CPUExecutionProvider']:  85%|████████▌ | 17/20 [00:12<00:02,  1.50it/s]
242102288097068e-05 plot_torch_export_cus_p0.onnx ['CPUExecutionProvider']:  85%|████████▌ | 17/20 [00:13<00:02,  1.50it/s]
242102288097068e-05 plot_torch_export_cus_p0.onnx ['CPUExecutionProvider']:  90%|█████████ | 18/20 [00:13<00:01,  1.55it/s]
0007602554666610322 plot_torch_export_cus_p0.onnx ['CUDAExecutionProvider', 'CPUExecutionProvider']:  90%|█████████ | 18/20 [00:13<00:01,  1.55it/s]
0007602554666610322 plot_torch_export_cus_p0.onnx ['CUDAExecutionProvider', 'CPUExecutionProvider']:  95%|█████████▌| 19/20 [00:14<00:00,  1.58it/s]
0007602310370472238 plot_torch_export_cus_p0.onnx ['CUDAExecutionProvider', 'CPUExecutionProvider']:  95%|█████████▌| 19/20 [00:14<00:00,  1.58it/s]
0007602310370472238 plot_torch_export_cus_p0.onnx ['CUDAExecutionProvider', 'CPUExecutionProvider']: 100%|██████████| 20/20 [00:14<00:00,  1.62it/s]
0007602310370472238 plot_torch_export_cus_p0.onnx ['CUDAExecutionProvider', 'CPUExecutionProvider']: 100%|██████████| 20/20 [00:14<00:00,  1.36it/s]
                             name                                   providers compute  aot  export  n_nodes  ...  max_exec  repeat  number     ttime  context_size  warmup_time
 plot_torch_export_cus_p2.onnx                        CPUExecutionProvider     CPU    1  cus_p2       12  ...  0.000070       1  1734.0  0.117470            64     0.000297
 plot_torch_export_cus_p2.onnx                        CPUExecutionProvider     CPU    0  cus_p2       12  ...  0.000134       1  1287.0  0.126826            64     0.000366
 plot_torch_export_cus_p2.onnx  CUDAExecutionProvider,CPUExecutionProvider    CUDA    1  cus_p2       12  ...  0.001289       1   117.0  0.132613            64     0.001657
 plot_torch_export_cus_p2.onnx  CUDAExecutionProvider,CPUExecutionProvider    CUDA    0  cus_p2       12  ...  0.001685       1    84.0  0.130092            64     0.002684
 plot_torch_export_dynopt.onnx                        CPUExecutionProvider     CPU    1  dynopt       13  ...  0.000100       1  1173.0  0.106677            64     0.000316
 plot_torch_export_dynopt.onnx                        CPUExecutionProvider     CPU    0  dynopt       13  ...  0.000256       1  1265.0  0.109295            64     0.000601
 plot_torch_export_dynopt.onnx  CUDAExecutionProvider,CPUExecutionProvider    CUDA    1  dynopt       13  ...  0.001662       1   117.0  0.113215            64     0.002181
 plot_torch_export_dynopt.onnx  CUDAExecutionProvider,CPUExecutionProvider    CUDA    0  dynopt       13  ...  0.001411       1   114.0  0.154645            64     0.001793
 plot_torch_export_dynamo.onnx                        CPUExecutionProvider     CPU    1  dynamo       13  ...  0.000142       1  1503.0  0.100368            64     0.000361
 plot_torch_export_dynamo.onnx                        CPUExecutionProvider     CPU    0  dynamo       13  ...  0.000095       1  1293.0  0.102679            64     0.000347
plot_torch_export_dynamo.onnx  CUDAExecutionProvider,CPUExecutionProvider    CUDA    1  dynamo       13  ...  0.001068       1   151.0  0.107874            64     0.001688
plot_torch_export_dynamo.onnx  CUDAExecutionProvider,CPUExecutionProvider    CUDA    0  dynamo       13  ...  0.000858       1   126.0  0.104816            64     0.001903
plot_torch_export_script.onnx                        CPUExecutionProvider     CPU    1  script       12  ...  0.000098       1  1723.0  0.107449            64     0.000367
plot_torch_export_script.onnx                        CPUExecutionProvider     CPU    0  script       12  ...  0.000109       1  2265.0  0.144344            64     0.000326
plot_torch_export_script.onnx  CUDAExecutionProvider,CPUExecutionProvider    CUDA    1  script       12  ...  0.001012       1   143.0  0.107949            64     0.001761
plot_torch_export_script.onnx  CUDAExecutionProvider,CPUExecutionProvider    CUDA    0  script       12  ...  0.001096       1   177.0  0.129385            64     0.001868
plot_torch_export_cus_p0.onnx                        CPUExecutionProvider     CPU    1  cus_p0       15  ...  0.000096       1  1763.0  0.131111            64     0.000427
plot_torch_export_cus_p0.onnx                        CPUExecutionProvider     CPU    0  cus_p0       15  ...  0.000081       1  1923.0  0.120036            64     0.000351
plot_torch_export_cus_p0.onnx  CUDAExecutionProvider,CPUExecutionProvider    CUDA    1  cus_p0       15  ...  0.001160       1   135.0  0.102634            64     0.001992
plot_torch_export_cus_p0.onnx  CUDAExecutionProvider,CPUExecutionProvider    CUDA    0  cus_p0       15  ...  0.000769       1   135.0  0.102631            64     0.001782

[20 rows x 17 columns]

Other view

def view_time(df, title, suffix="time"):
    """Pivots the column ``average`` per exporter and draws it per device.

    The pivot over all rows is printed and saved as
    ``plot_torch_export_ort_<suffix>_compute.csv`` and ``.xlsx``.
    The figure has two panels, CPU on the left and CUDA on the right
    (only drawn when ``has_cuda`` is true), and is saved as
    ``plot_torch_export_ort_<suffix>.png``.

    :param df: dataframe with the columns ``export``, ``compute``,
        ``aot``, ``average``
    :param title: title of the figure
    :param suffix: string inserted in the names of the saved files
    :return: the array of axes created by ``plt.subplots``
    """
    # every pivot of this function shares the same layout
    layout = dict(index="export", columns=["compute", "aot"], values="average")

    overall = pandas.pivot_table(df, **layout)
    print(overall)
    overall.to_csv(f"plot_torch_export_ort_{suffix}_compute.csv")
    overall.to_excel(f"plot_torch_export_ort_{suffix}_compute.xlsx")

    cpu_table = pandas.pivot_table(df[df.compute == "CPU"], **layout)

    fig, axes = plt.subplots(1, 2, figsize=(12, 4))
    fig.suptitle(title)
    cpu_table.plot.barh(ax=axes[0], title="CPU")

    if has_cuda:
        cuda_table = pandas.pivot_table(df[df.compute == "CUDA"], **layout)
        cuda_table.plot.barh(ax=axes[1], title="CUDA")

    fig.tight_layout()
    fig.savefig(f"plot_torch_export_ort_{suffix}.png")
    return axes


# Pivots and plots the average inference time stored in df.
view_time(df, "Compares onnxruntime time on exported models")

Compares onnxruntime time on exported models, CPU, CUDA

compute       CPU                CUDA
aot             0         1         0         1
export
cus_p0   0.000062  0.000074  0.000760  0.000760
cus_p2   0.000099  0.000068  0.001549  0.001133
dynamo   0.000079  0.000067  0.000832  0.000714
dynopt   0.000086  0.000091  0.001357  0.000968
script   0.000064  0.000062  0.000731  0.000755

array([<Axes: title={'center': 'CPU'}, ylabel='export'>,
       <Axes: title={'center': 'CUDA'}, ylabel='export'>], dtype=object)

New graph without the very long times.

# CPU rows only; the rows of dynamo and dynopt are kept only when aot == 1.
piv_cpu = pandas.pivot_table(
    df[(df.compute == "CPU") & ((df.aot == 1) | ~df.export.isin(["dynamo", "dynopt"]))],
    values="average",
    index="export",
    columns=["compute", "aot"],
)

fig, ax = plt.subplots(1, 2, figsize=(12, 4))
fig.suptitle("Compares onnxruntime time on exported models\nHide dynamo without AOT")
piv_cpu.plot.barh(ax=ax[0], title="CPU")

if has_cuda:
    # the CUDA panel is not filtered
    piv_gpu = pandas.pivot_table(
        df[df.compute == "CUDA"],
        values="average",
        index="export",
        columns=["compute", "aot"],
    )
    piv_gpu.plot.barh(ax=ax[1], title="CUDA")

fig.tight_layout()
fig.savefig("plot_torch_export_ort_time_2.png")

Compares onnxruntime time on exported models Hide dynamo without AOT, CPU, CUDA

Let’s do the same with the loading time + the first run.

# Same pivot and plot applied to df_init, the second dataframe
# returned by benchmark (session creation followed by one run).
view_time(
    df_init,
    "Compares onnxruntime loading time and first run on exported models",
    suffix="time1_init",
)

Compares onnxruntime loading time and first run on exported models, CPU, CUDA

compute       CPU                CUDA
aot             0         1         0         1
export
cus_p0   0.005893  0.011159  0.015356  0.020402
cus_p2   0.008422  0.006899  0.029318  0.023198
dynamo   0.007780  0.006664  0.030340  0.019251
dynopt   0.008386  0.007506  0.020447  0.019667
script   0.006734  0.005652  0.034694  0.036693

array([<Axes: title={'center': 'CPU'}, ylabel='export'>,
       <Axes: title={'center': 'CUDA'}, ylabel='export'>], dtype=object)

Memory Loading Time (ORT)¶

# One figure per device, CUDA is skipped when the provider is not available.
for compute in ["CPU", "CUDA"]:
    if has_cuda or compute != "CUDA":
        ax = memory_peak_plot(
            dfmem[dfmem.compute == compute],
            ("export", "aot"),
            suptitle=f"Memory Consumption of onnxruntime loading time\nrunning on {compute}",
            # reference bars at one and two times model_size, divided by 2**20
            bars=[model_size * k / 2**20 for k in (1, 2)],
            figsize=(18, 6),
        )
        get_figure(ax).savefig(f"plot_torch_export_ort_load_mem_{compute}.png")

Memory First Running Time (ORT)¶

# One figure per device, CUDA is skipped when the provider is not available.
for compute in ["CPU", "CUDA"]:
    if has_cuda or compute != "CUDA":
        ax = memory_peak_plot(
            dfmemfr[dfmemfr.compute == compute],
            ("export", "aot"),
            suptitle=f"Memory Consumption of onnxruntime first running time"
            f"\nrunning on {compute}",
            # reference bars at one and two times model_size, divided by 2**20
            bars=[model_size * k / 2**20 for k in (1, 2)],
            figsize=(18, 6),
        )
        get_figure(ax).savefig(f"plot_torch_export_ort_first_run_mem_{compute}.png")

Memory Running Time (ORT)¶

# One figure per device, CUDA is skipped when the provider is not available.
for compute in ["CPU", "CUDA"]:
    if has_cuda or compute != "CUDA":
        ax = memory_peak_plot(
            dfmemr[dfmemr.compute == compute],
            ("export", "aot"),
            suptitle=f"Memory Consumption of onnxruntime running time\nrunning on {compute}",
            # reference bars at one and two times model_size, divided by 2**20
            bars=[model_size * k / 2**20 for k in (1, 2)],
            figsize=(18, 6),
        )
        get_figure(ax).savefig(f"plot_torch_export_ort_run_mem_{compute}.png")

Show the interesting models for CPU¶

script¶

# Optimized model saved by the benchmark for the "script" exporter on CPU
# (this section used to load the cus_p2 model, shown in the next section).
# Nothing is printed if the file was not produced.
model = "ort-plot_torch_export_script-cpu-aot0.onnx"
if os.path.exists(model):
    print(pretty_onnx(onnx.load(model)))

opset: domain='' version=18
opset: domain='ai.onnx.ml' version=5
opset: domain='onnx_extended.ortops.optim.cuda' version=1000
opset: domain='ai.onnx.training' version=1
opset: domain='ai.onnx.preview.training' version=1
opset: domain='com.microsoft' version=1
opset: domain='com.microsoft.experimental' version=1
opset: domain='com.microsoft.nchwc' version=1
opset: domain='org.pytorch.aten' version=1
input: name='input' type=dtype('float32') shape=[1, 1, 16, 16]
init: name='_onx_concat_gatherelements__shape_max_pool2d_1000' type=int64 shape=(2,) -- array([ 1, -1])-- GraphBuilder.constant_folding.from/fold(_onx_gatherelements__shape_max_pool2d_100,init7_s1_-1)##_onx_gatherelements__shape_max_pool2d_100/GraphBuilder.constant_folding.from/fold(_shape_max_pool2d_10,init7_s1_0)##_shape_max_pool2d_10/##init7_s1_0/Opset.make_node.1/Shape##init7_s1_-1/Opset.make_node.1/Shape
init: name='GemmTransposePattern--_onx_transpose_p_fc1_weight0' type=float32 shape=(512, 16)-- GraphBuilder.constant_folding.from/fold(_onx_transpose_p_fc1_weight0)##_onx_transpose_p_fc1_weight0/GraphBuilder.constant_folding.from/fold(p_fc1_weight)##p_fc1_weight/DynamoInterpret.placeholder.1/P(fc1.weight)
init: name='GemmTransposePattern--_onx_transpose_p_fc2_weight0' type=float32 shape=(128, 512)-- GraphBuilder.constant_folding.from/fold(_onx_transpose_p_fc2_weight0)##_onx_transpose_p_fc2_weight0/GraphBuilder.constant_folding.from/fold(p_fc2_weight)##p_fc2_weight/DynamoInterpret.placeholder.1/P(fc2.weight)
init: name='GemmTransposePattern--_onx_transpose_p_fc3_weight0' type=float32 shape=(10, 128)-- GraphBuilder.constant_folding.from/fold(_onx_transpose_p_fc3_weight0)##_onx_transpose_p_fc3_weight0/GraphBuilder.constant_folding.from/fold(p_fc3_weight)##p_fc3_weight/DynamoInterpret.placeholder.1/P(fc3.weight)
init: name='reorder' type=float32 shape=(16, 1, 5, 5)
init: name='conv1.bias' type=float32 shape=(16,)                      -- DynamoInterpret.placeholder.1/P(conv1.bias)
init: name='reorder_token_2' type=float32 shape=(16, 16, 5, 5)
init: name='conv2.bias' type=float32 shape=(16,)                      -- DynamoInterpret.placeholder.1/P(conv2.bias)
init: name='fc1.bias' type=float32 shape=(512,)                       -- DynamoInterpret.placeholder.1/P(fc1.bias)
init: name='fc2.bias' type=float32 shape=(128,)                       -- DynamoInterpret.placeholder.1/P(fc2.bias)
init: name='fc3.bias' type=float32 shape=(10,)                        -- DynamoInterpret.placeholder.1/P(fc3.bias)
Conv[com.microsoft.nchwc](input, reorder, conv1.bias, activation=b'Relu', dilations=[1,1], group=1, strides=[1,1], pads=[0,0,0,0], auto_pad=b'NOTSET') -> reorder_token_0
  MaxPool[com.microsoft.nchwc](reorder_token_0, storage_order=0, auto_pad=b'NOTSET', ceil_mode=0, dilations=[1,1], kernel_shape=[2,2], pads=[0,0,0,0], strides=[2,2]) -> reorder_token_1
    Conv[com.microsoft.nchwc](reorder_token_1, reorder_token_2, conv2.bias, activation=b'Relu', dilations=[1,1], group=1, strides=[1,1], pads=[0,0,0,0], auto_pad=b'NOTSET') -> reorder_token_3
      MaxPool[com.microsoft.nchwc](reorder_token_3, storage_order=0, auto_pad=b'NOTSET', ceil_mode=0, dilations=[1,1], kernel_shape=[2,2], pads=[0,0,0,0], strides=[2,2]) -> reorder_token_4
        ReorderOutput[com.microsoft.nchwc](reorder_token_4, channels_last=0, channels=16) -> max_pool2d_1
          Reshape(max_pool2d_1, _onx_concat_gatherelements__shape_max_pool2d_1000, allowzero=0) -> flatten
            FusedGemm[com.microsoft](flatten, GemmTransposePattern--_onx_transpose_p_fc1_weight0, fc1.bias, transA=0, beta=1.00, activation=b'Relu', transB=1, alpha=1.00) -> relu_2
              FusedGemm[com.microsoft](relu_2, GemmTransposePattern--_onx_transpose_p_fc2_weight0, fc2.bias, transA=0, beta=1.00, activation=b'Relu', transB=1, alpha=1.00) -> relu_3
                Gemm(relu_3, GemmTransposePattern--_onx_transpose_p_fc3_weight0, fc3.bias, transA=0, beta=1.00, transB=1, alpha=1.00) -> output_0
output: name='output_0' type=dtype('float32') shape=[1, 10]

cus_p2¶

# Optimized model saved by the benchmark for the "cus_p2" exporter on CPU.
# Nothing is printed if the file was not produced.
model = "ort-plot_torch_export_cus_p2-cpu-aot0.onnx"
if os.path.exists(model):
    print(pretty_onnx(onnx.load(model)))

opset: domain='' version=18
opset: domain='ai.onnx.ml' version=5
opset: domain='onnx_extended.ortops.optim.cuda' version=1000
opset: domain='ai.onnx.training' version=1
opset: domain='ai.onnx.preview.training' version=1
opset: domain='com.microsoft' version=1
opset: domain='com.microsoft.experimental' version=1
opset: domain='com.microsoft.nchwc' version=1
opset: domain='org.pytorch.aten' version=1
input: name='input' type=dtype('float32') shape=[1, 1, 16, 16]
init: name='_onx_concat_gatherelements__shape_max_pool2d_1000' type=int64 shape=(2,) -- array([ 1, -1])-- GraphBuilder.constant_folding.from/fold(_onx_gatherelements__shape_max_pool2d_100,init7_s1_-1)##_onx_gatherelements__shape_max_pool2d_100/GraphBuilder.constant_folding.from/fold(_shape_max_pool2d_10,init7_s1_0)##_shape_max_pool2d_10/##init7_s1_0/Opset.make_node.1/Shape##init7_s1_-1/Opset.make_node.1/Shape
init: name='GemmTransposePattern--_onx_transpose_p_fc1_weight0' type=float32 shape=(512, 16)-- GraphBuilder.constant_folding.from/fold(_onx_transpose_p_fc1_weight0)##_onx_transpose_p_fc1_weight0/GraphBuilder.constant_folding.from/fold(p_fc1_weight)##p_fc1_weight/DynamoInterpret.placeholder.1/P(fc1.weight)
init: name='GemmTransposePattern--_onx_transpose_p_fc2_weight0' type=float32 shape=(128, 512)-- GraphBuilder.constant_folding.from/fold(_onx_transpose_p_fc2_weight0)##_onx_transpose_p_fc2_weight0/GraphBuilder.constant_folding.from/fold(p_fc2_weight)##p_fc2_weight/DynamoInterpret.placeholder.1/P(fc2.weight)
init: name='GemmTransposePattern--_onx_transpose_p_fc3_weight0' type=float32 shape=(10, 128)-- GraphBuilder.constant_folding.from/fold(_onx_transpose_p_fc3_weight0)##_onx_transpose_p_fc3_weight0/GraphBuilder.constant_folding.from/fold(p_fc3_weight)##p_fc3_weight/DynamoInterpret.placeholder.1/P(fc3.weight)
init: name='reorder' type=float32 shape=(16, 1, 5, 5)
init: name='conv1.bias' type=float32 shape=(16,)                      -- DynamoInterpret.placeholder.1/P(conv1.bias)
init: name='reorder_token_2' type=float32 shape=(16, 16, 5, 5)
init: name='conv2.bias' type=float32 shape=(16,)                      -- DynamoInterpret.placeholder.1/P(conv2.bias)
init: name='fc1.bias' type=float32 shape=(512,)                       -- DynamoInterpret.placeholder.1/P(fc1.bias)
init: name='fc2.bias' type=float32 shape=(128,)                       -- DynamoInterpret.placeholder.1/P(fc2.bias)
init: name='fc3.bias' type=float32 shape=(10,)                        -- DynamoInterpret.placeholder.1/P(fc3.bias)
Conv[com.microsoft.nchwc](input, reorder, conv1.bias, activation=b'Relu', dilations=[1,1], group=1, strides=[1,1], pads=[0,0,0,0], auto_pad=b'NOTSET') -> reorder_token_0
  MaxPool[com.microsoft.nchwc](reorder_token_0, storage_order=0, auto_pad=b'NOTSET', ceil_mode=0, dilations=[1,1], kernel_shape=[2,2], pads=[0,0,0,0], strides=[2,2]) -> reorder_token_1
    Conv[com.microsoft.nchwc](reorder_token_1, reorder_token_2, conv2.bias, activation=b'Relu', dilations=[1,1], group=1, strides=[1,1], pads=[0,0,0,0], auto_pad=b'NOTSET') -> reorder_token_3
      MaxPool[com.microsoft.nchwc](reorder_token_3, storage_order=0, auto_pad=b'NOTSET', ceil_mode=0, dilations=[1,1], kernel_shape=[2,2], pads=[0,0,0,0], strides=[2,2]) -> reorder_token_4
        ReorderOutput[com.microsoft.nchwc](reorder_token_4, channels_last=0, channels=16) -> max_pool2d_1
          Reshape(max_pool2d_1, _onx_concat_gatherelements__shape_max_pool2d_1000, allowzero=0) -> flatten
            FusedGemm[com.microsoft](flatten, GemmTransposePattern--_onx_transpose_p_fc1_weight0, fc1.bias, transA=0, beta=1.00, activation=b'Relu', transB=1, alpha=1.00) -> relu_2
              FusedGemm[com.microsoft](relu_2, GemmTransposePattern--_onx_transpose_p_fc2_weight0, fc2.bias, transA=0, beta=1.00, activation=b'Relu', transB=1, alpha=1.00) -> relu_3
                Gemm(relu_3, GemmTransposePattern--_onx_transpose_p_fc3_weight0, fc3.bias, transA=0, beta=1.00, transB=1, alpha=1.00) -> output_0
output: name='output_0' type=dtype('float32') shape=[1, 10]

dynopt¶

# Optimized model saved by the benchmark for the "dynopt" exporter on CPU.
# Nothing is printed if the file was not produced.
model = "ort-plot_torch_export_dynopt-cpu-aot1.onnx"
if os.path.exists(model):
    print(pretty_onnx(onnx.load(model)))

opset: domain='pkg.onnxscript.torch_lib.common' version=1
opset: domain='' version=18
opset: domain='ai.onnx.ml' version=5
opset: domain='onnx_extended.ortops.optim.cuda' version=1000
opset: domain='ai.onnx.training' version=1
opset: domain='ai.onnx.preview.training' version=1
opset: domain='com.microsoft' version=1
opset: domain='com.microsoft.experimental' version=1
opset: domain='com.microsoft.nchwc' version=1
opset: domain='org.pytorch.aten' version=1
input: name='x' type=dtype('float32') shape=[1, 1, 16, 16]
init: name='reorder' type=float32 shape=(16, 1, 5, 5)
init: name='conv1.bias' type=float32 shape=(16,)
init: name='reorder_token_2' type=float32 shape=(16, 16, 5, 5)
init: name='conv2.bias' type=float32 shape=(16,)
init: name='fc1.weight' type=float32 shape=(512, 16)
init: name='fc1.bias' type=float32 shape=(512,)
init: name='fc2.weight' type=float32 shape=(128, 512)
init: name='fc2.bias' type=float32 shape=(128,)
init: name='fc3.weight' type=float32 shape=(10, 128)
init: name='fc3.bias' type=float32 shape=(10,)
init: name='val_3' type=int64 shape=(2,) -- array([ 1, 16])
Conv[com.microsoft.nchwc](x, reorder, conv1.bias, activation=b'Relu', group=1, strides=[1,1], pads=[0,0,0,0], auto_pad=b'NOTSET', dilations=[1,1]) -> reorder_token_0
  MaxPool[com.microsoft.nchwc](reorder_token_0, pads=[0,0,0,0], kernel_shape=[2,2], ceil_mode=0, auto_pad=b'NOTSET', dilations=[1,1], strides=[2,2], storage_order=0) -> reorder_token_1
    Conv[com.microsoft.nchwc](reorder_token_1, reorder_token_2, conv2.bias, activation=b'Relu', group=1, strides=[1,1], pads=[0,0,0,0], auto_pad=b'NOTSET', dilations=[1,1]) -> reorder_token_3
      MaxPool[com.microsoft.nchwc](reorder_token_3, pads=[0,0,0,0], kernel_shape=[2,2], ceil_mode=0, auto_pad=b'NOTSET', dilations=[1,1], strides=[2,2], storage_order=0) -> reorder_token_4
        ReorderOutput[com.microsoft.nchwc](reorder_token_4, channels_last=0, channels=16) -> max_pool2d_1
          Reshape(max_pool2d_1, val_3, allowzero=0) -> view
            FusedGemm[com.microsoft](view, fc1.weight, fc1.bias, transA=0, alpha=1.00, activation=b'Relu', transB=1, beta=1.00) -> relu_2
              FusedGemm[com.microsoft](relu_2, fc2.weight, fc2.bias, transA=0, alpha=1.00, activation=b'Relu', transB=1, beta=1.00) -> relu_3
                Gemm(relu_3, fc3.weight, fc3.bias, transA=0, alpha=1.00, transB=1, beta=1.00) -> linear_2
output: name='linear_2' type=dtype('float32') shape=[1, 10]

dynamo¶

# Optimized model saved by the benchmark for the "dynamo" exporter on CPU.
# Nothing is printed if the file was not produced.
model = "ort-plot_torch_export_dynamo-cpu-aot1.onnx"
if os.path.exists(model):
    print(pretty_onnx(onnx.load(model)))

opset: domain='pkg.onnxscript.torch_lib.common' version=1
opset: domain='' version=18
opset: domain='ai.onnx.ml' version=5
opset: domain='onnx_extended.ortops.optim.cuda' version=1000
opset: domain='ai.onnx.training' version=1
opset: domain='ai.onnx.preview.training' version=1
opset: domain='com.microsoft' version=1
opset: domain='com.microsoft.experimental' version=1
opset: domain='com.microsoft.nchwc' version=1
opset: domain='org.pytorch.aten' version=1
input: name='x' type=dtype('float32') shape=[1, 1, 16, 16]
init: name='reorder' type=float32 shape=(16, 1, 5, 5)
init: name='conv1.bias' type=float32 shape=(16,)
init: name='reorder_token_2' type=float32 shape=(16, 16, 5, 5)
init: name='conv2.bias' type=float32 shape=(16,)
init: name='fc1.weight' type=float32 shape=(512, 16)
init: name='fc1.bias' type=float32 shape=(512,)
init: name='fc2.weight' type=float32 shape=(128, 512)
init: name='fc2.bias' type=float32 shape=(128,)
init: name='fc3.weight' type=float32 shape=(10, 128)
init: name='fc3.bias' type=float32 shape=(10,)
init: name='val_3' type=int64 shape=(2,) -- array([ 1, 16])
Conv[com.microsoft.nchwc](x, reorder, conv1.bias, activation=b'Relu', group=1, strides=[1,1], pads=[0,0,0,0], auto_pad=b'NOTSET', dilations=[1,1]) -> reorder_token_0
  MaxPool[com.microsoft.nchwc](reorder_token_0, pads=[0,0,0,0], kernel_shape=[2,2], ceil_mode=0, auto_pad=b'NOTSET', dilations=[1,1], strides=[2,2], storage_order=0) -> reorder_token_1
    Conv[com.microsoft.nchwc](reorder_token_1, reorder_token_2, conv2.bias, activation=b'Relu', group=1, strides=[1,1], pads=[0,0,0,0], auto_pad=b'NOTSET', dilations=[1,1]) -> reorder_token_3
      MaxPool[com.microsoft.nchwc](reorder_token_3, pads=[0,0,0,0], kernel_shape=[2,2], ceil_mode=0, auto_pad=b'NOTSET', dilations=[1,1], strides=[2,2], storage_order=0) -> reorder_token_4
        ReorderOutput[com.microsoft.nchwc](reorder_token_4, channels_last=0, channels=16) -> max_pool2d_1
          Reshape(max_pool2d_1, val_3, allowzero=0) -> view
            FusedGemm[com.microsoft](view, fc1.weight, fc1.bias, transA=0, alpha=1.00, activation=b'Relu', transB=1, beta=1.00) -> relu_2
              FusedGemm[com.microsoft](relu_2, fc2.weight, fc2.bias, transA=0, alpha=1.00, activation=b'Relu', transB=1, beta=1.00) -> relu_3
                Gemm(relu_3, fc3.weight, fc3.bias, transA=0, alpha=1.00, transB=1, beta=1.00) -> linear_2
output: name='linear_2' type=dtype('float32') shape=[1, 10]

Show the interesting models for CUDA¶

script¶

# This section is about the TorchScript-based exporter ("script"), so load the
# file saved for that exporter rather than the cus_p2 one, which has its own
# section further down.
# NOTE(review): the name follows the pattern of the sibling snippets
# (ort-plot_torch_export_<exporter>-cuda-aot<k>.onnx) -- confirm the aot suffix.
model = "ort-plot_torch_export_script-cuda-aot0.onnx"
# The file may be missing (it is looked up by name only); print nothing then.
if os.path.exists(model):
    print(pretty_onnx(onnx.load(model)))

opset: domain='' version=18
opset: domain='ai.onnx.ml' version=5
opset: domain='onnx_extended.ortops.optim.cuda' version=1000
opset: domain='ai.onnx.training' version=1
opset: domain='ai.onnx.preview.training' version=1
opset: domain='com.microsoft' version=1
opset: domain='com.microsoft.experimental' version=1
opset: domain='com.microsoft.nchwc' version=1
opset: domain='org.pytorch.aten' version=1
input: name='input' type=dtype('float32') shape=[1, 1, 16, 16]
init: name='_onx_concat_gatherelements__shape_max_pool2d_1000' type=int64 shape=(2,) -- array([ 1, -1])-- GraphBuilder.constant_folding.from/fold(_onx_gatherelements__shape_max_pool2d_100,init7_s1_-1)##_onx_gatherelements__shape_max_pool2d_100/GraphBuilder.constant_folding.from/fold(_shape_max_pool2d_10,init7_s1_0)##_shape_max_pool2d_10/##init7_s1_0/Opset.make_node.1/Shape##init7_s1_-1/Opset.make_node.1/Shape
init: name='GemmTransposePattern--_onx_transpose_p_fc1_weight0' type=float32 shape=(512, 16)-- GraphBuilder.constant_folding.from/fold(_onx_transpose_p_fc1_weight0)##_onx_transpose_p_fc1_weight0/GraphBuilder.constant_folding.from/fold(p_fc1_weight)##p_fc1_weight/DynamoInterpret.placeholder.1/P(fc1.weight)
init: name='GemmTransposePattern--_onx_transpose_p_fc2_weight0' type=float32 shape=(128, 512)-- GraphBuilder.constant_folding.from/fold(_onx_transpose_p_fc2_weight0)##_onx_transpose_p_fc2_weight0/GraphBuilder.constant_folding.from/fold(p_fc2_weight)##p_fc2_weight/DynamoInterpret.placeholder.1/P(fc2.weight)
init: name='GemmTransposePattern--_onx_transpose_p_fc3_weight0' type=float32 shape=(10, 128)-- GraphBuilder.constant_folding.from/fold(_onx_transpose_p_fc3_weight0)##_onx_transpose_p_fc3_weight0/GraphBuilder.constant_folding.from/fold(p_fc3_weight)##p_fc3_weight/DynamoInterpret.placeholder.1/P(fc3.weight)
init: name='conv1.weight' type=float32 shape=(16, 1, 5, 5)            -- DynamoInterpret.placeholder.1/P(conv1.weight)
init: name='conv1.bias' type=float32 shape=(16,)                      -- DynamoInterpret.placeholder.1/P(conv1.bias)
init: name='conv2.weight' type=float32 shape=(16, 16, 5, 5)           -- DynamoInterpret.placeholder.1/P(conv2.weight)
init: name='conv2.bias' type=float32 shape=(16,)                      -- DynamoInterpret.placeholder.1/P(conv2.bias)
init: name='fc1.bias' type=float32 shape=(512,)                       -- DynamoInterpret.placeholder.1/P(fc1.bias)
init: name='fc2.bias' type=float32 shape=(128,)                       -- DynamoInterpret.placeholder.1/P(fc2.bias)
init: name='fc3.bias' type=float32 shape=(10,)                        -- DynamoInterpret.placeholder.1/P(fc3.bias)
Conv(input, conv1.weight, conv1.bias, dilations=[1,1], group=1, pads=[0,0,0,0], strides=[1,1]) -> conv2d
  Relu(conv2d) -> relu
    MaxPool(relu, ceil_mode=0, dilations=[1,1], kernel_shape=[2,2], pads=[0,0,0,0], strides=[2,2]) -> max_pool2d
      Conv(max_pool2d, conv2.weight, conv2.bias, dilations=[1,1], group=1, pads=[0,0,0,0], strides=[1,1]) -> conv2d_1
        Relu(conv2d_1) -> relu_1
          MaxPool(relu_1, ceil_mode=0, dilations=[1,1], kernel_shape=[2,2], pads=[0,0,0,0], strides=[2,2]) -> max_pool2d_1
            Reshape(max_pool2d_1, _onx_concat_gatherelements__shape_max_pool2d_1000) -> flatten
              Gemm(flatten, GemmTransposePattern--_onx_transpose_p_fc1_weight0, fc1.bias, transB=1) -> linear
                Relu(linear) -> relu_2
                  Gemm(relu_2, GemmTransposePattern--_onx_transpose_p_fc2_weight0, fc2.bias, transB=1) -> linear_1
                    Relu(linear_1) -> relu_3
                      Gemm(relu_3, GemmTransposePattern--_onx_transpose_p_fc3_weight0, fc3.bias, transB=1) -> output_0
output: name='output_0' type=dtype('float32') shape=[1, 10]

cus_p2¶

# cus_p2: pretty-print the ONNX graph stored under this file name,
# provided the file is present in the working directory.
model = "ort-plot_torch_export_cus_p2-cuda-aot0.onnx"
if os.path.exists(model):
    cus_p2_proto = onnx.load(model)
    print(pretty_onnx(cus_p2_proto))

opset: domain='' version=18
opset: domain='ai.onnx.ml' version=5
opset: domain='onnx_extended.ortops.optim.cuda' version=1000
opset: domain='ai.onnx.training' version=1
opset: domain='ai.onnx.preview.training' version=1
opset: domain='com.microsoft' version=1
opset: domain='com.microsoft.experimental' version=1
opset: domain='com.microsoft.nchwc' version=1
opset: domain='org.pytorch.aten' version=1
input: name='input' type=dtype('float32') shape=[1, 1, 16, 16]
init: name='_onx_concat_gatherelements__shape_max_pool2d_1000' type=int64 shape=(2,) -- array([ 1, -1])-- GraphBuilder.constant_folding.from/fold(_onx_gatherelements__shape_max_pool2d_100,init7_s1_-1)##_onx_gatherelements__shape_max_pool2d_100/GraphBuilder.constant_folding.from/fold(_shape_max_pool2d_10,init7_s1_0)##_shape_max_pool2d_10/##init7_s1_0/Opset.make_node.1/Shape##init7_s1_-1/Opset.make_node.1/Shape
init: name='GemmTransposePattern--_onx_transpose_p_fc1_weight0' type=float32 shape=(512, 16)-- GraphBuilder.constant_folding.from/fold(_onx_transpose_p_fc1_weight0)##_onx_transpose_p_fc1_weight0/GraphBuilder.constant_folding.from/fold(p_fc1_weight)##p_fc1_weight/DynamoInterpret.placeholder.1/P(fc1.weight)
init: name='GemmTransposePattern--_onx_transpose_p_fc2_weight0' type=float32 shape=(128, 512)-- GraphBuilder.constant_folding.from/fold(_onx_transpose_p_fc2_weight0)##_onx_transpose_p_fc2_weight0/GraphBuilder.constant_folding.from/fold(p_fc2_weight)##p_fc2_weight/DynamoInterpret.placeholder.1/P(fc2.weight)
init: name='GemmTransposePattern--_onx_transpose_p_fc3_weight0' type=float32 shape=(10, 128)-- GraphBuilder.constant_folding.from/fold(_onx_transpose_p_fc3_weight0)##_onx_transpose_p_fc3_weight0/GraphBuilder.constant_folding.from/fold(p_fc3_weight)##p_fc3_weight/DynamoInterpret.placeholder.1/P(fc3.weight)
init: name='conv1.weight' type=float32 shape=(16, 1, 5, 5)            -- DynamoInterpret.placeholder.1/P(conv1.weight)
init: name='conv1.bias' type=float32 shape=(16,)                      -- DynamoInterpret.placeholder.1/P(conv1.bias)
init: name='conv2.weight' type=float32 shape=(16, 16, 5, 5)           -- DynamoInterpret.placeholder.1/P(conv2.weight)
init: name='conv2.bias' type=float32 shape=(16,)                      -- DynamoInterpret.placeholder.1/P(conv2.bias)
init: name='fc1.bias' type=float32 shape=(512,)                       -- DynamoInterpret.placeholder.1/P(fc1.bias)
init: name='fc2.bias' type=float32 shape=(128,)                       -- DynamoInterpret.placeholder.1/P(fc2.bias)
init: name='fc3.bias' type=float32 shape=(10,)                        -- DynamoInterpret.placeholder.1/P(fc3.bias)
Conv(input, conv1.weight, conv1.bias, dilations=[1,1], group=1, pads=[0,0,0,0], strides=[1,1]) -> conv2d
  Relu(conv2d) -> relu
    MaxPool(relu, ceil_mode=0, dilations=[1,1], kernel_shape=[2,2], pads=[0,0,0,0], strides=[2,2]) -> max_pool2d
      Conv(max_pool2d, conv2.weight, conv2.bias, dilations=[1,1], group=1, pads=[0,0,0,0], strides=[1,1]) -> conv2d_1
        Relu(conv2d_1) -> relu_1
          MaxPool(relu_1, ceil_mode=0, dilations=[1,1], kernel_shape=[2,2], pads=[0,0,0,0], strides=[2,2]) -> max_pool2d_1
            Reshape(max_pool2d_1, _onx_concat_gatherelements__shape_max_pool2d_1000) -> flatten
              Gemm(flatten, GemmTransposePattern--_onx_transpose_p_fc1_weight0, fc1.bias, transB=1) -> linear
                Relu(linear) -> relu_2
                  Gemm(relu_2, GemmTransposePattern--_onx_transpose_p_fc2_weight0, fc2.bias, transB=1) -> linear_1
                    Relu(linear_1) -> relu_3
                      Gemm(relu_3, GemmTransposePattern--_onx_transpose_p_fc3_weight0, fc3.bias, transB=1) -> output_0
output: name='output_0' type=dtype('float32') shape=[1, 10]

dynopt¶

# dynopt: pretty-print the ONNX graph stored under this file name,
# provided the file is present in the working directory.
model = "ort-plot_torch_export_dynopt-cuda-aot1.onnx"
if os.path.exists(model):
    dynopt_proto = onnx.load(model)
    print(pretty_onnx(dynopt_proto))

opset: domain='pkg.onnxscript.torch_lib.common' version=1
opset: domain='' version=18
opset: domain='ai.onnx.ml' version=5
opset: domain='onnx_extended.ortops.optim.cuda' version=1000
opset: domain='ai.onnx.training' version=1
opset: domain='ai.onnx.preview.training' version=1
opset: domain='com.microsoft' version=1
opset: domain='com.microsoft.experimental' version=1
opset: domain='com.microsoft.nchwc' version=1
opset: domain='org.pytorch.aten' version=1
input: name='x' type=dtype('float32') shape=[1, 1, 16, 16]
init: name='conv1.weight' type=float32 shape=(16, 1, 5, 5)
init: name='conv1.bias' type=float32 shape=(16,)
init: name='conv2.weight' type=float32 shape=(16, 16, 5, 5)
init: name='conv2.bias' type=float32 shape=(16,)
init: name='fc1.weight' type=float32 shape=(512, 16)
init: name='fc1.bias' type=float32 shape=(512,)
init: name='fc2.weight' type=float32 shape=(128, 512)
init: name='fc2.bias' type=float32 shape=(128,)
init: name='fc3.weight' type=float32 shape=(10, 128)
init: name='fc3.bias' type=float32 shape=(10,)
init: name='val_3' type=int64 shape=(2,) -- array([ 1, 16])
Conv(x, conv1.weight, conv1.bias, group=1, pads=[0,0,0,0], auto_pad=b'NOTSET', strides=[1,1], dilations=[1,1]) -> conv2d
  Relu(conv2d) -> relu
    MaxPool(relu, storage_order=0, dilations=[1,1], ceil_mode=0, pads=[0,0,0,0], auto_pad=b'NOTSET', strides=[2,2], kernel_shape=[2,2]) -> max_pool2d
      Conv(max_pool2d, conv2.weight, conv2.bias, group=1, pads=[0,0,0,0], auto_pad=b'NOTSET', strides=[1,1], dilations=[1,1]) -> conv2d_1
        Relu(conv2d_1) -> relu_1
          MaxPool(relu_1, storage_order=0, dilations=[1,1], ceil_mode=0, pads=[0,0,0,0], auto_pad=b'NOTSET', strides=[2,2], kernel_shape=[2,2]) -> max_pool2d_1
            Reshape(max_pool2d_1, val_3, allowzero=0) -> view
              Gemm(view, fc1.weight, fc1.bias, beta=1.00, transB=1, alpha=1.00, transA=0) -> linear
                Relu(linear) -> relu_2
                  Gemm(relu_2, fc2.weight, fc2.bias, beta=1.00, transB=1, alpha=1.00, transA=0) -> linear_1
                    Relu(linear_1) -> relu_3
                      Gemm(relu_3, fc3.weight, fc3.bias, beta=1.00, transB=1, alpha=1.00, transA=0) -> linear_2
output: name='linear_2' type=dtype('float32') shape=[1, 10]

dynamo¶

# dynamo: pretty-print the ONNX graph stored under this file name,
# provided the file is present in the working directory.
model = "ort-plot_torch_export_dynamo-cuda-aot1.onnx"
if os.path.exists(model):
    dynamo_proto = onnx.load(model)
    print(pretty_onnx(dynamo_proto))

opset: domain='pkg.onnxscript.torch_lib.common' version=1
opset: domain='' version=18
opset: domain='ai.onnx.ml' version=5
opset: domain='onnx_extended.ortops.optim.cuda' version=1000
opset: domain='ai.onnx.training' version=1
opset: domain='ai.onnx.preview.training' version=1
opset: domain='com.microsoft' version=1
opset: domain='com.microsoft.experimental' version=1
opset: domain='com.microsoft.nchwc' version=1
opset: domain='org.pytorch.aten' version=1
input: name='x' type=dtype('float32') shape=[1, 1, 16, 16]
init: name='conv1.weight' type=float32 shape=(16, 1, 5, 5)
init: name='conv1.bias' type=float32 shape=(16,)
init: name='conv2.weight' type=float32 shape=(16, 16, 5, 5)
init: name='conv2.bias' type=float32 shape=(16,)
init: name='fc1.weight' type=float32 shape=(512, 16)
init: name='fc1.bias' type=float32 shape=(512,)
init: name='fc2.weight' type=float32 shape=(128, 512)
init: name='fc2.bias' type=float32 shape=(128,)
init: name='fc3.weight' type=float32 shape=(10, 128)
init: name='fc3.bias' type=float32 shape=(10,)
init: name='val_3' type=int64 shape=(2,) -- array([ 1, 16])
Conv(x, conv1.weight, conv1.bias, group=1, pads=[0,0,0,0], auto_pad=b'NOTSET', strides=[1,1], dilations=[1,1]) -> conv2d
  Relu(conv2d) -> relu
    MaxPool(relu, storage_order=0, dilations=[1,1], ceil_mode=0, pads=[0,0,0,0], auto_pad=b'NOTSET', strides=[2,2], kernel_shape=[2,2]) -> max_pool2d
      Conv(max_pool2d, conv2.weight, conv2.bias, group=1, pads=[0,0,0,0], auto_pad=b'NOTSET', strides=[1,1], dilations=[1,1]) -> conv2d_1
        Relu(conv2d_1) -> relu_1
          MaxPool(relu_1, storage_order=0, dilations=[1,1], ceil_mode=0, pads=[0,0,0,0], auto_pad=b'NOTSET', strides=[2,2], kernel_shape=[2,2]) -> max_pool2d_1
            Reshape(max_pool2d_1, val_3, allowzero=0) -> view
              Gemm(view, fc1.weight, fc1.bias, beta=1.00, transB=1, alpha=1.00, transA=0) -> linear
                Relu(linear) -> relu_2
                  Gemm(relu_2, fc2.weight, fc2.bias, beta=1.00, transB=1, alpha=1.00, transA=0) -> linear_1
                    Relu(linear_1) -> relu_3
                      Gemm(relu_3, fc3.weight, fc3.bias, beta=1.00, transB=1, alpha=1.00, transA=0) -> linear_2
output: name='linear_2' type=dtype('float32') shape=[1, 10]

Total running time of the script: (1 minutes 21.352 seconds)