Note
Go to the end to download the full example code.
201: Evaluate DORT Training¶
It compares DORT to eager mode and onnxrt backend.
To run the script:
python _doc/examples/plot_torch_aot_201.py --help
Some helpers¶
import warnings
try:
with warnings.catch_warnings():
warnings.simplefilter("ignore")
import onnxruntime
has_cuda = "CUDAExecutionProvider" in onnxruntime.get_available_providers()
except ImportError:
print("onnxruntime not available.")
import sys
sys.exit(0)
import torch._dynamo
import contextlib
import itertools
import os
import gc
import platform
# import pickle
import pprint
import multiprocessing
import time
import cProfile
import pstats
import io
import logging
from pstats import SortKey
import numpy as np
import matplotlib.pyplot as plt
import pandas
import onnx
from onnx_array_api.profiling import profile2graph
import torch
from torch import nn
import torch.nn.functional as F
import experimental_experiment
from experimental_experiment.plotting.memory import memory_peak_plot
from experimental_experiment.ext_test_case import measure_time, get_figure
from experimental_experiment.args import get_parsed_args
from experimental_experiment.memory_peak import start_spying_on
from experimental_experiment.torch_models.training_helper import make_aot_ort
from tqdm import tqdm
has_cuda = has_cuda and torch.cuda.is_available()
logging.disable(logging.ERROR)
def system_info():
    """Collect a short description of the machine running the script.

    The returned dictionary always holds ``processor`` and ``cores``.
    The CUDA entries (``cuda``, ``cuda_count``, ``cuda_name``,
    ``cuda_capa``) are filled in that order; filling stops silently at
    the first query raising ``RuntimeError`` or ``AssertionError``.
    """
    info = {
        "processor": platform.processor(),
        "cores": multiprocessing.cpu_count(),
    }
    cuda_queries = (
        ("cuda", lambda: int(torch.cuda.is_available())),
        ("cuda_count", torch.cuda.device_count),
        ("cuda_name", torch.cuda.get_device_name),
        ("cuda_capa", torch.cuda.get_device_capability),
    )
    try:
        for key, query in cuda_queries:
            info[key] = query()
    except (RuntimeError, AssertionError):
        # No usable CUDA device: keep whatever was collected so far.
        pass
    return info


pprint.pprint(system_info())
{'cores': 20,
'cuda': 1,
'cuda_capa': (8, 9),
'cuda_count': 1,
'cuda_name': 'NVIDIA GeForce RTX 4060 Laptop GPU',
'processor': 'x86_64'}
Script arguments
# Command line options of the example, parsed by ``get_parsed_args``.
script_args = get_parsed_args(
    "plot_torch_aot",
    description=__doc__,
    scenarios=dict(
        small="small model to test",
        middle="55Mb model",
        large="1Gb model",
    ),
    warmup=5,
    repeat=5,
    repeat1=(1, "repeat for the first iteration"),
    maxtime=(
        2,
        "maximum time to run a model to measure the computation time, it is 0.1 when scenario is small",
    ),
    expose="scenarios,repeat,repeat1,warmup",
)

# The small scenario (also the default when no scenario is given)
# overrides the measuring time with a shorter one.
if script_args.scenario is None or script_args.scenario == "small":
    script_args.maxtime = 0.1

# Echo the effective configuration.
print(f"scenario={script_args.scenario or 'small'}")
for option in ("warmup", "repeat", "repeat1", "maxtime"):
    print(f"{option}={getattr(script_args, option)}")
scenario=small
warmup=5
repeat=5
repeat1=1
maxtime=0.1
The model¶
A simple model to convert.
class MyModelClass(nn.Module):
    """Convolution followed by a stack of linear layers, sized by *scenario*.

    :param scenario: ``None`` or ``"small"``, ``"middle"`` or ``"large"``;
        any other value raises ``ValueError``.

    The input size of ``fc1`` is the flattened output of ``conv1`` followed
    by the 4x4 max pooling done in :meth:`forward`:
    144 = 16 channels * 3 * 3 for a 16x16 image (small),
    30752 = 32 channels * 31 * 31 for a 128x128 image (middle, large).
    """

    def __init__(self, scenario=script_args.scenario):
        super().__init__()
        if scenario == "middle":
            self.large = False
            self.conv1 = nn.Conv2d(1, 32, 5)
            self.fc1 = nn.Linear(30752, 1024)
            # Plain (unregistered) list, not used by forward; kept so the
            # attributes of the instance do not change.
            self.fcs = []
            self.fc2 = nn.Linear(1024, 128)
            self.fc3 = nn.Linear(128, 10)
        elif scenario in (None, "small"):
            self.large = False
            self.conv1 = nn.Conv2d(1, 16, 5)
            self.fc1 = nn.Linear(144, 512)
            # Same remark as above: unused by forward.
            self.fcs = []
            self.fc2 = nn.Linear(512, 128)
            self.fc3 = nn.Linear(128, 10)
        elif scenario == "large":
            # ``None`` was listed in this test as well but it could never
            # reach it: the previous branch already handles ``None``.
            self.large = True
            self.conv1 = nn.Conv2d(1, 32, 5)
            self.fc1 = nn.Linear(30752, 4096)
            # torch script does not support loops: the nine hidden layers
            # are declared one by one instead of living in a list.
            self.fca = nn.Linear(4096, 4096)
            self.fcb = nn.Linear(4096, 4096)
            self.fcc = nn.Linear(4096, 4096)
            self.fcd = nn.Linear(4096, 4096)
            self.fce = nn.Linear(4096, 4096)
            self.fcf = nn.Linear(4096, 4096)
            self.fcg = nn.Linear(4096, 4096)
            self.fch = nn.Linear(4096, 4096)
            self.fci = nn.Linear(4096, 4096)
            # end of the unfolded loop.
            self.fc2 = nn.Linear(4096, 128)
            self.fc3 = nn.Linear(128, 10)
        else:
            raise ValueError(f"Unsupported scenario={scenario!r}.")

    def forward(self, x):
        """Return the output of ``fc3`` (10 values per sample), no final activation."""
        x = F.max_pool2d(F.relu(self.conv1(x)), (4, 4))
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        if self.large:
            # unfolded loop over the hidden layers of the large scenario
            x = F.relu(self.fca(x))
            x = F.relu(self.fcb(x))
            x = F.relu(self.fcc(x))
            x = F.relu(self.fcd(x))
            x = F.relu(self.fce(x))
            x = F.relu(self.fcf(x))
            x = F.relu(self.fcg(x))
            x = F.relu(self.fch(x))
            x = F.relu(self.fci(x))
            # end of the loop
        x = F.relu(self.fc2(x))
        y = self.fc3(x)
        return y
def create_model_and_input(scenario=script_args.scenario):
    """Build a model for *scenario* with one random input and one random target.

    :param scenario: ``None`` or ``"small"`` (16x16 image),
        ``"middle"`` or ``"large"`` (128x128 image)
    :return: ``model, (input_tensor, y)`` where ``y`` has shape ``(1, 10)``
    :raises ValueError: if the scenario is not one of the values above

    The model is called once on the input before being returned.
    """
    if scenario in (None, "small"):
        dims = (1, 1, 16, 16)
    elif scenario in ("middle", "large"):
        dims = (1, 1, 128, 128)
    else:
        raise ValueError(f"Unsupported scenario={scenario!r}.")

    features = torch.rand(*dims).to(torch.float32)
    target = torch.rand((1, 10)).to(torch.float32)
    net = MyModelClass(scenario=scenario)
    # Smoke test: the forward pass must produce something.
    assert net(features) is not None
    return net, (features, target)
def torch_model_size(model):
    """Return the size in bytes of all the parameters of *model*.

    :param model: a ``torch.nn.Module``
    :return: sum over the parameters of ``numel() * element_size()``,
        0 when the model has no parameter

    ``element_size()`` is used instead of ``torch.finfo(dtype).bits / 8``
    because ``torch.finfo`` only accepts floating point types and raises
    ``TypeError`` on integer or boolean parameters. Buffers are not counted.
    """
    return sum(param.numel() * param.element_size() for param in model.parameters())
# Instantiate the model of the selected scenario and measure the size
# of its parameters.
model, input_tensors = create_model_and_input()
model_size = torch_model_size(model)
# The size is divided by 2 ** 20 before being displayed with the "Mb" label.
print(f"model size={model_size / 2 ** 20} Mb")
model size=0.5401992797851562 Mb
Backends¶
def run(model, tensor_x, tensor_y):
    """Run one forward and one backward pass, without any weight update.

    :param model: module (eager or compiled) called on *tensor_x*
    :param tensor_x: input tensor, detached before use
    :param tensor_y: expected output, detached before use
    :return: ``loss, gradients`` where *loss* is the MSE between the
        model output and *tensor_y* and *gradients* is a tuple holding
        ``param.grad`` for every parameter, in ``model.parameters()`` order
    :raises AssertionError: if the forward pass raises any exception
        (the original exception is chained)

    The gradients are materialized in a tuple right after the backward
    pass. A lazy generator would read ``param.grad`` only when consumed
    and therefore yield ``None`` (or later values) as soon as the
    gradients are reset, which this function does at every call.
    """
    tensor_x = tensor_x.detach()
    tensor_y = tensor_y.detach()
    # Start from a clean state: gradients are not accumulated across calls.
    for param in model.parameters():
        param.grad = None
    try:
        output = model(tensor_x)
    except Exception as e:
        raise AssertionError(f"issue with {type(tensor_x)}") from e
    loss = F.mse_loss(output, tensor_y)

    # Kept as a separate function so that the backward pass shows up
    # under its own name when the script is profiled.
    def _backward_():
        loss.backward()

    _backward_()
    return loss, tuple(param.grad for param in model.parameters())
def get_torch_eager(model, *args):
    """Compile *model* with a backend returning the captured graph unchanged.

    :param model: module to compile
    :param args: tensors forwarded to ``run`` to trigger the compilation
    :return: the module returned by ``torch.compile``

    Standard output and warnings are silenced while compiling and running.
    """

    def my_compiler(gm, example_inputs):
        # No transformation: the graph module is executed as is.
        return gm.forward

    with contextlib.redirect_stdout(io.StringIO()):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            optimized_mod = torch.compile(model, fullgraph=True, backend=my_compiler)
            # The first call triggers the compilation. It used to sit inside
            # an ``assert``: the call would be skipped with ``python -O`` and
            # the asserted value, a non-empty tuple, can never be false.
            run(optimized_mod, *args)
            return optimized_mod
def get_torch_default(model, *args):
    """Compile *model* with ``torch.compile`` using ``mode="reduce-overhead"``.

    :param model: module to compile
    :param args: tensors forwarded to ``run`` to trigger the compilation
    :return: the module returned by ``torch.compile``

    Standard output and warnings are silenced while compiling and running.
    """
    with contextlib.redirect_stdout(io.StringIO()):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            optimized_mod = torch.compile(model, fullgraph=True, mode="reduce-overhead")
            # The first call triggers the compilation. It used to sit inside
            # an ``assert``: the call would be skipped with ``python -O`` and
            # the asserted value, a non-empty tuple, can never be false.
            run(optimized_mod, *args)
            return optimized_mod
def get_torch_dort(model, *args):
    """Compile *model* with the backend built by ``make_aot_ort``.

    :param model: module to compile
    :param args: tensors forwarded to ``run``
    :return: the module returned by ``torch.compile``

    The compiled module is run twice, as in the original code.
    Standard output and warnings are silenced while compiling and running.
    """
    with contextlib.redirect_stdout(io.StringIO()):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            local_aot_ort, _ = make_aot_ort(dynamic=True, rewrite=True)
            optimized_mod = torch.compile(model, backend=local_aot_ort, fullgraph=True)
            run(optimized_mod, *args)
            # The second call used to sit inside an ``assert``: it would be
            # skipped with ``python -O`` and the asserted value, a non-empty
            # tuple, can never be false.
            run(optimized_mod, *args)
            return optimized_mod
Let’s check they are working.
export_functions = [
    get_torch_eager,
    get_torch_default,
    get_torch_dort,
]

# Name of a backend = name of its function without ``get_``.
exporters = {f.__name__.replace("get_", ""): f for f in export_functions}

# Backends able to compile and run the model on this machine.
supported_exporters = {}
for k, v in exporters.items():
    print(f"run function {k}")
    # Not used in the visible part of the script; kept in case a later
    # section reads it.
    filename = f"plot_torch_aot_{k}.onnx"
    torch._dynamo.reset()
    model, input_tensors = create_model_and_input()
    try:
        # The exporter itself must be called. The loop used to call
        # ``run(model, ...)`` on the eager model, which never exercised the
        # backend and marked every exporter as supported.
        v(model, *input_tensors)
    except Exception as e:
        print(f"skipped due to {str(e)[:1000]}")
        continue
    supported_exporters[k] = v
    del model
    gc.collect()
    time.sleep(1)
run function torch_eager
run function torch_default
run function torch_dort
Compile and Memory¶
def flatten(ps):
    """Merge cpu and gpu measurements into a single flat dictionary.

    :param ps: mapping with a ``"cpu"`` entry and optionally a ``"gpus"``
        entry (an iterable); every measurement exposes ``to_dict(unit=...)``
    :return: the cpu dictionary completed with the values of every gpu,
        stored under ``gpu<index>_<key>``; ``to_dict`` is always called
        with ``unit=2**20``
    """
    unit = 2**20
    flat = ps["cpu"].to_dict(unit=unit)
    gpus = ps["gpus"] if "gpus" in ps else ()
    for index, gpu in enumerate(gpus):
        flat.update(
            {f"gpu{index}_{name}": value for name, value in gpu.to_dict(unit=unit).items()}
        )
    return flat
# Memory measured while every supported backend compiles and runs the
# model, on cpu first and then on cuda when it is available.
data = []
for k, export_fn in supported_exporters.items():
    # Not used in the visible part of the script; kept in case a later
    # section reads it.
    filename = f"plot_torch_aot_{k}.onnx"
    if has_cuda:
        torch.cuda.set_device(0)

    for device in ["cpu", "cuda"] if has_cuda else ["cpu"]:
        print(f"run compile for memory {k} on {device}")
        torch._dynamo.reset()
        model, input_tensors = create_model_and_input()
        if device == "cuda":
            model = model.cuda()
            input_tensors = [t.cuda() for t in input_tensors]

        stat = start_spying_on(cuda=1 if has_cuda else 0)
        try:
            # The backend must be the measured piece of code. The loop used
            # to call ``run(model, ...)`` on the eager model, so all the
            # backends reported the memory of the same eager execution.
            export_fn(model, *input_tensors)
        finally:
            # Always stop the spy, even if the backend fails.
            measures = stat.stop()
        obs = flatten(measures)
        print("done.")
        obs.update(dict(export=k, p=device))
        data.append(obs)

        del model
        gc.collect()
        time.sleep(1)
run compile for memory torch_eager on cpu
done.
run compile for memory torch_eager on cuda
done.
run compile for memory torch_default on cpu
done.
run compile for memory torch_default on cuda
done.
run compile for memory torch_dort on cpu
done.
run compile for memory torch_dort on cuda
done.
The result.
# Save the memory measurements (csv and excel) and display them.
df1 = pandas.DataFrame(data)
df1.to_csv("plot_torch_aot_1_memory.csv", index=False)
df1.to_excel("plot_torch_aot_1_memory.xlsx", index=False)
print(df1)

# One figure per device; cuda is skipped when it is not available.
for p in ["cpu", "cuda"]:
    if p == "cuda" and not has_cuda:
        continue
    rows = df1[df1["p"] == p]
    # Horizontal marks at 1x, 2x, 3x and 4x the model size.
    marks = [model_size * factor / 2**20 for factor in (1, 2, 3, 4)]
    title = (
        f"Memory Consumption of the Compilation on {p}\n"
        f"model size={model_size / 2**20:1.0f} Mb"
    )
    ax = memory_peak_plot(rows, key=("export",), bars=marks, suptitle=title)
    get_figure(ax).savefig(f"plot_torch_aot_1_memory_{p}.png")
peak mean n begin end gpu0_peak gpu0_mean gpu0_n gpu0_begin gpu0_end export p
0 3103.039062 3101.729167 3 3101.089844 3101.058594 552.617188 552.617188 3 552.617188 552.617188 torch_eager cpu
1 3197.410156 3140.345459 48 3101.042969 3197.410156 560.617188 553.950521 48 552.617188 560.617188 torch_eager cuda
2 3199.574219 3198.843750 3 3199.394531 3199.574219 560.617188 560.617188 3 560.617188 560.617188 torch_default cpu
3 3197.902344 3197.750000 2 3197.597656 3197.902344 560.617188 560.617188 2 560.617188 560.617188 torch_default cuda
4 3197.898438 3197.895833 3 3197.898438 3197.898438 560.617188 560.617188 3 560.617188 560.617188 torch_dort cpu
5 3197.902344 3197.902344 2 3197.902344 3197.902344 560.617188 560.617188 2 560.617188 560.617188 torch_dort cuda
dort first iteration speed¶
# Time of the first iteration (compilation + first run) of every supported
# backend, on cpu first and then on cuda when it is available.
data = []
for k, export_fn in supported_exporters.items():
    for device in ["cpu", "cuda"] if has_cuda else ["cpu"]:
        print(f"run dort {device} {k}: {script_args.repeat1}")
        times = []
        for _ in range(int(script_args.repeat1)):
            model, input_tensors = create_model_and_input()
            if device == "cuda":
                model = model.cuda()
                input_tensors = [t.cuda() for t in input_tensors]
            # Forget previous compilations so that every repetition
            # measures a first iteration.
            torch._dynamo.reset()
            begin = time.perf_counter()
            # The backend must be the measured piece of code. The loop used
            # to call ``run(model, ...)`` on the eager model, so all the
            # backends reported the time of the same eager execution.
            export_fn(model, *input_tensors)
            if device == "cuda":
                # CUDA kernels are asynchronous: wait for them before
                # reading the clock.
                torch.cuda.synchronize()
            duration = time.perf_counter() - begin
            times.append(duration)
            del model
            gc.collect()
            time.sleep(1)

        print(f"done: {times[-1]}")
        data.append(
            dict(
                export=k,
                time=np.mean(times),
                min=min(times),
                max=max(times),
                first=times[0],
                last=times[-1],
                std=np.std(times),
                p=device,
            )
        )
run dort cpu torch_eager: 1
done: 0.014885390999552328
run dort cuda torch_eager: 1
done: 0.003067197998461779
run dort cpu torch_default: 1
done: 0.007820370999979787
run dort cuda torch_default: 1
done: 0.0023014689977571834
run dort cpu torch_dort: 1
done: 0.006999388999247458
run dort cuda torch_dort: 1
done: 0.004341561001638183
The result.
# Save the timings (csv and excel) and display them.
df1 = pandas.DataFrame(data)
df1.to_csv("plot_torch_aot_1_time.csv", index=False)
df1.to_excel("plot_torch_aot_1_time.xlsx", index=False)
print(df1)

# Bar plot of the average time per (backend, device), the standard
# deviation is used as the error bar.
fig, ax = plt.subplots(nrows=1, ncols=1)
dfi = df1[["export", "p", "time", "std"]].set_index(["export", "p"])
average, deviation = dfi["time"], dfi["std"]
average.plot.bar(ax=ax, title="Compilation time", yerr=deviation, rot=30)
fig.tight_layout()
fig.savefig("plot_torch_aot_1_time.png")
export time min max first last std p
0 torch_eager 0.014885 0.014885 0.014885 0.014885 0.014885 0.0 cpu
1 torch_eager 0.003067 0.003067 0.003067 0.003067 0.003067 0.0 cuda
2 torch_default 0.007820 0.007820 0.007820 0.007820 0.007820 0.0 cpu
3 torch_default 0.002301 0.002301 0.002301 0.002301 0.002301 0.0 cuda
4 torch_dort 0.006999 0.006999 0.006999 0.006999 0.006999 0.0 cpu
5 torch_dort 0.004342 0.004342 0.004342 0.004342 0.004342 0.0 cuda
Compilation Profiling¶
def clean_text(text):
    """Shorten the file names found in a profiling report.

    The folders containing the packages ``torch``, ``onnx`` and
    ``experimental_experiment`` are removed from *text*, then every
    remaining ``experimental_experiment`` is switched to upper case.

    :param text: text to clean
    :return: cleaned text
    """

    def _parent_folder(module):
        # Folder holding the package, i.e. one level above its __init__.
        package_dir = os.path.dirname(module.__file__)
        return os.path.abspath(os.path.normpath(os.path.join(package_dir, "..")))

    for module in (torch, onnx, experimental_experiment):
        text = text.replace(_parent_folder(module), "")
    return text.replace("experimental_experiment", "experimental_experiment".upper())
def profile_function(name, export_function, with_args=True, verbose=False, suffix="export"):
    """Profile *export_function* with cProfile and dump the reports on disk.

    :param name: name inserted in the file names
    :param export_function: function to profile
    :param with_args: if True, a model and its inputs are created, the
        function is called once outside the profiler and then profiled as
        ``export_function(model, input_tensors)``; otherwise it is
        profiled as ``export_function()`` with no prior call
    :param verbose: prints the first 200 lines of the report and progress
    :param suffix: suffix inserted in the file names

    The profiled call is repeated ``script_args.repeat1`` times. Two files
    are written: ``plot_torch_aot_profile_<name>_<suffix>.txt`` (statistics
    sorted by cumulative time) and
    ``plot_torch_aot_profile_<name>_<suffix>_h.txt`` (text produced by the
    root returned by ``profile2graph``).
    """
    if verbose:
        print(f"profile {name}: {export_function}")

    if with_args:
        model, input_tensors = create_model_and_input()
        # First call outside the profiler.
        export_function(model, input_tensors)
        profiler = cProfile.Profile()
        profiler.enable()
        for _ in range(int(script_args.repeat1)):
            export_function(model, input_tensors)
        profiler.disable()
    else:
        profiler = cProfile.Profile()
        profiler.enable()
        for _ in range(int(script_args.repeat1)):
            export_function()
        profiler.disable()

    # Flat report sorted by cumulative time.
    stream = io.StringIO()
    stats = pstats.Stats(profiler, stream=stream).sort_stats(SortKey.CUMULATIVE)
    stats.print_stats()
    report = stream.getvalue()
    if verbose:
        print("\n".join(report.split("\n")[:200]))
    with open(f"plot_torch_aot_profile_{name}_{suffix}.txt", "w") as f:
        f.write(report)

    # Second report built from the call graph.
    root, _ = profile2graph(stats, clean_text=clean_text)
    tree = root.to_text()
    with open(f"plot_torch_aot_profile_{name}_{suffix}_h.txt", "w") as f:
        f.write(tree)
    if verbose:
        print("done.")
# Model and inputs bound below as default values of the profiled function.
model, input_tensors = create_model_and_input()


# Adapter between profile_function, which calls
# ``export_function(model, input_tensors)`` with the inputs packed in a
# single argument, and get_torch_dort, which expects them unpacked.
def function_to_profile(model=model, input_tensors=input_tensors):
    return get_torch_dort(model, *input_tensors)


# with_args keeps its default value (True): profile_function creates its
# own model and inputs and passes them to function_to_profile.
profile_function("dort", function_to_profile, verbose=True, suffix="1")
profile dort: <function function_to_profile at 0x7f00a0713130>
1382362 function calls (1346683 primitive calls) in 1.256 seconds
Ordered by: cumulative time
ncalls tottime percall cumtime percall filename:lineno(function)
1 0.000 0.000 1.388 1.388 /home/xadupre/github/experimental-experiment/_doc/examples/plot_torch_aot_201.py:512(function_to_profile)
1 0.000 0.000 1.388 1.388 /home/xadupre/github/experimental-experiment/_doc/examples/plot_torch_aot_201.py:260(get_torch_dort)
2 0.000 0.000 0.935 0.467 /home/xadupre/github/experimental-experiment/_doc/examples/plot_torch_aot_201.py:220(run)
11/4 0.000 0.000 0.836 0.209 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/nn/modules/module.py:1732(_wrapped_call_impl)
11/4 0.001 0.000 0.836 0.209 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/nn/modules/module.py:1740(_call_impl)
7/6 0.000 0.000 0.684 0.114 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:715(_fn)
3 0.000 0.000 0.529 0.176 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/autograd/graph.py:816(_engine_run_backward)
3 0.004 0.001 0.529 0.176 {method 'run_backward' of 'torch._C._EngineBase' objects}
2 0.000 0.000 0.491 0.245 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:523(_fn)
1 0.000 0.000 0.452 0.452 /home/xadupre/github/experimental-experiment/experimental_experiment/torch_models/training_helper.py:7(make_aot_ort)
1 0.000 0.000 0.451 0.451 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/onnx/_internal/onnxruntime.py:763(__init__)
6/4 0.000 0.000 0.447 0.112 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/utils.py:116(call_func_at_runtime_with_args)
12/4 0.001 0.000 0.443 0.111 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/fx/graph_module.py:821(call_wrapped)
4 0.000 0.000 0.443 0.111 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/fx/graph_module.py:382(__call__)
2 0.000 0.000 0.443 0.221 /home/xadupre/github/experimental-experiment/_doc/examples/plot_torch_aot_201.py:232(_backward_)
2 0.000 0.000 0.443 0.221 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_tensor.py:568(backward)
2 0.000 0.000 0.442 0.221 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/autograd/__init__.py:242(backward)
2 0.000 0.000 0.441 0.220 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/autograd/function.py:292(apply)
2 0.000 0.000 0.441 0.220 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py:1645(backward)
2 0.000 0.000 0.440 0.220 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py:1912(call_compiled_backward)
1 0.000 0.000 0.392 0.392 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/onnx/_internal/_exporter_legacy.py:308(__init__)
1 0.000 0.000 0.388 0.388 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py:1331(__call__)
1 0.000 0.000 0.388 0.388 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py:449(__call__)
1 0.000 0.000 0.388 0.388 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py:597(_compile)
1 0.000 0.000 0.387 0.387 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py:689(compile_inner)
1 0.000 0.000 0.386 0.386 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_utils_internal.py:89(wrapper_function)
1 0.000 0.000 0.386 0.386 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py:709(_compile_inner)
1 0.000 0.000 0.363 0.363 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_dynamo/bytecode_transformation.py:1329(transform_code_object)
1 0.000 0.000 0.361 0.361 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py:204(_fn)
1 0.000 0.000 0.360 0.360 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py:632(transform)
8 0.002 0.000 0.359 0.045 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/onnx/_internal/onnxruntime.py:884(_ort_acclerated_call)
1 0.000 0.000 0.359 0.359 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_dynamo/symbolic_convert.py:2907(run)
6/1 0.000 0.000 0.359 0.359 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_dynamo/symbolic_convert.py:1110(run)
100/44 0.000 0.000 0.358 0.008 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_dynamo/symbolic_convert.py:998(step)
2 0.000 0.000 0.345 0.172 <eval_with_key>.244:4(forward)
1 0.000 0.000 0.334 0.334 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/onnx/_internal/_exporter_legacy.py:95(__init__)
1 0.001 0.001 0.334 0.334 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/onnx/_internal/_exporter_legacy.py:123(_initiate_registry_from_torchlib)
1 0.007 0.007 0.331 0.331 /home/xadupre/github/onnxscript/onnxscript/_framework_apis/torch_2_5.py:107(get_torchlib_ops)
184 0.002 0.000 0.323 0.002 /home/xadupre/github/onnxscript/onnxscript/values.py:640(function_ir)
1 0.000 0.000 0.314 0.314 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_dynamo/symbolic_convert.py:3098(RETURN_VALUE)
1 0.000 0.000 0.314 0.314 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_dynamo/symbolic_convert.py:3070(_return)
1 0.000 0.000 0.314 0.314 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_dynamo/output_graph.py:989(compile_subgraph)
1 0.000 0.000 0.313 0.313 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_dynamo/output_graph.py:1324(compile_and_call_fx_graph)
1 0.000 0.000 0.307 0.307 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_dynamo/output_graph.py:1444(call_user_compiler)
1 0.000 0.000 0.307 0.307 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_dynamo/output_graph.py:1450(_call_user_compiler)
2/1 0.000 0.000 0.307 0.307 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_dynamo/repro/after_dynamo.py:73(__call__)
1 0.000 0.000 0.307 0.307 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/__init__.py:2318(__call__)
1 0.000 0.000 0.307 0.307 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/onnx/_internal/onnxruntime.py:1153(__call__)
1 0.000 0.000 0.307 0.307 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_dynamo/backends/common.py:23(__call__)
1 0.000 0.000 0.306 0.306 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py:940(aot_module_simplified)
1 0.000 0.000 0.301 0.301 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py:1061(dispatch_and_compile)
1 0.000 0.000 0.301 0.301 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py:516(create_aot_dispatcher_function)
1 0.000 0.000 0.301 0.301 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py:529(_create_aot_dispatcher_function)
1 0.000 0.000 0.251 0.251 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/jit_compile_runtime_wrappers.py:337(aot_dispatch_autograd)
184 0.001 0.000 0.215 0.001 /home/xadupre/github/onnxscript/onnxscript/_internal/ast_utils.py:16(get_src_and_ast)
185 0.000 0.000 0.179 0.001 /usr/lib/python3.10/inspect.py:1133(getsource)
185 0.005 0.000 0.178 0.001 /usr/lib/python3.10/inspect.py:1112(getsourcelines)
593/496 0.001 0.000 0.164 0.000 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_ops.py:722(__call__)
890/570 0.002 0.000 0.160 0.000 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/utils/_stats.py:16(wrapper)
1 0.000 0.000 0.160 0.160 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/dispatch_and_compile_graph.py:234(aot_dispatch_autograd_graph)
184 0.026 0.000 0.159 0.001 /usr/lib/python3.10/inspect.py:1101(getblock)
155 0.007 0.000 0.148 0.001 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_subclasses/functional_tensor.py:372(__torch_dispatch__)
1 0.000 0.000 0.145 0.145 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/dispatch_and_compile_graph.py:46(_create_graph)
1 0.000 0.000 0.145 0.145 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/fx/experimental/proxy_tensor.py:2170(wrapped)
1 0.000 0.000 0.145 0.145 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/fx/experimental/proxy_tensor.py:2108(trace)
1 0.000 0.000 0.145 0.145 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/fx/experimental/proxy_tensor.py:1999(_trace_inner)
1 0.000 0.000 0.144 0.144 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_compile.py:22(inner)
1 0.000 0.000 0.144 0.144 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/fx/experimental/proxy_tensor.py:1131(dispatch_trace)
1 0.000 0.000 0.141 0.141 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/fx/_symbolic_trace.py:711(trace)
4 0.000 0.000 0.140 0.035 /home/xadupre/github/onnxscript/onnxscript/optimizer/__init__.py:15(optimize)
4 0.000 0.000 0.140 0.035 /home/xadupre/github/onnxscript/onnxscript/optimizer/_legacy/_optimizer.py:24(optimize)
1 0.000 0.000 0.136 0.136 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/fx/_symbolic_trace.py:698(flatten_fn)
305/12 0.006 0.000 0.135 0.011 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/onnx/_internal/diagnostics/infra/decorator.py:66(wrapper)
1 0.000 0.000 0.135 0.135 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/fx/experimental/proxy_tensor.py:1181(wrapped)
1 0.000 0.000 0.132 0.132 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/traced_function_transforms.py:663(inner_fn)
1 0.000 0.000 0.131 0.131 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/traced_function_transforms.py:643(joint_helper)
1 0.000 0.000 0.131 0.131 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/traced_function_transforms.py:396(_functionalized_f_helper)
6 0.001 0.000 0.120 0.020 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/fx/interpreter.py:117(run)
429 0.001 0.000 0.120 0.000 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/fx/experimental/proxy_tensor.py:1230(__torch_function__)
26710 0.070 0.000 0.119 0.000 /usr/lib/python3.10/tokenize.py:431(_tokenize)
1 0.000 0.000 0.119 0.119 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/traced_function_transforms.py:276(inner_fn_with_anomaly)
1 0.000 0.000 0.119 0.119 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/traced_function_transforms.py:193(inner_fn)
2 0.026 0.013 0.112 0.056 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/onnx/_internal/fx/decomposition_table.py:14(_create_onnx_supports_op_overload_table)
2 0.000 0.000 0.110 0.055 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/onnx/_internal/onnxruntime.py:1099(compile)
2 0.000 0.000 0.109 0.055 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/fx/passes/infra/partitioner.py:370(partition_and_fuse)
591/586 0.002 0.000 0.106 0.000 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_subclasses/fake_tensor.py:1242(__torch_dispatch__)
2 0.000 0.000 0.104 0.052 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/fx/passes/infra/partitioner.py:283(fuse_partitions)
2 0.000 0.000 0.104 0.052 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/fx/passes/utils/fuser_utils.py:244(fuse_by_partitions)
591/586 0.005 0.000 0.103 0.000 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_subclasses/fake_tensor.py:1768(dispatch)
2 0.000 0.000 0.102 0.051 /home/xadupre/github/experimental-experiment/_doc/examples/plot_torch_aot_201.py:165(forward)
2 0.000 0.000 0.102 0.051 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py:1113(forward)
2 0.000 0.000 0.102 0.051 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py:284(runtime_wrapper)
4/2 0.000 0.000 0.101 0.051 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/utils.py:99(g)
2 0.000 0.000 0.101 0.051 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/autograd/function.py:559(apply)
2 0.000 0.000 0.101 0.050 {built-in method apply}
2 0.000 0.000 0.101 0.050 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py:1520(forward)
5 0.000 0.000 0.100 0.020 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/overrides.py:1668(handle_torch_function)
2 0.000 0.000 0.100 0.050 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py:474(wrapper)
2 0.000 0.000 0.100 0.050 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py:659(inner_fn)
184 0.000 0.000 0.098 0.001 /home/xadupre/github/onnxscript/onnxscript/converter.py:1463(translate_function_signature)
2 0.000 0.000 0.098 0.049 <eval_with_key>.240:4(forward)
130 0.001 0.000 0.098 0.001 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/fx/interpreter.py:210(run_node)
184 0.007 0.000 0.097 0.001 /home/xadupre/github/onnxscript/onnxscript/converter.py:1378(_translate_function_signature_common)
212 0.002 0.000 0.096 0.000 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_subclasses/fake_tensor.py:1326(_cached_dispatch_impl)
1 0.000 0.000 0.094 0.094 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_dynamo/backends/common.py:49(_wrapped_bw_compiler)
1 0.000 0.000 0.093 0.093 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/fx/_lazy_graph_module.py:115(_lazy_forward)
82 0.000 0.000 0.092 0.001 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/fx/interpreter.py:288(call_function)
2/1 0.000 0.000 0.087 0.087 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/autograd/__init__.py:358(grad)
297/276 0.001 0.000 0.084 0.000 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/fx/experimental/proxy_tensor.py:1328(__torch_dispatch__)
69/54 0.003 0.000 0.075 0.001 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/fx/experimental/proxy_tensor.py:761(proxy_call)
186085/180763 0.035 0.000 0.074 0.000 {built-in method builtins.isinstance}
8 0.000 0.000 0.069 0.009 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/onnx/_internal/fx/_pass.py:240(run)
4 0.000 0.000 0.066 0.017 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/onnx/_internal/fx/passes/type_promotion.py:1696(_run)
32 0.000 0.000 0.066 0.002 /home/xadupre/github/onnxscript/onnxscript/_legacy_ir/visitor.py:786(visit_model)
4 0.000 0.000 0.065 0.016 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/onnx/_internal/fx/fx_onnx_interpreter.py:463(run)
12 0.000 0.000 0.060 0.005 /home/xadupre/github/onnxscript/onnxscript/rewriter/__init__.py:28(rewrite)
89 0.001 0.000 0.060 0.001 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/onnx/_internal/fx/fx_onnx_interpreter.py:388(run_node)
86 0.001 0.000 0.058 0.001 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/onnx/_internal/fx/passes/type_promotion.py:1601(run_node)
1 0.001 0.001 0.057 0.057 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/onnx/_internal/fx/decomposition_table.py:73(create_onnx_friendly_decomposition_table)
7575/1820 0.010 0.000 0.057 0.000 /home/xadupre/github/onnxscript/onnxscript/type_annotation.py:131(is_value_type)
1 0.000 0.000 0.056 0.056 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_functorch/partitioners.py:1779(min_cut_rematerialization_partition)
2 0.000 0.000 0.056 0.028 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/traced_function_transforms.py:848(functional_call)
38 0.000 0.000 0.054 0.001 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/fx/experimental/symbolic_shapes.py:6471(run_node)
61 0.001 0.000 0.054 0.001 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/onnx/_internal/fx/fx_onnx_interpreter.py:604(call_function)
32 0.001 0.000 0.054 0.002 /home/xadupre/github/onnxscript/onnxscript/_legacy_ir/visitor.py:646(visit_graph)
4 0.001 0.000 0.054 0.013 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/fx/passes/utils/fuser_utils.py:95(fuse_as_graphmodule)
590 0.005 0.000 0.053 0.000 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/fx/graph.py:1104(create_node)
11 0.000 0.000 0.050 0.005 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/fx/graph.py:1562(python_code)
16562 0.008 0.000 0.050 0.000 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/onnx/_internal/_exporter_legacy.py:210(is_registered_op)
1573 0.050 0.000 0.050 0.000 {built-in method builtins.compile}
331/147 0.001 0.000 0.049 0.000 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/utils/_pytree.py:923(tree_map)
397 0.002 0.000 0.047 0.000 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/fx/graph.py:1493(node_copy)
593/536 0.002 0.000 0.047 0.000 /home/xadupre/github/onnxscript/onnxscript/_legacy_ir/visitor.py:799(visit_node)
133/104 0.001 0.000 0.045 0.000 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/nn/modules/module.py:1935(__setattr__)
8 0.000 0.000 0.045 0.006 /home/xadupre/github/onnxscript/onnxscript/optimizer/_legacy/constant_folding.py:270(fold_constants)
8 0.000 0.000 0.045 0.006 /home/xadupre/github/onnxscript/onnxscript/optimizer/_legacy/constant_folding.py:264(visit_model)
8 0.000 0.000 0.045 0.006 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/fx/graph_module.py:792(recompile)
5942/63 0.003 0.000 0.044 0.001 /home/xadupre/github/onnxscript/onnxscript/ir/serde.py:94(wrapper)
13 0.000 0.000 0.043 0.003 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/fx/graph_module.py:548(graph)
1513/224 0.004 0.000 0.043 0.000 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/utils/_pytree.py:801(unflatten)
16621 0.011 0.000 0.043 0.000 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/onnx/_internal/_exporter_legacy.py:188(get_op_functions)
198 0.001 0.000 0.043 0.000 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_subclasses/fake_tensor.py:1701(_output_from_cache_entry)
11524/5330 0.020 0.000 0.042 0.000 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/fx/node.py:883(map_aggregate)
202 0.005 0.000 0.041 0.000 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_subclasses/fake_tensor.py:1635(_get_output_tensor_from_cache_entry)
11 0.000 0.000 0.041 0.004 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/fx/graph.py:1639(_python_code)
11 0.004 0.000 0.041 0.004 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/fx/graph.py:408(_gen_python_code)
14/9 0.000 0.000 0.039 0.004 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_dynamo/symbolic_convert.py:627(wrapper)
212 0.001 0.000 0.039 0.000 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_subclasses/fake_tensor.py:1369(_cache_key)
14/9 0.000 0.000 0.039 0.004 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_dynamo/symbolic_convert.py:1728(CALL_FUNCTION)
14/9 0.000 0.000 0.039 0.004 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_dynamo/symbolic_convert.py:941(call_function)
6208 0.013 0.000 0.038 0.000 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/fx/node.py:854(__setattr__)
314 0.001 0.000 0.038 0.000 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/utils/_pytree.py:1130(tree_map_only)
1 0.000 0.000 0.038 0.038 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/collect_metadata_analysis.py:171(inner)
176 0.002 0.000 0.038 0.000 /home/xadupre/github/onnxscript/onnxscript/optimizer/_legacy/constant_folding.py:165(process_node)
1024 0.001 0.000 0.037 0.000 /home/xadupre/github/onnxscript/onnxscript/type_annotation.py:172(is_valid_type)
10 0.000 0.000 0.037 0.004 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/fx/graph_module.py:437(__init__)
7059/6619 0.004 0.000 0.036 0.000 {built-in method builtins.next}
804/219 0.005 0.000 0.035 0.000 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_subclasses/fake_tensor.py:1445(_prep_args_for_hash)
3697 0.004 0.000 0.034 0.000 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/fx/node.py:874(map_arg)
17/6 0.000 0.000 0.033 0.006 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_dynamo/variables/lazy.py:166(realize_and_forward)
4 0.000 0.000 0.033 0.008 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_dynamo/variables/nn_module.py:850(call_function)
25891 0.032 0.000 0.032 0.000 {method 'match' of 're.Pattern' objects}
5/4 0.000 0.000 0.031 0.008 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_dynamo/variables/functions.py:325(call_function)
5/4 0.000 0.000 0.031 0.008 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_dynamo/variables/functions.py:119(call_function)
5/4 0.000 0.000 0.031 0.008 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_dynamo/symbolic_convert.py:963(inline_user_function_return)
5/4 0.000 0.000 0.031 0.008 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_dynamo/symbolic_convert.py:3120(inline_call)
1 0.000 0.000 0.031 0.031 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/traced_function_transforms.py:109(inner_fn)
897/801 0.002 0.000 0.031 0.000 /home/xadupre/github/onnxscript/onnxscript/type_annotation.py:150(<listcomp>)
5/4 0.000 0.000 0.031 0.008 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_dynamo/symbolic_convert.py:3157(inline_call_)
2800/2665 0.003 0.000 0.030 0.000 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/fx/node.py:889(<listcomp>)
4 0.000 0.000 0.030 0.007 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/fx/passes/utils/common.py:27(lift_subgraph_as_module)
184 0.000 0.000 0.028 0.000 /usr/lib/python3.10/ast.py:33(parse)
4 0.000 0.000 0.028 0.007 /home/xadupre/github/onnxruntime/build/linux_cuda/Release/onnxruntime/capi/onnxruntime_inference_collection.py:406(__init__)
20 0.000 0.000 0.028 0.001 /home/xadupre/github/onnxscript/onnxscript/ir/serde.py:461(deserialize_model)
4 0.027 0.007 0.027 0.007 /home/xadupre/github/onnxruntime/build/linux_cuda/Release/onnxruntime/capi/onnxruntime_inference_collection.py:484(_create_inference_session)
12 0.000 0.000 0.027 0.002 /home/xadupre/github/onnxscript/onnxscript/rewriter/pattern.py:1445(apply_to_model)
12 0.001 0.000 0.025 0.002 /home/xadupre/github/onnxscript/onnxscript/rewriter/pattern.py:1412(_apply_to_graph_or_function)
112 0.001 0.000 0.025 0.000 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/fx/proxy.py:209(create_proxy)
20 0.000 0.000 0.024 0.001 /home/xadupre/github/onnxscript/onnxscript/ir/serde.py:551(_deserialize_graph)
4 0.001 0.000 0.024 0.006 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_functorch/partitioners.py:153(_extract_graph_with_inputs_outputs)
12/9 0.002 0.000 0.023 0.003 {built-in method torch._C._nn.linear}
614 0.004 0.000 0.023 0.000 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/fx/node.py:367(prepend)
59 0.000 0.000 0.023 0.000 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/onnx/_internal/fx/onnxfunction_dispatcher.py:96(dispatch)
1 0.000 0.000 0.023 0.023 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_dynamo/guards.py:2107(__init__)
7575 0.007 0.000 0.023 0.000 /home/xadupre/github/onnxscript/onnxscript/type_annotation.py:123(_is_tensor_type)
2553 0.002 0.000 0.022 0.000 /home/xadupre/github/onnxscript/onnxscript/rewriter/pattern.py:1284(try_rewrite)
28 0.000 0.000 0.022 0.001 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_dynamo/symbolic_convert.py:1869(LOAD_ATTR)
28 0.000 0.000 0.022 0.001 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_dynamo/symbolic_convert.py:1862(_load_attr)
20 0.000 0.000 0.022 0.001 /home/xadupre/github/onnxscript/onnxscript/optimizer/_remove_unused_function.py:64(remove_unused_functions)
12/9 0.000 0.000 0.022 0.002 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/nn/functional.py:1693(relu)
370 0.004 0.000 0.022 0.000 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/fx/graph.py:634(emit_node)
2873/2654 0.002 0.000 0.022 0.000 /usr/lib/python3.10/contextlib.py:130(__enter__)
9 0.001 0.000 0.022 0.002 {built-in method torch.relu}
31641 0.010 0.000 0.022 0.000 {method 'get' of 'dict' objects}
31/29 0.000 0.000 0.021 0.001 /home/xadupre/vv/this/lib/python3.10/site-packages/torch/_dynamo/variables/builtin.py:980(call_function)
done.
Benchmark exported models with ORT¶
def benchmark(shape):
    """Benchmark every export function on CPU and CUDA.

    For each pair ``(export function, compute device)``, the model and its
    inputs are rebuilt with ``create_model_and_input``, the model is exported,
    then called once while memory is being spied on (first run), called
    ``script_args.warmup`` more times under a second spy, timed with
    ``measure_time`` and finally profiled with ``profile_function``.
    CUDA configurations are skipped when ``has_cuda`` is false.

    A failure during the export or during the first run is recorded as a row
    holding the error message (as a string, so that the row can be saved to
    CSV and Excel) and the loop moves on to the next configuration.

    The three result tables are saved as ``plot_torch_aot_ort_*.csv`` and
    ``plot_torch_aot_ort_*.xlsx`` in the current directory.

    :param shape: not read by the function, kept so that existing calls
        remain valid
    :return: tuple ``(df, dfmemfr, dfmemr)`` of :class:`pandas.DataFrame`,
        the timings (or errors), the memory measures of the first run,
        the memory measures of the warm-up runs
    """
    data = []
    data_mem_first_run = []
    data_mem_run = []

    confs = list(
        itertools.product(
            export_functions,
            ["CPU", "CUDA"],
        )
    )
    loop = tqdm(confs)
    print(f"number of experiments: {len(loop)}")

    for export_fct, p in loop:
        name = export_fct.__name__.replace("get_torch_", "")
        obs = {}  # system_info()
        obs["name"] = name
        obs["compute"] = p
        obs["export"] = name

        model, input_tensors = create_model_and_input()
        if p == "CUDA":
            if not has_cuda:
                continue
            model = model.cuda()
            input_tensors = [i.cuda() for i in input_tensors]

        try:
            exported_model = export_fct(model, *input_tensors)
        except Exception as e:
            obs["error"] = str(e)
            data.append(obs)
            continue

        # The defaults bind the current objects so that the closure does not
        # depend on the loop variables being rebound later.
        def call_model(
            exported_model=exported_model,
            input_tensors=input_tensors,
        ):
            res = run(exported_model, *input_tensors)
            return res

        # memory consumption of the first run
        stat = start_spying_on(cuda=1 if has_cuda else 0)
        try:
            call_model()
        except Exception as e:
            loop.set_description(f"ERROR-run: {name} {e}")
            # Store the message, not the exception object: this matches the
            # export failure above and keeps the row writable by to_excel.
            obs.update({"error": str(e), "step": "load"})
            data.append(obs)
            stat.stop()
            continue
        memobs = flatten(stat.stop())
        memobs.update(obs)
        data_mem_first_run.append(memobs)

        # memory consumption of the warm-up runs
        stat = start_spying_on(cuda=1 if has_cuda else 0)
        try:
            for _ in range(script_args.warmup):
                call_model()
        finally:
            # Always stop the spy, even if a warm-up call raises.
            spied = stat.stop()
        memobs = flatten(spied)
        memobs.update(obs)
        data_mem_run.append(memobs)

        obs.update(
            measure_time(
                call_model,
                max_time=script_args.maxtime,
                repeat=script_args.repeat,
                number=1,
            )
        )

        profile_function(name, call_model, with_args=False, suffix=f"run_{p}")

        loop.set_description(f"{obs['average']} {name} {p}")
        data.append(obs)

        # Release the models before the next configuration is built.
        del model
        del exported_model
        gc.collect()
        time.sleep(1)

    df = pandas.DataFrame(data)
    df.to_csv("plot_torch_aot_ort_time.csv", index=False)
    df.to_excel("plot_torch_aot_ort_time.xlsx", index=False)
    dfmemr = pandas.DataFrame(data_mem_run)
    dfmemr.to_csv("plot_torch_aot_ort_run_mem.csv", index=False)
    dfmemr.to_excel("plot_torch_aot_ort_run_mem.xlsx", index=False)
    dfmemfr = pandas.DataFrame(data_mem_first_run)
    dfmemfr.to_csv("plot_torch_aot_ort_first_run_mem.csv", index=False)
    dfmemfr.to_excel("plot_torch_aot_ort_first_run_mem.xlsx", index=False)
    return df, dfmemfr, dfmemr
# Run the benchmark and display the resulting table.
# NOTE(review): ``benchmark`` does not read its ``shape`` argument.
df, dfmemfr, dfmemr = benchmark(list(input_tensors[0].shape))
print(df)
0%| | 0/6 [00:00<?, ?it/s]number of experiments: 6
0.009931689636604542 eager CPU: 0%| | 0/6 [00:00<?, ?it/s]
0.009931689636604542 eager CPU: 17%|█▋ | 1/6 [00:02<00:10, 2.10s/it]
0.0015548287599813193 eager CUDA: 17%|█▋ | 1/6 [00:02<00:10, 2.10s/it]
0.0015548287599813193 eager CUDA: 33%|███▎ | 2/6 [00:03<00:07, 1.96s/it]
0.009628557636460755 default CPU: 33%|███▎ | 2/6 [00:16<00:07, 1.96s/it]
0.009628557636460755 default CPU: 50%|█████ | 3/6 [00:17<00:21, 7.22s/it]
0.0009324981768500316 default CUDA: 50%|█████ | 3/6 [00:23<00:21, 7.22s/it]
0.0009324981768500316 default CUDA: 67%|██████▋ | 4/6 [00:24<00:14, 7.28s/it]/home/xadupre/vv/this/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/utils.py:130: UserWarning: Your compiler for AOTAutograd is returning a function that doesn't take boxed arguments. Please wrap it with functorch.compile.make_boxed_func or handle the boxed arguments yourself. See https://github.com/pytorch/pytorch/pull/83137#issuecomment-1211320670 for rationale.
warnings.warn(
0.005942785190459939 dort CPU: 67%|██████▋ | 4/6 [00:26<00:14, 7.28s/it]
0.005942785190459939 dort CPU: 83%|████████▎ | 5/6 [00:27<00:05, 5.72s/it]/home/xadupre/vv/this/lib/python3.10/site-packages/torch/_functorch/_aot_autograd/utils.py:130: UserWarning: Your compiler for AOTAutograd is returning a function that doesn't take boxed arguments. Please wrap it with functorch.compile.make_boxed_func or handle the boxed arguments yourself. See https://github.com/pytorch/pytorch/pull/83137#issuecomment-1211320670 for rationale.
warnings.warn(
0.0029184219165472314 dort CUDA: 83%|████████▎ | 5/6 [00:29<00:05, 5.72s/it]
0.0029184219165472314 dort CUDA: 100%|██████████| 6/6 [00:30<00:00, 4.74s/it]
0.0029184219165472314 dort CUDA: 100%|██████████| 6/6 [00:30<00:00, 5.10s/it]
name compute export average deviation min_exec max_exec repeat number ttime context_size warmup_time
0 eager CPU eager 0.009932 0.000668 0.008584 0.010456 1 11.0 0.109249 64 0.008224
1 eager CUDA eager 0.001555 0.000574 0.001247 0.003799 1 75.0 0.116612 64 0.004597
2 default CPU default 0.009629 0.001138 0.008059 0.010996 1 11.0 0.105914 64 0.006512
3 default CUDA default 0.000932 0.000089 0.000859 0.001395 1 147.0 0.137077 64 0.002328
4 dort CPU dort 0.005943 0.002426 0.003015 0.010610 1 21.0 0.124798 64 0.003301
5 dort CUDA dort 0.002918 0.000118 0.002435 0.002954 1 36.0 0.105063 64 0.003967
Other view: average time per export and compute device
def view_time(df, title, suffix="time"):
    """Print, save and plot the average time per export and compute device.

    The full pivot (rows ``export``, columns ``compute``, values ``average``)
    is printed and written to ``plot_torch_aot_{suffix}_compute.csv`` and
    ``.xlsx``. A figure with two horizontal bar charts on a logarithmic
    x axis is then drawn, CPU on the left and, only if ``has_cuda`` is true,
    CUDA on the right. It is saved as ``plot_torch_aot_{suffix}.png``.

    :param df: dataframe with columns ``export``, ``compute``, ``average``
    :param title: title of the figure
    :param suffix: inserted in the names of the files written
    :return: the array of axes created by ``plt.subplots(1, 2)``
    """

    def average_by_export(frame):
        # Same pivot for the whole table and for each device subset.
        return pandas.pivot_table(
            frame, index="export", columns=["compute"], values="average"
        )

    overview = average_by_export(df)
    print(overview)
    overview.to_csv(f"plot_torch_aot_{suffix}_compute.csv")
    overview.to_excel(f"plot_torch_aot_{suffix}_compute.xlsx")

    cpu_only = average_by_export(df[df.compute == "CPU"])

    fig, ax = plt.subplots(1, 2, figsize=(12, 4))
    fig.suptitle(title)
    left, right = ax
    cpu_only.plot.barh(ax=left, title="CPU", logx=True)

    if has_cuda:
        cuda_only = average_by_export(df[df.compute == "CUDA"])
        cuda_only.plot.barh(ax=right, title="CUDA", logx=True)

    fig.tight_layout()
    fig.savefig(f"plot_torch_aot_{suffix}.png")
    return ax
# Print, save and plot the average time per export and compute device.
view_time(df, "Compares processing time on backends")
compute CPU CUDA
export
default 0.009629 0.000932
dort 0.005943 0.002918
eager 0.009932 0.001555
array([<Axes: title={'center': 'CPU'}, ylabel='export'>,
<Axes: title={'center': 'CUDA'}, ylabel='export'>], dtype=object)
Memory consumption during the first run (ORT)¶
# One figure per compute device showing the memory measured during the first
# run; the CUDA figure is only produced when CUDA is available.
for compute in ("CPU", "CUDA"):
    if has_cuda or compute != "CUDA":
        ax = memory_peak_plot(
            dfmemfr[dfmemfr.compute == compute],
            ("export",),
            suptitle=f"Memory Consumption of backend, first running time"
            f"\nrunning on {compute}",
            # reference bars at one and two times the model size, in MiB
            bars=[model_size * k / 2**20 for k in (1, 2)],
            figsize=(18, 6),
        )
        get_figure(ax).savefig(f"plot_torch_aot_first_run_mem_{compute}.png")
Memory consumption during the warm-up runs (ORT)¶
# One figure per compute device showing the memory measured during the
# warm-up runs; the CUDA figure is only produced when CUDA is available.
for compute in ("CPU", "CUDA"):
    if has_cuda or compute != "CUDA":
        ax = memory_peak_plot(
            dfmemr[dfmemr.compute == compute],
            ("export",),
            suptitle=f"Memory Consumption of backens, running time\nrunning on {compute}",
            # reference bars at one and two times the model size, in MiB
            bars=[model_size * k / 2**20 for k in (1, 2)],
            figsize=(18, 6),
        )
        get_figure(ax).savefig(f"plot_torch_aot_run_mem_{compute}.png")
Total running time of the script: (0 minutes 59.956 seconds)