301: Compares LLAMA exporters¶
The script compares the two ONNX exporters implemented in PyTorch on a part of a LLaMA model. The exported models are compared after all optimizations have been applied, including those performed by onnxruntime.
TorchScript-based ONNX Exporter, let’s call it script
TorchDynamo-based ONNX Exporter, let’s call it dynamo
To run the script:
python _doc/examples/plot_llama_diff_export.py --help
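For reference, here is a minimal sketch of the two export calls being compared; model, args and the file names are placeholders, not part of the script:

# TorchScript-based exporter: traces the model and writes the file directly.
torch.onnx.export(model, args, "model.script.onnx", opset_version=18)

# TorchDynamo-based exporter: returns an object holding the model proto.
onnx_program = torch.onnx.export(model, args, dynamo=True)
with open("model.dynamo.onnx", "wb") as f:
    f.write(onnx_program.model_proto.SerializeToString())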
Some helpers¶
from experimental_experiment.args import get_parsed_args
script_args = get_parsed_args(
    "plot_llama_diff_export",
    description=__doc__,
    part=("attention", "one value among attention, decoder, model"),
    exporter=("dynamo", "one value among dynamo, custom"),
    ortopt=(1, "run onnxruntime optimization"),
    opset=(18, "onnx opset"),
    expose="part,exporter,ortopt,opset",
)
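Every exposed argument can be overridden on the command line, for instance (a hypothetical invocation, assuming get_parsed_args turns each exposed name into a --flag):

python _doc/examples/plot_llama_diff_export.py --part decoder --exporter custom --ortopt 0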
import contextlib
import os
import io
import warnings
import logging
try:
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        import onnxruntime

        has_cuda = "CUDAExecutionProvider" in onnxruntime.get_available_providers()
except ImportError:
    print("onnxruntime not available.")
    import sys

    sys.exit(0)
import numpy as np
import onnx
from onnx_array_api.reference import compare_onnx_execution, ExtendedReferenceEvaluator
import torch
from experimental_experiment.ext_test_case import unit_test_going
from experimental_experiment.torch_interpreter import to_onnx
from experimental_experiment.xbuilder import OptimizationOptions
from experimental_experiment.convert.convert_helper import (
    optimize_model_proto_oxs,
    ort_optimize,
)
from experimental_experiment.torch_models.llama_helper import (
    get_llama_model,
    get_llama_attention,
    get_llama_decoder,
)
from experimental_experiment.torch_models.dump_helper import reorder_functions_in_proto
has_cuda = has_cuda and torch.cuda.is_available()
logging.disable(logging.ERROR)
provider = "cuda" if has_cuda else "cpu"
The exporting functions¶
print(f"part={script_args.part}")
print(f"exporter={script_args.exporter}")
ortopt = script_args.ortopt in (1, "1")
print(f"ortopt={ortopt}")
opset = int(script_args.opset)
print(f"opset={opset}")
def opt_filename(filename: str) -> str:
    name, ext = os.path.splitext(filename)
    return f"{name}.opt{ext}"
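# For example (hypothetical name):
#   opt_filename("llama.attention.script.onnx") -> "llama.attention.script.opt.onnx"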
def export_script(filename, model, *args):
    with contextlib.redirect_stdout(io.StringIO()):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            torch.onnx.export(
                model, args, filename, input_names=["input"], opset_version=opset
            )
    if ortopt:
        onx = onnx.load(filename)
        ort_optimize(onx, opt_filename(filename), providers=provider)
def export_dynamo(filename, model, *args):
    with contextlib.redirect_stdout(io.StringIO()):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            export_output = torch.onnx.export(model, args, dynamo=True)
            model = export_output.model_proto
    try:
        new_model = optimize_model_proto_oxs(model)
    except ImportError as e:
        print("skipping optimization, missing package or failure:", e)
        new_model = model
    with open(filename, "wb") as f:
        f.write(new_model.SerializeToString())
    if ortopt:
        ort_optimize(new_model, opt_filename(filename), providers=provider)
def export_custom(filename, model, *args):
    new_model = to_onnx(
        model,
        tuple(args),
        input_names=[f"input{i}" for i in range(len(args))],
        options=OptimizationOptions(
            remove_unused=True,
            constant_folding=False,
        ),
        target_opset=opset,
    )
    with open(filename, "wb") as f:
        f.write(new_model.SerializeToString())
    if ortopt:
        ort_optimize(new_model, opt_filename(filename), providers=provider)
part=attention
exporter=dynamo
ortopt=True
opset=18
Model and data¶
if unit_test_going():
    kwargs = dict(input_dims=[(2, 1024)] * 2)
else:
    kwargs = dict(
        input_dims=[(2, 1024)] * 2,
        _attn_implementation="eager",
        num_hidden_layers=1,
        hidden_size=512,
        vocab_size=4000,
        intermediate_size=2000,
        max_position_embeddings=2048,
        num_attention_heads=8,
    )

if script_args.part == "attention":
    model, inputs = get_llama_attention(**kwargs)
elif script_args.part == "decoder":
    model, inputs = get_llama_decoder(**kwargs)
elif script_args.part == "model":
    model, inputs = get_llama_model(**kwargs)
else:
    raise RuntimeError(f"Unexpected value for part={script_args.part!r}")
print(f"simple run with {len(inputs)} inputs")
expected = model(*inputs[0])
if isinstance(expected, tuple):
    for t in expected:
        if not isinstance(t, tuple):
            print(f"eager worked {t.shape}, {t.dtype}")
        else:
            print(f"eager worked {type(t)}")
else:
    print(f"eager mode worked {expected.shape}, {expected.dtype}")
simple run with 2 inputs
eager mode worked torch.Size([2, 1024, 512]), torch.float32
Exporting¶
exporter = script_args.exporter
file1 = f"llama.{script_args.part}.script.onnx"
file2 = f"llama.{script_args.part}.{exporter}.onnx"
print("torch script exporter")
export_script(file1, model, *inputs[0])
if exporter == "dynamo":
    print("torch dynamo exporter")
    export_dynamo(file2, model, *inputs[0])
elif exporter == "custom":
    print("torch custom exporter")
    export_custom(file2, model, *inputs[0])
else:
    raise AssertionError(f"Unexpected value for exporter={exporter!r}.")
torch script exporter
torch dynamo exporter
Applied 7 of general pattern rewrite rules.
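Before comparing outputs, a quick sanity check, not part of the original script, is to validate both exported files with the ONNX checker:

for name in (file1, file2):
    # check_model raises an exception if the model is malformed.
    onnx.checker.check_model(onnx.load(name))
    print(f"{name} is a valid ONNX model")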
Verification¶
if ortopt:
    print("Using models optimized by onnxruntime")
    file1 = f"llama.{script_args.part}.script.opt.onnx"
    file2 = f"llama.{script_args.part}.{exporter}.opt.onnx"

providers = (
    ["CPUExecutionProvider"]
    if provider == "cpu"
    else [("CUDAExecutionProvider", {}), ("CPUExecutionProvider", {})]
)

model1 = onnx.load(file1)
model2 = onnx.load(file2)

feeds1, feeds2 = {}, {}
for i in range(len(inputs[0])):
    x = inputs[0][i].detach().numpy()
    feeds1[model1.graph.input[i].name] = x
    feeds2[model2.graph.input[i].name] = x

if ortopt:
    sess1 = onnxruntime.InferenceSession(file1, providers=providers)
    sess2 = onnxruntime.InferenceSession(file2, providers=providers)
    got1 = sess1.run(None, feeds1)
    got2 = sess2.run(None, feeds2)
    diff1 = np.abs(expected.detach().numpy() - got1[0]).max()
    diff2 = np.abs(expected.detach().numpy() - got2[0]).max()
    print(f"Error with the eager model and onnxruntime: {diff1}, {diff2}")
Using models optimized by onnxruntime
Error with the eager model and onnxruntime: 3.195460885763168e-05, 3.195460885763168e-05
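The same comparison can be written as an assertion; a minimal sketch with numpy, where the tolerance 1e-4 is an arbitrary choice for float32:

np.testing.assert_allclose(expected.detach().numpy(), got1[0], atol=1e-4)
np.testing.assert_allclose(expected.detach().numpy(), got2[0], atol=1e-4)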
Verification with the reference evaluator¶
reorder_functions_in_proto(file1)
reorder_functions_in_proto(file2)
sess1 = ExtendedReferenceEvaluator(file1)
try:
    sess2 = ExtendedReferenceEvaluator(file2)
except NotImplementedError as e:
    print(e)
    sess2 = None

got1 = sess1.run(None, feeds1)
got2 = got1 if sess2 is None else sess2.run(None, feeds2)

if isinstance(expected, tuple):
    diff1 = np.abs(expected[0].detach().numpy() - got1[0]).max()
    diff2 = np.abs(expected[0].detach().numpy() - got2[0]).max()
else:
    diff1 = np.abs(expected.detach().numpy() - got1[0]).max()
    diff2 = np.abs(expected.detach().numpy() - got2[0]).max()

print(f"Error with the eager model and the reference evaluator: {diff1}, {diff2}")
Error with the eager model and the reference evaluator: 4.470348358154297e-08, 4.470348358154297e-08
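The discrepancy is noticeably smaller here (about 4.5e-8, close to float32 rounding error) than with onnxruntime (about 3.2e-5), most likely because the optimized models rely on fused kernels such as FusedMatMul, which change the order of floating-point operations.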
Comparison and execution¶
def clean_name(name):
    return name.replace(
        "_inlfunc_transformers_models_llama_modeling_llama_LlamaAttention", ""
    ).replace("_inlfunc_torch_nn_modules_linear_Linear", "")
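# For example, clean_name strips the inlined-function prefixes:
#   clean_name("_inlfunc_torch_nn_modules_linear_Linear_weight") -> "_weight"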
if sess2 is not None:
    try:
        np_inputs = [i.detach().numpy() for i in inputs[0]]
        res1, res2, align, dc = compare_onnx_execution(
            model1, model2, inputs=np_inputs, verbose=1, raise_exc=False
        )
        for r in res2:
            r.name = clean_name(r.name)
        text = dc.to_str(res1, res2, align, column_size=90)
        print(text)
    except AssertionError as e:
        if "Unexpected type <class 'list'> for value, it must be a numpy array." not in str(e):
            raise
        print(e)
[compare_onnx_execution] execute with 3 inputs
[compare_onnx_execution] execute first model
[compare_onnx_execution] got 60 results
[compare_onnx_execution] execute second model
[compare_onnx_execution] got 60 results (first model)
[compare_onnx_execution] got 56 results (second model)
[compare_onnx_execution] compute edit distance
[compare_onnx_execution] got 61 pairs
[compare_onnx_execution] done
001 = | INITIA float32 2:512x512 IATA onnx::MatMul_170 | INITIA float32 2:512x512 IATA t
002 = | INITIA float32 2:512x512 WBZW onnx::MatMul_171 | INITIA float32 2:512x512 WBZW t_1
003 = | INITIA float32 2:512x512 QDET onnx::MatMul_172 | INITIA float32 2:512x512 QDET t_2
004 - | INITIA float32 2:512x512 ZCNL onnx::MatMul_218 |
005 - | INITIA int64 1:1 BAAA /attention/Constant_25_output_0 |
006 = | INITIA int64 1:2 GGAA splits | INITIA int64 1:2 GGAA splits_token_14
007 = | INITIA int64 1:4 CKZM /attention/Constant_2_output_0 | INITIA int64 1:4 CKZM val_2
008 + | | INITIA float32 2:512x512 ZCNL t_3
009 ~ | INITIA int64 1:1 AAAA /attention/Constant_6_output_0 | INITIA int64 BAAA node_aten_unsqueeze_46_dim_0
010 = | INITIA float32 3:1x32x1 DAAA /attention/rotary_emb/Expand_out | INITIA float32 3:1x32x1 DAAA _to_copy_2
011 - | INITIA int64 1:1 KAAA /attention/Constant_24_output_0 |
012 = | INITIA int64 1:2 GGAA splits_token_14 | INITIA int64 1:2 GGAA splits
013 - | INITIA int64 1:1 DAAA const_transpose_optimizer_token_ |
014 = | INITIA int64 1:3 CKZA /attention/Constant_26_output_0 | INITIA int64 1:3 CKZA val_115
015 = | INPUT float32 3:2x1024x512 GQDE input | INPUT float32 3:2x1024x512 GQDE hidden_states
016 = | INPUT float32 4:2x1x1024x1024 AAAA onnx::Slice_1 | INPUT float32 4:2x1x1024x1024 AAAA attention_mask
017 = | INPUT int64 2:1x1024 KAQG onnx::Unsqueeze_2 | INPUT int64 2:1x1024 KAQG position_ids
018 = | RESULT int64 3:1x1x1024 KAQG Unsqueeze /attention/rotary_emb/Unsqueeze_ | RESULT int64 3:1x1x1024 KAQG Unsqueeze unsqueeze_2
019 = | RESULT float32 3:1x1x1024 KAQG Cast /attention/rotary_emb/Cast_outpu | RESULT float32 3:1x1x1024 KAQG Cast _to_copy_1
020 = | RESULT float32 3:1x32x1024 EFXM MatMul /attention/rotary_emb/MatMul_out | RESULT float32 3:1x32x1024 EFXM MatMul matmul_3
021 = | RESULT float32 3:1x64x1024 JKJK Concat /attention/rotary_emb/Concat | RESULT float32 3:1x64x1024 JKJK Concat node_Concat_64
022 = | RESULT float32 3:1x64x1024 RMRM Sin /attention/rotary_emb/Sin | RESULT float32 3:1x64x1024 RMRM Sin node_Sin_66
023 = | RESULT float32 4:1x1x64x1024 RMRM Unsqueeze /attention/Unsqueeze_1 | RESULT float32 4:1x1x64x1024 RMRM Unsqueeze node_aten_unsqueeze_73_n2
024 = | RESULT float32 4:1x1024x1x64 GSEC Transpose Transpose_token_7_out0 | RESULT float32 4:1x1024x1x64 GSEC Transpose Transpose_token_7_out0
025 = | RESULT float32 3:2x1024x512 XFHQ MatMul /attention/k_proj/MatMul_output_ | RESULT float32 3:2x1024x512 XFHQ MatMul matmul_1
026 = | RESULT float32 4:2x1024x8x64 XFHQ Reshape /attention/Reshape_1_output_0 | RESULT float32 4:2x1024x8x64 XFHQ Reshape view_1
027 = | RESULT float32 4:2x1024x8x32 ELQE Split /attention/Slice_2 | RESULT float32 4:2x1024x8x32 ELQE Split node_Slice_114
028 = | RESULT float32 4:2x1024x8x32 TVRL Split /attention/Slice_3 | RESULT float32 4:2x1024x8x32 TVRL Split node_Slice_125
029 = | RESULT float32 4:2x1024x8x32 HFJP Neg /attention/Neg_1 | RESULT float32 4:2x1024x8x32 HFJP Neg node_Neg_126
030 = | RESULT float32 4:2x1024x8x64 LPAU Concat /attention/Concat_1 | RESULT float32 4:2x1024x8x64 LPAU Concat node_Concat_127
031 = | RESULT float32 4:2x1024x8x64 RPCZ Mul /attention/Mul_3 | RESULT float32 4:2x1024x8x64 RPCZ Mul node_Mul_128
032 = | RESULT float32 3:1x64x1024 NHNH Cos /attention/rotary_emb/Cos | RESULT float32 3:1x64x1024 NHNH Cos node_Cos_65
033 = | RESULT float32 4:1x1x64x1024 NHNH Unsqueeze /attention/Unsqueeze | RESULT float32 4:1x1x64x1024 NHNH Unsqueeze node_aten_unsqueeze_72_n2
034 = | RESULT float32 4:1x1024x1x64 CJYF Transpose Transpose_token_11_out0 | RESULT float32 4:1x1024x1x64 CJYF Transpose Transpose_token_11_out0
035 = | RESULT float32 4:2x1024x8x64 WPCM Mul /attention/Mul_2 | RESULT float32 4:2x1024x8x64 WPCM Mul node_Mul_103
036 = | RESULT float32 4:2x1024x8x64 NEFL Add /attention/Add_1 | RESULT float32 4:2x1024x8x64 NEFL Add node_Add_129
037 = | RESULT float32 4:2x8x64x1024 RBQA Transpose /attention/Transpose_3_output_0 | RESULT float32 4:2x8x64x1024 RBQA Transpose transpose_4
038 = | RESULT float32 4:1x1x1024x64 GSEC Transpose /attention/Unsqueeze_1_output_0 | RESULT float32 4:1x1x1024x64 GSEC Transpose unsqueeze_4
039 = | RESULT float32 3:2x1024x512 EAEU MatMul /attention/q_proj/MatMul_output_ | RESULT float32 3:2x1024x512 EAEU MatMul matmul
040 = | RESULT float32 4:2x1024x8x64 EAEU Reshape /attention/Reshape_output_0 | RESULT float32 4:2x1024x8x64 EAEU Reshape view
041 = | RESULT float32 4:2x8x1024x64 SLIS Transpose /attention/Transpose_output_0 | RESULT float32 4:2x8x1024x64 SLIS Transpose transpose
042 = | RESULT float32 4:2x8x1024x32 SPIF Split /attention/Slice_output_0 | RESULT float32 4:2x8x1024x32 SPIF Split slice_4
043 = | RESULT float32 4:2x8x1024x32 BWAM Split /attention/Slice_1_output_0 | RESULT float32 4:2x8x1024x32 BWAM Split slice_5
044 = | RESULT float32 4:2x8x1024x32 ZEAO Neg /attention/Neg_output_0 | RESULT float32 4:2x8x1024x32 ZEAO Neg neg
045 = | RESULT float32 4:2x8x1024x64 RSIT Concat /attention/Concat_output_0 | RESULT float32 4:2x8x1024x64 RSIT Concat cat_1
046 = | RESULT float32 4:2x8x1024x64 DMFD Mul /attention/Mul_1_output_0 | RESULT float32 4:2x8x1024x64 DMFD Mul mul_3
047 = | RESULT float32 4:1x1x1024x64 CJYF Transpose /attention/Unsqueeze_output_0 | RESULT float32 4:1x1x1024x64 CJYF Transpose unsqueeze_3
048 = | RESULT float32 4:2x8x1024x64 QIJJ Mul /attention/Mul_output_0 | RESULT float32 4:2x8x1024x64 QIJJ Mul mul_2
049 = | RESULT float32 4:2x8x1024x64 UVOM Add /attention/Add_output_0 | RESULT float32 4:2x8x1024x64 UVOM Add add
050 = | RESULT float32 4:2x8x1024x1024 CNEW FusedMatMul /attention/Div_output_0 | RESULT float32 4:2x8x1024x1024 CNEW FusedMatMul div
051 - | RESULT float32 4:2x1x1024x1024 AAAA Slice /attention/Slice_4_output_0 |
052 = | RESULT float32 4:2x8x1024x1024 CNEW Add /attention/Add_2_output_0 | RESULT float32 4:2x8x1024x1024 CNEW Add add_2
053 = | RESULT float32 4:2x8x1024x1024 ONNO Softmax /attention/Softmax_output_0 | RESULT float32 4:2x8x1024x1024 ONNO Softmax val_113
054 = | RESULT float32 3:2x1024x512 BLMP MatMul /attention/v_proj/MatMul_output_ | RESULT float32 3:2x1024x512 BLMP MatMul matmul_2
055 = | RESULT float32 4:2x1024x8x64 BLMP Reshape /attention/Reshape_2_output_0 | RESULT float32 4:2x1024x8x64 BLMP Reshape view_2
056 = | RESULT float32 4:2x8x1024x64 LCLQ Transpose /attention/Transpose_2_output_0 | RESULT float32 4:2x8x1024x64 LCLQ Transpose transpose_2
057 = | RESULT float32 4:2x8x1024x64 VXNW MatMul /attention/MatMul_1_output_0 | RESULT float32 4:2x8x1024x64 VXNW MatMul matmul_5
058 = | RESULT float32 4:2x1024x8x64 UYSP Transpose /attention/Transpose_4_output_0 | RESULT float32 4:2x1024x8x64 UYSP Transpose transpose_5
059 = | RESULT float32 3:2x1024x512 UYSP Reshape /attention/Reshape_3_output_0 | RESULT float32 3:2x1024x512 UYSP Reshape view_3
060 = | RESULT float32 3:2x1024x512 UQLL MatMul 169 | RESULT float32 3:2x1024x512 UQLL MatMul matmul_6
061 = | OUTPUT float32 3:2x1024x512 UQLL 169 | OUTPUT float32 3:2x1024x512 UQLL matmul_6
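In this side-by-side view, = marks a pair of results aligned between the two models, - a result produced only by the first (script) model, + a result produced only by the second, and ~ an aligned pair that differs, here in shape or type. The short letter codes summarize the tensor values and are what the alignment is computed from.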
See plot_llama_diff_export for a better view.
Total running time of the script: (0 minutes 6.254 seconds)
Related examples
301: Compares LLAMA exporters for onnxrt backend
201: Evaluate different ways to export a torch model to ONNX
102: Fuse kernels in a small Llama Model