301: Compares LLAMA exporters¶
The script compares the two ONNX exporters implemented in pytorch on a part of a llama model. The exported models are compared after all optimizations have been applied by onnxruntime.
TorchScript-based ONNX Exporter, let’s call it script
TorchDynamo-based ONNX Exporter, let’s call it dynamo
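Both exporters are available through torch.onnx. A minimal sketch of the two entry points on a placeholder model (the actual calls, with all their options, appear in the exporting functions below):

import torch

class Tiny(torch.nn.Module):
    # placeholder module standing in for the llama part
    def forward(self, x):
        return x * 2

model, args = Tiny(), (torch.randn(2, 4),)

# TorchScript-based exporter ("script")
torch.onnx.export(model, args, "tiny.script.onnx", opset_version=18)

# TorchDynamo-based exporter ("dynamo")
export_output = torch.onnx.dynamo_export(model, *args)
with open("tiny.dynamo.onnx", "wb") as f:
    f.write(export_output.model_proto.SerializeToString())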
To run the script:
python _doc/examples/plot_llama_diff_export.py --help
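For example, to reproduce the run shown below (assuming get_parsed_args exposes every argument as a --name option):

python _doc/examples/plot_llama_diff_export.py --part attention --exporter dynamo --ortopt 1 --opset 18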
Some helpers¶
from experimental_experiment.args import get_parsed_args

script_args = get_parsed_args(
    "plot_llama_diff_export",
    description=__doc__,
    part=("attention", "one value among attention, decoder, model"),
    exporter=("dynamo", "one value among dynamo, custom"),
    ortopt=(1, "run onnxruntime optimization"),
    opset=(18, "onnx opset"),
    expose="part,exporter,ortopt,opset",
)
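get_parsed_args turns each (default, help) pair into a command-line option. A rough argparse equivalent, for illustration only (the helper may differ in details):

import argparse

parser = argparse.ArgumentParser("plot_llama_diff_export", description=__doc__)
parser.add_argument("--part", default="attention",
                    help="one value among attention, decoder, model")
parser.add_argument("--exporter", default="dynamo",
                    help="one value among dynamo, custom")
parser.add_argument("--ortopt", default=1,
                    help="run onnxruntime optimization")
parser.add_argument("--opset", default=18, type=int, help="onnx opset")
script_args = parser.parse_args()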
import contextlib
import os
import io
import warnings
import logging

try:
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        import onnxruntime

        has_cuda = "CUDAExecutionProvider" in onnxruntime.get_available_providers()
except ImportError:
    print("onnxruntime not available.")
    import sys

    sys.exit(0)

import numpy as np
import onnx
from onnx_array_api.reference import compare_onnx_execution, ExtendedReferenceEvaluator
import torch
from experimental_experiment.ext_test_case import unit_test_going
from experimental_experiment.torch_interpreter import to_onnx
from experimental_experiment.xbuilder import OptimizationOptions
from experimental_experiment.convert.convert_helper import (
    optimize_model_proto,
    ort_optimize,
)
from experimental_experiment.torch_models.llama_helper import (
    get_llama_model,
    get_llama_attention,
    get_llama_decoder,
)
from experimental_experiment.torch_models.dump_helper import reorder_functions_in_proto

has_cuda = has_cuda and torch.cuda.is_available()
logging.disable(logging.ERROR)
provider = "cuda" if has_cuda else "cpu"
The exporting functions¶
print(f"part={script_args.part}")
print(f"exporter={script_args.exporter}")
ortopt = script_args.ortopt in (1, "1")
print(f"ortopt={ortopt}")
opset = int(script_args.opset)
print(f"opset={opset}")
def opt_filename(filename: str) -> str:
name, ext = os.path.splitext(filename)
return f"{name}.opt{ext}"
def export_script(filename, model, *args):
with contextlib.redirect_stdout(io.StringIO()):
with warnings.catch_warnings():
warnings.simplefilter("ignore")
torch.onnx.export(
model, args, filename, input_names=["input"], opset_version=opset
)
if ortopt:
onx = onnx.load(filename)
ort_optimize(onx, opt_filename(filename), providers=provider)
def export_dynamo(filename, model, *args):
with contextlib.redirect_stdout(io.StringIO()):
with warnings.catch_warnings():
warnings.simplefilter("ignore")
export_output = torch.onnx.dynamo_export(model, *args)
model = export_output.model_proto
try:
new_model = optimize_model_proto(model)
except ImportError as e:
print("skipping optimization, missing package or failure:", e)
new_model = model
with open(filename, "wb") as f:
f.write(new_model.SerializeToString())
if ortopt:
ort_optimize(new_model, opt_filename(filename), providers=provider)
def export_custom(filename, model, *args):
new_model = to_onnx(
model,
tuple(args),
input_names=[f"input{i}" for i in range(len(args))],
options=OptimizationOptions(
remove_unused=True,
constant_folding=False,
),
target_opset=opset,
)
with open(filename, "wb") as f:
f.write(new_model.SerializeToString())
if ortopt:
ort_optimize(new_model, opt_filename(filename), providers=provider)
part=attention
exporter=dynamo
ortopt=True
opset=18
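ort_optimize stores the model once onnxruntime has optimized it. A minimal sketch of what such an offline optimization looks like with onnxruntime's public API (the helper may be implemented differently):

import onnxruntime

so = onnxruntime.SessionOptions()
so.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
# building the session writes the optimized graph to this file
so.optimized_model_filepath = "llama.attention.script.opt.onnx"
onnxruntime.InferenceSession(
    "llama.attention.script.onnx", so, providers=["CPUExecutionProvider"]
)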
Model and data¶
if unit_test_going():
    kwargs = dict(input_dims=[(2, 1024)] * 2)
else:
    kwargs = dict(
        input_dims=[(2, 1024)] * 2,
        _attn_implementation="eager",
        num_hidden_layers=1,
        hidden_size=512,
        vocab_size=4000,
        intermediate_size=2000,
        max_position_embeddings=2048,
        num_attention_heads=8,
    )

if script_args.part == "attention":
    model, inputs = get_llama_attention(**kwargs)
elif script_args.part == "decoder":
    model, inputs = get_llama_decoder(**kwargs)
elif script_args.part == "model":
    model, inputs = get_llama_model(**kwargs)
else:
    raise RuntimeError(f"Unexpected value for part={script_args.part!r}")

print(f"simple run with {len(inputs)} inputs")
expected = model(*inputs[0])
if isinstance(expected, tuple):
    for t in expected:
        if not isinstance(t, tuple):
            print(f"eager worked {t.shape}, {t.dtype}")
        else:
            print(f"eager worked {type(t)}")
else:
    print(f"eager mode worked {expected.shape}, {expected.dtype}")
simple run with 2 inputs
eager mode worked torch.Size([2, 1024, 512]), torch.float32
Exporting¶
exporter = script_args.exporter
file1 = f"llama.{script_args.part}.script.onnx"
file2 = f"llama.{script_args.part}.{exporter}.onnx"

print("torch script exporter")
export_script(file1, model, *inputs[0])

if exporter == "dynamo":
    print("torch dynamo exporter")
    export_dynamo(file2, model, *inputs[0])
elif exporter == "custom":
    print("torch custom exporter")
    export_custom(file2, model, *inputs[0])
else:
    raise AssertionError(f"Unexpected value for exporter={exporter!r}.")
torch script exporter
torch dynamo exporter
Applied 6 pattern rewrite rules.
Applied 0 pattern rewrite rules.
Verification¶
if ortopt:
    print("Using models optimized by onnxruntime")
    file1 = f"llama.{script_args.part}.script.opt.onnx"
    file2 = f"llama.{script_args.part}.{exporter}.opt.onnx"

providers = (
    ["CPUExecutionProvider"]
    if provider == "cpu"
    else [("CUDAExecutionProvider", {}), ("CPUExecutionProvider", {})]
)

model1 = onnx.load(file1)
model2 = onnx.load(file2)

feeds1, feeds2 = {}, {}
for i in range(len(inputs[0])):
    x = inputs[0][i].detach().numpy()
    feeds1[model1.graph.input[i].name] = x
    feeds2[model2.graph.input[i].name] = x

if ortopt:
    sess1 = onnxruntime.InferenceSession(file1, providers=providers)
    sess2 = onnxruntime.InferenceSession(file2, providers=providers)
    got1 = sess1.run(None, feeds1)
    got2 = sess2.run(None, feeds2)
    diff1 = np.abs(expected.detach().numpy() - got1[0]).max()
    diff2 = np.abs(expected.detach().numpy() - got2[0]).max()
    print(f"Error with the eager model and onnxruntime: {diff1}, {diff2}")
Using models optimized by onnxruntime
Error with the eager model and onnxruntime: 6.705522537231445e-08, 6.705522537231445e-08
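A discrepancy around 1e-7 is at the level of float32 rounding and can be considered negligible:

import numpy as np

# smallest relative spacing of float32, about 7 significant decimal digits
print(np.finfo(np.float32).eps)  # 1.1920929e-07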
Verification with the reference evaluator¶
reorder_functions_in_proto(file1)
reorder_functions_in_proto(file2)

sess1 = ExtendedReferenceEvaluator(file1)
try:
    sess2 = ExtendedReferenceEvaluator(file2)
except NotImplementedError as e:
    print(e)
    sess2 = None

got1 = sess1.run(None, feeds1)
got2 = got1 if sess2 is None else sess2.run(None, feeds2)

if isinstance(expected, tuple):
    diff1 = np.abs(expected[0].detach().numpy() - got1[0]).max()
    diff2 = np.abs(expected[0].detach().numpy() - got2[0]).max()
else:
    diff1 = np.abs(expected.detach().numpy() - got1[0]).max()
    diff2 = np.abs(expected.detach().numpy() - got2[0]).max()
print(f"Error with the eager model and the reference evaluator: {diff1}, {diff2}")
Error with the eager model and the reference evaluator: 4.0978193283081055e-08, 4.0978193283081055e-08
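Both optimized models contain FusedMatMul, a com.microsoft contrib operator inserted by onnxruntime (it shows up in the comparison below). The standard onnx reference evaluator only implements the official operator set, which is what the NotImplementedError guard above protects against; ExtendedReferenceEvaluator adds kernels for such operators. A quick check, assuming the standard evaluator rejects unknown operators when it is built:

from onnx.reference import ReferenceEvaluator

try:
    # only implements operators from the official onnx domains
    ReferenceEvaluator(file1)
except NotImplementedError as e:
    print("standard evaluator:", e)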
Comparison and execution¶
def clean_name(name):
    return name.replace(
        "_inlfunc_transformers_models_llama_modeling_llama_LlamaAttention", ""
    ).replace("_inlfunc_torch_nn_modules_linear_Linear", "")


if sess2 is not None:
    try:
        np_inputs = [i.detach().numpy() for i in inputs[0]]
        res1, res2, align, dc = compare_onnx_execution(
            model1, model2, inputs=np_inputs, verbose=1, raise_exc=False
        )
        for r in res2:
            r.name = clean_name(r.name)
        text = dc.to_str(res1, res2, align, column_size=90)
        print(text)
    except AssertionError as e:
        if (
            "Unexpected type <class 'list'> for value, it must be a numpy array."
            not in str(e)
        ):
            raise
        print(e)
[compare_onnx_execution] execute with 3 inputs
[compare_onnx_execution] execute first model
[compare_onnx_execution] got 51 results
[compare_onnx_execution] execute second model
[compare_onnx_execution] got 51 results (first model)
[compare_onnx_execution] got 60 results (second model)
[compare_onnx_execution] compute edit distance
[compare_onnx_execution] got 62 pairs
[compare_onnx_execution] done
001 ~ | INITIA float32 2:512x512 GXWA onnx::MatMul_131 | INITIA int64 1:2 BKAA ortshared_7_1_2_0_token_109
002 + | | INITIA float32 2:1024x64 GSEC _val_32__1
003 + | | INITIA int64 1:3 CKSA ortshared_7_1_3_0_token_108
004 + | | INITIA int64 BAAA ortshared_7_0_1_0_token_107
005 + | | INITIA float32 BAAA ortshared_1_0_1_1_token_116
006 ~ | INITIA float32 2:512x512 HGDF onnx::MatMul_132 | INITIA float32 2:512x512 LFJJ torch_nn_modules_linear_Linear_a
007 ~ | INITIA float32 2:512x512 AXYW onnx::MatMul_133 | INITIA float32 2:512x512 GXWA torch_nn_modules_linear_Linear_a
008 ~ | INITIA float32 2:512x512 LFJJ onnx::MatMul_169 | INITIA float32 2:512x512 HGDF torch_nn_modules_linear_Linear_a
009 + | | INITIA float32 2:512x512 AXYW torch_nn_modules_linear_Linear_a
010 ~ | INITIA int64 1:4 CKIM ortshared_7_1_4_0_token_76 | INITIA int64 1:2 GGAA splits_token_118
011 ~ | INITIA int64 1:2 GGAA splits | INITIA int64 ZAAA ortshared_7_0_1_1_token_114
012 ~ | INITIA int64 1:3 CKSA ortshared_7_1_3_0_token_80 | INITIA int64 1:4 CKIM ortshared_7_1_4_0_token_113
013 = | INITIA float32 2:1024x64 CJYF /attention/rotary_emb/Constant_o | INITIA float32 2:1024x64 CJYF _val_22__1
014 - | INITIA float32 2:1024x64 GSEC /attention/rotary_emb/Constant_1 |
015 = | INITIA int64 1:2 GGAA splits_token_81 | INITIA int64 1:2 GGAA splits
016 - | INITIA int64 1:1 BAAA ortshared_7_1_1_3_token_78 |
017 = | INPUT float32 3:2x1024x512 ULQF input | INPUT float32 3:2x1024x512 ULQF l_hidden_states_
018 = | INPUT float32 4:2x1x1024x1024 AAAA onnx::Add_1 | INPUT float32 4:2x1x1024x1024 AAAA l_attention_mask_
019 = | INPUT int64 2:1x1024 KAQG position_ids | INPUT int64 2:1x1024 KAQG l_position_ids_
020 + | | RESULT int64 2:1x1024 KAQG Expand _val_35__1
021 + | | RESULT int64 3:1x1024x1 KAQG Unsqueeze _val_37__1
022 + | | RESULT int64 3:1x1024x1 KAQG Concat _val_38__1
023 ~ | RESULT float32 3:1x1024x64 GSEC Gather /attention/Gather_1_output_0 | RESULT float32 3:1x1024x64 GSEC GatherND _val_39__1
024 = | RESULT float32 4:1x1x1024x64 GSEC Unsqueeze /attention/Unsqueeze_1_output_0 | RESULT float32 4:1x1x1024x64 GSEC Unsqueeze aten_unsqueeze_65_n2__1
025 = | RESULT float32 4:1x1024x1x64 GSEC Transpose Transpose_token_4_out0 | RESULT float32 4:1x1024x1x64 GSEC Transpose Transpose_token_5_out0
026 = | RESULT float32 3:2x1024x512 KRRM MatMul /attention/k_proj/MatMul_output_ | RESULT float32 3:2x1024x512 KRRM MatMul attention_k_proj_1__1
027 = | RESULT float32 4:2x1024x8x64 KRRM Reshape /attention/Reshape_1_output_0 | RESULT float32 4:2x1024x8x64 KRRM Reshape view_7__1
028 = | RESULT float32 4:2x1024x8x32 YVML Split /attention/Slice_2 | RESULT float32 4:2x1024x8x32 YVML Split Slice_123__1
029 = | RESULT float32 4:2x1024x8x32 MWFB Split /attention/Slice_3 | RESULT float32 4:2x1024x8x32 MWFB Split Slice_140__1
030 = | RESULT float32 4:2x1024x8x32 OEVZ Neg /attention/Neg_1 | RESULT float32 4:2x1024x8x32 OEVZ Neg aten_neg_141_n0__1
031 = | RESULT float32 4:2x1024x8x64 NZHK Concat /attention/Concat_1 | RESULT float32 4:2x1024x8x64 NZHK Concat aten_cat_143_n0__1
032 = | RESULT float32 4:2x1024x8x64 VUBG Mul /attention/Mul_3 | RESULT float32 4:2x1024x8x64 VUBG Mul aten_mul_144_n0__1
033 ~ | RESULT float32 3:1x1024x64 CJYF Gather /attention/Gather_output_0 | RESULT float32 3:1x1024x64 CJYF GatherND _val_29__1
034 = | RESULT float32 4:1x1x1024x64 CJYF Unsqueeze /attention/Unsqueeze_output_0 | RESULT float32 4:1x1x1024x64 CJYF Unsqueeze aten_unsqueeze_55_n2__1
035 = | RESULT float32 4:1x1024x1x64 CJYF Transpose Transpose_token_6_out0 | RESULT float32 4:1x1024x1x64 CJYF Transpose Transpose_token_8_out0
036 = | RESULT float32 4:2x1024x8x64 GRNX Mul /attention/Mul_2 | RESULT float32 4:2x1024x8x64 GRNX Mul aten_mul_106_n0__1
037 = | RESULT float32 4:2x1024x8x64 BLPD Add /attention/Add_1 | RESULT float32 4:2x1024x8x64 BLPD Add n3__3
038 = | RESULT float32 4:2x8x64x1024 EJHL Transpose /attention/Transpose_3_output_0 | RESULT float32 4:2x8x64x1024 EJHL Transpose transpose_3__1
039 + | | RESULT float32 4:1x1x1024x64 GSEC Transpose unsqueeze_1__1
040 = | RESULT float32 3:2x1024x512 OSYT MatMul /attention/q_proj/MatMul_output_ | RESULT float32 3:2x1024x512 OSYT MatMul attention_q_proj_1__1
041 = | RESULT float32 4:2x1024x8x64 OSYT Reshape /attention/Reshape_output_0 | RESULT float32 4:2x1024x8x64 OSYT Reshape view_6__1
042 = | RESULT float32 4:2x8x1024x64 HAKH Transpose /attention/Transpose_output_0 | RESULT float32 4:2x8x1024x64 HAKH Transpose transpose__1
043 = | RESULT float32 4:2x8x1024x32 EVBF Split /attention/Slice_output_0 | RESULT float32 4:2x8x1024x32 EVBF Split slice_3__1
044 = | RESULT float32 4:2x8x1024x32 CEID Split /attention/Slice_1_output_0 | RESULT float32 4:2x8x1024x32 CEID Split slice_4__1
045 = | RESULT float32 4:2x8x1024x32 YWSX Neg /attention/Neg_output_0 | RESULT float32 4:2x8x1024x32 YWSX Neg neg__1
046 = | RESULT float32 4:2x8x1024x64 DSTB Concat /attention/Concat_output_0 | RESULT float32 4:2x8x1024x64 DSTB Concat cat__1
047 = | RESULT float32 4:2x8x1024x64 NHCJ Mul /attention/Mul_1_output_0 | RESULT float32 4:2x8x1024x64 NHCJ Mul mul_1__1
048 + | | RESULT float32 4:1x1x1024x64 CJYF Transpose unsqueeze__1
049 = | RESULT float32 4:2x8x1024x64 IUYZ Mul /attention/Mul_output_0 | RESULT float32 4:2x8x1024x64 IUYZ Mul mul__1
050 = | RESULT float32 4:2x8x1024x64 VCBI Add /attention/Add_output_0 | RESULT float32 4:2x8x1024x64 VCBI Add add__1
051 = | RESULT float32 4:2x8x1024x1024 AWFA FusedMatMul /attention/Div_output_0 | RESULT float32 4:2x8x1024x1024 AWFA FusedMatMul div__1
052 + | | RESULT float32 4:2x1x1024x1024 AAAA Mul other_1__4
053 = | RESULT float32 4:2x8x1024x1024 AWFA Add /attention/Add_2_output_0 | RESULT float32 4:2x8x1024x1024 AWFA Add add_2__1
054 = | RESULT float32 4:2x8x1024x1024 NNON Softmax /attention/Softmax_output_0 | RESULT float32 4:2x8x1024x1024 NNON Softmax _softmax__1
055 = | RESULT float32 3:2x1024x512 HMLX MatMul /attention/v_proj/MatMul_output_ | RESULT float32 3:2x1024x512 HMLX MatMul attention_v_proj_1__1
056 = | RESULT float32 4:2x1024x8x64 HMLX Reshape /attention/Reshape_2_output_0 | RESULT float32 4:2x1024x8x64 HMLX Reshape view_8__1
057 = | RESULT float32 4:2x8x1024x64 FOKY Transpose /attention/Transpose_2_output_0 | RESULT float32 4:2x8x1024x64 FOKY Transpose transpose_2__1
058 = | RESULT float32 4:2x8x1024x64 PLNS MatMul /attention/MatMul_1_output_0 | RESULT float32 4:2x8x1024x64 PLNS MatMul view_14__1
059 = | RESULT float32 4:2x1024x8x64 BZDC Transpose /attention/Transpose_4_output_0 | RESULT float32 4:2x1024x8x64 BZDC Transpose transpose_4__1
060 = | RESULT float32 3:2x1024x512 BZDC Reshape /attention/Reshape_3_output_0 | RESULT float32 3:2x1024x512 BZDC Reshape view_15__1
061 = | RESULT float32 3:2x1024x512 OPNS MatMul 130 | RESULT float32 3:2x1024x512 OPNS MatMul attention_1
062 = | OUTPUT float32 3:2x1024x512 OPNS 130 | OUTPUT float32 3:2x1024x512 OPNS attention_1
In the rows above, = marks a pair of results aligned and identical in both models, ~ a pair aligned but different, + a result only produced by the second model, and - a result only produced by the first one. The four-letter codes (GXWA, CJYF, ...) are short fingerprints of the tensor content used to detect matching results. See plot_llama_diff_export for a better view.
Total running time of the script: (0 minutes 31.156 seconds)