to_onnx and Phi-2¶

Exports model Phi-2. We use a dummy (untrained) model. The main difficulty is to set the dynamic shapes properly. If a dimension declared as dynamic unexpectedly becomes a constant, go to torch/fx/experimental/symbolic_shapes.py#L5965 (the exact line number may differ between PyTorch versions), look for log.info("set_replacement %s = %s (%s) %s", a, tgt, msg, tgt_bound), and add something like the following just before or after it:

# Debugging aid: stop as soon as a symbolic dimension is replaced by a plain
# integer, and report everything known about that replacement.
if isinstance(tgt, int):
    details = [a, tgt, type(tgt), msg, tgt_bound]
    raise AssertionError(f"dynamic shape becomes a constant {details}")

Setting the environment variables TORCH_LOGS="+dynamo" and TORCHDYNAMO_VERBOSE=1 prints out more information about dynamic shapes.

Model¶

import copy
from typing import Any, Dict
import onnx
import torch
import transformers
from onnx_array_api.plotting.graphviz_helper import plot_dot
from experimental_experiment.helpers import string_type
from experimental_experiment.xbuilder import GraphBuilder, InferShapesOptions
from experimental_experiment.torch_interpreter import to_onnx, ExportOptions


def get_phi2_untrained(batch_size: int = 2, **kwargs) -> Dict[str, Any]:
    """
    Gets an untrained (randomly initialized) Phi-2 model with its inputs.

    :param batch_size: batch size of the first set of inputs,
        the second set of inputs uses ``batch_size + 1``
    :param kwargs: to overwrite the configuration, example ``num_hidden_layers=1``
    :return: dictionary with keys ``model``, ``inputs``, ``inputs2``
        (same structure as ``inputs`` but different dimensions)
        and ``dynamic_shapes``

    The shapes of the dummy cache and the range of the token ids are
    derived from the final configuration, so overriding ``hidden_size``,
    ``num_attention_heads``, ``num_key_value_heads`` or ``vocab_size``
    through ``kwargs`` still produces inputs consistent with the model.

    See `Phi-2/config.json
    <https://huggingface.co/microsoft/phi-2/blob/main/config.json>`_.
    """
    config = {
        "_name_or_path": "microsoft/phi-2",
        "architectures": ["PhiForCausalLM"],
        "attention_dropout": 0.0,
        "bos_token_id": 50256,
        "embd_pdrop": 0.0,
        "eos_token_id": 50256,
        "hidden_act": "gelu_new",
        "hidden_size": 2560,
        "initializer_range": 0.02,
        "intermediate_size": 10240,
        "layer_norm_eps": 1e-05,
        "max_position_embeddings": 2048,
        "model_type": "phi",
        "num_attention_heads": 32,
        "num_hidden_layers": 32,
        "num_key_value_heads": 32,
        "partial_rotary_factor": 0.4,
        "qk_layernorm": False,
        "resid_pdrop": 0.1,
        "rope_scaling": None,
        "rope_theta": 10000.0,
        "tie_word_embeddings": False,
        "torch_dtype": "float16",
        "transformers_version": "4.37.0",
        "use_cache": True,
        "vocab_size": 51200,
    }
    config.update(**kwargs)
    conf = transformers.PhiConfig(**config)
    model = transformers.PhiForCausalLM(conf)
    model.eval()

    # Dimensions of the cached keys/values, read from the configuration
    # instead of being hard-coded (defaults give 32 heads of size 80).
    n_layers = config["num_hidden_layers"]
    n_heads = config["num_attention_heads"]
    # A missing/None number of key-value heads falls back to the number of
    # attention heads (assumed to mirror PhiConfig's own default).
    n_kv_heads = config.get("num_key_value_heads") or n_heads
    head_dim = config["hidden_size"] // n_heads
    # Token ids must stay below the vocabulary size; 50285 is the historical
    # bound and is kept whenever the vocabulary is large enough.
    max_token_id = min(50285, config["vocab_size"])

    def _make_cache(n_batch: int, past_length: int):
        # One random (key, value) pair per layer.
        cache = transformers.cache_utils.DynamicCache(n_layers)
        for layer_idx in range(n_layers):
            cache.update(
                torch.randn(n_batch, n_kv_heads, past_length, head_dim),
                torch.randn(n_batch, n_kv_heads, past_length, head_dim),
                layer_idx,
            )
        return cache

    def _make_inputs(n_batch: int, seq_len: int, past_length: int, cache):
        # The attention mask covers the cached tokens plus the new ones.
        return dict(
            input_ids=torch.randint(0, max_token_id, (n_batch, seq_len)).to(
                torch.int64
            ),
            attention_mask=torch.ones((n_batch, past_length + seq_len)).to(
                torch.int64
            ),
            past_key_values=cache,
        )

    # Both caches are created before the input ids, which keeps the order of
    # the random draws identical to the previous implementation.
    cache = _make_cache(batch_size, 30)
    cache2 = _make_cache(batch_size + 1, 31)
    inputs = _make_inputs(batch_size, 3, 30, cache)
    inputs2 = _make_inputs(batch_size + 1, 4, 31, cache2)

    batch = torch.export.Dim("batch", min=1, max=1024)
    seq_length = torch.export.Dim("seq_length", min=1, max=4096)
    cache_length = torch.export.Dim("cache_length", min=1, max=4096)
    n = len(cache.key_cache)
    shapes = {
        "input_ids": {0: batch, 1: seq_length},
        "attention_mask": {
            0: batch,
            1: torch.export.Dim.DYNAMIC,  # cache_length + seq_length
        },
        # First list: key tensors, second list: value tensors (one per layer).
        "past_key_values": [
            [{0: batch, 2: cache_length} for _ in range(n)],
            [{0: batch, 2: cache_length} for _ in range(n)],
        ],
    }

    return dict(inputs=inputs, model=model, dynamic_shapes=shapes, inputs2=inputs2)


# Two hidden layers are enough for the example and keep the model small.
data = get_phi2_untrained(num_hidden_layers=2)
model, inputs, dynamic_shapes = data["model"], data["inputs"], data["dynamic_shapes"]

# Show the structure (types and shapes) of what is going to be exported.
print("inputs", string_type(inputs, with_shape=True))
print("dynamic_shapes", dynamic_shapes)

inputs dict(input_ids:T7s2x3,attention_mask:T7s2x33,past_key_values:DynamicCache(key_cache=#2[T1s2x32x30x80,T1s2x32x30x80], value_cache=#2[T1s2x32x30x80,T1s2x32x30x80]))
dynamic_shapes {'input_ids': {0: <class '__main__.batch'>, 1: <class '__main__.seq_length'>}, 'attention_mask': {0: <class '__main__.batch'>, 1: <_DimHint.DYNAMIC: 3>}, 'past_key_values': [[{0: <class '__main__.batch'>, 2: <class '__main__.cache_length'>}, {0: <class '__main__.batch'>, 2: <class '__main__.cache_length'>}], [{0: <class '__main__.batch'>, 2: <class '__main__.cache_length'>}, {0: <class '__main__.batch'>, 2: <class '__main__.cache_length'>}]]}

Let’s check that the model runs. We need to copy the inputs before calling the model because the call modifies them; without the copy, they would no longer be properly set up when the export starts.

# Sanity check: run the model once on a deep copy of the inputs so that the
# originals stay untouched for the export.
model(**copy.deepcopy(inputs))

CausalLMOutputWithPast(loss=None, logits=tensor([[[ 0.5473, -1.5257, -0.4921,  ..., -1.0600,  0.7193,  1.9122],
         [ 0.5501,  0.1354, -1.0489,  ..., -0.4748,  0.6088,  0.7936],
         [ 0.2769, -1.8652, -0.9587,  ..., -1.4784,  0.8955,  1.5270]],

        [[ 1.0052, -2.4517,  0.4282,  ...,  0.2080,  0.8085,  0.8800],
         [ 1.4353,  0.0583,  0.6627,  ...,  0.2138,  1.1032,  0.9457],
         [ 0.4206,  0.7871,  0.0113,  ..., -0.5401,  1.8957,  1.6509]]],
       grad_fn=<ViewBackward0>), past_key_values=DynamicCache(), hidden_states=None, attentions=None)

Export¶

We try to export with experimental_experiment.torch_interpreter.to_onnx().

# First attempt without any patch. The failure is reported rather than
# raised so that the example can carry on with the patched export.
try:
    to_onnx(
        copy.deepcopy(model), (), kwargs=copy.deepcopy(inputs), dynamic_shapes=dynamic_shapes
    )
except Exception as exc:
    print(f"export failed due to {exc}")

export failed due to Cannot associate shape [[{0: <class '__main__.batch'>, 2: <class '__main__.cache_length'>}, {0: <class '__main__.batch'>, 2: <class '__main__.cache_length'>}], [{0: <class '__main__.batch'>, 2: <class '__main__.cache_length'>}, {0: <class '__main__.batch'>, 2: <class '__main__.cache_length'>}]] specified at `dynamic_shapes['past_key_values']` to non-tensor type <class 'transformers.cache_utils.DynamicCache'> at `inputs['past_key_values']` (expected None)
For more information about this error, see: https://pytorch.org/docs/main/generated/exportdb/index.html#dynamic-shapes-validation

The export fails for a couple of reasons, but it is possible to patch the code to make it work. All those modifications are put in place by bypass_export_some_errors (module onnx_export_errors) and reverted after the export is done. Among other things, this function registers serialization functions as shown in example Export a model using a custom type as input.

from experimental_experiment.torch_interpreter.onnx_export_errors import (
    bypass_export_some_errors,
)

# Export inside the patching context: the patches are active only within the
# ``with`` block and are reverted when it exits.
with bypass_export_some_errors(
    patch_transformers=True, replace_dynamic_cache=True, verbose=1
) as modificator:
    print("inputs before", string_type(inputs, with_shape=True))
    # Convert the inputs to their patched counterparts before exporting.
    inputs = modificator(inputs)
    print("inputs after", string_type(inputs, with_shape=True))
    # Alternative kept for reference (plain torch.export call):
    # ep = torch.export.export(model, (), inputs, dynamic_shapes=dynamic_shapes, strict=False)
    large_onx = to_onnx(
        model,
        (),
        inputs,
        dynamic_shapes=dynamic_shapes,
        export_options=ExportOptions(strict=False),
        large_model=True,
    )
    # Save the exported model; all tensors go to a single external file.
    large_onx.save("plot_exporter_recipes_c_phi2.onnx", all_tensors_to_one_file=True)

[bypass_export_some_errors] replace torch.jit.isinstance, torch._dynamo.mark_static_address
[bypass_export_some_errors] register MambaCache
[bypass_export_some_errors] register DynamicCache
[bypass_export_some_errors] register patched_DynamicCache
[bypass_export_some_errors] patch sympy
[bypass_export_some_errors] patch pytorch
[bypass_export_some_errors] modifies shape constraints
[bypass_export_some_errors] patch transformers
[bypass_export_some_errors] replace DynamicCache
inputs before dict(input_ids:T7s2x3,attention_mask:T7s2x33,past_key_values:DynamicCache(key_cache=#2[T1s2x32x30x80,T1s2x32x30x80], value_cache=#2[T1s2x32x30x80,T1s2x32x30x80]))
inputs after dict(input_ids:T7s2x3,attention_mask:T7s2x33,past_key_values:patched_DynamicCache(key_cache=#2[T1s2x32x30x80,T1s2x32x30x80], value_cache=#2[T1s2x32x30x80,T1s2x32x30x80]))
[bypass_export_some_errors] restored sympy functions
[bypass_export_some_errors] restored pytorch functions
[bypass_export_some_errors] restored shape constraints
[bypass_export_some_errors] restored transformer
[bypass_export_some_errors] restored DynamicCache
[bypass_export_some_errors] unregistered MambaCache
[bypass_export_some_errors] unregistered DynamicCache
[bypass_export_some_errors] unregistered patched_DynamicCache

Exported Model¶

Let’s display the model.

# Reload the model saved by the export step.
onx = onnx.load("plot_exporter_recipes_c_phi2.onnx")
# Wrap it in a GraphBuilder (shape inference disabled) to get a text rendering.
gr = GraphBuilder(onx, infer_shapes_options=InferShapesOptions.NONE)
print(gr.pretty_text())

dyn---: batch -> WrapSym(batch)
dyn---: batch*seq_length -> 'batch*seq_length'
dyn---: cache_length -> WrapSym(cache_length)
dyn---: cache_length+seq_length -> WrapSym(cache_length+seq_length)
dyn---: channel -> WrapSym(channel)
dyn---: seq_length -> WrapSym(seq_length)
dynrev: batch -> [('batch', SymInt(batch))]
dynrev: cache_length -> [('cache_length', SymInt(cache_length))]
dynrev: cache_length+seq_length -> [('cache_length+seq_length', SymInt(cache_length+seq_length))]
dynrev: channel -> [('channel', SymInt(channel))]
dynrev: seq_length -> [('seq_length', SymInt(seq_length))]
dynsrc: batch -> [{batch:('input_name', 'input_ids'), batch:('axis', 0)}, {batch:('input_name', 'attention_mask'), batch:('axis', 0)}, {batch:('input_name', 'past_key_values_key_cache_0'), batch:('axis', 0)}, {batch:('input_name', 'past_key_values_key_cache_1'), batch:('axis', 0)}, {batch:('input_name', 'past_key_values_value_cache_0'), batch:('axis', 0)}, {batch:('input_name', 'past_key_values_value_cache_1'), batch:('axis', 0)}, {batch:('input_name', 'output_0'), batch:('axis', 0)}, {batch:('input_name', 'output_1'), batch:('axis', 0)}, {batch:('input_name', 'output_2'), batch:('axis', 0)}, {batch:('input_name', 'output_3'), batch:('axis', 0)}, {batch:('input_name', 'output_4'), batch:('axis', 0)}, {batch:('input_name', 'embedding'), batch:('axis', 0)}, {batch:('input_name', 'expand'), batch:('axis', 0)}, {batch:('input_name', 'unsqueeze_4'), batch:('axis', 0)}, {batch:('input_name', '_onx_cast_unsqueeze_40'), batch:('axis', 0)}, {batch:('input_name', 'add_2'), batch:('axis', 0)}, {batch:('input_name', 'eq_7'), batch:('axis', 0)}, {batch:('input_name', 'masked_fill'), batch:('axis', 0)}, {batch:('input_name', 'expand_as'), batch:('axis', 0)}, {batch:('input_name', '_onx_div_sub_dropout00'), batch:('axis', 0)}, {batch:('input_name', '_onx_matmul_layer_norm0'), batch:('axis', 0)}, {batch:('input_name', 'view'), batch:('axis', 0)}, {batch:('input_name', 'transpose_1'), batch:('axis', 0)}, {batch:('input_name', 'slice_17'), batch:('axis', 0)}, {batch:('input_name', 'slice_18'), batch:('axis', 0)}, {batch:('input_name', 'slice_21'), batch:('axis', 0)}, {batch:('input_name', 'slice_22'), batch:('axis', 0)}, {batch:('input_name', '_onx_matmul_layer_norm02'), batch:('axis', 0)}, {batch:('input_name', 'view_1'), batch:('axis', 0)}, {batch:('input_name', 'transpose_2'), batch:('axis', 0)}, {batch:('input_name', 'slice_19'), batch:('axis', 0)}, {batch:('input_name', 'slice_20'), batch:('axis', 0)}, {batch:('input_name', 'slice_23'), batch:('axis', 0)}, {batch:('input_name', 
'slice_24'), batch:('axis', 0)}, {batch:('input_name', '_onx_matmul_layer_norm03'), batch:('axis', 0)}, {batch:('input_name', 'view_2'), batch:('axis', 0)}, {batch:('input_name', 'transpose_3'), batch:('axis', 0)}, {batch:('input_name', 'mul_2'), batch:('axis', 0)}, {batch:('input_name', 'neg'), batch:('axis', 0)}, {batch:('input_name', 'cat_1'), batch:('axis', 0)}, {batch:('input_name', 'mul_3'), batch:('axis', 0)}, {batch:('input_name', 'add_3'), batch:('axis', 0)}, {batch:('input_name', 'mul_4'), batch:('axis', 0)}, {batch:('input_name', 'neg_1'), batch:('axis', 0)}, {batch:('input_name', 'cat_2'), batch:('axis', 0)}, {batch:('input_name', 'mul_5'), batch:('axis', 0)}, {batch:('input_name', 'add_4'), batch:('axis', 0)}, {batch:('input_name', 'cat_3'), batch:('axis', 0)}, {batch:('input_name', 'cat_4'), batch:('axis', 0)}, {batch:('input_name', '_onx_transpose_cat_50'), batch:('axis', 0)}, {batch:('input_name', 'scaled_dot_product_attention'), batch:('axis', 0)}, {batch:('input_name', 'transpose_4'), batch:('axis', 0)}, {batch:('input_name', 'reshape_1'), batch:('axis', 0)}, {batch:('input_name', '_onx_matmul_layer_norm04'), batch:('axis', 0)}, {batch:('input_name', '_onx_mul_linear_40'), batch:('axis', 0)}, {batch:('input_name', 'pow_1'), batch:('axis', 0)}, {batch:('input_name', '_onx_mul_pow_10'), batch:('axis', 0)}, {batch:('input_name', 'add_6'), batch:('axis', 0)}, {batch:('input_name', '_onx_mul_add_60'), batch:('axis', 0)}, {batch:('input_name', 'tanh'), batch:('axis', 0)}, {batch:('input_name', 'add_7'), batch:('axis', 0)}, {batch:('input_name', 'mul_9'), batch:('axis', 0)}, {batch:('input_name', '_onx_matmul_mul_90'), batch:('axis', 0)}, {batch:('input_name', 'add_8'), batch:('axis', 0)}, {batch:('input_name', 'add_9'), batch:('axis', 0)}, {batch:('input_name', '_onx_div_sub_add_900'), batch:('axis', 0)}, {batch:('input_name', '_onx_matmul_layer_norm_10'), batch:('axis', 0)}, {batch:('input_name', 'view_3'), batch:('axis', 0)}, {batch:('input_name', 
'transpose_5'), batch:('axis', 0)}, {batch:('input_name', 'slice_28'), batch:('axis', 0)}, {batch:('input_name', 'slice_29'), batch:('axis', 0)}, {batch:('input_name', 'slice_32'), batch:('axis', 0)}, {batch:('input_name', 'slice_33'), batch:('axis', 0)}, {batch:('input_name', '_onx_matmul_layer_norm_102'), batch:('axis', 0)}, {batch:('input_name', 'view_4'), batch:('axis', 0)}, {batch:('input_name', 'transpose_6'), batch:('axis', 0)}, {batch:('input_name', 'slice_30'), batch:('axis', 0)}, {batch:('input_name', 'slice_31'), batch:('axis', 0)}, {batch:('input_name', 'slice_34'), batch:('axis', 0)}, {batch:('input_name', 'slice_35'), batch:('axis', 0)}, {batch:('input_name', '_onx_matmul_layer_norm_103'), batch:('axis', 0)}, {batch:('input_name', 'view_5'), batch:('axis', 0)}, {batch:('input_name', 'transpose_7'), batch:('axis', 0)}, {batch:('input_name', 'mul_10'), batch:('axis', 0)}, {batch:('input_name', 'neg_2'), batch:('axis', 0)}, {batch:('input_name', 'cat_7'), batch:('axis', 0)}, {batch:('input_name', 'mul_11'), batch:('axis', 0)}, {batch:('input_name', 'add_10'), batch:('axis', 0)}, {batch:('input_name', 'mul_12'), batch:('axis', 0)}, {batch:('input_name', 'neg_3'), batch:('axis', 0)}, {batch:('input_name', 'cat_8'), batch:('axis', 0)}, {batch:('input_name', 'mul_13'), batch:('axis', 0)}, {batch:('input_name', 'add_11'), batch:('axis', 0)}, {batch:('input_name', 'cat_9'), batch:('axis', 0)}, {batch:('input_name', 'cat_10'), batch:('axis', 0)}, {batch:('input_name', '_onx_transpose_cat_110'), batch:('axis', 0)}, {batch:('input_name', 'scaled_dot_product_attention_1'), batch:('axis', 0)}, {batch:('input_name', 'transpose_8'), batch:('axis', 0)}, {batch:('input_name', 'reshape_2'), batch:('axis', 0)}, {batch:('input_name', '_onx_matmul_layer_norm_104'), batch:('axis', 0)}, {batch:('input_name', '_onx_mul_linear_100'), batch:('axis', 0)}, {batch:('input_name', 'pow_2'), batch:('axis', 0)}, {batch:('input_name', '_onx_mul_pow_20'), batch:('axis', 0)}, 
{batch:('input_name', 'add_12'), batch:('axis', 0)}, {batch:('input_name', '_onx_mul_add_120'), batch:('axis', 0)}, {batch:('input_name', 'tanh_1'), batch:('axis', 0)}, {batch:('input_name', 'add_13'), batch:('axis', 0)}, {batch:('input_name', 'mul_17'), batch:('axis', 0)}, {batch:('input_name', '_onx_matmul_mul_170'), batch:('axis', 0)}, {batch:('input_name', 'add_14'), batch:('axis', 0)}, {batch:('input_name', 'add_15'), batch:('axis', 0)}, {batch:('input_name', '_onx_div_sub_add_1500'), batch:('axis', 0)}, {batch:('input_name', '_onx_matmul_layer_norm03'), batch:('axis', 0)}, {batch:('input_name', 'pow_1'), batch:('axis', 0)}, {batch:('input_name', 'neg_1'), batch:('axis', 0)}, {batch:('input_name', 'add_13'), batch:('axis', 0)}, {batch:('input_name', 'clone'), batch:('axis', 0)}, {batch:('input_name', 'linear_6'), batch:('axis', 0)}, {batch:('input_name', 'mul_13'), batch:('axis', 0)}, {batch:('input_name', 'layer_norm_1'), batch:('axis', 0)}, {batch:('input_name', 'mul_16'), batch:('axis', 0)}, {batch:('input_name', '_onx_sub_dropout0'), batch:('axis', 0)}, {batch:('input_name', 'add_7'), batch:('axis', 0)}, {batch:('input_name', '_onx_add_mul_div_sub_add_90000'), batch:('axis', 0)}, {batch:('input_name', 'slice_21'), batch:('axis', 0)}, {batch:('input_name', 'mul_6'), batch:('axis', 0)}, {batch:('input_name', 'linear_4'), batch:('axis', 0)}, {batch:('input_name', 'mul_11'), batch:('axis', 0)}, {batch:('input_name', 'view_3'), batch:('axis', 0)}, {batch:('input_name', 'mul_17'), batch:('axis', 0)}, {batch:('input_name', 'add_15'), batch:('axis', 0)}, {batch:('input_name', 'slice_29'), batch:('axis', 0)}, {batch:('input_name', 'add_9'), batch:('axis', 0)}, {batch:('input_name', 'linear_1'), batch:('axis', 0)}, {batch:('input_name', 'slice_35'), batch:('axis', 0)}, {batch:('input_name', 'mul_14'), batch:('axis', 0)}, {batch:('input_name', '_onx_add_reducemean_pow_sub_dropout0000'), batch:('axis', 0)}, {batch:('input_name', 'dropout_4'), batch:('axis', 0)}, 
{batch:('input_name', '_onx_mul_transpose_cat_1100'), batch:('axis', 0)}, {batch:('input_name', 'add_14'), batch:('axis', 0)}, {batch:('input_name', 'slice_22'), batch:('axis', 0)}, {batch:('input_name', 'dropout_2'), batch:('axis', 0)}, {batch:('input_name', 'contiguous'), batch:('axis', 0)}, {batch:('input_name', 'add_8'), batch:('axis', 0)}, {batch:('input_name', '_onx_transpose_cat_110'), batch:('axis', 0)}, {batch:('input_name', 'scaled_dot_product_attention'), batch:('axis', 0)}, {batch:('input_name', 'mul_12'), batch:('axis', 0)}, {batch:('input_name', '_onx_matmul_layer_norm_20'), batch:('axis', 0)}, {batch:('input_name', '_onx_matmul_layer_norm02'), batch:('axis', 0)}, {batch:('input_name', 'slice_32'), batch:('axis', 0)}, {batch:('input_name', 'view_4'), batch:('axis', 0)}, {batch:('input_name', 'slice_28'), batch:('axis', 0)}, {batch:('input_name', 'linear_12'), batch:('axis', 0)}, {batch:('input_name', '_onx_reducemean_dropout0'), batch:('axis', 0)}, {batch:('input_name', '_onx_add_mul_div_sub_dropout0000'), batch:('axis', 0)}, {batch:('input_name', 'transpose_1'), batch:('axis', 0)}, {batch:('input_name', 'view_2'), batch:('axis', 0)}, {batch:('input_name', 'cat_3'), batch:('axis', 0)}, {batch:('input_name', 'linear_3'), batch:('axis', 0)}, {batch:('input_name', '_onx_transpose_cat_50'), batch:('axis', 0)}, {batch:('input_name', 'cat_6'), batch:('axis', 0)}, {batch:('input_name', 'slice_30'), batch:('axis', 0)}, {batch:('input_name', '_onx_reducemean_add_90'), batch:('axis', 0)}, {batch:('input_name', '_onx_sub_add_90'), batch:('axis', 0)}, {batch:('input_name', 'mul_10'), batch:('axis', 0)}, {batch:('input_name', '_onx_matmul_layer_norm0'), batch:('axis', 0)}, {batch:('input_name', 'neg_3'), batch:('axis', 0)}, {batch:('input_name', 'mul_2'), batch:('axis', 0)}, {batch:('input_name', 'cat_12'), batch:('axis', 0)}, {batch:('input_name', 'cat_1'), batch:('axis', 0)}, {batch:('input_name', '_onx_mul_cat_90'), batch:('axis', 0)}, {batch:('input_name', 
'_onx_div_sub_dropout00'), batch:('axis', 0)}, {batch:('input_name', 'linear_10'), batch:('axis', 0)}, {batch:('input_name', 'slice_20'), batch:('axis', 0)}, {batch:('input_name', '_onx_matmul_layer_norm_102'), batch:('axis', 0)}, {batch:('input_name', 'expand_as'), batch:('axis', 0)}, {batch:('input_name', '_onx_sqrt_add_reducemean_pow_sub_add_1500000'), batch:('axis', 0)}, {batch:('input_name', 'contiguous_1'), batch:('axis', 0)}, {batch:('input_name', 'mul_4'), batch:('axis', 0)}, {batch:('input_name', 'transpose_8'), batch:('axis', 0)}, {batch:('input_name', 'slice_24'), batch:('axis', 0)}, {batch:('input_name', 'add_12'), batch:('axis', 0)}, {batch:('input_name', 'transpose_4'), batch:('axis', 0)}, {batch:('input_name', '_onx_mul_div_sub_add_9000'), batch:('axis', 0)}, {batch:('input_name', '_onx_matmul_mul_170'), batch:('axis', 0)}, {batch:('input_name', '_onx_pow_sub_dropout00'), batch:('axis', 0)}, {batch:('input_name', 'pow_2'), batch:('axis', 0)}, {batch:('input_name', 'linear_5'), batch:('axis', 0)}, {batch:('input_name', '_onx_reducemean_pow_sub_dropout000'), batch:('axis', 0)}, {batch:('input_name', '_onx_pow_sub_add_900'), batch:('axis', 0)}, {batch:('input_name', '_onx_mul_linear_40'), batch:('axis', 0)}, {batch:('input_name', 'unsqueeze_3'), batch:('axis', 0)}, {batch:('input_name', 'slice_23'), batch:('axis', 0)}, {batch:('input_name', 'layer_norm_2'), batch:('axis', 0)}, {batch:('input_name', 'add_4'), batch:('axis', 0)}, {batch:('input_name', '_onx_matmul_layer_norm_103'), batch:('axis', 0)}, {batch:('input_name', '_onx_pow_sub_add_1500'), batch:('axis', 0)}, {batch:('input_name', 'cat_7'), batch:('axis', 0)}, {batch:('input_name', 'slice_33'), batch:('axis', 0)}, {batch:('input_name', '_onx_add_reducemean_pow_sub_add_90000'), batch:('axis', 0)}, {batch:('input_name', '_onx_reducemean_add_150'), batch:('axis', 0)}, {batch:('input_name', 'add_10'), batch:('axis', 0)}, {batch:('input_name', '_onx_matmul_reshape_10'), batch:('axis', 0)}, 
{batch:('input_name', 'linear_2'), batch:('axis', 0)}, {batch:('input_name', 'cat_11'), batch:('axis', 0)}, {batch:('input_name', 'slice_18'), batch:('axis', 0)}, {batch:('input_name', '_onx_mul_add_60'), batch:('axis', 0)}, {batch:('input_name', '_onx_mul_cat_30'), batch:('axis', 0)}, {batch:('input_name', '_onx_matmul_reshape_20'), batch:('axis', 0)}, {batch:('input_name', 'neg'), batch:('axis', 0)}, {batch:('input_name', 'mul_9'), batch:('axis', 0)}, {batch:('input_name', 'layer_norm'), batch:('axis', 0)}, {batch:('input_name', 'slice_17'), batch:('axis', 0)}, {batch:('input_name', '_onx_sqrt_add_reducemean_pow_sub_dropout00000'), batch:('axis', 0)}, {batch:('input_name', 'dropout'), batch:('axis', 0)}, {batch:('input_name', 'view_1'), batch:('axis', 0)}, {batch:('input_name', '_onx_sqrt_add_reducemean_pow_sub_add_900000'), batch:('axis', 0)}, {batch:('input_name', 'linear_7'), batch:('axis', 0)}, {batch:('input_name', 'transpose_2'), batch:('axis', 0)}, {batch:('input_name', '_onx_mul_pow_10'), batch:('axis', 0)}, {batch:('input_name', 'mul_8'), batch:('axis', 0)}, {batch:('input_name', '_onx_mul_transpose_cat_500'), batch:('axis', 0)}, {batch:('input_name', 'add_11'), batch:('axis', 0)}, {batch:('input_name', 'add_3'), batch:('axis', 0)}, {batch:('input_name', 'slice_34'), batch:('axis', 0)}, {batch:('input_name', 'linear_11'), batch:('axis', 0)}, {batch:('input_name', 'transpose_3'), batch:('axis', 0)}, {batch:('input_name', 'add_6'), batch:('axis', 0)}, {batch:('input_name', 'expand'), batch:('axis', 0)}, {batch:('input_name', 'dropout_3'), batch:('axis', 0)}, {batch:('input_name', '_onx_matmul_layer_norm_104'), batch:('axis', 0)}, {batch:('input_name', '_onx_matmul_layer_norm04'), batch:('axis', 0)}, {batch:('input_name', 'tanh'), batch:('axis', 0)}, {batch:('input_name', '_onx_cast_unsqueeze_40'), batch:('axis', 0)}, {batch:('input_name', '_onx_add_reducemean_pow_sub_add_150000'), batch:('axis', 0)}, {batch:('input_name', 'linear'), batch:('axis', 0)}, 
{batch:('input_name', 'linear_9'), batch:('axis', 0)}, {batch:('input_name', '_onx_sub_add_150'), batch:('axis', 0)}, {batch:('input_name', 'mul_5'), batch:('axis', 0)}, {batch:('input_name', 'transpose_7'), batch:('axis', 0)}, {batch:('input_name', '_onx_mul_pow_20'), batch:('axis', 0)}, {batch:('input_name', 'transpose_6'), batch:('axis', 0)}, {batch:('input_name', '_onx_mul_div_sub_dropout000'), batch:('axis', 0)}, {batch:('input_name', 'linear_8'), batch:('axis', 0)}, {batch:('input_name', '_onx_matmul_layer_norm_10'), batch:('axis', 0)}, {batch:('input_name', 'cat_9'), batch:('axis', 0)}, {batch:('input_name', 'masked_fill'), batch:('axis', 0)}, {batch:('input_name', 'slice_31'), batch:('axis', 0)}, {batch:('input_name', 'eq_7'), batch:('axis', 0)}, {batch:('input_name', 'scaled_dot_product_attention_1'), batch:('axis', 0)}, {batch:('input_name', 'dropout_1'), batch:('axis', 0)}, {batch:('input_name', '_onx_reducemean_pow_sub_add_15000'), batch:('axis', 0)}, {batch:('input_name', '_onx_reducemean_pow_sub_add_9000'), batch:('axis', 0)}, {batch:('input_name', 'add_2'), batch:('axis', 0)}, {batch:('input_name', 'tanh_1'), batch:('axis', 0)}, {batch:('input_name', 'slice_19'), batch:('axis', 0)}, {batch:('input_name', 'neg_2'), batch:('axis', 0)}, {batch:('input_name', 'cat_4'), batch:('axis', 0)}, {batch:('input_name', '_onx_add_mul_div_sub_add_150000'), batch:('axis', 0)}, {batch:('input_name', 'embedding'), batch:('axis', 0)}, {batch:('input_name', 'reshape_1'), batch:('axis', 0)}, {batch:('input_name', 'mul_3'), batch:('axis', 0)}, {batch:('input_name', '_onx_mul_linear_100'), batch:('axis', 0)}, {batch:('input_name', '_onx_mul_add_120'), batch:('axis', 0)}, {batch:('input_name', 'cat_8'), batch:('axis', 0)}, {batch:('input_name', '_onx_div_sub_add_900'), batch:('axis', 0)}, {batch:('input_name', 'cat_5'), batch:('axis', 0)}, {batch:('input_name', 'view'), batch:('axis', 0)}, {batch:('input_name', 'unsqueeze_4'), batch:('axis', 0)}, {batch:('input_name', 
'mul_15'), batch:('axis', 0)}, {batch:('input_name', 'transpose_5'), batch:('axis', 0)}, {batch:('input_name', '_onx_div_sub_add_1500'), batch:('axis', 0)}, {batch:('input_name', 'view_5'), batch:('axis', 0)}, {batch:('input_name', 'reshape_2'), batch:('axis', 0)}, {batch:('input_name', 'cat_2'), batch:('axis', 0)}, {batch:('input_name', 'mul_7'), batch:('axis', 0)}, {batch:('input_name', '_onx_matmul_mul_90'), batch:('axis', 0)}, {batch:('input_name', 'cat_10'), batch:('axis', 0)}, {batch:('input_name', '_onx_mul_div_sub_add_15000'), batch:('axis', 0)}]
dynsrc: batch*seq_length -> [{batch*seq_length:('input_name', 'MatMulAddPattern--reshape_1'), batch*seq_length:('axis', 0)}, {batch*seq_length:('input_name', 'MatMulAddPattern--reshape_12'), batch*seq_length:('axis', 0)}, {batch*seq_length:('input_name', 'MatMulAddPattern--reshape_13'), batch*seq_length:('axis', 0)}, {batch*seq_length:('input_name', 'MatMulAddPattern--reshape_2'), batch*seq_length:('axis', 0)}, {batch*seq_length:('input_name', 'MatMulAddPattern--reshape_22'), batch*seq_length:('axis', 0)}, {batch*seq_length:('input_name', 'MatMulAddPattern--reshape_23'), batch*seq_length:('axis', 0)}, {batch*seq_length:('input_name', 'MatMulAddPattern--reshape_13'), batch*seq_length:('axis', 0)}, {batch*seq_length:('input_name', 'MatMulAddPattern--reshape_22'), batch*seq_length:('axis', 0)}, {batch*seq_length:('input_name', 'MatMulAddPattern--reshape_12'), batch*seq_length:('axis', 0)}, {batch*seq_length:('input_name', 'MatMulAddPattern--reshape_23'), batch*seq_length:('axis', 0)}, {batch*seq_length:('input_name', 'MatMulAddPattern--reshape_2'), batch*seq_length:('axis', 0)}, {batch*seq_length:('input_name', 'MatMulAddPattern--reshape_1'), batch*seq_length:('axis', 0)}]
dynsrc: cache_length -> [{cache_length:('input_name', 'past_key_values_key_cache_0'), cache_length:('axis', 2)}, {cache_length:('input_name', 'past_key_values_key_cache_1'), cache_length:('axis', 2)}, {cache_length:('input_name', 'past_key_values_value_cache_0'), cache_length:('axis', 2)}, {cache_length:('input_name', 'past_key_values_value_cache_1'), cache_length:('axis', 2)}]
dynsrc: cache_length+seq_length -> [{cache_length+seq_length:('input_name', 'output_1'), cache_length+seq_length:('axis', 2)}, {cache_length+seq_length:('input_name', 'output_2'), cache_length+seq_length:('axis', 2)}, {cache_length+seq_length:('input_name', 'output_3'), cache_length+seq_length:('axis', 2)}, {cache_length+seq_length:('input_name', 'output_4'), cache_length+seq_length:('axis', 2)}, {cache_length+seq_length:('input_name', '_onx_transpose_cat_50'), cache_length+seq_length:('axis', 3)}, {cache_length+seq_length:('input_name', '_onx_transpose_cat_110'), cache_length+seq_length:('axis', 3)}, {cache_length+seq_length:('input_name', '_onx_mul_transpose_cat_1100'), cache_length+seq_length:('axis', 3)}, {cache_length+seq_length:('input_name', '_onx_transpose_cat_110'), cache_length+seq_length:('axis', 3)}, {cache_length+seq_length:('input_name', '_onx_transpose_cat_50'), cache_length+seq_length:('axis', 3)}, {cache_length+seq_length:('input_name', 'cat_6'), cache_length+seq_length:('axis', 2)}, {cache_length+seq_length:('input_name', 'cat_12'), cache_length+seq_length:('axis', 2)}, {cache_length+seq_length:('input_name', 'cat_11'), cache_length+seq_length:('axis', 2)}, {cache_length+seq_length:('input_name', '_onx_mul_transpose_cat_500'), cache_length+seq_length:('axis', 3)}, {cache_length+seq_length:('input_name', 'cat_5'), cache_length+seq_length:('axis', 2)}]
dynsrc: channel -> [{channel:('input_name', 'attention_mask'), channel:('axis', 1)}, {channel:('input_name', 'full'), channel:('axis', 1)}, {channel:('input_name', 'triu'), channel:('axis', 1)}, {channel:('input_name', 'arange_1'), channel:('axis', 0)}, {channel:('input_name', 'gt'), channel:('axis', 1)}, {channel:('input_name', '_onx_cast_gt0'), channel:('axis', 1)}, {channel:('input_name', '_onx_mul_triu0'), channel:('axis', 1)}, {channel:('input_name', 'unsqueeze_2'), channel:('axis', 3)}, {channel:('input_name', 'expand'), channel:('axis', 3)}, {channel:('input_name', 'unsqueeze_4'), channel:('axis', 3)}, {channel:('input_name', '_onx_cast_unsqueeze_40'), channel:('axis', 3)}, {channel:('input_name', 'add_2'), channel:('axis', 3)}, {channel:('input_name', 'eq_7'), channel:('axis', 3)}, {channel:('input_name', 'masked_fill'), channel:('axis', 3)}, {channel:('input_name', 'expand_as'), channel:('axis', 3)}, {channel:('input_name', 'clone'), channel:('axis', 3)}, {channel:('input_name', '_onx_cast_gt0'), channel:('axis', 1)}, {channel:('input_name', 'gt'), channel:('axis', 1)}, {channel:('input_name', 'triu'), channel:('axis', 1)}, {channel:('input_name', 'full'), channel:('axis', 1)}, {channel:('input_name', 'unsqueeze_1'), channel:('axis', 2)}, {channel:('input_name', 'expand_as'), channel:('axis', 3)}, {channel:('input_name', '_onx_mul_triu0'), channel:('axis', 1)}, {channel:('input_name', 'unsqueeze_2'), channel:('axis', 3)}, {channel:('input_name', 'unsqueeze_3'), channel:('axis', 2)}, {channel:('input_name', 'arange_1'), channel:('axis', 0)}, {channel:('input_name', 'expand'), channel:('axis', 3)}, {channel:('input_name', '_onx_cast_unsqueeze_40'), channel:('axis', 3)}, {channel:('input_name', 'masked_fill'), channel:('axis', 3)}, {channel:('input_name', 'mul_'), channel:('axis', 1)}, {channel:('input_name', 'eq_7'), channel:('axis', 3)}, {channel:('input_name', 'add_2'), channel:('axis', 3)}, {channel:('input_name', 'unsqueeze_4'), channel:('axis', 3)}]
dynsrc: seq_length -> [{seq_length:('input_name', 'input_ids'), seq_length:('axis', 1)}, {seq_length:('input_name', 'output_0'), seq_length:('axis', 1)}, {seq_length:('input_name', 'embedding'), seq_length:('axis', 1)}, {seq_length:('input_name', 'arange'), seq_length:('axis', 0)}, {seq_length:('input_name', 'unsqueeze'), seq_length:('axis', 1)}, {seq_length:('input_name', 'full'), seq_length:('axis', 0)}, {seq_length:('input_name', 'triu'), seq_length:('axis', 0)}, {seq_length:('input_name', 'reshape'), seq_length:('axis', 0)}, {seq_length:('input_name', 'gt'), seq_length:('axis', 0)}, {seq_length:('input_name', '_onx_cast_gt0'), seq_length:('axis', 0)}, {seq_length:('input_name', '_onx_mul_triu0'), seq_length:('axis', 0)}, {seq_length:('input_name', 'unsqueeze_2'), seq_length:('axis', 2)}, {seq_length:('input_name', 'expand'), seq_length:('axis', 2)}, {seq_length:('input_name', 'add_2'), seq_length:('axis', 2)}, {seq_length:('input_name', 'eq_7'), seq_length:('axis', 2)}, {seq_length:('input_name', 'masked_fill'), seq_length:('axis', 2)}, {seq_length:('input_name', 'expand_as'), seq_length:('axis', 2)}, {seq_length:('input_name', 'wrap_with_set_grad_enabled#0'), seq_length:('axis', 1)}, {seq_length:('input_name', 'wrap_with_set_grad_enabled#1'), seq_length:('axis', 1)}, {seq_length:('input_name', 'unsqueeze_8'), seq_length:('axis', 2)}, {seq_length:('input_name', '_onx_div_sub_dropout00'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_matmul_layer_norm0'), seq_length:('axis', 1)}, {seq_length:('input_name', 'view'), seq_length:('axis', 1)}, {seq_length:('input_name', 'transpose_1'), seq_length:('axis', 2)}, {seq_length:('input_name', 'slice_17'), seq_length:('axis', 2)}, {seq_length:('input_name', 'slice_18'), seq_length:('axis', 2)}, {seq_length:('input_name', 'slice_21'), seq_length:('axis', 2)}, {seq_length:('input_name', 'slice_22'), seq_length:('axis', 2)}, {seq_length:('input_name', '_onx_matmul_layer_norm02'), seq_length:('axis', 1)}, 
{seq_length:('input_name', 'view_1'), seq_length:('axis', 1)}, {seq_length:('input_name', 'transpose_2'), seq_length:('axis', 2)}, {seq_length:('input_name', 'slice_19'), seq_length:('axis', 2)}, {seq_length:('input_name', 'slice_20'), seq_length:('axis', 2)}, {seq_length:('input_name', 'slice_23'), seq_length:('axis', 2)}, {seq_length:('input_name', 'slice_24'), seq_length:('axis', 2)}, {seq_length:('input_name', '_onx_matmul_layer_norm03'), seq_length:('axis', 1)}, {seq_length:('input_name', 'view_2'), seq_length:('axis', 1)}, {seq_length:('input_name', 'transpose_3'), seq_length:('axis', 2)}, {seq_length:('input_name', 'unsqueeze_9'), seq_length:('axis', 2)}, {seq_length:('input_name', 'mul_2'), seq_length:('axis', 2)}, {seq_length:('input_name', 'neg'), seq_length:('axis', 2)}, {seq_length:('input_name', 'cat_1'), seq_length:('axis', 2)}, {seq_length:('input_name', 'mul_3'), seq_length:('axis', 2)}, {seq_length:('input_name', 'add_3'), seq_length:('axis', 2)}, {seq_length:('input_name', 'mul_4'), seq_length:('axis', 2)}, {seq_length:('input_name', 'neg_1'), seq_length:('axis', 2)}, {seq_length:('input_name', 'cat_2'), seq_length:('axis', 2)}, {seq_length:('input_name', 'mul_5'), seq_length:('axis', 2)}, {seq_length:('input_name', 'add_4'), seq_length:('axis', 2)}, {seq_length:('input_name', 'cat_3'), seq_length:('axis', 2)}, {seq_length:('input_name', 'cat_4'), seq_length:('axis', 2)}, {seq_length:('input_name', 'scaled_dot_product_attention'), seq_length:('axis', 2)}, {seq_length:('input_name', 'transpose_4'), seq_length:('axis', 1)}, {seq_length:('input_name', 'reshape_1'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_matmul_layer_norm04'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_mul_linear_40'), seq_length:('axis', 1)}, {seq_length:('input_name', 'pow_1'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_mul_pow_10'), seq_length:('axis', 1)}, {seq_length:('input_name', 'add_6'), seq_length:('axis', 1)}, 
{seq_length:('input_name', '_onx_mul_add_60'), seq_length:('axis', 1)}, {seq_length:('input_name', 'tanh'), seq_length:('axis', 1)}, {seq_length:('input_name', 'add_7'), seq_length:('axis', 1)}, {seq_length:('input_name', 'mul_9'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_matmul_mul_90'), seq_length:('axis', 1)}, {seq_length:('input_name', 'add_8'), seq_length:('axis', 1)}, {seq_length:('input_name', 'add_9'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_div_sub_add_900'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_matmul_layer_norm_10'), seq_length:('axis', 1)}, {seq_length:('input_name', 'view_3'), seq_length:('axis', 1)}, {seq_length:('input_name', 'transpose_5'), seq_length:('axis', 2)}, {seq_length:('input_name', 'slice_28'), seq_length:('axis', 2)}, {seq_length:('input_name', 'slice_29'), seq_length:('axis', 2)}, {seq_length:('input_name', 'slice_32'), seq_length:('axis', 2)}, {seq_length:('input_name', 'slice_33'), seq_length:('axis', 2)}, {seq_length:('input_name', '_onx_matmul_layer_norm_102'), seq_length:('axis', 1)}, {seq_length:('input_name', 'view_4'), seq_length:('axis', 1)}, {seq_length:('input_name', 'transpose_6'), seq_length:('axis', 2)}, {seq_length:('input_name', 'slice_30'), seq_length:('axis', 2)}, {seq_length:('input_name', 'slice_31'), seq_length:('axis', 2)}, {seq_length:('input_name', 'slice_34'), seq_length:('axis', 2)}, {seq_length:('input_name', 'slice_35'), seq_length:('axis', 2)}, {seq_length:('input_name', '_onx_matmul_layer_norm_103'), seq_length:('axis', 1)}, {seq_length:('input_name', 'view_5'), seq_length:('axis', 1)}, {seq_length:('input_name', 'transpose_7'), seq_length:('axis', 2)}, {seq_length:('input_name', 'unsqueeze_11'), seq_length:('axis', 2)}, {seq_length:('input_name', 'mul_10'), seq_length:('axis', 2)}, {seq_length:('input_name', 'neg_2'), seq_length:('axis', 2)}, {seq_length:('input_name', 'cat_7'), seq_length:('axis', 2)}, {seq_length:('input_name', 'mul_11'), 
seq_length:('axis', 2)}, {seq_length:('input_name', 'add_10'), seq_length:('axis', 2)}, {seq_length:('input_name', 'mul_12'), seq_length:('axis', 2)}, {seq_length:('input_name', 'neg_3'), seq_length:('axis', 2)}, {seq_length:('input_name', 'cat_8'), seq_length:('axis', 2)}, {seq_length:('input_name', 'mul_13'), seq_length:('axis', 2)}, {seq_length:('input_name', 'add_11'), seq_length:('axis', 2)}, {seq_length:('input_name', 'cat_9'), seq_length:('axis', 2)}, {seq_length:('input_name', 'cat_10'), seq_length:('axis', 2)}, {seq_length:('input_name', 'scaled_dot_product_attention_1'), seq_length:('axis', 2)}, {seq_length:('input_name', 'transpose_8'), seq_length:('axis', 1)}, {seq_length:('input_name', 'reshape_2'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_matmul_layer_norm_104'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_mul_linear_100'), seq_length:('axis', 1)}, {seq_length:('input_name', 'pow_2'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_mul_pow_20'), seq_length:('axis', 1)}, {seq_length:('input_name', 'add_12'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_mul_add_120'), seq_length:('axis', 1)}, {seq_length:('input_name', 'tanh_1'), seq_length:('axis', 1)}, {seq_length:('input_name', 'add_13'), seq_length:('axis', 1)}, {seq_length:('input_name', 'mul_17'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_matmul_mul_170'), seq_length:('axis', 1)}, {seq_length:('input_name', 'add_14'), seq_length:('axis', 1)}, {seq_length:('input_name', 'add_15'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_div_sub_add_1500'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_matmul_layer_norm03'), seq_length:('axis', 1)}, {seq_length:('input_name', 'pow_1'), seq_length:('axis', 1)}, {seq_length:('input_name', 'neg_1'), seq_length:('axis', 2)}, {seq_length:('input_name', 'add_13'), seq_length:('axis', 1)}, {seq_length:('input_name', 'clone'), seq_length:('axis', 2)}, 
{seq_length:('input_name', 'linear_6'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_cast_gt0'), seq_length:('axis', 0)}, {seq_length:('input_name', 'mul_13'), seq_length:('axis', 2)}, {seq_length:('input_name', 'layer_norm_1'), seq_length:('axis', 1)}, {seq_length:('input_name', 'mul_16'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_sub_dropout0'), seq_length:('axis', 1)}, {seq_length:('input_name', 'add_7'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_add_mul_div_sub_add_90000'), seq_length:('axis', 1)}, {seq_length:('input_name', 'slice_21'), seq_length:('axis', 2)}, {seq_length:('input_name', 'mul_6'), seq_length:('axis', 1)}, {seq_length:('input_name', 'linear_4'), seq_length:('axis', 1)}, {seq_length:('input_name', 'mul_11'), seq_length:('axis', 2)}, {seq_length:('input_name', 'view_3'), seq_length:('axis', 1)}, {seq_length:('input_name', 'mul_17'), seq_length:('axis', 1)}, {seq_length:('input_name', 'add_15'), seq_length:('axis', 1)}, {seq_length:('input_name', 'gt'), seq_length:('axis', 0)}, {seq_length:('input_name', 'to_4'), seq_length:('axis', 1)}, {seq_length:('input_name', 'slice_29'), seq_length:('axis', 2)}, {seq_length:('input_name', 'add_9'), seq_length:('axis', 1)}, {seq_length:('input_name', 'linear_1'), seq_length:('axis', 1)}, {seq_length:('input_name', 'triu'), seq_length:('axis', 0)}, {seq_length:('input_name', 'slice_35'), seq_length:('axis', 2)}, {seq_length:('input_name', 'mul_14'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_add_reducemean_pow_sub_dropout0000'), seq_length:('axis', 1)}, {seq_length:('input_name', 'dropout_4'), seq_length:('axis', 1)}, {seq_length:('input_name', 'add_14'), seq_length:('axis', 1)}, {seq_length:('input_name', 'slice_22'), seq_length:('axis', 2)}, {seq_length:('input_name', 'dropout_2'), seq_length:('axis', 1)}, {seq_length:('input_name', 'full'), seq_length:('axis', 0)}, {seq_length:('input_name', 'contiguous'), seq_length:('axis', 1)}, 
{seq_length:('input_name', 'add_8'), seq_length:('axis', 1)}, {seq_length:('input_name', 'scaled_dot_product_attention'), seq_length:('axis', 2)}, {seq_length:('input_name', 'mul_12'), seq_length:('axis', 2)}, {seq_length:('input_name', '_onx_matmul_layer_norm_20'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_matmul_layer_norm02'), seq_length:('axis', 1)}, {seq_length:('input_name', 'slice_32'), seq_length:('axis', 2)}, {seq_length:('input_name', 'view_4'), seq_length:('axis', 1)}, {seq_length:('input_name', 'slice_28'), seq_length:('axis', 2)}, {seq_length:('input_name', 'linear_12'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_reducemean_dropout0'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_add_mul_div_sub_dropout0000'), seq_length:('axis', 1)}, {seq_length:('input_name', 'transpose_1'), seq_length:('axis', 2)}, {seq_length:('input_name', 'view_2'), seq_length:('axis', 1)}, {seq_length:('input_name', 'cat_3'), seq_length:('axis', 2)}, {seq_length:('input_name', 'linear_3'), seq_length:('axis', 1)}, {seq_length:('input_name', 'slice_30'), seq_length:('axis', 2)}, {seq_length:('input_name', '_onx_reducemean_add_90'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_sub_add_90'), seq_length:('axis', 1)}, {seq_length:('input_name', 'mul_10'), seq_length:('axis', 2)}, {seq_length:('input_name', '_onx_matmul_layer_norm0'), seq_length:('axis', 1)}, {seq_length:('input_name', 'neg_3'), seq_length:('axis', 2)}, {seq_length:('input_name', 'mul_2'), seq_length:('axis', 2)}, {seq_length:('input_name', 'unsqueeze_1'), seq_length:('axis', 1)}, {seq_length:('input_name', 'cat_1'), seq_length:('axis', 2)}, {seq_length:('input_name', '_onx_mul_cat_90'), seq_length:('axis', 2)}, {seq_length:('input_name', '_onx_div_sub_dropout00'), seq_length:('axis', 1)}, {seq_length:('input_name', 'linear_10'), seq_length:('axis', 1)}, {seq_length:('input_name', 'slice_20'), seq_length:('axis', 2)}, {seq_length:('input_name', 
'_onx_matmul_layer_norm_102'), seq_length:('axis', 1)}, {seq_length:('input_name', 'expand_as'), seq_length:('axis', 2)}, {seq_length:('input_name', '_onx_sqrt_add_reducemean_pow_sub_add_1500000'), seq_length:('axis', 1)}, {seq_length:('input_name', 'contiguous_1'), seq_length:('axis', 1)}, {seq_length:('input_name', 'mul_4'), seq_length:('axis', 2)}, {seq_length:('input_name', 'transpose_8'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_mul_triu0'), seq_length:('axis', 0)}, {seq_length:('input_name', 'slice_24'), seq_length:('axis', 2)}, {seq_length:('input_name', 'add_12'), seq_length:('axis', 1)}, {seq_length:('input_name', 'unsqueeze_2'), seq_length:('axis', 2)}, {seq_length:('input_name', 'transpose_4'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_mul_div_sub_add_9000'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_matmul_mul_170'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_pow_sub_dropout00'), seq_length:('axis', 1)}, {seq_length:('input_name', 'pow_2'), seq_length:('axis', 1)}, {seq_length:('input_name', 'linear_5'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_reducemean_pow_sub_dropout000'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_pow_sub_add_900'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_mul_linear_40'), seq_length:('axis', 1)}, {seq_length:('input_name', 'slice_23'), seq_length:('axis', 2)}, {seq_length:('input_name', 'layer_norm_2'), seq_length:('axis', 1)}, {seq_length:('input_name', 'add_4'), seq_length:('axis', 2)}, {seq_length:('input_name', '_onx_matmul_layer_norm_103'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_pow_sub_add_1500'), seq_length:('axis', 1)}, {seq_length:('input_name', 'cat_7'), seq_length:('axis', 2)}, {seq_length:('input_name', 'slice_33'), seq_length:('axis', 2)}, {seq_length:('input_name', '_onx_add_reducemean_pow_sub_add_90000'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_reducemean_add_150'), 
seq_length:('axis', 1)}, {seq_length:('input_name', 'add_10'), seq_length:('axis', 2)}, {seq_length:('input_name', '_onx_matmul_reshape_10'), seq_length:('axis', 1)}, {seq_length:('input_name', 'linear_2'), seq_length:('axis', 1)}, {seq_length:('input_name', 'wrap_with_set_grad_enabled#0'), seq_length:('axis', 1)}, {seq_length:('input_name', 'unsqueeze_8'), seq_length:('axis', 2)}, {seq_length:('input_name', 'slice_18'), seq_length:('axis', 2)}, {seq_length:('input_name', '_onx_mul_add_60'), seq_length:('axis', 1)}, {seq_length:('input_name', 'unsqueeze_9'), seq_length:('axis', 2)}, {seq_length:('input_name', '_onx_mul_cat_30'), seq_length:('axis', 2)}, {seq_length:('input_name', '_onx_matmul_reshape_20'), seq_length:('axis', 1)}, {seq_length:('input_name', 'neg'), seq_length:('axis', 2)}, {seq_length:('input_name', 'mul_9'), seq_length:('axis', 1)}, {seq_length:('input_name', 'layer_norm'), seq_length:('axis', 1)}, {seq_length:('input_name', 'slice_17'), seq_length:('axis', 2)}, {seq_length:('input_name', '_onx_sqrt_add_reducemean_pow_sub_dropout00000'), seq_length:('axis', 1)}, {seq_length:('input_name', 'dropout'), seq_length:('axis', 1)}, {seq_length:('input_name', 'view_1'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_sqrt_add_reducemean_pow_sub_add_900000'), seq_length:('axis', 1)}, {seq_length:('input_name', 'linear_7'), seq_length:('axis', 1)}, {seq_length:('input_name', 'transpose_2'), seq_length:('axis', 2)}, {seq_length:('input_name', '_onx_mul_pow_10'), seq_length:('axis', 1)}, {seq_length:('input_name', 'mul_8'), seq_length:('axis', 1)}, {seq_length:('input_name', 'add_11'), seq_length:('axis', 2)}, {seq_length:('input_name', 'add_3'), seq_length:('axis', 2)}, {seq_length:('input_name', 'slice_34'), seq_length:('axis', 2)}, {seq_length:('input_name', 'linear_11'), seq_length:('axis', 1)}, {seq_length:('input_name', 'transpose_3'), seq_length:('axis', 2)}, {seq_length:('input_name', 'add_6'), seq_length:('axis', 1)}, 
{seq_length:('input_name', 'expand'), seq_length:('axis', 2)}, {seq_length:('input_name', 'dropout_3'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_matmul_layer_norm_104'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_matmul_layer_norm04'), seq_length:('axis', 1)}, {seq_length:('input_name', 'tanh'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_add_reducemean_pow_sub_add_150000'), seq_length:('axis', 1)}, {seq_length:('input_name', 'linear'), seq_length:('axis', 1)}, {seq_length:('input_name', 'unsqueeze'), seq_length:('axis', 1)}, {seq_length:('input_name', 'linear_9'), seq_length:('axis', 1)}, {seq_length:('input_name', 'unsqueeze_11'), seq_length:('axis', 2)}, {seq_length:('input_name', '_onx_sub_add_150'), seq_length:('axis', 1)}, {seq_length:('input_name', 'mul_5'), seq_length:('axis', 2)}, {seq_length:('input_name', 'transpose_7'), seq_length:('axis', 2)}, {seq_length:('input_name', '_onx_mul_pow_20'), seq_length:('axis', 1)}, {seq_length:('input_name', 'transpose_6'), seq_length:('axis', 2)}, {seq_length:('input_name', '_onx_mul_div_sub_dropout000'), seq_length:('axis', 1)}, {seq_length:('input_name', 'linear_8'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_matmul_layer_norm_10'), seq_length:('axis', 1)}, {seq_length:('input_name', 'arange'), seq_length:('axis', 0)}, {seq_length:('input_name', 'cat_9'), seq_length:('axis', 2)}, {seq_length:('input_name', 'masked_fill'), seq_length:('axis', 2)}, {seq_length:('input_name', 'unsqueeze_10'), seq_length:('axis', 2)}, {seq_length:('input_name', 'slice_31'), seq_length:('axis', 2)}, {seq_length:('input_name', 'mul_'), seq_length:('axis', 0)}, {seq_length:('input_name', 'eq_7'), seq_length:('axis', 2)}, {seq_length:('input_name', 'scaled_dot_product_attention_1'), seq_length:('axis', 2)}, {seq_length:('input_name', 'dropout_1'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_reducemean_pow_sub_add_15000'), seq_length:('axis', 1)}, 
{seq_length:('input_name', '_onx_reducemean_pow_sub_add_9000'), seq_length:('axis', 1)}, {seq_length:('input_name', 'add_2'), seq_length:('axis', 2)}, {seq_length:('input_name', 'tanh_1'), seq_length:('axis', 1)}, {seq_length:('input_name', 'slice_19'), seq_length:('axis', 2)}, {seq_length:('input_name', 'neg_2'), seq_length:('axis', 2)}, {seq_length:('input_name', 'cat_4'), seq_length:('axis', 2)}, {seq_length:('input_name', 'wrap_with_set_grad_enabled#1'), seq_length:('axis', 1)}, {seq_length:('input_name', 'to_5'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_add_mul_div_sub_add_150000'), seq_length:('axis', 1)}, {seq_length:('input_name', 'embedding'), seq_length:('axis', 1)}, {seq_length:('input_name', 'reshape_1'), seq_length:('axis', 1)}, {seq_length:('input_name', 'mul_3'), seq_length:('axis', 2)}, {seq_length:('input_name', '_onx_mul_linear_100'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_mul_add_120'), seq_length:('axis', 1)}, {seq_length:('input_name', 'cat_8'), seq_length:('axis', 2)}, {seq_length:('input_name', '_onx_div_sub_add_900'), seq_length:('axis', 1)}, {seq_length:('input_name', 'view'), seq_length:('axis', 1)}, {seq_length:('input_name', 'mul_15'), seq_length:('axis', 1)}, {seq_length:('input_name', 'transpose_5'), seq_length:('axis', 2)}, {seq_length:('input_name', '_onx_div_sub_add_1500'), seq_length:('axis', 1)}, {seq_length:('input_name', 'view_5'), seq_length:('axis', 1)}, {seq_length:('input_name', 'reshape_2'), seq_length:('axis', 1)}, {seq_length:('input_name', 'cat_2'), seq_length:('axis', 2)}, {seq_length:('input_name', 'mul_7'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_matmul_mul_90'), seq_length:('axis', 1)}, {seq_length:('input_name', 'cat_10'), seq_length:('axis', 2)}, {seq_length:('input_name', 'reshape'), seq_length:('axis', 0)}, {seq_length:('input_name', '_onx_mul_div_sub_add_15000'), seq_length:('axis', 1)}]
opset: : 18
opset: local_functions.0: 1
opset: local_functions: 1
init: b_model_rotary_emb_inv_freq: ?: ?                                -- GraphBuilder._update_structures_with_proto.1/from(b_model_rotary_emb_inv_freq)
init: init7_s1_1: int64: 1                                             -- GraphBuilder._update_structures_with_proto.1/from(init7_s1_1)##GraphBuilder.compute_constant/from(init7_s1_1)##GraphBuilder._update_structures_with_proto.1/from(init7_s1_1)
init: init7_s_1: ?: ?                                                  -- GraphBuilder._update_structures_with_proto.1/from(init7_s_1)
init: init7_s1_0: ?: ?                                                 -- GraphBuilder._update_structures_with_proto.1/from(init7_s1_0)
init: init7_s_0: ?: ?                                                  -- GraphBuilder._update_structures_with_proto.1/from(init7_s_0)
init: init7_s2_-1_1: int64: 2                                          -- GraphBuilder._update_structures_with_proto.1/from(init7_s2_-1_1)##GraphBuilder.compute_constant/from(init7_s2_-1_1)##GraphBuilder._update_structures_with_proto.1/from(init7_s2_-1_1)
init: init1_s_: ?: ?                                                   -- GraphBuilder._update_structures_with_proto.1/from(init1_s_)
init: init1_s1_: ?: ?                                                  -- GraphBuilder._update_structures_with_proto.1/from(init1_s1_)
init: init7_s1_-1: ?: ?                                                -- GraphBuilder._update_structures_with_proto.1/from(init7_s1_-1)
init: init7_s1_80: ?: ?                                                -- GraphBuilder._update_structures_with_proto.1/from(init7_s1_80)
init: init1_s_3: ?: ?                                                  -- GraphBuilder._update_structures_with_proto.1/from(init1_s_3)
init: init1_s1_5: ?: ?                                                 -- GraphBuilder._update_structures_with_proto.1/from(init1_s1_5)
init: init1_s_4: ?: ?                                                  -- GraphBuilder._update_structures_with_proto.1/from(init1_s_4)
init: init1_s_5: ?: ?                                                  -- GraphBuilder._update_structures_with_proto.1/from(init1_s_5)
init: init1_s_6: ?: ?                                                  -- GraphBuilder._update_structures_with_proto.1/from(init1_s_6)
init: init7_s2_0_1: ?: ?                                               -- GraphBuilder._update_structures_with_proto.1/from(init7_s2_0_1)
init: init7_s2_1_2: ?: ?                                               -- GraphBuilder._update_structures_with_proto.1/from(init7_s2_1_2)
init: init1_s2560_: ?: ?                                               -- GraphBuilder._update_structures_with_proto.1/from(init1_s2560_)
init: init1_s2560_2: ?: ?                                              -- GraphBuilder._update_structures_with_proto.1/from(init1_s2560_2)
init: init1_s2560_3: ?: ?                                              -- GraphBuilder._update_structures_with_proto.1/from(init1_s2560_3)
init: init1_s2560_4: ?: ?                                              -- GraphBuilder._update_structures_with_proto.1/from(init1_s2560_4)
init: init1_s2560_5: ?: ?                                              -- GraphBuilder._update_structures_with_proto.1/from(init1_s2560_5)
init: init1_s2560_6: ?: ?                                              -- GraphBuilder._update_structures_with_proto.1/from(init1_s2560_6)
init: init1_s1_6: ?: ?                                                 -- GraphBuilder._update_structures_with_proto.1/from(init1_s1_6)
init: init7_s2_32_48: ?: ?                                             -- GraphBuilder._update_structures_with_proto.1/from(init7_s2_32_48)
init: init7_s2_16_16: ?: ?                                             -- GraphBuilder._update_structures_with_proto.1/from(init7_s2_16_16)
init: init7_s2_-1_2560: int64: 2                                       -- GraphBuilder._update_structures_with_proto.1/from(init7_s2_-1_2560)##GraphBuilder.compute_constant/from(init7_s2_-1_2560)##GraphBuilder._update_structures_with_proto.1/from(init7_s2_-1_2560)
init: init7_s1_-12: ?: ?                                               -- GraphBuilder._update_structures_with_proto.1/from(init7_s1_-12)
init: model.embed_tokens.weight: ?: ?                                  -- GraphBuilder._update_structures_with_proto.1/from(model.embed_tokens.weight)
init: model.layers.0.self_attn.q_proj.weight: ?: ?                     -- GraphBuilder._update_structures_with_proto.1/from(model.layers.0.self_attn.q_proj.weight)
init: model.layers.0.self_attn.k_proj.weight: ?: ?                     -- GraphBuilder._update_structures_with_proto.1/from(model.layers.0.self_attn.k_proj.weight)
init: model.layers.0.self_attn.v_proj.weight: ?: ?                     -- GraphBuilder._update_structures_with_proto.1/from(model.layers.0.self_attn.v_proj.weight)
init: model.layers.0.self_attn.dense.weight: ?: ?                      -- GraphBuilder._update_structures_with_proto.1/from(model.layers.0.self_attn.dense.weight)
init: model.layers.0.mlp.fc1.weight: ?: ?                              -- GraphBuilder._update_structures_with_proto.1/from(model.layers.0.mlp.fc1.weight)
init: model.layers.0.mlp.fc2.weight: ?: ?                              -- GraphBuilder._update_structures_with_proto.1/from(model.layers.0.mlp.fc2.weight)
init: model.layers.1.self_attn.q_proj.weight: ?: ?                     -- GraphBuilder._update_structures_with_proto.1/from(model.layers.1.self_attn.q_proj.weight)
init: model.layers.1.self_attn.k_proj.weight: ?: ?                     -- GraphBuilder._update_structures_with_proto.1/from(model.layers.1.self_attn.k_proj.weight)
init: model.layers.1.self_attn.v_proj.weight: ?: ?                     -- GraphBuilder._update_structures_with_proto.1/from(model.layers.1.self_attn.v_proj.weight)
init: model.layers.1.self_attn.dense.weight: ?: ?                      -- GraphBuilder._update_structures_with_proto.1/from(model.layers.1.self_attn.dense.weight)
init: model.layers.1.mlp.fc1.weight: ?: ?                              -- GraphBuilder._update_structures_with_proto.1/from(model.layers.1.mlp.fc1.weight)
init: model.layers.1.mlp.fc2.weight: ?: ?                              -- GraphBuilder._update_structures_with_proto.1/from(model.layers.1.mlp.fc2.weight)
init: lm_head.weight: ?: ?                                             -- GraphBuilder._update_structures_with_proto.1/from(lm_head.weight)
input:: input_ids                                                               |T7: batch x seq_length
input:: attention_mask                                                          |T7: batch x channel
input:: past_key_values_key_cache_0                                             |T1: batch x 32 x cache_length x 80
input:: past_key_values_key_cache_1                                             |T1: batch x 32 x cache_length x 80
input:: past_key_values_value_cache_0                                           |T1: batch x 32 x cache_length x 80
input:: past_key_values_value_cache_1                                           |T1: batch x 32 x cache_length x 80
Shape: input_ids -> _shape_input_ids0                                           |T7: 1                        - sym_size_int
Shape: input_ids -> _shape_input_ids02                                          |T7: 1                        - sym_size_int3
Squeeze: _shape_input_ids02 -> sym_size_int_20                                  |T7:                          - sym_size_int4
Unsqueeze: sym_size_int_20, init7_s1_0 -> _onx_unsqueeze_sym_size_int_200       |T7: 1                        - _mkshape1_sym_size_int_20
Shape: past_key_values_key_cache_0 -> _shape_past_key_values_key_cache_00       |T7: 1                        - sym_size_int5
Squeeze: _shape_past_key_values_key_cache_00 -> sym_size_int_21                 |T7:                          - sym_size_int6
Gather: model.embed_tokens.weight, input_ids -> embedding                       |T1: batch x seq_length x 2560- embedding
Add: sym_size_int_21, sym_size_int_20 -> add                                    |T7:                          - add3
Range: sym_size_int_21, add, init7_s_1 -> arange                                |T7: seq_length               - arange
Unsqueeze: arange, init7_s1_0 -> unsqueeze                                      |T7: 1 x seq_length           - unsqueeze
Unsqueeze: add, init7_s1_0 -> _onx_unsqueeze_add0                               |T7: 1                        - _mkshape1_add
Concat: _onx_unsqueeze_sym_size_int_200, _onx_unsqueeze_add0 -> _onx_concat_unsqueeze_sym_size_int_2000                       |T7: 2- _mkshape_add
ConstantOfShape: _onx_concat_unsqueeze_sym_size_int_2000 -> full                |T1: seq_length x channel     - fullD2
Trilu: full, init7_s_1 -> triu                                                  |T1: seq_length x channel     - triu
Range: init7_s_0, add, init7_s_1 -> arange_1                                    |T7: channel                  - arange2
Reshape: arange, init7_s2_-1_1 -> reshape                                       |T7: seq_length x 1           - reshape
Greater: arange_1, reshape -> gt                                                |T9: seq_length x channel     - gt_Tensor
Cast: gt -> _onx_cast_gt0                                                       |T1: seq_length x channel     - mul__Tensor
Mul: triu, _onx_cast_gt0 -> _onx_mul_triu0                                      |T1: seq_length x channel     - mul__Tensor2
Unsqueeze: _onx_mul_triu0, init7_s2_0_1 -> unsqueeze_2                          |T1: 1 x 1 x seq_length x channel- UnsqueezeUnsqueezePattern--unsqueeze2
Shape: unsqueeze_2 -> _shape_unsqueeze_20                                       |T7: 1                        - expand_B
Shape: unsqueeze_2 -> _shape_unsqueeze_202                                      |T7: 1                        - expand_B2
Concat: _shape_input_ids0, init7_s1_1, _shape_unsqueeze_20, _shape_unsqueeze_202 -> _onx_concat_unsqueeze_sym_size_int_1900                                           |T7: 4- _mkshape__shape_unsqueeze_202
Expand: unsqueeze_2, _onx_concat_unsqueeze_sym_size_int_1900 -> expand          |T1: batch x 1 x seq_length x channel- expand_B_neg
Unsqueeze: attention_mask, init7_s2_1_2 -> unsqueeze_4                          |T7: batch x 1 x 1 x channel  - UnsqueezeUnsqueezePattern--unsqueeze4
Cast: unsqueeze_4 -> _onx_cast_unsqueeze_40                                     |T1: batch x 1 x 1 x channel  - Opset
Add: expand, _onx_cast_unsqueeze_40 -> add_2                                    |T1: batch x 1 x seq_length x channel- add_Tensor
Reshape: init1_s_, init7_s1_1 -> _reshape_init1_s_0                             |T1: 1                        - Opset2
Equal: add_2, _reshape_init1_s_0 -> eq_7                                        |T9: batch x 1 x seq_length x channel- eq
Where: eq_7, init1_s1_, expand -> masked_fill                                   |T1: batch x 1 x seq_length x channel- masked_fill_Scalar
Shape: expand -> _shape_clone0                                                  |T7: 4                        - aten_meth_expand_as
Expand: masked_fill, _shape_clone0 -> expand_as                                 |T1: batch x 1 x seq_length x channel- aten_meth_expand_as2
submod_3[local_functions]: b_model_rotary_emb_inv_freq, unsqueeze -> wrap_with_set_grad_enabled#0, wrap_with_set_grad_enabled#1                                               |T1: 1 x seq_length x 32 T1: 1 x seq_length x 32- wrap_with_set_grad_enabled
Unsqueeze: wrap_with_set_grad_enabled#0, init7_s1_1 -> unsqueeze_8              |T1: 1 x 1 x seq_length x 32  - unsqueeze6
LayerNormalization: embedding, init1_s2560_, init1_s2560_2 -> _onx_div_sub_dropout00    |T1: batch x seq_length x 2560- LayerNormalizationPattern--layer_norm4
Transpose: model.layers.0.self_attn.q_proj.weight -> _onx_transpose_p_model_layers_0_self_attn_q_proj_weight0                             |T1: 2560 x 2560- linear
MatMul: _onx_div_sub_dropout00, _onx_transpose_p_model_layers_0_self_attn_q_proj_weight0 -> _onx_matmul_layer_norm0                                   |T1: batch x seq_length x 2560- Opset3
Concat: _shape_input_ids0, _onx_unsqueeze_sym_size_int_200, init7_s1_-1, init7_s1_80 -> _onx_concat_unsqueeze_sym_size_int_19020                                                |T7: 4- _mkshape_sym_size_int_20
Reshape: _onx_matmul_layer_norm0, _onx_concat_unsqueeze_sym_size_int_19020 -> view  |T1: batch x seq_length x 32 x 80- view
Transpose: view -> transpose_1                                                  |T1: batch x 32 x seq_length x 80- transpose_int
Split: transpose_1, init7_s2_32_48 -> slice_17, slice_18                        |T1: batch x 32 x seq_length x 32 T1: batch x 32 x seq_length x 48- SlicesSplitPattern--slice_Tensor
Split: slice_17, init7_s2_16_16 -> slice_21, slice_22                           |T1: batch x 32 x seq_length x 16 T1: batch x 32 x seq_length x 16- SlicesSplitPattern--slice_Tensor5
Transpose: model.layers.0.self_attn.k_proj.weight -> _onx_transpose_p_model_layers_0_self_attn_k_proj_weight0                             |T1: 2560 x 2560- linear2
MatMul: _onx_div_sub_dropout00, _onx_transpose_p_model_layers_0_self_attn_k_proj_weight0 -> _onx_matmul_layer_norm02                                    |T1: batch x seq_length x 2560- Opset5
Reshape: _onx_matmul_layer_norm02, _onx_concat_unsqueeze_sym_size_int_19020 -> view_1     |T1: batch x seq_length x 32 x 80- view2
Transpose: view_1 -> transpose_2                                                |T1: batch x 32 x seq_length x 80- transpose_int2
Split: transpose_2, init7_s2_32_48 -> slice_19, slice_20                        |T1: batch x 32 x seq_length x 32 T1: batch x 32 x seq_length x 48- SlicesSplitPattern--slice_Tensor3
Split: slice_19, init7_s2_16_16 -> slice_23, slice_24                           |T1: batch x 32 x seq_length x 16 T1: batch x 32 x seq_length x 16- SlicesSplitPattern--slice_Tensor7
Transpose: model.layers.0.self_attn.v_proj.weight -> _onx_transpose_p_model_layers_0_self_attn_v_proj_weight0                             |T1: 2560 x 2560- linear3
MatMul: _onx_div_sub_dropout00, _onx_transpose_p_model_layers_0_self_attn_v_proj_weight0 -> _onx_matmul_layer_norm03                                    |T1: batch x seq_length x 2560- Opset7
Reshape: _onx_matmul_layer_norm03, _onx_concat_unsqueeze_sym_size_int_19020 -> view_2     |T1: batch x seq_length x 32 x 80- view3
Transpose: view_2 -> transpose_3                                                |T1: batch x 32 x seq_length x 80- transpose_int3
Unsqueeze: wrap_with_set_grad_enabled#1, init7_s1_1 -> unsqueeze_9              |T1: 1 x 1 x seq_length x 32  - unsqueeze7
Mul: slice_17, unsqueeze_8 -> mul_2                                             |T1: batch x 32 x seq_length x 32- mul_Tensor
Neg: slice_22 -> neg                                                            |T1: batch x 32 x seq_length x 16- neg
Concat: neg, slice_21 -> cat_1                                                  |T1: batch x 32 x seq_length x 32- cat
Mul: cat_1, unsqueeze_9 -> mul_3                                                |T1: batch x 32 x seq_length x 32- mul_Tensor2
Add: mul_2, mul_3 -> add_3                                                      |T1: batch x 32 x seq_length x 32- add_Tensor2
Mul: slice_19, unsqueeze_8 -> mul_4                                             |T1: batch x 32 x seq_length x 32- mul_Tensor3
Neg: slice_24 -> neg_1                                                          |T1: batch x 32 x seq_length x 16- neg2
Concat: neg_1, slice_23 -> cat_2                                                |T1: batch x 32 x seq_length x 32- cat2
Mul: cat_2, unsqueeze_9 -> mul_5                                                |T1: batch x 32 x seq_length x 32- mul_Tensor4
Add: mul_4, mul_5 -> add_4                                                      |T1: batch x 32 x seq_length x 32- add_Tensor3
Concat: add_3, slice_18 -> cat_3                                                |T1: batch x 32 x seq_length x 80- cat3
Concat: add_4, slice_20 -> cat_4                                                |T1: batch x 32 x seq_length x 80- cat4
Concat: past_key_values_key_cache_0, cat_4 -> output_1                          |T1: batch x 32 x cache_length+seq_length x 80- cat5
Concat: past_key_values_value_cache_0, transpose_3 -> output_3                  |T1: batch x 32 x cache_length+seq_length x 80- cat6
Transpose: output_1 -> _onx_transpose_cat_50                                    |T1: batch x 32 x 80 x cache_length+seq_length- aten_scaled_dot_product_attention
MatMul: cat_3, _onx_transpose_cat_50 -> MulMulMatMulPattern__onx_matmul_mul_cat_300   |T1: batch x 32 x seq_length x cache_length+seq_length- MulMulMatMulPattern--aten_scaled_dot_product_attention5-1
Mul: MulMulMatMulPattern__onx_matmul_mul_cat_300, init1_s1_6 -> _onx_matmul_mul_cat_300       |T1: batch x 32 x seq_length x cache_length+seq_length- MulMulMatMulPattern--aten_scaled_dot_product_attention5-2
Add: _onx_matmul_mul_cat_300, expand_as -> _onx_add_matmul_mul_cat_3000         |T1: batch x 32 x seq_length x cache_length+seq_length- aten_scaled_dot_product_attention6
Softmax: _onx_add_matmul_mul_cat_3000 -> _onx_softmax_add_matmul_mul_cat_30000  |T1: batch x 32 x seq_length x cache_length+seq_length- aten_scaled_dot_product_attention7
MatMul: _onx_softmax_add_matmul_mul_cat_30000, output_3 -> scaled_dot_product_attention       |T1: batch x 32 x seq_length x 80- aten_scaled_dot_product_attention8
Transpose: scaled_dot_product_attention -> transpose_4                          |T1: batch x seq_length x 32 x 80- transpose_int4
Concat: _shape_input_ids0, _onx_unsqueeze_sym_size_int_200, init7_s1_-1 -> _onx_concat_unsqueeze_sym_size_int_19030                                   |T7: 3- _mkshape_sym_size_int_202
Reshape: transpose_4, _onx_concat_unsqueeze_sym_size_int_19030 -> reshape_1     |T1: batch x seq_length x 2560- reshape2
Transpose: model.layers.0.mlp.fc1.weight -> _onx_transpose_p_model_layers_0_mlp_fc1_weight0           |T1: 2560 x 10240- linear5
MatMul: _onx_div_sub_dropout00, _onx_transpose_p_model_layers_0_mlp_fc1_weight0 -> _onx_matmul_layer_norm04                           |T1: batch x seq_length x 10240- Opset11
Reshape: init1_s_3, init7_s1_1 -> _reshape_init1_s_30                           |T1: 1                        - mul_Tensor5
Mul: _onx_matmul_layer_norm04, _reshape_init1_s_30 -> _onx_mul_linear_40        |T1: batch x seq_length x 10240- mul_Tensor6
Pow: _onx_matmul_layer_norm04, init1_s1_5 -> pow_1                              |T1: batch x seq_length x 10240- pow_Tensor_Scalar
Reshape: init1_s_4, init7_s1_1 -> _reshape_init1_s_40                           |T1: 1                        - mul_Tensor8
Mul: pow_1, _reshape_init1_s_40 -> _onx_mul_pow_10                              |T1: batch x seq_length x 10240- mul_Tensor9
Add: _onx_matmul_layer_norm04, _onx_mul_pow_10 -> add_6                         |T1: batch x seq_length x 10240- add_Tensor4
Reshape: init1_s_5, init7_s1_1 -> _reshape_init1_s_50                           |T1: 1                        - mul_Tensor11
Mul: add_6, _reshape_init1_s_50 -> _onx_mul_add_60                              |T1: batch x seq_length x 10240- mul_Tensor12
Tanh: _onx_mul_add_60 -> tanh                                                   |T1: batch x seq_length x 10240- tanh
Reshape: init1_s_6, init7_s1_1 -> _reshape_init1_s_60                           |T1: 1                        - Opset13
Add: tanh, _reshape_init1_s_60 -> add_7                                         |T1: batch x seq_length x 10240- add_Tensor5
Mul: _onx_mul_linear_40, add_7 -> mul_9                                         |T1: batch x seq_length x 10240- mul_Tensor14
Transpose: model.layers.0.mlp.fc2.weight -> _onx_transpose_p_model_layers_0_mlp_fc2_weight0           |T1: 10240 x 2560- linear6
MatMul: mul_9, _onx_transpose_p_model_layers_0_mlp_fc2_weight0 -> _onx_matmul_mul_90    |T1: batch x seq_length x 2560- Opset14
Reshape: reshape_1, init7_s2_-1_2560 -> MatMulAddPattern--reshape_1             |T1: batch*seq_length x 2560  - MatMulAddPattern--Opset9
Reshape: _onx_matmul_mul_90, init7_s2_-1_2560 -> MatMulAddPattern--reshape_12   |T1: batch*seq_length x 2560  - MatMulAddPattern--Opset92
Shape: reshape_1 -> MatMulAddPattern--reshape_14                                |T7: 2                        - MatMulAddPattern--Opset93
Concat: MatMulAddPattern--reshape_14, init7_s1_-12 -> MatMulAddPattern--reshape_15  |T7: 3                    - MatMulAddPattern--Opset94
Gemm: MatMulAddPattern--reshape_1, model.layers.0.self_attn.dense.weight, MatMulAddPattern--reshape_12 -> MatMulAddPattern--reshape_13                                                      |T1: batch*seq_length x 2560- GemmTransposePattern--MatMulAddPattern--Opset962
Reshape: MatMulAddPattern--reshape_13, MatMulAddPattern--reshape_15 -> add_8    |T1: batch x seq_length x 2560- MatMulAddPattern--Opset95
Add: add_8, embedding -> add_9                                                  |T1: batch x seq_length x 2560- add_Tensor7
LayerNormalization: add_9, init1_s2560_3, init1_s2560_4 -> _onx_div_sub_add_900 |T1: batch x seq_length x 2560- LayerNormalizationPattern--layer_norm13
Transpose: model.layers.1.self_attn.q_proj.weight -> _onx_transpose_p_model_layers_1_self_attn_q_proj_weight0                             |T1: 2560 x 2560- linear7
MatMul: _onx_div_sub_add_900, _onx_transpose_p_model_layers_1_self_attn_q_proj_weight0 -> _onx_matmul_layer_norm_10                                   |T1: batch x seq_length x 2560- Opset16
Reshape: _onx_matmul_layer_norm_10, _onx_concat_unsqueeze_sym_size_int_19020 -> view_3      |T1: batch x seq_length x 32 x 80- view4
Transpose: view_3 -> transpose_5                                                |T1: batch x 32 x seq_length x 80- transpose_int5
Split: transpose_5, init7_s2_32_48 -> slice_28, slice_29                        |T1: batch x 32 x seq_length x 32 T1: batch x 32 x seq_length x 48- SlicesSplitPattern--slice_Tensor9
Split: slice_28, init7_s2_16_16 -> slice_32, slice_33                           |T1: batch x 32 x seq_length x 16 T1: batch x 32 x seq_length x 16- SlicesSplitPattern--slice_Tensor13
Transpose: model.layers.1.self_attn.k_proj.weight -> _onx_transpose_p_model_layers_1_self_attn_k_proj_weight0                             |T1: 2560 x 2560- linear8
MatMul: _onx_div_sub_add_900, _onx_transpose_p_model_layers_1_self_attn_k_proj_weight0 -> _onx_matmul_layer_norm_102                                    |T1: batch x seq_length x 2560- Opset18
Reshape: _onx_matmul_layer_norm_102, _onx_concat_unsqueeze_sym_size_int_19020 -> view_4       |T1: batch x seq_length x 32 x 80- view5
Transpose: view_4 -> transpose_6                                                |T1: batch x 32 x seq_length x 80- transpose_int6
Split: transpose_6, init7_s2_32_48 -> slice_30, slice_31                        |T1: batch x 32 x seq_length x 32 T1: batch x 32 x seq_length x 48- SlicesSplitPattern--slice_Tensor11
Split: slice_30, init7_s2_16_16 -> slice_34, slice_35                           |T1: batch x 32 x seq_length x 16 T1: batch x 32 x seq_length x 16- SlicesSplitPattern--slice_Tensor15
Transpose: model.layers.1.self_attn.v_proj.weight -> _onx_transpose_p_model_layers_1_self_attn_v_proj_weight0                             |T1: 2560 x 2560- linear9
MatMul: _onx_div_sub_add_900, _onx_transpose_p_model_layers_1_self_attn_v_proj_weight0 -> _onx_matmul_layer_norm_103                                    |T1: batch x seq_length x 2560- Opset20
Reshape: _onx_matmul_layer_norm_103, _onx_concat_unsqueeze_sym_size_int_19020 -> view_5       |T1: batch x seq_length x 32 x 80- view6
Transpose: view_5 -> transpose_7                                                |T1: batch x 32 x seq_length x 80- transpose_int7
Unsqueeze: wrap_with_set_grad_enabled#1, init7_s1_1 -> unsqueeze_11             |T1: 1 x 1 x seq_length x 32  - unsqueeze9
Mul: slice_28, unsqueeze_8 -> mul_10                                            |T1: batch x 32 x seq_length x 32- mul_Tensor15
Neg: slice_33 -> neg_2                                                          |T1: batch x 32 x seq_length x 16- neg3
Concat: neg_2, slice_32 -> cat_7                                                |T1: batch x 32 x seq_length x 32- cat7
Mul: cat_7, unsqueeze_11 -> mul_11                                              |T1: batch x 32 x seq_length x 32- mul_Tensor16
Add: mul_10, mul_11 -> add_10                                                   |T1: batch x 32 x seq_length x 32- add_Tensor8
Mul: slice_30, unsqueeze_8 -> mul_12                                            |T1: batch x 32 x seq_length x 32- mul_Tensor17
Neg: slice_35 -> neg_3                                                          |T1: batch x 32 x seq_length x 16- neg4
Concat: neg_3, slice_34 -> cat_8                                                |T1: batch x 32 x seq_length x 32- cat8
Mul: cat_8, unsqueeze_11 -> mul_13                                              |T1: batch x 32 x seq_length x 32- mul_Tensor18
Add: mul_12, mul_13 -> add_11                                                   |T1: batch x 32 x seq_length x 32- add_Tensor9
Concat: add_10, slice_29 -> cat_9                                               |T1: batch x 32 x seq_length x 80- cat9
Concat: add_11, slice_31 -> cat_10                                              |T1: batch x 32 x seq_length x 80- cat10
Concat: past_key_values_key_cache_1, cat_10 -> output_2                         |T1: batch x 32 x cache_length+seq_length x 80- cat11
Concat: past_key_values_value_cache_1, transpose_7 -> output_4                  |T1: batch x 32 x cache_length+seq_length x 80- cat12
Transpose: output_2 -> _onx_transpose_cat_110                                   |T1: batch x 32 x 80 x cache_length+seq_length- aten_scaled_dot_product_attention9
MatMul: cat_9, _onx_transpose_cat_110 -> MulMulMatMulPattern__onx_matmul_mul_cat_900    |T1: batch x 32 x seq_length x cache_length+seq_length- MulMulMatMulPattern--aten_scaled_dot_product_attention13-1
Mul: MulMulMatMulPattern__onx_matmul_mul_cat_900, init1_s1_6 -> _onx_matmul_mul_cat_900       |T1: batch x 32 x seq_length x cache_length+seq_length- MulMulMatMulPattern--aten_scaled_dot_product_attention13-2
Add: _onx_matmul_mul_cat_900, expand_as -> _onx_add_matmul_mul_cat_9000         |T1: batch x 32 x seq_length x cache_length+seq_length- aten_scaled_dot_product_attention14
Softmax: _onx_add_matmul_mul_cat_9000 -> _onx_softmax_add_matmul_mul_cat_90000  |T1: batch x 32 x seq_length x cache_length+seq_length- aten_scaled_dot_product_attention15
MatMul: _onx_softmax_add_matmul_mul_cat_90000, output_4 -> scaled_dot_product_attention_1         |T1: batch x 32 x seq_length x 80- aten_scaled_dot_product_attention16
Transpose: scaled_dot_product_attention_1 -> transpose_8                        |T1: batch x seq_length x 32 x 80- transpose_int8
Reshape: transpose_8, _onx_concat_unsqueeze_sym_size_int_19030 -> reshape_2     |T1: batch x seq_length x 2560- reshape3
Transpose: model.layers.1.mlp.fc1.weight -> _onx_transpose_p_model_layers_1_mlp_fc1_weight0           |T1: 2560 x 10240- linear11
MatMul: _onx_div_sub_add_900, _onx_transpose_p_model_layers_1_mlp_fc1_weight0 -> _onx_matmul_layer_norm_104                           |T1: batch x seq_length x 10240- Opset24
Reshape: init1_s_3, init7_s1_1 -> _reshape_init1_s_302                          |T1: 1                        - mul_Tensor19
Mul: _onx_matmul_layer_norm_104, _reshape_init1_s_302 -> _onx_mul_linear_100    |T1: batch x seq_length x 10240- mul_Tensor20
Pow: _onx_matmul_layer_norm_104, init1_s1_5 -> pow_2                            |T1: batch x seq_length x 10240- pow_Tensor_Scalar2
Reshape: init1_s_4, init7_s1_1 -> _reshape_init1_s_402                          |T1: 1                        - mul_Tensor22
Mul: pow_2, _reshape_init1_s_402 -> _onx_mul_pow_20                             |T1: batch x seq_length x 10240- mul_Tensor23
Add: _onx_matmul_layer_norm_104, _onx_mul_pow_20 -> add_12                      |T1: batch x seq_length x 10240- add_Tensor10
Reshape: init1_s_5, init7_s1_1 -> _reshape_init1_s_502                          |T1: 1                        - mul_Tensor25
Mul: add_12, _reshape_init1_s_502 -> _onx_mul_add_120                           |T1: batch x seq_length x 10240- mul_Tensor26
Tanh: _onx_mul_add_120 -> tanh_1                                                |T1: batch x seq_length x 10240- tanh2
Reshape: init1_s_6, init7_s1_1 -> _reshape_init1_s_602                          |T1: 1                        - Opset26
Add: tanh_1, _reshape_init1_s_602 -> add_13                                     |T1: batch x seq_length x 10240- add_Tensor11
Mul: _onx_mul_linear_100, add_13 -> mul_17                                      |T1: batch x seq_length x 10240- mul_Tensor28
Transpose: model.layers.1.mlp.fc2.weight -> _onx_transpose_p_model_layers_1_mlp_fc2_weight0           |T1: 10240 x 2560- linear12
MatMul: mul_17, _onx_transpose_p_model_layers_1_mlp_fc2_weight0 -> _onx_matmul_mul_170      |T1: batch x seq_length x 2560- Opset27
Reshape: reshape_2, init7_s2_-1_2560 -> MatMulAddPattern--reshape_2             |T1: batch*seq_length x 2560  - MatMulAddPattern--Opset22
Reshape: _onx_matmul_mul_170, init7_s2_-1_2560 -> MatMulAddPattern--reshape_22  |T1: batch*seq_length x 2560  - MatMulAddPattern--Opset222
Shape: reshape_2 -> MatMulAddPattern--reshape_24                                |T7: 2                        - MatMulAddPattern--Opset223
Concat: MatMulAddPattern--reshape_24, init7_s1_-12 -> MatMulAddPattern--reshape_25  |T7: 3                    - MatMulAddPattern--Opset224
Gemm: MatMulAddPattern--reshape_2, model.layers.1.self_attn.dense.weight, MatMulAddPattern--reshape_22 -> MatMulAddPattern--reshape_23                                                      |T1: batch*seq_length x 2560- GemmTransposePattern--MatMulAddPattern--Opset2262
Reshape: MatMulAddPattern--reshape_23, MatMulAddPattern--reshape_25 -> add_14   |T1: batch x seq_length x 2560- MatMulAddPattern--Opset225
Add: add_14, add_9 -> add_15                                                    |T1: batch x seq_length x 2560- add_Tensor13
LayerNormalization: add_15, init1_s2560_5, init1_s2560_6 -> _onx_div_sub_add_1500 |T1: batch x seq_length x 2560- LayerNormalizationPattern--layer_norm22
Transpose: lm_head.weight -> _onx_transpose_p_lm_head_weight0                   |T1: 2560 x 51200             - linear13
MatMul: _onx_div_sub_add_1500, _onx_transpose_p_lm_head_weight0 -> output_0     |T1: batch x seq_length x 51200- Opset29
output:: output_0                                                               |T1: batch x seq_length x 51200
output:: output_1                                                               |T1: batch x 32 x cache_length+seq_length x 80
output:: output_2                                                               |T1: batch x 32 x cache_length+seq_length x 80
output:: output_3                                                               |T1: batch x 32 x cache_length+seq_length x 80
output:: output_4                                                               |T1: batch x 32 x cache_length+seq_length x 80

FUNCKEY: ('local_functions.0', 'submod_3')
FUNC submod_3[local_functions.0]: ['expand_1', 'to_1'] -> ['output_0', 'output_1']
  opset: '': 18
  MatMul: expand_1, to_1 -> matmul                                                |:                            - Opset
  Transpose: matmul -> transpose                                                  |:                            - transpose_int
  Concat: transpose, transpose -> cat                                             |:                            - cat
  Cos: cat -> output_0                                                            |T1: batch x seq_length x 51200- cos
  Sin: cat -> output_1                                                            |T1: batch x 32 x cache_length+seq_length x 80- sin

FUNCKEY: ('local_functions', 'submod_3')
FUNC submod_3[local_functions]: ['b_model_rotary_emb_inv_freq', 'unsqueeze'] -> ['output_0', 'output_1']
  opset: '': 18
  opset: local_functions.0: 1
  Constant:  -> init7_s1_1                                                        |T7: 1                        - init2cst
  Constant:  -> init7_s2_0_2                                                      |:                            - init2cst2
  Unsqueeze: b_model_rotary_emb_inv_freq, init7_s2_0_2 -> unsqueeze_6             |:                            - UnsqueezeUnsqueezePattern--unsqueeze
  Unsqueeze: unsqueeze, init7_s1_1 -> unsqueeze_7                                 |:                            - unsqueeze3
  Cast: unsqueeze_7 -> to_1                                                       |:                            - to_dtype2
  submod_3[local_functions.0]: unsqueeze_6, to_1 -> output_0, output_1            |T1: batch x seq_length x 51200 T1: batch x 32 x cache_length+seq_length x 80- wrap_with_autocast

Visually, the exported graph looks as follows.