to_onnx and Phi-2

Exports the Phi-2 model. We use a dummy (untrained) model. The main difficulty is to set the dynamic shapes properly. If there is an issue, you can go to the following line: torch/fx/experimental/symbolic_shapes.py#L5965, look for log.info("set_replacement %s = %s (%s) %s", a, tgt, msg, tgt_bound), and add something like the following before or after it:

if isinstance(tgt, int):
    raise AssertionError(
        f"dynamic shape becomes a constant "
        f"{[a, tgt, type(tgt), msg, tgt_bound]}"
    )

Adding TORCH_LOGS="+dynamo" TORCHDYNAMO_VERBOSE=1 prints out more information about dynamic shapes.

Model

import copy
from typing import Any, Dict
import onnx
import torch
import transformers
from onnx_array_api.plotting.graphviz_helper import plot_dot
from experimental_experiment.helpers import string_type
from experimental_experiment.xbuilder import GraphBuilder, InferShapesOptions
from experimental_experiment.torch_interpreter import to_onnx, ExportOptions


def get_phi2_untrained(batch_size: int = 2, **kwargs) -> Dict[str, Any]:
    """
    Builds an untrained Phi-2 model from its configuration together with
    two sets of dummy inputs and the dynamic shapes used for the export.

    :param batch_size: batch size of the first set of inputs,
        the second set uses ``batch_size + 1``
    :param kwargs: to overwrite the configuration, example ``num_hidden_layers=1``
    :return: dictionary with keys ``inputs``, ``model``,
        ``dynamic_shapes``, ``inputs2``

    The shapes of the dummy cache and the range of the token ids are derived
    from the final configuration, so overriding ``num_attention_heads``,
    ``num_key_value_heads``, ``hidden_size`` or ``vocab_size`` through *kwargs*
    still produces inputs consistent with the model.

    See `Phi-2/config.json
    <https://huggingface.co/microsoft/phi-2/blob/main/config.json>`_.
    """
    config = {
        "_name_or_path": "microsoft/phi-2",
        "architectures": ["PhiForCausalLM"],
        "attention_dropout": 0.0,
        "bos_token_id": 50256,
        "embd_pdrop": 0.0,
        "eos_token_id": 50256,
        "hidden_act": "gelu_new",
        "hidden_size": 2560,
        "initializer_range": 0.02,
        "intermediate_size": 10240,
        "layer_norm_eps": 1e-05,
        "max_position_embeddings": 2048,
        "model_type": "phi",
        "num_attention_heads": 32,
        "num_hidden_layers": 32,
        "num_key_value_heads": 32,
        "partial_rotary_factor": 0.4,
        "qk_layernorm": False,
        "resid_pdrop": 0.1,
        "rope_scaling": None,
        "rope_theta": 10000.0,
        "tie_word_embeddings": False,
        "torch_dtype": "float16",
        "transformers_version": "4.37.0",
        "use_cache": True,
        "vocab_size": 51200,
    }
    config.update(**kwargs)
    conf = transformers.PhiConfig(**config)
    model = transformers.PhiForCausalLM(conf)
    model.eval()

    # Read the dimensions from the resolved configuration rather than
    # hard-coding 32 heads of size 80: they change with *kwargs*.
    n_layers = conf.num_hidden_layers
    n_kv_heads = conf.num_key_value_heads
    head_dim = conf.hidden_size // conf.num_attention_heads
    # 50285 is the historical upper bound of the dummy token ids; it is only
    # lowered when the vocabulary is made smaller than that.
    max_token_id = min(50285, conf.vocab_size)

    def _make_cache(n_rows: int, past_length: int):
        # One random (key, value) pair per layer, each of shape
        # (n_rows, n_kv_heads, past_length, head_dim).
        cache = transformers.cache_utils.DynamicCache(n_layers)
        for layer in range(n_layers):
            cache.update(
                torch.randn(n_rows, n_kv_heads, past_length, head_dim),
                torch.randn(n_rows, n_kv_heads, past_length, head_dim),
                layer,
            )
        return cache

    def _make_inputs(n_rows: int, seq: int, past_length: int, cache) -> Dict[str, Any]:
        # The attention mask covers the cached tokens plus the new ones.
        return dict(
            input_ids=torch.randint(0, max_token_id, (n_rows, seq)).to(torch.int64),
            attention_mask=torch.ones((n_rows, past_length + seq)).to(torch.int64),
            past_key_values=cache,
        )

    # Both caches are created before the token ids to keep the order of the
    # random draws unchanged.
    cache = _make_cache(batch_size, 30)
    cache2 = _make_cache(batch_size + 1, 31)
    inputs = _make_inputs(batch_size, 3, 30, cache)
    # The second set changes every dynamic dimension (batch, sequence length,
    # cache length) compared to the first one.
    inputs2 = _make_inputs(batch_size + 1, 4, 31, cache2)

    batch = torch.export.Dim("batch")
    seq_length = torch.export.Dim("seq_length")
    cache_length = torch.export.Dim("cache_length")
    shapes = {
        "input_ids": {0: batch, 1: seq_length},
        "attention_mask": {
            0: batch,
            1: torch.export.Dim.DYNAMIC,  # cache_length + seq_length
        },
        # One list for the keys, one for the values, one entry per layer.
        "past_key_values": [
            [{0: batch, 2: cache_length} for _ in range(n_layers)],
            [{0: batch, 2: cache_length} for _ in range(n_layers)],
        ],
    }

    return dict(inputs=inputs, model=model, dynamic_shapes=shapes, inputs2=inputs2)


# A two-layer variant keeps the example small.
data = get_phi2_untrained(num_hidden_layers=2)
model, inputs, dynamic_shapes = (
    data["model"],
    data["inputs"],
    data["dynamic_shapes"],
)

# Show the inputs (types and shapes) and the dynamic shapes given to the exporter.
print("inputs", string_type(inputs, with_shape=True))
print("dynamic_shapes", dynamic_shapes)
inputs dict(input_ids:T7s2x3,attention_mask:T7s2x33,past_key_values:DynamicCache(key_cache=#2[T1s2x32x30x80,T1s2x32x30x80], value_cache=#2[T1s2x32x30x80,T1s2x32x30x80]))
dynamic_shapes {'input_ids': {0: <class '__main__.batch'>, 1: <class '__main__.seq_length'>}, 'attention_mask': {0: <class '__main__.batch'>, 1: <_DimHint.DYNAMIC: 3>}, 'past_key_values': [[{0: <class '__main__.batch'>, 2: <class '__main__.cache_length'>}, {0: <class '__main__.batch'>, 2: <class '__main__.cache_length'>}], [{0: <class '__main__.batch'>, 2: <class '__main__.cache_length'>}, {0: <class '__main__.batch'>, 2: <class '__main__.cache_length'>}]]}

Let’s check it is working. We need to copy the inputs before calling the model because it modifies them, and they would no longer be properly set up when the export starts.

CausalLMOutputWithPast(loss=None, logits=tensor([[[ 0.1624, -1.1595,  0.1948,  ..., -0.8213,  1.5715, -0.7277],
         [-0.3277, -0.8472, -0.5322,  ..., -0.0498, -0.2905, -0.5375],
         [ 1.0065, -1.2420, -0.3048,  ..., -0.4370,  0.8701, -0.5533]],

        [[-0.3164, -1.2388, -1.3446,  ..., -1.5189, -1.0894,  0.1812],
         [ 1.1359, -0.5557, -0.4659,  ..., -1.2650,  1.8972, -0.3178],
         [-0.2626, -1.5779,  0.2122,  ..., -1.0226, -0.3149, -0.6777]]],
       grad_fn=<ViewBackward0>), past_key_values=DynamicCache(), hidden_states=None, attentions=None)

Export

We try to export with experimental_experiment.torch_interpreter.to_onnx().

# First export attempt, without any patch; the model receives a deep copy of
# the inputs so that `inputs` itself is left untouched.
to_onnx(model, (), kwargs=copy.deepcopy(inputs), dynamic_shapes=dynamic_shapes)

This fails because of dynamic shapes issues.

Constraints violated (batch, seq_length)! For more information,
run with TORCH_LOGS="+dynamic".
Cannot associate shape
    [[{0: <class '__main__.batch'>, 2: <class '__main__.cache_length'>},
      {0: <class '__main__.batch'>, 2: <class '__main__.cache_length'>}],
     [{0: <class '__main__.batch'>, 2: <class '__main__.cache_length'>},
      {0: <class '__main__.batch'>, 2: <class '__main__.cache_length'>}]]
    specified at `dynamic_shapes['past_key_values']`
        to non-tensor type <class 'transformers.cache_utils.DynamicCache'>
        at `inputs['past_key_values']` (expected None)

The export fails for a couple of reasons, but it is possible to patch the code to make it work. All those modifications are put in place by onnx_export_errors and reverted after the export is done. Among other things, this function registers serialization functions as shown in the example Export a model using a custom type as input.

from experimental_experiment.torch_interpreter.onnx_export_errors import (
    bypass_export_some_errors,
)

# The export has to run inside the context: the patches are only active there.
with bypass_export_some_errors(
    patch_transformers=True, replace_dynamic_cache=True, verbose=1
) as modificator:
    print("inputs before", string_type(inputs, with_shape=True))
    # The modificator converts the inputs into the types the patched code expects.
    inputs = modificator(inputs)
    print("inputs after", string_type(inputs, with_shape=True))

    # To try the torch.export step alone (for debugging):
    # ep = torch.export.export(model, (), inputs, dynamic_shapes=dynamic_shapes, strict=False)
    export_options = ExportOptions(strict=False)
    large_onx = to_onnx(
        model,
        (),
        inputs,
        dynamic_shapes=dynamic_shapes,
        export_options=export_options,
        large_model=True,
    )

    # All the tensors are stored in a single file.
    onnx_path = "plot_exporter_recipes_c_phi2.onnx"
    large_onx.save(onnx_path, all_tensors_to_one_file=True)
[bypass_export_some_errors] replace torch.jit.isinstance, torch._dynamo.mark_static_address
[bypass_export_some_errors] register MambaCache
[bypass_export_some_errors] register DynamicCache
[bypass_export_some_errors] register patched_DynamicCache
[bypass_export_some_errors] patch sympy
[bypass_export_some_errors] patch pytorch
[bypass_export_some_errors] catch produce_guards_and_solve_constraints
[bypass_export_some_errors] patch transformers
[bypass_export_some_errors] replace DynamicCache
inputs before dict(input_ids:T7s2x3,attention_mask:T7s2x33,past_key_values:DynamicCache(key_cache=#2[T1s2x32x30x80,T1s2x32x30x80], value_cache=#2[T1s2x32x30x80,T1s2x32x30x80]))
inputs after dict(input_ids:T7s2x3,attention_mask:T7s2x33,past_key_values:patched_DynamicCache(key_cache=#2[T1s2x32x30x80,T1s2x32x30x80], value_cache=#2[T1s2x32x30x80,T1s2x32x30x80]))
[_catch_produce_guards_and_solve_constraints] ERRORproduce_guards_and_solve_constraints failed, use SKIP_SOLVE_CONSTRAINTS=0 to avoid skipping
fake_mode=<torch._subclasses.fake_tensor.FakeTensorMode object at 0x7f2f353770b0>
dynamic_shapes={'input_ids': {0: <class '__main__.batch'>, 1: <class '__main__.seq_length'>}, 'attention_mask': {0: <class '__main__.batch'>, 1: <_DimHint.DYNAMIC: 3>}, 'past_key_values': [[{0: <class '__main__.batch'>, 2: <class '__main__.cache_length'>}, {0: <class '__main__.batch'>, 2: <class '__main__.cache_length'>}], [{0: <class '__main__.batch'>, 2: <class '__main__.cache_length'>}, {0: <class '__main__.batch'>, 2: <class '__main__.cache_length'>}]]}
equalities_inputs=EqualityConstraint(warn_only=False, source_pairs=[(TensorPropertySource(base=GetItemSource(base=GetItemSource(base=LocalSource(local_name='args', is_input=False, is_derefed_cell_contents=False), index=1, index_is_slice=False), index='attention_mask', index_is_slice=False), prop=<TensorProperty.SIZE: 0>, idx=0), TensorPropertySource(base=GetItemSource(base=GetItemSource(base=LocalSource(local_name='args', is_input=False, is_derefed_cell_contents=False), index=1, index_is_slice=False), index='input_ids', index_is_slice=False), prop=<TensorProperty.SIZE: 0>, idx=0)), (TensorPropertySource(base=GetItemSource(base=GetItemSource(base=GetItemSource(base=GetItemSource(base=LocalSource(local_name='args', is_input=False, is_derefed_cell_contents=False), index=1, index_is_slice=False), index='past_key_values', index_is_slice=False), index='key_cache', index_is_slice=False), index=0, index_is_slice=False), prop=<TensorProperty.SIZE: 0>, idx=0), TensorPropertySource(base=GetItemSource(base=GetItemSource(base=LocalSource(local_name='args', is_input=False, is_derefed_cell_contents=False), index=1, index_is_slice=False), index='input_ids', index_is_slice=False), prop=<TensorProperty.SIZE: 0>, idx=0)), (TensorPropertySource(base=GetItemSource(base=GetItemSource(base=GetItemSource(base=GetItemSource(base=LocalSource(local_name='args', is_input=False, is_derefed_cell_contents=False), index=1, index_is_slice=False), index='past_key_values', index_is_slice=False), index='key_cache', index_is_slice=False), index=1, index_is_slice=False), prop=<TensorProperty.SIZE: 0>, idx=0), TensorPropertySource(base=GetItemSource(base=GetItemSource(base=LocalSource(local_name='args', is_input=False, is_derefed_cell_contents=False), index=1, index_is_slice=False), index='input_ids', index_is_slice=False), prop=<TensorProperty.SIZE: 0>, idx=0)), (TensorPropertySource(base=GetItemSource(base=GetItemSource(base=GetItemSource(base=GetItemSource(base=LocalSource(local_name='args', 
is_input=False, is_derefed_cell_contents=False), index=1, index_is_slice=False), index='past_key_values', index_is_slice=False), index='value_cache', index_is_slice=False), index=0, index_is_slice=False), prop=<TensorProperty.SIZE: 0>, idx=0), TensorPropertySource(base=GetItemSource(base=GetItemSource(base=LocalSource(local_name='args', is_input=False, is_derefed_cell_contents=False), index=1, index_is_slice=False), index='input_ids', index_is_slice=False), prop=<TensorProperty.SIZE: 0>, idx=0)), (TensorPropertySource(base=GetItemSource(base=GetItemSource(base=GetItemSource(base=GetItemSource(base=LocalSource(local_name='args', is_input=False, is_derefed_cell_contents=False), index=1, index_is_slice=False), index='past_key_values', index_is_slice=False), index='value_cache', index_is_slice=False), index=1, index_is_slice=False), prop=<TensorProperty.SIZE: 0>, idx=0), TensorPropertySource(base=GetItemSource(base=GetItemSource(base=LocalSource(local_name='args', is_input=False, is_derefed_cell_contents=False), index=1, index_is_slice=False), index='input_ids', index_is_slice=False), prop=<TensorProperty.SIZE: 0>, idx=0)), (TensorPropertySource(base=GetItemSource(base=GetItemSource(base=GetItemSource(base=GetItemSource(base=LocalSource(local_name='args', is_input=False, is_derefed_cell_contents=False), index=1, index_is_slice=False), index='past_key_values', index_is_slice=False), index='key_cache', index_is_slice=False), index=1, index_is_slice=False), prop=<TensorProperty.SIZE: 0>, idx=2), TensorPropertySource(base=GetItemSource(base=GetItemSource(base=GetItemSource(base=GetItemSource(base=LocalSource(local_name='args', is_input=False, is_derefed_cell_contents=False), index=1, index_is_slice=False), index='past_key_values', index_is_slice=False), index='key_cache', index_is_slice=False), index=0, index_is_slice=False), prop=<TensorProperty.SIZE: 0>, idx=2)), 
(TensorPropertySource(base=GetItemSource(base=GetItemSource(base=GetItemSource(base=GetItemSource(base=LocalSource(local_name='args', is_input=False, is_derefed_cell_contents=False), index=1, index_is_slice=False), index='past_key_values', index_is_slice=False), index='value_cache', index_is_slice=False), index=0, index_is_slice=False), prop=<TensorProperty.SIZE: 0>, idx=2), TensorPropertySource(base=GetItemSource(base=GetItemSource(base=GetItemSource(base=GetItemSource(base=LocalSource(local_name='args', is_input=False, is_derefed_cell_contents=False), index=1, index_is_slice=False), index='past_key_values', index_is_slice=False), index='key_cache', index_is_slice=False), index=0, index_is_slice=False), prop=<TensorProperty.SIZE: 0>, idx=2)), (TensorPropertySource(base=GetItemSource(base=GetItemSource(base=GetItemSource(base=GetItemSource(base=LocalSource(local_name='args', is_input=False, is_derefed_cell_contents=False), index=1, index_is_slice=False), index='past_key_values', index_is_slice=False), index='value_cache', index_is_slice=False), index=1, index_is_slice=False), prop=<TensorProperty.SIZE: 0>, idx=2), TensorPropertySource(base=GetItemSource(base=GetItemSource(base=GetItemSource(base=GetItemSource(base=LocalSource(local_name='args', is_input=False, is_derefed_cell_contents=False), index=1, index_is_slice=False), index='past_key_values', index_is_slice=False), index='key_cache', index_is_slice=False), index=0, index_is_slice=False), prop=<TensorProperty.SIZE: 0>, idx=2))], derived_equalities=[], phantom_symbols=[], relaxed_sources={TensorPropertySource(base=GetItemSource(base=GetItemSource(base=LocalSource(local_name='args', is_input=False, is_derefed_cell_contents=False), index=1, index_is_slice=False), index='attention_mask', index_is_slice=False), prop=<TensorProperty.SIZE: 0>, idx=1)}, _parents={TensorPropertySource(base=GetItemSource(base=GetItemSource(base=LocalSource(local_name='args', is_input=False, is_derefed_cell_contents=False), index=1, 
index_is_slice=False), index='attention_mask', index_is_slice=False), prop=<TensorProperty.SIZE: 0>, idx=0): TensorPropertySource(base=GetItemSource(base=GetItemSource(base=LocalSource(local_name='args', is_input=False, is_derefed_cell_contents=False), index=1, index_is_slice=False), index='input_ids', index_is_slice=False), prop=<TensorProperty.SIZE: 0>, idx=0), TensorPropertySource(base=GetItemSource(base=GetItemSource(base=GetItemSource(base=GetItemSource(base=LocalSource(local_name='args', is_input=False, is_derefed_cell_contents=False), index=1, index_is_slice=False), index='past_key_values', index_is_slice=False), index='key_cache', index_is_slice=False), index=0, index_is_slice=False), prop=<TensorProperty.SIZE: 0>, idx=0): TensorPropertySource(base=GetItemSource(base=GetItemSource(base=LocalSource(local_name='args', is_input=False, is_derefed_cell_contents=False), index=1, index_is_slice=False), index='input_ids', index_is_slice=False), prop=<TensorProperty.SIZE: 0>, idx=0), TensorPropertySource(base=GetItemSource(base=GetItemSource(base=GetItemSource(base=GetItemSource(base=LocalSource(local_name='args', is_input=False, is_derefed_cell_contents=False), index=1, index_is_slice=False), index='past_key_values', index_is_slice=False), index='key_cache', index_is_slice=False), index=1, index_is_slice=False), prop=<TensorProperty.SIZE: 0>, idx=0): TensorPropertySource(base=GetItemSource(base=GetItemSource(base=LocalSource(local_name='args', is_input=False, is_derefed_cell_contents=False), index=1, index_is_slice=False), index='input_ids', index_is_slice=False), prop=<TensorProperty.SIZE: 0>, idx=0), TensorPropertySource(base=GetItemSource(base=GetItemSource(base=GetItemSource(base=GetItemSource(base=LocalSource(local_name='args', is_input=False, is_derefed_cell_contents=False), index=1, index_is_slice=False), index='past_key_values', index_is_slice=False), index='value_cache', index_is_slice=False), index=0, index_is_slice=False), prop=<TensorProperty.SIZE: 0>, 
idx=0): TensorPropertySource(base=GetItemSource(base=GetItemSource(base=LocalSource(local_name='args', is_input=False, is_derefed_cell_contents=False), index=1, index_is_slice=False), index='input_ids', index_is_slice=False), prop=<TensorProperty.SIZE: 0>, idx=0), TensorPropertySource(base=GetItemSource(base=GetItemSource(base=GetItemSource(base=GetItemSource(base=LocalSource(local_name='args', is_input=False, is_derefed_cell_contents=False), index=1, index_is_slice=False), index='past_key_values', index_is_slice=False), index='value_cache', index_is_slice=False), index=1, index_is_slice=False), prop=<TensorProperty.SIZE: 0>, idx=0): TensorPropertySource(base=GetItemSource(base=GetItemSource(base=LocalSource(local_name='args', is_input=False, is_derefed_cell_contents=False), index=1, index_is_slice=False), index='input_ids', index_is_slice=False), prop=<TensorProperty.SIZE: 0>, idx=0), TensorPropertySource(base=GetItemSource(base=GetItemSource(base=GetItemSource(base=GetItemSource(base=LocalSource(local_name='args', is_input=False, is_derefed_cell_contents=False), index=1, index_is_slice=False), index='past_key_values', index_is_slice=False), index='key_cache', index_is_slice=False), index=1, index_is_slice=False), prop=<TensorProperty.SIZE: 0>, idx=2): TensorPropertySource(base=GetItemSource(base=GetItemSource(base=GetItemSource(base=GetItemSource(base=LocalSource(local_name='args', is_input=False, is_derefed_cell_contents=False), index=1, index_is_slice=False), index='past_key_values', index_is_slice=False), index='key_cache', index_is_slice=False), index=0, index_is_slice=False), prop=<TensorProperty.SIZE: 0>, idx=2), TensorPropertySource(base=GetItemSource(base=GetItemSource(base=GetItemSource(base=GetItemSource(base=LocalSource(local_name='args', is_input=False, is_derefed_cell_contents=False), index=1, index_is_slice=False), index='past_key_values', index_is_slice=False), index='value_cache', index_is_slice=False), index=0, index_is_slice=False), 
prop=<TensorProperty.SIZE: 0>, idx=2): TensorPropertySource(base=GetItemSource(base=GetItemSource(base=GetItemSource(base=GetItemSource(base=LocalSource(local_name='args', is_input=False, is_derefed_cell_contents=False), index=1, index_is_slice=False), index='past_key_values', index_is_slice=False), index='key_cache', index_is_slice=False), index=0, index_is_slice=False), prop=<TensorProperty.SIZE: 0>, idx=2), TensorPropertySource(base=GetItemSource(base=GetItemSource(base=GetItemSource(base=GetItemSource(base=LocalSource(local_name='args', is_input=False, is_derefed_cell_contents=False), index=1, index_is_slice=False), index='past_key_values', index_is_slice=False), index='value_cache', index_is_slice=False), index=1, index_is_slice=False), prop=<TensorProperty.SIZE: 0>, idx=2): TensorPropertySource(base=GetItemSource(base=GetItemSource(base=GetItemSource(base=GetItemSource(base=LocalSource(local_name='args', is_input=False, is_derefed_cell_contents=False), index=1, index_is_slice=False), index='past_key_values', index_is_slice=False), index='key_cache', index_is_slice=False), index=0, index_is_slice=False), prop=<TensorProperty.SIZE: 0>, idx=2)}, _defs={})
original_signature=(input_ids: torch.LongTensor = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Union[transformers.cache_utils.Cache, List[torch.FloatTensor], NoneType] = None, inputs_embeds: Optional[torch.FloatTensor] = None, labels: Optional[torch.LongTensor] = None, use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, num_logits_to_keep: int = 0, **kwargs: Unpack[transformers.models.phi.modeling_phi.KwargsForCausalLM]) -> Union[Tuple, transformers.modeling_outputs.CausalLMOutputWithPast]
_is_torch_jit_trace=False
exc=Constraints violated (batch, seq_length)! For more information, run with TORCH_LOGS="+dynamic".
  - Not all values of seq_length = L['args'][1]['input_ids'].size()[1] in the specified range satisfy the generated guard L['args'][1]['input_ids'].size()[1] != 9223372036854775807.
  - Not all values of batch = L['args'][1]['input_ids'].size()[0] in the specified range satisfy the generated guard L['args'][1]['input_ids'].size()[0] != 9223372036854775807.
gm=<lambda>()



def forward(self, arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1, arg12_1, arg13_1, arg14_1, arg15_1, arg16_1, arg17_1, arg18_1, arg19_1, arg20_1, arg21_1, arg22_1, arg23_1, arg24_1, arg25_1, arg26_1, arg27_1, arg28_1, arg29_1, arg30_1, arg31_1, arg32_1, arg33_1, arg34_1, arg35_1, arg36_1, arg37_1, arg38_1, arg39_1):
    embedding = torch.ops.aten.embedding.default(arg0_1, arg34_1);  arg0_1 = None
    sym_size_int = torch.ops.aten.sym_size.int(arg36_1, 2)
    sym_size_int_1 = torch.ops.aten.sym_size.int(arg34_1, 1)
    add = sym_size_int + sym_size_int_1
    arange = torch.ops.aten.arange.start(sym_size_int, add, device = device(type='cpu'), pin_memory = False);  sym_size_int = add = None
    unsqueeze = torch.ops.aten.unsqueeze.default(arange, 0)
    sym_size_int_2 = torch.ops.aten.sym_size.int(arg35_1, 1)
    full = torch.ops.aten.full.default([sym_size_int_1, sym_size_int_2], -3.4028234663852886e+38, dtype = torch.float32, device = device(type='cpu'), pin_memory = False)
    triu = torch.ops.aten.triu.default(full, 1);  full = None
    arange_1 = torch.ops.aten.arange.default(sym_size_int_2, device = device(type='cpu'), pin_memory = False);  sym_size_int_2 = None
    reshape = torch.ops.aten.reshape.default(arange, [-1, 1]);  arange = None
    gt = torch.ops.aten.gt.Tensor(arange_1, reshape);  arange_1 = reshape = None
    mul_ = torch.ops.aten.mul_.Tensor(triu, gt);  triu = gt = None
    unsqueeze_1 = torch.ops.aten.unsqueeze.default(mul_, 0);  mul_ = None
    unsqueeze_2 = torch.ops.aten.unsqueeze.default(unsqueeze_1, 1);  unsqueeze_1 = None
    slice_1 = torch.ops.aten.slice.Tensor(unsqueeze_2, 2, 0, 9223372036854775807);  unsqueeze_2 = None
    slice_2 = torch.ops.aten.slice.Tensor(slice_1, 3, 0, 9223372036854775807);  slice_1 = None
    sym_size_int_5 = torch.ops.aten.sym_size.int(arg34_1, 0);  arg34_1 = None
    expand = torch.ops.aten.expand.default(slice_2, [sym_size_int_5, 1, -1, -1]);  slice_2 = None
    clone = torch.ops.aten.clone.default(expand);  expand = None
    slice_3 = torch.ops.aten.slice.Tensor(clone, 0, 0, 9223372036854775807)
    slice_4 = torch.ops.aten.slice.Tensor(slice_3, 1, 0, 9223372036854775807);  slice_3 = None
    slice_5 = torch.ops.aten.slice.Tensor(slice_4, 2, 0, 9223372036854775807);  slice_4 = None
    slice_6 = torch.ops.aten.slice.Tensor(arg35_1, 0, 0, 9223372036854775807);  arg35_1 = None
    unsqueeze_3 = torch.ops.aten.unsqueeze.default(slice_6, 1);  slice_6 = None
    unsqueeze_4 = torch.ops.aten.unsqueeze.default(unsqueeze_3, 2);  unsqueeze_3 = None
    slice_7 = torch.ops.aten.slice.Tensor(unsqueeze_4, 3, 0, 9223372036854775807);  unsqueeze_4 = None
    add_2 = torch.ops.aten.add.Tensor(slice_5, slice_7);  slice_5 = slice_7 = None
    eq_7 = torch.ops.aten.eq.Scalar(add_2, 0);  add_2 = None
    slice_8 = torch.ops.aten.slice.Tensor(clone, 0, 0, 9223372036854775807)
    slice_9 = torch.ops.aten.slice.Tensor(slice_8, 1, 0, 9223372036854775807);  slice_8 = None
    slice_10 = torch.ops.aten.slice.Tensor(slice_9, 2, 0, 9223372036854775807);  slice_9 = None
    masked_fill = torch.ops.aten.masked_fill.Scalar(slice_10, eq_7, -3.4028234663852886e+38);  slice_10 = eq_7 = None
    slice_11 = torch.ops.aten.slice.Tensor(clone, 0, 0, 9223372036854775807)
    slice_12 = torch.ops.aten.slice.Tensor(slice_11, 1, 0, 9223372036854775807);  slice_11 = None
    slice_13 = torch.ops.aten.slice.Tensor(slice_12, 2, 0, 9223372036854775807);  slice_12 = None
    copy_ = torch.ops.aten.copy_.default(slice_13, masked_fill);  slice_13 = masked_fill = copy_ = None
    dropout = torch.ops.aten.dropout.default(embedding, 0.0, False);  embedding = None
    _set_grad_enabled = torch._C._set_grad_enabled(False);  _set_grad_enabled = None
    unsqueeze_5 = torch.ops.aten.unsqueeze.default(arg33_1, 0);  arg33_1 = None
    slice_14 = torch.ops.aten.slice.Tensor(unsqueeze_5, 1, 0, 9223372036854775807);  unsqueeze_5 = None
    unsqueeze_6 = torch.ops.aten.unsqueeze.default(slice_14, 2);  slice_14 = None
    to = torch.ops.aten.to.dtype(unsqueeze_6, torch.float32);  unsqueeze_6 = None
    expand_1 = torch.ops.aten.expand.default(to, [1, -1, 1]);  to = None
    slice_15 = torch.ops.aten.slice.Tensor(unsqueeze, 0, 0, 9223372036854775807);  unsqueeze = None
    unsqueeze_7 = torch.ops.aten.unsqueeze.default(slice_15, 1);  slice_15 = None
    slice_16 = torch.ops.aten.slice.Tensor(unsqueeze_7, 2, 0, 9223372036854775807);  unsqueeze_7 = None
    to_1 = torch.ops.aten.to.dtype(slice_16, torch.float32);  slice_16 = None
    _enter_autocast = torch.amp.autocast_mode._enter_autocast('cpu', torch.bfloat16, False, False)
    to_2 = torch.ops.aten.to.dtype(expand_1, torch.float32);  expand_1 = None
    to_3 = torch.ops.aten.to.dtype(to_1, torch.float32);  to_1 = None
    matmul = torch.ops.aten.matmul.default(to_2, to_3);  to_2 = to_3 = None
    transpose = torch.ops.aten.transpose.int(matmul, 1, 2);  matmul = None
    cat = torch.ops.aten.cat.default([transpose, transpose], -1);  transpose = None
    cos = torch.ops.aten.cos.default(cat)
    sin = torch.ops.aten.sin.default(cat);  cat = None
    _exit_autocast = torch.amp.autocast_mode._exit_autocast(_enter_autocast);  _enter_autocast = _exit_autocast = None
    mul = torch.ops.aten.mul.Tensor(cos, 1.0);  cos = None
    mul_1 = torch.ops.aten.mul.Tensor(sin, 1.0);  sin = None
    to_4 = torch.ops.aten.to.dtype(mul, torch.float32);  mul = None
    to_5 = torch.ops.aten.to.dtype(mul_1, torch.float32);  mul_1 = None
    _set_grad_enabled_1 = torch._C._set_grad_enabled(True);  _set_grad_enabled_1 = None
    layer_norm = torch.ops.aten.layer_norm.default(dropout, [2560], arg13_1, arg14_1);  arg13_1 = arg14_1 = None
    linear = torch.ops.aten.linear.default(layer_norm, arg1_1, arg2_1);  arg1_1 = arg2_1 = None
    view = torch.ops.aten.view.default(linear, [sym_size_int_5, sym_size_int_1, -1, 80]);  linear = None
    transpose_1 = torch.ops.aten.transpose.int(view, 1, 2);  view = None
    linear_1 = torch.ops.aten.linear.default(layer_norm, arg3_1, arg4_1);  arg3_1 = arg4_1 = None
    view_1 = torch.ops.aten.view.default(linear_1, [sym_size_int_5, sym_size_int_1, -1, 80]);  linear_1 = None
    transpose_2 = torch.ops.aten.transpose.int(view_1, 1, 2);  view_1 = None
    linear_2 = torch.ops.aten.linear.default(layer_norm, arg5_1, arg6_1);  arg5_1 = arg6_1 = None
    view_2 = torch.ops.aten.view.default(linear_2, [sym_size_int_5, sym_size_int_1, -1, 80]);  linear_2 = None
    transpose_3 = torch.ops.aten.transpose.int(view_2, 1, 2);  view_2 = None
    slice_17 = torch.ops.aten.slice.Tensor(transpose_1, 3, 0, 32)
    slice_18 = torch.ops.aten.slice.Tensor(transpose_1, 3, 32, 9223372036854775807);  transpose_1 = None
    slice_19 = torch.ops.aten.slice.Tensor(transpose_2, 3, 0, 32)
    slice_20 = torch.ops.aten.slice.Tensor(transpose_2, 3, 32, 9223372036854775807);  transpose_2 = None
    unsqueeze_8 = torch.ops.aten.unsqueeze.default(to_4, 1)
    unsqueeze_9 = torch.ops.aten.unsqueeze.default(to_5, 1)
    mul_2 = torch.ops.aten.mul.Tensor(slice_17, unsqueeze_8)
    slice_21 = torch.ops.aten.slice.Tensor(slice_17, 3, 0, 16)
    slice_22 = torch.ops.aten.slice.Tensor(slice_17, 3, 16, 9223372036854775807);  slice_17 = None
    neg = torch.ops.aten.neg.default(slice_22);  slice_22 = None
    cat_1 = torch.ops.aten.cat.default([neg, slice_21], -1);  neg = slice_21 = None
    mul_3 = torch.ops.aten.mul.Tensor(cat_1, unsqueeze_9);  cat_1 = None
    add_3 = torch.ops.aten.add.Tensor(mul_2, mul_3);  mul_2 = mul_3 = None
    mul_4 = torch.ops.aten.mul.Tensor(slice_19, unsqueeze_8);  unsqueeze_8 = None
    slice_23 = torch.ops.aten.slice.Tensor(slice_19, 3, 0, 16)
    slice_24 = torch.ops.aten.slice.Tensor(slice_19, 3, 16, 9223372036854775807);  slice_19 = None
    neg_1 = torch.ops.aten.neg.default(slice_24);  slice_24 = None
    cat_2 = torch.ops.aten.cat.default([neg_1, slice_23], -1);  neg_1 = slice_23 = None
    mul_5 = torch.ops.aten.mul.Tensor(cat_2, unsqueeze_9);  cat_2 = unsqueeze_9 = None
    add_4 = torch.ops.aten.add.Tensor(mul_4, mul_5);  mul_4 = mul_5 = None
    cat_3 = torch.ops.aten.cat.default([add_3, slice_18], -1);  add_3 = slice_18 = None
    cat_4 = torch.ops.aten.cat.default([add_4, slice_20], -1);  add_4 = slice_20 = None
    cat_5 = torch.ops.aten.cat.default([arg36_1, cat_4], -2);  arg36_1 = cat_4 = None
    cat_6 = torch.ops.aten.cat.default([arg38_1, transpose_3], -2);  arg38_1 = transpose_3 = None
    slice_25 = torch.ops.aten.slice.Tensor(clone, 0, 0, 9223372036854775807)
    slice_26 = torch.ops.aten.slice.Tensor(slice_25, 1, 0, 9223372036854775807);  slice_25 = None
    slice_27 = torch.ops.aten.slice.Tensor(slice_26, 2, 0, 9223372036854775807);  slice_26 = None
    scaled_dot_product_attention = torch.ops.aten.scaled_dot_product_attention.default(cat_3, cat_5, cat_6, slice_27, scale = 0.11180339887498948);  cat_3 = slice_27 = None
    transpose_4 = torch.ops.aten.transpose.int(scaled_dot_product_attention, 1, 2);  scaled_dot_product_attention = None
    contiguous = torch.ops.aten.contiguous.default(transpose_4);  transpose_4 = None
    reshape_1 = torch.ops.aten.reshape.default(contiguous, [sym_size_int_5, sym_size_int_1, -1]);  contiguous = sym_size_int_1 = None
    linear_3 = torch.ops.aten.linear.default(reshape_1, arg7_1, arg8_1);  reshape_1 = arg7_1 = arg8_1 = None
    dropout_1 = torch.ops.aten.dropout.default(linear_3, 0.1, False)
    linear_4 = torch.ops.aten.linear.default(layer_norm, arg9_1, arg10_1);  layer_norm = arg9_1 = arg10_1 = None
    mul_6 = torch.ops.aten.mul.Tensor(linear_4, 0.5)
    pow_1 = torch.ops.aten.pow.Tensor_Scalar(linear_4, 3.0)
    mul_7 = torch.ops.aten.mul.Tensor(pow_1, 0.044715);  pow_1 = None
    add_6 = torch.ops.aten.add.Tensor(linear_4, mul_7);  linear_4 = mul_7 = None
    mul_8 = torch.ops.aten.mul.Tensor(add_6, 0.7978845608028654);  add_6 = None
    tanh = torch.ops.aten.tanh.default(mul_8);  mul_8 = None
    add_7 = torch.ops.aten.add.Tensor(tanh, 1.0);  tanh = None
    mul_9 = torch.ops.aten.mul.Tensor(mul_6, add_7);  mul_6 = add_7 = None
    linear_5 = torch.ops.aten.linear.default(mul_9, arg11_1, arg12_1);  mul_9 = arg11_1 = arg12_1 = None
    dropout_2 = torch.ops.aten.dropout.default(linear_5, 0.1, False);  linear_5 = None
    add_8 = torch.ops.aten.add.Tensor(dropout_1, dropout_2);  dropout_1 = dropout_2 = None
    add_9 = torch.ops.aten.add.Tensor(add_8, dropout);  add_8 = dropout = None
    layer_norm_1 = torch.ops.aten.layer_norm.default(add_9, [2560], arg27_1, arg28_1);  arg27_1 = arg28_1 = None
    linear_6 = torch.ops.aten.linear.default(layer_norm_1, arg15_1, arg16_1);  arg15_1 = arg16_1 = None
    sym_size_int_16 = torch.ops.aten.sym_size.int(linear_3, 1);  linear_3 = None
    view_3 = torch.ops.aten.view.default(linear_6, [sym_size_int_5, sym_size_int_16, -1, 80]);  linear_6 = None
    transpose_5 = torch.ops.aten.transpose.int(view_3, 1, 2);  view_3 = None
    linear_7 = torch.ops.aten.linear.default(layer_norm_1, arg17_1, arg18_1);  arg17_1 = arg18_1 = None
    view_4 = torch.ops.aten.view.default(linear_7, [sym_size_int_5, sym_size_int_16, -1, 80]);  linear_7 = None
    transpose_6 = torch.ops.aten.transpose.int(view_4, 1, 2);  view_4 = None
    linear_8 = torch.ops.aten.linear.default(layer_norm_1, arg19_1, arg20_1);  arg19_1 = arg20_1 = None
    view_5 = torch.ops.aten.view.default(linear_8, [sym_size_int_5, sym_size_int_16, -1, 80]);  linear_8 = None
    transpose_7 = torch.ops.aten.transpose.int(view_5, 1, 2);  view_5 = None
    slice_28 = torch.ops.aten.slice.Tensor(transpose_5, 3, 0, 32)
    slice_29 = torch.ops.aten.slice.Tensor(transpose_5, 3, 32, 9223372036854775807);  transpose_5 = None
    slice_30 = torch.ops.aten.slice.Tensor(transpose_6, 3, 0, 32)
    slice_31 = torch.ops.aten.slice.Tensor(transpose_6, 3, 32, 9223372036854775807);  transpose_6 = None
    unsqueeze_10 = torch.ops.aten.unsqueeze.default(to_4, 1);  to_4 = None
    unsqueeze_11 = torch.ops.aten.unsqueeze.default(to_5, 1);  to_5 = None
    mul_10 = torch.ops.aten.mul.Tensor(slice_28, unsqueeze_10)
    slice_32 = torch.ops.aten.slice.Tensor(slice_28, 3, 0, 16)
    slice_33 = torch.ops.aten.slice.Tensor(slice_28, 3, 16, 9223372036854775807);  slice_28 = None
    neg_2 = torch.ops.aten.neg.default(slice_33);  slice_33 = None
    cat_7 = torch.ops.aten.cat.default([neg_2, slice_32], -1);  neg_2 = slice_32 = None
    mul_11 = torch.ops.aten.mul.Tensor(cat_7, unsqueeze_11);  cat_7 = None
    add_10 = torch.ops.aten.add.Tensor(mul_10, mul_11);  mul_10 = mul_11 = None
    mul_12 = torch.ops.aten.mul.Tensor(slice_30, unsqueeze_10);  unsqueeze_10 = None
    slice_34 = torch.ops.aten.slice.Tensor(slice_30, 3, 0, 16)
    slice_35 = torch.ops.aten.slice.Tensor(slice_30, 3, 16, 9223372036854775807);  slice_30 = None
    neg_3 = torch.ops.aten.neg.default(slice_35);  slice_35 = None
    cat_8 = torch.ops.aten.cat.default([neg_3, slice_34], -1);  neg_3 = slice_34 = None
    mul_13 = torch.ops.aten.mul.Tensor(cat_8, unsqueeze_11);  cat_8 = unsqueeze_11 = None
    add_11 = torch.ops.aten.add.Tensor(mul_12, mul_13);  mul_12 = mul_13 = None
    cat_9 = torch.ops.aten.cat.default([add_10, slice_29], -1);  add_10 = slice_29 = None
    cat_10 = torch.ops.aten.cat.default([add_11, slice_31], -1);  add_11 = slice_31 = None
    cat_11 = torch.ops.aten.cat.default([arg37_1, cat_10], -2);  arg37_1 = cat_10 = None
    cat_12 = torch.ops.aten.cat.default([arg39_1, transpose_7], -2);  arg39_1 = transpose_7 = None
    slice_36 = torch.ops.aten.slice.Tensor(clone, 0, 0, 9223372036854775807);  clone = None
    slice_37 = torch.ops.aten.slice.Tensor(slice_36, 1, 0, 9223372036854775807);  slice_36 = None
    slice_38 = torch.ops.aten.slice.Tensor(slice_37, 2, 0, 9223372036854775807);  slice_37 = None
    scaled_dot_product_attention_1 = torch.ops.aten.scaled_dot_product_attention.default(cat_9, cat_11, cat_12, slice_38, scale = 0.11180339887498948);  cat_9 = slice_38 = None
    transpose_8 = torch.ops.aten.transpose.int(scaled_dot_product_attention_1, 1, 2);  scaled_dot_product_attention_1 = None
    contiguous_1 = torch.ops.aten.contiguous.default(transpose_8);  transpose_8 = None
    reshape_2 = torch.ops.aten.reshape.default(contiguous_1, [sym_size_int_5, sym_size_int_16, -1]);  contiguous_1 = sym_size_int_5 = sym_size_int_16 = None
    linear_9 = torch.ops.aten.linear.default(reshape_2, arg21_1, arg22_1);  reshape_2 = arg21_1 = arg22_1 = None
    dropout_3 = torch.ops.aten.dropout.default(linear_9, 0.1, False);  linear_9 = None
    linear_10 = torch.ops.aten.linear.default(layer_norm_1, arg23_1, arg24_1);  layer_norm_1 = arg23_1 = arg24_1 = None
    mul_14 = torch.ops.aten.mul.Tensor(linear_10, 0.5)
    pow_2 = torch.ops.aten.pow.Tensor_Scalar(linear_10, 3.0)
    mul_15 = torch.ops.aten.mul.Tensor(pow_2, 0.044715);  pow_2 = None
    add_12 = torch.ops.aten.add.Tensor(linear_10, mul_15);  linear_10 = mul_15 = None
    mul_16 = torch.ops.aten.mul.Tensor(add_12, 0.7978845608028654);  add_12 = None
    tanh_1 = torch.ops.aten.tanh.default(mul_16);  mul_16 = None
    add_13 = torch.ops.aten.add.Tensor(tanh_1, 1.0);  tanh_1 = None
    mul_17 = torch.ops.aten.mul.Tensor(mul_14, add_13);  mul_14 = add_13 = None
    linear_11 = torch.ops.aten.linear.default(mul_17, arg25_1, arg26_1);  mul_17 = arg25_1 = arg26_1 = None
    dropout_4 = torch.ops.aten.dropout.default(linear_11, 0.1, False);  linear_11 = None
    add_14 = torch.ops.aten.add.Tensor(dropout_3, dropout_4);  dropout_3 = dropout_4 = None
    add_15 = torch.ops.aten.add.Tensor(add_14, add_9);  add_14 = add_9 = None
    layer_norm_2 = torch.ops.aten.layer_norm.default(add_15, [2560], arg29_1, arg30_1);  add_15 = arg29_1 = arg30_1 = None
    slice_39 = torch.ops.aten.slice.Tensor(layer_norm_2, 0, 0, 9223372036854775807);  layer_norm_2 = None
    slice_40 = torch.ops.aten.slice.Tensor(slice_39, 1, 0, 9223372036854775807);  slice_39 = None
    slice_41 = torch.ops.aten.slice.Tensor(slice_40, 2, 0, 9223372036854775807);  slice_40 = None
    linear_12 = torch.ops.aten.linear.default(slice_41, arg31_1, arg32_1);  slice_41 = arg31_1 = arg32_1 = None
    return (linear_12, cat_5, cat_11, cat_6, cat_12)

# To see more debug info, please use `graph_module.print_readable()`
[bypass_export_some_errors] restored sympy functions
[bypass_export_some_errors] restored pytorch functions
[bypass_export_some_errors] restored produce_guards_and_solve_constraints
[bypass_export_some_errors] restored transformer
[bypass_export_some_errors] restored DynamicCache
[bypass_export_some_errors] unregistered MambaCache
[bypass_export_some_errors] unregistered DynamicCache
[bypass_export_some_errors] unregistered patched_DynamicCache

Let’s display the model: we load the saved ONNX file and print its graph as text.

# Load the ONNX model from disk (presumably the file written by the export
# step earlier in this example -- confirm against the preceding code).
onx = onnx.load("plot_exporter_recipes_c_phi2.onnx")
# Wrap the loaded model in a GraphBuilder.
# NOTE(review): InferShapesOptions.NONE presumably disables shape inference
# while the builder ingests the model -- confirm against the GraphBuilder docs.
gr = GraphBuilder(onx, infer_shapes_options=InferShapesOptions.NONE)
# Print the builder's text rendering of the graph.
print(gr.pretty_text())
dyn---: batch -> WrapSym(batch)
dyn---: batch*seq_length -> 'batch*seq_length'
dyn---: cache_length -> WrapSym(cache_length)
dyn---: cache_length+seq_length -> WrapSym(cache_length+seq_length)
dyn---: channel -> WrapSym(channel)
dyn---: seq_length -> WrapSym(seq_length)
dynrev: batch -> [('batch', SymInt(batch))]
dynrev: cache_length -> [('cache_length', SymInt(cache_length))]
dynrev: cache_length+seq_length -> [('cache_length+seq_length', SymInt(cache_length+seq_length))]
dynrev: channel -> [('channel', SymInt(channel))]
dynrev: seq_length -> [('seq_length', SymInt(seq_length))]
dynsrc: batch -> [{batch:('input_name', 'input_ids'), batch:('axis', 0)}, {batch:('input_name', 'attention_mask'), batch:('axis', 0)}, {batch:('input_name', 'past_key_values_key_cache_0'), batch:('axis', 0)}, {batch:('input_name', 'past_key_values_key_cache_1'), batch:('axis', 0)}, {batch:('input_name', 'past_key_values_value_cache_0'), batch:('axis', 0)}, {batch:('input_name', 'past_key_values_value_cache_1'), batch:('axis', 0)}, {batch:('input_name', 'output_0'), batch:('axis', 0)}, {batch:('input_name', 'output_1'), batch:('axis', 0)}, {batch:('input_name', 'output_2'), batch:('axis', 0)}, {batch:('input_name', 'output_3'), batch:('axis', 0)}, {batch:('input_name', 'output_4'), batch:('axis', 0)}, {batch:('input_name', 'embedding'), batch:('axis', 0)}, {batch:('input_name', 'expand'), batch:('axis', 0)}, {batch:('input_name', 'unsqueeze_4'), batch:('axis', 0)}, {batch:('input_name', '_onx_cast_unsqueeze_40'), batch:('axis', 0)}, {batch:('input_name', 'add_2'), batch:('axis', 0)}, {batch:('input_name', 'eq_7'), batch:('axis', 0)}, {batch:('input_name', 'masked_fill'), batch:('axis', 0)}, {batch:('input_name', 'expand_as'), batch:('axis', 0)}, {batch:('input_name', '_onx_div_sub_dropout00'), batch:('axis', 0)}, {batch:('input_name', '_onx_matmul_layer_norm0'), batch:('axis', 0)}, {batch:('input_name', 'view'), batch:('axis', 0)}, {batch:('input_name', 'transpose_1'), batch:('axis', 0)}, {batch:('input_name', 'slice_17'), batch:('axis', 0)}, {batch:('input_name', 'slice_18'), batch:('axis', 0)}, {batch:('input_name', 'slice_21'), batch:('axis', 0)}, {batch:('input_name', 'slice_22'), batch:('axis', 0)}, {batch:('input_name', '_onx_matmul_layer_norm02'), batch:('axis', 0)}, {batch:('input_name', 'view_1'), batch:('axis', 0)}, {batch:('input_name', 'transpose_2'), batch:('axis', 0)}, {batch:('input_name', 'slice_19'), batch:('axis', 0)}, {batch:('input_name', 'slice_20'), batch:('axis', 0)}, {batch:('input_name', 'slice_23'), batch:('axis', 0)}, {batch:('input_name', 
'slice_24'), batch:('axis', 0)}, {batch:('input_name', '_onx_matmul_layer_norm03'), batch:('axis', 0)}, {batch:('input_name', 'view_2'), batch:('axis', 0)}, {batch:('input_name', 'transpose_3'), batch:('axis', 0)}, {batch:('input_name', 'mul_2'), batch:('axis', 0)}, {batch:('input_name', 'neg'), batch:('axis', 0)}, {batch:('input_name', 'cat_1'), batch:('axis', 0)}, {batch:('input_name', 'mul_3'), batch:('axis', 0)}, {batch:('input_name', 'add_3'), batch:('axis', 0)}, {batch:('input_name', 'mul_4'), batch:('axis', 0)}, {batch:('input_name', 'neg_1'), batch:('axis', 0)}, {batch:('input_name', 'cat_2'), batch:('axis', 0)}, {batch:('input_name', 'mul_5'), batch:('axis', 0)}, {batch:('input_name', 'add_4'), batch:('axis', 0)}, {batch:('input_name', 'cat_3'), batch:('axis', 0)}, {batch:('input_name', 'cat_4'), batch:('axis', 0)}, {batch:('input_name', '_onx_transpose_cat_50'), batch:('axis', 0)}, {batch:('input_name', 'scaled_dot_product_attention'), batch:('axis', 0)}, {batch:('input_name', 'transpose_4'), batch:('axis', 0)}, {batch:('input_name', 'reshape_1'), batch:('axis', 0)}, {batch:('input_name', '_onx_matmul_layer_norm04'), batch:('axis', 0)}, {batch:('input_name', '_onx_mul_linear_40'), batch:('axis', 0)}, {batch:('input_name', 'pow_1'), batch:('axis', 0)}, {batch:('input_name', '_onx_mul_pow_10'), batch:('axis', 0)}, {batch:('input_name', 'add_6'), batch:('axis', 0)}, {batch:('input_name', '_onx_mul_add_60'), batch:('axis', 0)}, {batch:('input_name', 'tanh'), batch:('axis', 0)}, {batch:('input_name', 'add_7'), batch:('axis', 0)}, {batch:('input_name', 'mul_9'), batch:('axis', 0)}, {batch:('input_name', '_onx_matmul_mul_90'), batch:('axis', 0)}, {batch:('input_name', 'add_8'), batch:('axis', 0)}, {batch:('input_name', 'add_9'), batch:('axis', 0)}, {batch:('input_name', '_onx_div_sub_add_900'), batch:('axis', 0)}, {batch:('input_name', '_onx_matmul_layer_norm_10'), batch:('axis', 0)}, {batch:('input_name', 'view_3'), batch:('axis', 0)}, {batch:('input_name', 
'transpose_5'), batch:('axis', 0)}, {batch:('input_name', 'slice_28'), batch:('axis', 0)}, {batch:('input_name', 'slice_29'), batch:('axis', 0)}, {batch:('input_name', 'slice_32'), batch:('axis', 0)}, {batch:('input_name', 'slice_33'), batch:('axis', 0)}, {batch:('input_name', '_onx_matmul_layer_norm_102'), batch:('axis', 0)}, {batch:('input_name', 'view_4'), batch:('axis', 0)}, {batch:('input_name', 'transpose_6'), batch:('axis', 0)}, {batch:('input_name', 'slice_30'), batch:('axis', 0)}, {batch:('input_name', 'slice_31'), batch:('axis', 0)}, {batch:('input_name', 'slice_34'), batch:('axis', 0)}, {batch:('input_name', 'slice_35'), batch:('axis', 0)}, {batch:('input_name', '_onx_matmul_layer_norm_103'), batch:('axis', 0)}, {batch:('input_name', 'view_5'), batch:('axis', 0)}, {batch:('input_name', 'transpose_7'), batch:('axis', 0)}, {batch:('input_name', 'mul_10'), batch:('axis', 0)}, {batch:('input_name', 'neg_2'), batch:('axis', 0)}, {batch:('input_name', 'cat_7'), batch:('axis', 0)}, {batch:('input_name', 'mul_11'), batch:('axis', 0)}, {batch:('input_name', 'add_10'), batch:('axis', 0)}, {batch:('input_name', 'mul_12'), batch:('axis', 0)}, {batch:('input_name', 'neg_3'), batch:('axis', 0)}, {batch:('input_name', 'cat_8'), batch:('axis', 0)}, {batch:('input_name', 'mul_13'), batch:('axis', 0)}, {batch:('input_name', 'add_11'), batch:('axis', 0)}, {batch:('input_name', 'cat_9'), batch:('axis', 0)}, {batch:('input_name', 'cat_10'), batch:('axis', 0)}, {batch:('input_name', '_onx_transpose_cat_110'), batch:('axis', 0)}, {batch:('input_name', 'scaled_dot_product_attention_1'), batch:('axis', 0)}, {batch:('input_name', 'transpose_8'), batch:('axis', 0)}, {batch:('input_name', 'reshape_2'), batch:('axis', 0)}, {batch:('input_name', '_onx_matmul_layer_norm_104'), batch:('axis', 0)}, {batch:('input_name', '_onx_mul_linear_100'), batch:('axis', 0)}, {batch:('input_name', 'pow_2'), batch:('axis', 0)}, {batch:('input_name', '_onx_mul_pow_20'), batch:('axis', 0)}, 
{batch:('input_name', 'add_12'), batch:('axis', 0)}, {batch:('input_name', '_onx_mul_add_120'), batch:('axis', 0)}, {batch:('input_name', 'tanh_1'), batch:('axis', 0)}, {batch:('input_name', 'add_13'), batch:('axis', 0)}, {batch:('input_name', 'mul_17'), batch:('axis', 0)}, {batch:('input_name', '_onx_matmul_mul_170'), batch:('axis', 0)}, {batch:('input_name', 'add_14'), batch:('axis', 0)}, {batch:('input_name', 'add_15'), batch:('axis', 0)}, {batch:('input_name', '_onx_div_sub_add_1500'), batch:('axis', 0)}, {batch:('input_name', 'linear_6'), batch:('axis', 0)}, {batch:('input_name', 'contiguous_1'), batch:('axis', 0)}, {batch:('input_name', 'view_4'), batch:('axis', 0)}, {batch:('input_name', 'cat_6'), batch:('axis', 0)}, {batch:('input_name', 'slice_31'), batch:('axis', 0)}, {batch:('input_name', 'neg_3'), batch:('axis', 0)}, {batch:('input_name', 'cat_1'), batch:('axis', 0)}, {batch:('input_name', '_onx_matmul_layer_norm03'), batch:('axis', 0)}, {batch:('input_name', '_onx_mul_div_sub_dropout000'), batch:('axis', 0)}, {batch:('input_name', 'slice_21'), batch:('axis', 0)}, {batch:('input_name', 'scaled_dot_product_attention'), batch:('axis', 0)}, {batch:('input_name', '_onx_sub_add_150'), batch:('axis', 0)}, {batch:('input_name', 'mul_4'), batch:('axis', 0)}, {batch:('input_name', 'expand'), batch:('axis', 0)}, {batch:('input_name', '_onx_mul_transpose_cat_1100'), batch:('axis', 0)}, {batch:('input_name', 'linear_9'), batch:('axis', 0)}, {batch:('input_name', 'add_11'), batch:('axis', 0)}, {batch:('input_name', 'transpose_6'), batch:('axis', 0)}, {batch:('input_name', '_onx_matmul_layer_norm02'), batch:('axis', 0)}, {batch:('input_name', '_onx_sqrt_add_reducemean_pow_sub_dropout00000'), batch:('axis', 0)}, {batch:('input_name', '_onx_reducemean_dropout0'), batch:('axis', 0)}, {batch:('input_name', '_onx_matmul_layer_norm_104'), batch:('axis', 0)}, {batch:('input_name', 'layer_norm_2'), batch:('axis', 0)}, {batch:('input_name', '_onx_mul_pow_10'), batch:('axis', 
0)}, {batch:('input_name', 'cat_11'), batch:('axis', 0)}, {batch:('input_name', '_onx_mul_div_sub_add_15000'), batch:('axis', 0)}, {batch:('input_name', '_onx_transpose_cat_110'), batch:('axis', 0)}, {batch:('input_name', 'dropout'), batch:('axis', 0)}, {batch:('input_name', 'add_13'), batch:('axis', 0)}, {batch:('input_name', 'pow_2'), batch:('axis', 0)}, {batch:('input_name', 'linear_10'), batch:('axis', 0)}, {batch:('input_name', 'neg_1'), batch:('axis', 0)}, {batch:('input_name', '_onx_matmul_mul_170'), batch:('axis', 0)}, {batch:('input_name', '_onx_mul_linear_100'), batch:('axis', 0)}, {batch:('input_name', 'linear_7'), batch:('axis', 0)}, {batch:('input_name', 'embedding'), batch:('axis', 0)}, {batch:('input_name', 'linear_3'), batch:('axis', 0)}, {batch:('input_name', '_onx_matmul_layer_norm_102'), batch:('axis', 0)}, {batch:('input_name', 'tanh_1'), batch:('axis', 0)}, {batch:('input_name', '_onx_mul_pow_20'), batch:('axis', 0)}, {batch:('input_name', '_onx_pow_sub_add_900'), batch:('axis', 0)}, {batch:('input_name', 'reshape_2'), batch:('axis', 0)}, {batch:('input_name', 'view_5'), batch:('axis', 0)}, {batch:('input_name', '_onx_sub_dropout0'), batch:('axis', 0)}, {batch:('input_name', 'transpose_1'), batch:('axis', 0)}, {batch:('input_name', 'slice_34'), batch:('axis', 0)}, {batch:('input_name', '_onx_matmul_layer_norm04'), batch:('axis', 0)}, {batch:('input_name', 'expand_as'), batch:('axis', 0)}, {batch:('input_name', 'mul_9'), batch:('axis', 0)}, {batch:('input_name', 'linear'), batch:('axis', 0)}, {batch:('input_name', 'layer_norm_1'), batch:('axis', 0)}, {batch:('input_name', 'linear_12'), batch:('axis', 0)}, {batch:('input_name', 'linear_8'), batch:('axis', 0)}, {batch:('input_name', 'slice_17'), batch:('axis', 0)}, {batch:('input_name', 'add_12'), batch:('axis', 0)}, {batch:('input_name', '_onx_pow_sub_dropout00'), batch:('axis', 0)}, {batch:('input_name', 'cat_8'), batch:('axis', 0)}, {batch:('input_name', 'view_2'), batch:('axis', 0)}, 
{batch:('input_name', '_onx_matmul_layer_norm_20'), batch:('axis', 0)}, {batch:('input_name', 'mul_16'), batch:('axis', 0)}, {batch:('input_name', 'dropout_2'), batch:('axis', 0)}, {batch:('input_name', '_onx_mul_div_sub_add_9000'), batch:('axis', 0)}, {batch:('input_name', 'masked_fill'), batch:('axis', 0)}, {batch:('input_name', 'slice_19'), batch:('axis', 0)}, {batch:('input_name', 'view_3'), batch:('axis', 0)}, {batch:('input_name', '_onx_matmul_layer_norm_103'), batch:('axis', 0)}, {batch:('input_name', 'transpose_8'), batch:('axis', 0)}, {batch:('input_name', 'reshape_1'), batch:('axis', 0)}, {batch:('input_name', 'slice_22'), batch:('axis', 0)}, {batch:('input_name', '_onx_matmul_reshape_20'), batch:('axis', 0)}, {batch:('input_name', 'transpose_7'), batch:('axis', 0)}, {batch:('input_name', 'unsqueeze_3'), batch:('axis', 0)}, {batch:('input_name', 'slice_32'), batch:('axis', 0)}, {batch:('input_name', 'layer_norm'), batch:('axis', 0)}, {batch:('input_name', '_onx_sqrt_add_reducemean_pow_sub_add_1500000'), batch:('axis', 0)}, {batch:('input_name', '_onx_sqrt_add_reducemean_pow_sub_add_900000'), batch:('axis', 0)}, {batch:('input_name', 'linear_1'), batch:('axis', 0)}, {batch:('input_name', 'transpose_4'), batch:('axis', 0)}, {batch:('input_name', 'cat_10'), batch:('axis', 0)}, {batch:('input_name', '_onx_matmul_layer_norm_10'), batch:('axis', 0)}, {batch:('input_name', 'view'), batch:('axis', 0)}, {batch:('input_name', 'mul_6'), batch:('axis', 0)}, {batch:('input_name', 'dropout_3'), batch:('axis', 0)}, {batch:('input_name', 'slice_33'), batch:('axis', 0)}, {batch:('input_name', 'cat_3'), batch:('axis', 0)}, {batch:('input_name', 'unsqueeze_4'), batch:('axis', 0)}, {batch:('input_name', 'transpose_2'), batch:('axis', 0)}, {batch:('input_name', '_onx_add_reducemean_pow_sub_dropout0000'), batch:('axis', 0)}, {batch:('input_name', 'slice_35'), batch:('axis', 0)}, {batch:('input_name', 'cat_5'), batch:('axis', 0)}, {batch:('input_name', 'linear_4'), 
batch:('axis', 0)}, {batch:('input_name', '_onx_mul_cat_90'), batch:('axis', 0)}, {batch:('input_name', 'linear_5'), batch:('axis', 0)}, {batch:('input_name', 'neg'), batch:('axis', 0)}, {batch:('input_name', 'slice_23'), batch:('axis', 0)}, {batch:('input_name', 'mul_15'), batch:('axis', 0)}, {batch:('input_name', 'mul_2'), batch:('axis', 0)}, {batch:('input_name', 'add_4'), batch:('axis', 0)}, {batch:('input_name', 'cat_7'), batch:('axis', 0)}, {batch:('input_name', 'slice_18'), batch:('axis', 0)}, {batch:('input_name', '_onx_matmul_layer_norm0'), batch:('axis', 0)}, {batch:('input_name', 'mul_5'), batch:('axis', 0)}, {batch:('input_name', 'slice_20'), batch:('axis', 0)}, {batch:('input_name', 'mul_8'), batch:('axis', 0)}, {batch:('input_name', '_onx_reducemean_pow_sub_add_9000'), batch:('axis', 0)}, {batch:('input_name', '_onx_matmul_mul_90'), batch:('axis', 0)}, {batch:('input_name', 'mul_14'), batch:('axis', 0)}, {batch:('input_name', '_onx_mul_add_60'), batch:('axis', 0)}, {batch:('input_name', '_onx_reducemean_pow_sub_add_15000'), batch:('axis', 0)}, {batch:('input_name', 'slice_28'), batch:('axis', 0)}, {batch:('input_name', '_onx_add_mul_div_sub_dropout0000'), batch:('axis', 0)}, {batch:('input_name', 'add_3'), batch:('axis', 0)}, {batch:('input_name', 'mul_11'), batch:('axis', 0)}, {batch:('input_name', 'transpose_3'), batch:('axis', 0)}, {batch:('input_name', '_onx_add_mul_div_sub_add_150000'), batch:('axis', 0)}, {batch:('input_name', 'clone'), batch:('axis', 0)}, {batch:('input_name', '_onx_cast_unsqueeze_40'), batch:('axis', 0)}, {batch:('input_name', 'slice_24'), batch:('axis', 0)}, {batch:('input_name', 'dropout_4'), batch:('axis', 0)}, {batch:('input_name', 'cat_9'), batch:('axis', 0)}, {batch:('input_name', 'eq_7'), batch:('axis', 0)}, {batch:('input_name', 'cat_12'), batch:('axis', 0)}, {batch:('input_name', '_onx_transpose_cat_50'), batch:('axis', 0)}, {batch:('input_name', 'linear_11'), batch:('axis', 0)}, {batch:('input_name', 
'_onx_div_sub_add_1500'), batch:('axis', 0)}, {batch:('input_name', 'mul_12'), batch:('axis', 0)}, {batch:('input_name', 'slice_30'), batch:('axis', 0)}, {batch:('input_name', '_onx_sub_add_90'), batch:('axis', 0)}, {batch:('input_name', 'neg_2'), batch:('axis', 0)}, {batch:('input_name', '_onx_reducemean_add_90'), batch:('axis', 0)}, {batch:('input_name', '_onx_mul_transpose_cat_500'), batch:('axis', 0)}, {batch:('input_name', 'view_1'), batch:('axis', 0)}, {batch:('input_name', '_onx_reducemean_pow_sub_dropout000'), batch:('axis', 0)}, {batch:('input_name', 'mul_3'), batch:('axis', 0)}, {batch:('input_name', 'add_8'), batch:('axis', 0)}, {batch:('input_name', 'scaled_dot_product_attention_1'), batch:('axis', 0)}, {batch:('input_name', '_onx_mul_add_120'), batch:('axis', 0)}, {batch:('input_name', '_onx_add_reducemean_pow_sub_add_90000'), batch:('axis', 0)}, {batch:('input_name', 'mul_13'), batch:('axis', 0)}, {batch:('input_name', 'linear_2'), batch:('axis', 0)}, {batch:('input_name', 'mul_7'), batch:('axis', 0)}, {batch:('input_name', '_onx_mul_cat_30'), batch:('axis', 0)}, {batch:('input_name', '_onx_div_sub_dropout00'), batch:('axis', 0)}, {batch:('input_name', 'cat_2'), batch:('axis', 0)}, {batch:('input_name', 'add_7'), batch:('axis', 0)}, {batch:('input_name', 'tanh'), batch:('axis', 0)}, {batch:('input_name', '_onx_div_sub_add_900'), batch:('axis', 0)}, {batch:('input_name', 'mul_10'), batch:('axis', 0)}, {batch:('input_name', 'slice_29'), batch:('axis', 0)}, {batch:('input_name', 'add_9'), batch:('axis', 0)}, {batch:('input_name', 'transpose_5'), batch:('axis', 0)}, {batch:('input_name', 'add_6'), batch:('axis', 0)}, {batch:('input_name', '_onx_pow_sub_add_1500'), batch:('axis', 0)}, {batch:('input_name', '_onx_add_mul_div_sub_add_90000'), batch:('axis', 0)}, {batch:('input_name', 'add_2'), batch:('axis', 0)}, {batch:('input_name', 'add_14'), batch:('axis', 0)}, {batch:('input_name', 'add_15'), batch:('axis', 0)}, {batch:('input_name', 'mul_17'), 
batch:('axis', 0)}, {batch:('input_name', 'dropout_1'), batch:('axis', 0)}, {batch:('input_name', '_onx_matmul_reshape_10'), batch:('axis', 0)}, {batch:('input_name', 'add_10'), batch:('axis', 0)}, {batch:('input_name', '_onx_reducemean_add_150'), batch:('axis', 0)}, {batch:('input_name', 'contiguous'), batch:('axis', 0)}, {batch:('input_name', '_onx_mul_linear_40'), batch:('axis', 0)}, {batch:('input_name', '_onx_add_reducemean_pow_sub_add_150000'), batch:('axis', 0)}, {batch:('input_name', 'pow_1'), batch:('axis', 0)}, {batch:('input_name', 'cat_4'), batch:('axis', 0)}]
dynsrc: batch*seq_length -> [{batch*seq_length:('input_name', 'MatMulAddPattern--reshape_1'), batch*seq_length:('axis', 0)}, {batch*seq_length:('input_name', 'MatMulAddPattern--reshape_12'), batch*seq_length:('axis', 0)}, {batch*seq_length:('input_name', 'MatMulAddPattern--reshape_13'), batch*seq_length:('axis', 0)}, {batch*seq_length:('input_name', 'MatMulAddPattern--reshape_2'), batch*seq_length:('axis', 0)}, {batch*seq_length:('input_name', 'MatMulAddPattern--reshape_22'), batch*seq_length:('axis', 0)}, {batch*seq_length:('input_name', 'MatMulAddPattern--reshape_23'), batch*seq_length:('axis', 0)}, {batch*seq_length:('input_name', 'MatMulAddPattern--reshape_2'), batch*seq_length:('axis', 0)}, {batch*seq_length:('input_name', 'MatMulAddPattern--reshape_1'), batch*seq_length:('axis', 0)}, {batch*seq_length:('input_name', 'MatMulAddPattern--reshape_12'), batch*seq_length:('axis', 0)}, {batch*seq_length:('input_name', 'MatMulAddPattern--reshape_13'), batch*seq_length:('axis', 0)}, {batch*seq_length:('input_name', 'MatMulAddPattern--reshape_23'), batch*seq_length:('axis', 0)}, {batch*seq_length:('input_name', 'MatMulAddPattern--reshape_22'), batch*seq_length:('axis', 0)}]
dynsrc: cache_length -> [{cache_length:('input_name', 'past_key_values_key_cache_0'), cache_length:('axis', 2)}, {cache_length:('input_name', 'past_key_values_key_cache_1'), cache_length:('axis', 2)}, {cache_length:('input_name', 'past_key_values_value_cache_0'), cache_length:('axis', 2)}, {cache_length:('input_name', 'past_key_values_value_cache_1'), cache_length:('axis', 2)}]
dynsrc: cache_length+seq_length -> [{cache_length+seq_length:('input_name', 'output_1'), cache_length+seq_length:('axis', 2)}, {cache_length+seq_length:('input_name', 'output_2'), cache_length+seq_length:('axis', 2)}, {cache_length+seq_length:('input_name', 'output_3'), cache_length+seq_length:('axis', 2)}, {cache_length+seq_length:('input_name', 'output_4'), cache_length+seq_length:('axis', 2)}, {cache_length+seq_length:('input_name', '_onx_transpose_cat_50'), cache_length+seq_length:('axis', 3)}, {cache_length+seq_length:('input_name', '_onx_transpose_cat_110'), cache_length+seq_length:('axis', 3)}, {cache_length+seq_length:('input_name', 'cat_6'), cache_length+seq_length:('axis', 2)}, {cache_length+seq_length:('input_name', '_onx_mul_transpose_cat_1100'), cache_length+seq_length:('axis', 3)}, {cache_length+seq_length:('input_name', 'cat_11'), cache_length+seq_length:('axis', 2)}, {cache_length+seq_length:('input_name', '_onx_transpose_cat_110'), cache_length+seq_length:('axis', 3)}, {cache_length+seq_length:('input_name', 'cat_5'), cache_length+seq_length:('axis', 2)}, {cache_length+seq_length:('input_name', 'cat_12'), cache_length+seq_length:('axis', 2)}, {cache_length+seq_length:('input_name', '_onx_transpose_cat_50'), cache_length+seq_length:('axis', 3)}, {cache_length+seq_length:('input_name', '_onx_mul_transpose_cat_500'), cache_length+seq_length:('axis', 3)}]
dynsrc: channel -> [{channel:('input_name', 'attention_mask'), channel:('axis', 1)}, {channel:('input_name', 'full'), channel:('axis', 1)}, {channel:('input_name', 'triu'), channel:('axis', 1)}, {channel:('input_name', 'arange_1'), channel:('axis', 0)}, {channel:('input_name', 'gt'), channel:('axis', 1)}, {channel:('input_name', '_onx_cast_gt0'), channel:('axis', 1)}, {channel:('input_name', '_onx_mul_triu0'), channel:('axis', 1)}, {channel:('input_name', 'unsqueeze_2'), channel:('axis', 3)}, {channel:('input_name', 'expand'), channel:('axis', 3)}, {channel:('input_name', 'unsqueeze_4'), channel:('axis', 3)}, {channel:('input_name', '_onx_cast_unsqueeze_40'), channel:('axis', 3)}, {channel:('input_name', 'add_2'), channel:('axis', 3)}, {channel:('input_name', 'eq_7'), channel:('axis', 3)}, {channel:('input_name', 'masked_fill'), channel:('axis', 3)}, {channel:('input_name', 'expand_as'), channel:('axis', 3)}, {channel:('input_name', 'full'), channel:('axis', 1)}, {channel:('input_name', 'expand'), channel:('axis', 3)}, {channel:('input_name', '_onx_cast_gt0'), channel:('axis', 1)}, {channel:('input_name', 'unsqueeze_1'), channel:('axis', 2)}, {channel:('input_name', 'gt'), channel:('axis', 1)}, {channel:('input_name', 'expand_as'), channel:('axis', 3)}, {channel:('input_name', 'masked_fill'), channel:('axis', 3)}, {channel:('input_name', 'unsqueeze_3'), channel:('axis', 2)}, {channel:('input_name', 'unsqueeze_4'), channel:('axis', 3)}, {channel:('input_name', 'unsqueeze_2'), channel:('axis', 3)}, {channel:('input_name', 'mul_'), channel:('axis', 1)}, {channel:('input_name', 'clone'), channel:('axis', 3)}, {channel:('input_name', '_onx_cast_unsqueeze_40'), channel:('axis', 3)}, {channel:('input_name', 'eq_7'), channel:('axis', 3)}, {channel:('input_name', '_onx_mul_triu0'), channel:('axis', 1)}, {channel:('input_name', 'arange_1'), channel:('axis', 0)}, {channel:('input_name', 'add_2'), channel:('axis', 3)}, {channel:('input_name', 'triu'), channel:('axis', 1)}]
dynsrc: seq_length -> [{seq_length:('input_name', 'input_ids'), seq_length:('axis', 1)}, {seq_length:('input_name', 'output_0'), seq_length:('axis', 1)}, {seq_length:('input_name', 'embedding'), seq_length:('axis', 1)}, {seq_length:('input_name', 'arange'), seq_length:('axis', 0)}, {seq_length:('input_name', 'unsqueeze'), seq_length:('axis', 1)}, {seq_length:('input_name', 'full'), seq_length:('axis', 0)}, {seq_length:('input_name', 'triu'), seq_length:('axis', 0)}, {seq_length:('input_name', 'reshape'), seq_length:('axis', 0)}, {seq_length:('input_name', 'gt'), seq_length:('axis', 0)}, {seq_length:('input_name', '_onx_cast_gt0'), seq_length:('axis', 0)}, {seq_length:('input_name', '_onx_mul_triu0'), seq_length:('axis', 0)}, {seq_length:('input_name', 'unsqueeze_2'), seq_length:('axis', 2)}, {seq_length:('input_name', 'expand'), seq_length:('axis', 2)}, {seq_length:('input_name', 'add_2'), seq_length:('axis', 2)}, {seq_length:('input_name', 'eq_7'), seq_length:('axis', 2)}, {seq_length:('input_name', 'masked_fill'), seq_length:('axis', 2)}, {seq_length:('input_name', 'expand_as'), seq_length:('axis', 2)}, {seq_length:('input_name', 'wrap_with_set_grad_enabled#0'), seq_length:('axis', 1)}, {seq_length:('input_name', 'wrap_with_set_grad_enabled#1'), seq_length:('axis', 1)}, {seq_length:('input_name', 'unsqueeze_8'), seq_length:('axis', 2)}, {seq_length:('input_name', '_onx_div_sub_dropout00'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_matmul_layer_norm0'), seq_length:('axis', 1)}, {seq_length:('input_name', 'view'), seq_length:('axis', 1)}, {seq_length:('input_name', 'transpose_1'), seq_length:('axis', 2)}, {seq_length:('input_name', 'slice_17'), seq_length:('axis', 2)}, {seq_length:('input_name', 'slice_18'), seq_length:('axis', 2)}, {seq_length:('input_name', 'slice_21'), seq_length:('axis', 2)}, {seq_length:('input_name', 'slice_22'), seq_length:('axis', 2)}, {seq_length:('input_name', '_onx_matmul_layer_norm02'), seq_length:('axis', 1)}, 
{seq_length:('input_name', 'view_1'), seq_length:('axis', 1)}, {seq_length:('input_name', 'transpose_2'), seq_length:('axis', 2)}, {seq_length:('input_name', 'slice_19'), seq_length:('axis', 2)}, {seq_length:('input_name', 'slice_20'), seq_length:('axis', 2)}, {seq_length:('input_name', 'slice_23'), seq_length:('axis', 2)}, {seq_length:('input_name', 'slice_24'), seq_length:('axis', 2)}, {seq_length:('input_name', '_onx_matmul_layer_norm03'), seq_length:('axis', 1)}, {seq_length:('input_name', 'view_2'), seq_length:('axis', 1)}, {seq_length:('input_name', 'transpose_3'), seq_length:('axis', 2)}, {seq_length:('input_name', 'unsqueeze_9'), seq_length:('axis', 2)}, {seq_length:('input_name', 'mul_2'), seq_length:('axis', 2)}, {seq_length:('input_name', 'neg'), seq_length:('axis', 2)}, {seq_length:('input_name', 'cat_1'), seq_length:('axis', 2)}, {seq_length:('input_name', 'mul_3'), seq_length:('axis', 2)}, {seq_length:('input_name', 'add_3'), seq_length:('axis', 2)}, {seq_length:('input_name', 'mul_4'), seq_length:('axis', 2)}, {seq_length:('input_name', 'neg_1'), seq_length:('axis', 2)}, {seq_length:('input_name', 'cat_2'), seq_length:('axis', 2)}, {seq_length:('input_name', 'mul_5'), seq_length:('axis', 2)}, {seq_length:('input_name', 'add_4'), seq_length:('axis', 2)}, {seq_length:('input_name', 'cat_3'), seq_length:('axis', 2)}, {seq_length:('input_name', 'cat_4'), seq_length:('axis', 2)}, {seq_length:('input_name', 'scaled_dot_product_attention'), seq_length:('axis', 2)}, {seq_length:('input_name', 'transpose_4'), seq_length:('axis', 1)}, {seq_length:('input_name', 'reshape_1'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_matmul_layer_norm04'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_mul_linear_40'), seq_length:('axis', 1)}, {seq_length:('input_name', 'pow_1'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_mul_pow_10'), seq_length:('axis', 1)}, {seq_length:('input_name', 'add_6'), seq_length:('axis', 1)}, 
{seq_length:('input_name', '_onx_mul_add_60'), seq_length:('axis', 1)}, {seq_length:('input_name', 'tanh'), seq_length:('axis', 1)}, {seq_length:('input_name', 'add_7'), seq_length:('axis', 1)}, {seq_length:('input_name', 'mul_9'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_matmul_mul_90'), seq_length:('axis', 1)}, {seq_length:('input_name', 'add_8'), seq_length:('axis', 1)}, {seq_length:('input_name', 'add_9'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_div_sub_add_900'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_matmul_layer_norm_10'), seq_length:('axis', 1)}, {seq_length:('input_name', 'view_3'), seq_length:('axis', 1)}, {seq_length:('input_name', 'transpose_5'), seq_length:('axis', 2)}, {seq_length:('input_name', 'slice_28'), seq_length:('axis', 2)}, {seq_length:('input_name', 'slice_29'), seq_length:('axis', 2)}, {seq_length:('input_name', 'slice_32'), seq_length:('axis', 2)}, {seq_length:('input_name', 'slice_33'), seq_length:('axis', 2)}, {seq_length:('input_name', '_onx_matmul_layer_norm_102'), seq_length:('axis', 1)}, {seq_length:('input_name', 'view_4'), seq_length:('axis', 1)}, {seq_length:('input_name', 'transpose_6'), seq_length:('axis', 2)}, {seq_length:('input_name', 'slice_30'), seq_length:('axis', 2)}, {seq_length:('input_name', 'slice_31'), seq_length:('axis', 2)}, {seq_length:('input_name', 'slice_34'), seq_length:('axis', 2)}, {seq_length:('input_name', 'slice_35'), seq_length:('axis', 2)}, {seq_length:('input_name', '_onx_matmul_layer_norm_103'), seq_length:('axis', 1)}, {seq_length:('input_name', 'view_5'), seq_length:('axis', 1)}, {seq_length:('input_name', 'transpose_7'), seq_length:('axis', 2)}, {seq_length:('input_name', 'unsqueeze_11'), seq_length:('axis', 2)}, {seq_length:('input_name', 'mul_10'), seq_length:('axis', 2)}, {seq_length:('input_name', 'neg_2'), seq_length:('axis', 2)}, {seq_length:('input_name', 'cat_7'), seq_length:('axis', 2)}, {seq_length:('input_name', 'mul_11'), 
seq_length:('axis', 2)}, {seq_length:('input_name', 'add_10'), seq_length:('axis', 2)}, {seq_length:('input_name', 'mul_12'), seq_length:('axis', 2)}, {seq_length:('input_name', 'neg_3'), seq_length:('axis', 2)}, {seq_length:('input_name', 'cat_8'), seq_length:('axis', 2)}, {seq_length:('input_name', 'mul_13'), seq_length:('axis', 2)}, {seq_length:('input_name', 'add_11'), seq_length:('axis', 2)}, {seq_length:('input_name', 'cat_9'), seq_length:('axis', 2)}, {seq_length:('input_name', 'cat_10'), seq_length:('axis', 2)}, {seq_length:('input_name', 'scaled_dot_product_attention_1'), seq_length:('axis', 2)}, {seq_length:('input_name', 'transpose_8'), seq_length:('axis', 1)}, {seq_length:('input_name', 'reshape_2'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_matmul_layer_norm_104'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_mul_linear_100'), seq_length:('axis', 1)}, {seq_length:('input_name', 'pow_2'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_mul_pow_20'), seq_length:('axis', 1)}, {seq_length:('input_name', 'add_12'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_mul_add_120'), seq_length:('axis', 1)}, {seq_length:('input_name', 'tanh_1'), seq_length:('axis', 1)}, {seq_length:('input_name', 'add_13'), seq_length:('axis', 1)}, {seq_length:('input_name', 'mul_17'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_matmul_mul_170'), seq_length:('axis', 1)}, {seq_length:('input_name', 'add_14'), seq_length:('axis', 1)}, {seq_length:('input_name', 'add_15'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_div_sub_add_1500'), seq_length:('axis', 1)}, {seq_length:('input_name', 'linear_6'), seq_length:('axis', 1)}, {seq_length:('input_name', 'contiguous_1'), seq_length:('axis', 1)}, {seq_length:('input_name', 'view_4'), seq_length:('axis', 1)}, {seq_length:('input_name', 'slice_31'), seq_length:('axis', 2)}, {seq_length:('input_name', 'neg_3'), seq_length:('axis', 2)}, {seq_length:('input_name', 
'cat_1'), seq_length:('axis', 2)}, {seq_length:('input_name', '_onx_matmul_layer_norm03'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_mul_div_sub_dropout000'), seq_length:('axis', 1)}, {seq_length:('input_name', 'slice_21'), seq_length:('axis', 2)}, {seq_length:('input_name', 'scaled_dot_product_attention'), seq_length:('axis', 2)}, {seq_length:('input_name', '_onx_sub_add_150'), seq_length:('axis', 1)}, {seq_length:('input_name', 'mul_4'), seq_length:('axis', 2)}, {seq_length:('input_name', 'full'), seq_length:('axis', 0)}, {seq_length:('input_name', 'expand'), seq_length:('axis', 2)}, {seq_length:('input_name', 'linear_9'), seq_length:('axis', 1)}, {seq_length:('input_name', 'add_11'), seq_length:('axis', 2)}, {seq_length:('input_name', 'transpose_6'), seq_length:('axis', 2)}, {seq_length:('input_name', '_onx_matmul_layer_norm02'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_sqrt_add_reducemean_pow_sub_dropout00000'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_reducemean_dropout0'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_matmul_layer_norm_104'), seq_length:('axis', 1)}, {seq_length:('input_name', 'layer_norm_2'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_mul_pow_10'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_mul_div_sub_add_15000'), seq_length:('axis', 1)}, {seq_length:('input_name', 'dropout'), seq_length:('axis', 1)}, {seq_length:('input_name', 'add_13'), seq_length:('axis', 1)}, {seq_length:('input_name', 'pow_2'), seq_length:('axis', 1)}, {seq_length:('input_name', 'linear_10'), seq_length:('axis', 1)}, {seq_length:('input_name', 'neg_1'), seq_length:('axis', 2)}, {seq_length:('input_name', '_onx_matmul_mul_170'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_mul_linear_100'), seq_length:('axis', 1)}, {seq_length:('input_name', 'linear_7'), seq_length:('axis', 1)}, {seq_length:('input_name', 'embedding'), seq_length:('axis', 1)}, 
{seq_length:('input_name', 'linear_3'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_matmul_layer_norm_102'), seq_length:('axis', 1)}, {seq_length:('input_name', 'tanh_1'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_mul_pow_20'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_cast_gt0'), seq_length:('axis', 0)}, {seq_length:('input_name', 'unsqueeze_10'), seq_length:('axis', 2)}, {seq_length:('input_name', 'unsqueeze_1'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_pow_sub_add_900'), seq_length:('axis', 1)}, {seq_length:('input_name', 'reshape_2'), seq_length:('axis', 1)}, {seq_length:('input_name', 'view_5'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_sub_dropout0'), seq_length:('axis', 1)}, {seq_length:('input_name', 'gt'), seq_length:('axis', 0)}, {seq_length:('input_name', 'transpose_1'), seq_length:('axis', 2)}, {seq_length:('input_name', 'slice_34'), seq_length:('axis', 2)}, {seq_length:('input_name', '_onx_matmul_layer_norm04'), seq_length:('axis', 1)}, {seq_length:('input_name', 'expand_as'), seq_length:('axis', 2)}, {seq_length:('input_name', 'mul_9'), seq_length:('axis', 1)}, {seq_length:('input_name', 'unsqueeze'), seq_length:('axis', 1)}, {seq_length:('input_name', 'linear'), seq_length:('axis', 1)}, {seq_length:('input_name', 'layer_norm_1'), seq_length:('axis', 1)}, {seq_length:('input_name', 'linear_12'), seq_length:('axis', 1)}, {seq_length:('input_name', 'linear_8'), seq_length:('axis', 1)}, {seq_length:('input_name', 'slice_17'), seq_length:('axis', 2)}, {seq_length:('input_name', 'add_12'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_pow_sub_dropout00'), seq_length:('axis', 1)}, {seq_length:('input_name', 'cat_8'), seq_length:('axis', 2)}, {seq_length:('input_name', 'to_5'), seq_length:('axis', 1)}, {seq_length:('input_name', 'view_2'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_matmul_layer_norm_20'), seq_length:('axis', 1)}, 
{seq_length:('input_name', 'to_4'), seq_length:('axis', 1)}, {seq_length:('input_name', 'mul_16'), seq_length:('axis', 1)}, {seq_length:('input_name', 'dropout_2'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_mul_div_sub_add_9000'), seq_length:('axis', 1)}, {seq_length:('input_name', 'masked_fill'), seq_length:('axis', 2)}, {seq_length:('input_name', 'slice_19'), seq_length:('axis', 2)}, {seq_length:('input_name', 'view_3'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_matmul_layer_norm_103'), seq_length:('axis', 1)}, {seq_length:('input_name', 'transpose_8'), seq_length:('axis', 1)}, {seq_length:('input_name', 'reshape_1'), seq_length:('axis', 1)}, {seq_length:('input_name', 'slice_22'), seq_length:('axis', 2)}, {seq_length:('input_name', 'wrap_with_set_grad_enabled#0'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_matmul_reshape_20'), seq_length:('axis', 1)}, {seq_length:('input_name', 'transpose_7'), seq_length:('axis', 2)}, {seq_length:('input_name', 'slice_32'), seq_length:('axis', 2)}, {seq_length:('input_name', 'layer_norm'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_sqrt_add_reducemean_pow_sub_add_1500000'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_sqrt_add_reducemean_pow_sub_add_900000'), seq_length:('axis', 1)}, {seq_length:('input_name', 'linear_1'), seq_length:('axis', 1)}, {seq_length:('input_name', 'transpose_4'), seq_length:('axis', 1)}, {seq_length:('input_name', 'cat_10'), seq_length:('axis', 2)}, {seq_length:('input_name', '_onx_matmul_layer_norm_10'), seq_length:('axis', 1)}, {seq_length:('input_name', 'view'), seq_length:('axis', 1)}, {seq_length:('input_name', 'mul_6'), seq_length:('axis', 1)}, {seq_length:('input_name', 'dropout_3'), seq_length:('axis', 1)}, {seq_length:('input_name', 'slice_33'), seq_length:('axis', 2)}, {seq_length:('input_name', 'cat_3'), seq_length:('axis', 2)}, {seq_length:('input_name', 'unsqueeze_2'), seq_length:('axis', 2)}, 
{seq_length:('input_name', 'transpose_2'), seq_length:('axis', 2)}, {seq_length:('input_name', 'reshape'), seq_length:('axis', 0)}, {seq_length:('input_name', '_onx_add_reducemean_pow_sub_dropout0000'), seq_length:('axis', 1)}, {seq_length:('input_name', 'slice_35'), seq_length:('axis', 2)}, {seq_length:('input_name', 'linear_4'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_mul_cat_90'), seq_length:('axis', 2)}, {seq_length:('input_name', 'linear_5'), seq_length:('axis', 1)}, {seq_length:('input_name', 'neg'), seq_length:('axis', 2)}, {seq_length:('input_name', 'slice_23'), seq_length:('axis', 2)}, {seq_length:('input_name', 'mul_15'), seq_length:('axis', 1)}, {seq_length:('input_name', 'arange'), seq_length:('axis', 0)}, {seq_length:('input_name', 'mul_2'), seq_length:('axis', 2)}, {seq_length:('input_name', 'add_4'), seq_length:('axis', 2)}, {seq_length:('input_name', 'cat_7'), seq_length:('axis', 2)}, {seq_length:('input_name', 'slice_18'), seq_length:('axis', 2)}, {seq_length:('input_name', '_onx_matmul_layer_norm0'), seq_length:('axis', 1)}, {seq_length:('input_name', 'mul_5'), seq_length:('axis', 2)}, {seq_length:('input_name', 'slice_20'), seq_length:('axis', 2)}, {seq_length:('input_name', 'mul_8'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_reducemean_pow_sub_add_9000'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_matmul_mul_90'), seq_length:('axis', 1)}, {seq_length:('input_name', 'mul_'), seq_length:('axis', 0)}, {seq_length:('input_name', 'mul_14'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_mul_add_60'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_reducemean_pow_sub_add_15000'), seq_length:('axis', 1)}, {seq_length:('input_name', 'slice_28'), seq_length:('axis', 2)}, {seq_length:('input_name', '_onx_add_mul_div_sub_dropout0000'), seq_length:('axis', 1)}, {seq_length:('input_name', 'add_3'), seq_length:('axis', 2)}, {seq_length:('input_name', 'mul_11'), seq_length:('axis', 2)}, 
{seq_length:('input_name', 'transpose_3'), seq_length:('axis', 2)}, {seq_length:('input_name', '_onx_add_mul_div_sub_add_150000'), seq_length:('axis', 1)}, {seq_length:('input_name', 'clone'), seq_length:('axis', 2)}, {seq_length:('input_name', 'slice_24'), seq_length:('axis', 2)}, {seq_length:('input_name', 'unsqueeze_8'), seq_length:('axis', 2)}, {seq_length:('input_name', 'dropout_4'), seq_length:('axis', 1)}, {seq_length:('input_name', 'cat_9'), seq_length:('axis', 2)}, {seq_length:('input_name', 'eq_7'), seq_length:('axis', 2)}, {seq_length:('input_name', 'wrap_with_set_grad_enabled#1'), seq_length:('axis', 1)}, {seq_length:('input_name', 'unsqueeze_11'), seq_length:('axis', 2)}, {seq_length:('input_name', 'linear_11'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_div_sub_add_1500'), seq_length:('axis', 1)}, {seq_length:('input_name', 'mul_12'), seq_length:('axis', 2)}, {seq_length:('input_name', 'slice_30'), seq_length:('axis', 2)}, {seq_length:('input_name', '_onx_sub_add_90'), seq_length:('axis', 1)}, {seq_length:('input_name', 'unsqueeze_9'), seq_length:('axis', 2)}, {seq_length:('input_name', 'neg_2'), seq_length:('axis', 2)}, {seq_length:('input_name', '_onx_reducemean_add_90'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_mul_triu0'), seq_length:('axis', 0)}, {seq_length:('input_name', 'view_1'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_reducemean_pow_sub_dropout000'), seq_length:('axis', 1)}, {seq_length:('input_name', 'mul_3'), seq_length:('axis', 2)}, {seq_length:('input_name', 'add_8'), seq_length:('axis', 1)}, {seq_length:('input_name', 'scaled_dot_product_attention_1'), seq_length:('axis', 2)}, {seq_length:('input_name', '_onx_mul_add_120'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_add_reducemean_pow_sub_add_90000'), seq_length:('axis', 1)}, {seq_length:('input_name', 'mul_13'), seq_length:('axis', 2)}, {seq_length:('input_name', 'linear_2'), seq_length:('axis', 1)}, 
{seq_length:('input_name', 'mul_7'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_mul_cat_30'), seq_length:('axis', 2)}, {seq_length:('input_name', '_onx_div_sub_dropout00'), seq_length:('axis', 1)}, {seq_length:('input_name', 'cat_2'), seq_length:('axis', 2)}, {seq_length:('input_name', 'add_7'), seq_length:('axis', 1)}, {seq_length:('input_name', 'tanh'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_div_sub_add_900'), seq_length:('axis', 1)}, {seq_length:('input_name', 'mul_10'), seq_length:('axis', 2)}, {seq_length:('input_name', 'slice_29'), seq_length:('axis', 2)}, {seq_length:('input_name', 'add_9'), seq_length:('axis', 1)}, {seq_length:('input_name', 'transpose_5'), seq_length:('axis', 2)}, {seq_length:('input_name', 'add_6'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_pow_sub_add_1500'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_add_mul_div_sub_add_90000'), seq_length:('axis', 1)}, {seq_length:('input_name', 'add_2'), seq_length:('axis', 2)}, {seq_length:('input_name', 'triu'), seq_length:('axis', 0)}, {seq_length:('input_name', 'add_14'), seq_length:('axis', 1)}, {seq_length:('input_name', 'add_15'), seq_length:('axis', 1)}, {seq_length:('input_name', 'mul_17'), seq_length:('axis', 1)}, {seq_length:('input_name', 'dropout_1'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_matmul_reshape_10'), seq_length:('axis', 1)}, {seq_length:('input_name', 'add_10'), seq_length:('axis', 2)}, {seq_length:('input_name', '_onx_reducemean_add_150'), seq_length:('axis', 1)}, {seq_length:('input_name', 'contiguous'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_mul_linear_40'), seq_length:('axis', 1)}, {seq_length:('input_name', '_onx_add_reducemean_pow_sub_add_150000'), seq_length:('axis', 1)}, {seq_length:('input_name', 'pow_1'), seq_length:('axis', 1)}, {seq_length:('input_name', 'cat_4'), seq_length:('axis', 2)}]
opset: : 18
opset: local_functions.0: 1
opset: local_functions: 1
init: b_model_rotary_emb_inv_freq: ?: ?                                -- GraphBuilder._update_structures_with_proto.1/from(b_model_rotary_emb_inv_freq)
init: init7_s1_1: int64: 1                                             -- GraphBuilder._update_structures_with_proto.1/from(init7_s1_1)##GraphBuilder.compute_constant/from(init7_s1_1)##GraphBuilder._update_structures_with_proto.1/from(init7_s1_1)
init: init7_s_1: ?: ?                                                  -- GraphBuilder._update_structures_with_proto.1/from(init7_s_1)
init: init7_s1_0: ?: ?                                                 -- GraphBuilder._update_structures_with_proto.1/from(init7_s1_0)
init: init7_s_0: ?: ?                                                  -- GraphBuilder._update_structures_with_proto.1/from(init7_s_0)
init: init7_s2_-1_1: int64: 2                                          -- GraphBuilder._update_structures_with_proto.1/from(init7_s2_-1_1)##GraphBuilder.compute_constant/from(init7_s2_-1_1)##GraphBuilder._update_structures_with_proto.1/from(init7_s2_-1_1)
init: init1_s_: ?: ?                                                   -- GraphBuilder._update_structures_with_proto.1/from(init1_s_)
init: init1_s1_: ?: ?                                                  -- GraphBuilder._update_structures_with_proto.1/from(init1_s1_)
init: init7_s1_-1: ?: ?                                                -- GraphBuilder._update_structures_with_proto.1/from(init7_s1_-1)
init: init7_s1_80: ?: ?                                                -- GraphBuilder._update_structures_with_proto.1/from(init7_s1_80)
init: init1_s_3: ?: ?                                                  -- GraphBuilder._update_structures_with_proto.1/from(init1_s_3)
init: init1_s1_5: ?: ?                                                 -- GraphBuilder._update_structures_with_proto.1/from(init1_s1_5)
init: init1_s_4: ?: ?                                                  -- GraphBuilder._update_structures_with_proto.1/from(init1_s_4)
init: init1_s_5: ?: ?                                                  -- GraphBuilder._update_structures_with_proto.1/from(init1_s_5)
init: init1_s_6: ?: ?                                                  -- GraphBuilder._update_structures_with_proto.1/from(init1_s_6)
init: init7_s2_0_1: ?: ?                                               -- GraphBuilder._update_structures_with_proto.1/from(init7_s2_0_1)
init: init7_s2_1_2: ?: ?                                               -- GraphBuilder._update_structures_with_proto.1/from(init7_s2_1_2)
init: init1_s2560_: ?: ?                                               -- GraphBuilder._update_structures_with_proto.1/from(init1_s2560_)
init: init1_s2560_2: ?: ?                                              -- GraphBuilder._update_structures_with_proto.1/from(init1_s2560_2)
init: init1_s2560_3: ?: ?                                              -- GraphBuilder._update_structures_with_proto.1/from(init1_s2560_3)
init: init1_s2560_4: ?: ?                                              -- GraphBuilder._update_structures_with_proto.1/from(init1_s2560_4)
init: init1_s2560_5: ?: ?                                              -- GraphBuilder._update_structures_with_proto.1/from(init1_s2560_5)
init: init1_s2560_6: ?: ?                                              -- GraphBuilder._update_structures_with_proto.1/from(init1_s2560_6)
init: init1_s1_6: ?: ?                                                 -- GraphBuilder._update_structures_with_proto.1/from(init1_s1_6)
init: init7_s2_32_48: ?: ?                                             -- GraphBuilder._update_structures_with_proto.1/from(init7_s2_32_48)
init: init7_s2_16_16: ?: ?                                             -- GraphBuilder._update_structures_with_proto.1/from(init7_s2_16_16)
init: init7_s2_-1_2560: int64: 2                                       -- GraphBuilder._update_structures_with_proto.1/from(init7_s2_-1_2560)##GraphBuilder.compute_constant/from(init7_s2_-1_2560)##GraphBuilder._update_structures_with_proto.1/from(init7_s2_-1_2560)
init: init7_s1_-12: ?: ?                                               -- GraphBuilder._update_structures_with_proto.1/from(init7_s1_-12)
init: model.embed_tokens.weight: ?: ?                                  -- GraphBuilder._update_structures_with_proto.1/from(model.embed_tokens.weight)
init: model.layers.0.self_attn.q_proj.weight: ?: ?                     -- GraphBuilder._update_structures_with_proto.1/from(model.layers.0.self_attn.q_proj.weight)
init: model.layers.0.self_attn.k_proj.weight: ?: ?                     -- GraphBuilder._update_structures_with_proto.1/from(model.layers.0.self_attn.k_proj.weight)
init: model.layers.0.self_attn.v_proj.weight: ?: ?                     -- GraphBuilder._update_structures_with_proto.1/from(model.layers.0.self_attn.v_proj.weight)
init: model.layers.0.self_attn.dense.weight: ?: ?                      -- GraphBuilder._update_structures_with_proto.1/from(model.layers.0.self_attn.dense.weight)
init: model.layers.0.mlp.fc1.weight: ?: ?                              -- GraphBuilder._update_structures_with_proto.1/from(model.layers.0.mlp.fc1.weight)
init: model.layers.0.mlp.fc2.weight: ?: ?                              -- GraphBuilder._update_structures_with_proto.1/from(model.layers.0.mlp.fc2.weight)
init: model.layers.1.self_attn.q_proj.weight: ?: ?                     -- GraphBuilder._update_structures_with_proto.1/from(model.layers.1.self_attn.q_proj.weight)
init: model.layers.1.self_attn.k_proj.weight: ?: ?                     -- GraphBuilder._update_structures_with_proto.1/from(model.layers.1.self_attn.k_proj.weight)
init: model.layers.1.self_attn.v_proj.weight: ?: ?                     -- GraphBuilder._update_structures_with_proto.1/from(model.layers.1.self_attn.v_proj.weight)
init: model.layers.1.self_attn.dense.weight: ?: ?                      -- GraphBuilder._update_structures_with_proto.1/from(model.layers.1.self_attn.dense.weight)
init: model.layers.1.mlp.fc1.weight: ?: ?                              -- GraphBuilder._update_structures_with_proto.1/from(model.layers.1.mlp.fc1.weight)
init: model.layers.1.mlp.fc2.weight: ?: ?                              -- GraphBuilder._update_structures_with_proto.1/from(model.layers.1.mlp.fc2.weight)
init: lm_head.weight: ?: ?                                             -- GraphBuilder._update_structures_with_proto.1/from(lm_head.weight)
input:: input_ids                                                               |T7: batch x seq_length
input:: attention_mask                                                          |T7: batch x channel
input:: past_key_values_key_cache_0                                             |T1: batch x 32 x cache_length x 80
input:: past_key_values_key_cache_1                                             |T1: batch x 32 x cache_length x 80
input:: past_key_values_value_cache_0                                           |T1: batch x 32 x cache_length x 80
input:: past_key_values_value_cache_1                                           |T1: batch x 32 x cache_length x 80
Shape: input_ids -> _shape_input_ids0                                           |T7: 1                        - sym_size_int
Shape: input_ids -> _shape_input_ids02                                          |T7: 1                        - sym_size_int3
Squeeze: _shape_input_ids02 -> sym_size_int_20                                  |T7:                          - sym_size_int4
Unsqueeze: sym_size_int_20, init7_s1_0 -> _onx_unsqueeze_sym_size_int_200       |T7: 1                        - _mkshape1_sym_size_int_20
Shape: past_key_values_key_cache_0 -> _shape_past_key_values_key_cache_00       |T7: 1                        - sym_size_int5
Squeeze: _shape_past_key_values_key_cache_00 -> sym_size_int_21                 |T7:                          - sym_size_int6
Gather: model.embed_tokens.weight, input_ids -> embedding                       |T1: batch x seq_length x 2560- embedding
Add: sym_size_int_21, sym_size_int_20 -> add                                    |T7:                          - add3
Range: sym_size_int_21, add, init7_s_1 -> arange                                |T7: seq_length               - arange
Unsqueeze: arange, init7_s1_0 -> unsqueeze                                      |T7: 1 x seq_length           - unsqueeze
Unsqueeze: add, init7_s1_0 -> _onx_unsqueeze_add0                               |T7: 1                        - _mkshape1_add
Concat: _onx_unsqueeze_sym_size_int_200, _onx_unsqueeze_add0 -> _onx_concat_unsqueeze_sym_size_int_2000                       |T7: 2- _mkshape_add
ConstantOfShape: _onx_concat_unsqueeze_sym_size_int_2000 -> full                |T1: seq_length x channel     - fullD2
Trilu: full, init7_s_1 -> triu                                                  |T1: seq_length x channel     - triu
Range: init7_s_0, add, init7_s_1 -> arange_1                                    |T7: channel                  - arange2
Reshape: arange, init7_s2_-1_1 -> reshape                                       |T7: seq_length x 1           - reshape
Greater: arange_1, reshape -> gt                                                |T9: seq_length x channel     - gt_Tensor
Cast: gt -> _onx_cast_gt0                                                       |T1: seq_length x channel     - mul__Tensor
Mul: triu, _onx_cast_gt0 -> _onx_mul_triu0                                      |T1: seq_length x channel     - mul__Tensor2
Unsqueeze: _onx_mul_triu0, init7_s2_0_1 -> unsqueeze_2                          |T1: 1 x 1 x seq_length x channel- UnsqueezeUnsqueezePattern--unsqueeze2
Shape: unsqueeze_2 -> _shape_unsqueeze_20                                       |T7: 1                        - expand_B
Shape: unsqueeze_2 -> _shape_unsqueeze_202                                      |T7: 1                        - expand_B2
Concat: _shape_input_ids0, init7_s1_1, _shape_unsqueeze_20, _shape_unsqueeze_202 -> _onx_concat_unsqueeze_sym_size_int_1900                                           |T7: 4- _mkshape__shape_unsqueeze_202
Expand: unsqueeze_2, _onx_concat_unsqueeze_sym_size_int_1900 -> expand          |T1: batch x 1 x seq_length x channel- expand_B_neg
Unsqueeze: attention_mask, init7_s2_1_2 -> unsqueeze_4                          |T7: batch x 1 x 1 x channel  - UnsqueezeUnsqueezePattern--unsqueeze4
Cast: unsqueeze_4 -> _onx_cast_unsqueeze_40                                     |T1: batch x 1 x 1 x channel  - Opset
Add: expand, _onx_cast_unsqueeze_40 -> add_2                                    |T1: batch x 1 x seq_length x channel- add_Tensor
Reshape: init1_s_, init7_s1_1 -> _reshape_init1_s_0                             |T1: 1                        - Opset2
Equal: add_2, _reshape_init1_s_0 -> eq_7                                        |T9: batch x 1 x seq_length x channel- eq
Where: eq_7, init1_s1_, expand -> masked_fill                                   |T1: batch x 1 x seq_length x channel- masked_fill_Scalar
Shape: expand -> _shape_clone0                                                  |T7: 4                        - aten_meth_expand_as
Expand: masked_fill, _shape_clone0 -> expand_as                                 |T1: batch x 1 x seq_length x channel- aten_meth_expand_as2
submod_3[local_functions]: b_model_rotary_emb_inv_freq, unsqueeze -> wrap_with_set_grad_enabled#0, wrap_with_set_grad_enabled#1                                               |T1: 1 x seq_length x 32 T1: 1 x seq_length x 32- wrap_with_set_grad_enabled
Unsqueeze: wrap_with_set_grad_enabled#0, init7_s1_1 -> unsqueeze_8              |T1: 1 x 1 x seq_length x 32  - unsqueeze6
LayerNormalization: embedding, init1_s2560_, init1_s2560_2 -> _onx_div_sub_dropout00    |T1: batch x seq_length x 2560- LayerNormalizationPattern--layer_norm4
Transpose: model.layers.0.self_attn.q_proj.weight -> _onx_transpose_p_model_layers_0_self_attn_q_proj_weight0                             |T1: 2560 x 2560- linear
MatMul: _onx_div_sub_dropout00, _onx_transpose_p_model_layers_0_self_attn_q_proj_weight0 -> _onx_matmul_layer_norm0                                   |T1: batch x seq_length x 2560- Opset3
Concat: _shape_input_ids0, _onx_unsqueeze_sym_size_int_200, init7_s1_-1, init7_s1_80 -> _onx_concat_unsqueeze_sym_size_int_19020                                                |T7: 4- _mkshape_sym_size_int_20
Reshape: _onx_matmul_layer_norm0, _onx_concat_unsqueeze_sym_size_int_19020 -> view  |T1: batch x seq_length x 32 x 80- view
Transpose: view -> transpose_1                                                  |T1: batch x 32 x seq_length x 80- transpose_int
Split: transpose_1, init7_s2_32_48 -> slice_17, slice_18                        |T1: batch x 32 x seq_length x 32 T1: batch x 32 x seq_length x 48- SlicesSplitPattern--slice_Tensor
Split: slice_17, init7_s2_16_16 -> slice_21, slice_22                           |T1: batch x 32 x seq_length x 16 T1: batch x 32 x seq_length x 16- SlicesSplitPattern--slice_Tensor5
Transpose: model.layers.0.self_attn.k_proj.weight -> _onx_transpose_p_model_layers_0_self_attn_k_proj_weight0                             |T1: 2560 x 2560- linear2
MatMul: _onx_div_sub_dropout00, _onx_transpose_p_model_layers_0_self_attn_k_proj_weight0 -> _onx_matmul_layer_norm02                                    |T1: batch x seq_length x 2560- Opset5
Reshape: _onx_matmul_layer_norm02, _onx_concat_unsqueeze_sym_size_int_19020 -> view_1     |T1: batch x seq_length x 32 x 80- view2
Transpose: view_1 -> transpose_2                                                |T1: batch x 32 x seq_length x 80- transpose_int2
Split: transpose_2, init7_s2_32_48 -> slice_19, slice_20                        |T1: batch x 32 x seq_length x 32 T1: batch x 32 x seq_length x 48- SlicesSplitPattern--slice_Tensor3
Split: slice_19, init7_s2_16_16 -> slice_23, slice_24                           |T1: batch x 32 x seq_length x 16 T1: batch x 32 x seq_length x 16- SlicesSplitPattern--slice_Tensor7
Transpose: model.layers.0.self_attn.v_proj.weight -> _onx_transpose_p_model_layers_0_self_attn_v_proj_weight0                             |T1: 2560 x 2560- linear3
MatMul: _onx_div_sub_dropout00, _onx_transpose_p_model_layers_0_self_attn_v_proj_weight0 -> _onx_matmul_layer_norm03                                    |T1: batch x seq_length x 2560- Opset7
Reshape: _onx_matmul_layer_norm03, _onx_concat_unsqueeze_sym_size_int_19020 -> view_2     |T1: batch x seq_length x 32 x 80- view3
Transpose: view_2 -> transpose_3                                                |T1: batch x 32 x seq_length x 80- transpose_int3
Unsqueeze: wrap_with_set_grad_enabled#1, init7_s1_1 -> unsqueeze_9              |T1: 1 x 1 x seq_length x 32  - unsqueeze7
Mul: slice_17, unsqueeze_8 -> mul_2                                             |T1: batch x 32 x seq_length x 32- mul_Tensor
Neg: slice_22 -> neg                                                            |T1: batch x 32 x seq_length x 16- neg
Concat: neg, slice_21 -> cat_1                                                  |T1: batch x 32 x seq_length x 32- cat
Mul: cat_1, unsqueeze_9 -> mul_3                                                |T1: batch x 32 x seq_length x 32- mul_Tensor2
Add: mul_2, mul_3 -> add_3                                                      |T1: batch x 32 x seq_length x 32- add_Tensor2
Mul: slice_19, unsqueeze_8 -> mul_4                                             |T1: batch x 32 x seq_length x 32- mul_Tensor3
Neg: slice_24 -> neg_1                                                          |T1: batch x 32 x seq_length x 16- neg2
Concat: neg_1, slice_23 -> cat_2                                                |T1: batch x 32 x seq_length x 32- cat2
Mul: cat_2, unsqueeze_9 -> mul_5                                                |T1: batch x 32 x seq_length x 32- mul_Tensor4
Add: mul_4, mul_5 -> add_4                                                      |T1: batch x 32 x seq_length x 32- add_Tensor3
Concat: add_3, slice_18 -> cat_3                                                |T1: batch x 32 x seq_length x 80- cat3
Concat: add_4, slice_20 -> cat_4                                                |T1: batch x 32 x seq_length x 80- cat4
Concat: past_key_values_key_cache_0, cat_4 -> output_1                          |T1: batch x 32 x cache_length+seq_length x 80- cat5
Concat: past_key_values_value_cache_0, transpose_3 -> output_3                  |T1: batch x 32 x cache_length+seq_length x 80- cat6
Transpose: output_1 -> _onx_transpose_cat_50                                    |T1: batch x 32 x 80 x cache_length+seq_length- aten_scaled_dot_product_attention
MatMul: cat_3, _onx_transpose_cat_50 -> MulMulMatMulPattern__onx_matmul_mul_cat_300   |T1: batch x 32 x seq_length x cache_length+seq_length- MulMulMatMulPattern--aten_scaled_dot_product_attention5-1
Mul: MulMulMatMulPattern__onx_matmul_mul_cat_300, init1_s1_6 -> _onx_matmul_mul_cat_300       |T1: batch x 32 x seq_length x cache_length+seq_length- MulMulMatMulPattern--aten_scaled_dot_product_attention5-2
Add: _onx_matmul_mul_cat_300, expand_as -> _onx_add_matmul_mul_cat_3000         |T1: batch x 32 x seq_length x cache_length+seq_length- aten_scaled_dot_product_attention6
Softmax: _onx_add_matmul_mul_cat_3000 -> _onx_softmax_add_matmul_mul_cat_30000  |T1: batch x 32 x seq_length x cache_length+seq_length- aten_scaled_dot_product_attention7
MatMul: _onx_softmax_add_matmul_mul_cat_30000, output_3 -> scaled_dot_product_attention       |T1: batch x 32 x seq_length x 80- aten_scaled_dot_product_attention8
Transpose: scaled_dot_product_attention -> transpose_4                          |T1: batch x seq_length x 32 x 80- transpose_int4
Concat: _shape_input_ids0, _onx_unsqueeze_sym_size_int_200, init7_s1_-1 -> _onx_concat_unsqueeze_sym_size_int_19030                                   |T7: 3- _mkshape_sym_size_int_202
Reshape: transpose_4, _onx_concat_unsqueeze_sym_size_int_19030 -> reshape_1     |T1: batch x seq_length x 2560- reshape2
Transpose: model.layers.0.mlp.fc1.weight -> _onx_transpose_p_model_layers_0_mlp_fc1_weight0           |T1: 2560 x 10240- linear5
MatMul: _onx_div_sub_dropout00, _onx_transpose_p_model_layers_0_mlp_fc1_weight0 -> _onx_matmul_layer_norm04                           |T1: batch x seq_length x 10240- Opset11
Reshape: init1_s_3, init7_s1_1 -> _reshape_init1_s_30                           |T1: 1                        - mul_Tensor5
Mul: _onx_matmul_layer_norm04, _reshape_init1_s_30 -> _onx_mul_linear_40        |T1: batch x seq_length x 10240- mul_Tensor6
Pow: _onx_matmul_layer_norm04, init1_s1_5 -> pow_1                              |T1: batch x seq_length x 10240- pow_Tensor_Scalar
Reshape: init1_s_4, init7_s1_1 -> _reshape_init1_s_40                           |T1: 1                        - mul_Tensor8
Mul: pow_1, _reshape_init1_s_40 -> _onx_mul_pow_10                              |T1: batch x seq_length x 10240- mul_Tensor9
Add: _onx_matmul_layer_norm04, _onx_mul_pow_10 -> add_6                         |T1: batch x seq_length x 10240- add_Tensor4
Reshape: init1_s_5, init7_s1_1 -> _reshape_init1_s_50                           |T1: 1                        - mul_Tensor11
Mul: add_6, _reshape_init1_s_50 -> _onx_mul_add_60                              |T1: batch x seq_length x 10240- mul_Tensor12
Tanh: _onx_mul_add_60 -> tanh                                                   |T1: batch x seq_length x 10240- tanh
Reshape: init1_s_6, init7_s1_1 -> _reshape_init1_s_60                           |T1: 1                        - Opset13
Add: tanh, _reshape_init1_s_60 -> add_7                                         |T1: batch x seq_length x 10240- add_Tensor5
Mul: _onx_mul_linear_40, add_7 -> mul_9                                         |T1: batch x seq_length x 10240- mul_Tensor14
Transpose: model.layers.0.mlp.fc2.weight -> _onx_transpose_p_model_layers_0_mlp_fc2_weight0           |T1: 10240 x 2560- linear6
MatMul: mul_9, _onx_transpose_p_model_layers_0_mlp_fc2_weight0 -> _onx_matmul_mul_90    |T1: batch x seq_length x 2560- Opset14
Reshape: reshape_1, init7_s2_-1_2560 -> MatMulAddPattern--reshape_1             |T1: batch*seq_length x 2560  - MatMulAddPattern--Opset9
Reshape: _onx_matmul_mul_90, init7_s2_-1_2560 -> MatMulAddPattern--reshape_12   |T1: batch*seq_length x 2560  - MatMulAddPattern--Opset92
Shape: reshape_1 -> MatMulAddPattern--reshape_14                                |T7: 2                        - MatMulAddPattern--Opset93
Concat: MatMulAddPattern--reshape_14, init7_s1_-12 -> MatMulAddPattern--reshape_15  |T7: 3                    - MatMulAddPattern--Opset94
Gemm: MatMulAddPattern--reshape_1, model.layers.0.self_attn.dense.weight, MatMulAddPattern--reshape_12 -> MatMulAddPattern--reshape_13                                                      |T1: batch*seq_length x 2560- GemmTransposePattern--MatMulAddPattern--Opset962
Reshape: MatMulAddPattern--reshape_13, MatMulAddPattern--reshape_15 -> add_8    |T1: batch x seq_length x 2560- MatMulAddPattern--Opset95
Add: add_8, embedding -> add_9                                                  |T1: batch x seq_length x 2560- add_Tensor7
LayerNormalization: add_9, init1_s2560_3, init1_s2560_4 -> _onx_div_sub_add_900 |T1: batch x seq_length x 2560- LayerNormalizationPattern--layer_norm13
Transpose: model.layers.1.self_attn.q_proj.weight -> _onx_transpose_p_model_layers_1_self_attn_q_proj_weight0                             |T1: 2560 x 2560- linear7
MatMul: _onx_div_sub_add_900, _onx_transpose_p_model_layers_1_self_attn_q_proj_weight0 -> _onx_matmul_layer_norm_10                                   |T1: batch x seq_length x 2560- Opset16
Reshape: _onx_matmul_layer_norm_10, _onx_concat_unsqueeze_sym_size_int_19020 -> view_3      |T1: batch x seq_length x 32 x 80- view4
Transpose: view_3 -> transpose_5                                                |T1: batch x 32 x seq_length x 80- transpose_int5
Split: transpose_5, init7_s2_32_48 -> slice_28, slice_29                        |T1: batch x 32 x seq_length x 32 T1: batch x 32 x seq_length x 48- SlicesSplitPattern--slice_Tensor9
Split: slice_28, init7_s2_16_16 -> slice_32, slice_33                           |T1: batch x 32 x seq_length x 16 T1: batch x 32 x seq_length x 16- SlicesSplitPattern--slice_Tensor13
Transpose: model.layers.1.self_attn.k_proj.weight -> _onx_transpose_p_model_layers_1_self_attn_k_proj_weight0                             |T1: 2560 x 2560- linear8
MatMul: _onx_div_sub_add_900, _onx_transpose_p_model_layers_1_self_attn_k_proj_weight0 -> _onx_matmul_layer_norm_102                                    |T1: batch x seq_length x 2560- Opset18
Reshape: _onx_matmul_layer_norm_102, _onx_concat_unsqueeze_sym_size_int_19020 -> view_4       |T1: batch x seq_length x 32 x 80- view5
Transpose: view_4 -> transpose_6                                                |T1: batch x 32 x seq_length x 80- transpose_int6
Split: transpose_6, init7_s2_32_48 -> slice_30, slice_31                        |T1: batch x 32 x seq_length x 32 T1: batch x 32 x seq_length x 48- SlicesSplitPattern--slice_Tensor11
Split: slice_30, init7_s2_16_16 -> slice_34, slice_35                           |T1: batch x 32 x seq_length x 16 T1: batch x 32 x seq_length x 16- SlicesSplitPattern--slice_Tensor15
Transpose: model.layers.1.self_attn.v_proj.weight -> _onx_transpose_p_model_layers_1_self_attn_v_proj_weight0                             |T1: 2560 x 2560- linear9
MatMul: _onx_div_sub_add_900, _onx_transpose_p_model_layers_1_self_attn_v_proj_weight0 -> _onx_matmul_layer_norm_103                                    |T1: batch x seq_length x 2560- Opset20
Reshape: _onx_matmul_layer_norm_103, _onx_concat_unsqueeze_sym_size_int_19020 -> view_5       |T1: batch x seq_length x 32 x 80- view6
Transpose: view_5 -> transpose_7                                                |T1: batch x 32 x seq_length x 80- transpose_int7
Unsqueeze: wrap_with_set_grad_enabled#1, init7_s1_1 -> unsqueeze_11             |T1: 1 x 1 x seq_length x 32  - unsqueeze9
Mul: slice_28, unsqueeze_8 -> mul_10                                            |T1: batch x 32 x seq_length x 32- mul_Tensor15
Neg: slice_33 -> neg_2                                                          |T1: batch x 32 x seq_length x 16- neg3
Concat: neg_2, slice_32 -> cat_7                                                |T1: batch x 32 x seq_length x 32- cat7
Mul: cat_7, unsqueeze_11 -> mul_11                                              |T1: batch x 32 x seq_length x 32- mul_Tensor16
Add: mul_10, mul_11 -> add_10                                                   |T1: batch x 32 x seq_length x 32- add_Tensor8
Mul: slice_30, unsqueeze_8 -> mul_12                                            |T1: batch x 32 x seq_length x 32- mul_Tensor17
Neg: slice_35 -> neg_3                                                          |T1: batch x 32 x seq_length x 16- neg4
Concat: neg_3, slice_34 -> cat_8                                                |T1: batch x 32 x seq_length x 32- cat8
Mul: cat_8, unsqueeze_11 -> mul_13                                              |T1: batch x 32 x seq_length x 32- mul_Tensor18
Add: mul_12, mul_13 -> add_11                                                   |T1: batch x 32 x seq_length x 32- add_Tensor9
Concat: add_10, slice_29 -> cat_9                                               |T1: batch x 32 x seq_length x 80- cat9
Concat: add_11, slice_31 -> cat_10                                              |T1: batch x 32 x seq_length x 80- cat10
Concat: past_key_values_key_cache_1, cat_10 -> output_2                         |T1: batch x 32 x cache_length+seq_length x 80- cat11
Concat: past_key_values_value_cache_1, transpose_7 -> output_4                  |T1: batch x 32 x cache_length+seq_length x 80- cat12
Transpose: output_2 -> _onx_transpose_cat_110                                   |T1: batch x 32 x 80 x cache_length+seq_length- aten_scaled_dot_product_attention9
MatMul: cat_9, _onx_transpose_cat_110 -> MulMulMatMulPattern__onx_matmul_mul_cat_900    |T1: batch x 32 x seq_length x cache_length+seq_length- MulMulMatMulPattern--aten_scaled_dot_product_attention13-1
Mul: MulMulMatMulPattern__onx_matmul_mul_cat_900, init1_s1_6 -> _onx_matmul_mul_cat_900       |T1: batch x 32 x seq_length x cache_length+seq_length- MulMulMatMulPattern--aten_scaled_dot_product_attention13-2
Add: _onx_matmul_mul_cat_900, expand_as -> _onx_add_matmul_mul_cat_9000         |T1: batch x 32 x seq_length x cache_length+seq_length- aten_scaled_dot_product_attention14
Softmax: _onx_add_matmul_mul_cat_9000 -> _onx_softmax_add_matmul_mul_cat_90000  |T1: batch x 32 x seq_length x cache_length+seq_length- aten_scaled_dot_product_attention15
MatMul: _onx_softmax_add_matmul_mul_cat_90000, output_4 -> scaled_dot_product_attention_1         |T1: batch x 32 x seq_length x 80- aten_scaled_dot_product_attention16
Transpose: scaled_dot_product_attention_1 -> transpose_8                        |T1: batch x seq_length x 32 x 80- transpose_int8
Reshape: transpose_8, _onx_concat_unsqueeze_sym_size_int_19030 -> reshape_2     |T1: batch x seq_length x 2560- reshape3
Transpose: model.layers.1.mlp.fc1.weight -> _onx_transpose_p_model_layers_1_mlp_fc1_weight0           |T1: 2560 x 10240- linear11
MatMul: _onx_div_sub_add_900, _onx_transpose_p_model_layers_1_mlp_fc1_weight0 -> _onx_matmul_layer_norm_104                           |T1: batch x seq_length x 10240- Opset24
Reshape: init1_s_3, init7_s1_1 -> _reshape_init1_s_302                          |T1: 1                        - mul_Tensor19
Mul: _onx_matmul_layer_norm_104, _reshape_init1_s_302 -> _onx_mul_linear_100    |T1: batch x seq_length x 10240- mul_Tensor20
Pow: _onx_matmul_layer_norm_104, init1_s1_5 -> pow_2                            |T1: batch x seq_length x 10240- pow_Tensor_Scalar2
Reshape: init1_s_4, init7_s1_1 -> _reshape_init1_s_402                          |T1: 1                        - mul_Tensor22
Mul: pow_2, _reshape_init1_s_402 -> _onx_mul_pow_20                             |T1: batch x seq_length x 10240- mul_Tensor23
Add: _onx_matmul_layer_norm_104, _onx_mul_pow_20 -> add_12                      |T1: batch x seq_length x 10240- add_Tensor10
Reshape: init1_s_5, init7_s1_1 -> _reshape_init1_s_502                          |T1: 1                        - mul_Tensor25
Mul: add_12, _reshape_init1_s_502 -> _onx_mul_add_120                           |T1: batch x seq_length x 10240- mul_Tensor26
Tanh: _onx_mul_add_120 -> tanh_1                                                |T1: batch x seq_length x 10240- tanh2
Reshape: init1_s_6, init7_s1_1 -> _reshape_init1_s_602                          |T1: 1                        - Opset26
Add: tanh_1, _reshape_init1_s_602 -> add_13                                     |T1: batch x seq_length x 10240- add_Tensor11
Mul: _onx_mul_linear_100, add_13 -> mul_17                                      |T1: batch x seq_length x 10240- mul_Tensor28
Transpose: model.layers.1.mlp.fc2.weight -> _onx_transpose_p_model_layers_1_mlp_fc2_weight0           |T1: 10240 x 2560- linear12
MatMul: mul_17, _onx_transpose_p_model_layers_1_mlp_fc2_weight0 -> _onx_matmul_mul_170      |T1: batch x seq_length x 2560- Opset27
Reshape: reshape_2, init7_s2_-1_2560 -> MatMulAddPattern--reshape_2             |T1: batch*seq_length x 2560  - MatMulAddPattern--Opset22
Reshape: _onx_matmul_mul_170, init7_s2_-1_2560 -> MatMulAddPattern--reshape_22  |T1: batch*seq_length x 2560  - MatMulAddPattern--Opset222
Shape: reshape_2 -> MatMulAddPattern--reshape_24                                |T7: 2                        - MatMulAddPattern--Opset223
Concat: MatMulAddPattern--reshape_24, init7_s1_-12 -> MatMulAddPattern--reshape_25  |T7: 3                    - MatMulAddPattern--Opset224
Gemm: MatMulAddPattern--reshape_2, model.layers.1.self_attn.dense.weight, MatMulAddPattern--reshape_22 -> MatMulAddPattern--reshape_23                                                      |T1: batch*seq_length x 2560- GemmTransposePattern--MatMulAddPattern--Opset2262
Reshape: MatMulAddPattern--reshape_23, MatMulAddPattern--reshape_25 -> add_14   |T1: batch x seq_length x 2560- MatMulAddPattern--Opset225
Add: add_14, add_9 -> add_15                                                    |T1: batch x seq_length x 2560- add_Tensor13
LayerNormalization: add_15, init1_s2560_5, init1_s2560_6 -> _onx_div_sub_add_1500 |T1: batch x seq_length x 2560- LayerNormalizationPattern--layer_norm22
Transpose: lm_head.weight -> _onx_transpose_p_lm_head_weight0                   |T1: 2560 x 51200             - linear13
MatMul: _onx_div_sub_add_1500, _onx_transpose_p_lm_head_weight0 -> output_0     |T1: batch x seq_length x 51200- Opset29
output:: output_0                                                               |T1: batch x seq_length x 51200
output:: output_1                                                               |T1: batch x 32 x cache_length+seq_length x 80
output:: output_2                                                               |T1: batch x 32 x cache_length+seq_length x 80
output:: output_3                                                               |T1: batch x 32 x cache_length+seq_length x 80
output:: output_4                                                               |T1: batch x 32 x cache_length+seq_length x 80

FUNCKEY: ('local_functions.0', 'submod_3')
FUNC submod_3[local_functions.0]: ['expand_1', 'to_1'] -> ['output_0', 'output_1']
  opset: '': 18
  MatMul: expand_1, to_1 -> matmul                                                |:                            - Opset
  Transpose: matmul -> transpose                                                  |:                            - transpose_int
  Concat: transpose, transpose -> cat                                             |:                            - cat
  Cos: cat -> output_0                                                            |T1: batch x seq_length x 51200- cos
  Sin: cat -> output_1                                                            |T1: batch x 32 x cache_length+seq_length x 80- sin

FUNCKEY: ('local_functions', 'submod_3')
FUNC submod_3[local_functions]: ['b_model_rotary_emb_inv_freq', 'unsqueeze'] -> ['output_0', 'output_1']
  opset: '': 18
  opset: local_functions.0: 1
  Constant:  -> init7_s1_1                                                        |T7: 1                        - init2cst
  Constant:  -> init7_s2_0_2                                                      |:                            - init2cst2
  Unsqueeze: b_model_rotary_emb_inv_freq, init7_s2_0_2 -> unsqueeze_6             |:                            - UnsqueezeUnsqueezePattern--unsqueeze
  Unsqueeze: unsqueeze, init7_s1_1 -> unsqueeze_7                                 |:                            - unsqueeze3
  Cast: unsqueeze_7 -> to_1                                                       |:                            - to_dtype2
  submod_3[local_functions.0]: unsqueeze_6, to_1 -> output_0, output_1            |T1: batch x seq_length x 51200 T1: batch x 32 x cache_length+seq_length x 80- wrap_with_autocast

The same graph, rendered visually.

plot exporter recipes c phi2

Total running time of the script: (0 minutes 19.087 seconds)

Related examples

- torch.onnx.export and Phi-2
- Export Phi-3.5-mini-instruct piece by piece
- Export Phi-3.5-mini-instruct with draft_export

Gallery generated by Sphinx-Gallery