Use the custom exporter in torch

Subject to change

File torch/onnx/_internal/onnxruntime.py

This change enables the custom rewriter when the environment variable ONNXRT_CHANGE_REWRITER is set to 1. Look for the substring TODO: to locate the modified sections.
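The rewriter is opt-in. Since the flag is read with os.environ.get when a new graph is exported, it must be set to the string "1" before the first call of the compiled model. A minimal sketch, assuming the standard onnxrt backend and a toy model (both placeholders for whatever model you compile):

import os
import torch

# Toy model; any torch.nn.Module compiled with the "onnxrt" backend
# eventually goes through _ort_acclerated_call.
model = torch.nn.Linear(4, 4)

# The flag must be set before the first call of the compiled model
# because it is read when the ONNX model is built for a new graph.
os.environ["ONNXRT_CHANGE_REWRITER"] = "1"

compiled = torch.compile(model, backend="onnxrt")
compiled(torch.randn(2, 4))  # exported through the custom rewriter

os.environ["ONNXRT_CHANGE_REWRITER"] = "0"

The function below is the modified version of _ort_acclerated_call.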

def _ort_acclerated_call(self, graph_module: torch.fx.GraphModule, *args, **kwargs):
    """This function replaces GraphModule._wrapped_call in compiled model.

    The _wrapped_call is the underlying implementation of forward method. Replacing
    it means we delegate the computation to _ort_acclerated_call and therefore
    onnxruntime.InferenceSession.
    """
    cached_execution_info_per_session = (
        self._all_ort_execution_info.search_reusable_session_execution_info(
            graph_module, *args
        )
    )
    if cached_execution_info_per_session:
        onnx_session = cached_execution_info_per_session.session
        input_names = cached_execution_info_per_session.input_names
        output_names = cached_execution_info_per_session.output_names
        input_value_infos = cached_execution_info_per_session.input_value_infos
        output_value_infos = cached_execution_info_per_session.output_value_infos
        input_devices = cached_execution_info_per_session.input_devices
        output_devices = cached_execution_info_per_session.output_devices
        prim_outputs = cached_execution_info_per_session.example_outputs
    else:
        # It's the first time we see such a graph. Let's make a new session
        # (type: onnxruntime.InferenceSession) for it.

        ##########################
        # TODO: Insert these lines
        ##########################

        use_other_rewriter = os.environ.get("ONNXRT_CHANGE_REWRITER", None) in (1, "1")
        if use_other_rewriter:
            from experimental_experiment.torch_interpreter import to_onnx
            from experimental_experiment.torch_interpreter._torch_models import create_input_names
            from experimental_experiment.xbuilder import OptimizationOptions
            from experimental_experiment.torch_interpreter.oxs_dispatcher import OxsDispatcher

            input_names = create_input_names(graph_module, args)
            dispatcher = OxsDispatcher()
            target_opset = self._resolved_onnx_exporter_options.onnx_registry.opset_version
            options = OptimizationOptions(
                remove_unused=True,
                constant_folding=False,
                patterns="default",
                verbose=1,
            )
            onnx_model, builder = to_onnx(
                graph_module,
                tuple(args),
                input_names=input_names,
                options=options,
                verbose=1,
                target_opset=target_opset,
                return_builder=True,
                dispatcher=dispatcher,
            )

            def maybe_map_to_meta_val(value):
                if hasattr(value, "meta") and "val" in value.meta:
                    # Select outputs with "val" information. Without "val",
                    # it's not possible to access output_arg.meta["val"].device.
                    return value.meta["val"]
                return value

            extracted_outputs = _extract_graph_module_outputs(graph_module)
            prim_outputs = _pytree.tree_map(maybe_map_to_meta_val, extracted_outputs)

        else:

        ####################################
        # TODO: end of the insertion
        # TODO: indent what follows
        ####################################

            graph_module = torch.onnx._internal.fx.passes.MovePlaceholderToFront(
                self._resolved_onnx_exporter_options.diagnostic_context,
                graph_module,
            ).run()
            # Generate reference outputs. They are used to indicate output
            # tensors' types and devices when calling ORT.
            #
            # WARNING: The downstream code should not change prim_outputs and
            # this backend should always produce output with a schema identical to prim_outputs'.

            if self._resolved_onnx_exporter_options.dynamic_shapes:
                # No pre-allocation when dynamic shape is enabled.
                self.preallocate_output = False
                extracted_outputs = _extract_graph_module_outputs(graph_module)

                def maybe_map_to_meta_val(value):
                    if hasattr(value, "meta") and "val" in value.meta:
                        # Select outputs with "val" information. Without "val",
                        # it's not possible to access output_arg.meta["val"].device.
                        return value.meta["val"]
                    else:
                        return value

                prim_outputs = _pytree.tree_map(
                    maybe_map_to_meta_val, extracted_outputs
                )
            else:
                try:
                    prim_outputs = FakeTensorProp(graph_module).propagate(
                        *args, **kwargs
                    )
                except Exception:
                    logger.warning("FakeTensorProb failed for %s", graph_module)
                    # When FakeTensorProp fails, it is not possible to preallocate output buffers
                    # because the output shapes are not inferred.
                    self.preallocate_output = False

                    # rethrow FakeTensorProp failure because it is not currently handled.
                    raise

            # Create the object to iterate through the nodes in the graph one by one
            # and call the corresponding ONNX exporter for each node.
            fx_interpreter = fx_onnx_interpreter.FxOnnxInterpreter(
                diagnostic_context=self._resolved_onnx_exporter_options.diagnostic_context
            )
            # Cast FX variables if they would result in a schema mismatch when
            # searching for an ONNX operator. E.g., add(double_tensor, int_tensor)
            # is fine in PyTorch, but ONNX expects add(double_tensor, double_tensor).
            graph_module = torch.onnx._internal.fx.passes.InsertTypePromotion(
                self._resolved_onnx_exporter_options.diagnostic_context, graph_module
            ).run()
            # Start the per-node exporting process. It's conceptually a for loop
            # scanning through the nodes in the graph.
            exported = fx_interpreter.run(
                fx_graph_module=graph_module,
                onnxfunction_dispatcher=self._resolved_onnx_exporter_options.onnxfunction_dispatcher,
                op_level_debug=self._resolved_onnx_exporter_options.op_level_debug,
            )
            # Convert the exported result to ONNX ModelProto.
            onnx_model = exported.to_model_proto(
                opset_version=self._resolved_onnx_exporter_options.onnx_registry.opset_version,
            )

        ####################################
        # TODO: end of the modification
        ####################################

        # Modify ONNX model using pre-registered graph transforms.
        # They are in-place modifications that avoid unnecessary
        # copies of ONNX initializers.
        if self._options.pre_ort_model_transforms:
            for transform in self._options.pre_ort_model_transforms:
                transform(onnx_model)

        onnx_model_bytes = onnx_model.SerializeToString()
        if os.environ.get("ONNXRT_DUMP_PATH", None):
            # If not empty, the environment variable ONNXRT_DUMP_PATH defines the path
            # where the generated onnx files should be stored.
            # This module keeps a global variable to track the stored models.
            # If ONNXRT_DUMP_PATH="dumped/dumped_model_",
            # the first file name will be 'dumped/dumped_model_0.onnx'.
            # For every dumped model, a text file 'dumped/dumped_model_0.txt'
            # is created as well to contain the string representation of the graph_module.
            _dump_onnx_model(onnx_model_bytes, graph_module=graph_module)

        # Initialize an ORT session to execute this ONNX model.
        # Note that TorchDynamo assumes all inputs/outputs are on the
        # same device, but it's subject to change (very likely with
        # dynamic shape support), so we add execution providers
        # based on the logic in _select_eps: (explicitly preferred EPs,
        # EPs inferred from inputs or graph, and the fallback default EP).
        #
        # TODO(wschin): enable external allocators.
        # See https://github.com/pytorch/pytorch/issues/106867
        onnx_session = onnxruntime.InferenceSession(
            path_or_bytes=onnx_model_bytes,
            sess_options=self._options.ort_session_options,
            providers=self._select_eps(graph_module, *args),
        )

        # Cache ORT session. It's reused for the same "graph_module".
        # Generate ONNX model and extract its input and output names.
        input_names = tuple(input.name for input in onnx_model.graph.input)
        output_names = tuple(output.name for output in onnx_model.graph.output)
        input_devices = _get_onnx_devices(args)
        # Cache devices for inputs and outputs. They are used to invoke
        # ORT session. Output devices indicate where (e.g., GPU or CPU)
        # to store outputs.
        if isinstance(prim_outputs, tuple):
            output_devices = _get_onnx_devices(prim_outputs)
        else:
            output_devices = _get_onnx_devices((prim_outputs,))

        input_value_infos = tuple(input for input in onnx_model.graph.input)
        output_value_infos = tuple(output for output in onnx_model.graph.output)

        execution_info_per_session = OrtExecutionInfoPerSession(
            session=onnx_session,
            input_names=input_names,
            input_value_infos=input_value_infos,
            output_names=output_names,
            output_value_infos=output_value_infos,
            input_devices=input_devices,
            output_devices=output_devices,
            example_outputs=prim_outputs,
        )

        self._all_ort_execution_info.cache_session_execution_info(
            graph_module, execution_info_per_session
        )

    self.execution_count += 1

    # ORT always returns a tuple of outputs. If the original output is a tensor,
    # ORT output's first element must be extracted and returned. Otherwise, type
    # mismatch may happen in downstream computation.
    is_single_tensor_output = isinstance(prim_outputs, torch.Tensor)
    normalized_prim_outputs = (
        (prim_outputs,) if is_single_tensor_output else prim_outputs
    )
    assert isinstance(normalized_prim_outputs, tuple)
    assert all(
        isinstance(elem, (torch.Tensor, torch.SymInt, int))
        for elem in normalized_prim_outputs
    )

    _nvtx_range_push("run_onnx_session_with_ortvaluevector")
    onnx_outputs = self.run(
        onnx_session,
        input_names,
        args,
        input_devices,
        output_names,
        normalized_prim_outputs,
        output_devices,
        self._options.preallocate_output,
        input_value_infos,
        normalized_prim_outputs,
    )
    _nvtx_range_pop()

    if self._assert_allclose_to_baseline:
        # Compute baseline.
        baseline_outputs = torch._prims.executor.execute(
            graph_module, *args, executor="aten"
        )
        normalized_baseline_outputs = (
            (baseline_outputs,) if is_single_tensor_output else baseline_outputs
        )
        # Ensure every output tensor is close to the corresponding baseline.
        for onnx_output, baseline_output in zip(
            onnx_outputs, normalized_baseline_outputs
        ):
            torch.testing.assert_close(onnx_output, baseline_output)
    return onnx_outputs[0] if is_single_tensor_output else onnx_outputs
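The custom branch delegates the export to experimental_experiment.torch_interpreter.to_onnx. The same entry point can be tried outside of the backend to inspect what the custom exporter produces. A minimal sketch, assuming to_onnx also accepts a plain torch.nn.Module together with a tuple of example inputs:

import torch
from experimental_experiment.torch_interpreter import to_onnx
from experimental_experiment.xbuilder import OptimizationOptions

# Toy model standing in for the GraphModule the backend receives.
model = torch.nn.Sequential(torch.nn.Linear(4, 8), torch.nn.ReLU())
x = torch.randn(2, 4)

# Same optimization options as in the modified _ort_acclerated_call.
options = OptimizationOptions(
    remove_unused=True,
    constant_folding=False,
    patterns="default",
)
onnx_model = to_onnx(model, (x,), options=options)
print(len(onnx_model.graph.node), "nodes")

Without return_builder=True, to_onnx returns the ModelProto alone, so the result can be saved with onnx.save or fed directly to onnxruntime.InferenceSession.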

Examples

Baseline

<<<

import os
import warnings
import numpy as np
import onnx
import torch
import torch.onnx
from experimental_experiment.torch_models.training_helper import (
    make_aot_ort,
    train_loop,
)
from experimental_experiment.torch_models.dump_helper import dump_onnx

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    from transformers import LlamaConfig
    from transformers.models.llama.modeling_llama import LlamaModel


def ids_tensor(shape, vocab_size):
    total_dims = 1
    for dim in shape:
        total_dims *= dim

    values = []
    for _ in range(total_dims):
        values.append(np.random.randint(0, vocab_size - 1))

    return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous()


config = LlamaConfig(
    hidden_size=16,
    num_hidden_layers=1,
    vocab_size=1024,
    intermediate_size=16,
    max_position_embeddings=1024,
    num_attention_heads=2,
)
config._attn_implementation = "eager"

model = LlamaModel(config)

batch, seq, vocab_size = 2, 1024, 1024

input_ids = ids_tensor([batch, seq], vocab_size)
input_mask = torch.tril(torch.ones(batch, seq, dtype=torch.float32))

model(input_ids, input_mask)

local_aot_ort, _ = make_aot_ort(
    dynamic=True,
    rewrite=True,
    verbose=1,
)

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    optimized_mod = torch.compile(model, backend=local_aot_ort, fullgraph=True)
    with dump_onnx("dort-llama-ort", folder="dump_llama", clean=True):
        train_loop(optimized_mod, input_ids, input_mask)

names = [_ for _ in os.listdir("dump_llama") if _.endswith(".onnx")]
print("------------------------------------------")
print(f"exported model: {names}")
for name in names:
    print()
    print("NODES in {name!r}")
    onx = onnx.load(os.path.join("dump_llama", name))
    for i, node in enumerate(onx.graph.node):
        print(
            f"{i+1}/{len(onx.graph.node)}: {node.op_type} {node.input} -> {node.output}"
        )

>>>

    [2024-05-08 14:07:04,496] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)
    Applied 1 of general pattern rewrite rules.
    Applied 0 of general pattern rewrite rules.
    Applied 0 of onnxruntime specific function rewrite rules.
    Applied 0 of onnxruntime specific function rewrite rules.
    Applied 0 of onnxruntime specific function rewrite rules.
    Applied 0 of onnxruntime specific function rewrite rules.
    Applied 0 of onnxruntime specific function rewrite rules.
    Applied 0 of onnxruntime specific function rewrite rules.
    Applied 0 of onnxruntime specific function rewrite rules.
    Applied 0 of onnxruntime specific pattern rewrite rules.
    Applied 0 of general pattern rewrite rules.
    Applied 0 of general pattern rewrite rules.
    Applied 0 of onnxruntime specific function rewrite rules.
    Applied 0 of onnxruntime specific function rewrite rules.
    Applied 0 of onnxruntime specific function rewrite rules.
    Applied 0 of onnxruntime specific function rewrite rules.
    Applied 0 of onnxruntime specific function rewrite rules.
    Applied 0 of onnxruntime specific function rewrite rules.
    Applied 0 of onnxruntime specific function rewrite rules.
    Applied 0 of onnxruntime specific pattern rewrite rules.
    ------------------------------------------
    exported model: ['dort-llama-ort_1.onnx', 'dort-llama-ort_0.onnx']
    
    NODES in 'dort-llama-ort_1.onnx'
    1/305: Constant [] -> ['aten_view_111_size_0']
    2/305: Reshape ['mm_5', 'aten_view_111_size_0'] -> ['view_25']
    3/305: Cos ['cat'] -> ['cos']
    4/305: Mul ['embedding', 'rsqrt'] -> ['mul_2']
    5/305: Constant [] -> ['_val_38']
    6/305: Pow ['add_6', '_val_38'] -> ['pow_5']
    7/305: Constant [] -> ['_val_41']
    8/305: Equal ['primals_13', '_val_41'] -> ['eq_2']
    9/305: Constant [] -> ['aten_view_121_size_0']
    10/305: Reshape ['mm_3', 'aten_view_121_size_0'] -> ['view_21']
    11/305: Mul ['tangents_1', 'primals_3'] -> ['mul_14']
    12/305: Constant [] -> ['_val_46']
    13/305: Pow ['embedding', '_val_46'] -> ['pow_9']
    14/305: Mul ['add_6', 'rsqrt_2'] -> ['mul_12']
    15/305: Constant [] -> ['aten_view_131_size_0']
    16/305: Reshape ['mm_4', 'aten_view_131_size_0'] -> ['view_23']
    17/305: Sin ['cat'] -> ['sin']
    18/305: Constant [] -> ['aten_unsqueeze_134_dim_0']
    19/305: Unsqueeze ['cos', 'aten_unsqueeze_134_dim_0'] -> ['unsqueeze_10']
    20/305: Constant [] -> ['_val_58']
    21/305: Mul ['pow_5', '_val_58'] -> ['mul_20']
    22/305: Constant [] -> ['aten_unsqueeze_138_dim_0']
    23/305: Unsqueeze ['eq_2', 'aten_unsqueeze_138_dim_0'] -> ['unsqueeze_12']
    24/305: Constant [] -> ['alpha__1']
    25/305: Mul ['view_21', 'alpha__1'] -> ['other_1__1']
    26/305: Add ['embedding', 'other_1__1'] -> ['add_4']
    27/305: Mul ['mul_14', 'add_6'] -> ['mul_16']
    28/305: Mul ['mul_14', 'rsqrt_2'] -> ['mul_17']
    29/305: Constant [] -> ['_val_65']
    30/305: Mul ['pow_9', '_val_65'] -> ['mul_47']
    31/305: Mul ['tangents_1', 'mul_12'] -> ['mul_15']
    32/305: Constant [] -> ['fill']
    33/305: Mul ['view_23', 'sigmoid'] -> ['mul_10']
    34/305: Constant [] -> ['aten_unsqueeze_149_dim_0']
    35/305: Unsqueeze ['sin', 'aten_unsqueeze_149_dim_0'] -> ['unsqueeze_11']
    36/305: Mul ['add_4', 'rsqrt_1'] -> ['mul_8']
    37/305: Constant [] -> ['_val_76']
    38/305: Pow ['add_4', '_val_76'] -> ['pow_7']
    39/305: Constant [] -> ['_val_78']
    40/305: ReduceSum ['mul_16', '_val_78'] -> ['sum_2']
    41/305: Constant [] -> ['_val_80']
    42/305: ReduceSum ['mul_15', '_val_80'] -> ['sum_1']
    43/305: Constant [] -> ['alpha__2']
    44/305: Mul ['sigmoid', 'alpha__2'] -> ['other_1__2']
    45/305: Sub ['fill', 'other_1__2'] -> ['sub']
    46/305: Constant [] -> ['_val_86']
    47/305: Mul ['pow_7', '_val_86'] -> ['mul_33']
    48/305: Constant [] -> ['_val_88']
    49/305: Mul ['sum_2', '_val_88'] -> ['mul_18']
    50/305: Constant [] -> ['aten_view_168_size_0']
    51/305: Reshape ['sum_1', 'aten_view_168_size_0'] -> ['view_28']
    52/305: Mul ['view_23', 'sub'] -> ['mul_24']
    53/305: Constant [] -> ['scalar_tensor_default']
    54/305: Pow ['rsqrt', 'scalar_tensor_default'] -> ['pow_8']
    55/305: Constant [] -> ['scalar_tensor_default_1']
    56/305: Pow ['rsqrt_1', 'scalar_tensor_default_1'] -> ['pow_6']
    57/305: Constant [] -> ['scalar_tensor_default_2']
    58/305: Pow ['rsqrt_2', 'scalar_tensor_default_2'] -> ['pow_4']
    59/305: Constant [] -> ['aten_add_182_other_1']
    60/305: Add ['mul_24', 'aten_add_182_other_1'] -> ['add_9']
    61/305: Mul ['mul_18', 'pow_4'] -> ['mul_19']
    62/305: Mul ['sigmoid', 'add_9'] -> ['mul_25']
    63/305: Constant [] -> ['aten_expand_186_size_1']
    64/305: Expand ['mul_19', 'aten_expand_186_size_1'] -> ['expand_9']
    65/305: Constant [] -> ['scalar_tensor_default_4']
    66/305: Div ['expand_9', 'scalar_tensor_default_4'] -> ['div_1']
    67/305: Mul ['div_1', 'mul_20'] -> ['mul_21']
    68/305: Constant [] -> ['alpha__3']
    69/305: Mul ['mul_21', 'alpha__3'] -> ['other_1__3']
    70/305: Add ['mul_17', 'other_1__3'] -> ['add_8']
    71/305: Constant [] -> ['aten_view_193_size_0']
    72/305: Reshape ['add_8', 'aten_view_193_size_0'] -> ['view_29']
    73/305: Transpose ['view_29'] -> ['t_7']
    74/305: MatMul ['view_29', 't_9'] -> ['mm_8']
    75/305: MatMul ['t_7', 'view_26'] -> ['mm_7']
    76/305: Constant [] -> ['aten_view_198_size_0']
    77/305: Reshape ['mm_8', 'aten_view_198_size_0'] -> ['view_30']
    78/305: Transpose ['mm_7'] -> ['t_8']
    79/305: Mul ['view_30', 'mul_10'] -> ['mul_22']
    80/305: Mul ['view_30', 'view_25'] -> ['mul_23']
    81/305: Transpose ['t_8'] -> ['t_10']
    82/305: Constant [] -> ['aten_view_204_size_0']
    83/305: Reshape ['mul_22', 'aten_view_204_size_0'] -> ['view_31']
    84/305: Mul ['mul_23', 'mul_25'] -> ['mul_26']
    85/305: Transpose ['view_31'] -> ['t_11']
    86/305: MatMul ['view_31', 't_13'] -> ['mm_10']
    87/305: Constant [] -> ['aten_view_209_size_0']
    88/305: Reshape ['mul_26', 'aten_view_209_size_0'] -> ['view_33']
    89/305: MatMul ['t_11', 'view_22'] -> ['mm_9']
    90/305: Constant [] -> ['aten_view_212_size_0']
    91/305: Reshape ['mm_10', 'aten_view_212_size_0'] -> ['view_32']
    92/305: Transpose ['view_33'] -> ['t_15']
    93/305: MatMul ['view_33', 't_17'] -> ['mm_12']
    94/305: Transpose ['mm_9'] -> ['t_12']
    95/305: MatMul ['t_15', 'view_22'] -> ['mm_11']
    96/305: Constant [] -> ['aten_view_218_size_0']
    97/305: Reshape ['mm_12', 'aten_view_218_size_0'] -> ['view_34']
    98/305: Transpose ['t_12'] -> ['t_14']
    99/305: Transpose ['mm_11'] -> ['t_16']
    100/305: Constant [] -> ['alpha__4']
    101/305: Mul ['view_34', 'alpha__4'] -> ['other_1__4']
    102/305: Add ['view_32', 'other_1__4'] -> ['add_10']
    103/305: Transpose ['t_16'] -> ['t_18']
    104/305: Mul ['add_10', 'primals_2'] -> ['mul_27']
    105/305: Mul ['add_10', 'mul_8'] -> ['mul_28']
    106/305: Mul ['mul_27', 'add_4'] -> ['mul_29']
    107/305: Mul ['mul_27', 'rsqrt_1'] -> ['mul_30']
    108/305: Constant [] -> ['_val_150']
    109/305: ReduceSum ['mul_28', '_val_150'] -> ['sum_3']
    110/305: Constant [] -> ['_val_152']
    111/305: ReduceSum ['mul_29', '_val_152'] -> ['sum_4']
    112/305: Constant [] -> ['alpha__5']
    113/305: Mul ['mul_30', 'alpha__5'] -> ['other_1__5']
    114/305: Add ['add_8', 'other_1__5'] -> ['add_11']
    115/305: Constant [] -> ['aten_view_233_size_0']
    116/305: Reshape ['sum_3', 'aten_view_233_size_0'] -> ['view_35']
    117/305: Constant [] -> ['_val_157']
    118/305: Mul ['sum_4', '_val_157'] -> ['mul_31']
    119/305: Mul ['mul_31', 'pow_6'] -> ['mul_32']
    120/305: Constant [] -> ['aten_expand_238_size_1']
    121/305: Expand ['mul_32', 'aten_expand_238_size_1'] -> ['expand_10']
    122/305: Constant [] -> ['scalar_tensor_default_5']
    123/305: Div ['expand_10', 'scalar_tensor_default_5'] -> ['div_2']
    124/305: Mul ['div_2', 'mul_33'] -> ['mul_34']
    125/305: Constant [] -> ['alpha__6']
    126/305: Mul ['mul_34', 'alpha__6'] -> ['other_1__6']
    127/305: Add ['add_11', 'other_1__6'] -> ['add_12']
    128/305: Constant [] -> ['aten_view_245_size_0']
    129/305: Reshape ['add_12', 'aten_view_245_size_0'] -> ['view_36']
    130/305: Transpose ['view_36'] -> ['t_19']
    131/305: MatMul ['view_36', 't_21'] -> ['mm_14']
    132/305: MatMul ['t_19', 'view_20'] -> ['mm_13']
    133/305: Constant [] -> ['aten_view_250_size_0']
    134/305: Reshape ['mm_14', 'aten_view_250_size_0'] -> ['view_37']
    135/305: Transpose ['mm_13'] -> ['t_20']
    136/305: Constant [] -> ['aten_view_253_size_0']
    137/305: Reshape ['view_37', 'aten_view_253_size_0'] -> ['view_38']
    138/305: Transpose ['t_20'] -> ['t_22']
    139/305: Transpose ['view_38'] -> ['transpose_6']
    140/305: Constant [] -> ['aten_view_258_size_0']
    141/305: Reshape ['transpose_6', 'aten_view_258_size_0'] -> ['view_39']
    142/305: MatMul ['transpose_7', 'view_39'] -> ['bmm_3']
    143/305: MatMul ['view_39', 'transpose_8'] -> ['bmm_4']
    144/305: Constant [] -> ['aten_view_262_size_0']
    145/305: Reshape ['bmm_3', 'aten_view_262_size_0'] -> ['view_40']
    146/305: Constant [] -> ['aten_view_264_size_0']
    147/305: Reshape ['bmm_4', 'aten_view_264_size_0'] -> ['view_41']
    148/305: Constant [] -> ['alpha__7']
    149/305: Mul ['view_40', 'alpha__7'] -> ['other_1__7']
    150/305: Add ['tangents_3', 'other_1__7'] -> ['add_13']
    151/305: Mul ['view_41', 'detach_13'] -> ['mul_35']
    152/305: Transpose ['add_13'] -> ['transpose_12']
    153/305: Constant [] -> ['_val_191']
    154/305: ReduceSum ['mul_35', '_val_191'] -> ['sum_5']
    155/305: Mul ['detach_13', 'sum_5'] -> ['mul_36']
    156/305: Constant [] -> ['aten_view_273_size_0']
    157/305: Reshape ['transpose_12', 'aten_view_273_size_0'] -> ['view_45']
    158/305: Constant [] -> ['alpha__8']
    159/305: Mul ['mul_36', 'alpha__8'] -> ['other_1__8']
    160/305: Sub ['mul_35', 'other_1__8'] -> ['sub_1']
    161/305: Constant [] -> ['aten_view_276_size_0']
    162/305: Reshape ['view_45', 'aten_view_276_size_0'] -> ['view_48']
    163/305: Constant [] -> ['_val_200']
    164/305: Div ['sub_1', '_val_200'] -> ['div_3']
    165/305: Transpose ['view_48'] -> ['t_23']
    166/305: MatMul ['view_48', 't_25'] -> ['mm_16']
    167/305: Constant [] -> ['aten_view_282_size_0']
    168/305: Reshape ['div_3', 'aten_view_282_size_0'] -> ['view_42']
    169/305: MatMul ['t_23', 'view_1'] -> ['mm_15']
    170/305: Constant [] -> ['aten_view_285_size_0']
    171/305: Reshape ['mm_16', 'aten_view_285_size_0'] -> ['view_49']
    172/305: MatMul ['transpose_9', 'view_42'] -> ['bmm_5']
    173/305: MatMul ['view_42', 'transpose_10'] -> ['bmm_6']
    174/305: Transpose ['mm_15'] -> ['t_24']
    175/305: Constant [] -> ['aten_view_290_size_0']
    176/305: Reshape ['bmm_5', 'aten_view_290_size_0'] -> ['view_43']
    177/305: Constant [] -> ['aten_view_292_size_0']
    178/305: Reshape ['bmm_6', 'aten_view_292_size_0'] -> ['view_44']
    179/305: Transpose ['t_24'] -> ['t_26']
    180/305: Transpose ['view_43'] -> ['transpose_11']
    181/305: Mul ['view_44', 'unsqueeze_11'] -> ['mul_39']
    182/305: Mul ['view_44', 'unsqueeze_10'] -> ['mul_40']
    183/305: Constant [] -> ['alpha__9']
    184/305: Mul ['transpose_11', 'alpha__9'] -> ['other_1__9']
    185/305: Add ['tangents_2', 'other_1__9'] -> ['add_14']
    186/305: Constant [] -> ['_val_224']
    187/305: Constant [] -> ['_val_228']
    188/305: Constant [] -> ['_val_232']
    189/305: Constant [] -> ['_val_236']
    190/305: Slice ['mul_39', '_val_224', '_val_228', '_val_232', '_val_236'] -> ['slice_22']
    191/305: Constant [] -> ['_val_241']
    192/305: Constant [] -> ['_val_245']
    193/305: Constant [] -> ['_val_249']
    194/305: Constant [] -> ['_val_253']
    195/305: Slice ['mul_39', '_val_241', '_val_245', '_val_249', '_val_253'] -> ['slice_23']
    196/305: Mul ['add_14', 'unsqueeze_11'] -> ['mul_37']
    197/305: Mul ['add_14', 'unsqueeze_10'] -> ['mul_38']
    198/305: Neg ['slice_22'] -> ['neg_3']
    199/305: Constant [] -> ['_val_263']
    200/305: Constant [] -> ['_val_267']
    201/305: Constant [] -> ['_val_271']
    202/305: Constant [] -> ['_val_275']
    203/305: Slice ['mul_37', '_val_263', '_val_267', '_val_271', '_val_275'] -> ['slice_20']
    204/305: Constant [] -> ['_val_280']
    205/305: Constant [] -> ['_val_284']
    206/305: Constant [] -> ['_val_288']
    207/305: Constant [] -> ['_val_292']
    208/305: Slice ['mul_37', '_val_280', '_val_284', '_val_288', '_val_292'] -> ['slice_21']
    209/305: Constant [] -> ['_val_311']
    210/305: Transpose ['slice_23'] -> ['_val_312']
    211/305: Constant [] -> ['_val_313']
    212/305: ScatterND ['_val_313', '_val_311', '_val_312'] -> ['_val_314']
    213/305: Transpose ['_val_314'] -> ['slice_scatter_3']
    214/305: Neg ['slice_20'] -> ['neg_2']
    215/305: Constant [] -> ['_val_334']
    216/305: Transpose ['neg_3'] -> ['_val_335']
    217/305: Constant [] -> ['_val_336']
    218/305: ScatterND ['_val_336', '_val_334', '_val_335'] -> ['_val_337']
    219/305: Transpose ['_val_337'] -> ['slice_scatter_2']
    220/305: Constant [] -> ['_val_356']
    221/305: Transpose ['slice_21'] -> ['_val_357']
    222/305: Constant [] -> ['_val_358']
    223/305: ScatterND ['_val_358', '_val_356', '_val_357'] -> ['_val_359']
    224/305: Transpose ['_val_359'] -> ['slice_scatter_1']
    225/305: Constant [] -> ['alpha__10']
    226/305: Mul ['slice_scatter_3', 'alpha__10'] -> ['other_1__10']
    227/305: Add ['slice_scatter_2', 'other_1__10'] -> ['add_17']
    228/305: Constant [] -> ['_val_377']
    229/305: Transpose ['neg_2'] -> ['_val_378']
    230/305: Constant [] -> ['_val_379']
    231/305: ScatterND ['_val_379', '_val_377', '_val_378'] -> ['_val_380']
    232/305: Transpose ['_val_380'] -> ['slice_scatter']
    233/305: Constant [] -> ['alpha__11']
    234/305: Mul ['mul_40', 'alpha__11'] -> ['other_1__11']
    235/305: Add ['add_17', 'other_1__11'] -> ['add_18']
    236/305: Constant [] -> ['alpha__12']
    237/305: Mul ['slice_scatter_1', 'alpha__12'] -> ['other_1__12']
    238/305: Add ['slice_scatter', 'other_1__12'] -> ['add_15']
    239/305: Transpose ['add_18'] -> ['transpose_14']
    240/305: Constant [] -> ['alpha__13']
    241/305: Mul ['mul_38', 'alpha__13'] -> ['other_1__13']
    242/305: Add ['add_15', 'other_1__13'] -> ['add_16']
    243/305: Transpose ['add_16'] -> ['transpose_13']
    244/305: Constant [] -> ['aten_view_466_size_0']
    245/305: Reshape ['transpose_14', 'aten_view_466_size_0'] -> ['view_47']
    246/305: Constant [] -> ['aten_view_469_size_0']
    247/305: Reshape ['view_47', 'aten_view_469_size_0'] -> ['view_52']
    248/305: Constant [] -> ['aten_view_471_size_0']
    249/305: Reshape ['transpose_13', 'aten_view_471_size_0'] -> ['view_46']
    250/305: Transpose ['view_52'] -> ['t_31']
    251/305: MatMul ['view_52', 't_33'] -> ['mm_20']
    252/305: Constant [] -> ['aten_view_475_size_0']
    253/305: Reshape ['view_46', 'aten_view_475_size_0'] -> ['view_50']
    254/305: MatMul ['t_31', 'view_1'] -> ['mm_19']
    255/305: Constant [] -> ['aten_view_478_size_0']
    256/305: Reshape ['mm_20', 'aten_view_478_size_0'] -> ['view_53']
    257/305: Transpose ['view_50'] -> ['t_27']
    258/305: MatMul ['view_50', 't_29'] -> ['mm_18']
    259/305: Transpose ['mm_19'] -> ['t_32']
    260/305: MatMul ['t_27', 'view_1'] -> ['mm_17']
    261/305: Constant [] -> ['aten_view_484_size_0']
    262/305: Reshape ['mm_18', 'aten_view_484_size_0'] -> ['view_51']
    263/305: Transpose ['t_32'] -> ['t_34']
    264/305: Transpose ['mm_17'] -> ['t_28']
    265/305: Constant [] -> ['alpha__14']
    266/305: Mul ['view_51', 'alpha__14'] -> ['other_1__14']
    267/305: Add ['view_49', 'other_1__14'] -> ['add_19']
    268/305: Transpose ['t_28'] -> ['t_30']
    269/305: Constant [] -> ['alpha__15']
    270/305: Mul ['view_53', 'alpha__15'] -> ['other_1__15']
    271/305: Add ['add_19', 'other_1__15'] -> ['add_20']
    272/305: Mul ['add_20', 'primals_1'] -> ['mul_41']
    273/305: Mul ['add_20', 'mul_2'] -> ['mul_42']
    274/305: Mul ['mul_41', 'embedding'] -> ['mul_43']
    275/305: Mul ['mul_41', 'rsqrt'] -> ['mul_44']
    276/305: Constant [] -> ['_val_417']
    277/305: ReduceSum ['mul_42', '_val_417'] -> ['sum_6']
    278/305: Constant [] -> ['_val_419']
    279/305: ReduceSum ['mul_43', '_val_419'] -> ['sum_7']
    280/305: Constant [] -> ['alpha__16']
    281/305: Mul ['mul_44', 'alpha__16'] -> ['other_1__16']
    282/305: Add ['add_12', 'other_1__16'] -> ['add_21']
    283/305: Constant [] -> ['aten_view_500_size_0']
    284/305: Reshape ['sum_6', 'aten_view_500_size_0'] -> ['view_54']
    285/305: Constant [] -> ['_val_424']
    286/305: Mul ['sum_7', '_val_424'] -> ['mul_45']
    287/305: Mul ['mul_45', 'pow_8'] -> ['mul_46']
    288/305: Constant [] -> ['aten_expand_505_size_1']
    289/305: Expand ['mul_46', 'aten_expand_505_size_1'] -> ['expand_11']
    290/305: Constant [] -> ['scalar_tensor_default_6']
    291/305: Div ['expand_11', 'scalar_tensor_default_6'] -> ['div_4']
    292/305: Mul ['div_4', 'mul_47'] -> ['mul_48']
    293/305: Constant [] -> ['alpha__17']
    294/305: Mul ['mul_48', 'alpha__17'] -> ['other_1__17']
    295/305: Add ['add_21', 'other_1__17'] -> ['add_22']
    296/305: Constant [] -> ['aten_masked_fill_512_value_cast']
    297/305: Where ['unsqueeze_12', 'aten_masked_fill_512_value_cast', 'add_22'] -> ['masked_fill_1']
    298/305: Constant [] -> ['_val_436']
    299/305: ConstantOfShape ['_val_436'] -> ['aten_new_zeros_514_result']
    300/305: SequenceConstruct ['primals_13'] -> ['438']
    301/305: Constant [] -> ['int64_0__18']
    302/305: SequenceAt ['438', 'int64_0__18'] -> ['index__18']
    303/305: Constant [] -> ['int64_m1_1d__18']
    304/305: Unsqueeze ['index__18', 'int64_m1_1d__18'] -> ['new_index__18']
    305/305: ScatterND ['aten_new_zeros_514_result', 'new_index__18', 'masked_fill_1'] -> ['_unsafe_index_put']
    
    NODES in 'dort-llama-ort_0.onnx'
    1/248: Gather ['primals_4', 'primals_13'] -> ['embedding']
    2/248: Transpose ['primals_8'] -> ['t_3']
    3/248: Constant [] -> ['_val_22']
    4/248: Constant [] -> ['_val_23']
    5/248: Constant [] -> ['size_0__1']
    6/248: Constant [] -> ['fill_value_1__1']
    7/248: Expand ['fill_value_1__1', 'size_0__1'] -> ['full']
    8/248: Constant [] -> ['_val_36']
    9/248: Constant [] -> ['_val_40']
    10/248: Constant [] -> ['_val_44']
    11/248: Constant [] -> ['_val_48']
    12/248: Slice ['primals_14', '_val_36', '_val_40', '_val_44', '_val_48'] -> ['slice_5']
    13/248: Transpose ['primals_9'] -> ['t_4']
    14/248: Transpose ['primals_10'] -> ['t_5']
    15/248: Transpose ['primals_11'] -> ['t_6']
    16/248: Transpose ['primals_5'] -> ['t']
    17/248: Transpose ['primals_6'] -> ['t_1']
    18/248: Transpose ['primals_7'] -> ['t_2']
    19/248: Constant [] -> ['aten_unsqueeze_155_dim_0']
    20/248: Unsqueeze ['primals_12', 'aten_unsqueeze_155_dim_0'] -> ['unsqueeze_7']
    21/248: Constant [] -> ['scalar_tensor_default']
    22/248: Pow ['embedding', 'scalar_tensor_default'] -> ['pow_1']
    23/248: Transpose ['t_3'] -> ['t_21']
    24/248: Constant [] -> ['aten_triu_163_diagonal']
    25/248: Trilu ['full', 'aten_triu_163_diagonal'] -> ['triu']
    26/248: Constant [] -> ['aten_unsqueeze_164_dim_0']
    27/248: Unsqueeze ['slice_5', 'aten_unsqueeze_164_dim_0'] -> ['unsqueeze_5']
    28/248: Transpose ['t_4'] -> ['t_17']
    29/248: Transpose ['t_5'] -> ['t_13']
    30/248: Transpose ['t_6'] -> ['t_9']
    31/248: Transpose ['t'] -> ['t_33']
    32/248: Transpose ['t_1'] -> ['t_29']
    33/248: Transpose ['t_2'] -> ['t_25']
    34/248: Constant [] -> ['_val_75']
    35/248: Constant [] -> ['_val_79']
    36/248: Constant [] -> ['_val_83']
    37/248: Constant [] -> ['_val_87']
    38/248: Slice ['unsqueeze_7', '_val_75', '_val_79', '_val_83', '_val_87'] -> ['slice_7']
    39/248: Constant [] -> ['gt']
    40/248: Constant [] -> ['_val_107']
    41/248: ReduceMean ['pow_1', '_val_107'] -> ['mean']
    42/248: Constant [] -> ['aten_unsqueeze_208_dim_0']
    43/248: Unsqueeze ['unsqueeze_5', 'aten_unsqueeze_208_dim_0'] -> ['unsqueeze_6']
    44/248: Constant [] -> ['aten_unsqueeze_209_dim_0']
    45/248: Unsqueeze ['slice_7', 'aten_unsqueeze_209_dim_0'] -> ['unsqueeze_8']
    46/248: Cast ['gt'] -> ['convert_element_type_default']
    47/248: Mul ['triu', 'convert_element_type_default'] -> ['mul']
    48/248: Constant [] -> ['aten_add_214_other_1']
    49/248: Add ['mean', 'aten_add_214_other_1'] -> ['add']
    50/248: Constant [] -> ['_val_119']
    51/248: Constant [] -> ['_val_123']
    52/248: Constant [] -> ['_val_127']
    53/248: Constant [] -> ['_val_131']
    54/248: Slice ['unsqueeze_6', '_val_119', '_val_123', '_val_127', '_val_131'] -> ['slice_6']
    55/248: Constant [] -> ['aten_expand_233_size_1']
    56/248: Expand ['unsqueeze_8', 'aten_expand_233_size_1'] -> ['expand_2']
    57/248: Constant [] -> ['aten_unsqueeze_251_dim_0']
    58/248: Unsqueeze ['mul', 'aten_unsqueeze_251_dim_0'] -> ['unsqueeze_3']
    59/248: Sqrt ['add'] -> ['aten_rsqrt_252_tmp']
    60/248: Reciprocal ['aten_rsqrt_252_tmp'] -> ['rsqrt']
    61/248: Constant [] -> ['_val_154']
    62/248: Equal ['slice_6', '_val_154'] -> ['eq_1']
    63/248: Constant [] -> ['aten_expand_256_size_1']
    64/248: Expand ['expand_2', 'aten_expand_256_size_1'] -> ['expand_3']
    65/248: Constant [] -> ['aten_unsqueeze_258_dim_0']
    66/248: Unsqueeze ['unsqueeze_3', 'aten_unsqueeze_258_dim_0'] -> ['unsqueeze_4']
    67/248: Mul ['embedding', 'rsqrt'] -> ['mul_2']
    68/248: Constant [] -> ['_val_168']
    69/248: Constant [] -> ['_val_172']
    70/248: Constant [] -> ['_val_176']
    71/248: Constant [] -> ['_val_180']
    72/248: Slice ['unsqueeze_4', '_val_168', '_val_172', '_val_176', '_val_180'] -> ['slice_3']
    73/248: Mul ['primals_1', 'mul_2'] -> ['mul_3']
    74/248: Constant [] -> ['view_11']
    75/248: Constant [] -> ['_val_188']
    76/248: Constant [] -> ['_val_192']
    77/248: Constant [] -> ['_val_196']
    78/248: Constant [] -> ['_val_200']
    79/248: Slice ['slice_3', '_val_188', '_val_192', '_val_196', '_val_200'] -> ['slice_4']
    80/248: Constant [] -> ['aten_view_302_size_0']
    81/248: Reshape ['mul_3', 'aten_view_302_size_0'] -> ['view_1']
    82/248: Constant [] -> ['aten_expand_305_size_1']
    83/248: Expand ['slice_4', 'aten_expand_305_size_1'] -> ['expand_1']
    84/248: MatMul ['view_1', 't'] -> ['mm']
    85/248: MatMul ['view_1', 't_1'] -> ['mm_1']
    86/248: MatMul ['view_1', 't_2'] -> ['mm_2']
    87/248: MatMul ['expand_3', 'view_11'] -> ['view_12']
    88/248: Constant [] -> ['aten_view_313_size_0']
    89/248: Reshape ['mm', 'aten_view_313_size_0'] -> ['view_2']
    90/248: Constant [] -> ['aten_view_315_size_0']
    91/248: Reshape ['mm_1', 'aten_view_315_size_0'] -> ['view_4']
    92/248: Constant [] -> ['aten_view_317_size_0']
    93/248: Reshape ['mm_2', 'aten_view_317_size_0'] -> ['view_6']
    94/248: Transpose ['view_12'] -> ['transpose_3']
    95/248: Constant [] -> ['aten_view_321_size_0']
    96/248: Reshape ['view_2', 'aten_view_321_size_0'] -> ['view_7']
    97/248: Constant [] -> ['aten_view_323_size_0']
    98/248: Reshape ['view_4', 'aten_view_323_size_0'] -> ['view_8']
    99/248: Constant [] -> ['aten_view_325_size_0']
    100/248: Reshape ['view_6', 'aten_view_325_size_0'] -> ['view_9']
    101/248: Concat ['transpose_3', 'transpose_3'] -> ['cat']
    102/248: Constant [] -> ['_val_229']
    103/248: Equal ['expand_1', '_val_229'] -> ['eq']
    104/248: Transpose ['view_7'] -> ['transpose']
    105/248: Transpose ['view_8'] -> ['transpose_1']
    106/248: Transpose ['view_9'] -> ['transpose_2']
    107/248: Cos ['cat'] -> ['cos']
    108/248: Sin ['cat'] -> ['sin']
    109/248: And ['eq', 'eq_1'] -> ['mul_1']
    110/248: Constant [] -> ['_val_240']
    111/248: Constant [] -> ['_val_244']
    112/248: Constant [] -> ['_val_248']
    113/248: Constant [] -> ['_val_252']
    114/248: Slice ['transpose', '_val_240', '_val_244', '_val_248', '_val_252'] -> ['slice_10']
    115/248: Constant [] -> ['_val_257']
    116/248: Constant [] -> ['_val_261']
    117/248: Constant [] -> ['_val_265']
    118/248: Constant [] -> ['_val_269']
    119/248: Slice ['transpose', '_val_257', '_val_261', '_val_265', '_val_269'] -> ['slice_11']
    120/248: Constant [] -> ['_val_274']
    121/248: Constant [] -> ['_val_278']
    122/248: Constant [] -> ['_val_282']
    123/248: Constant [] -> ['_val_286']
    124/248: Slice ['transpose_1', '_val_274', '_val_278', '_val_282', '_val_286'] -> ['slice_12']
    125/248: Constant [] -> ['_val_291']
    126/248: Constant [] -> ['_val_295']
    127/248: Constant [] -> ['_val_299']
    128/248: Constant [] -> ['_val_303']
    129/248: Slice ['transpose_1', '_val_291', '_val_295', '_val_299', '_val_303'] -> ['slice_13']
    130/248: Constant [] -> ['aten_expand_405_size_1']
    131/248: Expand ['transpose_2', 'aten_expand_405_size_1'] -> ['expand_8']
    132/248: Constant [] -> ['aten_unsqueeze_406_dim_0']
    133/248: Unsqueeze ['cos', 'aten_unsqueeze_406_dim_0'] -> ['unsqueeze_10']
    134/248: Constant [] -> ['aten_unsqueeze_407_dim_0']
    135/248: Unsqueeze ['sin', 'aten_unsqueeze_407_dim_0'] -> ['unsqueeze_11']
    136/248: Constant [] -> ['_val_309']
    137/248: Where ['mul_1', '_val_309', 'expand_1'] -> ['masked_fill']
    138/248: Neg ['slice_11'] -> ['neg']
    139/248: Neg ['slice_13'] -> ['neg_1']
    140/248: Mul ['transpose', 'unsqueeze_10'] -> ['mul_4']
    141/248: Mul ['transpose_1', 'unsqueeze_10'] -> ['mul_6']
    142/248: Concat ['neg', 'slice_10'] -> ['cat_1']
    143/248: Concat ['neg_1', 'slice_12'] -> ['cat_2']
    144/248: Constant [] -> ['aten_view_421_size_0']
    145/248: Reshape ['expand_8', 'aten_view_421_size_0'] -> ['view_17']
    146/248: Constant [] -> ['_val_326']
    147/248: Constant [] -> ['_val_330']
    148/248: Constant [] -> ['_val_334']
    149/248: Constant [] -> ['_val_338']
    150/248: Slice ['masked_fill', '_val_326', '_val_330', '_val_334', '_val_338'] -> ['slice_17']
    151/248: Mul ['cat_1', 'unsqueeze_11'] -> ['mul_5']
    152/248: Mul ['cat_2', 'unsqueeze_11'] -> ['mul_7']
    153/248: Transpose ['view_17'] -> ['transpose_8']
    154/248: Constant [] -> ['_val_346']
    155/248: Constant [] -> ['_val_350']
    156/248: Constant [] -> ['_val_354']
    157/248: Constant [] -> ['_val_358']
    158/248: Slice ['slice_17', '_val_346', '_val_350', '_val_354', '_val_358'] -> ['slice_18']
    159/248: Constant [] -> ['alpha__2']
    160/248: Mul ['mul_5', 'alpha__2'] -> ['other_1__2']
    161/248: Add ['mul_4', 'other_1__2'] -> ['add_1']
    162/248: Constant [] -> ['alpha__3']
    163/248: Mul ['mul_7', 'alpha__3'] -> ['other_1__3']
    164/248: Add ['mul_6', 'other_1__3'] -> ['add_2']
    165/248: Constant [] -> ['_val_365']
    166/248: Constant [] -> ['_val_369']
    167/248: Constant [] -> ['_val_373']
    168/248: Constant [] -> ['_val_377']
    169/248: Slice ['slice_18', '_val_365', '_val_369', '_val_373', '_val_377'] -> ['slice_19']
    170/248: Constant [] -> ['aten_expand_479_size_1']
    171/248: Expand ['add_1', 'aten_expand_479_size_1'] -> ['expand_5']
    172/248: Transpose ['add_2'] -> ['transpose_4']
    173/248: Constant [] -> ['aten_expand_483_size_1']
    174/248: Expand ['transpose_4', 'aten_expand_483_size_1'] -> ['expand_6']
    175/248: Constant [] -> ['aten_view_485_size_0']
    176/248: Reshape ['expand_5', 'aten_view_485_size_0'] -> ['view_13']
    177/248: Transpose ['view_13'] -> ['transpose_9']
    178/248: Constant [] -> ['aten_view_489_size_0']
    179/248: Reshape ['expand_6', 'aten_view_489_size_0'] -> ['view_14']
    180/248: MatMul ['view_13', 'view_14'] -> ['bmm_1']
    181/248: Transpose ['view_14'] -> ['transpose_10']
    182/248: Constant [] -> ['aten_view_493_size_0']
    183/248: Reshape ['bmm_1', 'aten_view_493_size_0'] -> ['view_15']
    184/248: Constant [] -> ['_val_395']
    185/248: Div ['view_15', '_val_395'] -> ['div']
    186/248: Constant [] -> ['alpha__4']
    187/248: Mul ['slice_19', 'alpha__4'] -> ['other_1__4']
    188/248: Add ['div', 'other_1__4'] -> ['add_3']
    189/248: Softmax ['add_3'] -> ['_softmax']
    190/248: Constant [] -> ['aten_expand_502_size_1']
    191/248: Expand ['_softmax', 'aten_expand_502_size_1'] -> ['expand_7']
    192/248: Constant [] -> ['aten_view_505_size_0']
    193/248: Reshape ['expand_7', 'aten_view_505_size_0'] -> ['view_16']
    194/248: Identity ['_softmax'] -> ['detach_13']
    195/248: MatMul ['view_16', 'view_17'] -> ['bmm_2']
    196/248: Transpose ['view_16'] -> ['transpose_7']
    197/248: Constant [] -> ['aten_view_510_size_0']
    198/248: Reshape ['bmm_2', 'aten_view_510_size_0'] -> ['view_18']
    199/248: Transpose ['view_18'] -> ['transpose_5']
    200/248: Constant [] -> ['aten_view_514_size_0']
    201/248: Reshape ['transpose_5', 'aten_view_514_size_0'] -> ['view_19']
    202/248: Constant [] -> ['aten_view_516_size_0']
    203/248: Reshape ['view_19', 'aten_view_516_size_0'] -> ['view_20']
    204/248: MatMul ['view_20', 't_3'] -> ['mm_3']
    205/248: Constant [] -> ['aten_view_519_size_0']
    206/248: Reshape ['mm_3', 'aten_view_519_size_0'] -> ['view_21']
    207/248: Constant [] -> ['alpha__5']
    208/248: Mul ['view_21', 'alpha__5'] -> ['other_1__5']
    209/248: Add ['embedding', 'other_1__5'] -> ['add_4']
    210/248: Constant [] -> ['scalar_tensor_default_1']
    211/248: Pow ['add_4', 'scalar_tensor_default_1'] -> ['pow_2']
    212/248: Constant [] -> ['_val_425']
    213/248: ReduceMean ['pow_2', '_val_425'] -> ['mean_1']
    214/248: Constant [] -> ['aten_add_527_other_1']
    215/248: Add ['mean_1', 'aten_add_527_other_1'] -> ['add_5']
    216/248: Sqrt ['add_5'] -> ['aten_rsqrt_528_tmp']
    217/248: Reciprocal ['aten_rsqrt_528_tmp'] -> ['rsqrt_1']
    218/248: Mul ['add_4', 'rsqrt_1'] -> ['mul_8']
    219/248: Mul ['primals_2', 'mul_8'] -> ['mul_9']
    220/248: Constant [] -> ['aten_view_532_size_0']
    221/248: Reshape ['mul_9', 'aten_view_532_size_0'] -> ['view_22']
    222/248: MatMul ['view_22', 't_4'] -> ['mm_4']
    223/248: MatMul ['view_22', 't_5'] -> ['mm_5']
    224/248: Constant [] -> ['aten_view_536_size_0']
    225/248: Reshape ['mm_4', 'aten_view_536_size_0'] -> ['view_23']
    226/248: Constant [] -> ['aten_view_538_size_0']
    227/248: Reshape ['mm_5', 'aten_view_538_size_0'] -> ['view_25']
    228/248: Sigmoid ['view_23'] -> ['sigmoid']
    229/248: Mul ['view_23', 'sigmoid'] -> ['mul_10']
    230/248: Mul ['mul_10', 'view_25'] -> ['mul_11']
    231/248: Constant [] -> ['aten_view_543_size_0']
    232/248: Reshape ['mul_11', 'aten_view_543_size_0'] -> ['view_26']
    233/248: MatMul ['view_26', 't_6'] -> ['mm_6']
    234/248: Constant [] -> ['aten_view_546_size_0']
    235/248: Reshape ['mm_6', 'aten_view_546_size_0'] -> ['view_27']
    236/248: Constant [] -> ['alpha__6']
    237/248: Mul ['view_27', 'alpha__6'] -> ['other_1__6']
    238/248: Add ['add_4', 'other_1__6'] -> ['add_6']
    239/248: Constant [] -> ['scalar_tensor_default_2']
    240/248: Pow ['add_6', 'scalar_tensor_default_2'] -> ['pow_3']
    241/248: Constant [] -> ['_val_452']
    242/248: ReduceMean ['pow_3', '_val_452'] -> ['mean_2']
    243/248: Constant [] -> ['aten_add_554_other_1']
    244/248: Add ['mean_2', 'aten_add_554_other_1'] -> ['add_7']
    245/248: Sqrt ['add_7'] -> ['aten_rsqrt_555_tmp']
    246/248: Reciprocal ['aten_rsqrt_555_tmp'] -> ['rsqrt_2']
    247/248: Mul ['add_6', 'rsqrt_2'] -> ['mul_12']
    248/248: Mul ['primals_3', 'mul_12'] -> ['mul_13']
    [runpythonerror]
    /home/xadupre/.local/lib/python3.10/site-packages/torch/onnx/_internal/exporter.py:137: UserWarning: torch.onnx.dynamo_export only implements opset version 18 for now. If you need to use a different opset version, please register them with register_custom_op.
      warnings.warn(
    2024-05-08 14:07:07,804 onnxscript.optimizer.constant_folding [WARNING] - Skip storing constant folded nvalue return_val due to large size 4194304.
    2024-05-08 14:07:07,805 onnxscript.optimizer.constant_folding [WARNING] - Skip storing constant folded nvalue full due to large size 4194304.
    2024-05-08 14:07:07,829 onnxscript.optimizer.constant_folding [WARNING] - Skip storing constant folded nvalue triu due to large size 4194304.
    2024-05-08 14:07:07,864 onnxscript.optimizer.constant_folding [WARNING] - Skip storing constant folded nvalue convert_element_type_default due to large size 4194304.
    2024-05-08 14:07:07,865 onnxscript.optimizer.constant_folding [WARNING] - Skip storing constant folded nvalue mul due to large size 4194304.
    2024-05-08 14:07:07,879 onnxscript.optimizer.constant_folding [WARNING] - Skip storing constant folded nvalue return_val due to large size 4194304.
    2024-05-08 14:07:07,880 onnxscript.optimizer.constant_folding [WARNING] - Skip storing constant folded nvalue unsqueeze_3 due to large size 4194304.
    2024-05-08 14:07:07,882 onnxscript.optimizer.constant_folding [WARNING] - Skip storing constant folded nvalue return_val due to large size 4194304.
    2024-05-08 14:07:07,882 onnxscript.optimizer.constant_folding [WARNING] - Skip storing constant folded nvalue unsqueeze_4 due to large size 4194304.
    2024-05-08 14:07:07,889 onnxscript.optimizer.constant_folding [WARNING] - Skip storing constant folded nvalue slice_3 due to large size 4194304.
    2024-05-08 14:07:07,895 onnxscript.optimizer.constant_folding [WARNING] - Skip storing constant folded nvalue slice_4 due to large size 4194304.
    2024-05-08 14:07:07,900 onnxscript.optimizer.constant_folding [WARNING] - Skip storing constant folded nvalue return_val due to large size 8388608.
    2024-05-08 14:07:07,901 onnxscript.optimizer.constant_folding [WARNING] - Skip storing constant folded nvalue expand_1 due to large size 8388608.
    2024-05-08 14:07:07,905 onnxscript.optimizer.constant_folding [WARNING] - Skip storing constant folded nvalue clone due to large size 8388608.
    2024-05-08 14:07:07,910 onnxscript.optimizer.constant_folding [WARNING] - Skip storing constant folded nvalue alias due to large size 8388608.
    2024-05-08 14:07:07,913 onnxscript.optimizer.constant_folding [WARNING] - Skip storing constant folded nvalue eq due to large size 2097152.
    2024-05-08 14:07:08,071 onnxscript.optimizer.constant_folding [WARNING] - Skip storing constant folded nvalue return_val due to large size 4194304.
    2024-05-08 14:07:08,071 onnxscript.optimizer.constant_folding [WARNING] - Skip storing constant folded nvalue full due to large size 4194304.
    2024-05-08 14:07:08,075 onnxscript.optimizer.constant_folding [WARNING] - Skip storing constant folded nvalue triu due to large size 4194304.
    2024-05-08 14:07:08,080 onnxscript.optimizer.constant_folding [WARNING] - Skip storing constant folded nvalue convert_element_type_default due to large size 4194304.
    2024-05-08 14:07:08,081 onnxscript.optimizer.constant_folding [WARNING] - Skip storing constant folded nvalue mul due to large size 4194304.
    2024-05-08 14:07:08,083 onnxscript.optimizer.constant_folding [WARNING] - Skip storing constant folded nvalue unsqueeze_3 due to large size 4194304.
    2024-05-08 14:07:08,084 onnxscript.optimizer.constant_folding [WARNING] - Skip storing constant folded nvalue unsqueeze_4 due to large size 4194304.
    2024-05-08 14:07:08,085 onnxscript.optimizer.constant_folding [WARNING] - Skip storing constant folded nvalue slice_3 due to large size 4194304.
    2024-05-08 14:07:08,086 onnxscript.optimizer.constant_folding [WARNING] - Skip storing constant folded nvalue slice_4 due to large size 4194304.
    2024-05-08 14:07:08,088 onnxscript.optimizer.constant_folding [WARNING] - Skip storing constant folded nvalue expand_1 due to large size 8388608.
    2024-05-08 14:07:08,092 onnxscript.optimizer.constant_folding [WARNING] - Skip storing constant folded nvalue eq due to large size 2097152.
    2024-05-08 14:07:08.190834200 [W:onnxruntime:, graph.cc:4051 CleanUnusedInitializersAndNodeArgs] Removing initializer '_val_23'. It is not used by any node and should be removed from the model.
    2024-05-08 14:07:08.190879200 [W:onnxruntime:, graph.cc:4051 CleanUnusedInitializersAndNodeArgs] Removing initializer '_val_22'. It is not used by any node and should be removed from the model.

With the custom exporter

<<<

import os
import warnings
import numpy as np
import onnx

# from onnx_array_api.plotting.text_plot import onnx_simple_text_plot
import torch
import torch.onnx
from experimental_experiment.torch_models.training_helper import (
    make_aot_ort,
    train_loop,
)
from experimental_experiment.torch_models.dump_helper import dump_onnx

# from experimental_experiment.torch_interpreter import to_onnx

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    from transformers import LlamaConfig
    from transformers.models.llama.modeling_llama import LlamaModel


def ids_tensor(shape, vocab_size):
    total_dims = 1
    for dim in shape:
        total_dims *= dim

    values = []
    for _ in range(total_dims):
        values.append(np.random.randint(0, vocab_size - 1))

    return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous()


config = LlamaConfig(
    hidden_size=16,
    num_hidden_layers=1,
    vocab_size=1024,
    intermediate_size=16,
    max_position_embeddings=1024,
    num_attention_heads=2,
)
config._attn_implementation = "eager"

model = LlamaModel(config)

batch, seq, vocab_size = 2, 1024, 1024

input_ids = ids_tensor([batch, seq], vocab_size)
input_mask = torch.tril(torch.ones(batch, seq, dtype=torch.float32))

model(input_ids, input_mask)

os.environ["ONNXRT_CHANGE_REWRITER"] = "1"

local_aot_ort, _ = make_aot_ort(
    dynamic=True,
    verbose=1,
)

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    optimized_mod = torch.compile(model, backend=local_aot_ort, fullgraph=True)
    with dump_onnx("dort-llama-ort", folder="dump_llama", clean=True):
        train_loop(optimized_mod, input_ids, input_mask)

names = [_ for _ in os.listdir("dump_llama") if _.endswith(".onnx")]
print("------------------------------------------")
print(f"exported model: {names}")
for name in names:
    print()
    print("NODES in {name!r}")
    onx = onnx.load(os.path.join("dump_llama", name))
    for i, node in enumerate(onx.graph.node):
        print(
            f"{i+1}/{len(onx.graph.node)}: {node.op_type} {node.input} -> {node.output}"
        )

os.environ["ONNXRT_CHANGE_REWRITER"] = "0"

>>>

    [2024-05-08 14:07:12,831] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)
    Applied 1 of general pattern rewrite rules.
    Applied 0 of general pattern rewrite rules.
    Applied 0 of onnxruntime specific function rewrite rules.
    Applied 0 of onnxruntime specific function rewrite rules.
    Applied 0 of onnxruntime specific function rewrite rules.
    Applied 0 of onnxruntime specific function rewrite rules.
    Applied 0 of onnxruntime specific function rewrite rules.
    Applied 0 of onnxruntime specific function rewrite rules.
    Applied 0 of onnxruntime specific function rewrite rules.
    Applied 0 of onnxruntime specific pattern rewrite rules.
    Applied 0 of general pattern rewrite rules.
    Applied 0 of general pattern rewrite rules.
    Applied 0 of onnxruntime specific function rewrite rules.
    Applied 0 of onnxruntime specific function rewrite rules.
    Applied 0 of onnxruntime specific function rewrite rules.
    Applied 0 of onnxruntime specific function rewrite rules.
    Applied 0 of onnxruntime specific function rewrite rules.
    Applied 0 of onnxruntime specific function rewrite rules.
    Applied 0 of onnxruntime specific function rewrite rules.
    Applied 0 of onnxruntime specific pattern rewrite rules.
    ------------------------------------------
    exported model: ['dort-llama-ort_1.onnx', 'dort-llama-ort_0.onnx']
    
    NODES in 'dort-llama-ort_1.onnx'
    1/305: Cos ['cat'] -> ['cos']
    2/305: Constant [] -> ['aten_view_112_size_0']
    3/305: Reshape ['mm_5', 'aten_view_112_size_0'] -> ['view_25']
    4/305: Constant [] -> ['aten_view_114_size_0']
    5/305: Reshape ['mm_4', 'aten_view_114_size_0'] -> ['view_23']
    6/305: Mul ['embedding', 'rsqrt'] -> ['mul_2']
    7/305: Constant [] -> ['_val_41']
    8/305: Equal ['primals_13', '_val_41'] -> ['eq_2']
    9/305: Mul ['tangents_1', 'primals_3'] -> ['mul_14']
    10/305: Constant [] -> ['_val_44']
    11/305: Pow ['embedding', '_val_44'] -> ['pow_9']
    12/305: Mul ['add_6', 'rsqrt_2'] -> ['mul_12']
    13/305: Constant [] -> ['aten_view_125_size_0']
    14/305: Reshape ['mm_3', 'aten_view_125_size_0'] -> ['view_21']
    15/305: Sin ['cat'] -> ['sin']
    16/305: Constant [] -> ['_val_51']
    17/305: Pow ['add_6', '_val_51'] -> ['pow_5']
    18/305: Constant [] -> ['aten_unsqueeze_133_dim_0']
    19/305: Unsqueeze ['cos', 'aten_unsqueeze_133_dim_0'] -> ['unsqueeze_10']
    20/305: Mul ['view_23', 'sigmoid'] -> ['mul_10']
    21/305: Constant [] -> ['aten_unsqueeze_137_dim_0']
    22/305: Unsqueeze ['eq_2', 'aten_unsqueeze_137_dim_0'] -> ['unsqueeze_12']
    23/305: Mul ['mul_14', 'add_6'] -> ['mul_16']
    24/305: Mul ['mul_14', 'rsqrt_2'] -> ['mul_17']
    25/305: Constant [] -> ['_val_63']
    26/305: Mul ['pow_9', '_val_63'] -> ['mul_47']
    27/305: Mul ['tangents_1', 'mul_12'] -> ['mul_15']
    28/305: Constant [] -> ['alpha__1']
    29/305: Mul ['view_21', 'alpha__1'] -> ['other_1__1']
    30/305: Add ['embedding', 'other_1__1'] -> ['add_4']
    31/305: Constant [] -> ['aten_unsqueeze_145_dim_0']
    32/305: Unsqueeze ['sin', 'aten_unsqueeze_145_dim_0'] -> ['unsqueeze_11']
    33/305: Constant [] -> ['_val_69']
    34/305: Mul ['pow_5', '_val_69'] -> ['mul_20']
    35/305: Constant [] -> ['fill']
    36/305: Constant [] -> ['_val_75']
    37/305: ReduceSum ['mul_16', '_val_75'] -> ['sum_2']
    38/305: Constant [] -> ['_val_77']
    39/305: ReduceSum ['mul_15', '_val_77'] -> ['sum_1']
    40/305: Mul ['add_4', 'rsqrt_1'] -> ['mul_8']
    41/305: Constant [] -> ['_val_80']
    42/305: Pow ['add_4', '_val_80'] -> ['pow_7']
    43/305: Constant [] -> ['alpha__2']
    44/305: Mul ['sigmoid', 'alpha__2'] -> ['other_1__2']
    45/305: Sub ['fill', 'other_1__2'] -> ['sub']
    46/305: Constant [] -> ['_val_86']
    47/305: Mul ['sum_2', '_val_86'] -> ['mul_18']
    48/305: Constant [] -> ['aten_view_166_size_0']
    49/305: Reshape ['sum_1', 'aten_view_166_size_0'] -> ['view_28']
    50/305: Constant [] -> ['_val_90']
    51/305: Mul ['pow_7', '_val_90'] -> ['mul_33']
    52/305: Mul ['view_23', 'sub'] -> ['mul_24']
    53/305: Constant [] -> ['scalar_tensor_default']
    54/305: Pow ['rsqrt', 'scalar_tensor_default'] -> ['pow_8']
    55/305: Constant [] -> ['scalar_tensor_default_1']
    56/305: Pow ['rsqrt_1', 'scalar_tensor_default_1'] -> ['pow_6']
    57/305: Constant [] -> ['scalar_tensor_default_2']
    58/305: Pow ['rsqrt_2', 'scalar_tensor_default_2'] -> ['pow_4']
    59/305: Constant [] -> ['aten_add_182_other_1']
    60/305: Add ['mul_24', 'aten_add_182_other_1'] -> ['add_9']
    61/305: Mul ['mul_18', 'pow_4'] -> ['mul_19']
    62/305: Mul ['sigmoid', 'add_9'] -> ['mul_25']
    63/305: Constant [] -> ['aten_expand_186_size_1']
    64/305: Expand ['mul_19', 'aten_expand_186_size_1'] -> ['expand_9']
    65/305: Constant [] -> ['scalar_tensor_default_4']
    66/305: Div ['expand_9', 'scalar_tensor_default_4'] -> ['div_1']
    67/305: Mul ['div_1', 'mul_20'] -> ['mul_21']
    68/305: Constant [] -> ['alpha__3']
    69/305: Mul ['mul_21', 'alpha__3'] -> ['other_1__3']
    70/305: Add ['mul_17', 'other_1__3'] -> ['add_8']
    71/305: Constant [] -> ['aten_view_193_size_0']
    72/305: Reshape ['add_8', 'aten_view_193_size_0'] -> ['view_29']
    73/305: Transpose ['view_29'] -> ['t_7']
    74/305: MatMul ['view_29', 't_9'] -> ['mm_8']
    75/305: MatMul ['t_7', 'view_26'] -> ['mm_7']
    76/305: Constant [] -> ['aten_view_198_size_0']
    77/305: Reshape ['mm_8', 'aten_view_198_size_0'] -> ['view_30']
    78/305: Transpose ['mm_7'] -> ['t_8']
    79/305: Mul ['view_30', 'mul_10'] -> ['mul_22']
    80/305: Mul ['view_30', 'view_25'] -> ['mul_23']
    81/305: Transpose ['t_8'] -> ['t_10']
    82/305: Constant [] -> ['aten_view_204_size_0']
    83/305: Reshape ['mul_22', 'aten_view_204_size_0'] -> ['view_31']
    84/305: Mul ['mul_23', 'mul_25'] -> ['mul_26']
    85/305: Transpose ['view_31'] -> ['t_11']
    86/305: MatMul ['view_31', 't_13'] -> ['mm_10']
    87/305: Constant [] -> ['aten_view_209_size_0']
    88/305: Reshape ['mul_26', 'aten_view_209_size_0'] -> ['view_33']
    89/305: MatMul ['t_11', 'view_22'] -> ['mm_9']
    90/305: Constant [] -> ['aten_view_212_size_0']
    91/305: Reshape ['mm_10', 'aten_view_212_size_0'] -> ['view_32']
    92/305: Transpose ['view_33'] -> ['t_15']
    93/305: MatMul ['view_33', 't_17'] -> ['mm_12']
    94/305: Transpose ['mm_9'] -> ['t_12']
    95/305: MatMul ['t_15', 'view_22'] -> ['mm_11']
    96/305: Constant [] -> ['aten_view_218_size_0']
    97/305: Reshape ['mm_12', 'aten_view_218_size_0'] -> ['view_34']
    98/305: Transpose ['t_12'] -> ['t_14']
    99/305: Transpose ['mm_11'] -> ['t_16']
    100/305: Constant [] -> ['alpha__4']
    101/305: Mul ['view_34', 'alpha__4'] -> ['other_1__4']
    102/305: Add ['view_32', 'other_1__4'] -> ['add_10']
    103/305: Transpose ['t_16'] -> ['t_18']
    104/305: Mul ['add_10', 'primals_2'] -> ['mul_27']
    105/305: Mul ['add_10', 'mul_8'] -> ['mul_28']
    106/305: Mul ['mul_27', 'add_4'] -> ['mul_29']
    107/305: Mul ['mul_27', 'rsqrt_1'] -> ['mul_30']
    108/305: Constant [] -> ['_val_150']
    109/305: ReduceSum ['mul_28', '_val_150'] -> ['sum_3']
    110/305: Constant [] -> ['_val_152']
    111/305: ReduceSum ['mul_29', '_val_152'] -> ['sum_4']
    112/305: Constant [] -> ['alpha__5']
    113/305: Mul ['mul_30', 'alpha__5'] -> ['other_1__5']
    114/305: Add ['add_8', 'other_1__5'] -> ['add_11']
    115/305: Constant [] -> ['aten_view_233_size_0']
    116/305: Reshape ['sum_3', 'aten_view_233_size_0'] -> ['view_35']
    117/305: Constant [] -> ['_val_157']
    118/305: Mul ['sum_4', '_val_157'] -> ['mul_31']
    119/305: Mul ['mul_31', 'pow_6'] -> ['mul_32']
    120/305: Constant [] -> ['aten_expand_238_size_1']
    121/305: Expand ['mul_32', 'aten_expand_238_size_1'] -> ['expand_10']
    122/305: Constant [] -> ['scalar_tensor_default_5']
    123/305: Div ['expand_10', 'scalar_tensor_default_5'] -> ['div_2']
    124/305: Mul ['div_2', 'mul_33'] -> ['mul_34']
    125/305: Constant [] -> ['alpha__6']
    126/305: Mul ['mul_34', 'alpha__6'] -> ['other_1__6']
    127/305: Add ['add_11', 'other_1__6'] -> ['add_12']
    128/305: Constant [] -> ['aten_view_245_size_0']
    129/305: Reshape ['add_12', 'aten_view_245_size_0'] -> ['view_36']
    130/305: Transpose ['view_36'] -> ['t_19']
    131/305: MatMul ['view_36', 't_21'] -> ['mm_14']
    132/305: MatMul ['t_19', 'view_20'] -> ['mm_13']
    133/305: Constant [] -> ['aten_view_250_size_0']
    134/305: Reshape ['mm_14', 'aten_view_250_size_0'] -> ['view_37']
    135/305: Transpose ['mm_13'] -> ['t_20']
    136/305: Constant [] -> ['aten_view_253_size_0']
    137/305: Reshape ['view_37', 'aten_view_253_size_0'] -> ['view_38']
    138/305: Transpose ['t_20'] -> ['t_22']
    139/305: Transpose ['view_38'] -> ['transpose_6']
    140/305: Constant [] -> ['aten_view_258_size_0']
    141/305: Reshape ['transpose_6', 'aten_view_258_size_0'] -> ['view_39']
    142/305: MatMul ['transpose_7', 'view_39'] -> ['bmm_3']
    143/305: MatMul ['view_39', 'transpose_8'] -> ['bmm_4']
    144/305: Constant [] -> ['aten_view_262_size_0']
    145/305: Reshape ['bmm_3', 'aten_view_262_size_0'] -> ['view_40']
    146/305: Constant [] -> ['aten_view_264_size_0']
    147/305: Reshape ['bmm_4', 'aten_view_264_size_0'] -> ['view_41']
    148/305: Constant [] -> ['alpha__7']
    149/305: Mul ['view_40', 'alpha__7'] -> ['other_1__7']
    150/305: Add ['tangents_3', 'other_1__7'] -> ['add_13']
    151/305: Mul ['view_41', 'detach_13'] -> ['mul_35']
    152/305: Transpose ['add_13'] -> ['transpose_12']
    153/305: Constant [] -> ['_val_191']
    154/305: ReduceSum ['mul_35', '_val_191'] -> ['sum_5']
    155/305: Mul ['detach_13', 'sum_5'] -> ['mul_36']
    156/305: Constant [] -> ['aten_view_273_size_0']
    157/305: Reshape ['transpose_12', 'aten_view_273_size_0'] -> ['view_45']
    158/305: Constant [] -> ['alpha__8']
    159/305: Mul ['mul_36', 'alpha__8'] -> ['other_1__8']
    160/305: Sub ['mul_35', 'other_1__8'] -> ['sub_1']
    161/305: Constant [] -> ['aten_view_276_size_0']
    162/305: Reshape ['view_45', 'aten_view_276_size_0'] -> ['view_48']
    163/305: Constant [] -> ['_val_200']
    164/305: Div ['sub_1', '_val_200'] -> ['div_3']
    165/305: Transpose ['view_48'] -> ['t_23']
    166/305: MatMul ['view_48', 't_25'] -> ['mm_16']
    167/305: Constant [] -> ['aten_view_282_size_0']
    168/305: Reshape ['div_3', 'aten_view_282_size_0'] -> ['view_42']
    169/305: MatMul ['t_23', 'view_1'] -> ['mm_15']
    170/305: Constant [] -> ['aten_view_285_size_0']
    171/305: Reshape ['mm_16', 'aten_view_285_size_0'] -> ['view_49']
    172/305: MatMul ['transpose_9', 'view_42'] -> ['bmm_5']
    173/305: MatMul ['view_42', 'transpose_10'] -> ['bmm_6']
    174/305: Transpose ['mm_15'] -> ['t_24']
    175/305: Constant [] -> ['aten_view_290_size_0']
    176/305: Reshape ['bmm_5', 'aten_view_290_size_0'] -> ['view_43']
    177/305: Constant [] -> ['aten_view_292_size_0']
    178/305: Reshape ['bmm_6', 'aten_view_292_size_0'] -> ['view_44']
    179/305: Transpose ['t_24'] -> ['t_26']
    180/305: Transpose ['view_43'] -> ['transpose_11']
    181/305: Mul ['view_44', 'unsqueeze_11'] -> ['mul_39']
    182/305: Mul ['view_44', 'unsqueeze_10'] -> ['mul_40']
    183/305: Constant [] -> ['alpha__9']
    184/305: Mul ['transpose_11', 'alpha__9'] -> ['other_1__9']
    185/305: Add ['tangents_2', 'other_1__9'] -> ['add_14']
    186/305: Constant [] -> ['_val_224']
    187/305: Constant [] -> ['_val_228']
    188/305: Constant [] -> ['_val_232']
    189/305: Constant [] -> ['_val_236']
    190/305: Slice ['mul_39', '_val_224', '_val_228', '_val_232', '_val_236'] -> ['slice_22']
    191/305: Constant [] -> ['_val_241']
    192/305: Constant [] -> ['_val_245']
    193/305: Constant [] -> ['_val_249']
    194/305: Constant [] -> ['_val_253']
    195/305: Slice ['mul_39', '_val_241', '_val_245', '_val_249', '_val_253'] -> ['slice_23']
    196/305: Mul ['add_14', 'unsqueeze_11'] -> ['mul_37']
    197/305: Mul ['add_14', 'unsqueeze_10'] -> ['mul_38']
    198/305: Neg ['slice_22'] -> ['neg_3']
    199/305: Constant [] -> ['_val_263']
    200/305: Constant [] -> ['_val_267']
    201/305: Constant [] -> ['_val_271']
    202/305: Constant [] -> ['_val_275']
    203/305: Slice ['mul_37', '_val_263', '_val_267', '_val_271', '_val_275'] -> ['slice_20']
    204/305: Constant [] -> ['_val_280']
    205/305: Constant [] -> ['_val_284']
    206/305: Constant [] -> ['_val_288']
    207/305: Constant [] -> ['_val_292']
    208/305: Slice ['mul_37', '_val_280', '_val_284', '_val_288', '_val_292'] -> ['slice_21']
    209/305: Constant [] -> ['_val_311']
    210/305: Transpose ['slice_23'] -> ['_val_312']
    211/305: Constant [] -> ['_val_313']
    212/305: ScatterND ['_val_313', '_val_311', '_val_312'] -> ['_val_314']
    213/305: Transpose ['_val_314'] -> ['slice_scatter_3']
    214/305: Neg ['slice_20'] -> ['neg_2']
    215/305: Constant [] -> ['_val_334']
    216/305: Transpose ['neg_3'] -> ['_val_335']
    217/305: Constant [] -> ['_val_336']
    218/305: ScatterND ['_val_336', '_val_334', '_val_335'] -> ['_val_337']
    219/305: Transpose ['_val_337'] -> ['slice_scatter_2']
    220/305: Constant [] -> ['_val_356']
    221/305: Transpose ['slice_21'] -> ['_val_357']
    222/305: Constant [] -> ['_val_358']
    223/305: ScatterND ['_val_358', '_val_356', '_val_357'] -> ['_val_359']
    224/305: Transpose ['_val_359'] -> ['slice_scatter_1']
    225/305: Constant [] -> ['alpha__10']
    226/305: Mul ['slice_scatter_3', 'alpha__10'] -> ['other_1__10']
    227/305: Add ['slice_scatter_2', 'other_1__10'] -> ['add_17']
    228/305: Constant [] -> ['_val_377']
    229/305: Transpose ['neg_2'] -> ['_val_378']
    230/305: Constant [] -> ['_val_379']
    231/305: ScatterND ['_val_379', '_val_377', '_val_378'] -> ['_val_380']
    232/305: Transpose ['_val_380'] -> ['slice_scatter']
    233/305: Constant [] -> ['alpha__11']
    234/305: Mul ['mul_40', 'alpha__11'] -> ['other_1__11']
    235/305: Add ['add_17', 'other_1__11'] -> ['add_18']
    236/305: Constant [] -> ['alpha__12']
    237/305: Mul ['slice_scatter_1', 'alpha__12'] -> ['other_1__12']
    238/305: Add ['slice_scatter', 'other_1__12'] -> ['add_15']
    239/305: Transpose ['add_18'] -> ['transpose_14']
    240/305: Constant [] -> ['alpha__13']
    241/305: Mul ['mul_38', 'alpha__13'] -> ['other_1__13']
    242/305: Add ['add_15', 'other_1__13'] -> ['add_16']
    243/305: Transpose ['add_16'] -> ['transpose_13']
    244/305: Constant [] -> ['aten_view_466_size_0']
    245/305: Reshape ['transpose_14', 'aten_view_466_size_0'] -> ['view_47']
    246/305: Constant [] -> ['aten_view_469_size_0']
    247/305: Reshape ['view_47', 'aten_view_469_size_0'] -> ['view_52']
    248/305: Constant [] -> ['aten_view_471_size_0']
    249/305: Reshape ['transpose_13', 'aten_view_471_size_0'] -> ['view_46']
    250/305: Transpose ['view_52'] -> ['t_31']
    251/305: MatMul ['view_52', 't_33'] -> ['mm_20']
    252/305: Constant [] -> ['aten_view_475_size_0']
    253/305: Reshape ['view_46', 'aten_view_475_size_0'] -> ['view_50']
    254/305: MatMul ['t_31', 'view_1'] -> ['mm_19']
    255/305: Constant [] -> ['aten_view_478_size_0']
    256/305: Reshape ['mm_20', 'aten_view_478_size_0'] -> ['view_53']
    257/305: Transpose ['view_50'] -> ['t_27']
    258/305: MatMul ['view_50', 't_29'] -> ['mm_18']
    259/305: Transpose ['mm_19'] -> ['t_32']
    260/305: MatMul ['t_27', 'view_1'] -> ['mm_17']
    261/305: Constant [] -> ['aten_view_484_size_0']
    262/305: Reshape ['mm_18', 'aten_view_484_size_0'] -> ['view_51']
    263/305: Transpose ['t_32'] -> ['t_34']
    264/305: Transpose ['mm_17'] -> ['t_28']
    265/305: Constant [] -> ['alpha__14']
    266/305: Mul ['view_51', 'alpha__14'] -> ['other_1__14']
    267/305: Add ['view_49', 'other_1__14'] -> ['add_19']
    268/305: Transpose ['t_28'] -> ['t_30']
    269/305: Constant [] -> ['alpha__15']
    270/305: Mul ['view_53', 'alpha__15'] -> ['other_1__15']
    271/305: Add ['add_19', 'other_1__15'] -> ['add_20']
    272/305: Mul ['add_20', 'primals_1'] -> ['mul_41']
    273/305: Mul ['add_20', 'mul_2'] -> ['mul_42']
    274/305: Mul ['mul_41', 'embedding'] -> ['mul_43']
    275/305: Mul ['mul_41', 'rsqrt'] -> ['mul_44']
    276/305: Constant [] -> ['_val_417']
    277/305: ReduceSum ['mul_42', '_val_417'] -> ['sum_6']
    278/305: Constant [] -> ['_val_419']
    279/305: ReduceSum ['mul_43', '_val_419'] -> ['sum_7']
    280/305: Constant [] -> ['alpha__16']
    281/305: Mul ['mul_44', 'alpha__16'] -> ['other_1__16']
    282/305: Add ['add_12', 'other_1__16'] -> ['add_21']
    283/305: Constant [] -> ['aten_view_500_size_0']
    284/305: Reshape ['sum_6', 'aten_view_500_size_0'] -> ['view_54']
    285/305: Constant [] -> ['_val_424']
    286/305: Mul ['sum_7', '_val_424'] -> ['mul_45']
    287/305: Mul ['mul_45', 'pow_8'] -> ['mul_46']
    288/305: Constant [] -> ['aten_expand_505_size_1']
    289/305: Expand ['mul_46', 'aten_expand_505_size_1'] -> ['expand_11']
    290/305: Constant [] -> ['scalar_tensor_default_6']
    291/305: Div ['expand_11', 'scalar_tensor_default_6'] -> ['div_4']
    292/305: Mul ['div_4', 'mul_47'] -> ['mul_48']
    293/305: Constant [] -> ['alpha__17']
    294/305: Mul ['mul_48', 'alpha__17'] -> ['other_1__17']
    295/305: Add ['add_21', 'other_1__17'] -> ['add_22']
    296/305: Constant [] -> ['aten_masked_fill_512_value_cast']
    297/305: Where ['unsqueeze_12', 'aten_masked_fill_512_value_cast', 'add_22'] -> ['masked_fill_1']
    298/305: Constant [] -> ['_val_436']
    299/305: ConstantOfShape ['_val_436'] -> ['aten_new_zeros_514_result']
    300/305: SequenceConstruct ['primals_13'] -> ['438']
    301/305: Constant [] -> ['int64_0__18']
    302/305: SequenceAt ['438', 'int64_0__18'] -> ['index__18']
    303/305: Constant [] -> ['int64_m1_1d__18']
    304/305: Unsqueeze ['index__18', 'int64_m1_1d__18'] -> ['new_index__18']
    305/305: ScatterND ['aten_new_zeros_514_result', 'new_index__18', 'masked_fill_1'] -> ['_unsafe_index_put']
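
The listing above and the one that follows are the node dumps of the two exported graphs. The first one consumes 'tangents_1', 'tangents_2', 'tangents_3' among its inputs, so it appears to be the backward graph; the second one only consumes 'primals_*' inputs and is the forward graph. The literal header NODES in {name!r} suggests the helper that printed these dumps is missing the f prefix on its format string. A minimal sketch of such a helper, assuming the two files listed above ('dort-llama-ort_1.onnx', 'dort-llama-ort_0.onnx') sit in the current directory:

import onnx

def print_nodes(name: str):
    # Load an exported model and list its nodes in the same
    # "i/N: OpType [inputs] -> [outputs]" format as the dumps above.
    model = onnx.load(name)
    nodes = model.graph.node
    print(f"NODES in {name!r}")
    for i, node in enumerate(nodes, start=1):
        print(f"{i}/{len(nodes)}: {node.op_type} {list(node.input)} -> {list(node.output)}")

print_nodes("dort-llama-ort_1.onnx")
print_nodes("dort-llama-ort_0.onnx")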
    
    NODES in {name!r}
    1/248: Gather ['primals_4', 'primals_13'] -> ['embedding']
    2/248: Transpose ['primals_8'] -> ['t_3']
    3/248: Constant [] -> ['_val_22']
    4/248: Constant [] -> ['_val_23']
    5/248: Constant [] -> ['size_0__1']
    6/248: Constant [] -> ['fill_value_1__1']
    7/248: Expand ['fill_value_1__1', 'size_0__1'] -> ['full']
    8/248: Constant [] -> ['_val_36']
    9/248: Constant [] -> ['_val_40']
    10/248: Constant [] -> ['_val_44']
    11/248: Constant [] -> ['_val_48']
    12/248: Slice ['primals_14', '_val_36', '_val_40', '_val_44', '_val_48'] -> ['slice_5']
    13/248: Transpose ['primals_9'] -> ['t_4']
    14/248: Transpose ['primals_10'] -> ['t_5']
    15/248: Transpose ['primals_11'] -> ['t_6']
    16/248: Transpose ['primals_5'] -> ['t']
    17/248: Transpose ['primals_6'] -> ['t_1']
    18/248: Transpose ['primals_7'] -> ['t_2']
    19/248: Constant [] -> ['aten_unsqueeze_155_dim_0']
    20/248: Unsqueeze ['primals_12', 'aten_unsqueeze_155_dim_0'] -> ['unsqueeze_7']
    21/248: Constant [] -> ['scalar_tensor_default']
    22/248: Pow ['embedding', 'scalar_tensor_default'] -> ['pow_1']
    23/248: Transpose ['t_3'] -> ['t_21']
    24/248: Constant [] -> ['aten_triu_163_diagonal']
    25/248: Trilu ['full', 'aten_triu_163_diagonal'] -> ['triu']
    26/248: Constant [] -> ['aten_unsqueeze_164_dim_0']
    27/248: Unsqueeze ['slice_5', 'aten_unsqueeze_164_dim_0'] -> ['unsqueeze_5']
    28/248: Transpose ['t_4'] -> ['t_17']
    29/248: Transpose ['t_5'] -> ['t_13']
    30/248: Transpose ['t_6'] -> ['t_9']
    31/248: Transpose ['t'] -> ['t_33']
    32/248: Transpose ['t_1'] -> ['t_29']
    33/248: Transpose ['t_2'] -> ['t_25']
    34/248: Constant [] -> ['_val_75']
    35/248: Constant [] -> ['_val_79']
    36/248: Constant [] -> ['_val_83']
    37/248: Constant [] -> ['_val_87']
    38/248: Slice ['unsqueeze_7', '_val_75', '_val_79', '_val_83', '_val_87'] -> ['slice_7']
    39/248: Constant [] -> ['gt']
    40/248: Constant [] -> ['_val_107']
    41/248: ReduceMean ['pow_1', '_val_107'] -> ['mean']
    42/248: Constant [] -> ['aten_unsqueeze_208_dim_0']
    43/248: Unsqueeze ['unsqueeze_5', 'aten_unsqueeze_208_dim_0'] -> ['unsqueeze_6']
    44/248: Constant [] -> ['aten_unsqueeze_209_dim_0']
    45/248: Unsqueeze ['slice_7', 'aten_unsqueeze_209_dim_0'] -> ['unsqueeze_8']
    46/248: Cast ['gt'] -> ['convert_element_type_default']
    47/248: Mul ['triu', 'convert_element_type_default'] -> ['mul']
    48/248: Constant [] -> ['aten_add_214_other_1']
    49/248: Add ['mean', 'aten_add_214_other_1'] -> ['add']
    50/248: Constant [] -> ['_val_119']
    51/248: Constant [] -> ['_val_123']
    52/248: Constant [] -> ['_val_127']
    53/248: Constant [] -> ['_val_131']
    54/248: Slice ['unsqueeze_6', '_val_119', '_val_123', '_val_127', '_val_131'] -> ['slice_6']
    55/248: Constant [] -> ['aten_expand_233_size_1']
    56/248: Expand ['unsqueeze_8', 'aten_expand_233_size_1'] -> ['expand_2']
    57/248: Constant [] -> ['aten_unsqueeze_251_dim_0']
    58/248: Unsqueeze ['mul', 'aten_unsqueeze_251_dim_0'] -> ['unsqueeze_3']
    59/248: Sqrt ['add'] -> ['aten_rsqrt_252_tmp']
    60/248: Reciprocal ['aten_rsqrt_252_tmp'] -> ['rsqrt']
    61/248: Constant [] -> ['_val_154']
    62/248: Equal ['slice_6', '_val_154'] -> ['eq_1']
    63/248: Constant [] -> ['aten_expand_256_size_1']
    64/248: Expand ['expand_2', 'aten_expand_256_size_1'] -> ['expand_3']
    65/248: Constant [] -> ['aten_unsqueeze_258_dim_0']
    66/248: Unsqueeze ['unsqueeze_3', 'aten_unsqueeze_258_dim_0'] -> ['unsqueeze_4']
    67/248: Mul ['embedding', 'rsqrt'] -> ['mul_2']
    68/248: Constant [] -> ['_val_168']
    69/248: Constant [] -> ['_val_172']
    70/248: Constant [] -> ['_val_176']
    71/248: Constant [] -> ['_val_180']
    72/248: Slice ['unsqueeze_4', '_val_168', '_val_172', '_val_176', '_val_180'] -> ['slice_3']
    73/248: Mul ['primals_1', 'mul_2'] -> ['mul_3']
    74/248: Constant [] -> ['view_11']
    75/248: Constant [] -> ['_val_188']
    76/248: Constant [] -> ['_val_192']
    77/248: Constant [] -> ['_val_196']
    78/248: Constant [] -> ['_val_200']
    79/248: Slice ['slice_3', '_val_188', '_val_192', '_val_196', '_val_200'] -> ['slice_4']
    80/248: Constant [] -> ['aten_view_302_size_0']
    81/248: Reshape ['mul_3', 'aten_view_302_size_0'] -> ['view_1']
    82/248: Constant [] -> ['aten_expand_305_size_1']
    83/248: Expand ['slice_4', 'aten_expand_305_size_1'] -> ['expand_1']
    84/248: MatMul ['view_1', 't'] -> ['mm']
    85/248: MatMul ['view_1', 't_1'] -> ['mm_1']
    86/248: MatMul ['view_1', 't_2'] -> ['mm_2']
    87/248: MatMul ['expand_3', 'view_11'] -> ['view_12']
    88/248: Constant [] -> ['aten_view_313_size_0']
    89/248: Reshape ['mm', 'aten_view_313_size_0'] -> ['view_2']
    90/248: Constant [] -> ['aten_view_315_size_0']
    91/248: Reshape ['mm_1', 'aten_view_315_size_0'] -> ['view_4']
    92/248: Constant [] -> ['aten_view_317_size_0']
    93/248: Reshape ['mm_2', 'aten_view_317_size_0'] -> ['view_6']
    94/248: Transpose ['view_12'] -> ['transpose_3']
    95/248: Constant [] -> ['aten_view_321_size_0']
    96/248: Reshape ['view_2', 'aten_view_321_size_0'] -> ['view_7']
    97/248: Constant [] -> ['aten_view_323_size_0']
    98/248: Reshape ['view_4', 'aten_view_323_size_0'] -> ['view_8']
    99/248: Constant [] -> ['aten_view_325_size_0']
    100/248: Reshape ['view_6', 'aten_view_325_size_0'] -> ['view_9']
    101/248: Concat ['transpose_3', 'transpose_3'] -> ['cat']
    102/248: Constant [] -> ['_val_229']
    103/248: Equal ['expand_1', '_val_229'] -> ['eq']
    104/248: Transpose ['view_7'] -> ['transpose']
    105/248: Transpose ['view_8'] -> ['transpose_1']
    106/248: Transpose ['view_9'] -> ['transpose_2']
    107/248: Cos ['cat'] -> ['cos']
    108/248: Sin ['cat'] -> ['sin']
    109/248: And ['eq', 'eq_1'] -> ['mul_1']
    110/248: Constant [] -> ['_val_240']
    111/248: Constant [] -> ['_val_244']
    112/248: Constant [] -> ['_val_248']
    113/248: Constant [] -> ['_val_252']
    114/248: Slice ['transpose', '_val_240', '_val_244', '_val_248', '_val_252'] -> ['slice_10']
    115/248: Constant [] -> ['_val_257']
    116/248: Constant [] -> ['_val_261']
    117/248: Constant [] -> ['_val_265']
    118/248: Constant [] -> ['_val_269']
    119/248: Slice ['transpose', '_val_257', '_val_261', '_val_265', '_val_269'] -> ['slice_11']
    120/248: Constant [] -> ['_val_274']
    121/248: Constant [] -> ['_val_278']
    122/248: Constant [] -> ['_val_282']
    123/248: Constant [] -> ['_val_286']
    124/248: Slice ['transpose_1', '_val_274', '_val_278', '_val_282', '_val_286'] -> ['slice_12']
    125/248: Constant [] -> ['_val_291']
    126/248: Constant [] -> ['_val_295']
    127/248: Constant [] -> ['_val_299']
    128/248: Constant [] -> ['_val_303']
    129/248: Slice ['transpose_1', '_val_291', '_val_295', '_val_299', '_val_303'] -> ['slice_13']
    130/248: Constant [] -> ['aten_expand_405_size_1']
    131/248: Expand ['transpose_2', 'aten_expand_405_size_1'] -> ['expand_8']
    132/248: Constant [] -> ['aten_unsqueeze_406_dim_0']
    133/248: Unsqueeze ['cos', 'aten_unsqueeze_406_dim_0'] -> ['unsqueeze_10']
    134/248: Constant [] -> ['aten_unsqueeze_407_dim_0']
    135/248: Unsqueeze ['sin', 'aten_unsqueeze_407_dim_0'] -> ['unsqueeze_11']
    136/248: Constant [] -> ['_val_309']
    137/248: Where ['mul_1', '_val_309', 'expand_1'] -> ['masked_fill']
    138/248: Neg ['slice_11'] -> ['neg']
    139/248: Neg ['slice_13'] -> ['neg_1']
    140/248: Mul ['transpose', 'unsqueeze_10'] -> ['mul_4']
    141/248: Mul ['transpose_1', 'unsqueeze_10'] -> ['mul_6']
    142/248: Concat ['neg', 'slice_10'] -> ['cat_1']
    143/248: Concat ['neg_1', 'slice_12'] -> ['cat_2']
    144/248: Constant [] -> ['aten_view_421_size_0']
    145/248: Reshape ['expand_8', 'aten_view_421_size_0'] -> ['view_17']
    146/248: Constant [] -> ['_val_326']
    147/248: Constant [] -> ['_val_330']
    148/248: Constant [] -> ['_val_334']
    149/248: Constant [] -> ['_val_338']
    150/248: Slice ['masked_fill', '_val_326', '_val_330', '_val_334', '_val_338'] -> ['slice_17']
    151/248: Mul ['cat_1', 'unsqueeze_11'] -> ['mul_5']
    152/248: Mul ['cat_2', 'unsqueeze_11'] -> ['mul_7']
    153/248: Transpose ['view_17'] -> ['transpose_8']
    154/248: Constant [] -> ['_val_346']
    155/248: Constant [] -> ['_val_350']
    156/248: Constant [] -> ['_val_354']
    157/248: Constant [] -> ['_val_358']
    158/248: Slice ['slice_17', '_val_346', '_val_350', '_val_354', '_val_358'] -> ['slice_18']
    159/248: Constant [] -> ['alpha__2']
    160/248: Mul ['mul_5', 'alpha__2'] -> ['other_1__2']
    161/248: Add ['mul_4', 'other_1__2'] -> ['add_1']
    162/248: Constant [] -> ['alpha__3']
    163/248: Mul ['mul_7', 'alpha__3'] -> ['other_1__3']
    164/248: Add ['mul_6', 'other_1__3'] -> ['add_2']
    165/248: Constant [] -> ['_val_365']
    166/248: Constant [] -> ['_val_369']
    167/248: Constant [] -> ['_val_373']
    168/248: Constant [] -> ['_val_377']
    169/248: Slice ['slice_18', '_val_365', '_val_369', '_val_373', '_val_377'] -> ['slice_19']
    170/248: Constant [] -> ['aten_expand_479_size_1']
    171/248: Expand ['add_1', 'aten_expand_479_size_1'] -> ['expand_5']
    172/248: Transpose ['add_2'] -> ['transpose_4']
    173/248: Constant [] -> ['aten_expand_483_size_1']
    174/248: Expand ['transpose_4', 'aten_expand_483_size_1'] -> ['expand_6']
    175/248: Constant [] -> ['aten_view_485_size_0']
    176/248: Reshape ['expand_5', 'aten_view_485_size_0'] -> ['view_13']
    177/248: Transpose ['view_13'] -> ['transpose_9']
    178/248: Constant [] -> ['aten_view_489_size_0']
    179/248: Reshape ['expand_6', 'aten_view_489_size_0'] -> ['view_14']
    180/248: MatMul ['view_13', 'view_14'] -> ['bmm_1']
    181/248: Transpose ['view_14'] -> ['transpose_10']
    182/248: Constant [] -> ['aten_view_493_size_0']
    183/248: Reshape ['bmm_1', 'aten_view_493_size_0'] -> ['view_15']
    184/248: Constant [] -> ['_val_395']
    185/248: Div ['view_15', '_val_395'] -> ['div']
    186/248: Constant [] -> ['alpha__4']
    187/248: Mul ['slice_19', 'alpha__4'] -> ['other_1__4']
    188/248: Add ['div', 'other_1__4'] -> ['add_3']
    189/248: Softmax ['add_3'] -> ['_softmax']
    190/248: Constant [] -> ['aten_expand_502_size_1']
    191/248: Expand ['_softmax', 'aten_expand_502_size_1'] -> ['expand_7']
    192/248: Constant [] -> ['aten_view_505_size_0']
    193/248: Reshape ['expand_7', 'aten_view_505_size_0'] -> ['view_16']
    194/248: Identity ['_softmax'] -> ['detach_13']
    195/248: MatMul ['view_16', 'view_17'] -> ['bmm_2']
    196/248: Transpose ['view_16'] -> ['transpose_7']
    197/248: Constant [] -> ['aten_view_510_size_0']
    198/248: Reshape ['bmm_2', 'aten_view_510_size_0'] -> ['view_18']
    199/248: Transpose ['view_18'] -> ['transpose_5']
    200/248: Constant [] -> ['aten_view_514_size_0']
    201/248: Reshape ['transpose_5', 'aten_view_514_size_0'] -> ['view_19']
    202/248: Constant [] -> ['aten_view_516_size_0']
    203/248: Reshape ['view_19', 'aten_view_516_size_0'] -> ['view_20']
    204/248: MatMul ['view_20', 't_3'] -> ['mm_3']
    205/248: Constant [] -> ['aten_view_519_size_0']
    206/248: Reshape ['mm_3', 'aten_view_519_size_0'] -> ['view_21']
    207/248: Constant [] -> ['alpha__5']
    208/248: Mul ['view_21', 'alpha__5'] -> ['other_1__5']
    209/248: Add ['embedding', 'other_1__5'] -> ['add_4']
    210/248: Constant [] -> ['scalar_tensor_default_1']
    211/248: Pow ['add_4', 'scalar_tensor_default_1'] -> ['pow_2']
    212/248: Constant [] -> ['_val_425']
    213/248: ReduceMean ['pow_2', '_val_425'] -> ['mean_1']
    214/248: Constant [] -> ['aten_add_527_other_1']
    215/248: Add ['mean_1', 'aten_add_527_other_1'] -> ['add_5']
    216/248: Sqrt ['add_5'] -> ['aten_rsqrt_528_tmp']
    217/248: Reciprocal ['aten_rsqrt_528_tmp'] -> ['rsqrt_1']
    218/248: Mul ['add_4', 'rsqrt_1'] -> ['mul_8']
    219/248: Mul ['primals_2', 'mul_8'] -> ['mul_9']
    220/248: Constant [] -> ['aten_view_532_size_0']
    221/248: Reshape ['mul_9', 'aten_view_532_size_0'] -> ['view_22']
    222/248: MatMul ['view_22', 't_4'] -> ['mm_4']
    223/248: MatMul ['view_22', 't_5'] -> ['mm_5']
    224/248: Constant [] -> ['aten_view_536_size_0']
    225/248: Reshape ['mm_4', 'aten_view_536_size_0'] -> ['view_23']
    226/248: Constant [] -> ['aten_view_538_size_0']
    227/248: Reshape ['mm_5', 'aten_view_538_size_0'] -> ['view_25']
    228/248: Sigmoid ['view_23'] -> ['sigmoid']
    229/248: Mul ['view_23', 'sigmoid'] -> ['mul_10']
    230/248: Mul ['mul_10', 'view_25'] -> ['mul_11']
    231/248: Constant [] -> ['aten_view_543_size_0']
    232/248: Reshape ['mul_11', 'aten_view_543_size_0'] -> ['view_26']
    233/248: MatMul ['view_26', 't_6'] -> ['mm_6']
    234/248: Constant [] -> ['aten_view_546_size_0']
    235/248: Reshape ['mm_6', 'aten_view_546_size_0'] -> ['view_27']
    236/248: Constant [] -> ['alpha__6']
    237/248: Mul ['view_27', 'alpha__6'] -> ['other_1__6']
    238/248: Add ['add_4', 'other_1__6'] -> ['add_6']
    239/248: Constant [] -> ['scalar_tensor_default_2']
    240/248: Pow ['add_6', 'scalar_tensor_default_2'] -> ['pow_3']
    241/248: Constant [] -> ['_val_452']
    242/248: ReduceMean ['pow_3', '_val_452'] -> ['mean_2']
    243/248: Constant [] -> ['aten_add_554_other_1']
    244/248: Add ['mean_2', 'aten_add_554_other_1'] -> ['add_7']
    245/248: Sqrt ['add_7'] -> ['aten_rsqrt_555_tmp']
    246/248: Reciprocal ['aten_rsqrt_555_tmp'] -> ['rsqrt_2']
    247/248: Mul ['add_6', 'rsqrt_2'] -> ['mul_12']
    248/248: Mul ['primals_3', 'mul_12'] -> ['mul_13']
    [runpythonerror]
    /home/xadupre/.local/lib/python3.10/site-packages/torch/onnx/_internal/exporter.py:137: UserWarning: torch.onnx.dynamo_export only implements opset version 18 for now. If you need to use a different opset version, please register them with register_custom_op.
      warnings.warn(
    2024-05-08 14:07:16,000 onnxscript.optimizer.constant_folding [WARNING] - Skip storing constant folded nvalue return_val due to large size 4194304.
    2024-05-08 14:07:16,000 onnxscript.optimizer.constant_folding [WARNING] - Skip storing constant folded nvalue full due to large size 4194304.
    2024-05-08 14:07:16,025 onnxscript.optimizer.constant_folding [WARNING] - Skip storing constant folded nvalue triu due to large size 4194304.
    2024-05-08 14:07:16,062 onnxscript.optimizer.constant_folding [WARNING] - Skip storing constant folded nvalue convert_element_type_default due to large size 4194304.
    2024-05-08 14:07:16,063 onnxscript.optimizer.constant_folding [WARNING] - Skip storing constant folded nvalue mul due to large size 4194304.
    2024-05-08 14:07:16,078 onnxscript.optimizer.constant_folding [WARNING] - Skip storing constant folded nvalue return_val due to large size 4194304.
    2024-05-08 14:07:16,078 onnxscript.optimizer.constant_folding [WARNING] - Skip storing constant folded nvalue unsqueeze_3 due to large size 4194304.
    2024-05-08 14:07:16,081 onnxscript.optimizer.constant_folding [WARNING] - Skip storing constant folded nvalue return_val due to large size 4194304.
    2024-05-08 14:07:16,081 onnxscript.optimizer.constant_folding [WARNING] - Skip storing constant folded nvalue unsqueeze_4 due to large size 4194304.
    2024-05-08 14:07:16,088 onnxscript.optimizer.constant_folding [WARNING] - Skip storing constant folded nvalue slice_3 due to large size 4194304.
    2024-05-08 14:07:16,093 onnxscript.optimizer.constant_folding [WARNING] - Skip storing constant folded nvalue slice_4 due to large size 4194304.
    2024-05-08 14:07:16,098 onnxscript.optimizer.constant_folding [WARNING] - Skip storing constant folded nvalue return_val due to large size 8388608.
    2024-05-08 14:07:16,099 onnxscript.optimizer.constant_folding [WARNING] - Skip storing constant folded nvalue expand_1 due to large size 8388608.
    2024-05-08 14:07:16,103 onnxscript.optimizer.constant_folding [WARNING] - Skip storing constant folded nvalue clone due to large size 8388608.
    2024-05-08 14:07:16,109 onnxscript.optimizer.constant_folding [WARNING] - Skip storing constant folded nvalue alias due to large size 8388608.
    2024-05-08 14:07:16,112 onnxscript.optimizer.constant_folding [WARNING] - Skip storing constant folded nvalue eq due to large size 2097152.
    2024-05-08 14:07:16,266 onnxscript.optimizer.constant_folding [WARNING] - Skip storing constant folded nvalue return_val due to large size 4194304.
    2024-05-08 14:07:16,266 onnxscript.optimizer.constant_folding [WARNING] - Skip storing constant folded nvalue full due to large size 4194304.
    2024-05-08 14:07:16,270 onnxscript.optimizer.constant_folding [WARNING] - Skip storing constant folded nvalue triu due to large size 4194304.
    2024-05-08 14:07:16,275 onnxscript.optimizer.constant_folding [WARNING] - Skip storing constant folded nvalue convert_element_type_default due to large size 4194304.
    2024-05-08 14:07:16,276 onnxscript.optimizer.constant_folding [WARNING] - Skip storing constant folded nvalue mul due to large size 4194304.
    2024-05-08 14:07:16,278 onnxscript.optimizer.constant_folding [WARNING] - Skip storing constant folded nvalue unsqueeze_3 due to large size 4194304.
    2024-05-08 14:07:16,279 onnxscript.optimizer.constant_folding [WARNING] - Skip storing constant folded nvalue unsqueeze_4 due to large size 4194304.
    2024-05-08 14:07:16,280 onnxscript.optimizer.constant_folding [WARNING] - Skip storing constant folded nvalue slice_3 due to large size 4194304.
    2024-05-08 14:07:16,281 onnxscript.optimizer.constant_folding [WARNING] - Skip storing constant folded nvalue slice_4 due to large size 4194304.
    2024-05-08 14:07:16,283 onnxscript.optimizer.constant_folding [WARNING] - Skip storing constant folded nvalue expand_1 due to large size 8388608.
    2024-05-08 14:07:16,286 onnxscript.optimizer.constant_folding [WARNING] - Skip storing constant folded nvalue eq due to large size 2097152.
    2024-05-08 14:07:16.373616600 [W:onnxruntime:, graph.cc:4051 CleanUnusedInitializersAndNodeArgs] Removing initializer '_val_23'. It is not used by any node and should be removed from the model.
    2024-05-08 14:07:16.373665800 [W:onnxruntime:, graph.cc:4051 CleanUnusedInitializersAndNodeArgs] Removing initializer '_val_22'. It is not used by any node and should be removed from the model.
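
The onnxscript warnings only mean that constant folding computed values too large to be stored back into the model (the sizes shown are in bytes), and the onnxruntime messages report that initializers '_val_22' and '_val_23' ended up unused after optimization and were dropped; neither should change the computed results. To compare the two exported graphs at a glance, one can histogram their operator types. A minimal sketch, assuming the same two file names as above:

import collections

import onnx

for name in ["dort-llama-ort_0.onnx", "dort-llama-ort_1.onnx"]:
    model = onnx.load(name)
    # Count how many times every operator type appears in the graph.
    counts = collections.Counter(node.op_type for node in model.graph.node)
    print(name, counts.most_common(10))

A large share of the nodes are Constant nodes feeding Reshape, Slice or Unsqueeze, which the rewrite rules (the "Applied 0 of ..." lines at the top of the output) left untouched in this run.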