Export with dynamic dimensions in {0,1} into ONNX (custom)

This example duplicates Export with dynamic dimensions in {0,1} but uses experimental_experiment.torch_interpreter.to_onnx(). It checks which input sets can be used to export the model and which input sets the exported model then accepts.

Available input sets

import itertools
from tqdm import tqdm
import numpy as np
import pandas
import onnxruntime
from onnx_diagnostic import doc
from onnx_diagnostic.helpers import max_diff, string_type, flatten_object
from onnx_diagnostic.helpers.torch_helper import torch_deepcopy
from onnx_diagnostic.helpers.rt_helper import make_feeds
from onnx_diagnostic.torch_models.hghub.model_inputs import get_untrained_model_with_inputs
from onnx_diagnostic.torch_export_patches import (
    torch_export_patches,
    register_additional_serialization_functions,
)
from experimental_experiment.torch_interpreter import to_onnx, ExportOptions


data = get_untrained_model_with_inputs("arnir0/Tiny-LLM", add_second_input=True)
model, dynamic_shapes = data["model"], data["dynamic_shapes"]

The trained model can be obtained with:

import transformers

MODEL_NAME = "arnir0/Tiny-LLM"
tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_NAME)
model = transformers.AutoModelForCausalLM.from_pretrained(MODEL_NAME)

input_sets = {k: v for k, v in data.items() if k.startswith("inputs")}

for k, v in input_sets.items():
    print(f"{k:20}: {string_type(v, with_shape=True)}")
inputs              : dict(input_ids:T7s2x3,attention_mask:T7s2x33,position_ids:T7s2x3,past_key_values:DynamicCache(key_cache=#1[T1s2x1x30x96], value_cache=#1[T1s2x1x30x96]))
inputs2             : dict(input_ids:T7s3x4,attention_mask:T7s3x35,position_ids:T7s3x4,past_key_values:DynamicCache(key_cache=#1[T1s3x1x31x96], value_cache=#1[T1s3x1x31x96]))
inputs_empty_cache  : dict(input_ids:T7s2x3,attention_mask:T7s2x3,position_ids:T7s2x3,past_key_values:DynamicCache(key_cache=#1[T1s2x1x0x96], value_cache=#1[T1s2x1x0x96]))
inputs_batch1       : dict(input_ids:T7s1x3,attention_mask:T7s1x33,position_ids:T7s1x3,past_key_values:DynamicCache(key_cache=#1[T1s1x1x30x96], value_cache=#1[T1s1x1x30x96]))

The dynamic shapes are:

print(f"dynamic_shapes: {string_type(dynamic_shapes)}")
dynamic_shapes: dict(input_ids:{0:DYN(batch),1:DYN(seq_length)},attention_mask:{0:DYN(batch),1:DYN(cache+seq)},position_ids:{0:DYN(batch),1:DYN(cache+seq)},past_key_values:#2[#1[{0:DYN(batch),2:DYN(cache_length)}],#1[{0:DYN(batch),2:DYN(cache_length)}]])
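
In this notation, T7s2x3 is an int64 tensor of shape 2x3, T1 a float32 tensor, and DYN(name) a dynamic dimension. Written by hand, the specification roughly corresponds to the structure below (an illustration only, with our own dimension names; the example keeps the dictionary returned by get_untrained_model_with_inputs, which also knows that cache+seq stands for cache_length plus seq_length):

from torch.export import Dim

# Hand-written approximation of the printed dynamic_shapes (illustration only).
batch = Dim("batch")
seq = Dim("seq_length")
total = Dim("total_length")  # stands for DYN(cache+seq)
cache = Dim("cache_length")

manual_dynamic_shapes = {
    "input_ids": {0: batch, 1: seq},
    "attention_mask": {0: batch, 1: total},
    "position_ids": {0: batch, 1: total},
    # one inner list per cache component (key_cache, value_cache), one dict per layer
    "past_key_values": [[{0: batch, 2: cache}], [{0: batch, 2: cache}]],
}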

Let’s check that they all work and compute the expected values. We use deepcopy because caches are usually modified in place.

expected = {}
for k, v in input_sets.items():
    expected[k] = model(**torch_deepcopy(v))
    print(f"{k:20}: {string_type(expected[k], with_shape=True)}")
inputs              : CausalLMOutputWithPast(logits:T1s2x3x32000,past_key_values:DynamicCache(key_cache=#1[T1s2x1x33x96], value_cache=#1[T1s2x1x33x96]))
inputs2             : CausalLMOutputWithPast(logits:T1s3x4x32000,past_key_values:DynamicCache(key_cache=#1[T1s3x1x35x96], value_cache=#1[T1s3x1x35x96]))
inputs_empty_cache  : CausalLMOutputWithPast(logits:T1s2x3x32000,past_key_values:DynamicCache(key_cache=#1[T1s2x1x3x96], value_cache=#1[T1s2x1x3x96]))
inputs_batch1       : CausalLMOutputWithPast(logits:T1s1x3x32000,past_key_values:DynamicCache(key_cache=#1[T1s1x1x33x96], value_cache=#1[T1s1x1x33x96]))
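
The validation step further down compares these structured outputs to the flat list of tensors returned by onnxruntime. flatten_object with drop_keys=True performs that flattening; a quick illustration on the first set (the commented line shows roughly what gets printed):

flat_expected = flatten_object(expected["inputs"], drop_keys=True)
print(string_type(flat_expected, with_shape=True))
# roughly: #3[T1s2x3x32000,T1s2x1x33x96,T1s2x1x33x96] (logits, key_cache, value_cache)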

Export with options

We try to export with the following options: cache (register the serialization functions needed for the cache), cache_patch (apply the patches to torch, transformers, or both), strict (strict export), oblivious (backed_size_oblivious), and rt (prefer_deferred_runtime_asserts_over_guards), combined with every available input set.

Some helper functions first.

def export_model(
    model,
    dynamic_shapes,
    inputs,
    cache=False,
    oblivious=False,
    rt=False,
    cache_patch=False,
    strict=False,
):
    if cache and not cache_patch:
        # Only register the serialization functions needed to export a cache,
        # no patch is applied.
        with register_additional_serialization_functions(patch_transformers=True):
            return export_model(
                model, dynamic_shapes, inputs, oblivious=oblivious, rt=rt, strict=strict
            )
    if cache_patch:
        # Apply the patches to torch, transformers or both,
        # they also register the serialization functions for the caches.
        with torch_export_patches(
            patch_torch=cache_patch in ("all", "torch", True, 1),
            patch_transformers=cache_patch in ("all", "transformers", True, 1),
        ):
            return export_model(
                model, dynamic_shapes, inputs, oblivious=oblivious, rt=rt, strict=strict
            )
    return to_onnx(
        model,
        (),
        kwargs=inputs,
        dynamic_shapes=dynamic_shapes,
        export_options=ExportOptions(
            prefer_deferred_runtime_asserts_over_guards=rt,
            backed_size_oblivious=oblivious,
            strict=strict,
        ),
    )


def try_export_model(
    model,
    dynamic_shapes,
    inputs,
    cache=False,
    oblivious=False,
    rt=False,
    cache_patch=False,
    strict=False,
):
    try:
        return export_model(
            model,
            dynamic_shapes,
            inputs,
            cache=cache,
            oblivious=oblivious,
            rt=rt,
            cache_patch=cache_patch,
            strict=strict,
        )
    except Exception as e:
        return e


def validation(onx, input_sets, expected, catch_exception=True):
    sess = onnxruntime.InferenceSession(
        onx.SerializeToString(), providers=["CPUExecutionProvider"]
    )
    for k, v in input_sets.items():
        feeds = make_feeds(sess, torch_deepcopy(v), use_numpy=True)
        try:
            got = sess.run(None, feeds)
        except Exception as e:
            if not catch_exception:
                raise
            yield k, e
            continue
        yield k, max_diff(flatten_object(expected[k], drop_keys=True), got)
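
Stripped of the options handling, export_model boils down to a call like the following one for a single configuration (here the equivalent of cache_patch="transformers" with every other option left to False):

with torch_export_patches(patch_transformers=True):
    onx = to_onnx(
        model,
        (),
        kwargs=torch_deepcopy(input_sets["inputs"]),
        dynamic_shapes=dynamic_shapes,
        export_options=ExportOptions(
            prefer_deferred_runtime_asserts_over_guards=False,
            backed_size_oblivious=False,
            strict=False,
        ),
    )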

Verification with an example known to be working.

ep = export_model(
    model,
    dynamic_shapes,
    torch_deepcopy(input_sets["inputs"]),
    cache_patch=True,
)
res = list(validation(ep, dict(inputs=input_sets["inputs"]), expected, catch_exception=False))
assert res[0][1]["abs"] < 1e-5, f"Unexpected issue with res={res}"
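
One way to see which dimensions the exporter kept dynamic is to look at the input shapes stored in the ONNX graph itself; symbolic dimensions appear as dim_param. A short sketch (ep is the ModelProto returned by to_onnx):

for inp in ep.graph.input:
    # dim_param holds the symbolic name, dim_value the fixed size
    dims = [d.dim_param or d.dim_value for d in inp.type.tensor_type.shape.dim]
    print(inp.name, dims)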

The main loop

results = []

possibilities = [
    [0, 1],  # cache
    [0, "all", "torch", "transformers"],  # cache_patch
    [0, 1],  # strict
    [0, 1, "auto", "half"],  # oblivious
    [0, 1],  # rt
    list(input_sets),  # inputs used to export
]
with tqdm(list(itertools.product(*possibilities))) as pbar:
    for cache, cache_patch, strict, oblivious, rt, inputs in pbar:
        if cache_patch and not cache:
            # patches include caches.
            continue
        kwargs = dict(
            cache=cache, cache_patch=cache_patch, oblivious=oblivious, rt=rt, strict=strict
        )
        legend = "-".join(
            (k if isinstance(v, int) else f"{k}:{v}") for k, v in kwargs.items() if v
        )
        legend = f"{legend}/{inputs}"
        pbar.set_description(f"{legend} EXPORT")

        # export
        ep = try_export_model(
            model, dynamic_shapes, torch_deepcopy(input_sets[inputs]), **kwargs
        )
        if isinstance(ep, Exception):
            obs = {
                **kwargs,
                "export_with": inputs,
                "EXPORT": 0,
                "ERR-EXPORT": str(ep).split("\n")[0],
            }
            results.append(obs)
            continue

        pbar.set_description(f"{legend} VALIDATE")
        common = {**kwargs, "export_with": inputs, "EXPORT": 1}
        for inp, res in validation(ep, input_sets, expected):
            if isinstance(res, Exception):
                obs = {
                    **common,
                    "run_with": inp,
                    "ERR-RUN": str(res).split("\n")[0],
                    "WORKS": 0,
                }
            else:
                obs = {
                    **common,
                    "run_with": inp,
                    "WORKS": int(~np.isnan(res["abs"]) and res["abs"] < 1e-3),
                }
            results.append(obs)
  0%|          | 0/512 [00:00<?, ?it/s]
/inputs EXPORT:   0%|          | 0/512 [00:00<?, ?it/s]
rt/inputs EXPORT:   1%|          | 4/512 [00:01<02:08,  3.96it/s]
oblivious/inputs EXPORT:   2%|▏         | 8/512 [00:01<01:55,  4.37it/s]
strict/inputs EXPORT:   6%|▋         | 32/512 [00:07<01:57,  4.10it/s]
cache/inputs EXPORT:  12%|█▎        | 64/512 [00:20<03:54,  1.91it/s]
cache-cache_patch:all/inputs EXPORT:  62%|██████▎   | 320/512 [00:42<00:39,  4.83it/s]
cache-cache_patch:all/inputs VALIDATE:  62%|██████▎   | 320/512 [00:44<00:39,  4.83it/s]
cache-cache_patch:all-strict/inputs EXPORT:  69%|██████▉   | 352/512 [01:32<04:11,  1.57s/it]
~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/output_graph.py:1824: UserWarning: While exporting, we found certain side effects happened in the model.forward. Here are the list of potential sources you can double check: ["L['kwargs']['past_key_values'].layers[0]"]
  warnings.warn(

... (progress output shortened; the loop goes through all 512 combinations and the same UserWarning repeats for every strict export run under the cache patches) ...

cache-cache_patch:torch/inputs EXPORT:  75%|███████▌  | 384/512 [02:29<04:23,  2.06s/it]


def forward(self, arg0_1: "f32[32000, 192]", arg1_1: "f32[192, 192]", arg2_1: "f32[96, 192]", arg3_1: "f32[96, 192]", arg4_1: "f32[192, 192]", arg5_1: "f32[1024, 192]", arg6_1: "f32[1024, 192]", arg7_1: "f32[192, 1024]", arg8_1: "f32[192]", arg9_1: "f32[192]", arg10_1: "f32[192]", arg11_1: "f32[32000, 192]", arg12_1: "f32[48]", arg13_1: "i64[s72, s70]", arg14_1: "i64[s43, s53]", arg15_1: "i64[s44, s9]", arg16_1: "f32[s23, 1, s31, 96]", arg17_1: "f32[s4, 1, s11, 96]"):
    # No stacktrace found for following nodes
    _tensor_constant0: "f32[0]" = self._tensor_constant0
    lift_fresh_copy: "f32[0]" = torch.ops.aten.lift_fresh_copy.default(_tensor_constant0);  _tensor_constant0 = None
    detach_: "f32[0]" = torch.ops.aten.detach_.default(lift_fresh_copy);  lift_fresh_copy = None
    _tensor_constant1: "f32[0]" = self._tensor_constant1
    lift_fresh_copy_1: "f32[0]" = torch.ops.aten.lift_fresh_copy.default(_tensor_constant1);  _tensor_constant1 = None
    detach__1: "f32[0]" = torch.ops.aten.detach_.default(lift_fresh_copy_1);  lift_fresh_copy_1 = None
    cat: "f32[s23, 1, s31, 96]" = torch.ops.aten.cat.default([detach_, arg16_1], -2);  detach_ = arg16_1 = None
    cat_1: "f32[s4, 1, s11, 96]" = torch.ops.aten.cat.default([detach__1, arg17_1], -2);  detach__1 = arg17_1 = cat_1 = None

     # File: ~/vv/this312/lib/python3.12/site-packages/torch/nn/modules/sparse.py:192 in forward, code: return F.embedding(
    embedding: "f32[s72, s70, 192]" = torch.ops.aten.embedding.default(arg0_1, arg13_1);  arg0_1 = embedding = None

     # File: ~/github/transformers/src/transformers/models/llama/modeling_llama.py:371 in forward, code: past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
    sym_numel_default: "Sym(96*s23*s31)" = torch.ops.aten.sym_numel.default(cat)
    eq: "Sym(False)" = sym_numel_default == 0;  eq = None

     # File: ~/github/transformers/src/transformers/models/llama/modeling_llama.py:373 in forward, code: past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
    sym_size_int: "Sym(s31)" = torch.ops.aten.sym_size.int(cat, 2);  cat = None
    sym_size_int_1: "Sym(s70)" = torch.ops.aten.sym_size.int(arg13_1, 1)
    add: "Sym(s31 + s70)" = sym_size_int + sym_size_int_1;  sym_size_int_1 = None

     # File: ~/github/transformers/src/transformers/models/llama/modeling_llama.py:372 in forward, code: cache_position: torch.Tensor = torch.arange(
    arange: "i64[s70]" = torch.ops.aten.arange.start(sym_size_int, add, device = device(type='cpu'), pin_memory = False);  add = None

     # File: ~/github/transformers/src/transformers/models/llama/modeling_llama.py:379 in forward, code: causal_mask = create_causal_mask(
    to: "b8[s43, s53]" = torch.ops.aten.to.device(arg14_1, device(type='cpu'), torch.bool);  to = None
    eq_1: "Sym(False)" = sym_numel_default == 0;  sym_numel_default = eq_1 = None
    sym_size_int_2: "Sym(s70)" = torch.ops.aten.sym_size.int(arange, 0)
    add_1: "Sym(s31 + s70)" = sym_size_int + sym_size_int_2;  sym_size_int = None
    add_2: "Sym(s31 + s70)" = add_1 + 0
    sym_size_int_3: "Sym(s53)" = torch.ops.aten.sym_size.int(arg14_1, 1);  arg14_1 = None
    sub: "Sym(s31 - s53 + s70)" = add_2 - sym_size_int_3;  add_2 = None
    gt: "Sym(s31 - s53 + s70 > 0)" = sub > 0;  sub = gt = None
    gt_1: "Sym(s53 > s31 + s70)" = sym_size_int_3 > add_1;  sym_size_int_3 = gt_1 = None
    arange_1: "i64[s31 + s70]" = torch.ops.aten.arange.default(add_1, device = device(type='cpu'), pin_memory = False);  add_1 = None
    add_: "i64[s31 + s70]" = torch.ops.aten.add_.Tensor(arange_1, 0)
    sym_size_int_4: "Sym(s72)" = torch.ops.aten.sym_size.int(arg13_1, 0);  arg13_1 = None
    arange_2: "i64[s72]" = torch.ops.aten.arange.default(sym_size_int_4, device = device(type='cpu'), pin_memory = False);  sym_size_int_4 = None
    arange_3: "i64[1]" = torch.ops.aten.arange.default(1, device = device(type='cpu'), pin_memory = False)
    movedim: "i64[s72]" = torch.ops.aten.movedim.int(arange_2, 0, 0);  movedim = None
    unsqueeze: "i64[1, 1]" = torch.ops.aten.unsqueeze.default(arange_3, 0);  arange_3 = None
    sym_size_int_5: "Sym(s72)" = torch.ops.aten.sym_size.int(arange_2, 0);  arange_2 = None
    expand: "i64[s72, 1]" = torch.ops.aten.expand.default(unsqueeze, [sym_size_int_5, 1]);  unsqueeze = expand = None
    unsqueeze_1: "i64[1, s70]" = torch.ops.aten.unsqueeze.default(arange, 0);  arange = None
    expand_1: "i64[s72, s70]" = torch.ops.aten.expand.default(unsqueeze_1, [sym_size_int_5, sym_size_int_2]);  unsqueeze_1 = sym_size_int_2 = expand_1 = None
    unsqueeze_2: "i64[1, s31 + s70]" = torch.ops.aten.unsqueeze.default(add_, 0);  add_ = None
    sym_size_int_6: "Sym(s31 + s70)" = torch.ops.aten.sym_size.int(arange_1, 0);  arange_1 = None
    expand_2: "i64[s72, s31 + s70]" = torch.ops.aten.expand.default(unsqueeze_2, [sym_size_int_5, sym_size_int_6]);  unsqueeze_2 = sym_size_int_5 = sym_size_int_6 = expand_2 = None




cache-cache_patch:torch/inputs EXPORT:  75%|███████▌  | 385/512 [02:30<03:18,  1.57s/it]
cache-cache_patch:torch/inputs2 EXPORT:  75%|███████▌  | 385/512 [02:30<03:18,  1.57s/it]


cache-cache_patch:torch/inputs2 EXPORT:  75%|███████▌  | 386/512 [02:30<02:30,  1.19s/it]
cache-cache_patch:torch/inputs_empty_cache EXPORT:  75%|███████▌  | 386/512 [02:30<02:30,  1.19s/it]


cache-cache_patch:torch/inputs_empty_cache EXPORT:  76%|███████▌  | 387/512 [02:30<01:56,  1.08it/s]
cache-cache_patch:torch/inputs_batch1 EXPORT:  76%|███████▌  | 387/512 [02:30<01:56,  1.08it/s]


cache-cache_patch:torch/inputs_batch1 EXPORT:  76%|███████▌  | 388/512 [02:31<01:31,  1.35it/s]
cache-cache_patch:torch-rt/inputs EXPORT:  76%|███████▌  | 388/512 [02:31<01:31,  1.35it/s]


cache-cache_patch:torch-rt/inputs EXPORT:  76%|███████▌  | 389/512 [02:31<01:14,  1.66it/s]
cache-cache_patch:torch-rt/inputs2 EXPORT:  76%|███████▌  | 389/512 [02:31<01:14,  1.66it/s]


cache-cache_patch:torch-rt/inputs2 EXPORT:  76%|███████▌  | 390/512 [02:31<01:02,  1.96it/s]
cache-cache_patch:torch-rt/inputs_empty_cache EXPORT:  76%|███████▌  | 390/512 [02:31<01:02,  1.96it/s]


cache-cache_patch:torch-rt/inputs_empty_cache EXPORT:  76%|███████▋  | 391/512 [02:32<00:53,  2.24it/s]
cache-cache_patch:torch-rt/inputs_batch1 EXPORT:  76%|███████▋  | 391/512 [02:32<00:53,  2.24it/s]


cache-cache_patch:torch-rt/inputs_batch1 EXPORT:  77%|███████▋  | 392/512 [02:32<00:47,  2.50it/s]
cache-cache_patch:torch-oblivious/inputs EXPORT:  77%|███████▋  | 392/512 [02:32<00:47,  2.50it/s]


cache-cache_patch:torch-oblivious/inputs EXPORT:  77%|███████▋  | 393/512 [02:32<00:43,  2.74it/s]
cache-cache_patch:torch-oblivious/inputs2 EXPORT:  77%|███████▋  | 393/512 [02:32<00:43,  2.74it/s]


cache-cache_patch:torch-oblivious/inputs2 EXPORT:  77%|███████▋  | 394/512 [02:32<00:40,  2.92it/s]
cache-cache_patch:torch-oblivious/inputs_empty_cache EXPORT:  77%|███████▋  | 394/512 [02:32<00:40,  2.92it/s]


cache-cache_patch:torch-oblivious/inputs_empty_cache EXPORT:  77%|███████▋  | 395/512 [02:33<00:38,  3.00it/s]
cache-cache_patch:torch-oblivious/inputs_batch1 EXPORT:  77%|███████▋  | 395/512 [02:33<00:38,  3.00it/s]


cache-cache_patch:torch-oblivious/inputs_batch1 EXPORT:  77%|███████▋  | 396/512 [02:33<00:37,  3.09it/s]
cache-cache_patch:torch-oblivious-rt/inputs EXPORT:  77%|███████▋  | 396/512 [02:33<00:37,  3.09it/s]
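
Each progress-bar label encodes the configuration being exported: the hyphen-separated prefix lists the options of export_model that are enabled (for instance cache, cache_patch:torch, oblivious, rt), and the part after the slash names the input set (inputs, inputs2, inputs_empty_cache or inputs_batch1) used for that attempt.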


cache-cache_patch:torch-oblivious-rt/inputs EXPORT:  78%|███████▊  | 397/512 [02:33<00:35,  3.21it/s]
cache-cache_patch:torch-oblivious-rt/inputs2 EXPORT:  78%|███████▊  | 397/512 [02:33<00:35,  3.21it/s]


cache-cache_patch:torch-oblivious-rt/inputs2 EXPORT:  78%|███████▊  | 398/512 [02:36<02:05,  1.10s/it]
cache-cache_patch:torch-oblivious-rt/inputs_empty_cache EXPORT:  78%|███████▊  | 398/512 [02:36<02:05,  1.10s/it]


cache-cache_patch:torch-oblivious-rt/inputs_empty_cache EXPORT:  78%|███████▊  | 399/512 [02:36<01:36,  1.17it/s]
cache-cache_patch:torch-oblivious-rt/inputs_batch1 EXPORT:  78%|███████▊  | 399/512 [02:36<01:36,  1.17it/s]


cache-cache_patch:torch-oblivious-rt/inputs_batch1 EXPORT:  78%|███████▊  | 400/512 [02:37<01:17,  1.44it/s]
cache-cache_patch:torch-oblivious:auto/inputs EXPORT:  78%|███████▊  | 400/512 [02:37<01:17,  1.44it/s]


cache-cache_patch:torch-oblivious:auto/inputs EXPORT:  78%|███████▊  | 401/512 [02:37<01:03,  1.74it/s]
cache-cache_patch:torch-oblivious:auto/inputs2 EXPORT:  78%|███████▊  | 401/512 [02:37<01:03,  1.74it/s]


[... the same graph is printed again for each of the following export attempts; the repeated, identical dumps are omitted ...]


cache-cache_patch:torch-oblivious:auto/inputs2 EXPORT:  79%|███████▊  | 402/512 [02:37<00:54,  2.01it/s]
cache-cache_patch:torch-oblivious:auto/inputs_empty_cache EXPORT:  79%|███████▊  | 402/512 [02:37<00:54,  2.01it/s]


cache-cache_patch:torch-oblivious:auto/inputs_empty_cache EXPORT:  79%|███████▊  | 403/512 [02:38<00:48,  2.23it/s]
cache-cache_patch:torch-oblivious:auto/inputs_batch1 EXPORT:  79%|███████▊  | 403/512 [02:38<00:48,  2.23it/s]


cache-cache_patch:torch-oblivious:auto/inputs_batch1 EXPORT:  79%|███████▉  | 404/512 [02:38<00:43,  2.47it/s]
cache-cache_patch:torch-oblivious:auto-rt/inputs EXPORT:  79%|███████▉  | 404/512 [02:38<00:43,  2.47it/s]


cache-cache_patch:torch-oblivious:auto-rt/inputs EXPORT:  79%|███████▉  | 405/512 [02:38<00:40,  2.64it/s]
cache-cache_patch:torch-oblivious:auto-rt/inputs2 EXPORT:  79%|███████▉  | 405/512 [02:38<00:40,  2.64it/s]


cache-cache_patch:torch-oblivious:auto-rt/inputs2 EXPORT:  79%|███████▉  | 406/512 [02:39<00:37,  2.84it/s]
cache-cache_patch:torch-oblivious:auto-rt/inputs_empty_cache EXPORT:  79%|███████▉  | 406/512 [02:39<00:37,  2.84it/s]


cache-cache_patch:torch-oblivious:auto-rt/inputs_empty_cache EXPORT:  79%|███████▉  | 407/512 [02:39<00:34,  3.03it/s]
cache-cache_patch:torch-oblivious:auto-rt/inputs_batch1 EXPORT:  79%|███████▉  | 407/512 [02:39<00:34,  3.03it/s]


cache-cache_patch:torch-oblivious:auto-rt/inputs_batch1 EXPORT:  80%|███████▉  | 408/512 [02:39<00:32,  3.18it/s]
cache-cache_patch:torch-oblivious:half/inputs EXPORT:  80%|███████▉  | 408/512 [02:39<00:32,  3.18it/s]


cache-cache_patch:torch-oblivious:half/inputs EXPORT:  80%|███████▉  | 409/512 [02:40<00:32,  3.17it/s]
cache-cache_patch:torch-oblivious:half/inputs2 EXPORT:  80%|███████▉  | 409/512 [02:40<00:32,  3.17it/s]


cache-cache_patch:torch-oblivious:half/inputs2 EXPORT:  80%|████████  | 410/512 [02:40<00:32,  3.13it/s]
cache-cache_patch:torch-oblivious:half/inputs_empty_cache EXPORT:  80%|████████  | 410/512 [02:40<00:32,  3.13it/s]


cache-cache_patch:torch-oblivious:half/inputs_empty_cache EXPORT:  80%|████████  | 411/512 [02:40<00:32,  3.07it/s]
cache-cache_patch:torch-oblivious:half/inputs_batch1 EXPORT:  80%|████████  | 411/512 [02:40<00:32,  3.07it/s]


cache-cache_patch:torch-oblivious:half/inputs_batch1 EXPORT:  80%|████████  | 412/512 [02:41<00:32,  3.07it/s]
cache-cache_patch:torch-oblivious:half-rt/inputs EXPORT:  80%|████████  | 412/512 [02:41<00:32,  3.07it/s]


cache-cache_patch:torch-oblivious:half-rt/inputs EXPORT:  81%|████████  | 413/512 [02:41<00:31,  3.10it/s]
cache-cache_patch:torch-oblivious:half-rt/inputs2 EXPORT:  81%|████████  | 413/512 [02:41<00:31,  3.10it/s]


cache-cache_patch:torch-oblivious:half-rt/inputs2 EXPORT:  81%|████████  | 414/512 [02:41<00:30,  3.18it/s]
cache-cache_patch:torch-oblivious:half-rt/inputs_empty_cache EXPORT:  81%|████████  | 414/512 [02:41<00:30,  3.18it/s]


cache-cache_patch:torch-oblivious:half-rt/inputs_empty_cache EXPORT:  81%|████████  | 415/512 [02:41<00:30,  3.15it/s]
cache-cache_patch:torch-oblivious:half-rt/inputs_batch1 EXPORT:  81%|████████  | 415/512 [02:41<00:30,  3.15it/s]


cache-cache_patch:torch-oblivious:half-rt/inputs_batch1 EXPORT:  81%|████████▏ | 416/512 [02:42<00:29,  3.23it/s]
cache-cache_patch:torch-strict/inputs EXPORT:  81%|████████▏ | 416/512 [02:42<00:29,  3.23it/s]
cache-cache_patch:torch-strict/inputs2 EXPORT:  81%|████████▏ | 416/512 [02:40<00:29,  3.23it/s]
cache-cache_patch:torch-strict/inputs_empty_cache EXPORT:  81%|████████▏ | 416/512 [02:41<00:29,  3.23it/s]
cache-cache_patch:torch-strict/inputs_batch1 EXPORT:  81%|████████▏ | 416/512 [02:41<00:29,  3.23it/s]
cache-cache_patch:torch-rt-strict/inputs EXPORT:  81%|████████▏ | 416/512 [02:41<00:29,  3.23it/s]
cache-cache_patch:torch-rt-strict/inputs2 EXPORT:  81%|████████▏ | 416/512 [02:41<00:29,  3.23it/s]
cache-cache_patch:torch-rt-strict/inputs_empty_cache EXPORT:  81%|████████▏ | 416/512 [02:41<00:29,  3.23it/s]
cache-cache_patch:torch-rt-strict/inputs_batch1 EXPORT:  81%|████████▏ | 416/512 [02:41<00:29,  3.23it/s]
cache-cache_patch:torch-oblivious-strict/inputs EXPORT:  81%|████████▏ | 416/512 [02:42<00:29,  3.23it/s]
cache-cache_patch:torch-oblivious-strict/inputs2 EXPORT:  81%|████████▏ | 416/512 [02:42<00:29,  3.23it/s]
cache-cache_patch:torch-oblivious-strict/inputs2 EXPORT:  83%|████████▎ | 426/512 [02:42<00:05, 14.67it/s]
cache-cache_patch:torch-oblivious-strict/inputs_empty_cache EXPORT:  83%|████████▎ | 426/512 [02:42<00:05, 14.67it/s]
cache-cache_patch:torch-oblivious-strict/inputs_batch1 EXPORT:  83%|████████▎ | 426/512 [02:42<00:05, 14.67it/s]
cache-cache_patch:torch-oblivious-rt-strict/inputs EXPORT:  83%|████████▎ | 426/512 [02:42<00:05, 14.67it/s]
cache-cache_patch:torch-oblivious-rt-strict/inputs EXPORT:  84%|████████▍ | 429/512 [02:42<00:07, 11.35it/s]
cache-cache_patch:torch-oblivious-rt-strict/inputs2 EXPORT:  84%|████████▍ | 429/512 [02:42<00:07, 11.35it/s]
cache-cache_patch:torch-oblivious-rt-strict/inputs_empty_cache EXPORT:  84%|████████▍ | 429/512 [02:42<00:07, 11.35it/s]
cache-cache_patch:torch-oblivious-rt-strict/inputs_batch1 EXPORT:  84%|████████▍ | 429/512 [02:43<00:07, 11.35it/s]
cache-cache_patch:torch-oblivious-rt-strict/inputs_batch1 EXPORT:  84%|████████▍ | 432/512 [02:43<00:08,  9.64it/s]
cache-cache_patch:torch-oblivious:auto-strict/inputs EXPORT:  84%|████████▍ | 432/512 [02:43<00:08,  9.64it/s]
cache-cache_patch:torch-oblivious:auto-strict/inputs2 EXPORT:  84%|████████▍ | 432/512 [02:43<00:08,  9.64it/s]
cache-cache_patch:torch-oblivious:auto-strict/inputs2 EXPORT:  85%|████████▍ | 434/512 [02:43<00:08,  8.68it/s]
cache-cache_patch:torch-oblivious:auto-strict/inputs_empty_cache EXPORT:  85%|████████▍ | 434/512 [02:43<00:08,  8.68it/s]
cache-cache_patch:torch-oblivious:auto-strict/inputs_batch1 EXPORT:  85%|████████▍ | 434/512 [02:43<00:08,  8.68it/s]
cache-cache_patch:torch-oblivious:auto-strict/inputs_batch1 EXPORT:  85%|████████▌ | 436/512 [02:43<00:09,  7.85it/s]
cache-cache_patch:torch-oblivious:auto-rt-strict/inputs EXPORT:  85%|████████▌ | 436/512 [02:43<00:09,  7.85it/s]
cache-cache_patch:torch-oblivious:auto-rt-strict/inputs2 EXPORT:  85%|████████▌ | 436/512 [02:44<00:09,  7.85it/s]
cache-cache_patch:torch-oblivious:auto-rt-strict/inputs2 EXPORT:  86%|████████▌ | 438/512 [02:44<00:10,  7.38it/s]
cache-cache_patch:torch-oblivious:auto-rt-strict/inputs_empty_cache EXPORT:  86%|████████▌ | 438/512 [02:44<00:10,  7.38it/s]
cache-cache_patch:torch-oblivious:auto-rt-strict/inputs_empty_cache EXPORT:  86%|████████▌ | 439/512 [02:44<00:10,  7.22it/s]
cache-cache_patch:torch-oblivious:auto-rt-strict/inputs_batch1 EXPORT:  86%|████████▌ | 439/512 [02:44<00:10,  7.22it/s]
cache-cache_patch:torch-oblivious:auto-rt-strict/inputs_batch1 EXPORT:  86%|████████▌ | 440/512 [02:44<00:10,  6.98it/s]
cache-cache_patch:torch-oblivious:half-strict/inputs EXPORT:  86%|████████▌ | 440/512 [02:44<00:10,  6.98it/s]
cache-cache_patch:torch-oblivious:half-strict/inputs EXPORT:  86%|████████▌ | 441/512 [02:44<00:11,  6.14it/s]
cache-cache_patch:torch-oblivious:half-strict/inputs2 EXPORT:  86%|████████▌ | 441/512 [02:44<00:11,  6.14it/s]
cache-cache_patch:torch-oblivious:half-strict/inputs2 EXPORT:  86%|████████▋ | 442/512 [02:45<00:12,  5.80it/s]
cache-cache_patch:torch-oblivious:half-strict/inputs_empty_cache EXPORT:  86%|████████▋ | 442/512 [02:45<00:12,  5.80it/s]
cache-cache_patch:torch-oblivious:half-strict/inputs_empty_cache EXPORT:  87%|████████▋ | 443/512 [02:45<00:15,  4.45it/s]
cache-cache_patch:torch-oblivious:half-strict/inputs_batch1 EXPORT:  87%|████████▋ | 443/512 [02:45<00:15,  4.45it/s]
cache-cache_patch:torch-oblivious:half-strict/inputs_batch1 EXPORT:  87%|████████▋ | 444/512 [02:45<00:16,  4.13it/s]
cache-cache_patch:torch-oblivious:half-rt-strict/inputs EXPORT:  87%|████████▋ | 444/512 [02:45<00:16,  4.13it/s]
cache-cache_patch:torch-oblivious:half-rt-strict/inputs EXPORT:  87%|████████▋ | 445/512 [02:45<00:17,  3.92it/s]
cache-cache_patch:torch-oblivious:half-rt-strict/inputs2 EXPORT:  87%|████████▋ | 445/512 [02:45<00:17,  3.92it/s]
cache-cache_patch:torch-oblivious:half-rt-strict/inputs2 EXPORT:  87%|████████▋ | 446/512 [02:46<00:16,  3.98it/s]
cache-cache_patch:torch-oblivious:half-rt-strict/inputs_empty_cache EXPORT:  87%|████████▋ | 446/512 [02:46<00:16,  3.98it/s]
cache-cache_patch:torch-oblivious:half-rt-strict/inputs_empty_cache EXPORT:  87%|████████▋ | 447/512 [02:46<00:17,  3.70it/s]
cache-cache_patch:torch-oblivious:half-rt-strict/inputs_batch1 EXPORT:  87%|████████▋ | 447/512 [02:46<00:17,  3.70it/s]
cache-cache_patch:torch-oblivious:half-rt-strict/inputs_batch1 EXPORT:  88%|████████▊ | 448/512 [02:46<00:16,  3.82it/s]
cache-cache_patch:transformers/inputs EXPORT:  88%|████████▊ | 448/512 [02:46<00:16,  3.82it/s]
cache-cache_patch:transformers/inputs VALIDATE:  88%|████████▊ | 448/512 [02:48<00:16,  3.82it/s]
cache-cache_patch:transformers/inputs VALIDATE:  88%|████████▊ | 449/512 [02:48<00:49,  1.28it/s]
cache-cache_patch:transformers/inputs2 EXPORT:  88%|████████▊ | 449/512 [02:48<00:49,  1.28it/s]
cache-cache_patch:transformers/inputs2 VALIDATE:  88%|████████▊ | 449/512 [02:50<00:49,  1.28it/s]
cache-cache_patch:transformers/inputs2 VALIDATE:  88%|████████▊ | 450/512 [02:50<01:03,  1.02s/it]
cache-cache_patch:transformers/inputs_empty_cache EXPORT:  88%|████████▊ | 450/512 [02:50<01:03,  1.02s/it]
cache-cache_patch:transformers/inputs_empty_cache VALIDATE:  88%|████████▊ | 450/512 [02:51<01:03,  1.02s/it]
cache-cache_patch:transformers/inputs_empty_cache VALIDATE:  88%|████████▊ | 451/512 [02:51<01:11,  1.17s/it]
cache-cache_patch:transformers/inputs_batch1 EXPORT:  88%|████████▊ | 451/512 [02:51<01:11,  1.17s/it]
cache-cache_patch:transformers/inputs_batch1 EXPORT:  88%|████████▊ | 452/512 [02:52<01:07,  1.12s/it]
cache-cache_patch:transformers-rt/inputs EXPORT:  88%|████████▊ | 452/512 [02:52<01:07,  1.12s/it]
cache-cache_patch:transformers-rt/inputs VALIDATE:  88%|████████▊ | 452/512 [02:54<01:07,  1.12s/it]
cache-cache_patch:transformers-rt/inputs VALIDATE:  88%|████████▊ | 453/512 [02:54<01:14,  1.26s/it]
cache-cache_patch:transformers-rt/inputs2 EXPORT:  88%|████████▊ | 453/512 [02:54<01:14,  1.26s/it]
cache-cache_patch:transformers-rt/inputs2 VALIDATE:  88%|████████▊ | 453/512 [02:55<01:14,  1.26s/it]
cache-cache_patch:transformers-rt/inputs2 VALIDATE:  89%|████████▊ | 454/512 [02:56<01:16,  1.33s/it]
cache-cache_patch:transformers-rt/inputs_empty_cache EXPORT:  89%|████████▊ | 454/512 [02:56<01:16,  1.33s/it]
cache-cache_patch:transformers-rt/inputs_empty_cache VALIDATE:  89%|████████▊ | 454/512 [02:57<01:16,  1.33s/it]
cache-cache_patch:transformers-rt/inputs_empty_cache VALIDATE:  89%|████████▉ | 455/512 [02:57<01:20,  1.41s/it]
cache-cache_patch:transformers-rt/inputs_batch1 EXPORT:  89%|████████▉ | 455/512 [02:57<01:20,  1.41s/it]
cache-cache_patch:transformers-rt/inputs_batch1 EXPORT:  89%|████████▉ | 456/512 [02:58<01:12,  1.29s/it]
cache-cache_patch:transformers-oblivious/inputs EXPORT:  89%|████████▉ | 456/512 [02:58<01:12,  1.29s/it]
cache-cache_patch:transformers-oblivious/inputs VALIDATE:  89%|████████▉ | 456/512 [02:59<01:12,  1.29s/it]
cache-cache_patch:transformers-oblivious/inputs VALIDATE:  89%|████████▉ | 457/512 [03:00<01:14,  1.36s/it]
cache-cache_patch:transformers-oblivious/inputs2 EXPORT:  89%|████████▉ | 457/512 [03:00<01:14,  1.36s/it]
cache-cache_patch:transformers-oblivious/inputs2 VALIDATE:  89%|████████▉ | 457/512 [03:01<01:14,  1.36s/it]
cache-cache_patch:transformers-oblivious/inputs2 VALIDATE:  89%|████████▉ | 458/512 [03:01<01:16,  1.42s/it]
cache-cache_patch:transformers-oblivious/inputs_empty_cache EXPORT:  89%|████████▉ | 458/512 [03:01<01:16,  1.42s/it]
cache-cache_patch:transformers-oblivious/inputs_empty_cache VALIDATE:  89%|████████▉ | 458/512 [03:02<01:16,  1.42s/it]
cache-cache_patch:transformers-oblivious/inputs_empty_cache VALIDATE:  90%|████████▉ | 459/512 [03:03<01:15,  1.43s/it]
cache-cache_patch:transformers-oblivious/inputs_batch1 EXPORT:  90%|████████▉ | 459/512 [03:03<01:15,  1.43s/it]
cache-cache_patch:transformers-oblivious/inputs_batch1 VALIDATE:  90%|████████▉ | 459/512 [03:04<01:15,  1.43s/it]
cache-cache_patch:transformers-oblivious/inputs_batch1 VALIDATE:  90%|████████▉ | 460/512 [03:04<01:15,  1.45s/it]
cache-cache_patch:transformers-oblivious-rt/inputs EXPORT:  90%|████████▉ | 460/512 [03:04<01:15,  1.45s/it]
cache-cache_patch:transformers-oblivious-rt/inputs VALIDATE:  90%|████████▉ | 460/512 [03:05<01:15,  1.45s/it]
cache-cache_patch:transformers-oblivious-rt/inputs VALIDATE:  90%|█████████ | 461/512 [03:06<01:13,  1.45s/it]
cache-cache_patch:transformers-oblivious-rt/inputs2 EXPORT:  90%|█████████ | 461/512 [03:06<01:13,  1.45s/it]
cache-cache_patch:transformers-oblivious-rt/inputs2 VALIDATE:  90%|█████████ | 461/512 [03:07<01:13,  1.45s/it]
cache-cache_patch:transformers-oblivious-rt/inputs2 VALIDATE:  90%|█████████ | 462/512 [03:07<01:13,  1.47s/it]
cache-cache_patch:transformers-oblivious-rt/inputs_empty_cache EXPORT:  90%|█████████ | 462/512 [03:07<01:13,  1.47s/it]
cache-cache_patch:transformers-oblivious-rt/inputs_empty_cache VALIDATE:  90%|█████████ | 462/512 [03:08<01:13,  1.47s/it]
cache-cache_patch:transformers-oblivious-rt/inputs_empty_cache VALIDATE:  90%|█████████ | 463/512 [03:09<01:13,  1.49s/it]
cache-cache_patch:transformers-oblivious-rt/inputs_batch1 EXPORT:  90%|█████████ | 463/512 [03:09<01:13,  1.49s/it]
cache-cache_patch:transformers-oblivious-rt/inputs_batch1 VALIDATE:  90%|█████████ | 463/512 [03:10<01:13,  1.49s/it]
cache-cache_patch:transformers-oblivious-rt/inputs_batch1 VALIDATE:  91%|█████████ | 464/512 [03:10<01:10,  1.46s/it]
cache-cache_patch:transformers-oblivious:auto/inputs EXPORT:  91%|█████████ | 464/512 [03:10<01:10,  1.46s/it]
cache-cache_patch:transformers-oblivious:auto/inputs VALIDATE:  91%|█████████ | 464/512 [03:13<01:10,  1.46s/it]
cache-cache_patch:transformers-oblivious:auto/inputs VALIDATE:  91%|█████████ | 465/512 [03:13<01:26,  1.85s/it]
cache-cache_patch:transformers-oblivious:auto/inputs2 EXPORT:  91%|█████████ | 465/512 [03:13<01:26,  1.85s/it]
cache-cache_patch:transformers-oblivious:auto/inputs2 VALIDATE:  91%|█████████ | 465/512 [03:14<01:26,  1.85s/it]
cache-cache_patch:transformers-oblivious:auto/inputs2 VALIDATE:  91%|█████████ | 466/512 [03:14<01:21,  1.76s/it]
cache-cache_patch:transformers-oblivious:auto/inputs_empty_cache EXPORT:  91%|█████████ | 466/512 [03:14<01:21,  1.76s/it]
cache-cache_patch:transformers-oblivious:auto/inputs_empty_cache VALIDATE:  91%|█████████ | 466/512 [03:16<01:21,  1.76s/it]
cache-cache_patch:transformers-oblivious:auto/inputs_empty_cache VALIDATE:  91%|█████████ | 467/512 [03:16<01:16,  1.70s/it]
cache-cache_patch:transformers-oblivious:auto/inputs_batch1 EXPORT:  91%|█████████ | 467/512 [03:16<01:16,  1.70s/it]
cache-cache_patch:transformers-oblivious:auto/inputs_batch1 VALIDATE:  91%|█████████ | 467/512 [03:17<01:16,  1.70s/it]
cache-cache_patch:transformers-oblivious:auto/inputs_batch1 VALIDATE:  91%|█████████▏| 468/512 [03:17<01:13,  1.66s/it]
cache-cache_patch:transformers-oblivious:auto-rt/inputs EXPORT:  91%|█████████▏| 468/512 [03:17<01:13,  1.66s/it]
cache-cache_patch:transformers-oblivious:auto-rt/inputs VALIDATE:  91%|█████████▏| 468/512 [03:19<01:13,  1.66s/it]
cache-cache_patch:transformers-oblivious:auto-rt/inputs VALIDATE:  92%|█████████▏| 469/512 [03:19<01:10,  1.64s/it]
cache-cache_patch:transformers-oblivious:auto-rt/inputs2 EXPORT:  92%|█████████▏| 469/512 [03:19<01:10,  1.64s/it]
cache-cache_patch:transformers-oblivious:auto-rt/inputs2 VALIDATE:  92%|█████████▏| 469/512 [03:21<01:10,  1.64s/it]
cache-cache_patch:transformers-oblivious:auto-rt/inputs2 VALIDATE:  92%|█████████▏| 470/512 [03:21<01:09,  1.66s/it]
cache-cache_patch:transformers-oblivious:auto-rt/inputs_empty_cache EXPORT:  92%|█████████▏| 470/512 [03:21<01:09,  1.66s/it]
cache-cache_patch:transformers-oblivious:auto-rt/inputs_empty_cache VALIDATE:  92%|█████████▏| 470/512 [03:22<01:09,  1.66s/it]
cache-cache_patch:transformers-oblivious:auto-rt/inputs_empty_cache VALIDATE:  92%|█████████▏| 471/512 [03:22<01:07,  1.65s/it]
cache-cache_patch:transformers-oblivious:auto-rt/inputs_batch1 EXPORT:  92%|█████████▏| 471/512 [03:22<01:07,  1.65s/it]
cache-cache_patch:transformers-oblivious:auto-rt/inputs_batch1 VALIDATE:  92%|█████████▏| 471/512 [03:24<01:07,  1.65s/it]
cache-cache_patch:transformers-oblivious:auto-rt/inputs_batch1 VALIDATE:  92%|█████████▏| 472/512 [03:24<01:04,  1.62s/it]
cache-cache_patch:transformers-oblivious:half/inputs EXPORT:  92%|█████████▏| 472/512 [03:24<01:04,  1.62s/it]
cache-cache_patch:transformers-oblivious:half/inputs VALIDATE:  92%|█████████▏| 472/512 [03:25<01:04,  1.62s/it]
cache-cache_patch:transformers-oblivious:half/inputs VALIDATE:  92%|█████████▏| 473/512 [03:26<01:05,  1.67s/it]
cache-cache_patch:transformers-oblivious:half/inputs2 EXPORT:  92%|█████████▏| 473/512 [03:26<01:05,  1.67s/it]
cache-cache_patch:transformers-oblivious:half/inputs2 VALIDATE:  92%|█████████▏| 473/512 [03:27<01:05,  1.67s/it]
cache-cache_patch:transformers-oblivious:half/inputs2 VALIDATE:  93%|█████████▎| 474/512 [03:27<01:02,  1.64s/it]
cache-cache_patch:transformers-oblivious:half/inputs_empty_cache EXPORT:  93%|█████████▎| 474/512 [03:27<01:02,  1.64s/it]
cache-cache_patch:transformers-oblivious:half/inputs_empty_cache VALIDATE:  93%|█████████▎| 474/512 [03:29<01:02,  1.64s/it]
cache-cache_patch:transformers-oblivious:half/inputs_empty_cache VALIDATE:  93%|█████████▎| 475/512 [03:29<00:59,  1.61s/it]
cache-cache_patch:transformers-oblivious:half/inputs_batch1 EXPORT:  93%|█████████▎| 475/512 [03:29<00:59,  1.61s/it]
cache-cache_patch:transformers-oblivious:half/inputs_batch1 VALIDATE:  93%|█████████▎| 475/512 [03:30<00:59,  1.61s/it]
cache-cache_patch:transformers-oblivious:half/inputs_batch1 VALIDATE:  93%|█████████▎| 476/512 [03:30<00:56,  1.57s/it]
cache-cache_patch:transformers-oblivious:half-rt/inputs EXPORT:  93%|█████████▎| 476/512 [03:30<00:56,  1.57s/it]
cache-cache_patch:transformers-oblivious:half-rt/inputs VALIDATE:  93%|█████████▎| 476/512 [03:32<00:56,  1.57s/it]
cache-cache_patch:transformers-oblivious:half-rt/inputs VALIDATE:  93%|█████████▎| 477/512 [03:32<00:54,  1.56s/it]
cache-cache_patch:transformers-oblivious:half-rt/inputs2 EXPORT:  93%|█████████▎| 477/512 [03:32<00:54,  1.56s/it]
cache-cache_patch:transformers-oblivious:half-rt/inputs2 VALIDATE:  93%|█████████▎| 477/512 [03:33<00:54,  1.56s/it]
cache-cache_patch:transformers-oblivious:half-rt/inputs2 VALIDATE:  93%|█████████▎| 478/512 [03:33<00:53,  1.56s/it]
cache-cache_patch:transformers-oblivious:half-rt/inputs_empty_cache EXPORT:  93%|█████████▎| 478/512 [03:33<00:53,  1.56s/it]
cache-cache_patch:transformers-oblivious:half-rt/inputs_empty_cache VALIDATE:  93%|█████████▎| 478/512 [03:35<00:53,  1.56s/it]
cache-cache_patch:transformers-oblivious:half-rt/inputs_empty_cache VALIDATE:  94%|█████████▎| 479/512 [03:35<00:51,  1.57s/it]
cache-cache_patch:transformers-oblivious:half-rt/inputs_batch1 EXPORT:  94%|█████████▎| 479/512 [03:35<00:51,  1.57s/it]
cache-cache_patch:transformers-oblivious:half-rt/inputs_batch1 VALIDATE:  94%|█████████▎| 479/512 [03:36<00:51,  1.57s/it]
cache-cache_patch:transformers-oblivious:half-rt/inputs_batch1 VALIDATE:  94%|█████████▍| 480/512 [03:37<00:49,  1.54s/it]
cache-cache_patch:transformers-strict/inputs EXPORT:  94%|█████████▍| 480/512 [03:37<00:49,  1.54s/it]                    ~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/output_graph.py:1824: UserWarning: While exporting, we found certain side effects happened in the model.forward. Here are the list of potential sources you can double check: ["L['kwargs']['past_key_values'].layers[0]"]
  warnings.warn(

cache-cache_patch:transformers-strict/inputs EXPORT:  94%|█████████▍| 481/512 [03:38<00:43,  1.42s/it]
cache-cache_patch:transformers-strict/inputs2 EXPORT:  94%|█████████▍| 481/512 [03:38<00:43,  1.42s/it]~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/output_graph.py:1824: UserWarning: While exporting, we found certain side effects happened in the model.forward. Here are the list of potential sources you can double check: ["L['kwargs']['past_key_values'].layers[0]"]
  warnings.warn(

cache-cache_patch:transformers-strict/inputs2 EXPORT:  94%|█████████▍| 482/512 [03:39<00:40,  1.36s/it]
cache-cache_patch:transformers-strict/inputs_empty_cache EXPORT:  94%|█████████▍| 482/512 [03:39<00:40,  1.36s/it]~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/output_graph.py:1824: UserWarning: While exporting, we found certain side effects happened in the model.forward. Here are the list of potential sources you can double check: ["L['kwargs']['past_key_values'].layers[0]"]
  warnings.warn(

cache-cache_patch:transformers-strict/inputs_empty_cache EXPORT:  94%|█████████▍| 483/512 [03:40<00:37,  1.29s/it]
cache-cache_patch:transformers-strict/inputs_batch1 EXPORT:  94%|█████████▍| 483/512 [03:40<00:37,  1.29s/it]     ~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/output_graph.py:1824: UserWarning: While exporting, we found certain side effects happened in the model.forward. Here are the list of potential sources you can double check: ["L['kwargs']['past_key_values'].layers[0]"]
  warnings.warn(

cache-cache_patch:transformers-strict/inputs_batch1 EXPORT:  95%|█████████▍| 484/512 [03:42<00:41,  1.47s/it]
~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/output_graph.py:1824: UserWarning: While exporting, we found certain side effects happened in the model.forward. Here are the list of potential sources you can double check: ["L['kwargs']['past_key_values'].layers[0]"]
  warnings.warn(

...

cache-cache_patch:transformers-oblivious:half-rt-strict/inputs_batch1 EXPORT: 100%|██████████| 512/512 [04:33<00:00,  2.04s/it]
cache-cache_patch:transformers-oblivious:half-rt-strict/inputs_batch1 EXPORT: 100%|██████████| 512/512 [04:33<00:00,  1.87it/s]

Let’s save the results.

df = pandas.DataFrame(results)
df.to_excel("plot_export_tiny_llm_dim01_onnx.xlsx")
df
     cache   cache_patch oblivious  rt  strict         export_with  EXPORT                                          ERR-EXPORT run_with  WORKS  ERR-RUN
0        0             0         0   0       0              inputs       0  8*s72 (133002450292032)is not tracked with pro...      NaN    NaN      NaN
1        0             0         0   0       0             inputs2       0  8*s72 (133002459943072)is not tracked with pro...      NaN    NaN      NaN
2        0             0         0   0       0  inputs_empty_cache       0  8*s72 (133002449100032)is not tracked with pro...      NaN    NaN      NaN
3        0             0         0   0       0       inputs_batch1       0  8*s31 + 8*s70 (133002489454112)is not tracked ...      NaN    NaN      NaN
4        0             0         0   1       0              inputs       0  8*s72 (133002453136352)is not tracked with pro...      NaN    NaN      NaN
..     ...           ...       ...  ..     ...                 ...     ...                                                 ...      ...    ...      ...
501      1  transformers      half   0       1       inputs_batch1       0  Found the following conflicts between user-spe...      NaN    NaN      NaN
502      1  transformers      half   1       1              inputs       0  Found the following conflicts between user-spe...      NaN    NaN      NaN
503      1  transformers      half   1       1             inputs2       0  Found the following conflicts between user-spe...      NaN    NaN      NaN
504      1  transformers      half   1       1  inputs_empty_cache       0  Found the following conflicts between user-spe...      NaN    NaN      NaN
505      1  transformers      half   1       1       inputs_batch1       0  Found the following conflicts between user-spe...      NaN    NaN      NaN

506 rows × 11 columns
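Before filtering, a quick aggregate view helps to see which option combinations export at all. The next few lines are an extra summary, not part of the original script; they only rely on the column names visible in the table above.

# Fraction of input sets that exported, per combination of export options
# (extra summary, not part of the original script).
export_rate = (
    df.groupby(["cache", "cache_patch", "oblivious", "rt", "strict"])["EXPORT"]
    .mean()
    .sort_values(ascending=False)
)
print(export_rate.head(10))

The next cell keeps only the combinations that failed to export.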



no_export = df[df.EXPORT == 0]
no_export.to_excel("plot_export_tiny_llm_dim01_onnx_custom.no_export.xlsx")
no_export
     cache   cache_patch oblivious  rt  strict         export_with  EXPORT                                          ERR-EXPORT run_with  WORKS  ERR-RUN
0        0             0         0   0       0              inputs       0  8*s72 (133002450292032)is not tracked with pro...      NaN    NaN      NaN
1        0             0         0   0       0             inputs2       0  8*s72 (133002459943072)is not tracked with pro...      NaN    NaN      NaN
2        0             0         0   0       0  inputs_empty_cache       0  8*s72 (133002449100032)is not tracked with pro...      NaN    NaN      NaN
3        0             0         0   0       0       inputs_batch1       0  8*s31 + 8*s70 (133002489454112)is not tracked ...      NaN    NaN      NaN
4        0             0         0   1       0              inputs       0  8*s72 (133002453136352)is not tracked with pro...      NaN    NaN      NaN
..     ...           ...       ...  ..     ...                 ...     ...                                                 ...      ...    ...      ...
501      1  transformers      half   0       1       inputs_batch1       0  Found the following conflicts between user-spe...      NaN    NaN      NaN
502      1  transformers      half   1       1              inputs       0  Found the following conflicts between user-spe...      NaN    NaN      NaN
503      1  transformers      half   1       1             inputs2       0  Found the following conflicts between user-spe...      NaN    NaN      NaN
504      1  transformers      half   1       1  inputs_empty_cache       0  Found the following conflicts between user-spe...      NaN    NaN      NaN
505      1  transformers      half   1       1       inputs_batch1       0  Found the following conflicts between user-spe...      NaN    NaN      NaN

258 rows × 11 columns
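The ERR-EXPORT column is truncated in the display above. A small extra step (not in the original script) groups the failed exports by the beginning of the error message, which is usually enough to tell the failure modes apart.

# Count export failures by the first characters of the error message
# (the full messages are long and contain addresses that change between runs).
err_counts = no_export["ERR-EXPORT"].astype(str).str.slice(0, 50).value_counts()
print(err_counts)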



The validation failures: configurations that exported but fail when the exported model is run.

invalid = df[(df.EXPORT == 1) & (df.WORKS == 0)].pivot(
    index=["cache", "cache_patch", "strict", "oblivious", "rt", "export_with"],
    columns=["run_with"],
    values=["WORKS", "ERR-RUN"],
)
invalid.to_excel("plot_export_tiny_llm_dim01_onnx_custom.invalid.xlsx")
invalid
                                                         WORKS                                                                                       ERR-RUN
run_with                                                inputs inputs2 inputs_empty_cache                                             inputs                                            inputs2                                 inputs_empty_cache
cache cache_patch strict oblivious rt export_with
1     all         0      0         0  inputs_batch1       0.0     0.0                0.0  [ONNXRuntimeError] : 6 : RUNTIME_EXCEPTION : N...  [ONNXRuntimeError] : 6 : RUNTIME_EXCEPTION : N...  [ONNXRuntimeError] : 6 : RUNTIME_EXCEPTION : N...
                                   1  inputs_batch1       0.0     0.0                0.0  [ONNXRuntimeError] : 6 : RUNTIME_EXCEPTION : N...  [ONNXRuntimeError] : 6 : RUNTIME_EXCEPTION : N...  [ONNXRuntimeError] : 6 : RUNTIME_EXCEPTION : N...
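The WORKS column reflects whether the exported model runs with onnxruntime and, presumably given the imports, matches the expected values computed earlier. As a rough illustration only, a single model could be re-validated along these lines; model.onnx is a hypothetical file name and the make_feeds/max_diff calls are assumed to behave as in the other onnx_diagnostic examples.

# Hedged sketch: manually re-validate one exported model.
# "model.onnx" is a placeholder name, not a file produced by this script.
sess = onnxruntime.InferenceSession("model.onnx", providers=["CPUExecutionProvider"])
feeds = make_feeds(sess, torch_deepcopy(input_sets["inputs"]), use_numpy=True)
got = sess.run(None, feeds)
print(max_diff(flatten_object(expected["inputs"], drop_keys=True), got))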


success = df[(df.EXPORT == 1) & (df.WORKS == 1)].pivot(
    index=["cache", "cache_patch", "strict", "oblivious", "rt", "export_with"],
    columns=["run_with"],
    values=["WORKS"],
)
success.to_excel("plot_export_tiny_llm_dim01_onnx_custom.success.xlsx")
success
                                                         WORKS
run_with                                                inputs inputs2 inputs_batch1 inputs_empty_cache
cache cache_patch  strict oblivious rt export_with
1     all          0      0         0  inputs               1.0     1.0           1.0                1.0
                                       inputs2              1.0     1.0           1.0                1.0
                                       inputs_batch1        NaN     NaN           1.0                NaN
                                       inputs_empty_cache   1.0     1.0           1.0                1.0
                                    1  inputs               1.0     1.0           1.0                1.0
...                                                         ...     ...           ...                ...
      transformers 0      half      0  inputs_empty_cache   1.0     1.0           1.0                1.0
                                    1  inputs               1.0     1.0           1.0                1.0
                                       inputs2              1.0     1.0           1.0                1.0
                                       inputs_batch1        1.0     1.0           1.0                1.0
                                       inputs_empty_cache   1.0     1.0           1.0                1.0

62 rows × 4 columns
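A NaN in this pivot means the (export_with, run_with) combination does not appear among the successful runs; for models exported with inputs_batch1 it shows up in the failure table above instead. One extra filtering step (not in the original script) keeps only the configurations validated against every input set.

# Configurations whose exported model runs with all four input sets
# (extra step, not part of the original script).
fully_working = success.dropna()
print(f"{len(fully_working)} configurations work with every input set")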



If you run into any error, have a look at the example Export Tiny-LLM with patches.

doc.plot_legend("Tiny-LLM\nexport with\ndimension in {0,1}", "to_onnx", "tomato")
plot export tiny llm dim01 onnx custom

Total running time of the script: (4 minutes 35.306 seconds)

Related examples

Export with dynamic dimensions in {0,1} into ONNX

Export with dynamic dimensions in {0,1}

Export microsoft/phi-2

Gallery generated by Sphinx-Gallery