From an LLM to processing a prompt

The method generate produces the model’s answer for a given prompt. Let’s implement our own version to better understand how it works, and then apply it to an ONNX model.

Example with Phi 1.5

microsoft/Phi-1.5 is a small LLM. The example given below loads the model, tokenizes a prompt and generates an answer with it.

import os
import time
import sys
import pandas
from tqdm import tqdm
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from onnx_diagnostic.ext_test_case import unit_test_going
from onnx_diagnostic.helpers import string_type
from onnx_diagnostic.helpers.torch_helper import to_any, get_weight_type
from onnx_diagnostic.helpers.rt_helper import onnx_generate
from onnx_diagnostic.torch_export_patches import torch_export_patches
from onnx_diagnostic.torch_models.hghub import get_untrained_model_with_inputs
from onnx_diagnostic.torch_models.hghub.hub_api import get_pretrained_config, task_from_id
from onnx_diagnostic.tasks import random_input_kwargs
from onnx_diagnostic.export.api import to_onnx

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Timing records (one dict per measured scenario) are accumulated here.
data = []

print("-- load the model...")
if unit_test_going():
    # unit_test_going() returns True if UNITTEST_GOING is 1
    # The example switches to a faster scenario.
    model_id = "arnir0/Tiny-LLM"
    # The helper returns the model together with the inputs and the
    # dynamic shapes stored under the keys read below.
    data_export = get_untrained_model_with_inputs(model_id)
    model = data_export["model"]
    export_inputs = data_export["inputs"]
    export_shapes = data_export["dynamic_shapes"]
    tokenizer = AutoTokenizer.from_pretrained(model_id)
else:
    model_id = "microsoft/phi-1_5"
    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    # Build random inputs and dynamic shapes matching the task associated
    # with this model id (presumably reused for the export later on).
    config = get_pretrained_config(model_id)
    task = task_from_id(model_id)
    kwargs, fct = random_input_kwargs(config, task)
    res = fct(model, config, add_second_input=False, **kwargs)
    export_inputs = res["inputs"]
    export_shapes = res["dynamic_shapes"]
# Both branches end with the model placed on the selected device.
model = model.to(device)
print("-- done.")

print("-- tokenize the prompt...")
# The prompt is the header and docstring of a Python function.
prompt_text = '''def print_prime(n):
   """
   Print all primes between 1 and n
   """'''
# PyTorch tensors are requested and the attention mask is left out;
# the result is then moved to the selected device.
tokenized = tokenizer(
    prompt_text,
    return_tensors="pt",
    return_attention_mask=False,
)
inputs = tokenized.to(device)
print("-- done.")

print("-- compute the answer...")
# Measure the wall-clock time of one call to generate,
# limited to 100 new tokens.
begin = time.perf_counter()
outputs = model.generate(**inputs, max_new_tokens=100)
duration = time.perf_counter() - begin
print(f"-- done in {duration}")
# Keep the measurement in the list of timing records.
data.append({"name": "generate", "duration": duration})
shape_report = string_type(outputs, with_shape=True, with_min_max=True)
print("output shape:", shape_report)
print("-- decode the answer...")
# Only the first decoded sequence of the batch is kept.
decoded_batch = tokenizer.batch_decode(outputs)
text = decoded_batch[0]
print("-- done.")
print(text)
-- load the model...

Loading weights:   0%|          | 0/341 [00:00<?, ?it/s]
Loading weights:   0%|          | 1/341 [00:00<00:00, 23172.95it/s, Materializing param=lm_head.bias]
Loading weights:   0%|          | 1/341 [00:00<00:00, 6820.01it/s, Materializing param=lm_head.bias]
Loading weights:   1%|          | 2/341 [00:00<00:00, 4619.28it/s, Materializing param=lm_head.weight]
Loading weights:   1%|          | 2/341 [00:00<00:00, 3724.96it/s, Materializing param=lm_head.weight]
Loading weights:   1%|          | 3/341 [00:00<00:00, 3200.94it/s, Materializing param=model.embed_tokens.weight]
Loading weights:   1%|          | 3/341 [00:00<00:00, 2912.71it/s, Materializing param=model.embed_tokens.weight]
Loading weights:   1%|          | 4/341 [00:00<00:00, 2793.41it/s, Materializing param=model.final_layernorm.bias]
Loading weights:   1%|          | 4/341 [00:00<00:00, 2628.42it/s, Materializing param=model.final_layernorm.bias]
Loading weights:   1%|▏         | 5/341 [00:00<00:00, 2719.34it/s, Materializing param=model.final_layernorm.weight]
Loading weights:   1%|▏         | 5/341 [00:00<00:00, 2587.16it/s, Materializing param=model.final_layernorm.weight]
Loading weights:   2%|▏         | 6/341 [00:00<00:00, 1225.03it/s, Materializing param=model.layers.0.input_layernorm.bias]
Loading weights:   2%|▏         | 6/341 [00:00<00:00, 1175.42it/s, Materializing param=model.layers.0.input_layernorm.bias]
Loading weights:   2%|▏         | 7/341 [00:00<00:00, 1295.62it/s, Materializing param=model.layers.0.input_layernorm.weight]
Loading weights:   2%|▏         | 7/341 [00:00<00:00, 1271.55it/s, Materializing param=model.layers.0.input_layernorm.weight]
Loading weights:   2%|▏         | 8/341 [00:00<00:00, 785.89it/s, Materializing param=model.layers.0.mlp.fc1.bias]
Loading weights:   2%|▏         | 8/341 [00:00<00:00, 769.24it/s, Materializing param=model.layers.0.mlp.fc1.bias]
Loading weights:   3%|▎         | 9/341 [00:00<00:00, 825.99it/s, Materializing param=model.layers.0.mlp.fc1.weight]
Loading weights:   3%|▎         | 9/341 [00:00<00:00, 816.61it/s, Materializing param=model.layers.0.mlp.fc1.weight]
Loading weights:   3%|▎         | 10/341 [00:00<00:00, 881.06it/s, Materializing param=model.layers.0.mlp.fc2.bias]
Loading weights:   3%|▎         | 10/341 [00:00<00:00, 874.00it/s, Materializing param=model.layers.0.mlp.fc2.bias]
Loading weights:   3%|▎         | 11/341 [00:00<00:00, 934.07it/s, Materializing param=model.layers.0.mlp.fc2.weight]
Loading weights:   3%|▎         | 11/341 [00:00<00:00, 927.20it/s, Materializing param=model.layers.0.mlp.fc2.weight]
Loading weights:   4%|▎         | 12/341 [00:00<00:00, 966.06it/s, Materializing param=model.layers.0.self_attn.dense.bias]
Loading weights:   4%|▎         | 12/341 [00:00<00:00, 944.98it/s, Materializing param=model.layers.0.self_attn.dense.bias]
Loading weights:   4%|▍         | 13/341 [00:00<00:00, 959.25it/s, Materializing param=model.layers.0.self_attn.dense.weight]
Loading weights:   4%|▍         | 13/341 [00:00<00:00, 939.44it/s, Materializing param=model.layers.0.self_attn.dense.weight]
Loading weights:   4%|▍         | 14/341 [00:00<00:00, 920.47it/s, Materializing param=model.layers.0.self_attn.k_proj.bias]
Loading weights:   4%|▍         | 14/341 [00:00<00:00, 906.62it/s, Materializing param=model.layers.0.self_attn.k_proj.bias]
Loading weights:   4%|▍         | 15/341 [00:00<00:00, 922.05it/s, Materializing param=model.layers.0.self_attn.k_proj.weight]
Loading weights:   4%|▍         | 15/341 [00:00<00:00, 914.52it/s, Materializing param=model.layers.0.self_attn.k_proj.weight]
Loading weights:   5%|▍         | 16/341 [00:00<00:00, 953.96it/s, Materializing param=model.layers.0.self_attn.q_proj.bias]
Loading weights:   5%|▍         | 16/341 [00:00<00:00, 948.63it/s, Materializing param=model.layers.0.self_attn.q_proj.bias]
Loading weights:   5%|▍         | 17/341 [00:00<00:00, 988.74it/s, Materializing param=model.layers.0.self_attn.q_proj.weight]
Loading weights:   5%|▍         | 17/341 [00:00<00:00, 984.03it/s, Materializing param=model.layers.0.self_attn.q_proj.weight]
Loading weights:   5%|▌         | 18/341 [00:00<00:00, 1022.60it/s, Materializing param=model.layers.0.self_attn.v_proj.bias]
Loading weights:   5%|▌         | 18/341 [00:00<00:00, 1017.65it/s, Materializing param=model.layers.0.self_attn.v_proj.bias]
Loading weights:   6%|▌         | 19/341 [00:00<00:00, 1053.93it/s, Materializing param=model.layers.0.self_attn.v_proj.weight]
Loading weights:   6%|▌         | 19/341 [00:00<00:00, 1048.80it/s, Materializing param=model.layers.0.self_attn.v_proj.weight]
Loading weights:   6%|▌         | 20/341 [00:00<00:00, 958.46it/s, Materializing param=model.layers.1.input_layernorm.bias]
Loading weights:   6%|▌         | 20/341 [00:00<00:00, 949.68it/s, Materializing param=model.layers.1.input_layernorm.bias]
Loading weights:   6%|▌         | 21/341 [00:00<00:00, 914.10it/s, Materializing param=model.layers.1.input_layernorm.weight]
Loading weights:   6%|▌         | 21/341 [00:00<00:00, 906.76it/s, Materializing param=model.layers.1.input_layernorm.weight]
Loading weights:   6%|▋         | 22/341 [00:00<00:00, 939.24it/s, Materializing param=model.layers.1.mlp.fc1.bias]
Loading weights:   6%|▋         | 22/341 [00:00<00:00, 935.50it/s, Materializing param=model.layers.1.mlp.fc1.bias]
Loading weights:   7%|▋         | 23/341 [00:00<00:00, 851.68it/s, Materializing param=model.layers.1.mlp.fc1.weight]
Loading weights:   7%|▋         | 23/341 [00:00<00:00, 846.49it/s, Materializing param=model.layers.1.mlp.fc1.weight]
Loading weights:   7%|▋         | 24/341 [00:00<00:00, 867.36it/s, Materializing param=model.layers.1.mlp.fc2.bias]
Loading weights:   7%|▋         | 24/341 [00:00<00:00, 863.05it/s, Materializing param=model.layers.1.mlp.fc2.bias]
Loading weights:   7%|▋         | 25/341 [00:00<00:00, 855.95it/s, Materializing param=model.layers.1.mlp.fc2.weight]
Loading weights:   7%|▋         | 25/341 [00:00<00:00, 852.79it/s, Materializing param=model.layers.1.mlp.fc2.weight]
Loading weights:   8%|▊         | 26/341 [00:00<00:00, 875.22it/s, Materializing param=model.layers.1.self_attn.dense.bias]
Loading weights:   8%|▊         | 26/341 [00:00<00:00, 871.40it/s, Materializing param=model.layers.1.self_attn.dense.bias]
Loading weights:   8%|▊         | 27/341 [00:00<00:00, 883.39it/s, Materializing param=model.layers.1.self_attn.dense.weight]
Loading weights:   8%|▊         | 27/341 [00:00<00:00, 880.55it/s, Materializing param=model.layers.1.self_attn.dense.weight]
Loading weights:   8%|▊         | 28/341 [00:00<00:00, 858.24it/s, Materializing param=model.layers.1.self_attn.k_proj.bias]
Loading weights:   8%|▊         | 28/341 [00:00<00:00, 854.95it/s, Materializing param=model.layers.1.self_attn.k_proj.bias]
Loading weights:   9%|▊         | 29/341 [00:00<00:00, 837.73it/s, Materializing param=model.layers.1.self_attn.k_proj.weight]
Loading weights:   9%|▊         | 29/341 [00:00<00:00, 831.75it/s, Materializing param=model.layers.1.self_attn.k_proj.weight]
Loading weights:   9%|▉         | 30/341 [00:00<00:00, 841.06it/s, Materializing param=model.layers.1.self_attn.q_proj.bias]
Loading weights:   9%|▉         | 30/341 [00:00<00:00, 835.96it/s, Materializing param=model.layers.1.self_attn.q_proj.bias]
Loading weights:   9%|▉         | 31/341 [00:00<00:00, 833.13it/s, Materializing param=model.layers.1.self_attn.q_proj.weight]
Loading weights:   9%|▉         | 31/341 [00:00<00:00, 827.35it/s, Materializing param=model.layers.1.self_attn.q_proj.weight]
Loading weights:   9%|▉         | 32/341 [00:00<00:00, 821.82it/s, Materializing param=model.layers.1.self_attn.v_proj.bias]
Loading weights:   9%|▉         | 32/341 [00:00<00:00, 818.38it/s, Materializing param=model.layers.1.self_attn.v_proj.bias]
Loading weights:  10%|▉         | 33/341 [00:00<00:00, 827.46it/s, Materializing param=model.layers.1.self_attn.v_proj.weight]
Loading weights:  10%|▉         | 33/341 [00:00<00:00, 823.15it/s, Materializing param=model.layers.1.self_attn.v_proj.weight]
Loading weights:  10%|▉         | 34/341 [00:00<00:00, 734.18it/s, Materializing param=model.layers.2.input_layernorm.bias]
Loading weights:  10%|▉         | 34/341 [00:00<00:00, 729.26it/s, Materializing param=model.layers.2.input_layernorm.bias]
Loading weights:  10%|█         | 35/341 [00:00<00:00, 719.92it/s, Materializing param=model.layers.2.input_layernorm.weight]
Loading weights:  10%|█         | 35/341 [00:00<00:00, 715.11it/s, Materializing param=model.layers.2.input_layernorm.weight]
Loading weights:  11%|█         | 36/341 [00:00<00:00, 663.17it/s, Materializing param=model.layers.2.mlp.fc1.bias]
Loading weights:  11%|█         | 36/341 [00:00<00:00, 659.07it/s, Materializing param=model.layers.2.mlp.fc1.bias]
Loading weights:  11%|█         | 37/341 [00:00<00:00, 636.22it/s, Materializing param=model.layers.2.mlp.fc1.weight]
Loading weights:  11%|█         | 37/341 [00:00<00:00, 634.47it/s, Materializing param=model.layers.2.mlp.fc1.weight]
Loading weights:  11%|█         | 38/341 [00:00<00:00, 647.84it/s, Materializing param=model.layers.2.mlp.fc2.bias]
Loading weights:  11%|█         | 38/341 [00:00<00:00, 646.74it/s, Materializing param=model.layers.2.mlp.fc2.bias]
Loading weights:  11%|█▏        | 39/341 [00:00<00:00, 602.76it/s, Materializing param=model.layers.2.mlp.fc2.weight]
Loading weights:  11%|█▏        | 39/341 [00:00<00:00, 599.78it/s, Materializing param=model.layers.2.mlp.fc2.weight]
Loading weights:  12%|█▏        | 40/341 [00:00<00:00, 609.43it/s, Materializing param=model.layers.2.self_attn.dense.bias]
Loading weights:  12%|█▏        | 40/341 [00:00<00:00, 607.91it/s, Materializing param=model.layers.2.self_attn.dense.bias]
Loading weights:  12%|█▏        | 41/341 [00:00<00:00, 610.21it/s, Materializing param=model.layers.2.self_attn.dense.weight]
Loading weights:  12%|█▏        | 41/341 [00:00<00:00, 608.67it/s, Materializing param=model.layers.2.self_attn.dense.weight]
Loading weights:  12%|█▏        | 42/341 [00:00<00:00, 587.33it/s, Materializing param=model.layers.2.self_attn.k_proj.bias]
Loading weights:  12%|█▏        | 42/341 [00:00<00:00, 585.57it/s, Materializing param=model.layers.2.self_attn.k_proj.bias]
Loading weights:  13%|█▎        | 43/341 [00:00<00:00, 559.95it/s, Materializing param=model.layers.2.self_attn.k_proj.weight]
Loading weights:  13%|█▎        | 43/341 [00:00<00:00, 557.55it/s, Materializing param=model.layers.2.self_attn.k_proj.weight]
Loading weights:  13%|█▎        | 44/341 [00:00<00:00, 532.06it/s, Materializing param=model.layers.2.self_attn.q_proj.bias]
Loading weights:  13%|█▎        | 44/341 [00:00<00:00, 529.95it/s, Materializing param=model.layers.2.self_attn.q_proj.bias]
Loading weights:  13%|█▎        | 45/341 [00:00<00:00, 529.71it/s, Materializing param=model.layers.2.self_attn.q_proj.weight]
Loading weights:  13%|█▎        | 45/341 [00:00<00:00, 527.79it/s, Materializing param=model.layers.2.self_attn.q_proj.weight]
Loading weights:  13%|█▎        | 46/341 [00:00<00:00, 523.09it/s, Materializing param=model.layers.2.self_attn.v_proj.bias]
Loading weights:  13%|█▎        | 46/341 [00:00<00:00, 520.68it/s, Materializing param=model.layers.2.self_attn.v_proj.bias]
Loading weights:  14%|█▍        | 47/341 [00:00<00:00, 520.51it/s, Materializing param=model.layers.2.self_attn.v_proj.weight]
Loading weights:  14%|█▍        | 47/341 [00:00<00:00, 519.49it/s, Materializing param=model.layers.2.self_attn.v_proj.weight]
Loading weights:  14%|█▍        | 48/341 [00:00<00:00, 528.36it/s, Materializing param=model.layers.3.input_layernorm.bias]
Loading weights:  14%|█▍        | 48/341 [00:00<00:00, 527.78it/s, Materializing param=model.layers.3.input_layernorm.bias]
Loading weights:  14%|█▍        | 49/341 [00:00<00:00, 536.26it/s, Materializing param=model.layers.3.input_layernorm.weight]
Loading weights:  14%|█▍        | 49/341 [00:00<00:00, 535.73it/s, Materializing param=model.layers.3.input_layernorm.weight]
Loading weights:  15%|█▍        | 50/341 [00:00<00:00, 544.94it/s, Materializing param=model.layers.3.mlp.fc1.bias]
Loading weights:  15%|█▍        | 50/341 [00:00<00:00, 544.43it/s, Materializing param=model.layers.3.mlp.fc1.bias]
Loading weights:  15%|█▍        | 51/341 [00:00<00:00, 547.72it/s, Materializing param=model.layers.3.mlp.fc1.weight]
Loading weights:  15%|█▍        | 51/341 [00:00<00:00, 546.09it/s, Materializing param=model.layers.3.mlp.fc1.weight]
Loading weights:  15%|█▌        | 52/341 [00:00<00:00, 541.83it/s, Materializing param=model.layers.3.mlp.fc2.bias]
Loading weights:  15%|█▌        | 52/341 [00:00<00:00, 539.83it/s, Materializing param=model.layers.3.mlp.fc2.bias]
Loading weights:  16%|█▌        | 53/341 [00:00<00:00, 542.15it/s, Materializing param=model.layers.3.mlp.fc2.weight]
Loading weights:  16%|█▌        | 53/341 [00:00<00:00, 541.32it/s, Materializing param=model.layers.3.mlp.fc2.weight]
Loading weights:  16%|█▌        | 54/341 [00:00<00:00, 549.76it/s, Materializing param=model.layers.3.self_attn.dense.bias]
Loading weights:  16%|█▌        | 54/341 [00:00<00:00, 549.22it/s, Materializing param=model.layers.3.self_attn.dense.bias]
Loading weights:  16%|█▌        | 55/341 [00:00<00:00, 557.28it/s, Materializing param=model.layers.3.self_attn.dense.weight]
Loading weights:  16%|█▌        | 55/341 [00:00<00:00, 556.77it/s, Materializing param=model.layers.3.self_attn.dense.weight]
Loading weights:  16%|█▋        | 56/341 [00:00<00:00, 564.73it/s, Materializing param=model.layers.3.self_attn.k_proj.bias]
Loading weights:  16%|█▋        | 56/341 [00:00<00:00, 564.24it/s, Materializing param=model.layers.3.self_attn.k_proj.bias]
Loading weights:  17%|█▋        | 57/341 [00:00<00:00, 572.12it/s, Materializing param=model.layers.3.self_attn.k_proj.weight]
Loading weights:  17%|█▋        | 57/341 [00:00<00:00, 571.63it/s, Materializing param=model.layers.3.self_attn.k_proj.weight]
Loading weights:  17%|█▋        | 58/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.3.self_attn.k_proj.weight]
Loading weights:  17%|█▋        | 58/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.3.self_attn.q_proj.bias]
Loading weights:  17%|█▋        | 58/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.3.self_attn.q_proj.bias]
Loading weights:  17%|█▋        | 59/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.3.self_attn.q_proj.weight]
Loading weights:  17%|█▋        | 59/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.3.self_attn.q_proj.weight]
Loading weights:  18%|█▊        | 60/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.3.self_attn.v_proj.bias]
Loading weights:  18%|█▊        | 60/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.3.self_attn.v_proj.bias]
Loading weights:  18%|█▊        | 61/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.3.self_attn.v_proj.weight]
Loading weights:  18%|█▊        | 61/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.3.self_attn.v_proj.weight]
Loading weights:  18%|█▊        | 62/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.4.input_layernorm.bias]
Loading weights:  18%|█▊        | 62/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.4.input_layernorm.bias]
Loading weights:  18%|█▊        | 63/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.4.input_layernorm.weight]
Loading weights:  18%|█▊        | 63/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.4.input_layernorm.weight]
Loading weights:  19%|█▉        | 64/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.4.mlp.fc1.bias]
Loading weights:  19%|█▉        | 64/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.4.mlp.fc1.bias]
Loading weights:  19%|█▉        | 65/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.4.mlp.fc1.weight]
Loading weights:  19%|█▉        | 65/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.4.mlp.fc1.weight]
Loading weights:  19%|█▉        | 66/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.4.mlp.fc2.bias]
Loading weights:  19%|█▉        | 66/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.4.mlp.fc2.bias]
Loading weights:  20%|█▉        | 67/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.4.mlp.fc2.weight]
Loading weights:  20%|█▉        | 67/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.4.mlp.fc2.weight]
Loading weights:  20%|█▉        | 68/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.4.self_attn.dense.bias]
Loading weights:  20%|█▉        | 68/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.4.self_attn.dense.bias]
Loading weights:  20%|██        | 69/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.4.self_attn.dense.weight]
Loading weights:  20%|██        | 69/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.4.self_attn.dense.weight]
Loading weights:  21%|██        | 70/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.4.self_attn.k_proj.bias]
Loading weights:  21%|██        | 70/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.4.self_attn.k_proj.bias]
Loading weights:  21%|██        | 71/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.4.self_attn.k_proj.weight]
Loading weights:  21%|██        | 71/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.4.self_attn.k_proj.weight]
Loading weights:  21%|██        | 72/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.4.self_attn.q_proj.bias]
Loading weights:  21%|██        | 72/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.4.self_attn.q_proj.bias]
Loading weights:  21%|██▏       | 73/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.4.self_attn.q_proj.weight]
Loading weights:  21%|██▏       | 73/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.4.self_attn.q_proj.weight]
Loading weights:  22%|██▏       | 74/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.4.self_attn.v_proj.bias]
Loading weights:  22%|██▏       | 74/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.4.self_attn.v_proj.bias]
Loading weights:  22%|██▏       | 75/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.4.self_attn.v_proj.weight]
Loading weights:  22%|██▏       | 75/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.4.self_attn.v_proj.weight]
Loading weights:  22%|██▏       | 76/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.5.input_layernorm.bias]
Loading weights:  22%|██▏       | 76/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.5.input_layernorm.bias]
Loading weights:  23%|██▎       | 77/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.5.input_layernorm.weight]
Loading weights:  23%|██▎       | 77/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.5.input_layernorm.weight]
Loading weights:  23%|██▎       | 78/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.5.mlp.fc1.bias]
Loading weights:  23%|██▎       | 78/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.5.mlp.fc1.bias]
Loading weights:  23%|██▎       | 79/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.5.mlp.fc1.weight]
Loading weights:  23%|██▎       | 79/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.5.mlp.fc1.weight]
Loading weights:  23%|██▎       | 80/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.5.mlp.fc2.bias]
Loading weights:  23%|██▎       | 80/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.5.mlp.fc2.bias]
Loading weights:  24%|██▍       | 81/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.5.mlp.fc2.weight]
Loading weights:  24%|██▍       | 81/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.5.mlp.fc2.weight]
Loading weights:  24%|██▍       | 82/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.5.self_attn.dense.bias]
Loading weights:  24%|██▍       | 82/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.5.self_attn.dense.bias]
Loading weights:  24%|██▍       | 83/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.5.self_attn.dense.weight]
Loading weights:  24%|██▍       | 83/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.5.self_attn.dense.weight]
Loading weights:  25%|██▍       | 84/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.5.self_attn.k_proj.bias]
Loading weights:  25%|██▍       | 84/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.5.self_attn.k_proj.bias]
Loading weights:  25%|██▍       | 85/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.5.self_attn.k_proj.weight]
Loading weights:  25%|██▍       | 85/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.5.self_attn.k_proj.weight]
Loading weights:  25%|██▌       | 86/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.5.self_attn.q_proj.bias]
Loading weights:  25%|██▌       | 86/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.5.self_attn.q_proj.bias]
Loading weights:  26%|██▌       | 87/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.5.self_attn.q_proj.weight]
Loading weights:  26%|██▌       | 87/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.5.self_attn.q_proj.weight]
Loading weights:  26%|██▌       | 88/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.5.self_attn.v_proj.bias]
Loading weights:  26%|██▌       | 88/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.5.self_attn.v_proj.bias]
Loading weights:  26%|██▌       | 89/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.5.self_attn.v_proj.weight]
Loading weights:  26%|██▌       | 89/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.5.self_attn.v_proj.weight]
Loading weights:  26%|██▋       | 90/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.6.input_layernorm.bias]
Loading weights:  26%|██▋       | 90/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.6.input_layernorm.bias]
Loading weights:  27%|██▋       | 91/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.6.input_layernorm.weight]
Loading weights:  27%|██▋       | 91/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.6.input_layernorm.weight]
Loading weights:  27%|██▋       | 92/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.6.mlp.fc1.bias]
Loading weights:  27%|██▋       | 92/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.6.mlp.fc1.bias]
Loading weights:  27%|██▋       | 93/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.6.mlp.fc1.weight]
Loading weights:  27%|██▋       | 93/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.6.mlp.fc1.weight]
Loading weights:  28%|██▊       | 94/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.6.mlp.fc2.bias]
Loading weights:  28%|██▊       | 94/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.6.mlp.fc2.bias]
Loading weights:  28%|██▊       | 95/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.6.mlp.fc2.weight]
Loading weights:  28%|██▊       | 95/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.6.mlp.fc2.weight]
Loading weights:  28%|██▊       | 96/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.6.self_attn.dense.bias]
Loading weights:  28%|██▊       | 96/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.6.self_attn.dense.bias]
Loading weights:  28%|██▊       | 97/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.6.self_attn.dense.weight]
Loading weights:  28%|██▊       | 97/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.6.self_attn.dense.weight]
Loading weights:  29%|██▊       | 98/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.6.self_attn.k_proj.bias]
Loading weights:  29%|██▊       | 98/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.6.self_attn.k_proj.bias]
Loading weights:  29%|██▉       | 99/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.6.self_attn.k_proj.weight]
Loading weights:  29%|██▉       | 99/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.6.self_attn.k_proj.weight]
Loading weights:  29%|██▉       | 100/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.6.self_attn.q_proj.bias]
Loading weights:  29%|██▉       | 100/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.6.self_attn.q_proj.bias]
Loading weights:  30%|██▉       | 101/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.6.self_attn.q_proj.weight]
Loading weights:  30%|██▉       | 101/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.6.self_attn.q_proj.weight]
Loading weights:  30%|██▉       | 102/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.6.self_attn.v_proj.bias]
Loading weights:  30%|██▉       | 102/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.6.self_attn.v_proj.bias]
Loading weights:  30%|███       | 103/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.6.self_attn.v_proj.weight]
Loading weights:  30%|███       | 103/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.6.self_attn.v_proj.weight]
Loading weights:  30%|███       | 104/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.7.input_layernorm.bias]
Loading weights:  30%|███       | 104/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.7.input_layernorm.bias]
Loading weights:  31%|███       | 105/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.7.input_layernorm.weight]
Loading weights:  31%|███       | 105/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.7.input_layernorm.weight]
Loading weights:  31%|███       | 106/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.7.mlp.fc1.bias]
Loading weights:  31%|███       | 106/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.7.mlp.fc1.bias]
Loading weights:  31%|███▏      | 107/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.7.mlp.fc1.weight]
Loading weights:  31%|███▏      | 107/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.7.mlp.fc1.weight]
Loading weights:  32%|███▏      | 108/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.7.mlp.fc2.bias]
Loading weights:  32%|███▏      | 108/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.7.mlp.fc2.bias]
Loading weights:  32%|███▏      | 109/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.7.mlp.fc2.weight]
Loading weights:  32%|███▏      | 109/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.7.mlp.fc2.weight]
Loading weights:  32%|███▏      | 110/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.7.self_attn.dense.bias]
Loading weights:  32%|███▏      | 110/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.7.self_attn.dense.bias]
Loading weights:  33%|███▎      | 111/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.7.self_attn.dense.weight]
Loading weights:  33%|███▎      | 111/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.7.self_attn.dense.weight]
Loading weights:  33%|███▎      | 112/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.7.self_attn.k_proj.bias]
Loading weights:  33%|███▎      | 112/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.7.self_attn.k_proj.bias]
Loading weights:  33%|███▎      | 113/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.7.self_attn.k_proj.weight]
Loading weights:  33%|███▎      | 113/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.7.self_attn.k_proj.weight]
Loading weights:  33%|███▎      | 114/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.7.self_attn.q_proj.bias]
Loading weights:  33%|███▎      | 114/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.7.self_attn.q_proj.bias]
Loading weights:  34%|███▎      | 115/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.7.self_attn.q_proj.weight]
Loading weights:  34%|███▎      | 115/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.7.self_attn.q_proj.weight]
Loading weights:  34%|███▍      | 116/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.7.self_attn.v_proj.bias]
Loading weights:  34%|███▍      | 116/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.7.self_attn.v_proj.bias]
Loading weights:  34%|███▍      | 117/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.7.self_attn.v_proj.weight]
Loading weights:  34%|███▍      | 117/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.7.self_attn.v_proj.weight]
Loading weights:  35%|███▍      | 118/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.8.input_layernorm.bias]
Loading weights:  35%|███▍      | 118/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.8.input_layernorm.bias]
Loading weights:  35%|███▍      | 119/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.8.input_layernorm.weight]
Loading weights:  35%|███▍      | 119/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.8.input_layernorm.weight]
Loading weights:  35%|███▌      | 120/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.8.mlp.fc1.bias]
Loading weights:  35%|███▌      | 120/341 [00:00<00:00, 579.61it/s, Materializing param=model.layers.8.mlp.fc1.bias]
Loading weights:  35%|███▌      | 121/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.8.mlp.fc1.bias]
Loading weights:  35%|███▌      | 121/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.8.mlp.fc1.weight]
Loading weights:  35%|███▌      | 121/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.8.mlp.fc1.weight]
Loading weights:  36%|███▌      | 122/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.8.mlp.fc2.bias]
Loading weights:  36%|███▌      | 122/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.8.mlp.fc2.bias]
Loading weights:  36%|███▌      | 123/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.8.mlp.fc2.weight]
Loading weights:  36%|███▌      | 123/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.8.mlp.fc2.weight]
Loading weights:  36%|███▋      | 124/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.8.self_attn.dense.bias]
Loading weights:  36%|███▋      | 124/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.8.self_attn.dense.bias]
Loading weights:  37%|███▋      | 125/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.8.self_attn.dense.weight]
Loading weights:  37%|███▋      | 125/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.8.self_attn.dense.weight]
Loading weights:  37%|███▋      | 126/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.8.self_attn.k_proj.bias]
Loading weights:  37%|███▋      | 126/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.8.self_attn.k_proj.bias]
Loading weights:  37%|███▋      | 127/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.8.self_attn.k_proj.weight]
Loading weights:  37%|███▋      | 127/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.8.self_attn.k_proj.weight]
Loading weights:  38%|███▊      | 128/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.8.self_attn.q_proj.bias]
Loading weights:  38%|███▊      | 128/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.8.self_attn.q_proj.bias]
Loading weights:  38%|███▊      | 129/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.8.self_attn.q_proj.weight]
Loading weights:  38%|███▊      | 129/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.8.self_attn.q_proj.weight]
Loading weights:  38%|███▊      | 130/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.8.self_attn.v_proj.bias]
Loading weights:  38%|███▊      | 130/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.8.self_attn.v_proj.bias]
Loading weights:  38%|███▊      | 131/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.8.self_attn.v_proj.weight]
Loading weights:  38%|███▊      | 131/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.8.self_attn.v_proj.weight]
Loading weights:  39%|███▊      | 132/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.9.input_layernorm.bias]
Loading weights:  39%|███▊      | 132/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.9.input_layernorm.bias]
Loading weights:  39%|███▉      | 133/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.9.input_layernorm.weight]
Loading weights:  39%|███▉      | 133/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.9.input_layernorm.weight]
Loading weights:  39%|███▉      | 134/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.9.mlp.fc1.bias]
Loading weights:  39%|███▉      | 134/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.9.mlp.fc1.bias]
Loading weights:  40%|███▉      | 135/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.9.mlp.fc1.weight]
Loading weights:  40%|███▉      | 135/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.9.mlp.fc1.weight]
Loading weights:  40%|███▉      | 136/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.9.mlp.fc2.bias]
Loading weights:  40%|███▉      | 136/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.9.mlp.fc2.bias]
Loading weights:  40%|████      | 137/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.9.mlp.fc2.weight]
Loading weights:  40%|████      | 137/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.9.mlp.fc2.weight]
Loading weights:  40%|████      | 138/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.9.self_attn.dense.bias]
Loading weights:  40%|████      | 138/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.9.self_attn.dense.bias]
Loading weights:  41%|████      | 139/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.9.self_attn.dense.weight]
Loading weights:  41%|████      | 139/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.9.self_attn.dense.weight]
Loading weights:  41%|████      | 140/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.9.self_attn.k_proj.bias]
Loading weights:  41%|████      | 140/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.9.self_attn.k_proj.bias]
Loading weights:  41%|████▏     | 141/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.9.self_attn.k_proj.weight]
Loading weights:  41%|████▏     | 141/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.9.self_attn.k_proj.weight]
Loading weights:  42%|████▏     | 142/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.9.self_attn.q_proj.bias]
Loading weights:  42%|████▏     | 142/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.9.self_attn.q_proj.bias]
Loading weights:  42%|████▏     | 143/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.9.self_attn.q_proj.weight]
Loading weights:  42%|████▏     | 143/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.9.self_attn.q_proj.weight]
Loading weights:  42%|████▏     | 144/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.9.self_attn.v_proj.bias]
Loading weights:  42%|████▏     | 144/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.9.self_attn.v_proj.bias]
Loading weights:  43%|████▎     | 145/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.9.self_attn.v_proj.weight]
Loading weights:  43%|████▎     | 145/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.9.self_attn.v_proj.weight]
Loading weights:  43%|████▎     | 146/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.10.input_layernorm.bias]
Loading weights:  43%|████▎     | 146/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.10.input_layernorm.bias]
Loading weights:  43%|████▎     | 147/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.10.input_layernorm.weight]
Loading weights:  43%|████▎     | 147/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.10.input_layernorm.weight]
Loading weights:  43%|████▎     | 148/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.10.mlp.fc1.bias]
Loading weights:  43%|████▎     | 148/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.10.mlp.fc1.bias]
Loading weights:  44%|████▎     | 149/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.10.mlp.fc1.weight]
Loading weights:  44%|████▎     | 149/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.10.mlp.fc1.weight]
Loading weights:  44%|████▍     | 150/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.10.mlp.fc2.bias]
Loading weights:  44%|████▍     | 150/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.10.mlp.fc2.bias]
Loading weights:  44%|████▍     | 151/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.10.mlp.fc2.weight]
Loading weights:  44%|████▍     | 151/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.10.mlp.fc2.weight]
Loading weights:  45%|████▍     | 152/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.10.self_attn.dense.bias]
Loading weights:  45%|████▍     | 152/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.10.self_attn.dense.bias]
Loading weights:  45%|████▍     | 153/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.10.self_attn.dense.weight]
Loading weights:  45%|████▍     | 153/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.10.self_attn.dense.weight]
Loading weights:  45%|████▌     | 154/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.10.self_attn.k_proj.bias]
Loading weights:  45%|████▌     | 154/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.10.self_attn.k_proj.bias]
Loading weights:  45%|████▌     | 155/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.10.self_attn.k_proj.weight]
Loading weights:  45%|████▌     | 155/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.10.self_attn.k_proj.weight]
Loading weights:  46%|████▌     | 156/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.10.self_attn.q_proj.bias]
Loading weights:  46%|████▌     | 156/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.10.self_attn.q_proj.bias]
Loading weights:  46%|████▌     | 157/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.10.self_attn.q_proj.weight]
Loading weights:  46%|████▌     | 157/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.10.self_attn.q_proj.weight]
Loading weights:  46%|████▋     | 158/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.10.self_attn.v_proj.bias]
Loading weights:  46%|████▋     | 158/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.10.self_attn.v_proj.bias]
Loading weights:  47%|████▋     | 159/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.10.self_attn.v_proj.weight]
Loading weights:  47%|████▋     | 159/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.10.self_attn.v_proj.weight]
Loading weights:  47%|████▋     | 160/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.11.input_layernorm.bias]
Loading weights:  47%|████▋     | 160/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.11.input_layernorm.bias]
Loading weights:  47%|████▋     | 161/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.11.input_layernorm.weight]
Loading weights:  47%|████▋     | 161/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.11.input_layernorm.weight]
Loading weights:  48%|████▊     | 162/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.11.mlp.fc1.bias]
Loading weights:  48%|████▊     | 162/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.11.mlp.fc1.bias]
Loading weights:  48%|████▊     | 163/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.11.mlp.fc1.weight]
Loading weights:  48%|████▊     | 163/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.11.mlp.fc1.weight]
Loading weights:  48%|████▊     | 164/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.11.mlp.fc2.bias]
Loading weights:  48%|████▊     | 164/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.11.mlp.fc2.bias]
Loading weights:  48%|████▊     | 165/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.11.mlp.fc2.weight]
Loading weights:  48%|████▊     | 165/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.11.mlp.fc2.weight]
Loading weights:  49%|████▊     | 166/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.11.self_attn.dense.bias]
Loading weights:  49%|████▊     | 166/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.11.self_attn.dense.bias]
Loading weights:  49%|████▉     | 167/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.11.self_attn.dense.weight]
Loading weights:  49%|████▉     | 167/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.11.self_attn.dense.weight]
Loading weights:  49%|████▉     | 168/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.11.self_attn.k_proj.bias]
Loading weights:  49%|████▉     | 168/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.11.self_attn.k_proj.bias]
Loading weights:  50%|████▉     | 169/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.11.self_attn.k_proj.weight]
Loading weights:  50%|████▉     | 169/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.11.self_attn.k_proj.weight]
Loading weights:  50%|████▉     | 170/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.11.self_attn.q_proj.bias]
Loading weights:  50%|████▉     | 170/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.11.self_attn.q_proj.bias]
Loading weights:  50%|█████     | 171/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.11.self_attn.q_proj.weight]
Loading weights:  50%|█████     | 171/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.11.self_attn.q_proj.weight]
Loading weights:  50%|█████     | 172/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.11.self_attn.v_proj.bias]
Loading weights:  50%|█████     | 172/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.11.self_attn.v_proj.bias]
Loading weights:  51%|█████     | 173/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.11.self_attn.v_proj.weight]
Loading weights:  51%|█████     | 173/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.11.self_attn.v_proj.weight]
Loading weights:  51%|█████     | 174/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.12.input_layernorm.bias]
Loading weights:  51%|█████     | 174/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.12.input_layernorm.bias]
Loading weights:  51%|█████▏    | 175/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.12.input_layernorm.weight]
Loading weights:  51%|█████▏    | 175/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.12.input_layernorm.weight]
Loading weights:  52%|█████▏    | 176/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.12.mlp.fc1.bias]
Loading weights:  52%|█████▏    | 176/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.12.mlp.fc1.bias]
Loading weights:  52%|█████▏    | 177/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.12.mlp.fc1.weight]
Loading weights:  52%|█████▏    | 177/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.12.mlp.fc1.weight]
Loading weights:  52%|█████▏    | 178/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.12.mlp.fc2.bias]
Loading weights:  52%|█████▏    | 178/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.12.mlp.fc2.bias]
Loading weights:  52%|█████▏    | 179/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.12.mlp.fc2.weight]
Loading weights:  52%|█████▏    | 179/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.12.mlp.fc2.weight]
Loading weights:  53%|█████▎    | 180/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.12.self_attn.dense.bias]
Loading weights:  53%|█████▎    | 180/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.12.self_attn.dense.bias]
Loading weights:  53%|█████▎    | 181/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.12.self_attn.dense.weight]
Loading weights:  53%|█████▎    | 181/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.12.self_attn.dense.weight]
Loading weights:  53%|█████▎    | 182/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.12.self_attn.k_proj.bias]
Loading weights:  53%|█████▎    | 182/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.12.self_attn.k_proj.bias]
Loading weights:  54%|█████▎    | 183/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.12.self_attn.k_proj.weight]
Loading weights:  54%|█████▎    | 183/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.12.self_attn.k_proj.weight]
Loading weights:  54%|█████▍    | 184/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.12.self_attn.q_proj.bias]
Loading weights:  54%|█████▍    | 184/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.12.self_attn.q_proj.bias]
Loading weights:  54%|█████▍    | 185/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.12.self_attn.q_proj.weight]
Loading weights:  54%|█████▍    | 185/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.12.self_attn.q_proj.weight]
Loading weights:  55%|█████▍    | 186/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.12.self_attn.v_proj.bias]
Loading weights:  55%|█████▍    | 186/341 [00:00<00:00, 588.87it/s, Materializing param=model.layers.12.self_attn.v_proj.bias]
Loading weights:  55%|█████▍    | 187/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.12.self_attn.v_proj.bias]
Loading weights:  55%|█████▍    | 187/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.12.self_attn.v_proj.weight]
Loading weights:  55%|█████▍    | 187/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.12.self_attn.v_proj.weight]
Loading weights:  55%|█████▌    | 188/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.13.input_layernorm.bias]
Loading weights:  55%|█████▌    | 188/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.13.input_layernorm.bias]
Loading weights:  55%|█████▌    | 189/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.13.input_layernorm.weight]
Loading weights:  55%|█████▌    | 189/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.13.input_layernorm.weight]
Loading weights:  56%|█████▌    | 190/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.13.mlp.fc1.bias]
Loading weights:  56%|█████▌    | 190/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.13.mlp.fc1.bias]
Loading weights:  56%|█████▌    | 191/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.13.mlp.fc1.weight]
Loading weights:  56%|█████▌    | 191/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.13.mlp.fc1.weight]
Loading weights:  56%|█████▋    | 192/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.13.mlp.fc2.bias]
Loading weights:  56%|█████▋    | 192/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.13.mlp.fc2.bias]
Loading weights:  57%|█████▋    | 193/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.13.mlp.fc2.weight]
Loading weights:  57%|█████▋    | 193/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.13.mlp.fc2.weight]
Loading weights:  57%|█████▋    | 194/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.13.self_attn.dense.bias]
Loading weights:  57%|█████▋    | 194/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.13.self_attn.dense.bias]
Loading weights:  57%|█████▋    | 195/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.13.self_attn.dense.weight]
Loading weights:  57%|█████▋    | 195/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.13.self_attn.dense.weight]
Loading weights:  57%|█████▋    | 196/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.13.self_attn.k_proj.bias]
Loading weights:  57%|█████▋    | 196/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.13.self_attn.k_proj.bias]
Loading weights:  58%|█████▊    | 197/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.13.self_attn.k_proj.weight]
Loading weights:  58%|█████▊    | 197/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.13.self_attn.k_proj.weight]
Loading weights:  58%|█████▊    | 198/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.13.self_attn.q_proj.bias]
Loading weights:  58%|█████▊    | 198/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.13.self_attn.q_proj.bias]
Loading weights:  58%|█████▊    | 199/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.13.self_attn.q_proj.weight]
Loading weights:  58%|█████▊    | 199/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.13.self_attn.q_proj.weight]
Loading weights:  59%|█████▊    | 200/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.13.self_attn.v_proj.bias]
Loading weights:  59%|█████▊    | 200/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.13.self_attn.v_proj.bias]
Loading weights:  59%|█████▉    | 201/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.13.self_attn.v_proj.weight]
Loading weights:  59%|█████▉    | 201/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.13.self_attn.v_proj.weight]
Loading weights:  59%|█████▉    | 202/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.14.input_layernorm.bias]
Loading weights:  59%|█████▉    | 202/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.14.input_layernorm.bias]
Loading weights:  60%|█████▉    | 203/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.14.input_layernorm.weight]
Loading weights:  60%|█████▉    | 203/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.14.input_layernorm.weight]
Loading weights:  60%|█████▉    | 204/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.14.mlp.fc1.bias]
Loading weights:  60%|█████▉    | 204/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.14.mlp.fc1.bias]
Loading weights:  60%|██████    | 205/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.14.mlp.fc1.weight]
Loading weights:  60%|██████    | 205/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.14.mlp.fc1.weight]
Loading weights:  60%|██████    | 206/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.14.mlp.fc2.bias]
Loading weights:  60%|██████    | 206/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.14.mlp.fc2.bias]
Loading weights:  61%|██████    | 207/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.14.mlp.fc2.weight]
Loading weights:  61%|██████    | 207/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.14.mlp.fc2.weight]
Loading weights:  61%|██████    | 208/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.14.self_attn.dense.bias]
Loading weights:  61%|██████    | 208/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.14.self_attn.dense.bias]
Loading weights:  61%|██████▏   | 209/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.14.self_attn.dense.weight]
Loading weights:  61%|██████▏   | 209/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.14.self_attn.dense.weight]
Loading weights:  62%|██████▏   | 210/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.14.self_attn.k_proj.bias]
Loading weights:  62%|██████▏   | 210/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.14.self_attn.k_proj.bias]
Loading weights:  62%|██████▏   | 211/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.14.self_attn.k_proj.weight]
Loading weights:  62%|██████▏   | 211/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.14.self_attn.k_proj.weight]
Loading weights:  62%|██████▏   | 212/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.14.self_attn.q_proj.bias]
Loading weights:  62%|██████▏   | 212/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.14.self_attn.q_proj.bias]
Loading weights:  62%|██████▏   | 213/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.14.self_attn.q_proj.weight]
Loading weights:  62%|██████▏   | 213/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.14.self_attn.q_proj.weight]
Loading weights:  63%|██████▎   | 214/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.14.self_attn.v_proj.bias]
Loading weights:  63%|██████▎   | 214/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.14.self_attn.v_proj.bias]
Loading weights:  63%|██████▎   | 215/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.14.self_attn.v_proj.weight]
Loading weights:  63%|██████▎   | 215/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.14.self_attn.v_proj.weight]
Loading weights:  63%|██████▎   | 216/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.15.input_layernorm.bias]
Loading weights:  63%|██████▎   | 216/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.15.input_layernorm.bias]
Loading weights:  64%|██████▎   | 217/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.15.input_layernorm.weight]
Loading weights:  64%|██████▎   | 217/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.15.input_layernorm.weight]
Loading weights:  64%|██████▍   | 218/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.15.mlp.fc1.bias]
Loading weights:  64%|██████▍   | 218/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.15.mlp.fc1.bias]
Loading weights:  64%|██████▍   | 219/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.15.mlp.fc1.weight]
Loading weights:  64%|██████▍   | 219/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.15.mlp.fc1.weight]
Loading weights:  65%|██████▍   | 220/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.15.mlp.fc2.bias]
Loading weights:  65%|██████▍   | 220/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.15.mlp.fc2.bias]
Loading weights:  65%|██████▍   | 221/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.15.mlp.fc2.weight]
Loading weights:  65%|██████▍   | 221/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.15.mlp.fc2.weight]
Loading weights:  65%|██████▌   | 222/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.15.self_attn.dense.bias]
Loading weights:  65%|██████▌   | 222/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.15.self_attn.dense.bias]
Loading weights:  65%|██████▌   | 223/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.15.self_attn.dense.weight]
Loading weights:  65%|██████▌   | 223/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.15.self_attn.dense.weight]
Loading weights:  66%|██████▌   | 224/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.15.self_attn.k_proj.bias]
Loading weights:  66%|██████▌   | 224/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.15.self_attn.k_proj.bias]
Loading weights:  66%|██████▌   | 225/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.15.self_attn.k_proj.weight]
Loading weights:  66%|██████▌   | 225/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.15.self_attn.k_proj.weight]
Loading weights:  66%|██████▋   | 226/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.15.self_attn.q_proj.bias]
Loading weights:  66%|██████▋   | 226/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.15.self_attn.q_proj.bias]
Loading weights:  67%|██████▋   | 227/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.15.self_attn.q_proj.weight]
Loading weights:  67%|██████▋   | 227/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.15.self_attn.q_proj.weight]
Loading weights:  67%|██████▋   | 228/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.15.self_attn.v_proj.bias]
Loading weights:  67%|██████▋   | 228/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.15.self_attn.v_proj.bias]
Loading weights:  67%|██████▋   | 229/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.15.self_attn.v_proj.weight]
Loading weights:  67%|██████▋   | 229/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.15.self_attn.v_proj.weight]
Loading weights:  67%|██████▋   | 230/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.16.input_layernorm.bias]
Loading weights:  67%|██████▋   | 230/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.16.input_layernorm.bias]
Loading weights:  68%|██████▊   | 231/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.16.input_layernorm.weight]
Loading weights:  68%|██████▊   | 231/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.16.input_layernorm.weight]
Loading weights:  68%|██████▊   | 232/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.16.mlp.fc1.bias]
Loading weights:  68%|██████▊   | 232/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.16.mlp.fc1.bias]
Loading weights:  68%|██████▊   | 233/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.16.mlp.fc1.weight]
Loading weights:  68%|██████▊   | 233/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.16.mlp.fc1.weight]
Loading weights:  69%|██████▊   | 234/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.16.mlp.fc2.bias]
Loading weights:  69%|██████▊   | 234/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.16.mlp.fc2.bias]
Loading weights:  69%|██████▉   | 235/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.16.mlp.fc2.weight]
Loading weights:  69%|██████▉   | 235/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.16.mlp.fc2.weight]
Loading weights:  69%|██████▉   | 236/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.16.self_attn.dense.bias]
Loading weights:  69%|██████▉   | 236/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.16.self_attn.dense.bias]
Loading weights:  70%|██████▉   | 237/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.16.self_attn.dense.weight]
Loading weights:  70%|██████▉   | 237/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.16.self_attn.dense.weight]
Loading weights:  70%|██████▉   | 238/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.16.self_attn.k_proj.bias]
Loading weights:  70%|██████▉   | 238/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.16.self_attn.k_proj.bias]
Loading weights:  70%|███████   | 239/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.16.self_attn.k_proj.weight]
Loading weights:  70%|███████   | 239/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.16.self_attn.k_proj.weight]
Loading weights:  70%|███████   | 240/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.16.self_attn.q_proj.bias]
Loading weights:  70%|███████   | 240/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.16.self_attn.q_proj.bias]
Loading weights:  71%|███████   | 241/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.16.self_attn.q_proj.weight]
Loading weights:  71%|███████   | 241/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.16.self_attn.q_proj.weight]
Loading weights:  71%|███████   | 242/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.16.self_attn.v_proj.bias]
Loading weights:  71%|███████   | 242/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.16.self_attn.v_proj.bias]
Loading weights:  71%|███████▏  | 243/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.16.self_attn.v_proj.weight]
Loading weights:  71%|███████▏  | 243/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.16.self_attn.v_proj.weight]
Loading weights:  72%|███████▏  | 244/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.17.input_layernorm.bias]
Loading weights:  72%|███████▏  | 244/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.17.input_layernorm.bias]
Loading weights:  72%|███████▏  | 245/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.17.input_layernorm.weight]
Loading weights:  72%|███████▏  | 245/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.17.input_layernorm.weight]
Loading weights:  72%|███████▏  | 246/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.17.mlp.fc1.bias]
Loading weights:  72%|███████▏  | 246/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.17.mlp.fc1.bias]
Loading weights:  72%|███████▏  | 247/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.17.mlp.fc1.weight]
Loading weights:  72%|███████▏  | 247/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.17.mlp.fc1.weight]
Loading weights:  73%|███████▎  | 248/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.17.mlp.fc2.bias]
Loading weights:  73%|███████▎  | 248/341 [00:00<00:00, 619.34it/s, Materializing param=model.layers.17.mlp.fc2.bias]
Loading weights:  73%|███████▎  | 249/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.17.mlp.fc2.bias]
Loading weights:  73%|███████▎  | 249/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.17.mlp.fc2.weight]
Loading weights:  73%|███████▎  | 249/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.17.mlp.fc2.weight]
Loading weights:  73%|███████▎  | 250/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.17.self_attn.dense.bias]
Loading weights:  73%|███████▎  | 250/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.17.self_attn.dense.bias]
Loading weights:  74%|███████▎  | 251/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.17.self_attn.dense.weight]
Loading weights:  74%|███████▎  | 251/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.17.self_attn.dense.weight]
Loading weights:  74%|███████▍  | 252/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.17.self_attn.k_proj.bias]
Loading weights:  74%|███████▍  | 252/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.17.self_attn.k_proj.bias]
Loading weights:  74%|███████▍  | 253/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.17.self_attn.k_proj.weight]
Loading weights:  74%|███████▍  | 253/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.17.self_attn.k_proj.weight]
Loading weights:  74%|███████▍  | 254/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.17.self_attn.q_proj.bias]
Loading weights:  74%|███████▍  | 254/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.17.self_attn.q_proj.bias]
Loading weights:  75%|███████▍  | 255/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.17.self_attn.q_proj.weight]
Loading weights:  75%|███████▍  | 255/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.17.self_attn.q_proj.weight]
Loading weights:  75%|███████▌  | 256/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.17.self_attn.v_proj.bias]
Loading weights:  75%|███████▌  | 256/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.17.self_attn.v_proj.bias]
Loading weights:  75%|███████▌  | 257/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.17.self_attn.v_proj.weight]
Loading weights:  75%|███████▌  | 257/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.17.self_attn.v_proj.weight]
Loading weights:  76%|███████▌  | 258/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.18.input_layernorm.bias]
Loading weights:  76%|███████▌  | 258/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.18.input_layernorm.bias]
Loading weights:  76%|███████▌  | 259/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.18.input_layernorm.weight]
Loading weights:  76%|███████▌  | 259/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.18.input_layernorm.weight]
Loading weights:  76%|███████▌  | 260/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.18.mlp.fc1.bias]
Loading weights:  76%|███████▌  | 260/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.18.mlp.fc1.bias]
Loading weights:  77%|███████▋  | 261/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.18.mlp.fc1.weight]
Loading weights:  77%|███████▋  | 261/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.18.mlp.fc1.weight]
Loading weights:  77%|███████▋  | 262/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.18.mlp.fc2.bias]
Loading weights:  77%|███████▋  | 262/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.18.mlp.fc2.bias]
Loading weights:  77%|███████▋  | 263/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.18.mlp.fc2.weight]
Loading weights:  77%|███████▋  | 263/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.18.mlp.fc2.weight]
Loading weights:  77%|███████▋  | 264/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.18.self_attn.dense.bias]
Loading weights:  77%|███████▋  | 264/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.18.self_attn.dense.bias]
Loading weights:  78%|███████▊  | 265/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.18.self_attn.dense.weight]
Loading weights:  78%|███████▊  | 265/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.18.self_attn.dense.weight]
Loading weights:  78%|███████▊  | 266/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.18.self_attn.k_proj.bias]
Loading weights:  78%|███████▊  | 266/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.18.self_attn.k_proj.bias]
Loading weights:  78%|███████▊  | 267/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.18.self_attn.k_proj.weight]
Loading weights:  78%|███████▊  | 267/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.18.self_attn.k_proj.weight]
Loading weights:  79%|███████▊  | 268/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.18.self_attn.q_proj.bias]
Loading weights:  79%|███████▊  | 268/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.18.self_attn.q_proj.bias]
Loading weights:  79%|███████▉  | 269/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.18.self_attn.q_proj.weight]
Loading weights:  79%|███████▉  | 269/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.18.self_attn.q_proj.weight]
Loading weights:  79%|███████▉  | 270/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.18.self_attn.v_proj.bias]
Loading weights:  79%|███████▉  | 270/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.18.self_attn.v_proj.bias]
Loading weights:  79%|███████▉  | 271/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.18.self_attn.v_proj.weight]
Loading weights:  79%|███████▉  | 271/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.18.self_attn.v_proj.weight]
Loading weights:  80%|███████▉  | 272/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.19.input_layernorm.bias]
Loading weights:  80%|███████▉  | 272/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.19.input_layernorm.bias]
Loading weights:  80%|████████  | 273/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.19.input_layernorm.weight]
Loading weights:  80%|████████  | 273/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.19.input_layernorm.weight]
Loading weights:  80%|████████  | 274/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.19.mlp.fc1.bias]
Loading weights:  80%|████████  | 274/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.19.mlp.fc1.bias]
Loading weights:  81%|████████  | 275/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.19.mlp.fc1.weight]
Loading weights:  81%|████████  | 275/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.19.mlp.fc1.weight]
Loading weights:  81%|████████  | 276/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.19.mlp.fc2.bias]
Loading weights:  81%|████████  | 276/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.19.mlp.fc2.bias]
Loading weights:  81%|████████  | 277/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.19.mlp.fc2.weight]
Loading weights:  81%|████████  | 277/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.19.mlp.fc2.weight]
Loading weights:  82%|████████▏ | 278/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.19.self_attn.dense.bias]
Loading weights:  82%|████████▏ | 278/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.19.self_attn.dense.bias]
Loading weights:  82%|████████▏ | 279/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.19.self_attn.dense.weight]
Loading weights:  82%|████████▏ | 279/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.19.self_attn.dense.weight]
Loading weights:  82%|████████▏ | 280/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.19.self_attn.k_proj.bias]
Loading weights:  82%|████████▏ | 280/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.19.self_attn.k_proj.bias]
Loading weights:  82%|████████▏ | 281/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.19.self_attn.k_proj.weight]
Loading weights:  82%|████████▏ | 281/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.19.self_attn.k_proj.weight]
Loading weights:  83%|████████▎ | 282/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.19.self_attn.q_proj.bias]
Loading weights:  83%|████████▎ | 282/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.19.self_attn.q_proj.bias]
Loading weights:  83%|████████▎ | 283/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.19.self_attn.q_proj.weight]
Loading weights:  83%|████████▎ | 283/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.19.self_attn.q_proj.weight]
Loading weights:  83%|████████▎ | 284/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.19.self_attn.v_proj.bias]
Loading weights:  83%|████████▎ | 284/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.19.self_attn.v_proj.bias]
Loading weights:  84%|████████▎ | 285/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.19.self_attn.v_proj.weight]
Loading weights:  84%|████████▎ | 285/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.19.self_attn.v_proj.weight]
Loading weights:  84%|████████▍ | 286/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.20.input_layernorm.bias]
Loading weights:  84%|████████▍ | 286/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.20.input_layernorm.bias]
Loading weights:  84%|████████▍ | 287/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.20.input_layernorm.weight]
Loading weights:  84%|████████▍ | 287/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.20.input_layernorm.weight]
Loading weights:  84%|████████▍ | 288/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.20.mlp.fc1.bias]
Loading weights:  84%|████████▍ | 288/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.20.mlp.fc1.bias]
Loading weights:  85%|████████▍ | 289/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.20.mlp.fc1.weight]
Loading weights:  85%|████████▍ | 289/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.20.mlp.fc1.weight]
Loading weights:  85%|████████▌ | 290/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.20.mlp.fc2.bias]
Loading weights:  85%|████████▌ | 290/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.20.mlp.fc2.bias]
Loading weights:  85%|████████▌ | 291/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.20.mlp.fc2.weight]
Loading weights:  85%|████████▌ | 291/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.20.mlp.fc2.weight]
Loading weights:  86%|████████▌ | 292/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.20.self_attn.dense.bias]
Loading weights:  86%|████████▌ | 292/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.20.self_attn.dense.bias]
Loading weights:  86%|████████▌ | 293/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.20.self_attn.dense.weight]
Loading weights:  86%|████████▌ | 293/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.20.self_attn.dense.weight]
Loading weights:  86%|████████▌ | 294/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.20.self_attn.k_proj.bias]
Loading weights:  86%|████████▌ | 294/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.20.self_attn.k_proj.bias]
Loading weights:  87%|████████▋ | 295/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.20.self_attn.k_proj.weight]
Loading weights:  87%|████████▋ | 295/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.20.self_attn.k_proj.weight]
Loading weights:  87%|████████▋ | 296/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.20.self_attn.q_proj.bias]
Loading weights:  87%|████████▋ | 296/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.20.self_attn.q_proj.bias]
Loading weights:  87%|████████▋ | 297/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.20.self_attn.q_proj.weight]
Loading weights:  87%|████████▋ | 297/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.20.self_attn.q_proj.weight]
Loading weights:  87%|████████▋ | 298/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.20.self_attn.v_proj.bias]
Loading weights:  87%|████████▋ | 298/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.20.self_attn.v_proj.bias]
Loading weights:  88%|████████▊ | 299/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.20.self_attn.v_proj.weight]
Loading weights:  88%|████████▊ | 299/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.20.self_attn.v_proj.weight]
Loading weights:  88%|████████▊ | 300/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.21.input_layernorm.bias]
Loading weights:  88%|████████▊ | 300/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.21.input_layernorm.bias]
Loading weights:  88%|████████▊ | 301/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.21.input_layernorm.weight]
Loading weights:  88%|████████▊ | 301/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.21.input_layernorm.weight]
Loading weights:  89%|████████▊ | 302/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.21.mlp.fc1.bias]
Loading weights:  89%|████████▊ | 302/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.21.mlp.fc1.bias]
Loading weights:  89%|████████▉ | 303/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.21.mlp.fc1.weight]
Loading weights:  89%|████████▉ | 303/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.21.mlp.fc1.weight]
Loading weights:  89%|████████▉ | 304/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.21.mlp.fc2.bias]
Loading weights:  89%|████████▉ | 304/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.21.mlp.fc2.bias]
Loading weights:  89%|████████▉ | 305/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.21.mlp.fc2.weight]
Loading weights:  89%|████████▉ | 305/341 [00:00<00:00, 557.23it/s, Materializing param=model.layers.21.mlp.fc2.weight]
Loading weights:  90%|████████▉ | 306/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.21.mlp.fc2.weight]
Loading weights:  90%|████████▉ | 306/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.21.self_attn.dense.bias]
Loading weights:  90%|████████▉ | 306/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.21.self_attn.dense.bias]
Loading weights:  90%|█████████ | 307/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.21.self_attn.dense.weight]
Loading weights:  90%|█████████ | 307/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.21.self_attn.dense.weight]
Loading weights:  90%|█████████ | 308/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.21.self_attn.k_proj.bias]
Loading weights:  90%|█████████ | 308/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.21.self_attn.k_proj.bias]
Loading weights:  91%|█████████ | 309/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.21.self_attn.k_proj.weight]
Loading weights:  91%|█████████ | 309/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.21.self_attn.k_proj.weight]
Loading weights:  91%|█████████ | 310/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.21.self_attn.q_proj.bias]
Loading weights:  91%|█████████ | 310/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.21.self_attn.q_proj.bias]
Loading weights:  91%|█████████ | 311/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.21.self_attn.q_proj.weight]
Loading weights:  91%|█████████ | 311/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.21.self_attn.q_proj.weight]
Loading weights:  91%|█████████▏| 312/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.21.self_attn.v_proj.bias]
Loading weights:  91%|█████████▏| 312/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.21.self_attn.v_proj.bias]
Loading weights:  92%|█████████▏| 313/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.21.self_attn.v_proj.weight]
Loading weights:  92%|█████████▏| 313/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.21.self_attn.v_proj.weight]
Loading weights:  92%|█████████▏| 314/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.22.input_layernorm.bias]
Loading weights:  92%|█████████▏| 314/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.22.input_layernorm.bias]
Loading weights:  92%|█████████▏| 315/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.22.input_layernorm.weight]
Loading weights:  92%|█████████▏| 315/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.22.input_layernorm.weight]
Loading weights:  93%|█████████▎| 316/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.22.mlp.fc1.bias]
Loading weights:  93%|█████████▎| 316/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.22.mlp.fc1.bias]
Loading weights:  93%|█████████▎| 317/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.22.mlp.fc1.weight]
Loading weights:  93%|█████████▎| 317/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.22.mlp.fc1.weight]
Loading weights:  93%|█████████▎| 318/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.22.mlp.fc2.bias]
Loading weights:  93%|█████████▎| 318/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.22.mlp.fc2.bias]
Loading weights:  94%|█████████▎| 319/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.22.mlp.fc2.weight]
Loading weights:  94%|█████████▎| 319/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.22.mlp.fc2.weight]
Loading weights:  94%|█████████▍| 320/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.22.self_attn.dense.bias]
Loading weights:  94%|█████████▍| 320/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.22.self_attn.dense.bias]
Loading weights:  94%|█████████▍| 321/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.22.self_attn.dense.weight]
Loading weights:  94%|█████████▍| 321/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.22.self_attn.dense.weight]
Loading weights:  94%|█████████▍| 322/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.22.self_attn.k_proj.bias]
Loading weights:  94%|█████████▍| 322/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.22.self_attn.k_proj.bias]
Loading weights:  95%|█████████▍| 323/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.22.self_attn.k_proj.weight]
Loading weights:  95%|█████████▍| 323/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.22.self_attn.k_proj.weight]
Loading weights:  95%|█████████▌| 324/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.22.self_attn.q_proj.bias]
Loading weights:  95%|█████████▌| 324/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.22.self_attn.q_proj.bias]
Loading weights:  95%|█████████▌| 325/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.22.self_attn.q_proj.weight]
Loading weights:  95%|█████████▌| 325/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.22.self_attn.q_proj.weight]
Loading weights:  96%|█████████▌| 326/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.22.self_attn.v_proj.bias]
Loading weights:  96%|█████████▌| 326/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.22.self_attn.v_proj.bias]
Loading weights:  96%|█████████▌| 327/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.22.self_attn.v_proj.weight]
Loading weights:  96%|█████████▌| 327/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.22.self_attn.v_proj.weight]
Loading weights:  96%|█████████▌| 328/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.23.input_layernorm.bias]
Loading weights:  96%|█████████▌| 328/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.23.input_layernorm.bias]
Loading weights:  96%|█████████▋| 329/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.23.input_layernorm.weight]
Loading weights:  96%|█████████▋| 329/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.23.input_layernorm.weight]
Loading weights:  97%|█████████▋| 330/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.23.mlp.fc1.bias]
Loading weights:  97%|█████████▋| 330/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.23.mlp.fc1.bias]
Loading weights:  97%|█████████▋| 331/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.23.mlp.fc1.weight]
Loading weights:  97%|█████████▋| 331/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.23.mlp.fc1.weight]
Loading weights:  97%|█████████▋| 332/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.23.mlp.fc2.bias]
Loading weights:  97%|█████████▋| 332/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.23.mlp.fc2.bias]
Loading weights:  98%|█████████▊| 333/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.23.mlp.fc2.weight]
Loading weights:  98%|█████████▊| 333/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.23.mlp.fc2.weight]
Loading weights:  98%|█████████▊| 334/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.23.self_attn.dense.bias]
Loading weights:  98%|█████████▊| 334/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.23.self_attn.dense.bias]
Loading weights:  98%|█████████▊| 335/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.23.self_attn.dense.weight]
Loading weights:  98%|█████████▊| 335/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.23.self_attn.dense.weight]
Loading weights:  99%|█████████▊| 336/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.23.self_attn.k_proj.bias]
Loading weights:  99%|█████████▊| 336/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.23.self_attn.k_proj.bias]
Loading weights:  99%|█████████▉| 337/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.23.self_attn.k_proj.weight]
Loading weights:  99%|█████████▉| 337/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.23.self_attn.k_proj.weight]
Loading weights:  99%|█████████▉| 338/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.23.self_attn.q_proj.bias]
Loading weights:  99%|█████████▉| 338/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.23.self_attn.q_proj.bias]
Loading weights:  99%|█████████▉| 339/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.23.self_attn.q_proj.weight]
Loading weights:  99%|█████████▉| 339/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.23.self_attn.q_proj.weight]
Loading weights: 100%|█████████▉| 340/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.23.self_attn.v_proj.bias]
Loading weights: 100%|█████████▉| 340/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.23.self_attn.v_proj.bias]
Loading weights: 100%|██████████| 341/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.23.self_attn.v_proj.weight]
Loading weights: 100%|██████████| 341/341 [00:00<00:00, 557.86it/s, Materializing param=model.layers.23.self_attn.v_proj.weight]
Loading weights: 100%|██████████| 341/341 [00:00<00:00, 589.26it/s, Materializing param=model.layers.23.self_attn.v_proj.weight]
-- done.
-- tokenize the prompt...
-- done.
-- compute the answer...
-- done in 3.7324481080013356
output shape: T7s1x123[7,50285:A10138.878048780489]
-- decode the answer...
-- done.
def print_prime(n):
   """
   Print all primes between 1 and n
   """
   primes = []
   for num in range(2, n+1):
       is_prime = True
       for i in range(2, int(math.sqrt(num))+1):
           if num % i == 0:
               is_prime = False
               break
       if is_prime:
           primes.append(num)
   print(primes)

print_prime(20)
``

eos_token_id?

This token means the end of the answer.

# Display the tokenizer's end-of-sequence token id.
print("eos_token_id=", tokenizer.eos_token_id)
eos_token_id= 50256

Custom method generate

Let’s implement a simple function replicating what method generate does.

def simple_generate_with_cache(
    model, input_ids: torch.Tensor, eos_token_id: int, max_new_tokens: int = 100
) -> torch.Tensor:
    """
    Greedy generation relying on the key/value cache returned by the model.

    The prompt is processed once (prefill), then the model is called with
    only the newly produced token and the cache of the previous call (decode).

    :param model: callable accepting ``input_ids``, ``use_cache`` and
        ``past_key_values`` and returning an object exposing ``logits``
        and ``past_key_values``
    :param input_ids: prompt tokens, the batch size must be 1 because
        the end of the answer is detected with ``Tensor.item()``
    :param eos_token_id: generation stops as soon as this token is predicted,
        it is not appended to the result
    :param max_new_tokens: maximum number of tokens appended to the prompt
    :return: the prompt followed by the generated tokens
    """
    # Inference only: no need to record the autograd graph.
    with torch.no_grad():
        # First call: prefill
        outputs = model(input_ids, use_cache=True)
        next_token_id = None

        # Next calls: decode
        for _ in tqdm(range(max_new_tokens)):
            if next_token_id is not None:
                # Feed only the new token, but with the cache.
                # Decoding at the beginning of the iteration avoids a useless
                # forward pass once the last allowed token has been appended.
                outputs = model(
                    next_token_id,
                    use_cache=True,
                    past_key_values=outputs.past_key_values,
                )
            next_token_logits = outputs.logits[:, -1, :]

            # The most probable next token is chosen.
            next_token_id = torch.argmax(next_token_logits, dim=-1, keepdim=True)
            # But we could select it using a multinomial law
            # <<< probs = torch.softmax(next_token_logits / temperature, dim=-1)
            # <<< top_probs, top_indices = torch.topk(probs, top_k)
            # <<< next_token_id = top_indices[torch.multinomial(top_probs, 1)]

            if next_token_id.item() == eos_token_id:
                break
            input_ids = torch.cat([input_ids, next_token_id], dim=-1)

    return input_ids


# Run the custom generation loop on the tokenized prompt and measure
# the elapsed wall-clock time with time.perf_counter.
print("-- compute the answer with custom generate...")
begin = time.perf_counter()
outputs = simple_generate_with_cache(
    model, inputs.input_ids, eos_token_id=tokenizer.eos_token_id, max_new_tokens=100
)
duration = time.perf_counter() - begin
print(f"-- done in {duration}")
# The measure is stored in list `data` under the name "custom".
data.append(dict(name="custom", duration=duration))

print("-- done.")
print("output shape:", string_type(outputs, with_shape=True, with_min_max=True))
# The token ids are converted back into text, only the first decoded
# sequence is kept and printed.
print("-- decode the answer...")
text = tokenizer.batch_decode(outputs)[0]
print("-- done.")
print(text)
-- compute the answer with custom generate...

  0%|          | 0/100 [00:00<?, ?it/s]
  3%|▎         | 3/100 [00:00<00:04, 23.91it/s]
  6%|▌         | 6/100 [00:00<00:04, 21.31it/s]
  9%|▉         | 9/100 [00:00<00:04, 22.62it/s]
 13%|█▎        | 13/100 [00:00<00:03, 26.25it/s]
 17%|█▋        | 17/100 [00:00<00:03, 27.44it/s]
 20%|██        | 20/100 [00:00<00:02, 27.83it/s]
 23%|██▎       | 23/100 [00:00<00:02, 27.16it/s]
 26%|██▌       | 26/100 [00:00<00:02, 26.55it/s]
 29%|██▉       | 29/100 [00:01<00:02, 26.51it/s]
 32%|███▏      | 32/100 [00:01<00:02, 26.63it/s]
 35%|███▌      | 35/100 [00:01<00:02, 27.14it/s]
 38%|███▊      | 38/100 [00:01<00:02, 27.51it/s]
 41%|████      | 41/100 [00:01<00:02, 27.74it/s]
 44%|████▍     | 44/100 [00:01<00:02, 26.78it/s]
 47%|████▋     | 47/100 [00:01<00:02, 24.61it/s]
 50%|█████     | 50/100 [00:01<00:02, 24.23it/s]
 53%|█████▎    | 53/100 [00:02<00:02, 23.04it/s]
 56%|█████▌    | 56/100 [00:02<00:01, 23.82it/s]
 59%|█████▉    | 59/100 [00:02<00:01, 24.48it/s]
 62%|██████▏   | 62/100 [00:02<00:01, 25.21it/s]
 65%|██████▌   | 65/100 [00:02<00:01, 25.60it/s]
 68%|██████▊   | 68/100 [00:02<00:01, 24.59it/s]
 71%|███████   | 71/100 [00:02<00:01, 22.96it/s]
 74%|███████▍  | 74/100 [00:02<00:01, 23.63it/s]
 77%|███████▋  | 77/100 [00:03<00:01, 22.25it/s]
 94%|█████████▍| 94/100 [00:03<00:00, 54.61it/s]
100%|██████████| 100/100 [00:03<00:00, 41.89it/s]
100%|██████████| 100/100 [00:03<00:00, 28.96it/s]
-- done in 4.518989952999618
-- done.
output shape: T7s1x123[7,50285:A10138.878048780489]
-- decode the answer...
-- done.
def print_prime(n):
   """
   Print all primes between 1 and n
   """
   primes = []
   for num in range(2, n+1):
       is_prime = True
       for i in range(2, int(math.sqrt(num))+1):
           if num % i == 0:
               is_prime = False
               break
       if is_prime:
           primes.append(num)
   print(primes)

print_prime(20)
``

Method generate for ONNX models

We first need to export the model into ONNX.

ONNX Conversion

# position_ids is not passed to the exported model: remove it from the inputs
# and from the dynamic shapes together so both structures stay aligned.
if "position_ids" in export_inputs:
    del export_inputs["position_ids"], export_shapes["position_ids"]

# Align the cache given to the exporter with the dtype reported for the model
# weights (to_any is expected to do the conversion -- see its documentation).
dtype = get_weight_type(model)
print("-- model dtype:", dtype)
export_inputs["past_key_values"] = to_any(export_inputs["past_key_values"], dtype)

# Passing "dynamo" on the command line selects the onnx-dynamo exporter,
# otherwise the custom exporter is used.
if "dynamo" in sys.argv:
    exporter = "onnx-dynamo"
else:
    exporter = "custom"
model_name = f"model_{model_id.replace('/', '-')}.{exporter}.onnx"

if not os.path.exists(model_name):
    # The conversion is slow: it only runs when no file from a previous run
    # is found under the same name.
    print("-- conversion to ONNX.")
    begin = time.perf_counter()
    with torch_export_patches(patch_transformers=True):
        to_onnx(
            model,
            (),
            kwargs=to_any(export_inputs, device),
            dynamic_shapes=export_shapes,
            filename=model_name,
            exporter=exporter,
            verbose=1,
        )
    duration = time.perf_counter() - begin
    print(f"-- done in {duration}")
-- model dtype: torch.float16
-- conversion to ONNX.
[to_onnx] build the graph module from <class 'transformers.models.phi.modeling_phi.PhiForCausalLM'>, type(args)=<class 'tuple'>
[to_onnx] dynamic_shapes={'input_ids': {0: 'batch', 1: 'seq_length'}, 'attention_mask': {0: 'batch', 1: 'cache+seq'}, 'past_key_values': [{0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}]}
[_make_builder_interpreter] export_options=ExportOptions(aten_as_function=('aten.index_copy.default', 'aten.index_put.default', 'aten.setitem', <built-in function setitem>))
[_make_builder_interpreter] input args=()
[_make_builder_interpreter] input kwargs=dict(input_ids:T7r2,attention_mask:T7r2,past_key_values:DynamicCache(key_cache=#24[T10r4,...], value_cache=#24[T10r4,...]))
[_make_builder_interpreter] dynamic_shapes={'input_ids': {0: 'batch', 1: 'seq_length'}, 'attention_mask': {0: 'batch', 1: 'cache+seq'}, 'past_key_values': [{0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}]}
[_make_builder_interpreter] same_signature=True, tracing_mode=symbolic
[ExportOptions.export] ExportOptions(aten_as_function=('aten.index_copy.default', 'aten.index_put.default', 'aten.setitem', <built-in function setitem>)) - torch._dynamo.export 'PhiForCausalLM'
[ExportOptions.export] aten_as_function=('aten.index_copy.default', 'aten.index_put.default', 'aten.setitem', <built-in function setitem>)
[ExportOptions.export] torch_export strict=False, verbose=1
[ExportOptions.export] dynamic_shapes={'input_ids': {0: 'batch', 1: 'seq_length'}, 'attention_mask': {0: 'batch', 1: 'cache+seq'}, 'past_key_values': [{0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}]}
[ExportOptions.export] args=()
[ExportOptions.export] kwargs=dict(input_ids:T7r2,attention_mask:T7r2,past_key_values:DynamicCache(key_cache=#24[T10r4,...], value_cache=#24[T10r4,...]))
[ExportOptions.export] export start with strict=False...
[ExportOptions.export] export with backed_size_oblivious=auto
[torch_export] backed_size_oblivious='auto'
[torch_export] inferred backed_size_oblivious=None
[torch_export] export starts with backed_size_oblivious=None
[ExportOptions.export] export done in 11.546899875000236
[ExportOptions.export] post_process_exported_program with decomposition_table=None
[ExportOptions.export] remove inplace nodes
[ExportOptions.export] slices: 3 slices nodes were removed
[CustomTracer.remove_inplace] starts with 1891 nodes (n_inplace_submobules=0)
[CustomTracer.remove_inplace] S1: 80 inplace nodes
[CustomTracer.remove_inplace] S2: 74 inplace nodes and 100 iterations
[CustomTracer.remove_inplace] end with 95 iterations and 1706 nodes (n_inplace=74)
[ExportOptions.export] inplaces: 80 inplaced nodes were removed
[ExportOptions.export] done remove inplace in 0.045854885000153445, modified=80
[ExportOptions.export] done with no decomposition in 0.046567682999011595
[to_onnx] graph module done in 11.606618785999672 s
[to_onnx] start creating the onnx nodes
[to_onnx] interpreter.function_options=FunctionOptions(export_as_function=True, name='*', domain='*', external_threshold=256, move_initializer_to_constant=True, return_initializer=True, merge_allowed=True, rename_allowed=True)

  0%|          | 0/1706 [00:00<?, ?it/s]
 26%|██▌       | 443/1706 [00:00<00:00, 4422.22it/s]
 52%|█████▏    | 886/1706 [00:00<00:00, 1750.22it/s]
 67%|██████▋   | 1141/1706 [00:00<00:00, 1482.28it/s]
 78%|███████▊  | 1331/1706 [00:00<00:00, 1350.91it/s]
 87%|████████▋ | 1489/1706 [00:01<00:00, 1257.91it/s]
 95%|█████████▌| 1627/1706 [00:01<00:00, 1217.29it/s]
100%|██████████| 1706/1706 [00:01<00:00, 1364.36it/s]
[to_onnx] 2308 onnx nodes done in 1.4222290480010997 s
[to_onnx] start conversion to onnx (before optimization) mask_outputs=[True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True]
[GraphBuilder-LOA.inline_functions] begin inlining graph
[GraphBuilder-LOA.inline_functions] skip_functions=set()
[GraphBuilder-LOA._inline_functions_iterations] inline function 'submod_3' domain 'local_functions' [n_replacements=1]
[GraphBuilder-LOA._inline_functions_iterations] done with 9 new nodes for 'submod_3', 'local_functions'
[GraphBuilder-LOA.inline_functions] done inlining graph 140708725480128 in 0.032655167999109835
[GraphBuilder-LOA._add_shape_information] dynamic shapes replacements={'seq_length': 'seq_length', 'cache_length': 'cache_length', 'batch': 'batch', 'batch^s3^batch^s41': 'batch', 's77': 'batch', 's67': 'batch', 's90': 'batch', 's89': 'batch', 's104': 'batch', 's47': 'batch', 's62': 'batch', 'batch^s49^batch^s26': 'batch', 's83': 'batch', 's26': 'batch', 'batch^s87^batch^s23': 'batch', 'batch^s82^batch^s62': 'batch', 's36': 'batch', 's59': 'batch', 's64': 'batch', 's102': 'batch', 's3': 'batch', 's75': 'batch', 's10': 'batch', 'batch^s52^batch^s93': 'batch', 's45': 'batch', 's106': 'batch', 's56': 'batch', 's57': 'batch', 's41': 'batch', 'batch^s104^batch^s106': 'batch', 'batch^s36^batch^s13': 'batch', 's93': 'batch', 's13': 'batch', 's97': 'batch', 's91': 'batch', 's71': 'batch', 'batch^s35^batch^s60': 'batch', 'batch^s92^batch^s83': 'batch', 'batch^s64^batch^s86': 'batch', 's82': 'batch', 's86': 'batch', 's98': 'batch', 'batch^s48^batch^s59': 'batch', 's34': 'batch', 's35': 'batch', 'batch^s34^batch^s77': 'batch', 's84': 'batch', 's8': 'batch', 's79': 'batch', 'batch^s98^batch^s79': 'batch', 's29': 'batch', 'batch^s29^batch^s8': 'batch', 'batch^s90^batch^s57': 'batch', 's60': 'batch', 's52': 'batch', 'batch^s97^batch^s10': 'batch', 's30': 'batch', 'batch^s84^batch^s91': 'batch', 's39': 'batch', 's69': 'batch', 'batch^s67^batch^s61': 'batch', 'batch^s39^batch^s71': 'batch', 's61': 'batch', 's23': 'batch', 's49': 'batch', 'batch^s69^batch^s56': 'batch', 'batch^s30^batch^s89': 'batch', 's87': 'batch', 's72': 'batch', 's1': 'batch', 's92': 'batch', 'batch^s100^batch^s102': 'batch', 's48': 'batch', 'batch^s45^batch^s47': 'batch', 'batch^s1^batch^s75': 'batch', 's43': 'batch', 's100': 'batch', 's70': 'seq_length', 's9': 'cache_length', 's11': 'cache_length', 's40': 'cache_length', 's24': 'cache_length', 's4': 'cache_length', 's44': 'cache_length', 's78': 'cache_length', 's51': 'cache_length', 's88': 'cache_length', 's27': 'cache_length', 's81': 'cache_length', 's31': 
'cache_length', 's18': 'cache_length', 's74': 'cache_length', 's94': 'cache_length', 's38': 'cache_length', 's96': 'cache_length', 's33': 'cache_length', 's63': 'cache_length', 's73': 'cache_length', 's80': 'cache_length', 's42': 'cache_length', 's21': 'cache_length', 's7': 'cache_length', 's15': 'cache_length', 's85': 'cache_length', 's65': 'cache_length', 's14': 'cache_length', 's32': 'cache_length', 's46': 'cache_length', 's105': 'cache_length', 's58': 'cache_length', 's99': 'cache_length', 's103': 'cache_length', 's66': 'cache_length', 's107': 'cache_length', 's76': 'cache_length', 's37': 'cache_length', 's2': 'cache_length', 's28': 'cache_length', 's101': 'cache_length', 's54': 'cache_length', 's95': 'cache_length', 's68': 'cache_length', 's22': 'cache_length', 's55': 'cache_length', 's50': 'cache_length', 's25': 'cache_length'}
[GraphBuilder-LOA.optimize] start with 2316 nodes
[GraphBuilder-LOA.optimize] #patterns=111
[GraphBuilder-LOA.optimize] start with subgraphs
[GraphBuilder-LOA.optimize] done with subgraphs
[GraphBuilderPatternOptimization-LOA.optimize] start with 1987 nodes, 461 initializers, 111 patterns, priorities=[0, 1, 2, 3], max_iter=7948
[GraphBuilderPatternOptimization-LOA.optimize] same children={'SameChildrenFromInputPattern', 'SameChildrenPattern'}
[GraphBuilderPatternOptimization-LOA.optimize] iteration 0: 1987 nodes, priority=0
[GraphBuilderPatternOptimization-LOA.optimize] applies 226 matches, 75*CastPattern, 2*IdentityPattern, 3*ShapeBasedReshapeIsSqueezePattern, 96*ShapeBasedEditDistanceReshapePattern, 18*ShapeBasedIdentityPattern, 5*SameChildrenPattern, 1*SqueezeAddPattern, 1*SqueezeUnsqueezePattern, 1*UnsqueezeUnsqueezePattern, 24*FunctionAttentionPattern - time=0.150 | max_time=IdentityPattern:0.048
[GraphBuilderPatternOptimization-LOA.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-LOA.optimize] n_added=204, n_removed=259, n_applied=276 applied patterns, 1595 nodes left with 23 iterations
[GraphBuilderPatternOptimization-LOA.optimize] increase priority to 1
[GraphBuilderPatternOptimization-LOA.optimize] iteration 1: 1595 nodes, priority=1
[GraphBuilderPatternOptimization-LOA.optimize] applies 202 matches, 2*ConcatTwiceUnaryPattern, 1*ConstantToInitializerPattern, 49*DropoutPattern, 25*LayerNormalizationPattern, 1*ShapeBasedExpandBroadcastPattern, 1*ShapeBasedExpandSwapPattern, 96*SlicesSplitPattern, 3*SqueezeUnsqueezePattern, 24*GeluOrtPattern - time=0.210 | max_time=IdentityPattern:0.018
[GraphBuilderPatternOptimization-LOA.optimize] iteration 2: 1127 nodes, priority=1
[GraphBuilderPatternOptimization-LOA.optimize] applies 101 matches, 2*ConcatTwiceUnaryPattern, 25*LayerNormalizationScalePattern, 2*ShapeBasedExpandSwapPattern, 48*FunctionHalfRotaryEmbeddingPattern, 24*FastGeluPattern - time=0.142 | max_time=IdentityPattern:0.010
[GraphBuilderPatternOptimization-LOA.optimize] iteration 3: 911 nodes, priority=1
[GraphBuilderPatternOptimization-LOA.optimize] applies 26 matches, 1*ShapeBasedExpandBroadcastPattern, 1*FunctionCausalMaskPattern, 24*SkipLayerNormalizationPattern - time=0.106 | max_time=IdentityPattern:0.016
[GraphBuilderPatternOptimization-LOA.optimize] iteration 4: 885 nodes, priority=1
[GraphBuilderPatternOptimization-LOA.optimize] applies 2 matches, 1*ShapeBasedConcatExpandPattern, 1*FunctionCausalMaskMulAddPattern - time=0.118 | max_time=IdentityPattern:0.015
[GraphBuilderPatternOptimization-LOA.optimize] iteration 5: 879 nodes, priority=1
[GraphBuilderPatternOptimization-LOA.optimize] applies 1 matches, [0]=MatchResult: FunctionCosSinCachePattern replaces ['Squeeze', 'Squeeze', 'Range', 'Unsqueeze', 'Cast', 'Reshape', 'Mul', 'Cos', 'Cast', 'Sin', 'Cast'] - time=0.086 | max_time=ShapeBasedEditDistanceReshapePattern:0.007
[GraphBuilderPatternOptimization-LOA.optimize] iteration 6: 869 nodes, priority=1
[GraphBuilderPatternOptimization-LOA.optimize] increase priority to 2
[GraphBuilderPatternOptimization-LOA.optimize] iteration 7: 869 nodes, priority=2
[GraphBuilderPatternOptimization-LOA.optimize] applies 1 matches, [0]=MatchResult: ContribRotaryEmbeddingPattern replaces ['Concat', 'Concat', 'Split', 'HalfRotaryEmbedding', 'Concat'] - time=0.111 | max_time=IdentityPattern:0.013
[GraphBuilderPatternOptimization-LOA.optimize] iteration 8: 874 nodes, priority=2
[GraphBuilderPatternOptimization-LOA.optimize] applies 3 matches, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern - time=0.106 | max_time=IdentityPattern:0.012
[GraphBuilderPatternOptimization-LOA.optimize] iteration 9: 878 nodes, priority=2
[GraphBuilderPatternOptimization-LOA.optimize] applies 6 matches, 2*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.092 | max_time=ShapeBasedEditDistanceReshapePattern:0.007
[GraphBuilderPatternOptimization-LOA.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-LOA.optimize] n_added=19, n_removed=26, n_applied=624 applied patterns, 876 nodes left with 3 iterations
[GraphBuilderPatternOptimization-LOA.optimize] increase priority to 3
[GraphBuilderPatternOptimization-LOA.optimize] iteration 10: 876 nodes, priority=3
[GraphBuilderPatternOptimization-LOA.optimize] applies 5 matches, 1*ShapeBasedEditDistanceReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.110 | max_time=IdentityPattern:0.013
[GraphBuilderPatternOptimization-LOA.optimize] iteration 11: 882 nodes, priority=3
[GraphBuilderPatternOptimization-LOA.optimize] applies 9 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ReshapeReshapePattern, 3*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.107 | max_time=IdentityPattern:0.011
[GraphBuilderPatternOptimization-LOA.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-LOA.optimize] n_added=21, n_removed=29, n_applied=645 applied patterns, 875 nodes left with 4 iterations
[GraphBuilderPatternOptimization-LOA.optimize] increase priority to 3
[GraphBuilderPatternOptimization-LOA.optimize] iteration 12: 875 nodes, priority=3
[GraphBuilderPatternOptimization-LOA.optimize] applies 8 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.117 | max_time=IdentityPattern:0.016
[GraphBuilderPatternOptimization-LOA.optimize] iteration 13: 878 nodes, priority=3
[GraphBuilderPatternOptimization-LOA.optimize] applies 14 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 3*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.112 | max_time=IdentityPattern:0.013
[GraphBuilderPatternOptimization-LOA.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-LOA.optimize] n_added=21, n_removed=29, n_applied=674 applied patterns, 866 nodes left with 4 iterations
[GraphBuilderPatternOptimization-LOA.optimize] increase priority to 3
[GraphBuilderPatternOptimization-LOA.optimize] iteration 14: 866 nodes, priority=3
[GraphBuilderPatternOptimization-LOA.optimize] applies 12 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.113 | max_time=IdentityPattern:0.016
[GraphBuilderPatternOptimization-LOA.optimize] iteration 15: 865 nodes, priority=3
[GraphBuilderPatternOptimization-LOA.optimize] applies 15 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 4*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.113 | max_time=IdentityPattern:0.011
[GraphBuilderPatternOptimization-LOA.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-LOA.optimize] n_added=21, n_removed=29, n_applied=708 applied patterns, 852 nodes left with 4 iterations
[GraphBuilderPatternOptimization-LOA.optimize] increase priority to 3
[GraphBuilderPatternOptimization-LOA.optimize] iteration 16: 852 nodes, priority=3
[GraphBuilderPatternOptimization-LOA.optimize] applies 12 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.121 | max_time=IdentityPattern:0.014
[GraphBuilderPatternOptimization-LOA.optimize] iteration 17: 851 nodes, priority=3
[GraphBuilderPatternOptimization-LOA.optimize] applies 15 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 4*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.125 | max_time=ShapeBasedEditDistanceReshapePattern:0.010
[GraphBuilderPatternOptimization-LOA.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-LOA.optimize] n_added=21, n_removed=29, n_applied=742 applied patterns, 838 nodes left with 4 iterations
[GraphBuilderPatternOptimization-LOA.optimize] increase priority to 3
[GraphBuilderPatternOptimization-LOA.optimize] iteration 18: 838 nodes, priority=3
[GraphBuilderPatternOptimization-LOA.optimize] applies 12 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.116 | max_time=IdentityPattern:0.010
[GraphBuilderPatternOptimization-LOA.optimize] iteration 19: 837 nodes, priority=3
[GraphBuilderPatternOptimization-LOA.optimize] applies 15 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 4*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.148 | max_time=ShapeBasedEditDistanceReshapePattern:0.012
[GraphBuilderPatternOptimization-LOA.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-LOA.optimize] n_added=21, n_removed=29, n_applied=776 applied patterns, 824 nodes left with 4 iterations
[GraphBuilderPatternOptimization-LOA.optimize] increase priority to 3
[GraphBuilderPatternOptimization-LOA.optimize] iteration 20: 824 nodes, priority=3
[GraphBuilderPatternOptimization-LOA.optimize] applies 12 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.108 | max_time=IdentityPattern:0.008
[GraphBuilderPatternOptimization-LOA.optimize] iteration 21: 823 nodes, priority=3
[GraphBuilderPatternOptimization-LOA.optimize] applies 15 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 4*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.109 | max_time=IdentityPattern:0.014
[GraphBuilderPatternOptimization-LOA.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-LOA.optimize] n_added=21, n_removed=29, n_applied=810 applied patterns, 810 nodes left with 4 iterations
[GraphBuilderPatternOptimization-LOA.optimize] increase priority to 3
[GraphBuilderPatternOptimization-LOA.optimize] iteration 22: 810 nodes, priority=3
[GraphBuilderPatternOptimization-LOA.optimize] applies 12 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.105 | max_time=IdentityPattern:0.008
[GraphBuilderPatternOptimization-LOA.optimize] iteration 23: 809 nodes, priority=3
[GraphBuilderPatternOptimization-LOA.optimize] applies 15 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 4*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.093 | max_time=IdentityPattern:0.009
[GraphBuilderPatternOptimization-LOA.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-LOA.optimize] n_added=21, n_removed=29, n_applied=844 applied patterns, 796 nodes left with 4 iterations
[GraphBuilderPatternOptimization-LOA.optimize] increase priority to 3
[GraphBuilderPatternOptimization-LOA.optimize] iteration 24: 796 nodes, priority=3
[GraphBuilderPatternOptimization-LOA.optimize] applies 12 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.096 | max_time=IdentityPattern:0.006
[GraphBuilderPatternOptimization-LOA.optimize] iteration 25: 795 nodes, priority=3
[GraphBuilderPatternOptimization-LOA.optimize] applies 15 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 4*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.099 | max_time=IdentityPattern:0.011
[GraphBuilderPatternOptimization-LOA.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-LOA.optimize] n_added=21, n_removed=29, n_applied=878 applied patterns, 782 nodes left with 4 iterations
[GraphBuilderPatternOptimization-LOA.optimize] increase priority to 3
[GraphBuilderPatternOptimization-LOA.optimize] iteration 26: 782 nodes, priority=3
[GraphBuilderPatternOptimization-LOA.optimize] applies 12 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.106 | max_time=IdentityPattern:0.012
[GraphBuilderPatternOptimization-LOA.optimize] iteration 27: 781 nodes, priority=3
[GraphBuilderPatternOptimization-LOA.optimize] applies 15 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 4*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.106 | max_time=IdentityPattern:0.013
[GraphBuilderPatternOptimization-LOA.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-LOA.optimize] n_added=21, n_removed=29, n_applied=912 applied patterns, 768 nodes left with 4 iterations
[GraphBuilderPatternOptimization-LOA.optimize] increase priority to 3
[GraphBuilderPatternOptimization-LOA.optimize] iteration 28: 768 nodes, priority=3
[GraphBuilderPatternOptimization-LOA.optimize] applies 12 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.105 | max_time=IdentityPattern:0.012
[GraphBuilderPatternOptimization-LOA.optimize] iteration 29: 767 nodes, priority=3
[GraphBuilderPatternOptimization-LOA.optimize] applies 15 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 4*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.093 | max_time=IdentityPattern:0.007
[GraphBuilderPatternOptimization-LOA.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-LOA.optimize] n_added=21, n_removed=29, n_applied=946 applied patterns, 754 nodes left with 4 iterations
[GraphBuilderPatternOptimization-LOA.optimize] increase priority to 3
[GraphBuilderPatternOptimization-LOA.optimize] iteration 30: 754 nodes, priority=3
[GraphBuilderPatternOptimization-LOA.optimize] applies 12 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.097 | max_time=IdentityPattern:0.016
[GraphBuilderPatternOptimization-LOA.optimize] iteration 31: 753 nodes, priority=3
[GraphBuilderPatternOptimization-LOA.optimize] applies 15 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 4*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.099 | max_time=IdentityPattern:0.007
[GraphBuilderPatternOptimization-LOA.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-LOA.optimize] n_added=21, n_removed=29, n_applied=980 applied patterns, 740 nodes left with 4 iterations
[GraphBuilderPatternOptimization-LOA.optimize] increase priority to 3
[GraphBuilderPatternOptimization-LOA.optimize] iteration 32: 740 nodes, priority=3
[GraphBuilderPatternOptimization-LOA.optimize] applies 12 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.098 | max_time=ShapeBasedEditDistanceReshapePattern:0.008
[GraphBuilderPatternOptimization-LOA.optimize] iteration 33: 739 nodes, priority=3
[GraphBuilderPatternOptimization-LOA.optimize] applies 15 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 4*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.086 | max_time=ShapeBasedEditDistanceReshapePattern:0.006
[GraphBuilderPatternOptimization-LOA.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-LOA.optimize] n_added=21, n_removed=29, n_applied=1014 applied patterns, 726 nodes left with 4 iterations
[GraphBuilderPatternOptimization-LOA.optimize] increase priority to 3
[GraphBuilderPatternOptimization-LOA.optimize] iteration 34: 726 nodes, priority=3
[GraphBuilderPatternOptimization-LOA.optimize] applies 12 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.092 | max_time=IdentityPattern:0.009
[GraphBuilderPatternOptimization-LOA.optimize] iteration 35: 725 nodes, priority=3
[GraphBuilderPatternOptimization-LOA.optimize] applies 14 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 3*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.090 | max_time=ShapeBasedEditDistanceReshapePattern:0.006
[GraphBuilderPatternOptimization-LOA.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-LOA.optimize] n_added=21, n_removed=29, n_applied=1047 applied patterns, 713 nodes left with 4 iterations
[GraphBuilderPatternOptimization-LOA.optimize] increase priority to 3
[GraphBuilderPatternOptimization-LOA.optimize] iteration 36: 713 nodes, priority=3
[GraphBuilderPatternOptimization-LOA.optimize] applies 12 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.093 | max_time=IdentityPattern:0.007
[GraphBuilderPatternOptimization-LOA.optimize] iteration 37: 712 nodes, priority=3
[GraphBuilderPatternOptimization-LOA.optimize] applies 15 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 4*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.090 | max_time=IdentityPattern:0.006
[GraphBuilderPatternOptimization-LOA.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-LOA.optimize] n_added=21, n_removed=29, n_applied=1081 applied patterns, 699 nodes left with 4 iterations
[GraphBuilderPatternOptimization-LOA.optimize] increase priority to 3
[GraphBuilderPatternOptimization-LOA.optimize] iteration 38: 699 nodes, priority=3
[GraphBuilderPatternOptimization-LOA.optimize] applies 12 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.086 | max_time=IdentityPattern:0.009
[GraphBuilderPatternOptimization-LOA.optimize] iteration 39: 698 nodes, priority=3
[GraphBuilderPatternOptimization-LOA.optimize] applies 15 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 4*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.082 | max_time=ShapeBasedEditDistanceReshapePattern:0.005
[GraphBuilderPatternOptimization-LOA.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-LOA.optimize] n_added=21, n_removed=29, n_applied=1115 applied patterns, 685 nodes left with 4 iterations
[GraphBuilderPatternOptimization-LOA.optimize] increase priority to 3
[GraphBuilderPatternOptimization-LOA.optimize] iteration 40: 685 nodes, priority=3
[GraphBuilderPatternOptimization-LOA.optimize] applies 12 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.086 | max_time=IdentityPattern:0.006
[GraphBuilderPatternOptimization-LOA.optimize] iteration 41: 684 nodes, priority=3
[GraphBuilderPatternOptimization-LOA.optimize] applies 15 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 4*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.083 | max_time=IdentityPattern:0.005
[GraphBuilderPatternOptimization-LOA.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-LOA.optimize] n_added=21, n_removed=29, n_applied=1149 applied patterns, 671 nodes left with 4 iterations
[GraphBuilderPatternOptimization-LOA.optimize] increase priority to 3
[GraphBuilderPatternOptimization-LOA.optimize] iteration 42: 671 nodes, priority=3
[GraphBuilderPatternOptimization-LOA.optimize] applies 12 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.099 | max_time=IdentityPattern:0.012
[GraphBuilderPatternOptimization-LOA.optimize] iteration 43: 670 nodes, priority=3
[GraphBuilderPatternOptimization-LOA.optimize] applies 15 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 4*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.073 | max_time=IdentityPattern:0.005
[GraphBuilderPatternOptimization-LOA.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-LOA.optimize] n_added=21, n_removed=29, n_applied=1183 applied patterns, 657 nodes left with 4 iterations
[GraphBuilderPatternOptimization-LOA.optimize] increase priority to 3
[GraphBuilderPatternOptimization-LOA.optimize] iteration 44: 657 nodes, priority=3
[GraphBuilderPatternOptimization-LOA.optimize] applies 12 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.082 | max_time=IdentityPattern:0.005
[GraphBuilderPatternOptimization-LOA.optimize] iteration 45: 656 nodes, priority=3
[GraphBuilderPatternOptimization-LOA.optimize] applies 15 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 4*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.080 | max_time=IdentityPattern:0.006
[GraphBuilderPatternOptimization-LOA.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-LOA.optimize] n_added=21, n_removed=29, n_applied=1217 applied patterns, 643 nodes left with 4 iterations
[GraphBuilderPatternOptimization-LOA.optimize] increase priority to 3
[GraphBuilderPatternOptimization-LOA.optimize] iteration 46: 643 nodes, priority=3
[GraphBuilderPatternOptimization-LOA.optimize] applies 12 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.086 | max_time=IdentityPattern:0.013
[GraphBuilderPatternOptimization-LOA.optimize] iteration 47: 642 nodes, priority=3
[GraphBuilderPatternOptimization-LOA.optimize] applies 15 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 4*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.074 | max_time=IdentityPattern:0.005
[GraphBuilderPatternOptimization-LOA.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-LOA.optimize] n_added=21, n_removed=29, n_applied=1251 applied patterns, 629 nodes left with 4 iterations
[GraphBuilderPatternOptimization-LOA.optimize] increase priority to 3
[GraphBuilderPatternOptimization-LOA.optimize] iteration 48: 629 nodes, priority=3
[GraphBuilderPatternOptimization-LOA.optimize] applies 12 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.083 | max_time=IdentityPattern:0.009
[GraphBuilderPatternOptimization-LOA.optimize] iteration 49: 628 nodes, priority=3
[GraphBuilderPatternOptimization-LOA.optimize] applies 15 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 4*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.072 | max_time=SameChildrenPattern:0.006
[GraphBuilderPatternOptimization-LOA.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-LOA.optimize] n_added=21, n_removed=29, n_applied=1285 applied patterns, 615 nodes left with 4 iterations
[GraphBuilderPatternOptimization-LOA.optimize] increase priority to 3
[GraphBuilderPatternOptimization-LOA.optimize] iteration 50: 615 nodes, priority=3
[GraphBuilderPatternOptimization-LOA.optimize] applies 12 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.099 | max_time=IdentityPattern:0.009
[GraphBuilderPatternOptimization-LOA.optimize] iteration 51: 614 nodes, priority=3
[GraphBuilderPatternOptimization-LOA.optimize] applies 15 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 4*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.071 | max_time=SameChildrenPattern:0.005
[GraphBuilderPatternOptimization-LOA.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-LOA.optimize] n_added=21, n_removed=29, n_applied=1319 applied patterns, 601 nodes left with 4 iterations
[GraphBuilderPatternOptimization-LOA.optimize] increase priority to 3
[GraphBuilderPatternOptimization-LOA.optimize] iteration 52: 601 nodes, priority=3
[GraphBuilderPatternOptimization-LOA.optimize] applies 12 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.081 | max_time=SameChildrenPattern:0.006
[GraphBuilderPatternOptimization-LOA.optimize] iteration 53: 600 nodes, priority=3
[GraphBuilderPatternOptimization-LOA.optimize] applies 15 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 4*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.086 | max_time=SameChildrenPattern:0.006
[GraphBuilderPatternOptimization-LOA.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-LOA.optimize] n_added=21, n_removed=29, n_applied=1353 applied patterns, 587 nodes left with 4 iterations
[GraphBuilderPatternOptimization-LOA.optimize] increase priority to 3
[GraphBuilderPatternOptimization-LOA.optimize] iteration 54: 587 nodes, priority=3
[GraphBuilderPatternOptimization-LOA.optimize] applies 12 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.080 | max_time=IdentityPattern:0.015
[GraphBuilderPatternOptimization-LOA.optimize] iteration 55: 584 nodes, priority=3
[GraphBuilderPatternOptimization-LOA.optimize] applies 14 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 4*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbedding3DPattern - time=0.070 | max_time=SameChildrenPattern:0.006
[GraphBuilderPatternOptimization-LOA.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-LOA.optimize] n_added=11, n_removed=15, n_applied=1383 applied patterns, 569 nodes left with 4 iterations
[GraphBuilderPatternOptimization-LOA.optimize] increase priority to 3
[GraphBuilderPatternOptimization-LOA.optimize] iteration 56: 569 nodes, priority=3
[GraphBuilderPatternOptimization-LOA.optimize] applies 8 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 1*MultiHeadAttention3DPattern - time=0.082 | max_time=IdentityPattern:0.012
[GraphBuilderPatternOptimization-LOA.optimize] iteration 57: 560 nodes, priority=3
[GraphBuilderPatternOptimization-LOA.optimize] applies 7 matches, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 1*SameChildrenPattern - time=0.097 | max_time=IdentityPattern:0.008
[GraphBuilderPatternOptimization-LOA.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-LOA.optimize] n_added=0, n_removed=0, n_applied=1398 applied patterns, 553 nodes left with 1 iterations
[GraphBuilderPatternOptimization-LOA.optimize] increase priority to 3
[GraphBuilderPatternOptimization-LOA.optimize] iteration 58: 553 nodes, priority=3
[GraphBuilderPatternOptimization-LOA.optimize] applies 5 matches, 5*ShapedBasedReshapePattern - time=0.062 | max_time=SameChildrenPattern:0.005
[GraphBuilderPatternOptimization-LOA.optimize] iteration 59: 548 nodes, priority=3
[GraphBuilderPatternOptimization-LOA.optimize] stops current_priority_index=4, priorities=[0, 1, 2, 3]
[GraphBuilderPatternOptimization-LOA.optimize] done after 60 iterations with 548 nodes in 12.214
[OrderOptimization.optimize] ALGO-2
[OrderOptimization.random_order] -- starts with 473 nodes, 353 initializers
[OrderOptimization.shape_order] done after in 0.0028569269998115487s with changed=4 scale=16
[GraphBuilder-LOA.optimize] done with 473 nodes in 14.628
[GraphBuilder-LOA.to_onnx] make_model 499 inits 341 params
[GraphBuilder-LOA.time_evaluation_constants_] 0.0009636320010031341
[GraphBuilder-LOA._build_initializers] start with 499 initializers, large_model=True, external_threshold=1024
[GraphBuilder-LOA._build_initializers] switch low/high order
[GraphBuilder-LOA._build_initializers] done in 7.992999599082395e-06s with 353 initializers, 341 large initializers
[GraphBuilder-LOA._add_shape_information] dynamic shapes replacements={'seq_length': 'seq_length', 'cache_length': 'cache_length', 'batch': 'batch', 'batch^s3^batch^s41': 'batch', 's77': 'batch', 's67': 'batch', 's90': 'batch', 's89': 'batch', 's104': 'batch', 's47': 'batch', 's62': 'batch', 'batch^s49^batch^s26': 'batch', 's83': 'batch', 's26': 'batch', 'batch^s87^batch^s23': 'batch', 'batch^s82^batch^s62': 'batch', 's36': 'batch', 's59': 'batch', 's64': 'batch', 's102': 'batch', 's3': 'batch', 's75': 'batch', 's10': 'batch', 'batch^s52^batch^s93': 'batch', 's45': 'batch', 's106': 'batch', 's56': 'batch', 's57': 'batch', 's41': 'batch', 'batch^s104^batch^s106': 'batch', 'batch^s36^batch^s13': 'batch', 's93': 'batch', 's13': 'batch', 's97': 'batch', 's91': 'batch', 's71': 'batch', 'batch^s35^batch^s60': 'batch', 'batch^s92^batch^s83': 'batch', 'batch^s64^batch^s86': 'batch', 's82': 'batch', 's86': 'batch', 's98': 'batch', 'batch^s48^batch^s59': 'batch', 's34': 'batch', 's35': 'batch', 'batch^s34^batch^s77': 'batch', 's84': 'batch', 's8': 'batch', 's79': 'batch', 'batch^s98^batch^s79': 'batch', 's29': 'batch', 'batch^s29^batch^s8': 'batch', 'batch^s90^batch^s57': 'batch', 's60': 'batch', 's52': 'batch', 'batch^s97^batch^s10': 'batch', 's30': 'batch', 'batch^s84^batch^s91': 'batch', 's39': 'batch', 's69': 'batch', 'batch^s67^batch^s61': 'batch', 'batch^s39^batch^s71': 'batch', 's61': 'batch', 's23': 'batch', 's49': 'batch', 'batch^s69^batch^s56': 'batch', 'batch^s30^batch^s89': 'batch', 's87': 'batch', 's72': 'batch', 's1': 'batch', 's92': 'batch', 'batch^s100^batch^s102': 'batch', 's48': 'batch', 'batch^s45^batch^s47': 'batch', 'batch^s1^batch^s75': 'batch', 's43': 'batch', 's100': 'batch', 's70': 'seq_length', 's9': 'cache_length', 's11': 'cache_length', 's40': 'cache_length', 's24': 'cache_length', 's4': 'cache_length', 's44': 'cache_length', 's78': 'cache_length', 's51': 'cache_length', 's88': 'cache_length', 's27': 'cache_length', 's81': 'cache_length', 's31': 
'cache_length', 's18': 'cache_length', 's74': 'cache_length', 's94': 'cache_length', 's38': 'cache_length', 's96': 'cache_length', 's33': 'cache_length', 's63': 'cache_length', 's73': 'cache_length', 's80': 'cache_length', 's42': 'cache_length', 's21': 'cache_length', 's7': 'cache_length', 's15': 'cache_length', 's85': 'cache_length', 's65': 'cache_length', 's14': 'cache_length', 's32': 'cache_length', 's46': 'cache_length', 's105': 'cache_length', 's58': 'cache_length', 's99': 'cache_length', 's103': 'cache_length', 's66': 'cache_length', 's107': 'cache_length', 's76': 'cache_length', 's37': 'cache_length', 's2': 'cache_length', 's28': 'cache_length', 's101': 'cache_length', 's54': 'cache_length', 's95': 'cache_length', 's68': 'cache_length', 's22': 'cache_length', 's55': 'cache_length', 's50': 'cache_length', 's25': 'cache_length'}
[to_onnx] to_onnx done in 14.889057660999242s and 473 nodes, 353 initializers, 50 inputs, 49 outputs
-- done in 38.13969019199976

onnx_generate

Then we can call onnx_generate to produce two tokens. This function is part of onnx_diagnostic and follows the implementation seen earlier for a torch model. Let’s first ask the function to return the session so that it is not created again on the second call.

# Warm-up call limited to two new tokens: it returns the inference session so
# that the timed call below can reuse it instead of creating a new one.
# The end-of-sequence id is taken from the tokenizer rather than hard-coded,
# so it matches whichever model was loaded and agrees with the timed call.
_res, session, _feeds = onnx_generate(
    model_name,
    inputs.input_ids,
    eos_token_id=tokenizer.eos_token_id,
    max_new_tokens=2,
    return_session=True,
)

# Generate the complete answer with the session returned by the previous call
# and measure how long the generation takes.
print("-- compute the answer with custom generate...")
begin = time.perf_counter()
outputs = onnx_generate(
    session,
    inputs.input_ids,
    max_new_tokens=100,
    eos_token_id=tokenizer.eos_token_id,
)
duration = time.perf_counter() - begin
print(f"-- done in {duration}")
# Record the timing under the name "onnx" for the comparison table.
data.append({"name": "onnx", "duration": duration})

print("-- done.")
print("output shape:", string_type(outputs, with_min_max=True, with_shape=True))
print("-- decode the answer...")
# batch_decode returns one string per sequence; only the first one is shown.
decoded = tokenizer.batch_decode(outputs)
text = decoded[0]
print("-- done.")
print(text)
-- compute the answer with custom generate...
-- done in 1.8495868000009068
-- done.
output shape: T7s1x123[7,50285:A10138.878048780489]
-- decode the answer...
-- done.
def print_prime(n):
   """
   Print all primes between 1 and n
   """
   primes = []
   for num in range(2, n+1):
       is_prime = True
       for i in range(2, int(math.sqrt(num))+1):
           if num % i == 0:
               is_prime = False
               break
       if is_prime:
           primes.append(num)
   print(primes)

print_prime(20)
``

Plots

# Collect the recorded timings into a table indexed by the "name" column.
df = pandas.DataFrame(data)
df = df.set_index("name")
print(df)
          duration
name
generate  3.732448
custom    4.518990
onnx      1.849587
# Draw the durations as a bar chart and save the figure to disk.
ax = df.plot(
    kind="bar",
    title="Time (s) comparison to generate a prompt.",
    rot=45,
)
fig = ax.figure
fig.tight_layout()
fig.savefig("plot_generate.png")
Time (s) comparison to generate a prompt.

Total running time of the script: (0 minutes 57.100 seconds)

Related examples

LayerNormalization implementation cannot be exchanged

LayerNormalization implementation cannot be exchanged

Gemm or Matmul + Add

Gemm or Matmul + Add

Export with loops

Export with loops

Gallery generated by Sphinx-Gallery