From an LLM to processing a prompt

Method generate produces the model's answer to a given prompt. Let’s implement our own version to better understand how it works, and then apply it to an ONNX model.

Example with Phi 1.5

microsoft/Phi-1.5 is a small LLM. The example given below loads the model, tokenizes a prompt and generates the answer with method generate.

import os
import time
import sys
import pandas
from tqdm import tqdm
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from onnx_diagnostic.ext_test_case import unit_test_going
from onnx_diagnostic.helpers import string_type
from onnx_diagnostic.helpers.torch_helper import to_any, get_weight_type
from onnx_diagnostic.helpers.rt_helper import onnx_generate
from onnx_diagnostic.torch_export_patches import torch_export_patches
from onnx_diagnostic.torch_models.hghub import get_untrained_model_with_inputs
from onnx_diagnostic.torch_models.hghub.hub_api import get_pretrained_config, task_from_id
from onnx_diagnostic.tasks import random_input_kwargs
from onnx_diagnostic.export.api import to_onnx


# Run on the GPU when one is available, on the CPU otherwise.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Collects one dict(name=..., duration=...) per timed experiment.
data = []

print("-- load the model...")
if unit_test_going():
    # unit_test_going() returns True if UNITTEST_GOING is 1
    # The example switches to a faster scenario.
    model_id = "arnir0/Tiny-LLM"
    data_export = get_untrained_model_with_inputs(model_id)
    model = data_export["model"]
    export_inputs = data_export["inputs"]
    export_shapes = data_export["dynamic_shapes"]
    tokenizer = AutoTokenizer.from_pretrained(model_id)
else:
    model_id = "microsoft/phi-1_5"
    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    # Inputs and dynamic shapes used later by the ONNX export are built
    # from the configuration and the task associated to the model id.
    config = get_pretrained_config(model_id)
    task = task_from_id(model_id)
    kwargs, fct = random_input_kwargs(config, task)
    res = fct(model, config, add_second_input=False, **kwargs)
    export_inputs = res["inputs"]
    export_shapes = res["dynamic_shapes"]
model = model.to(device)
print("-- done.")

# Tokenize the prompt: tensors are returned in PyTorch format ("pt"),
# the attention mask is not requested, and the result is moved to ``device``.
print("-- tokenize the prompt...")
inputs = tokenizer(
    '''def print_prime(n):
   """
   Print all primes between 1 and n
   """''',
    return_tensors="pt",
    return_attention_mask=False,
).to(device)
print("-- done.")

# Reference run: time the built-in method ``generate`` (at most 100 new tokens)
# and record the duration in ``data`` under the name "generate".
print("-- compute the answer...")
begin = time.perf_counter()
outputs = model.generate(**inputs, max_new_tokens=100)
duration = time.perf_counter() - begin
print(f"-- done in {duration}")
data.append(dict(name="generate", duration=duration))
print("output shape:", string_type(outputs, with_shape=True, with_min_max=True))
# Decode the generated token ids back into text; only the first sequence is kept.
print("-- decode the answer...")
text = tokenizer.batch_decode(outputs)[0]
print("-- done.")
print(text)
-- load the model...
-- done.
-- tokenize the prompt...
-- done.
-- compute the answer...
-- done in 4.1681223129999125
output shape: T7s1x123[7,50285:A10138.878048780489]
-- decode the answer...
-- done.
def print_prime(n):
   """
   Print all primes between 1 and n
   """
   primes = []
   for num in range(2, n+1):
       is_prime = True
       for i in range(2, int(math.sqrt(num))+1):
           if num % i == 0:
               is_prime = False
               break
       if is_prime:
           primes.append(num)
   print(primes)

print_prime(20)
``

eos_token_id?

This token means the end of the answer.

# Display the id the tokenizer uses for the end-of-sequence token.
print("eos_token_id=", tokenizer.eos_token_id)
eos_token_id= 50256

Custom method generate

Let’s implement a simple function replicating what method generate does.

def simple_generate_with_cache(
    model, input_ids: torch.Tensor, eos_token_id: int, max_new_tokens: int = 100
):
    """
    Generates an answer with a greedy decoding loop relying on the key/value cache.

    The prompt is processed once (prefill), then every selected token is fed
    alone to the model together with the cache returned by the previous call.

    :param model: callable returning an object with attributes ``logits``
        and ``past_key_values`` when called with ``use_cache=True``
    :param input_ids: token ids of the prompt, the loop calls ``.item()``
        on the selected token, so only one sequence (batch size 1) is supported
    :param eos_token_id: the generation stops as soon as this token is selected,
        this token is not appended to the result
    :param max_new_tokens: maximum number of tokens appended to the prompt
    :return: the prompt followed by the generated tokens
    """
    # Inference only: nothing needs to be recorded for autograd.
    with torch.no_grad():
        # First call: prefill
        outputs = model(input_ids, use_cache=True)

        # Next calls: decode
        for step in tqdm(range(max_new_tokens)):
            next_token_logits = outputs.logits[:, -1, :]

            # The most probable next token is chosen.
            next_token_id = torch.argmax(next_token_logits, dim=-1, keepdim=True)
            # But we could select it using a multinomial law
            # <<< probs = torch.softmax(next_token_logits / temperature, dim=-1)
            # <<< top_probs, top_indices = torch.topk(probs, top_k)
            # <<< next_token_id = top_indices[torch.multinomial(top_probs, 1)]

            if next_token_id.item() == eos_token_id:
                break
            input_ids = torch.cat([input_ids, next_token_id], dim=-1)

            # Feed only the new token, but with the cache.
            # The call is skipped after the last token: its logits would never be read.
            if step + 1 < max_new_tokens:
                outputs = model(
                    next_token_id,
                    use_cache=True,
                    past_key_values=outputs.past_key_values,
                )

    return input_ids


# Same experiment with the hand-written loop: time it and record the
# duration in ``data`` under the name "custom".
print("-- compute the answer with custom generate...")
begin = time.perf_counter()
outputs = simple_generate_with_cache(
    model, inputs.input_ids, eos_token_id=tokenizer.eos_token_id, max_new_tokens=100
)
duration = time.perf_counter() - begin
print(f"-- done in {duration}")
data.append(dict(name="custom", duration=duration))

print("-- done.")
print("output shape:", string_type(outputs, with_shape=True, with_min_max=True))
# Decode the token ids back into text; only the first sequence is kept.
print("-- decode the answer...")
text = tokenizer.batch_decode(outputs)[0]
print("-- done.")
print(text)
-- compute the answer with custom generate...

  0%|          | 0/100 [00:00<?, ?it/s]
  3%|▎         | 3/100 [00:00<00:03, 25.18it/s]
  6%|▌         | 6/100 [00:00<00:03, 25.08it/s]
  9%|▉         | 9/100 [00:00<00:03, 23.97it/s]
 12%|█▏        | 12/100 [00:00<00:03, 23.81it/s]
 15%|█▌        | 15/100 [00:00<00:03, 24.09it/s]
 18%|█▊        | 18/100 [00:00<00:03, 24.23it/s]
 21%|██        | 21/100 [00:00<00:03, 24.58it/s]
 24%|██▍       | 24/100 [00:00<00:03, 23.64it/s]
 27%|██▋       | 27/100 [00:01<00:03, 23.31it/s]
 30%|███       | 30/100 [00:01<00:03, 22.72it/s]
 33%|███▎      | 33/100 [00:01<00:02, 23.74it/s]
 36%|███▌      | 36/100 [00:01<00:02, 23.77it/s]
 39%|███▉      | 39/100 [00:01<00:02, 23.43it/s]
 42%|████▏     | 42/100 [00:01<00:02, 23.77it/s]
 45%|████▌     | 45/100 [00:01<00:02, 23.24it/s]
 48%|████▊     | 48/100 [00:02<00:02, 22.26it/s]
 51%|█████     | 51/100 [00:02<00:02, 22.43it/s]
 54%|█████▍    | 54/100 [00:02<00:02, 22.81it/s]
 57%|█████▋    | 57/100 [00:02<00:01, 22.52it/s]
 60%|██████    | 60/100 [00:02<00:01, 21.28it/s]
 63%|██████▎   | 63/100 [00:02<00:01, 20.49it/s]
 66%|██████▌   | 66/100 [00:02<00:01, 20.49it/s]
 69%|██████▉   | 69/100 [00:03<00:01, 19.81it/s]
 71%|███████   | 71/100 [00:03<00:01, 19.66it/s]
 73%|███████▎  | 73/100 [00:03<00:01, 19.16it/s]
 75%|███████▌  | 75/100 [00:03<00:01, 19.19it/s]
 77%|███████▋  | 77/100 [00:03<00:01, 18.72it/s]
 79%|███████▉  | 79/100 [00:03<00:01, 18.50it/s]
 81%|████████  | 81/100 [00:03<00:01, 17.96it/s]
 83%|████████▎ | 83/100 [00:03<00:00, 17.51it/s]
 85%|████████▌ | 85/100 [00:03<00:00, 17.62it/s]
 87%|████████▋ | 87/100 [00:04<00:00, 17.64it/s]
 89%|████████▉ | 89/100 [00:04<00:00, 18.15it/s]
 91%|█████████ | 91/100 [00:04<00:00, 17.16it/s]
 93%|█████████▎| 93/100 [00:04<00:00, 17.43it/s]
 96%|█████████▌| 96/100 [00:04<00:00, 18.53it/s]
 98%|█████████▊| 98/100 [00:04<00:00, 18.76it/s]
100%|██████████| 100/100 [00:04<00:00, 20.95it/s]
-- done in 4.9723236999998335
-- done.
output shape: T7s1x123[7,50285:A10138.878048780489]
-- decode the answer...
-- done.
def print_prime(n):
   """
   Print all primes between 1 and n
   """
   primes = []
   for num in range(2, n+1):
       is_prime = True
       for i in range(2, int(math.sqrt(num))+1):
           if num % i == 0:
               is_prime = False
               break
       if is_prime:
           primes.append(num)
   print(primes)

print_prime(20)
``

Method generate for onnx models

We first need to export the model into ONNX.

ONNX Conversion

# The model is exported without position_ids: when present, the entry is
# removed from the export inputs and from the dynamic shapes.
if "position_ids" in export_inputs:
    del export_inputs["position_ids"]
    del export_shapes["position_ids"]
dtype = get_weight_type(model)
print("-- model dtype:", dtype)
# NOTE(review): to_any presumably converts the cache tensors to the dtype
# of the model weights so that both agree during the export - confirm.
export_inputs["past_key_values"] = to_any(export_inputs["past_key_values"], dtype)
# The exporter is "onnx-dynamo" if "dynamo" is on the command line, "custom" otherwise.
exporter = "onnx-dynamo" if "dynamo" in sys.argv else "custom"
# One file per model id and exporter, '/' in the model id is replaced by '-'.
model_name = f"model_{model_id.replace('/', '-')}.{exporter}.onnx"
if not os.path.exists(model_name):
    # This step is slow so let's skip it if it was already done.
    print("-- conversion to ONNX.")
    begin = time.perf_counter()
    # The export runs inside torch_export_patches with patch_transformers=True.
    with torch_export_patches(patch_transformers=True):
        to_onnx(
            model,
            (),
            kwargs=to_any(export_inputs, device),
            dynamic_shapes=export_shapes,
            filename=model_name,
            verbose=1,
            exporter=exporter,
        )
    duration = time.perf_counter() - begin
    print(f"-- done in {duration}")
-- model dtype: torch.float16
-- conversion to ONNX.
[to_onnx] build the graph module from <class 'transformers.models.phi.modeling_phi.PhiForCausalLM'>, type(args)=<class 'tuple'>
[to_onnx] dynamic_shapes={'input_ids': {0: 'batch', 1: 'seq_length'}, 'attention_mask': {0: 'batch', 1: 'cache+seq'}, 'past_key_values': [{0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}]}
[_make_builder_interpreter] export_options=ExportOptions(aten_as_function=('aten.index_copy.default', 'aten.index_put.default', 'aten.setitem', <built-in function setitem>))
[_make_builder_interpreter] input args=()
[_make_builder_interpreter] input kwargs=dict(input_ids:T7r2,attention_mask:T7r2,past_key_values:DynamicCache(key_cache=#24[T10r4,...], value_cache=#24[T10r4,...]))
[_make_builder_interpreter] dynamic_shapes={'input_ids': {0: 'batch', 1: 'seq_length'}, 'attention_mask': {0: 'batch', 1: 'cache+seq'}, 'past_key_values': [{0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}]}
[_make_builder_interpreter] same_signature=True, tracing_mode=symbolic
[ExportOptions.export] ExportOptions(aten_as_function=('aten.index_copy.default', 'aten.index_put.default', 'aten.setitem', <built-in function setitem>)) - torch._dynamo.export 'PhiForCausalLM'
[ExportOptions.export] aten_as_function=('aten.index_copy.default', 'aten.index_put.default', 'aten.setitem', <built-in function setitem>)
[ExportOptions.export] torch_export strict=False, verbose=1
[ExportOptions.export] dynamic_shapes={'input_ids': {0: DimHint(DYNAMIC), 1: DimHint(DYNAMIC)}, 'attention_mask': {0: DimHint(DYNAMIC), 1: DimHint(DYNAMIC)}, 'past_key_values': [{0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: 
DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}]}
[ExportOptions.export] args=()
[ExportOptions.export] kwargs=dict(input_ids:T7r2,attention_mask:T7r2,past_key_values:DynamicCache(key_cache=#24[T10r4,...], value_cache=#24[T10r4,...]))
[ExportOptions.export] export start with strict=False...
[ExportOptions.export] export with backed_size_oblivious=auto
[torch_export] backed_size_oblivious='auto'
~/github/onnx-diagnostic/onnx_diagnostic/helpers/cache_helper.py:83: FutureWarning: `treespec.children_specs` is deprecated. Use `treespec.child(index)` to access a single child, or `treespec.children()` to get all children.
  for subspec in spec.children_specs:
[torch_export] inferred backed_size_oblivious=None
[torch_export] export starts with backed_size_oblivious=None
[ExportOptions.export] export done in 12.40310462600246
[ExportOptions.export] post_process_exported_program with decomposition_table=None
[ExportOptions.export] remove inplace nodes
[ExportOptions.export] slices: 6 slices nodes were removed
[CustomTracer.remove_inplace] starts with 1713 nodes
[CustomTracer.remove_inplace] S1: 1 inplace nodes
[CustomTracer.remove_inplace] S2: 1 inplace nodes and 10 iterations
[CustomTracer.remove_inplace] end with 10 iterations and 1709 nodes
[ExportOptions.export] inplaces: 1 inplaced nodes were removed
[ExportOptions.export] done remove inplace in 0.054876435999176465, modified=1
[ExportOptions.export] done with no decomposition in 0.055599531999178
[to_onnx] graph module done in 12.486493407002854 s
[to_onnx] start creating the onnx nodes
[to_onnx] interpreter.function_options=FunctionOptions(export_as_function=True, name='*', domain='*', external_threshold=256, move_initializer_to_constant=True, return_initializer=True, merge_allowed=True, rename_allowed=True)

  0%|          | 0/1709 [00:00<?, ?it/s]
 26%|██▌       | 444/1709 [00:00<00:00, 4417.52it/s]
 52%|█████▏    | 886/1709 [00:00<00:00, 965.77it/s]
 65%|██████▍   | 1105/1709 [00:00<00:00, 1030.38it/s]
 75%|███████▍  | 1280/1709 [00:01<00:00, 1063.70it/s]
 84%|████████▍ | 1433/1709 [00:01<00:00, 1093.32it/s]
 92%|█████████▏| 1574/1709 [00:01<00:00, 1121.95it/s]
100%|██████████| 1709/1709 [00:01<00:00, 1038.02it/s]
100%|██████████| 1709/1709 [00:01<00:00, 1110.16it/s]
[to_onnx] 2312 onnx nodes done in 1.5416379419984878 s
[to_onnx] start conversion to onnx (before optimization) mask_outputs=[True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True]
[inline_functions] begin graph 131165578613616
[inline_functions] skip_functions=set()
[_inline_functions_iterations] inline function 'submod_3' domain 'local_functions'
[_inline_functions_iterations] 9 new nodes for 'submod_3', 'local_functions'
[inline_functions] done graph 131165578613616 in 0.0366950959978567
[GraphBuilder-IBO._add_shape_information] dynamic shapes replacements={'batch': 'batch', 'seq_length': 'seq_length', 'cache_length': 'cache_length', 'batch^s52^batch^s93': 'batch', 's77': 'batch', 's8': 'batch', 's41': 'batch', 's60': 'batch', 'batch^s104^batch^s106': 'batch', 's91': 'batch', 's64': 'batch', 'batch^s48^batch^s59': 'batch', 's36': 'batch', 's3': 'batch', 's98': 'batch', 's45': 'batch', 's59': 'batch', 's34': 'batch', 's84': 'batch', 's87': 'batch', 'batch^s29^batch^s8': 'batch', 'batch^s45^batch^s47': 'batch', 'batch^s67^batch^s61': 'batch', 'batch^s92^batch^s83': 'batch', 'batch^s98^batch^s79': 'batch', 's62': 'batch', 'batch^s39^batch^s71': 'batch', 's75': 'batch', 'batch^s3^batch^s41': 'batch', 'batch^s49^batch^s26': 'batch', 's71': 'batch', 'batch^s97^batch^s10': 'batch', 's57': 'batch', 's56': 'batch', 's100': 'batch', 's43': 'batch', 'batch^s100^batch^s102': 'batch', 's48': 'batch', 's93': 'batch', 's67': 'batch', 's1': 'batch', 'batch^s36^batch^s13': 'batch', 's49': 'batch', 's26': 'batch', 's79': 'batch', 's39': 'batch', 's83': 'batch', 'batch^s84^batch^s91': 'batch', 'batch^s34^batch^s77': 'batch', 's86': 'batch', 's13': 'batch', 's10': 'batch', 's69': 'batch', 's102': 'batch', 's92': 'batch', 's30': 'batch', 'batch^s30^batch^s89': 'batch', 's106': 'batch', 's29': 'batch', 'batch^s82^batch^s62': 'batch', 'batch^s87^batch^s23': 'batch', 's89': 'batch', 'batch^s90^batch^s57': 'batch', 'batch^s64^batch^s86': 'batch', 's104': 'batch', 's47': 'batch', 'batch^s69^batch^s56': 'batch', 'batch^s1^batch^s75': 'batch', 's52': 'batch', 's90': 'batch', 's23': 'batch', 's61': 'batch', 's35': 'batch', 'batch^s35^batch^s60': 'batch', 's72': 'batch', 's97': 'batch', 's82': 'batch', 's70': 'seq_length', 's37': 'cache_length', 's96': 'cache_length', 's105': 'cache_length', 's24': 'cache_length', 's88': 'cache_length', 's80': 'cache_length', 's51': 'cache_length', 's31': 'cache_length', 's55': 'cache_length', 's32': 'cache_length', 's101': 'cache_length', 
's74': 'cache_length', 's14': 'cache_length', 's50': 'cache_length', 's7': 'cache_length', 's40': 'cache_length', 's54': 'cache_length', 's78': 'cache_length', 's94': 'cache_length', 's95': 'cache_length', 's65': 'cache_length', 's38': 'cache_length', 's15': 'cache_length', 's66': 'cache_length', 's22': 'cache_length', 's28': 'cache_length', 's85': 'cache_length', 's63': 'cache_length', 's2': 'cache_length', 's81': 'cache_length', 's73': 'cache_length', 's4': 'cache_length', 's11': 'cache_length', 's18': 'cache_length', 's25': 'cache_length', 's46': 'cache_length', 's58': 'cache_length', 's103': 'cache_length', 's33': 'cache_length', 's99': 'cache_length', 's107': 'cache_length', 's68': 'cache_length', 's44': 'cache_length', 's42': 'cache_length', 's9': 'cache_length', 's21': 'cache_length', 's27': 'cache_length', 's76': 'cache_length'}
[GraphBuilder-IBO.optimize] start with 2320 nodes
[GraphBuilder-IBO.optimize] #patterns=103
[GraphBuilder-IBO.optimize] start with subgraphs
[GraphBuilder-IBO.optimize] done with subgraphs
[GraphBuilderPatternOptimization-IBO.optimize] start with 1991 nodes, 459 initializers, 103 patterns, priorities=[0, 1, 2, 3], max_iter=7964
[GraphBuilderPatternOptimization-IBO.optimize] same children={'SameChildrenPattern', 'SameChildrenFromInputPattern'}
[GraphBuilderPatternOptimization-IBO.optimize] iteration 0: 1991 nodes, priority=0
[GraphBuilderPatternOptimization-IBO.optimize] applies 229 matches, 75*CastPattern, 2*IdentityPattern, 1*ShapeBasedStaticExpandPattern, 96*ShapeBasedEditDistanceReshapePattern, 18*ShapeBasedIdentityPattern, 6*SameChildrenPattern, 1*SqueezeAddPattern, 1*SqueezeUnsqueezePattern, 2*SwapUnaryPattern, 3*UnsqueezeUnsqueezePattern, 24*FunctionAttentionPattern - time=0.144 | max_time=IdentityPattern:0.043
[GraphBuilderPatternOptimization-IBO.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-IBO.optimize] n_added=206, n_removed=260, n_applied=283 applied patterns, 1592 nodes left with 23 iterations
[GraphBuilderPatternOptimization-IBO.optimize] increase priority to 1
[GraphBuilderPatternOptimization-IBO.optimize] iteration 1: 1592 nodes, priority=1
[GraphBuilderPatternOptimization-IBO.optimize] applies 200 matches, 2*ConcatTwiceUnaryPattern, 49*DropoutPattern, 1*IdentityPattern, 25*LayerNormalizationPattern, 96*SlicesSplitPattern, 1*SqueezeUnsqueezePattern, 1*SwapUnaryPattern, 1*UnsqueezeUnsqueezePattern, 24*GeluOrtPattern - time=0.244 | max_time=SlicesSplitPattern:0.016
[GraphBuilderPatternOptimization-IBO.optimize] iteration 2: 1126 nodes, priority=1
[GraphBuilderPatternOptimization-IBO.optimize] applies 100 matches, 2*ConcatTwiceUnaryPattern, 25*LayerNormalizationScalePattern, 1*UnsqueezeUnsqueezePattern, 48*FunctionHalfRotaryEmbeddingPattern, 24*FastGeluPattern - time=0.120 | max_time=ShapeBasedEditDistanceReshapePattern:0.007
[GraphBuilderPatternOptimization-IBO.optimize] iteration 3: 910 nodes, priority=1
[GraphBuilderPatternOptimization-IBO.optimize] applies 24 matches, 24*SkipLayerNormalizationPattern - time=0.113 | max_time=IdentityPattern:0.010
[GraphBuilderPatternOptimization-IBO.optimize] iteration 4: 886 nodes, priority=1
[GraphBuilderPatternOptimization-IBO.optimize] increase priority to 2
[GraphBuilderPatternOptimization-IBO.optimize] iteration 5: 886 nodes, priority=2
[GraphBuilderPatternOptimization-IBO.optimize] applies 1 matches, [0]=MatchResult: ContribRotaryEmbeddingPattern replaces ['Concat', 'Concat', 'Split', 'HalfRotaryEmbedding', 'Concat'] - time=0.096 | max_time=ShapeBasedEditDistanceReshapePattern:0.009
[GraphBuilderPatternOptimization-IBO.optimize] iteration 6: 891 nodes, priority=2
[GraphBuilderPatternOptimization-IBO.optimize] applies 3 matches, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern - time=0.107 | max_time=IdentityPattern:0.007
[GraphBuilderPatternOptimization-IBO.optimize] iteration 7: 895 nodes, priority=2
[GraphBuilderPatternOptimization-IBO.optimize] applies 6 matches, 2*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.102 | max_time=ShapeBasedEditDistanceReshapePattern:0.008
[GraphBuilderPatternOptimization-IBO.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-IBO.optimize] n_added=23, n_removed=31, n_applied=625 applied patterns, 893 nodes left with 4 iterations
[GraphBuilderPatternOptimization-IBO.optimize] increase priority to 3
[GraphBuilderPatternOptimization-IBO.optimize] iteration 8: 893 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 5 matches, 1*ShapeBasedEditDistanceReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.103 | max_time=IdentityPattern:0.009
[GraphBuilderPatternOptimization-IBO.optimize] iteration 9: 899 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 10 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ReshapeReshapePattern, 4*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.102 | max_time=ShapeBasedEditDistanceReshapePattern:0.008
[GraphBuilderPatternOptimization-IBO.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-IBO.optimize] n_added=23, n_removed=31, n_applied=648 applied patterns, 892 nodes left with 4 iterations
[GraphBuilderPatternOptimization-IBO.optimize] increase priority to 3
[GraphBuilderPatternOptimization-IBO.optimize] iteration 10: 892 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 8 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.111 | max_time=ShapeBasedEditDistanceReshapePattern:0.007
[GraphBuilderPatternOptimization-IBO.optimize] iteration 11: 895 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 15 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 4*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.098 | max_time=ShapeBasedEditDistanceReshapePattern:0.008
[GraphBuilderPatternOptimization-IBO.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-IBO.optimize] n_added=23, n_removed=31, n_applied=679 applied patterns, 883 nodes left with 4 iterations
[GraphBuilderPatternOptimization-IBO.optimize] increase priority to 3
[GraphBuilderPatternOptimization-IBO.optimize] iteration 12: 883 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 12 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.113 | max_time=ShapeBasedEditDistanceReshapePattern:0.012
[GraphBuilderPatternOptimization-IBO.optimize] iteration 13: 882 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 16 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 5*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.108 | max_time=IdentityPattern:0.009
[GraphBuilderPatternOptimization-IBO.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-IBO.optimize] n_added=23, n_removed=31, n_applied=715 applied patterns, 869 nodes left with 4 iterations
[GraphBuilderPatternOptimization-IBO.optimize] increase priority to 3
[GraphBuilderPatternOptimization-IBO.optimize] iteration 14: 869 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 12 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.111 | max_time=ShapeBasedEditDistanceReshapePattern:0.009
[GraphBuilderPatternOptimization-IBO.optimize] iteration 15: 868 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 16 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 5*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.099 | max_time=ShapeBasedEditDistanceReshapePattern:0.008
[GraphBuilderPatternOptimization-IBO.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-IBO.optimize] n_added=23, n_removed=31, n_applied=751 applied patterns, 855 nodes left with 4 iterations
[GraphBuilderPatternOptimization-IBO.optimize] increase priority to 3
[GraphBuilderPatternOptimization-IBO.optimize] iteration 16: 855 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 12 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.095 | max_time=ShapeBasedEditDistanceReshapePattern:0.008
[GraphBuilderPatternOptimization-IBO.optimize] iteration 17: 854 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 16 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 5*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.137 | max_time=ShapeBasedEditDistanceReshapePattern:0.008
[GraphBuilderPatternOptimization-IBO.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-IBO.optimize] n_added=23, n_removed=31, n_applied=787 applied patterns, 841 nodes left with 4 iterations
[GraphBuilderPatternOptimization-IBO.optimize] increase priority to 3
[GraphBuilderPatternOptimization-IBO.optimize] iteration 18: 841 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 12 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.112 | max_time=ShapeBasedEditDistanceReshapePattern:0.008
[GraphBuilderPatternOptimization-IBO.optimize] iteration 19: 840 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 16 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 5*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.116 | max_time=ShapeBasedEditDistanceReshapePattern:0.010
[GraphBuilderPatternOptimization-IBO.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-IBO.optimize] n_added=23, n_removed=31, n_applied=823 applied patterns, 827 nodes left with 4 iterations
[GraphBuilderPatternOptimization-IBO.optimize] increase priority to 3
[GraphBuilderPatternOptimization-IBO.optimize] iteration 20: 827 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 12 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.100 | max_time=ShapeBasedEditDistanceReshapePattern:0.006
[GraphBuilderPatternOptimization-IBO.optimize] iteration 21: 826 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 16 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 5*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.091 | max_time=ShapeBasedEditDistanceReshapePattern:0.006
[GraphBuilderPatternOptimization-IBO.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-IBO.optimize] n_added=23, n_removed=31, n_applied=859 applied patterns, 813 nodes left with 4 iterations
[GraphBuilderPatternOptimization-IBO.optimize] increase priority to 3
[GraphBuilderPatternOptimization-IBO.optimize] iteration 22: 813 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 12 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.122 | max_time=IdentityPattern:0.014
[GraphBuilderPatternOptimization-IBO.optimize] iteration 23: 812 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 16 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 5*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.099 | max_time=IdentityPattern:0.007
[GraphBuilderPatternOptimization-IBO.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-IBO.optimize] n_added=23, n_removed=31, n_applied=895 applied patterns, 799 nodes left with 4 iterations
[GraphBuilderPatternOptimization-IBO.optimize] increase priority to 3
[GraphBuilderPatternOptimization-IBO.optimize] iteration 24: 799 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 12 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.092 | max_time=ShapeBasedEditDistanceReshapePattern:0.006
[GraphBuilderPatternOptimization-IBO.optimize] iteration 25: 798 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 16 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 5*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.089 | max_time=ShapeBasedEditDistanceReshapePattern:0.006
[GraphBuilderPatternOptimization-IBO.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-IBO.optimize] n_added=23, n_removed=31, n_applied=931 applied patterns, 785 nodes left with 4 iterations
[GraphBuilderPatternOptimization-IBO.optimize] increase priority to 3
[GraphBuilderPatternOptimization-IBO.optimize] iteration 26: 785 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 12 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.089 | max_time=Reshape2Of3Pattern:0.006
[GraphBuilderPatternOptimization-IBO.optimize] iteration 27: 784 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 16 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 5*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.087 | max_time=ShapeBasedEditDistanceReshapePattern:0.007
[GraphBuilderPatternOptimization-IBO.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-IBO.optimize] n_added=23, n_removed=31, n_applied=967 applied patterns, 771 nodes left with 4 iterations
[GraphBuilderPatternOptimization-IBO.optimize] increase priority to 3
[GraphBuilderPatternOptimization-IBO.optimize] iteration 28: 771 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 12 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.103 | max_time=IdentityPattern:0.005
[GraphBuilderPatternOptimization-IBO.optimize] iteration 29: 770 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 16 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 5*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.115 | max_time=IdentityPattern:0.008
[GraphBuilderPatternOptimization-IBO.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-IBO.optimize] n_added=23, n_removed=31, n_applied=1003 applied patterns, 757 nodes left with 4 iterations
[GraphBuilderPatternOptimization-IBO.optimize] increase priority to 3
[GraphBuilderPatternOptimization-IBO.optimize] iteration 30: 757 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 12 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.169 | max_time=ShapeBasedEditDistanceReshapePattern:0.009
[GraphBuilderPatternOptimization-IBO.optimize] iteration 31: 756 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 16 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 5*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.090 | max_time=ShapeBasedEditDistanceReshapePattern:0.005
[GraphBuilderPatternOptimization-IBO.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-IBO.optimize] n_added=23, n_removed=31, n_applied=1039 applied patterns, 743 nodes left with 4 iterations
[GraphBuilderPatternOptimization-IBO.optimize] increase priority to 3
[GraphBuilderPatternOptimization-IBO.optimize] iteration 32: 743 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 12 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.089 | max_time=SameChildrenPattern:0.005
[GraphBuilderPatternOptimization-IBO.optimize] iteration 33: 742 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 15 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 4*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.103 | max_time=IdentityPattern:0.010
[GraphBuilderPatternOptimization-IBO.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-IBO.optimize] n_added=23, n_removed=31, n_applied=1074 applied patterns, 730 nodes left with 4 iterations
[GraphBuilderPatternOptimization-IBO.optimize] increase priority to 3
[GraphBuilderPatternOptimization-IBO.optimize] iteration 34: 730 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 12 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.095 | max_time=IdentityPattern:0.007
[GraphBuilderPatternOptimization-IBO.optimize] iteration 35: 729 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 16 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 5*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.082 | max_time=ShapeBasedEditDistanceReshapePattern:0.006
[GraphBuilderPatternOptimization-IBO.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-IBO.optimize] n_added=23, n_removed=31, n_applied=1110 applied patterns, 716 nodes left with 4 iterations
[GraphBuilderPatternOptimization-IBO.optimize] increase priority to 3
[GraphBuilderPatternOptimization-IBO.optimize] iteration 36: 716 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 12 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.090 | max_time=IdentityPattern:0.005
[GraphBuilderPatternOptimization-IBO.optimize] iteration 37: 715 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 16 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 5*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.139 | max_time=ShapeBasedEditDistanceReshapePattern:0.013
[GraphBuilderPatternOptimization-IBO.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-IBO.optimize] n_added=23, n_removed=31, n_applied=1146 applied patterns, 702 nodes left with 4 iterations
[GraphBuilderPatternOptimization-IBO.optimize] increase priority to 3
[GraphBuilderPatternOptimization-IBO.optimize] iteration 38: 702 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 12 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.079 | max_time=IdentityPattern:0.005
[GraphBuilderPatternOptimization-IBO.optimize] iteration 39: 701 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 16 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 5*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.086 | max_time=IdentityPattern:0.005
[GraphBuilderPatternOptimization-IBO.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-IBO.optimize] n_added=23, n_removed=31, n_applied=1182 applied patterns, 688 nodes left with 4 iterations
[GraphBuilderPatternOptimization-IBO.optimize] increase priority to 3
[GraphBuilderPatternOptimization-IBO.optimize] iteration 40: 688 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 12 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.084 | max_time=ShapeBasedEditDistanceReshapePattern:0.005
[GraphBuilderPatternOptimization-IBO.optimize] iteration 41: 687 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 16 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 5*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.103 | max_time=ShapeBasedEditDistanceReshapePattern:0.007
[GraphBuilderPatternOptimization-IBO.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-IBO.optimize] n_added=23, n_removed=31, n_applied=1218 applied patterns, 674 nodes left with 4 iterations
[GraphBuilderPatternOptimization-IBO.optimize] increase priority to 3
[GraphBuilderPatternOptimization-IBO.optimize] iteration 42: 674 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 12 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.083 | max_time=IdentityPattern:0.007
[GraphBuilderPatternOptimization-IBO.optimize] iteration 43: 673 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 16 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 5*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.082 | max_time=ShapeBasedEditDistanceReshapePattern:0.004
[GraphBuilderPatternOptimization-IBO.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-IBO.optimize] n_added=23, n_removed=31, n_applied=1254 applied patterns, 660 nodes left with 4 iterations
[GraphBuilderPatternOptimization-IBO.optimize] increase priority to 3
[GraphBuilderPatternOptimization-IBO.optimize] iteration 44: 660 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 12 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.083 | max_time=IdentityPattern:0.006
[GraphBuilderPatternOptimization-IBO.optimize] iteration 45: 659 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 16 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 5*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.081 | max_time=IdentityPattern:0.006
[GraphBuilderPatternOptimization-IBO.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-IBO.optimize] n_added=23, n_removed=31, n_applied=1290 applied patterns, 646 nodes left with 4 iterations
[GraphBuilderPatternOptimization-IBO.optimize] increase priority to 3
[GraphBuilderPatternOptimization-IBO.optimize] iteration 46: 646 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 12 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.083 | max_time=IdentityPattern:0.005
[GraphBuilderPatternOptimization-IBO.optimize] iteration 47: 645 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 16 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 5*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.085 | max_time=IdentityPattern:0.005
[GraphBuilderPatternOptimization-IBO.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-IBO.optimize] n_added=23, n_removed=31, n_applied=1326 applied patterns, 632 nodes left with 4 iterations
[GraphBuilderPatternOptimization-IBO.optimize] increase priority to 3
[GraphBuilderPatternOptimization-IBO.optimize] iteration 48: 632 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 12 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.066 | max_time=IdentityPattern:0.004
[GraphBuilderPatternOptimization-IBO.optimize] iteration 49: 631 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 16 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 5*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.069 | max_time=IdentityPattern:0.005
[GraphBuilderPatternOptimization-IBO.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-IBO.optimize] n_added=23, n_removed=31, n_applied=1362 applied patterns, 618 nodes left with 4 iterations
[GraphBuilderPatternOptimization-IBO.optimize] increase priority to 3
[GraphBuilderPatternOptimization-IBO.optimize] iteration 50: 618 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 12 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.091 | max_time=MatMulReshape2Of3Pattern:0.004
[GraphBuilderPatternOptimization-IBO.optimize] iteration 51: 617 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 16 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 5*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.162 | max_time=SameChildrenPattern:0.013
[GraphBuilderPatternOptimization-IBO.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-IBO.optimize] n_added=23, n_removed=31, n_applied=1398 applied patterns, 604 nodes left with 4 iterations
[GraphBuilderPatternOptimization-IBO.optimize] increase priority to 3
[GraphBuilderPatternOptimization-IBO.optimize] iteration 52: 604 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 12 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.079 | max_time=SameChildrenPattern:0.005
[GraphBuilderPatternOptimization-IBO.optimize] iteration 53: 601 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 15 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 5*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbedding3DPattern - time=0.089 | max_time=IdentityPattern:0.007
[GraphBuilderPatternOptimization-IBO.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-IBO.optimize] n_added=11, n_removed=15, n_applied=1429 applied patterns, 586 nodes left with 3 iterations
[GraphBuilderPatternOptimization-IBO.optimize] increase priority to 3
[GraphBuilderPatternOptimization-IBO.optimize] iteration 54: 586 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 8 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 1*MultiHeadAttention3DPattern - time=0.074 | max_time=IdentityPattern:0.007
[GraphBuilderPatternOptimization-IBO.optimize] iteration 55: 577 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 7 matches, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 1*SameChildrenPattern - time=0.068 | max_time=SameChildrenPattern:0.004
[GraphBuilderPatternOptimization-IBO.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-IBO.optimize] n_added=0, n_removed=0, n_applied=1444 applied patterns, 570 nodes left with 1 iterations
[GraphBuilderPatternOptimization-IBO.optimize] increase priority to 3
[GraphBuilderPatternOptimization-IBO.optimize] iteration 56: 570 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 5 matches, 5*ShapedBasedReshapePattern - time=0.076 | max_time=IdentityPattern:0.007
[GraphBuilderPatternOptimization-IBO.optimize] iteration 57: 565 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] stops current_priority_index=4, priorities=[0, 1, 2, 3]
[GraphBuilderPatternOptimization-IBO.optimize] done after 58 iterations with 565 nodes in 16.602
[GraphBuilder-IBO.optimize] done with 490 nodes in 19.529
[GraphBuilder-IBO.to_onnx] make_model 502 inits 341 params
[GraphBuilder-IBO.time_evaluation_constants_] 0.0009538510057609528
[GraphBuilder-IBO._build_initializers] start with 502 initializers, large_model=True, external_threshold=1024
[GraphBuilder-IBO._build_initializers] switch low/high order
[GraphBuilder-IBO._build_initializers] done in 7.6120013545732945e-06s with 356 initializers, 341 large initializers
[GraphBuilder-IBO._add_shape_information] dynamic shapes replacements={'batch': 'batch', 'seq_length': 'seq_length', 'cache_length': 'cache_length', 'batch^s52^batch^s93': 'batch', 's77': 'batch', 's8': 'batch', 's41': 'batch', 's60': 'batch', 'batch^s104^batch^s106': 'batch', 's91': 'batch', 's64': 'batch', 'batch^s48^batch^s59': 'batch', 's36': 'batch', 's3': 'batch', 's98': 'batch', 's45': 'batch', 's59': 'batch', 's34': 'batch', 's84': 'batch', 's87': 'batch', 'batch^s29^batch^s8': 'batch', 'batch^s45^batch^s47': 'batch', 'batch^s67^batch^s61': 'batch', 'batch^s92^batch^s83': 'batch', 'batch^s98^batch^s79': 'batch', 's62': 'batch', 'batch^s39^batch^s71': 'batch', 's75': 'batch', 'batch^s3^batch^s41': 'batch', 'batch^s49^batch^s26': 'batch', 's71': 'batch', 'batch^s97^batch^s10': 'batch', 's57': 'batch', 's56': 'batch', 's100': 'batch', 's43': 'batch', 'batch^s100^batch^s102': 'batch', 's48': 'batch', 's93': 'batch', 's67': 'batch', 's1': 'batch', 'batch^s36^batch^s13': 'batch', 's49': 'batch', 's26': 'batch', 's79': 'batch', 's39': 'batch', 's83': 'batch', 'batch^s84^batch^s91': 'batch', 'batch^s34^batch^s77': 'batch', 's86': 'batch', 's13': 'batch', 's10': 'batch', 's69': 'batch', 's102': 'batch', 's92': 'batch', 's30': 'batch', 'batch^s30^batch^s89': 'batch', 's106': 'batch', 's29': 'batch', 'batch^s82^batch^s62': 'batch', 'batch^s87^batch^s23': 'batch', 's89': 'batch', 'batch^s90^batch^s57': 'batch', 'batch^s64^batch^s86': 'batch', 's104': 'batch', 's47': 'batch', 'batch^s69^batch^s56': 'batch', 'batch^s1^batch^s75': 'batch', 's52': 'batch', 's90': 'batch', 's23': 'batch', 's61': 'batch', 's35': 'batch', 'batch^s35^batch^s60': 'batch', 's72': 'batch', 's97': 'batch', 's82': 'batch', 's70': 'seq_length', 's37': 'cache_length', 's96': 'cache_length', 's105': 'cache_length', 's24': 'cache_length', 's88': 'cache_length', 's80': 'cache_length', 's51': 'cache_length', 's31': 'cache_length', 's55': 'cache_length', 's32': 'cache_length', 's101': 'cache_length', 
's74': 'cache_length', 's14': 'cache_length', 's50': 'cache_length', 's7': 'cache_length', 's40': 'cache_length', 's54': 'cache_length', 's78': 'cache_length', 's94': 'cache_length', 's95': 'cache_length', 's65': 'cache_length', 's38': 'cache_length', 's15': 'cache_length', 's66': 'cache_length', 's22': 'cache_length', 's28': 'cache_length', 's85': 'cache_length', 's63': 'cache_length', 's2': 'cache_length', 's81': 'cache_length', 's73': 'cache_length', 's4': 'cache_length', 's11': 'cache_length', 's18': 'cache_length', 's25': 'cache_length', 's46': 'cache_length', 's58': 'cache_length', 's103': 'cache_length', 's33': 'cache_length', 's99': 'cache_length', 's107': 'cache_length', 's68': 'cache_length', 's44': 'cache_length', 's42': 'cache_length', 's9': 'cache_length', 's21': 'cache_length', 's27': 'cache_length', 's76': 'cache_length'}
[to_onnx] to_onnx done in 19.814001822000137s and 490 nodes, 356 initializers, 50 inputs, 49 outputs
-- done in 43.5993395330006

onnx_generate

Then we can call onnx_generate for two tokens. This function is part of onnx_diagnostic but follows the implementation seen earlier for a torch model. Let’s first ask the function to return the session so that it is not created again on the second call.

# Warm-up call: request only two new tokens (max_new_tokens=2) and ask for the
# session back (return_session=True) so that the timed call below can reuse it.
# NOTE(review): the positional argument 2 is presumably eos_token_id, while the
# timed call passes tokenizer.eos_token_id - confirm this is intended.
_res, session, _feeds = onnx_generate(
    model_name, inputs.input_ids, 2, max_new_tokens=2, return_session=True
)

# And now the full answer, timed, reusing the session obtained above.
print("-- compute the answer with custom generate...")
begin = time.perf_counter()
outputs = onnx_generate(
    session, inputs.input_ids, eos_token_id=tokenizer.eos_token_id, max_new_tokens=100
)
duration = time.perf_counter() - begin
print(f"-- done in {duration}")
# Record the measured duration under the name "onnx".
data.append(dict(name="onnx", duration=duration))

print("-- done.")
print("output shape:", string_type(outputs, with_shape=True, with_min_max=True))
print("-- decode the answer...")
# Only the first decoded entry is kept and printed.
text = tokenizer.batch_decode(outputs)[0]
print("-- done.")
print(text)
-- compute the answer with custom generate...
-- done in 2.345785806999629
-- done.
output shape: T7s1x123[7,50285:A10138.878048780489]
-- decode the answer...
-- done.
def print_prime(n):
   """
   Print all primes between 1 and n
   """
   primes = []
   for num in range(2, n+1):
       is_prime = True
       for i in range(2, int(math.sqrt(num))+1):
           if num % i == 0:
               is_prime = False
               break
       if is_prime:
           primes.append(num)
   print(primes)

print_prime(20)
``

Plots

# Gather the recorded measurements into a table indexed by the "name" field.
df = pandas.DataFrame(data).set_index("name")
print(df)
          duration
name
generate  4.168122
custom    4.972324
onnx      2.345786
# Draw the table as a bar chart and save the figure to plot_generate.png.
ax = df.plot(kind="bar", title="Time (s) comparison to generate a prompt.", rot=45)
ax.figure.tight_layout()
ax.figure.savefig("plot_generate.png")
Time (s) comparison to generate a prompt.

Total running time of the script: (1 minute 2.813 seconds)

Related examples

Reproducible Parallelized Reduction is difficult

Reproducible Parallelized Reduction is difficult

LayerNormalization implementation cannot be exchanged

LayerNormalization implementation cannot be exchanged

Dynamic Shapes and Broadcasting

Dynamic Shapes and Broadcasting

Gallery generated by Sphinx-Gallery