Note
Go to the end to download the full example code.
From a LLM to processing a prompt¶
Method generate generates the model answer for a given prompt.
Let’s implement our own to understand better how it works and
then apply it to an ONNX model.
Example with Phi 1.5¶
microsoft/Phi-1.5 is a small LLM. The example given below loads it, tokenizes a prompt, and calls method generate to compute the answer.
import os
import time
import sys
import pandas
from tqdm import tqdm
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from onnx_diagnostic.ext_test_case import unit_test_going
from onnx_diagnostic.helpers import string_type
from onnx_diagnostic.helpers.torch_helper import to_any, get_weight_type
from onnx_diagnostic.helpers.rt_helper import onnx_generate
from onnx_diagnostic.torch_export_patches import torch_export_patches
from onnx_diagnostic.torch_models.hghub import get_untrained_model_with_inputs
from onnx_diagnostic.torch_models.hghub.hub_api import get_pretrained_config, task_from_id
from onnx_diagnostic.tasks import random_input_kwargs
from onnx_diagnostic.export.api import to_onnx
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Collects one dict(name=..., duration=...) entry per timed experiment.
data = []

print("-- load the model...")
if unit_test_going():
    # unit_test_going() returns True if UNITTEST_GOING is 1
    # The example switches to a faster scenario.
    model_id = "arnir0/Tiny-LLM"
    data_export = get_untrained_model_with_inputs(model_id)
    model = data_export["model"]
    # Inputs and dynamic shapes are kept for the ONNX export done later.
    export_inputs = data_export["inputs"]
    export_shapes = data_export["dynamic_shapes"]
    tokenizer = AutoTokenizer.from_pretrained(model_id)
else:
    model_id = "microsoft/phi-1_5"
    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    config = get_pretrained_config(model_id)
    task = task_from_id(model_id)
    # Build the export inputs and their dynamic shapes for this
    # configuration and task.
    kwargs, fct = random_input_kwargs(config, task)
    res = fct(model, config, add_second_input=False, **kwargs)
    export_inputs = res["inputs"]
    export_shapes = res["dynamic_shapes"]
# Both branches end with the model moved to the selected device.
model = model.to(device)
print("-- done.")
# Tokenize a code-completion prompt (the beginning of a Python function)
# and move the result to the selected device. No attention mask is requested.
print("-- tokenize the prompt...")
inputs = tokenizer(
'''def print_prime(n):
"""
Print all primes between 1 and n
"""''',
return_tensors="pt",
return_attention_mask=False,
).to(device)
print("-- done.")
# Time the reference implementation: the model's own generate method,
# limited to 100 new tokens.
print("-- compute the answer...")
begin = time.perf_counter()
outputs = model.generate(**inputs, max_new_tokens=100)
duration = time.perf_counter() - begin
print(f"-- done in {duration}")
# Record the measurement under the name "generate".
data.append(dict(name="generate", duration=duration))
print("output shape:", string_type(outputs, with_shape=True, with_min_max=True))
# Decode the generated token ids back to text; only the first sequence
# of the batch is printed.
print("-- decode the answer...")
text = tokenizer.batch_decode(outputs)[0]
print("-- done.")
print(text)
-- load the model...
-- done.
-- tokenize the prompt...
-- done.
-- compute the answer...
-- done in 4.1681223129999125
output shape: T7s1x123[7,50285:A10138.878048780489]
-- decode the answer...
-- done.
def print_prime(n):
"""
Print all primes between 1 and n
"""
primes = []
for num in range(2, n+1):
is_prime = True
for i in range(2, int(math.sqrt(num))+1):
if num % i == 0:
is_prime = False
break
if is_prime:
primes.append(num)
print(primes)
print_prime(20)
``
eos_token_id?¶
This token means the end of the answer.
# Show the id the tokenizer uses for its end-of-sequence token.
print("eos_token_id=", tokenizer.eos_token_id)
eos_token_id= 50256
Custom method generate¶
Let’s implement a simple function replicating what method
generate does.
def simple_generate_with_cache(
    model, input_ids: torch.Tensor, eos_token_id: int, max_new_tokens: int = 100
):
    """Greedily extends a prompt token by token, reusing the model cache.

    The prompt is processed once (prefill); every following call feeds only
    the newly chosen token together with the cache returned by the previous
    call (decode). Generation stops as soon as the most probable token is
    ``eos_token_id`` (which is not appended) or after ``max_new_tokens``
    tokens were added.

    :param model: callable invoked as
        ``model(ids, use_cache=True, past_key_values=...)`` and returning an
        object exposing ``logits`` and ``past_key_values``
    :param input_ids: token ids of the prompt; the end-of-sequence test uses
        ``.item()``, so exactly one token may be produced per step
        (a single sequence)
    :param eos_token_id: id of the token marking the end of the answer
    :param max_new_tokens: maximum number of tokens appended to the prompt
    :return: the prompt followed by the generated tokens
    """
    # Inference only: without this, every forward pass records an autograd
    # graph which the cached keys/values keep alive.
    with torch.no_grad():
        # First call: prefill
        outputs = model(input_ids, use_cache=True)

        # Next calls: decode
        for step in tqdm(range(max_new_tokens)):
            next_token_logits = outputs.logits[:, -1, :]
            past_key_values = outputs.past_key_values

            # The most probable next token is chosen.
            next_token_id = torch.argmax(next_token_logits, dim=-1, keepdim=True)
            # But we could select it using a multinomial law
            # <<< probs = torch.softmax(next_token_logits / temperature, dim=-1)
            # <<< top_probs, top_indices = torch.topk(probs, top_k)
            # <<< next_token_id = top_indices[torch.multinomial(top_probs, 1)]

            if next_token_id.item() == eos_token_id:
                break
            input_ids = torch.cat([input_ids, next_token_id], dim=-1)

            # Feed only the new token, but with the cache. The call is skipped
            # after the last token: its result would never be read.
            if step + 1 < max_new_tokens:
                outputs = model(
                    next_token_id, use_cache=True, past_key_values=past_key_values
                )
    return input_ids
# Time the custom implementation on the same prompt and with the same
# token budget as the reference call to generate.
print("-- compute the answer with custom generate...")
begin = time.perf_counter()
outputs = simple_generate_with_cache(
model, inputs.input_ids, eos_token_id=tokenizer.eos_token_id, max_new_tokens=100
)
duration = time.perf_counter() - begin
print(f"-- done in {duration}")
# Record the measurement under the name "custom".
data.append(dict(name="custom", duration=duration))
print("-- done.")
print("output shape:", string_type(outputs, with_shape=True, with_min_max=True))
# Decode the generated token ids back to text; only the first sequence
# of the batch is printed.
print("-- decode the answer...")
text = tokenizer.batch_decode(outputs)[0]
print("-- done.")
print(text)
-- compute the answer with custom generate...
0%| | 0/100 [00:00<?, ?it/s]
3%|▎ | 3/100 [00:00<00:03, 25.18it/s]
6%|▌ | 6/100 [00:00<00:03, 25.08it/s]
9%|▉ | 9/100 [00:00<00:03, 23.97it/s]
12%|█▏ | 12/100 [00:00<00:03, 23.81it/s]
15%|█▌ | 15/100 [00:00<00:03, 24.09it/s]
18%|█▊ | 18/100 [00:00<00:03, 24.23it/s]
21%|██ | 21/100 [00:00<00:03, 24.58it/s]
24%|██▍ | 24/100 [00:00<00:03, 23.64it/s]
27%|██▋ | 27/100 [00:01<00:03, 23.31it/s]
30%|███ | 30/100 [00:01<00:03, 22.72it/s]
33%|███▎ | 33/100 [00:01<00:02, 23.74it/s]
36%|███▌ | 36/100 [00:01<00:02, 23.77it/s]
39%|███▉ | 39/100 [00:01<00:02, 23.43it/s]
42%|████▏ | 42/100 [00:01<00:02, 23.77it/s]
45%|████▌ | 45/100 [00:01<00:02, 23.24it/s]
48%|████▊ | 48/100 [00:02<00:02, 22.26it/s]
51%|█████ | 51/100 [00:02<00:02, 22.43it/s]
54%|█████▍ | 54/100 [00:02<00:02, 22.81it/s]
57%|█████▋ | 57/100 [00:02<00:01, 22.52it/s]
60%|██████ | 60/100 [00:02<00:01, 21.28it/s]
63%|██████▎ | 63/100 [00:02<00:01, 20.49it/s]
66%|██████▌ | 66/100 [00:02<00:01, 20.49it/s]
69%|██████▉ | 69/100 [00:03<00:01, 19.81it/s]
71%|███████ | 71/100 [00:03<00:01, 19.66it/s]
73%|███████▎ | 73/100 [00:03<00:01, 19.16it/s]
75%|███████▌ | 75/100 [00:03<00:01, 19.19it/s]
77%|███████▋ | 77/100 [00:03<00:01, 18.72it/s]
79%|███████▉ | 79/100 [00:03<00:01, 18.50it/s]
81%|████████ | 81/100 [00:03<00:01, 17.96it/s]
83%|████████▎ | 83/100 [00:03<00:00, 17.51it/s]
85%|████████▌ | 85/100 [00:03<00:00, 17.62it/s]
87%|████████▋ | 87/100 [00:04<00:00, 17.64it/s]
89%|████████▉ | 89/100 [00:04<00:00, 18.15it/s]
91%|█████████ | 91/100 [00:04<00:00, 17.16it/s]
93%|█████████▎| 93/100 [00:04<00:00, 17.43it/s]
96%|█████████▌| 96/100 [00:04<00:00, 18.53it/s]
98%|█████████▊| 98/100 [00:04<00:00, 18.76it/s]
100%|██████████| 100/100 [00:04<00:00, 20.95it/s]
-- done in 4.9723236999998335
-- done.
output shape: T7s1x123[7,50285:A10138.878048780489]
-- decode the answer...
-- done.
def print_prime(n):
"""
Print all primes between 1 and n
"""
primes = []
for num in range(2, n+1):
is_prime = True
for i in range(2, int(math.sqrt(num))+1):
if num % i == 0:
is_prime = False
break
if is_prime:
primes.append(num)
print(primes)
print_prime(20)
``
Method generate for onnx models¶
We first need to export the model into ONNX.
ONNX Conversion¶
# Remove position_ids from the export inputs and from their dynamic shapes.
if "position_ids" in export_inputs:
    del export_inputs["position_ids"]
    # Only export_inputs was tested above: pop() with a default does not
    # fail when the shapes have no entry for position_ids.
    export_shapes.pop("position_ids", None)

dtype = get_weight_type(model)
print("-- model dtype:", dtype)
# NOTE(review): presumably converts the cache tensors to the dtype of the
# model weights - confirm against to_any.
export_inputs["past_key_values"] = to_any(export_inputs["past_key_values"], dtype)

# The exporter is selected from the command line: "dynamo" switches to
# "onnx-dynamo", anything else keeps "custom".
exporter = "onnx-dynamo" if "dynamo" in sys.argv else "custom"
model_name = f"model_{model_id.replace('/', '-')}.{exporter}.onnx"
if not os.path.exists(model_name):
    # This step is slow so let's skip it if it was already done.
    print("-- conversion to ONNX.")
    begin = time.perf_counter()
    with torch_export_patches(patch_transformers=True):
        to_onnx(
            model,
            (),
            kwargs=to_any(export_inputs, device),
            dynamic_shapes=export_shapes,
            filename=model_name,
            verbose=1,
            exporter=exporter,
        )
    duration = time.perf_counter() - begin
    print(f"-- done in {duration}")
-- model dtype: torch.float16
-- conversion to ONNX.
[to_onnx] build the graph module from <class 'transformers.models.phi.modeling_phi.PhiForCausalLM'>, type(args)=<class 'tuple'>
[to_onnx] dynamic_shapes={'input_ids': {0: 'batch', 1: 'seq_length'}, 'attention_mask': {0: 'batch', 1: 'cache+seq'}, 'past_key_values': [{0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}]}
[_make_builder_interpreter] export_options=ExportOptions(aten_as_function=('aten.index_copy.default', 'aten.index_put.default', 'aten.setitem', <built-in function setitem>))
[_make_builder_interpreter] input args=()
[_make_builder_interpreter] input kwargs=dict(input_ids:T7r2,attention_mask:T7r2,past_key_values:DynamicCache(key_cache=#24[T10r4,...], value_cache=#24[T10r4,...]))
[_make_builder_interpreter] dynamic_shapes={'input_ids': {0: 'batch', 1: 'seq_length'}, 'attention_mask': {0: 'batch', 1: 'cache+seq'}, 'past_key_values': [{0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}, {0: 'batch', 2: 'cache_length'}]}
[_make_builder_interpreter] same_signature=True, tracing_mode=symbolic
[ExportOptions.export] ExportOptions(aten_as_function=('aten.index_copy.default', 'aten.index_put.default', 'aten.setitem', <built-in function setitem>)) - torch._dynamo.export 'PhiForCausalLM'
[ExportOptions.export] aten_as_function=('aten.index_copy.default', 'aten.index_put.default', 'aten.setitem', <built-in function setitem>)
[ExportOptions.export] torch_export strict=False, verbose=1
[ExportOptions.export] dynamic_shapes={'input_ids': {0: DimHint(DYNAMIC), 1: DimHint(DYNAMIC)}, 'attention_mask': {0: DimHint(DYNAMIC), 1: DimHint(DYNAMIC)}, 'past_key_values': [{0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: 
DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}]}
[ExportOptions.export] args=()
[ExportOptions.export] kwargs=dict(input_ids:T7r2,attention_mask:T7r2,past_key_values:DynamicCache(key_cache=#24[T10r4,...], value_cache=#24[T10r4,...]))
[ExportOptions.export] export start with strict=False...
[ExportOptions.export] export with backed_size_oblivious=auto
[torch_export] backed_size_oblivious='auto'
~/github/onnx-diagnostic/onnx_diagnostic/helpers/cache_helper.py:83: FutureWarning: `treespec.children_specs` is deprecated. Use `treespec.child(index)` to access a single child, or `treespec.children()` to get all children.
for subspec in spec.children_specs:
[torch_export] inferred backed_size_oblivious=None
[torch_export] export starts with backed_size_oblivious=None
[ExportOptions.export] export done in 12.40310462600246
[ExportOptions.export] post_process_exported_program with decomposition_table=None
[ExportOptions.export] remove inplace nodes
[ExportOptions.export] slices: 6 slices nodes were removed
[CustomTracer.remove_inplace] starts with 1713 nodes
[CustomTracer.remove_inplace] S1: 1 inplace nodes
[CustomTracer.remove_inplace] S2: 1 inplace nodes and 10 iterations
[CustomTracer.remove_inplace] end with 10 iterations and 1709 nodes
[ExportOptions.export] inplaces: 1 inplaced nodes were removed
[ExportOptions.export] done remove inplace in 0.054876435999176465, modified=1
[ExportOptions.export] done with no decomposition in 0.055599531999178
[to_onnx] graph module done in 12.486493407002854 s
[to_onnx] start creating the onnx nodes
[to_onnx] interpreter.function_options=FunctionOptions(export_as_function=True, name='*', domain='*', external_threshold=256, move_initializer_to_constant=True, return_initializer=True, merge_allowed=True, rename_allowed=True)
0%| | 0/1709 [00:00<?, ?it/s]
26%|██▌ | 444/1709 [00:00<00:00, 4417.52it/s]
52%|█████▏ | 886/1709 [00:00<00:00, 965.77it/s]
65%|██████▍ | 1105/1709 [00:00<00:00, 1030.38it/s]
75%|███████▍ | 1280/1709 [00:01<00:00, 1063.70it/s]
84%|████████▍ | 1433/1709 [00:01<00:00, 1093.32it/s]
92%|█████████▏| 1574/1709 [00:01<00:00, 1121.95it/s]
100%|██████████| 1709/1709 [00:01<00:00, 1038.02it/s]
100%|██████████| 1709/1709 [00:01<00:00, 1110.16it/s]
[to_onnx] 2312 onnx nodes done in 1.5416379419984878 s
[to_onnx] start conversion to onnx (before optimization) mask_outputs=[True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True]
[inline_functions] begin graph 131165578613616
[inline_functions] skip_functions=set()
[_inline_functions_iterations] inline function 'submod_3' domain 'local_functions'
[_inline_functions_iterations] 9 new nodes for 'submod_3', 'local_functions'
[inline_functions] done graph 131165578613616 in 0.0366950959978567
[GraphBuilder-IBO._add_shape_information] dynamic shapes replacements={'batch': 'batch', 'seq_length': 'seq_length', 'cache_length': 'cache_length', 'batch^s52^batch^s93': 'batch', 's77': 'batch', 's8': 'batch', 's41': 'batch', 's60': 'batch', 'batch^s104^batch^s106': 'batch', 's91': 'batch', 's64': 'batch', 'batch^s48^batch^s59': 'batch', 's36': 'batch', 's3': 'batch', 's98': 'batch', 's45': 'batch', 's59': 'batch', 's34': 'batch', 's84': 'batch', 's87': 'batch', 'batch^s29^batch^s8': 'batch', 'batch^s45^batch^s47': 'batch', 'batch^s67^batch^s61': 'batch', 'batch^s92^batch^s83': 'batch', 'batch^s98^batch^s79': 'batch', 's62': 'batch', 'batch^s39^batch^s71': 'batch', 's75': 'batch', 'batch^s3^batch^s41': 'batch', 'batch^s49^batch^s26': 'batch', 's71': 'batch', 'batch^s97^batch^s10': 'batch', 's57': 'batch', 's56': 'batch', 's100': 'batch', 's43': 'batch', 'batch^s100^batch^s102': 'batch', 's48': 'batch', 's93': 'batch', 's67': 'batch', 's1': 'batch', 'batch^s36^batch^s13': 'batch', 's49': 'batch', 's26': 'batch', 's79': 'batch', 's39': 'batch', 's83': 'batch', 'batch^s84^batch^s91': 'batch', 'batch^s34^batch^s77': 'batch', 's86': 'batch', 's13': 'batch', 's10': 'batch', 's69': 'batch', 's102': 'batch', 's92': 'batch', 's30': 'batch', 'batch^s30^batch^s89': 'batch', 's106': 'batch', 's29': 'batch', 'batch^s82^batch^s62': 'batch', 'batch^s87^batch^s23': 'batch', 's89': 'batch', 'batch^s90^batch^s57': 'batch', 'batch^s64^batch^s86': 'batch', 's104': 'batch', 's47': 'batch', 'batch^s69^batch^s56': 'batch', 'batch^s1^batch^s75': 'batch', 's52': 'batch', 's90': 'batch', 's23': 'batch', 's61': 'batch', 's35': 'batch', 'batch^s35^batch^s60': 'batch', 's72': 'batch', 's97': 'batch', 's82': 'batch', 's70': 'seq_length', 's37': 'cache_length', 's96': 'cache_length', 's105': 'cache_length', 's24': 'cache_length', 's88': 'cache_length', 's80': 'cache_length', 's51': 'cache_length', 's31': 'cache_length', 's55': 'cache_length', 's32': 'cache_length', 's101': 'cache_length', 
's74': 'cache_length', 's14': 'cache_length', 's50': 'cache_length', 's7': 'cache_length', 's40': 'cache_length', 's54': 'cache_length', 's78': 'cache_length', 's94': 'cache_length', 's95': 'cache_length', 's65': 'cache_length', 's38': 'cache_length', 's15': 'cache_length', 's66': 'cache_length', 's22': 'cache_length', 's28': 'cache_length', 's85': 'cache_length', 's63': 'cache_length', 's2': 'cache_length', 's81': 'cache_length', 's73': 'cache_length', 's4': 'cache_length', 's11': 'cache_length', 's18': 'cache_length', 's25': 'cache_length', 's46': 'cache_length', 's58': 'cache_length', 's103': 'cache_length', 's33': 'cache_length', 's99': 'cache_length', 's107': 'cache_length', 's68': 'cache_length', 's44': 'cache_length', 's42': 'cache_length', 's9': 'cache_length', 's21': 'cache_length', 's27': 'cache_length', 's76': 'cache_length'}
[GraphBuilder-IBO.optimize] start with 2320 nodes
[GraphBuilder-IBO.optimize] #patterns=103
[GraphBuilder-IBO.optimize] start with subgraphs
[GraphBuilder-IBO.optimize] done with subgraphs
[GraphBuilderPatternOptimization-IBO.optimize] start with 1991 nodes, 459 initializers, 103 patterns, priorities=[0, 1, 2, 3], max_iter=7964
[GraphBuilderPatternOptimization-IBO.optimize] same children={'SameChildrenPattern', 'SameChildrenFromInputPattern'}
[GraphBuilderPatternOptimization-IBO.optimize] iteration 0: 1991 nodes, priority=0
[GraphBuilderPatternOptimization-IBO.optimize] applies 229 matches, 75*CastPattern, 2*IdentityPattern, 1*ShapeBasedStaticExpandPattern, 96*ShapeBasedEditDistanceReshapePattern, 18*ShapeBasedIdentityPattern, 6*SameChildrenPattern, 1*SqueezeAddPattern, 1*SqueezeUnsqueezePattern, 2*SwapUnaryPattern, 3*UnsqueezeUnsqueezePattern, 24*FunctionAttentionPattern - time=0.144 | max_time=IdentityPattern:0.043
[GraphBuilderPatternOptimization-IBO.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-IBO.optimize] n_added=206, n_removed=260, n_applied=283 applied patterns, 1592 nodes left with 23 iterations
[GraphBuilderPatternOptimization-IBO.optimize] increase priority to 1
[GraphBuilderPatternOptimization-IBO.optimize] iteration 1: 1592 nodes, priority=1
[GraphBuilderPatternOptimization-IBO.optimize] applies 200 matches, 2*ConcatTwiceUnaryPattern, 49*DropoutPattern, 1*IdentityPattern, 25*LayerNormalizationPattern, 96*SlicesSplitPattern, 1*SqueezeUnsqueezePattern, 1*SwapUnaryPattern, 1*UnsqueezeUnsqueezePattern, 24*GeluOrtPattern - time=0.244 | max_time=SlicesSplitPattern:0.016
[GraphBuilderPatternOptimization-IBO.optimize] iteration 2: 1126 nodes, priority=1
[GraphBuilderPatternOptimization-IBO.optimize] applies 100 matches, 2*ConcatTwiceUnaryPattern, 25*LayerNormalizationScalePattern, 1*UnsqueezeUnsqueezePattern, 48*FunctionHalfRotaryEmbeddingPattern, 24*FastGeluPattern - time=0.120 | max_time=ShapeBasedEditDistanceReshapePattern:0.007
[GraphBuilderPatternOptimization-IBO.optimize] iteration 3: 910 nodes, priority=1
[GraphBuilderPatternOptimization-IBO.optimize] applies 24 matches, 24*SkipLayerNormalizationPattern - time=0.113 | max_time=IdentityPattern:0.010
[GraphBuilderPatternOptimization-IBO.optimize] iteration 4: 886 nodes, priority=1
[GraphBuilderPatternOptimization-IBO.optimize] increase priority to 2
[GraphBuilderPatternOptimization-IBO.optimize] iteration 5: 886 nodes, priority=2
[GraphBuilderPatternOptimization-IBO.optimize] applies 1 matches, [0]=MatchResult: ContribRotaryEmbeddingPattern replaces ['Concat', 'Concat', 'Split', 'HalfRotaryEmbedding', 'Concat'] - time=0.096 | max_time=ShapeBasedEditDistanceReshapePattern:0.009
[GraphBuilderPatternOptimization-IBO.optimize] iteration 6: 891 nodes, priority=2
[GraphBuilderPatternOptimization-IBO.optimize] applies 3 matches, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern - time=0.107 | max_time=IdentityPattern:0.007
[GraphBuilderPatternOptimization-IBO.optimize] iteration 7: 895 nodes, priority=2
[GraphBuilderPatternOptimization-IBO.optimize] applies 6 matches, 2*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.102 | max_time=ShapeBasedEditDistanceReshapePattern:0.008
[GraphBuilderPatternOptimization-IBO.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-IBO.optimize] n_added=23, n_removed=31, n_applied=625 applied patterns, 893 nodes left with 4 iterations
[GraphBuilderPatternOptimization-IBO.optimize] increase priority to 3
[GraphBuilderPatternOptimization-IBO.optimize] iteration 8: 893 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 5 matches, 1*ShapeBasedEditDistanceReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.103 | max_time=IdentityPattern:0.009
[GraphBuilderPatternOptimization-IBO.optimize] iteration 9: 899 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 10 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ReshapeReshapePattern, 4*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.102 | max_time=ShapeBasedEditDistanceReshapePattern:0.008
[GraphBuilderPatternOptimization-IBO.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-IBO.optimize] n_added=23, n_removed=31, n_applied=648 applied patterns, 892 nodes left with 4 iterations
[GraphBuilderPatternOptimization-IBO.optimize] increase priority to 3
[GraphBuilderPatternOptimization-IBO.optimize] iteration 10: 892 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 8 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.111 | max_time=ShapeBasedEditDistanceReshapePattern:0.007
[GraphBuilderPatternOptimization-IBO.optimize] iteration 11: 895 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 15 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 4*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.098 | max_time=ShapeBasedEditDistanceReshapePattern:0.008
[GraphBuilderPatternOptimization-IBO.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-IBO.optimize] n_added=23, n_removed=31, n_applied=679 applied patterns, 883 nodes left with 4 iterations
[GraphBuilderPatternOptimization-IBO.optimize] increase priority to 3
[GraphBuilderPatternOptimization-IBO.optimize] iteration 12: 883 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 12 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.113 | max_time=ShapeBasedEditDistanceReshapePattern:0.012
[GraphBuilderPatternOptimization-IBO.optimize] iteration 13: 882 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 16 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 5*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.108 | max_time=IdentityPattern:0.009
[GraphBuilderPatternOptimization-IBO.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-IBO.optimize] n_added=23, n_removed=31, n_applied=715 applied patterns, 869 nodes left with 4 iterations
[GraphBuilderPatternOptimization-IBO.optimize] increase priority to 3
[GraphBuilderPatternOptimization-IBO.optimize] iteration 14: 869 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 12 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.111 | max_time=ShapeBasedEditDistanceReshapePattern:0.009
[GraphBuilderPatternOptimization-IBO.optimize] iteration 15: 868 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 16 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 5*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.099 | max_time=ShapeBasedEditDistanceReshapePattern:0.008
[GraphBuilderPatternOptimization-IBO.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-IBO.optimize] n_added=23, n_removed=31, n_applied=751 applied patterns, 855 nodes left with 4 iterations
[GraphBuilderPatternOptimization-IBO.optimize] increase priority to 3
[GraphBuilderPatternOptimization-IBO.optimize] iteration 16: 855 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 12 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.095 | max_time=ShapeBasedEditDistanceReshapePattern:0.008
[GraphBuilderPatternOptimization-IBO.optimize] iteration 17: 854 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 16 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 5*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.137 | max_time=ShapeBasedEditDistanceReshapePattern:0.008
[GraphBuilderPatternOptimization-IBO.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-IBO.optimize] n_added=23, n_removed=31, n_applied=787 applied patterns, 841 nodes left with 4 iterations
[GraphBuilderPatternOptimization-IBO.optimize] increase priority to 3
[GraphBuilderPatternOptimization-IBO.optimize] iteration 18: 841 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 12 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.112 | max_time=ShapeBasedEditDistanceReshapePattern:0.008
[GraphBuilderPatternOptimization-IBO.optimize] iteration 19: 840 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 16 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 5*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.116 | max_time=ShapeBasedEditDistanceReshapePattern:0.010
[GraphBuilderPatternOptimization-IBO.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-IBO.optimize] n_added=23, n_removed=31, n_applied=823 applied patterns, 827 nodes left with 4 iterations
[GraphBuilderPatternOptimization-IBO.optimize] increase priority to 3
[GraphBuilderPatternOptimization-IBO.optimize] iteration 20: 827 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 12 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.100 | max_time=ShapeBasedEditDistanceReshapePattern:0.006
[GraphBuilderPatternOptimization-IBO.optimize] iteration 21: 826 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 16 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 5*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.091 | max_time=ShapeBasedEditDistanceReshapePattern:0.006
[GraphBuilderPatternOptimization-IBO.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-IBO.optimize] n_added=23, n_removed=31, n_applied=859 applied patterns, 813 nodes left with 4 iterations
[GraphBuilderPatternOptimization-IBO.optimize] increase priority to 3
[GraphBuilderPatternOptimization-IBO.optimize] iteration 22: 813 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 12 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.122 | max_time=IdentityPattern:0.014
[GraphBuilderPatternOptimization-IBO.optimize] iteration 23: 812 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 16 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 5*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.099 | max_time=IdentityPattern:0.007
[GraphBuilderPatternOptimization-IBO.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-IBO.optimize] n_added=23, n_removed=31, n_applied=895 applied patterns, 799 nodes left with 4 iterations
[GraphBuilderPatternOptimization-IBO.optimize] increase priority to 3
[GraphBuilderPatternOptimization-IBO.optimize] iteration 24: 799 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 12 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.092 | max_time=ShapeBasedEditDistanceReshapePattern:0.006
[GraphBuilderPatternOptimization-IBO.optimize] iteration 25: 798 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 16 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 5*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.089 | max_time=ShapeBasedEditDistanceReshapePattern:0.006
[GraphBuilderPatternOptimization-IBO.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-IBO.optimize] n_added=23, n_removed=31, n_applied=931 applied patterns, 785 nodes left with 4 iterations
[GraphBuilderPatternOptimization-IBO.optimize] increase priority to 3
[GraphBuilderPatternOptimization-IBO.optimize] iteration 26: 785 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 12 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.089 | max_time=Reshape2Of3Pattern:0.006
[GraphBuilderPatternOptimization-IBO.optimize] iteration 27: 784 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 16 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 5*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.087 | max_time=ShapeBasedEditDistanceReshapePattern:0.007
[GraphBuilderPatternOptimization-IBO.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-IBO.optimize] n_added=23, n_removed=31, n_applied=967 applied patterns, 771 nodes left with 4 iterations
[GraphBuilderPatternOptimization-IBO.optimize] increase priority to 3
[GraphBuilderPatternOptimization-IBO.optimize] iteration 28: 771 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 12 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.103 | max_time=IdentityPattern:0.005
[GraphBuilderPatternOptimization-IBO.optimize] iteration 29: 770 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 16 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 5*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.115 | max_time=IdentityPattern:0.008
[GraphBuilderPatternOptimization-IBO.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-IBO.optimize] n_added=23, n_removed=31, n_applied=1003 applied patterns, 757 nodes left with 4 iterations
[GraphBuilderPatternOptimization-IBO.optimize] increase priority to 3
[GraphBuilderPatternOptimization-IBO.optimize] iteration 30: 757 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 12 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.169 | max_time=ShapeBasedEditDistanceReshapePattern:0.009
[GraphBuilderPatternOptimization-IBO.optimize] iteration 31: 756 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 16 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 5*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.090 | max_time=ShapeBasedEditDistanceReshapePattern:0.005
[GraphBuilderPatternOptimization-IBO.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-IBO.optimize] n_added=23, n_removed=31, n_applied=1039 applied patterns, 743 nodes left with 4 iterations
[GraphBuilderPatternOptimization-IBO.optimize] increase priority to 3
[GraphBuilderPatternOptimization-IBO.optimize] iteration 32: 743 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 12 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.089 | max_time=SameChildrenPattern:0.005
[GraphBuilderPatternOptimization-IBO.optimize] iteration 33: 742 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 15 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 4*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.103 | max_time=IdentityPattern:0.010
[GraphBuilderPatternOptimization-IBO.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-IBO.optimize] n_added=23, n_removed=31, n_applied=1074 applied patterns, 730 nodes left with 4 iterations
[GraphBuilderPatternOptimization-IBO.optimize] increase priority to 3
[GraphBuilderPatternOptimization-IBO.optimize] iteration 34: 730 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 12 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.095 | max_time=IdentityPattern:0.007
[GraphBuilderPatternOptimization-IBO.optimize] iteration 35: 729 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 16 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 5*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.082 | max_time=ShapeBasedEditDistanceReshapePattern:0.006
[GraphBuilderPatternOptimization-IBO.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-IBO.optimize] n_added=23, n_removed=31, n_applied=1110 applied patterns, 716 nodes left with 4 iterations
[GraphBuilderPatternOptimization-IBO.optimize] increase priority to 3
[GraphBuilderPatternOptimization-IBO.optimize] iteration 36: 716 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 12 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.090 | max_time=IdentityPattern:0.005
[GraphBuilderPatternOptimization-IBO.optimize] iteration 37: 715 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 16 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 5*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.139 | max_time=ShapeBasedEditDistanceReshapePattern:0.013
[GraphBuilderPatternOptimization-IBO.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-IBO.optimize] n_added=23, n_removed=31, n_applied=1146 applied patterns, 702 nodes left with 4 iterations
[GraphBuilderPatternOptimization-IBO.optimize] increase priority to 3
[GraphBuilderPatternOptimization-IBO.optimize] iteration 38: 702 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 12 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.079 | max_time=IdentityPattern:0.005
[GraphBuilderPatternOptimization-IBO.optimize] iteration 39: 701 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 16 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 5*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.086 | max_time=IdentityPattern:0.005
[GraphBuilderPatternOptimization-IBO.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-IBO.optimize] n_added=23, n_removed=31, n_applied=1182 applied patterns, 688 nodes left with 4 iterations
[GraphBuilderPatternOptimization-IBO.optimize] increase priority to 3
[GraphBuilderPatternOptimization-IBO.optimize] iteration 40: 688 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 12 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.084 | max_time=ShapeBasedEditDistanceReshapePattern:0.005
[GraphBuilderPatternOptimization-IBO.optimize] iteration 41: 687 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 16 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 5*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.103 | max_time=ShapeBasedEditDistanceReshapePattern:0.007
[GraphBuilderPatternOptimization-IBO.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-IBO.optimize] n_added=23, n_removed=31, n_applied=1218 applied patterns, 674 nodes left with 4 iterations
[GraphBuilderPatternOptimization-IBO.optimize] increase priority to 3
[GraphBuilderPatternOptimization-IBO.optimize] iteration 42: 674 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 12 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.083 | max_time=IdentityPattern:0.007
[GraphBuilderPatternOptimization-IBO.optimize] iteration 43: 673 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 16 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 5*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.082 | max_time=ShapeBasedEditDistanceReshapePattern:0.004
[GraphBuilderPatternOptimization-IBO.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-IBO.optimize] n_added=23, n_removed=31, n_applied=1254 applied patterns, 660 nodes left with 4 iterations
[GraphBuilderPatternOptimization-IBO.optimize] increase priority to 3
[GraphBuilderPatternOptimization-IBO.optimize] iteration 44: 660 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 12 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.083 | max_time=IdentityPattern:0.006
[GraphBuilderPatternOptimization-IBO.optimize] iteration 45: 659 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 16 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 5*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.081 | max_time=IdentityPattern:0.006
[GraphBuilderPatternOptimization-IBO.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-IBO.optimize] n_added=23, n_removed=31, n_applied=1290 applied patterns, 646 nodes left with 4 iterations
[GraphBuilderPatternOptimization-IBO.optimize] increase priority to 3
[GraphBuilderPatternOptimization-IBO.optimize] iteration 46: 646 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 12 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.083 | max_time=IdentityPattern:0.005
[GraphBuilderPatternOptimization-IBO.optimize] iteration 47: 645 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 16 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 5*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.085 | max_time=IdentityPattern:0.005
[GraphBuilderPatternOptimization-IBO.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-IBO.optimize] n_added=23, n_removed=31, n_applied=1326 applied patterns, 632 nodes left with 4 iterations
[GraphBuilderPatternOptimization-IBO.optimize] increase priority to 3
[GraphBuilderPatternOptimization-IBO.optimize] iteration 48: 632 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 12 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.066 | max_time=IdentityPattern:0.004
[GraphBuilderPatternOptimization-IBO.optimize] iteration 49: 631 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 16 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 5*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.069 | max_time=IdentityPattern:0.005
[GraphBuilderPatternOptimization-IBO.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-IBO.optimize] n_added=23, n_removed=31, n_applied=1362 applied patterns, 618 nodes left with 4 iterations
[GraphBuilderPatternOptimization-IBO.optimize] increase priority to 3
[GraphBuilderPatternOptimization-IBO.optimize] iteration 50: 618 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 12 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.091 | max_time=MatMulReshape2Of3Pattern:0.004
[GraphBuilderPatternOptimization-IBO.optimize] iteration 51: 617 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 16 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 5*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern - time=0.162 | max_time=SameChildrenPattern:0.013
[GraphBuilderPatternOptimization-IBO.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-IBO.optimize] n_added=23, n_removed=31, n_applied=1398 applied patterns, 604 nodes left with 4 iterations
[GraphBuilderPatternOptimization-IBO.optimize] increase priority to 3
[GraphBuilderPatternOptimization-IBO.optimize] iteration 52: 604 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 12 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbeddingPattern, 1*ContribRotaryEmbedding3DPattern, 1*MultiHeadAttention3DPattern - time=0.079 | max_time=SameChildrenPattern:0.005
[GraphBuilderPatternOptimization-IBO.optimize] iteration 53: 601 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 15 matches, 1*ShapeBasedEditDistanceReshapePattern, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 5*SameChildrenPattern, 2*SqueezeUnsqueezePattern, 1*ContribRotaryEmbedding3DPattern - time=0.089 | max_time=IdentityPattern:0.007
[GraphBuilderPatternOptimization-IBO.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-IBO.optimize] n_added=11, n_removed=15, n_applied=1429 applied patterns, 586 nodes left with 3 iterations
[GraphBuilderPatternOptimization-IBO.optimize] increase priority to 3
[GraphBuilderPatternOptimization-IBO.optimize] iteration 54: 586 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 8 matches, 1*ShapeBasedEditDistanceReshapePattern, 5*ShapedBasedReshapePattern, 1*ReshapeReshapePattern, 1*MultiHeadAttention3DPattern - time=0.074 | max_time=IdentityPattern:0.007
[GraphBuilderPatternOptimization-IBO.optimize] iteration 55: 577 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 7 matches, 1*ShapedBasedReshapePattern, 5*ReshapeReshapePattern, 1*SameChildrenPattern - time=0.068 | max_time=SameChildrenPattern:0.004
[GraphBuilderPatternOptimization-IBO.optimize] reapply {'SameChildrenPattern'}
[GraphBuilderPatternOptimization-IBO.optimize] n_added=0, n_removed=0, n_applied=1444 applied patterns, 570 nodes left with 1 iterations
[GraphBuilderPatternOptimization-IBO.optimize] increase priority to 3
[GraphBuilderPatternOptimization-IBO.optimize] iteration 56: 570 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] applies 5 matches, 5*ShapedBasedReshapePattern - time=0.076 | max_time=IdentityPattern:0.007
[GraphBuilderPatternOptimization-IBO.optimize] iteration 57: 565 nodes, priority=3
[GraphBuilderPatternOptimization-IBO.optimize] stops current_priority_index=4, priorities=[0, 1, 2, 3]
[GraphBuilderPatternOptimization-IBO.optimize] done after 58 iterations with 565 nodes in 16.602
[GraphBuilder-IBO.optimize] done with 490 nodes in 19.529
[GraphBuilder-IBO.to_onnx] make_model 502 inits 341 params
[GraphBuilder-IBO.time_evaluation_constants_] 0.0009538510057609528
[GraphBuilder-IBO._build_initializers] start with 502 initializers, large_model=True, external_threshold=1024
[GraphBuilder-IBO._build_initializers] switch low/high order
[GraphBuilder-IBO._build_initializers] done in 7.6120013545732945e-06s with 356 initializers, 341 large initializers
[GraphBuilder-IBO._add_shape_information] dynamic shapes replacements={'batch': 'batch', 'seq_length': 'seq_length', 'cache_length': 'cache_length', 'batch^s52^batch^s93': 'batch', 's77': 'batch', 's8': 'batch', 's41': 'batch', 's60': 'batch', 'batch^s104^batch^s106': 'batch', 's91': 'batch', 's64': 'batch', 'batch^s48^batch^s59': 'batch', 's36': 'batch', 's3': 'batch', 's98': 'batch', 's45': 'batch', 's59': 'batch', 's34': 'batch', 's84': 'batch', 's87': 'batch', 'batch^s29^batch^s8': 'batch', 'batch^s45^batch^s47': 'batch', 'batch^s67^batch^s61': 'batch', 'batch^s92^batch^s83': 'batch', 'batch^s98^batch^s79': 'batch', 's62': 'batch', 'batch^s39^batch^s71': 'batch', 's75': 'batch', 'batch^s3^batch^s41': 'batch', 'batch^s49^batch^s26': 'batch', 's71': 'batch', 'batch^s97^batch^s10': 'batch', 's57': 'batch', 's56': 'batch', 's100': 'batch', 's43': 'batch', 'batch^s100^batch^s102': 'batch', 's48': 'batch', 's93': 'batch', 's67': 'batch', 's1': 'batch', 'batch^s36^batch^s13': 'batch', 's49': 'batch', 's26': 'batch', 's79': 'batch', 's39': 'batch', 's83': 'batch', 'batch^s84^batch^s91': 'batch', 'batch^s34^batch^s77': 'batch', 's86': 'batch', 's13': 'batch', 's10': 'batch', 's69': 'batch', 's102': 'batch', 's92': 'batch', 's30': 'batch', 'batch^s30^batch^s89': 'batch', 's106': 'batch', 's29': 'batch', 'batch^s82^batch^s62': 'batch', 'batch^s87^batch^s23': 'batch', 's89': 'batch', 'batch^s90^batch^s57': 'batch', 'batch^s64^batch^s86': 'batch', 's104': 'batch', 's47': 'batch', 'batch^s69^batch^s56': 'batch', 'batch^s1^batch^s75': 'batch', 's52': 'batch', 's90': 'batch', 's23': 'batch', 's61': 'batch', 's35': 'batch', 'batch^s35^batch^s60': 'batch', 's72': 'batch', 's97': 'batch', 's82': 'batch', 's70': 'seq_length', 's37': 'cache_length', 's96': 'cache_length', 's105': 'cache_length', 's24': 'cache_length', 's88': 'cache_length', 's80': 'cache_length', 's51': 'cache_length', 's31': 'cache_length', 's55': 'cache_length', 's32': 'cache_length', 's101': 'cache_length', 
's74': 'cache_length', 's14': 'cache_length', 's50': 'cache_length', 's7': 'cache_length', 's40': 'cache_length', 's54': 'cache_length', 's78': 'cache_length', 's94': 'cache_length', 's95': 'cache_length', 's65': 'cache_length', 's38': 'cache_length', 's15': 'cache_length', 's66': 'cache_length', 's22': 'cache_length', 's28': 'cache_length', 's85': 'cache_length', 's63': 'cache_length', 's2': 'cache_length', 's81': 'cache_length', 's73': 'cache_length', 's4': 'cache_length', 's11': 'cache_length', 's18': 'cache_length', 's25': 'cache_length', 's46': 'cache_length', 's58': 'cache_length', 's103': 'cache_length', 's33': 'cache_length', 's99': 'cache_length', 's107': 'cache_length', 's68': 'cache_length', 's44': 'cache_length', 's42': 'cache_length', 's9': 'cache_length', 's21': 'cache_length', 's27': 'cache_length', 's76': 'cache_length'}
[to_onnx] to_onnx done in 19.814001822000137s and 490 nodes, 356 initializers, 50 inputs, 49 outputs
-- done in 43.5993395330006
onnx_generate¶
Then we can call the function onnx_generate to generate two tokens.
This function is part of onnx_diagnostic but follows the implementation
seen earlier for a torch model.
Let’s first ask the function to return the session, so that it is not created again on the second call.
# Warm-up call: generate at most two new tokens and ask for the session back
# (return_session=True) so that the next call can reuse it.
# NOTE(review): the third positional argument (2) is presumably eos_token_id,
# confirm against the signature of onnx_generate.
_res, session, _feeds = onnx_generate(
model_name, inputs.input_ids, 2, max_new_tokens=2, return_session=True
)
# And now the full answer.
print("-- compute the answer with custom generate...")
# Only the call to onnx_generate is timed.
begin = time.perf_counter()
outputs = onnx_generate(
session, inputs.input_ids, eos_token_id=tokenizer.eos_token_id, max_new_tokens=100
)
duration = time.perf_counter() - begin
print(f"-- done in {duration}")
# Store the timing under the name "onnx" in the shared list of measurements.
data.append(dict(name="onnx", duration=duration))
print("-- done.")
print("output shape:", string_type(outputs, with_shape=True, with_min_max=True))
print("-- decode the answer...")
# Decode the generated output and keep the first decoded sequence.
text = tokenizer.batch_decode(outputs)[0]
print("-- done.")
print(text)
-- compute the answer with custom generate...
-- done in 2.345785806999629
-- done.
output shape: T7s1x123[7,50285:A10138.878048780489]
-- decode the answer...
-- done.
def print_prime(n):
"""
Print all primes between 1 and n
"""
primes = []
for num in range(2, n+1):
is_prime = True
for i in range(2, int(math.sqrt(num))+1):
if num % i == 0:
is_prime = False
break
if is_prime:
primes.append(num)
print(primes)
print_prime(20)
``
Plots¶
# Gather every recorded measurement into a table indexed by its "name" field
# and print it.
df = pandas.DataFrame(data).set_index("name")
print(df)
duration
name
generate 4.168122
custom 4.972324
onnx 2.345786

Total running time of the script: (1 minutes 2.813 seconds)
Related examples
LayerNormalization implementation cannot be exchanged