Untrained microsoft/phi-2¶
microsoft/phi-2 is not a big model, but it is still quite big
when it comes to writing unit tests. The function
onnx_diagnostic.torch_models.hghub.get_untrained_model_with_inputs()
can be used to create a reduced, untrained version of a model coming from
HuggingFace. It downloads the configuration from the website
but creates a dummy model with 1 or 2 hidden layers in order to reduce
the size and get a fast execution. The goal is usually to test
the export or to compare performance; the relevance of the outputs does not matter.
Create the dummy model¶
import copy
import pprint
import warnings
import torch
import onnxruntime
from onnx_diagnostic import doc
from onnx_diagnostic.helpers import max_diff, string_diff, string_type
from onnx_diagnostic.helpers.cache_helper import is_cache_dynamic_registered
from onnx_diagnostic.helpers.ort_session import make_feeds
from onnx_diagnostic.torch_export_patches import bypass_export_some_errors
from onnx_diagnostic.torch_models.hghub import (
    get_untrained_model_with_inputs,
)
warnings.simplefilter("ignore")
# another tiny id: arnir0/Tiny-LLM
data = get_untrained_model_with_inputs("microsoft/phi-2")
untrained_model, inputs, dynamic_shapes, config, size, n_weights = (
    data["model"],
    data["inputs"],
    data["dynamic_shapes"],
    data["configuration"],
    data["size"],
    data["n_weights"],
)
print(f"model {size / 2**20:1.3f} Mb with {n_weights // 1000} mille parameters.")
model 432.330 Mb with 113332 mille parameters.
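The count can be verified directly on the instantiated model (a quick sanity check, not part of the original script):
n = sum(p.numel() for p in untrained_model.parameters())
print(f"{n} parameters, i.e. {n // 1000} thousand")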
The original model has 2.7 billion parameters; the dummy version is more than ten times smaller. Let’s look at the configuration.
print(config)
PhiConfig {
  "_attn_implementation_autoset": true,
  "architectures": [
    "PhiForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 50256,
  "embd_pdrop": 0.0,
  "eos_token_id": 50256,
  "head_dim": 80,
  "hidden_act": "gelu_new",
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 6144,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 2048,
  "model_type": "phi",
  "num_attention_heads": 32,
  "num_hidden_layers": 2,
  "num_key_value_heads": 32,
  "partial_rotary_factor": 0.4,
  "qk_layernorm": false,
  "resid_pdrop": 0.1,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.51.0.dev0",
  "use_cache": true,
  "vocab_size": 51200
}
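For comparison, the configuration of the original checkpoint can be fetched with transformers; this downloads only the configuration file, not the weights (network access assumed):
from transformers import AutoConfig
original_config = AutoConfig.from_pretrained("microsoft/phi-2")
# The real model has 32 hidden layers, the dummy one only 2.
print(original_config.num_hidden_layers)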
Inputs:
print(string_type(inputs, with_shape=True))
dict(input_ids:T7s2x3,attention_mask:T7s2x33,position_ids:T7s2x3,past_key_values:DynamicCache(key_cache=#2[T1s2x32x30x80,T1s2x32x30x80], value_cache=#2[T1s2x32x30x80,T1s2x32x30x80]))
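In this notation, the digit after T follows the ONNX element type codes (T7 is an int64 tensor, T1 a float32 tensor) and the s... suffix gives the shape. A minimal check of the convention:
print(string_type(torch.zeros(2, 3, dtype=torch.int64), with_shape=True))
# expected: T7s2x3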
With min/max values.
print(string_type(inputs, with_shape=True, with_min_max=True))
dict(input_ids:T7s2x3[4169,41586:A24195.666666666668],attention_mask:T7s2x33[1,1:A1.0],position_ids:T7s2x3[30,32:A31.0],past_key_values:DynamicCache(key_cache=#2[T1s2x32x30x80[-4.250247001647949,4.296894073486328:A0.00039666472688185903],T1s2x32x30x80[-4.584534645080566,4.687620162963867:A0.000881607897973394]], value_cache=#2[T1s2x32x30x80[-4.445925712585449,4.611501693725586:A-0.007746423489871968],T1s2x32x30x80[-4.628787517547607,4.660802841186523:A0.0030252687874702624]]))
And the dynamic shapes:
pprint.pprint(dynamic_shapes)
{'attention_mask': {0: <class 'onnx_diagnostic.torch_models.hghub.model_inputs.batch'>,
                    1: _DimHint(type=<_DimHintType.DYNAMIC: 3>)},
 'input_ids': {0: <class 'onnx_diagnostic.torch_models.hghub.model_inputs.batch'>,
               1: <class 'onnx_diagnostic.torch_models.hghub.model_inputs.seq_length'>},
 'past_key_values': [[{0: <class 'onnx_diagnostic.torch_models.hghub.model_inputs.batch'>,
                       2: <class 'onnx_diagnostic.torch_models.hghub.model_inputs.cache_length'>},
                      {0: <class 'onnx_diagnostic.torch_models.hghub.model_inputs.batch'>,
                       2: <class 'onnx_diagnostic.torch_models.hghub.model_inputs.cache_length'>}],
                     [{0: <class 'onnx_diagnostic.torch_models.hghub.model_inputs.batch'>,
                       2: <class 'onnx_diagnostic.torch_models.hghub.model_inputs.cache_length'>},
                      {0: <class 'onnx_diagnostic.torch_models.hghub.model_inputs.batch'>,
                       2: <class 'onnx_diagnostic.torch_models.hghub.model_inputs.cache_length'>}]],
 'position_ids': {0: <class 'onnx_diagnostic.torch_models.hghub.model_inputs.batch'>,
                  1: _DimHint(type=<_DimHintType.DYNAMIC: 3>)}}
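The helper returns its own dimension classes, but the same constraints could be spelled out by hand with torch.export dimension objects. A sketch of an equivalent specification (not what the helper actually returns):
batch = torch.export.Dim("batch")
seq_length = torch.export.Dim("seq_length")
cache_length = torch.export.Dim("cache_length")
manual_shapes = {
    "input_ids": {0: batch, 1: seq_length},
    "attention_mask": {0: batch, 1: torch.export.Dim.DYNAMIC},
    "position_ids": {0: batch, 1: torch.export.Dim.DYNAMIC},
    "past_key_values": [
        [{0: batch, 2: cache_length}, {0: batch, 2: cache_length}],
        [{0: batch, 2: cache_length}, {0: batch, 2: cache_length}],
    ],
}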
We execute the model to produce expected outputs.
expected = untrained_model(**copy.deepcopy(inputs))
print(f"expected: {string_type(expected, with_shape=True, with_min_max=True)}")
expected: dict(logits:T1s2x3x51200[-2.3363828659057617,2.3946869373321533:A0.0014542278057812533],past_key_values:DynamicCache(key_cache=#2[T1s2x32x33x80[-4.250247001647949,4.296894073486328:A0.000191058535003111],T1s2x32x33x80[-4.584534645080566,4.687620162963867:A0.0002568543326058364]], value_cache=#2[T1s2x32x33x80[-4.445925712585449,4.611501693725586:A-0.007276355170857727],T1s2x32x33x80[-4.628787517547607,4.660802841186523:A0.0036349890182309982]]))
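The inputs are deep-copied before every call because transformers updates the DynamicCache in place: the cache length grew from 30 to 33 (30 past tokens plus 3 new ones), as the shapes above show. A way to observe it (a sketch, assuming the in-place update):
fresh = copy.deepcopy(inputs)
untrained_model(**fresh)
# The cache held by ``fresh`` now contains 33 tokens, not 30.
print(string_type(fresh["past_key_values"], with_shape=True))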
Export¶
with bypass_export_some_errors(patch_transformers=True) as modificator:
    # These steps are not strictly necessary, but they help when an error occurs.
    # We check the cache is registered.
    assert is_cache_dynamic_registered()
    # We check there is no discrepancy once the patches are applied.
    d = max_diff(expected, untrained_model(**copy.deepcopy(inputs)))
    assert (
        d["abs"] < 1e-5
    ), f"The model with patches produces different outputs: {string_diff(d)}"
    # Then we export.
    ep = torch.export.export(
        untrained_model,
        (),
        kwargs=modificator(copy.deepcopy(inputs)),
        dynamic_shapes=dynamic_shapes,
        strict=False,  # mandatory for torch==2.6
    )
    # We check the exported program produces the same results as well.
    d = max_diff(expected, ep.module()(**copy.deepcopy(inputs)))
    assert d["abs"] < 1e-5, f"The exported model produces different outputs: {string_diff(d)}"
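Before moving on, the exported program can be inspected; printing it dumps the captured graph together with its dynamic shapes:
print(ep)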
Export to ONNX¶
The export works. We can export to ONNX now.
Patches are still needed because the export applies
torch.export.ExportedProgram.run_decompositions(),
which may export local pieces of the model again.
with bypass_export_some_errors(patch_transformers=True):
    epo = torch.onnx.export(
        ep, (), kwargs=copy.deepcopy(inputs), dynamic_shapes=dynamic_shapes, dynamo=True
    )
[torch.onnx] Run decomposition...
[torch.onnx] Run decomposition... ✅
[torch.onnx] Translate the graph into ONNX...
[torch.onnx] Translate the graph into ONNX... ✅
Applied 53 of general pattern rewrite rules.
We can save it.
epo.save("plot_export_tiny_phi2.onnx", external_data=True)
# Or directly get the :class:`onnx.ModelProto`.
onx = epo.model_proto
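An optional sanity check with onnx.checker; passing the path rather than the proto lets the checker locate the external data file:
import onnx
onnx.checker.check_model("plot_export_tiny_phi2.onnx")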
Discrepancies¶
Then we check the conversion to ONNX. Let’s make sure the ONNX model produces the same outputs. It takes flattened inputs.
feeds = make_feeds(onx, copy.deepcopy(inputs), use_numpy=True, copy=True)
print(f"torch inputs: {string_type(inputs)}")
print(f"onxrt inputs: {string_type(feeds)}")
torch inputs: dict(input_ids:T7r2,attention_mask:T7r2,position_ids:T7r2,past_key_values:DynamicCache(key_cache=#2[T1r4,T1r4], value_cache=#2[T1r4,T1r4]))
onxrt inputs: dict(input_ids:A7r2,attention_mask:A7r2,position_ids:A7r2,past_key_values_key_cache_0:A1r4,past_key_values_key_cache_1:A1r4,past_key_values_value_cache_0:A1r4,past_key_values_value_cache_1:A1r4)
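The flattened names must match the inputs declared in the ONNX graph; they can be listed straight from the proto:
print([i.name for i in onx.graph.input])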
We then create an onnxruntime.InferenceSession.
sess = onnxruntime.InferenceSession(
    onx.SerializeToString(), providers=["CPUExecutionProvider"]
)
Let’s run.
got = sess.run(None, feeds)
And finally the discrepancies.
diff = max_diff(expected, got, flatten=True)
print(f"onnx discrepancies: {string_diff(diff)}")
onnx discrepancies: abs=2.086162567138672e-06, rel=0.0008807701056327913, n=983040.0
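In a unit test, this comparison would typically become an assertion with a tolerance (the threshold below is a suggestion, not part of the original script):
assert diff["abs"] < 1e-4, f"ONNX conversion discrepancies: {string_diff(diff)}"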
It looks good.
doc.plot_legend("untrained smaller\nmicrosoft/phi-2", "torch.onnx.export", "orange")

Total running time of the script: (0 minutes 23.712 seconds)