Untrained microsoft/phi-2

microsoft/phi-2 is not a big model, but it is still quite big when it comes to writing unit tests. The function onnx_diagnostic.torch_models.hghub.get_untrained_model_with_inputs() can be used to create a reduced, untrained version of a model coming from HuggingFace. It downloads the configuration from the website but creates a dummy model with 1 or 2 hidden layers in order to reduce the size and get a fast execution. The goal is usually to test the export or to compare performance; the relevance of the outputs does not matter.

Create the dummy model

import copy
import pprint
import warnings
import torch
import onnxruntime
from onnx_diagnostic import doc
from onnx_diagnostic.helpers import max_diff, string_diff, string_type
from onnx_diagnostic.helpers.cache_helper import is_cache_dynamic_registered
from onnx_diagnostic.helpers.ort_session import make_feeds
from onnx_diagnostic.torch_export_patches import bypass_export_some_errors
from onnx_diagnostic.torch_models.hghub import (
    get_untrained_model_with_inputs,
)

warnings.simplefilter("ignore")

# another tiny id: arnir0/Tiny-LLM
data = get_untrained_model_with_inputs("microsoft/phi-2")
untrained_model, inputs, dynamic_shapes, config, size, n_weights = (
    data["model"],
    data["inputs"],
    data["dynamic_shapes"],
    data["configuration"],
    data["size"],
    data["n_weights"],
)

print(f"model {size / 2**20:1.3f} Mb with {n_weights // 1000} mille parameters.")
model 432.330 Mb with 113332 mille parameters.
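
As a sanity check, the same count can be obtained directly from the model with the standard torch API (a quick sketch):

n = sum(p.numel() for p in untrained_model.parameters())
print(f"{n} parameters, i.e. {n / 1e6:.1f} million")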

The original model has 2.7 billion parameters; this untrained version has about 113 million, more than 20 times fewer. Let's look at the configuration.

print(config)
PhiConfig {
  "_attn_implementation_autoset": true,
  "architectures": [
    "PhiForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 50256,
  "embd_pdrop": 0.0,
  "eos_token_id": 50256,
  "head_dim": 80,
  "hidden_act": "gelu_new",
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 6144,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 2048,
  "model_type": "phi",
  "num_attention_heads": 32,
  "num_hidden_layers": 2,
  "num_key_value_heads": 32,
  "partial_rotary_factor": 0.4,
  "qk_layernorm": false,
  "resid_pdrop": 0.1,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.51.0.dev0",
  "use_cache": true,
  "vocab_size": 51200
}

Inputs:

print(string_type(inputs, with_shape=True))
dict(input_ids:T7s2x3,attention_mask:T7s2x33,position_ids:T7s2x3,past_key_values:DynamicCache(key_cache=#2[T1s2x32x30x80,T1s2x32x30x80], value_cache=#2[T1s2x32x30x80,T1s2x32x30x80]))

With min/max values.

print(string_type(inputs, with_shape=True, with_min_max=True))
dict(input_ids:T7s2x3[4169,41586:A24195.666666666668],attention_mask:T7s2x33[1,1:A1.0],position_ids:T7s2x3[30,32:A31.0],past_key_values:DynamicCache(key_cache=#2[T1s2x32x30x80[-4.250247001647949,4.296894073486328:A0.00039666472688185903],T1s2x32x30x80[-4.584534645080566,4.687620162963867:A0.000881607897973394]], value_cache=#2[T1s2x32x30x80[-4.445925712585449,4.611501693725586:A-0.007746423489871968],T1s2x32x30x80[-4.628787517547607,4.660802841186523:A0.0030252687874702624]]))
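
In this compact notation, T7 denotes an int64 tensor (ONNX element type 7), T1 a float32 tensor, s2x3 its shape, and the values in brackets are the minimum, the maximum and (after A) the average.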

And the dynamic shapes:

pprint.pprint(dynamic_shapes)
{'attention_mask': {0: <class 'onnx_diagnostic.torch_models.hghub.model_inputs.batch'>,
                    1: _DimHint(type=<_DimHintType.DYNAMIC: 3>)},
 'input_ids': {0: <class 'onnx_diagnostic.torch_models.hghub.model_inputs.batch'>,
               1: <class 'onnx_diagnostic.torch_models.hghub.model_inputs.seq_length'>},
 'past_key_values': [[{0: <class 'onnx_diagnostic.torch_models.hghub.model_inputs.batch'>,
                       2: <class 'onnx_diagnostic.torch_models.hghub.model_inputs.cache_length'>},
                      {0: <class 'onnx_diagnostic.torch_models.hghub.model_inputs.batch'>,
                       2: <class 'onnx_diagnostic.torch_models.hghub.model_inputs.cache_length'>}],
                     [{0: <class 'onnx_diagnostic.torch_models.hghub.model_inputs.batch'>,
                       2: <class 'onnx_diagnostic.torch_models.hghub.model_inputs.cache_length'>},
                      {0: <class 'onnx_diagnostic.torch_models.hghub.model_inputs.batch'>,
                       2: <class 'onnx_diagnostic.torch_models.hghub.model_inputs.cache_length'>}]],
 'position_ids': {0: <class 'onnx_diagnostic.torch_models.hghub.model_inputs.batch'>,
                  1: _DimHint(type=<_DimHintType.DYNAMIC: 3>)}}

We execute the model to produce expected outputs.

expected = untrained_model(**copy.deepcopy(inputs))
print(f"expected: {string_type(expected, with_shape=True, with_min_max=True)}")
expected: dict(logits:T1s2x3x51200[-2.3363828659057617,2.3946869373321533:A0.0014542278057812533],past_key_values:DynamicCache(key_cache=#2[T1s2x32x33x80[-4.250247001647949,4.296894073486328:A0.000191058535003111],T1s2x32x33x80[-4.584534645080566,4.687620162963867:A0.0002568543326058364]], value_cache=#2[T1s2x32x33x80[-4.445925712585449,4.611501693725586:A-0.007276355170857727],T1s2x32x33x80[-4.628787517547607,4.660802841186523:A0.0036349890182309982]]))
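
Note that the returned cache now has a sequence length of 33: the 30 past positions plus the 3 new tokens, which matches the attention mask of shape 2x33.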

Export

with bypass_export_some_errors(patch_transformers=True) as modificator:

    # The following checks are not necessary but are useful in case of an error.
    # We check the cache is registered.
    assert is_cache_dynamic_registered()

    # We check there are no discrepancies when the patches are applied.
    d = max_diff(expected, untrained_model(**copy.deepcopy(inputs)))
    assert (
        d["abs"] < 1e-5
    ), f"The model with patches produces different outputs: {string_diff(d)}"

    # Then we export.
    ep = torch.export.export(
        untrained_model,
        (),
        kwargs=modificator(copy.deepcopy(inputs)),
        dynamic_shapes=dynamic_shapes,
        strict=False,  # mandatory for torch==2.6
    )

    # We check the exported program produces the same results as well.
    d = max_diff(expected, ep.module()(**copy.deepcopy(inputs)))
    assert d["abs"] < 1e-5, f"The exported model different outputs: {string_diff(d)}"

Export to ONNX

The export works. We can now export to ONNX. Patches are still needed because the export calls torch.export.ExportedProgram.run_decompositions(), which may export local pieces of the model again.

with bypass_export_some_errors(patch_transformers=True):
    epo = torch.onnx.export(
        ep, (), kwargs=copy.deepcopy(inputs), dynamic_shapes=dynamic_shapes, dynamo=True
    )
[torch.onnx] Run decomposition...
[torch.onnx] Run decomposition... ✅
[torch.onnx] Translate the graph into ONNX...
[torch.onnx] Translate the graph into ONNX... ✅
Applied 53 of general pattern rewrite rules.

We can save it.

epo.save("plot_export_tiny_phi2.onnx", external_data=True)

# Or directly get the onnx.ModelProto.
onx = epo.model_proto
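
Since external_data=True stores the weights in a separate file next to plot_export_tiny_phi2.onnx, the model can be reloaded later with onnx.load as long as both files stay together (a quick sketch):

import onnx

reloaded = onnx.load("plot_export_tiny_phi2.onnx")
print(f"reloaded model with {len(reloaded.graph.node)} nodes")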

Discrepancies

Then we check the conversion to ONNX: let's make sure the ONNX model produces the same outputs. Note that the ONNX model takes flattened inputs.

feeds = make_feeds(onx, copy.deepcopy(inputs), use_numpy=True, copy=True)

print(f"torch inputs: {string_type(inputs)}")
print(f"onxrt inputs: {string_type(feeds)}")
torch inputs: dict(input_ids:T7r2,attention_mask:T7r2,position_ids:T7r2,past_key_values:DynamicCache(key_cache=#2[T1r4,T1r4], value_cache=#2[T1r4,T1r4]))
onxrt inputs: dict(input_ids:A7r2,attention_mask:A7r2,position_ids:A7r2,past_key_values_key_cache_0:A1r4,past_key_values_key_cache_1:A1r4,past_key_values_value_cache_0:A1r4,past_key_values_value_cache_1:A1r4)
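
make_feeds flattened the DynamicCache into one input per tensor. A hand-written equivalent would look like the following sketch (assuming the input names follow the pattern printed above):

manual_feeds = {
    "input_ids": inputs["input_ids"].numpy(),
    "attention_mask": inputs["attention_mask"].numpy(),
    "position_ids": inputs["position_ids"].numpy(),
}
cache = inputs["past_key_values"]
for i, (k, v) in enumerate(zip(cache.key_cache, cache.value_cache)):
    manual_feeds[f"past_key_values_key_cache_{i}"] = k.numpy()
    manual_feeds[f"past_key_values_value_cache_{i}"] = v.numpy()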

We then create an onnxruntime.InferenceSession.

sess = onnxruntime.InferenceSession(
    onx.SerializeToString(), providers=["CPUExecutionProvider"]
)
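
A quick sanity check: the session input names must match the keys of the feeds (standard onnxruntime API):

print([i.name for i in sess.get_inputs()])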

Let’s run.

got = sess.run(None, feeds)

And finally the discrepancies.

diff = max_diff(expected, got, flatten=True)
print(f"onnx discrepancies: {string_diff(diff)}")
onnx discrepancies: abs=2.086162567138672e-06, rel=0.0008807701056327913, n=983040.0

It looks good: the maximum absolute difference stays around 2e-6, and n=983040 is the total number of compared values (2x3x51200 logits plus four cache tensors of shape 2x32x33x80).

doc.plot_legend("untrained smaller\nmicrosoft/phi-2", "torch.onnx.export", "orange")

Total running time of the script: (0 minutes 23.712 seconds)

Related examples

Test the export on untrained models

Steel method forward to guess the dynamic shapes (with Tiny-LLM)

Export Tiny-LLM with patches

Gallery generated by Sphinx-Gallery