Note
Go to the end to download the full example code.
Export Phi-3.5-mini-instruct piece by piece¶
torch.export.export() often breaks on big models because control flow
or other instructions break the propagation of
dynamic shapes (see …). The function usually indicates where
the model implementation can be fixed, but when that is not possible,
we can try to export the model piece by piece: every module
is converted separately from its submodules. A model can be exported even
if one of its submodules cannot.
Model¶
import pprint
from typing import Any, Dict
import torch
import torch._export.tools
import transformers
from onnx_diagnostic.helpers.cache_helper import make_dynamic_cache
from experimental_experiment.helpers import string_type
from experimental_experiment.torch_interpreter.piece_by_piece import (
trace_execution_piece_by_piece,
)
def get_phi35_untrained(batch_size: int = 2, **kwargs) -> Dict[str, Any]:
    """
    Gets an untrained model (built from the configuration only, no pretrained
    weights are loaded) with two sets of inputs and different shapes.

    The shapes of the generated inputs (number of cached layers, number of
    key/value heads, head dimension, vocabulary size) are derived from the
    final configuration, so they stay consistent with the model when
    *kwargs* overrides one of these values.

    :param batch_size: batch size of the first set of inputs,
        the second set uses ``batch_size + 1``
    :param kwargs: to overwrite the configuration, example ``num_hidden_layers=1``
    :return: dictionary with keys ``model``, ``inputs``, ``inputs2``

    See `Phi-3.5-mini-instruct/config.json
    <https://huggingface.co/microsoft/Phi-3.5-mini-instruct/blob/main/config.json>`_.
    """
    config = {
        "_name_or_path": "Phi-3.5-mini-instruct",
        "architectures": ["Phi3ForCausalLM"],
        "attention_dropout": 0.0,
        "auto_map": {
            "AutoConfig": "configuration_phi3.Phi3Config",
            "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM",
        },
        "bos_token_id": 1,
        "embd_pdrop": 0.0,
        "eos_token_id": 32000,
        "hidden_act": "silu",
        "hidden_size": 3072,
        "initializer_range": 0.02,
        "intermediate_size": 8192,
        "max_position_embeddings": 131072,
        "model_type": "phi3",
        "num_attention_heads": 32,
        "num_hidden_layers": 32,
        "num_key_value_heads": 32,
        "original_max_position_embeddings": 4096,
        "pad_token_id": 32000,
        "resid_pdrop": 0.0,
        "rms_norm_eps": 1e-05,
        "rope_scaling": {
            "long_factor": [
                1.0800000429153442,
                1.1100000143051147,
                1.1399999856948853,
                1.340000033378601,
                1.5899999141693115,
                1.600000023841858,
                1.6200000047683716,
                2.620000123977661,
                3.2300000190734863,
                3.2300000190734863,
                4.789999961853027,
                7.400000095367432,
                7.700000286102295,
                9.09000015258789,
                12.199999809265137,
                17.670000076293945,
                24.46000099182129,
                28.57000160217285,
                30.420001983642578,
                30.840002059936523,
                32.590003967285156,
                32.93000411987305,
                42.320003509521484,
                44.96000289916992,
                50.340003967285156,
                50.45000457763672,
                57.55000305175781,
                57.93000411987305,
                58.21000289916992,
                60.1400032043457,
                62.61000442504883,
                62.62000274658203,
                62.71000289916992,
                63.1400032043457,
                63.1400032043457,
                63.77000427246094,
                63.93000411987305,
                63.96000289916992,
                63.970001220703125,
                64.02999877929688,
                64.06999969482422,
                64.08000183105469,
                64.12000274658203,
                64.41000366210938,
                64.4800033569336,
                64.51000213623047,
                64.52999877929688,
                64.83999633789062,
            ],
            "short_factor": [
                1.0,
                1.0199999809265137,
                1.0299999713897705,
                1.0299999713897705,
                1.0499999523162842,
                1.0499999523162842,
                1.0499999523162842,
                1.0499999523162842,
                1.0499999523162842,
                1.0699999332427979,
                1.0999999046325684,
                1.1099998950958252,
                1.1599998474121094,
                1.1599998474121094,
                1.1699998378753662,
                1.2899998426437378,
                1.339999794960022,
                1.679999828338623,
                1.7899998426437378,
                1.8199998140335083,
                1.8499997854232788,
                1.8799997568130493,
                1.9099997282028198,
                1.9399996995925903,
                1.9899996519088745,
                2.0199997425079346,
                2.0199997425079346,
                2.0199997425079346,
                2.0199997425079346,
                2.0199997425079346,
                2.0199997425079346,
                2.0299997329711914,
                2.0299997329711914,
                2.0299997329711914,
                2.0299997329711914,
                2.0299997329711914,
                2.0299997329711914,
                2.0299997329711914,
                2.0299997329711914,
                2.0299997329711914,
                2.0799996852874756,
                2.0899996757507324,
                2.189999580383301,
                2.2199995517730713,
                2.5899994373321533,
                2.729999542236328,
                2.749999523162842,
                2.8399994373321533,
            ],
            "type": "longrope",
        },
        "rope_theta": 10000.0,
        "sliding_window": 262144,
        "tie_word_embeddings": False,
        "torch_dtype": "bfloat16",
        "use_cache": True,
        "attention_bias": False,
        "vocab_size": 32064,
    }
    config.update(**kwargs)
    conf = transformers.Phi3Config(**config)
    model = transformers.Phi3ForCausalLM(conf)
    model.eval()

    # Derive every input dimension from the merged configuration instead of
    # hard-coding the defaults, so that overrides passed through kwargs
    # produce inputs matching the model. With the default configuration
    # these evaluate to 32 heads, 96 = 3072 // 32 and 32064.
    n_layers = config["num_hidden_layers"]
    n_kv_heads = config["num_key_value_heads"]
    # assumes the per-head dimension is hidden_size // num_attention_heads
    # (true for the default configuration) -- TODO confirm if the
    # configuration ever carries an explicit head dimension
    head_dim = config["hidden_size"] // config["num_attention_heads"]
    vocab_size = config["vocab_size"]

    # (cached length, new tokens) for each set of inputs; the attention mask
    # covers both, hence its length is the sum of the two.
    past_len, seq_len = 30, 3
    past_len2, seq_len2 = 31, 4
    batch_size2 = batch_size + 1

    # The random tensors are created in the same order as before
    # (cache, cache2, input_ids, input_ids2) to keep seeded runs reproducible.
    cache = make_dynamic_cache(
        [
            (
                torch.randn(batch_size, n_kv_heads, past_len, head_dim),
                torch.randn(batch_size, n_kv_heads, past_len, head_dim),
            )
            for _ in range(n_layers)
        ]
    )
    cache2 = make_dynamic_cache(
        [
            (
                torch.randn(batch_size2, n_kv_heads, past_len2, head_dim),
                torch.randn(batch_size2, n_kv_heads, past_len2, head_dim),
            )
            for _ in range(n_layers)
        ]
    )

    inputs = dict(
        input_ids=torch.randint(0, vocab_size, (batch_size, seq_len)).to(torch.int64),
        attention_mask=torch.ones((batch_size, past_len + seq_len)).to(torch.int64),
        past_key_values=cache,
    )
    inputs2 = dict(
        input_ids=torch.randint(0, vocab_size, (batch_size2, seq_len2)).to(torch.int64),
        attention_mask=torch.ones((batch_size2, past_len2 + seq_len2)).to(torch.int64),
        past_key_values=cache2,
    )
    return dict(inputs=inputs, model=model, inputs2=inputs2)
# Build a reduced model with only two hidden layers to keep the example light,
# then pull out the model and both sets of inputs.
data = get_phi35_untrained(num_hidden_layers=2)
model = data["model"]
inputs = data["inputs"]
inputs2 = data["inputs2"]
# Display the type and shape of every input of the first set.
print(string_type(inputs, with_shape=True))
dict(input_ids:T7s2x3,attention_mask:T7s2x33,past_key_values:DynamicCache(key_cache=#2[T1s2x32x30x96,T1s2x32x30x96], value_cache=#2[T1s2x32x30x96,T1s2x32x30x96]))
Dynamic Shapes¶
We want to infer the dynamic shapes from the two sets of inputs we gave. For that, we use a function to trace the execution of the model including its submodules. It executes the model twice, once with each set of inputs, and stores every intermediate input and output.
# Trace the model and its submodules on both sets of inputs;
# verbose=2 prints the detailed trace.
diag = trace_execution_piece_by_piece(
    model,
    [inputs, inputs2],
    verbose=2,
)
[_trace_forward_execution] -trace- M:__main__-Phi3ForCausalLM.forward
[_trace_forward_execution] -trace- .. M:model-Phi3Model.forward
[_trace_forward_execution] -trace- .... M:embed_tokens-Embedding.forward
[_trace_forward_execution] -trace- .... M:layers[0]-Phi3DecoderLayer.forward
[_trace_forward_execution] -trace- ...... M:self_attn-Phi3Attention.forward
[_trace_forward_execution] -trace- ........ M:o_proj-Linear.forward
[_trace_forward_execution] -trace- ........ M:qkv_proj-Linear.forward
[_trace_forward_execution] -trace- ...... M:mlp-Phi3MLP.forward
[_trace_forward_execution] -trace- ........ M:gate_up_proj-Linear.forward
[_trace_forward_execution] -trace- ........ M:down_proj-Linear.forward
[_trace_forward_execution] -trace- ........ M:activation_fn-SiLUActivation.forward
[_trace_forward_execution] -trace- ...... M:input_layernorm-Phi3RMSNorm.forward
[_trace_forward_execution] -trace- ...... M:post_attention_layernorm-Phi3RMSNorm.forward
[_trace_forward_execution] -trace- ...... M:resid_attn_dropout-Dropout.forward
[_trace_forward_execution] -trace- ...... M:resid_mlp_dropout-Dropout.forward
[_trace_forward_execution] -trace- .... M:layers[1]-Phi3DecoderLayer.forward
[_trace_forward_execution] -trace- ...... M:self_attn-Phi3Attention.forward
[_trace_forward_execution] -trace- ........ M:o_proj-Linear.forward
[_trace_forward_execution] -trace- ........ M:qkv_proj-Linear.forward
[_trace_forward_execution] -trace- ...... M:mlp-Phi3MLP.forward
[_trace_forward_execution] -trace- ........ M:gate_up_proj-Linear.forward
[_trace_forward_execution] -trace- ........ M:down_proj-Linear.forward
[_trace_forward_execution] -trace- ........ M:activation_fn-SiLUActivation.forward
[_trace_forward_execution] -trace- ...... M:input_layernorm-Phi3RMSNorm.forward
[_trace_forward_execution] -trace- ...... M:post_attention_layernorm-Phi3RMSNorm.forward
[_trace_forward_execution] -trace- ...... M:resid_attn_dropout-Dropout.forward
[_trace_forward_execution] -trace- ...... M:resid_mlp_dropout-Dropout.forward
[_trace_forward_execution] -trace- .... M:norm-Phi3RMSNorm.forward
[_trace_forward_execution] -trace- .... M:rotary_emb-Phi3RotaryEmbedding.forward
[_trace_forward_execution] -trace- .. M:lm_head-Linear.forward
[trace_execution_piece_by_piece] run with dict(args:(),kwargs:dict(input_ids:T7s2x3,attention_mask:T7s2x33,past_key_values:DynamicCache(key_cache=#2[T1s2x32x30x96,T1s2x32x30x96], value_cache=#2[T1s2x32x30x96,T1s2x32x30x96])))
[__main__:Phi3ForCausalLM] > **dict(input_ids:T7r2,attention_mask:T7r2,past_key_values:DynamicCache(key_cache=#2[T1r4,T1r4], value_cache=#2[T1r4,T1r4]))
[model:Phi3Model] > **dict(input_ids:T7r2,attention_mask:T7r2,position_ids:None,past_key_values:DynamicCache(key_cache=#2[T1r4,T1r4], value_cache=#2[T1r4,T1r4]),inputs_embeds:None,use_cache:None,cache_position:None)
[embed_tokens:Embedding] > T7r2
[embed_tokens:Embedding] < T1r3
[rotary_emb:Phi3RotaryEmbedding] > *(T1r3,), **dict(position_ids:T7r2)
[rotary_emb:Phi3RotaryEmbedding] < *(T1r3,T1r3)
[layers[0]:Phi3DecoderLayer] > *(T1r3,), **dict(attention_mask:T9r4,position_ids:T7r2,past_key_values:DynamicCache(key_cache=#2[T1r4,T1r4], value_cache=#2[T1r4,T1r4]),use_cache:bool,cache_position:T7r1,position_embeddings:(T1r3,T1r3))
[input_layernorm:Phi3RMSNorm] > T1r3
[input_layernorm:Phi3RMSNorm] < T1r3
[self_attn:Phi3Attention] > **dict(hidden_states:T1r3,attention_mask:T9r4,position_ids:T7r2,past_key_values:DynamicCache(key_cache=#2[T1r4,T1r4], value_cache=#2[T1r4,T1r4]),use_cache:bool,cache_position:T7r1,position_embeddings:(T1r3,T1r3))
[qkv_proj:Linear] > T1r3
[qkv_proj:Linear] < T1r3
[o_proj:Linear] > T1r3
[o_proj:Linear] < T1r3
[self_attn:Phi3Attention] < *(T1r3,None)
[resid_attn_dropout:Dropout] > T1r3
[resid_attn_dropout:Dropout] < T1r3
[post_attention_layernorm:Phi3RMSNorm] > T1r3
[post_attention_layernorm:Phi3RMSNorm] < T1r3
[mlp:Phi3MLP] > T1r3
[gate_up_proj:Linear] > T1r3
[gate_up_proj:Linear] < T1r3
[activation_fn:SiLUActivation] > T1r3
[activation_fn:SiLUActivation] < T1r3
[down_proj:Linear] > T1r3
[down_proj:Linear] < T1r3
[mlp:Phi3MLP] < T1r3
[resid_mlp_dropout:Dropout] > T1r3
[resid_mlp_dropout:Dropout] < T1r3
[layers[0]:Phi3DecoderLayer] < T1r3
[layers[1]:Phi3DecoderLayer] > *(T1r3,), **dict(attention_mask:T9r4,position_ids:T7r2,past_key_values:DynamicCache(key_cache=#2[T1r4,T1r4], value_cache=#2[T1r4,T1r4]),use_cache:bool,cache_position:T7r1,position_embeddings:(T1r3,T1r3))
[input_layernorm:Phi3RMSNorm] > T1r3
[input_layernorm:Phi3RMSNorm] < T1r3
[self_attn:Phi3Attention] > **dict(hidden_states:T1r3,attention_mask:T9r4,position_ids:T7r2,past_key_values:DynamicCache(key_cache=#2[T1r4,T1r4], value_cache=#2[T1r4,T1r4]),use_cache:bool,cache_position:T7r1,position_embeddings:(T1r3,T1r3))
[qkv_proj:Linear] > T1r3
[qkv_proj:Linear] < T1r3
[o_proj:Linear] > T1r3
[o_proj:Linear] < T1r3
[self_attn:Phi3Attention] < *(T1r3,None)
[resid_attn_dropout:Dropout] > T1r3
[resid_attn_dropout:Dropout] < T1r3
[post_attention_layernorm:Phi3RMSNorm] > T1r3
[post_attention_layernorm:Phi3RMSNorm] < T1r3
[mlp:Phi3MLP] > T1r3
[gate_up_proj:Linear] > T1r3
[gate_up_proj:Linear] < T1r3
[activation_fn:SiLUActivation] > T1r3
[activation_fn:SiLUActivation] < T1r3
[down_proj:Linear] > T1r3
[down_proj:Linear] < T1r3
[mlp:Phi3MLP] < T1r3
[resid_mlp_dropout:Dropout] > T1r3
[resid_mlp_dropout:Dropout] < T1r3
[layers[1]:Phi3DecoderLayer] < T1r3
[norm:Phi3RMSNorm] > T1r3
[norm:Phi3RMSNorm] < T1r3
[model:Phi3Model] < *BaseModelOutputWithPast(last_hidden_state:T1r3,past_key_values:DynamicCache(key_cache=#2[T1r4,T1r4], value_cache=#2[T1r4,T1r4]))
[lm_head:Linear] > T1r3
[lm_head:Linear] < T1r3
[__main__:Phi3ForCausalLM] < *CausalLMOutputWithPast(logits:T1r3,past_key_values:DynamicCache(key_cache=#2[T1r4,T1r4], value_cache=#2[T1r4,T1r4]))
[trace_execution_piece_by_piece] run with dict(args:(),kwargs:dict(input_ids:T7s3x4,attention_mask:T7s3x35,past_key_values:DynamicCache(key_cache=#2[T1s3x32x31x96,T1s3x32x31x96], value_cache=#2[T1s3x32x31x96,T1s3x32x31x96])))
[__main__:Phi3ForCausalLM] > **dict(input_ids:T7r2,attention_mask:T7r2,past_key_values:DynamicCache(key_cache=#2[T1r4,T1r4], value_cache=#2[T1r4,T1r4]))
[model:Phi3Model] > **dict(input_ids:T7r2,attention_mask:T7r2,position_ids:None,past_key_values:DynamicCache(key_cache=#2[T1r4,T1r4], value_cache=#2[T1r4,T1r4]),inputs_embeds:None,use_cache:None,cache_position:None)
[embed_tokens:Embedding] > T7r2
[embed_tokens:Embedding] < T1r3
[rotary_emb:Phi3RotaryEmbedding] > *(T1r3,), **dict(position_ids:T7r2)
[rotary_emb:Phi3RotaryEmbedding] < *(T1r3,T1r3)
[layers[0]:Phi3DecoderLayer] > *(T1r3,), **dict(attention_mask:T9r4,position_ids:T7r2,past_key_values:DynamicCache(key_cache=#2[T1r4,T1r4], value_cache=#2[T1r4,T1r4]),use_cache:bool,cache_position:T7r1,position_embeddings:(T1r3,T1r3))
[input_layernorm:Phi3RMSNorm] > T1r3
[input_layernorm:Phi3RMSNorm] < T1r3
[self_attn:Phi3Attention] > **dict(hidden_states:T1r3,attention_mask:T9r4,position_ids:T7r2,past_key_values:DynamicCache(key_cache=#2[T1r4,T1r4], value_cache=#2[T1r4,T1r4]),use_cache:bool,cache_position:T7r1,position_embeddings:(T1r3,T1r3))
[qkv_proj:Linear] > T1r3
[qkv_proj:Linear] < T1r3
[o_proj:Linear] > T1r3
[o_proj:Linear] < T1r3
[self_attn:Phi3Attention] < *(T1r3,None)
[resid_attn_dropout:Dropout] > T1r3
[resid_attn_dropout:Dropout] < T1r3
[post_attention_layernorm:Phi3RMSNorm] > T1r3
[post_attention_layernorm:Phi3RMSNorm] < T1r3
[mlp:Phi3MLP] > T1r3
[gate_up_proj:Linear] > T1r3
[gate_up_proj:Linear] < T1r3
[activation_fn:SiLUActivation] > T1r3
[activation_fn:SiLUActivation] < T1r3
[down_proj:Linear] > T1r3
[down_proj:Linear] < T1r3
[mlp:Phi3MLP] < T1r3
[resid_mlp_dropout:Dropout] > T1r3
[resid_mlp_dropout:Dropout] < T1r3
[layers[0]:Phi3DecoderLayer] < T1r3
[layers[1]:Phi3DecoderLayer] > *(T1r3,), **dict(attention_mask:T9r4,position_ids:T7r2,past_key_values:DynamicCache(key_cache=#2[T1r4,T1r4], value_cache=#2[T1r4,T1r4]),use_cache:bool,cache_position:T7r1,position_embeddings:(T1r3,T1r3))
[input_layernorm:Phi3RMSNorm] > T1r3
[input_layernorm:Phi3RMSNorm] < T1r3
[self_attn:Phi3Attention] > **dict(hidden_states:T1r3,attention_mask:T9r4,position_ids:T7r2,past_key_values:DynamicCache(key_cache=#2[T1r4,T1r4], value_cache=#2[T1r4,T1r4]),use_cache:bool,cache_position:T7r1,position_embeddings:(T1r3,T1r3))
[qkv_proj:Linear] > T1r3
[qkv_proj:Linear] < T1r3
[o_proj:Linear] > T1r3
[o_proj:Linear] < T1r3
[self_attn:Phi3Attention] < *(T1r3,None)
[resid_attn_dropout:Dropout] > T1r3
[resid_attn_dropout:Dropout] < T1r3
[post_attention_layernorm:Phi3RMSNorm] > T1r3
[post_attention_layernorm:Phi3RMSNorm] < T1r3
[mlp:Phi3MLP] > T1r3
[gate_up_proj:Linear] > T1r3
[gate_up_proj:Linear] < T1r3
[activation_fn:SiLUActivation] > T1r3
[activation_fn:SiLUActivation] < T1r3
[down_proj:Linear] > T1r3
[down_proj:Linear] < T1r3
[mlp:Phi3MLP] < T1r3
[resid_mlp_dropout:Dropout] > T1r3
[resid_mlp_dropout:Dropout] < T1r3
[layers[1]:Phi3DecoderLayer] < T1r3
[norm:Phi3RMSNorm] > T1r3
[norm:Phi3RMSNorm] < T1r3
[model:Phi3Model] < *BaseModelOutputWithPast(last_hidden_state:T1r3,past_key_values:DynamicCache(key_cache=#2[T1r4,T1r4], value_cache=#2[T1r4,T1r4]))
[lm_head:Linear] > T1r3
[lm_head:Linear] < T1r3
[__main__:Phi3ForCausalLM] < *CausalLMOutputWithPast(logits:T1r3,past_key_values:DynamicCache(key_cache=#2[T1r4,T1r4], value_cache=#2[T1r4,T1r4]))
[trace_forward_execution] traced execution of model Phi3ForCausalLM
>>> __main__: Phi3ForCausalLM
> ((),dict(input_ids:CT7s2x3[2805,21466:A14035.333333333334],attention_mask:CT7s2x33[1,1:A1.0],past_key_values:DynamicCache(key_cache=#2[CT1s2x32x30x96[-5.321568965911865,4.642333030700684:A-0.0017679614187095713],CT1s2x32x30x96[-4.208118438720703,4.5439982414245605:A0.0016461648380537073]], value_cache=#2[CT1s2x32x30x96[-4.218534469604492,4.59930944442749:A-0.006064819536683874],CT1s2x32x30x96[-4.209028720855713,5.0260820388793945:A0.002745364887231304]])))
> ((),dict(input_ids:CT7s3x4[1693,26014:A13294.0],attention_mask:CT7s3x35[1,1:A1.0],past_key_values:DynamicCache(key_cache=#2[CT1s3x32x31x96[-4.695406436920166,4.258121490478516:A-0.0014835018764388604],CT1s3x32x31x96[-4.697518348693848,4.683842658996582:A-0.0008565901604404504]], value_cache=#2[CT1s3x32x31x96[-4.510615348815918,4.348228454589844:A0.0008330248852151018],CT1s3x32x31x96[-4.735829830169678,5.559142112731934:A0.00048002360800640374]])))
>>> model: Phi3Model
> ((),dict(input_ids:CT7s2x3[2805,21466:A14035.333333333334],attention_mask:CT7s2x33[1,1:A1.0],position_ids:None,past_key_values:DynamicCache(key_cache=#2[CT1s2x32x30x96[-5.321568965911865,4.642333030700684:A-0.0017679614187095713],CT1s2x32x30x96[-4.208118438720703,4.5439982414245605:A0.0016461648380537073]], value_cache=#2[CT1s2x32x30x96[-4.218534469604492,4.59930944442749:A-0.006064819536683874],CT1s2x32x30x96[-4.209028720855713,5.0260820388793945:A0.002745364887231304]]),inputs_embeds:None,use_cache:None,cache_position:None))
> ((),dict(input_ids:CT7s3x4[1693,26014:A13294.0],attention_mask:CT7s3x35[1,1:A1.0],position_ids:None,past_key_values:DynamicCache(key_cache=#2[CT1s3x32x31x96[-4.695406436920166,4.258121490478516:A-0.0014835018764388604],CT1s3x32x31x96[-4.697518348693848,4.683842658996582:A-0.0008565901604404504]], value_cache=#2[CT1s3x32x31x96[-4.510615348815918,4.348228454589844:A0.0008330248852151018],CT1s3x32x31x96[-4.735829830169678,5.559142112731934:A0.00048002360800640374]]),inputs_embeds:None,use_cache:None,cache_position:None))
>>> embed_tokens: Embedding
> ((CT7s2x3[2805,21466:A14035.333333333334],),{})
> ((CT7s3x4[1693,26014:A13294.0],),{})
< (CT1s2x3x3072[-0.09246909618377686,0.07369624078273773:A-3.130963592056255e-05],)
< (CT1s3x4x3072[-0.07948524504899979,0.07863165438175201:A6.865088794481968e-05],)
<<<
>>> layers[0]: Phi3DecoderLayer
> ((CT1s2x3x3072[-0.09246909618377686,0.07369624078273773:A-3.130963592056255e-05],),dict(attention_mask:CT9s2x1x3x33[False,True:A0.9696969696969697],position_ids:CT7s1x3[30,32:A31.0],past_key_values:DynamicCache(key_cache=#2[CT1s2x32x30x96[-5.321568965911865,4.642333030700684:A-0.0017679614187095713],CT1s2x32x30x96[-4.208118438720703,4.5439982414245605:A0.0016461648380537073]], value_cache=#2[CT1s2x32x30x96[-4.218534469604492,4.59930944442749:A-0.006064819536683874],CT1s2x32x30x96[-4.209028720855713,5.0260820388793945:A0.002745364887231304]]),use_cache:bool=True,cache_position:CT7s3[30,32:A31.0],position_embeddings:(CT1s1x3x96[-1.1855769157409668,1.1902371644973755:A0.746652018013669],CT1s1x3x96[-1.1887905597686768,1.190193772315979:A0.1589894221542636])))
> ((CT1s3x4x3072[-0.07948524504899979,0.07863165438175201:A6.865088794481968e-05],),dict(attention_mask:CT9s3x1x4x35[False,True:A0.9571428571428572],position_ids:CT7s1x4[31,34:A32.5],past_key_values:DynamicCache(key_cache=#2[CT1s3x32x31x96[-4.695406436920166,4.258121490478516:A-0.0014835018764388604],CT1s3x32x31x96[-4.697518348693848,4.683842658996582:A-0.0008565901604404504]], value_cache=#2[CT1s3x32x31x96[-4.510615348815918,4.348228454589844:A0.0008330248852151018],CT1s3x32x31x96[-4.735829830169678,5.559142112731934:A0.00048002360800640374]]),use_cache:bool=True,cache_position:CT7s4[31,34:A32.5],position_embeddings:(CT1s1x4x96[-1.1855769157409668,1.190237045288086:A0.7129333875218435],CT1s1x4x96[-1.1719439029693604,1.1902378797531128:A0.18296290554159592])))
>>> self_attn: Phi3Attention
> ((),dict(hidden_states:CT1s2x3x3072[-4.630746364593506,3.6906232833862305:A-0.0014726865415836737],attention_mask:CT9s2x1x3x33[False,True:A0.9696969696969697],position_ids:CT7s1x3[30,32:A31.0],past_key_values:DynamicCache(key_cache=#2[CT1s2x32x30x96[-5.321568965911865,4.642333030700684:A-0.0017679614187095713],CT1s2x32x30x96[-4.208118438720703,4.5439982414245605:A0.0016461648380537073]], value_cache=#2[CT1s2x32x30x96[-4.218534469604492,4.59930944442749:A-0.006064819536683874],CT1s2x32x30x96[-4.209028720855713,5.0260820388793945:A0.002745364887231304]]),use_cache:bool=True,cache_position:CT7s3[30,32:A31.0],position_embeddings:(CT1s1x3x96[-1.1855769157409668,1.1902371644973755:A0.746652018013669],CT1s1x3x96[-1.1887905597686768,1.190193772315979:A0.1589894221542636])))
> ((),dict(hidden_states:CT1s3x4x3072[-3.937323808670044,3.880868911743164:A0.003424602768633785],attention_mask:CT9s3x1x4x35[False,True:A0.9571428571428572],position_ids:CT7s1x4[31,34:A32.5],past_key_values:DynamicCache(key_cache=#2[CT1s3x32x31x96[-4.695406436920166,4.258121490478516:A-0.0014835018764388604],CT1s3x32x31x96[-4.697518348693848,4.683842658996582:A-0.0008565901604404504]], value_cache=#2[CT1s3x32x31x96[-4.510615348815918,4.348228454589844:A0.0008330248852151018],CT1s3x32x31x96[-4.735829830169678,5.559142112731934:A0.00048002360800640374]]),use_cache:bool=True,cache_position:CT7s4[31,34:A32.5],position_embeddings:(CT1s1x4x96[-1.1855769157409668,1.190237045288086:A0.7129333875218435],CT1s1x4x96[-1.1719439029693604,1.1902378797531128:A0.18296290554159592])))
>>> o_proj: Linear
> ((CT1s2x3x3072[-2.6784136295318604,2.9553494453430176:A-0.006247291141995366],),{})
> ((CT1s3x4x3072[-2.4470973014831543,2.807598114013672:A-1.320671074693062e-05],),{})
< (CT1s2x3x3072[-1.6150957345962524,1.6153888702392578:A-0.0020455685180928006],)
< (CT1s3x4x3072[-1.8017123937606812,1.5505903959274292:A-0.0004669091483088374],)
<<<
>>> qkv_proj: Linear
> ((CT1s2x3x3072[-4.630746364593506,3.6906232833862305:A-0.0014726865415836737],),{})
> ((CT1s3x4x3072[-3.937323808670044,3.880868911743164:A0.003424602768633785],),{})
< (CT1s2x3x9216[-4.3074750900268555,4.560797691345215:A0.003889466427002238],)
< (CT1s3x4x9216[-4.575429439544678,4.392276287078857:A0.0007742521622983784],)
<<<
< (CT1s2x3x3072[-1.6150957345962524,1.6153888702392578:A-0.0020455685180928006],None)
< (CT1s3x4x3072[-1.8017123937606812,1.5505903959274292:A-0.0004669091483088374],None)
<<<
>>> mlp: Phi3MLP
> ((CT1s2x3x3072[-3.89996600151062,3.892240524291992:A-0.005018209103335083],),{})
> ((CT1s3x4x3072[-4.527905464172363,3.8988120555877686:A-0.0010598260709656021],),{})
>>> gate_up_proj: Linear
> ((CT1s2x3x3072[-3.89996600151062,3.892240524291992:A-0.005018209103335083],),{})
> ((CT1s3x4x3072[-4.527905464172363,3.8988120555877686:A-0.0010598260709656021],),{})
< (CT1s2x3x16384[-5.326481342315674,5.053717136383057:A-0.005484532955991976],)
< (CT1s3x4x16384[-4.692765235900879,4.91367244720459:A0.003873056516350578],)
<<<
>>> down_proj: Linear
> ((CT1s2x3x8192[-9.957571983337402,9.760546684265137:A0.0004086510726069791],),{})
> ((CT1s3x4x8192[-10.153356552124023,12.667742729187012:A-0.0022295268024867926],),{})
< (CT1s2x3x3072[-5.296715259552002,5.8315043449401855:A-0.0034154122044785457],)
< (CT1s3x4x3072[-5.51116943359375,6.273403167724609:A0.005032625723124006],)
<<<
>>> activation_fn: SiLUActivation
> ((CT1s2x3x8192[-5.326481342315674,5.053717136383057:A-0.011311644454508496],),{})
> ((CT1s3x4x8192[-4.692765235900879,4.91367244720459:A0.009215649440452722],),{})
< (CT1s2x3x8192[-0.27846455574035645,5.021651268005371:A0.24074798986483872],)
< (CT1s3x4x8192[-0.27846455574035645,4.877842426300049:A0.251591828373712],)
<<<
< (CT1s2x3x3072[-5.296715259552002,5.8315043449401855:A-0.0034154122044785457],)
< (CT1s3x4x3072[-5.51116943359375,6.273403167724609:A0.005032625723124006],)
<<<
>>> input_layernorm: Phi3RMSNorm
> ((CT1s2x3x3072[-0.09246909618377686,0.07369624078273773:A-3.130963592056255e-05],),{})
> ((CT1s3x4x3072[-0.07948524504899979,0.07863165438175201:A6.865088794481968e-05],),{})
< (CT1s2x3x3072[-4.630746364593506,3.6906232833862305:A-0.0014726865415836737],)
< (CT1s3x4x3072[-3.937323808670044,3.880868911743164:A0.003424602768633785],)
<<<
>>> post_attention_layernorm: Phi3RMSNorm
> ((CT1s2x3x3072[-1.626574993133545,1.572633147239685:A-0.0020768782157681975],),{})
> ((CT1s3x4x3072[-1.7993048429489136,1.5446910858154297:A-0.00039825813672757075],),{})
< (CT1s2x3x3072[-3.89996600151062,3.892240524291992:A-0.005018209103335083],)
< (CT1s3x4x3072[-4.527905464172363,3.8988120555877686:A-0.0010598260709656021],)
<<<
>>> resid_attn_dropout: Dropout
> ((CT1s2x3x3072[-1.6150957345962524,1.6153888702392578:A-0.0020455685180928006],),{})
> ((CT1s3x4x3072[-1.8017123937606812,1.5505903959274292:A-0.0004669091483088374],),{})
< (CT1s2x3x3072[-1.6150957345962524,1.6153888702392578:A-0.0020455685180928006],)
< (CT1s3x4x3072[-1.8017123937606812,1.5505903959274292:A-0.0004669091483088374],)
<<<
>>> resid_mlp_dropout: Dropout
> ((CT1s2x3x3072[-5.296715259552002,5.8315043449401855:A-0.0034154122044785457],),{})
> ((CT1s3x4x3072[-5.51116943359375,6.273403167724609:A0.005032625723124006],),{})
< (CT1s2x3x3072[-5.296715259552002,5.8315043449401855:A-0.0034154122044785457],)
< (CT1s3x4x3072[-5.51116943359375,6.273403167724609:A0.005032625723124006],)
<<<
< (CT1s2x3x3072[-5.492254257202148,6.447257041931152:A-0.005492290605969983],)
< (CT1s3x4x3072[-5.337061405181885,6.1045732498168945:A0.0046343675318338586],)
<<<
>>> layers[1]: Phi3DecoderLayer
> ((CT1s2x3x3072[-5.492254257202148,6.447257041931152:A-0.005492290605969983],),dict(attention_mask:CT9s2x1x3x33[False,True:A0.9696969696969697],position_ids:CT7s1x3[30,32:A31.0],past_key_values:DynamicCache(key_cache=#2[CT1s2x32x33x96[-5.321568965911865,5.071047306060791:A-0.0010730019231920205],CT1s2x32x30x96[-4.208118438720703,4.5439982414245605:A0.0016461648380537073]], value_cache=#2[CT1s2x32x33x96[-4.218534469604492,4.59930944442749:A-0.005605154250588178],CT1s2x32x30x96[-4.209028720855713,5.0260820388793945:A0.002745364887231304]]),use_cache:bool=True,cache_position:CT7s3[30,32:A31.0],position_embeddings:(CT1s1x3x96[-1.1855769157409668,1.1902371644973755:A0.746652018013669],CT1s1x3x96[-1.1887905597686768,1.190193772315979:A0.1589894221542636])))
> ((CT1s3x4x3072[-5.337061405181885,6.1045732498168945:A0.0046343675318338586],),dict(attention_mask:CT9s3x1x4x35[False,True:A0.9571428571428572],position_ids:CT7s1x4[31,34:A32.5],past_key_values:DynamicCache(key_cache=#2[CT1s3x32x35x96[-5.2906494140625,4.7679948806762695:A-0.0028739331352421174],CT1s3x32x31x96[-4.697518348693848,4.683842658996582:A-0.0008565901604404504]], value_cache=#2[CT1s3x32x35x96[-4.575429439544678,4.348228454589844:A0.0014538525100350908],CT1s3x32x31x96[-4.735829830169678,5.559142112731934:A0.00048002360800640374]]),use_cache:bool=True,cache_position:CT7s4[31,34:A32.5],position_embeddings:(CT1s1x4x96[-1.1855769157409668,1.190237045288086:A0.7129333875218435],CT1s1x4x96[-1.1719439029693604,1.1902378797531128:A0.18296290554159592])))
>>> self_attn: Phi3Attention
> ((),dict(hidden_states:CT1s2x3x3072[-3.9389030933380127,4.539912223815918:A-0.0043732160782262225],attention_mask:CT9s2x1x3x33[False,True:A0.9696969696969697],position_ids:CT7s1x3[30,32:A31.0],past_key_values:DynamicCache(key_cache=#2[CT1s2x32x33x96[-5.321568965911865,5.071047306060791:A-0.0010730019231920205],CT1s2x32x30x96[-4.208118438720703,4.5439982414245605:A0.0016461648380537073]], value_cache=#2[CT1s2x32x33x96[-4.218534469604492,4.59930944442749:A-0.005605154250588178],CT1s2x32x30x96[-4.209028720855713,5.0260820388793945:A0.002745364887231304]]),use_cache:bool=True,cache_position:CT7s3[30,32:A31.0],position_embeddings:(CT1s1x3x96[-1.1855769157409668,1.1902371644973755:A0.746652018013669],CT1s1x3x96[-1.1887905597686768,1.190193772315979:A0.1589894221542636])))
> ((),dict(hidden_states:CT1s3x4x3072[-3.6848440170288086,4.238821983337402:A0.003227063433569624],attention_mask:CT9s3x1x4x35[False,True:A0.9571428571428572],position_ids:CT7s1x4[31,34:A32.5],past_key_values:DynamicCache(key_cache=#2[CT1s3x32x35x96[-5.2906494140625,4.7679948806762695:A-0.0028739331352421174],CT1s3x32x31x96[-4.697518348693848,4.683842658996582:A-0.0008565901604404504]], value_cache=#2[CT1s3x32x35x96[-4.575429439544678,4.348228454589844:A0.0014538525100350908],CT1s3x32x31x96[-4.735829830169678,5.559142112731934:A0.00048002360800640374]]),use_cache:bool=True,cache_position:CT7s4[31,34:A32.5],position_embeddings:(CT1s1x4x96[-1.1855769157409668,1.190237045288086:A0.7129333875218435],CT1s1x4x96[-1.1719439029693604,1.1902378797531128:A0.18296290554159592])))
>>> o_proj: Linear
> ((CT1s2x3x3072[-2.4762864112854004,2.7124173641204834:A0.0026660227320708507],),{})
> ((CT1s3x4x3072[-2.4498472213745117,2.3301873207092285:A0.004975156723096797],),{})
< (CT1s2x3x3072[-1.697070598602295,1.7238038778305054:A-0.003707572536408558],)
< (CT1s3x4x3072[-1.585153579711914,1.7237988710403442:A-0.0026312880371316774],)
<<<
>>> qkv_proj: Linear
> ((CT1s2x3x3072[-3.9389030933380127,4.539912223815918:A-0.0043732160782262225],),{})
> ((CT1s3x4x3072[-3.6848440170288086,4.238821983337402:A0.003227063433569624],),{})
< (CT1s2x3x9216[-4.680721759796143,4.633952617645264:A0.0016646715417302268],)
< (CT1s3x4x9216[-4.391646862030029,4.912417888641357:A0.007990984870226023],)
<<<
< (CT1s2x3x3072[-1.697070598602295,1.7238038778305054:A-0.003707572536408558],None)
< (CT1s3x4x3072[-1.585153579711914,1.7237988710403442:A-0.0026312880371316774],None)
<<<
>>> mlp: Phi3MLP
> ((CT1s2x3x3072[-4.085476398468018,4.620463848114014:A-0.006951781481834531],),{})
> ((CT1s3x4x3072[-3.741267442703247,3.943660020828247:A0.0013388905007409118],),{})
>>> gate_up_proj: Linear
> ((CT1s2x3x3072[-4.085476398468018,4.620463848114014:A-0.006951781481834531],),{})
> ((CT1s3x4x3072[-3.741267442703247,3.943660020828247:A0.0013388905007409118],),{})
< (CT1s2x3x16384[-4.922957420349121,4.562531471252441:A0.004650305251132636],)
< (CT1s3x4x16384[-4.949065208435059,5.174188137054443:A0.00030220871082479545],)
<<<
>>> down_proj: Linear
> ((CT1s2x3x8192[-8.283615112304688,9.017322540283203:A0.005167366500767285],),{})
> ((CT1s3x4x8192[-9.517515182495117,8.992436408996582:A-0.002715212943913454],),{})
< (CT1s2x3x3072[-5.384060859680176,4.820015907287598:A-0.011407251573448067],)
< (CT1s3x4x3072[-5.838794231414795,5.128146648406982:A0.0016962297066179923],)
<<<
>>> activation_fn: SiLUActivation
> ((CT1s2x3x8192[-4.288193225860596,4.562531471252441:A-0.0010666609790395871],),{})
> ((CT1s3x4x8192[-4.949065208435059,5.174188137054443:A0.0025969812957432246],),{})
< (CT1s2x3x8192[-0.27846455574035645,4.515410423278809:A0.2449151218159792],)
< (CT1s3x4x8192[-0.27846455574035645,5.1450629234313965:A0.2456222603444075],)
<<<
< (CT1s2x3x3072[-5.384060859680176,4.820015907287598:A-0.011407251573448067],)
< (CT1s3x4x3072[-5.838794231414795,5.128146648406982:A0.0016962297066179923],)
<<<
>>> input_layernorm: Phi3RMSNorm
> ((CT1s2x3x3072[-5.492254257202148,6.447257041931152:A-0.005492290605969983],),{})
> ((CT1s3x4x3072[-5.337061405181885,6.1045732498168945:A0.0046343675318338586],),{})
< (CT1s2x3x3072[-3.9389030933380127,4.539912223815918:A-0.0043732160782262225],)
< (CT1s3x4x3072[-3.6848440170288086,4.238821983337402:A0.003227063433569624],)
<<<
>>> post_attention_layernorm: Phi3RMSNorm
> ((CT1s2x3x3072[-6.087521553039551,6.884673595428467:A-0.009199862836592528],),{})
> ((CT1s3x4x3072[-5.423608779907227,5.876498699188232:A0.0020030792350098636],),{})
< (CT1s2x3x3072[-4.085476398468018,4.620463848114014:A-0.006951781481834531],)
< (CT1s3x4x3072[-3.741267442703247,3.943660020828247:A0.0013388905007409118],)
<<<
>>> resid_attn_dropout: Dropout
> ((CT1s2x3x3072[-1.697070598602295,1.7238038778305054:A-0.003707572536408558],),{})
> ((CT1s3x4x3072[-1.585153579711914,1.7237988710403442:A-0.0026312880371316774],),{})
< (CT1s2x3x3072[-1.697070598602295,1.7238038778305054:A-0.003707572536408558],)
< (CT1s3x4x3072[-1.585153579711914,1.7237988710403442:A-0.0026312880371316774],)
<<<
>>> resid_mlp_dropout: Dropout
> ((CT1s2x3x3072[-5.384060859680176,4.820015907287598:A-0.011407251573448067],),{})
> ((CT1s3x4x3072[-5.838794231414795,5.128146648406982:A0.0016962297066179923],),{})
< (CT1s2x3x3072[-5.384060859680176,4.820015907287598:A-0.011407251573448067],)
< (CT1s3x4x3072[-5.838794231414795,5.128146648406982:A0.0016962297066179923],)
<<<
< (CT1s2x3x3072[-8.117108345031738,8.247547149658203:A-0.020607114706055855],)
< (CT1s3x4x3072[-8.020179748535156,7.753192901611328:A0.003699309034244733],)
<<<
>>> norm: Phi3RMSNorm
> ((CT1s2x3x3072[-8.117108345031738,8.247547149658203:A-0.020607114706055855],),{})
> ((CT1s3x4x3072[-8.020179748535156,7.753192901611328:A0.003699309034244733],),{})
< (CT1s2x3x3072[-4.039342880249023,4.108699798583984:A-0.010241697305276453],)
< (CT1s3x4x3072[-4.0763678550720215,3.8849968910217285:A0.001773235593723715],)
<<<
>>> rotary_emb: Phi3RotaryEmbedding
> ((CT1s2x3x3072[-0.09246909618377686,0.07369624078273773:A-3.130963592056255e-05],),dict(position_ids:CT7s1x3[30,32:A31.0]))
> ((CT1s3x4x3072[-0.07948524504899979,0.07863165438175201:A6.865088794481968e-05],),dict(position_ids:CT7s1x4[31,34:A32.5]))
< (CT1s1x3x96[-1.1855769157409668,1.1902371644973755:A0.746652018013669],CT1s1x3x96[-1.1887905597686768,1.190193772315979:A0.1589894221542636])
< (CT1s1x4x96[-1.1855769157409668,1.190237045288086:A0.7129333875218435],CT1s1x4x96[-1.1719439029693604,1.1902378797531128:A0.18296290554159592])
<<<
< (dict(last_hidden_state:CT1s2x3x3072[-4.039342880249023,4.108699798583984:A-0.010241697305276453],past_key_values:DynamicCache(key_cache=#2[CT1s2x32x33x96[-5.321568965911865,5.071047306060791:A-0.0010730019231920205],CT1s2x32x33x96[-4.981376647949219,4.765379905700684:A0.0025422255894368135]], value_cache=#2[CT1s2x32x33x96[-4.218534469604492,4.59930944442749:A-0.005605154250588178],CT1s2x32x33x96[-4.646946907043457,5.0260820388793945:A0.001844236126241237]])),)
< (dict(last_hidden_state:CT1s3x4x3072[-4.0763678550720215,3.8849968910217285:A0.001773235593723715],past_key_values:DynamicCache(key_cache=#2[CT1s3x32x35x96[-5.2906494140625,4.7679948806762695:A-0.0028739331352421174],CT1s3x32x35x96[-5.3778486251831055,6.231717109680176:A-0.0017655372309230257]], value_cache=#2[CT1s3x32x35x96[-4.575429439544678,4.348228454589844:A0.0014538525100350908],CT1s3x32x35x96[-4.735829830169678,5.559142112731934:A0.0021215061804070365]])),)
<<<
>>> lm_head: Linear
> ((CT1s2x3x3072[-4.039342880249023,4.108699798583984:A-0.010241697305276453],),{})
> ((CT1s3x4x3072[-4.0763678550720215,3.8849968910217285:A0.001773235593723715],),{})
< (CT1s2x3x32064[-4.781582355499268,4.740781307220459:A-0.000884814476446486],)
< (CT1s3x4x32064[-5.196939945220947,5.246708869934082:A-0.0011729046776157495],)
<<<
< (dict(logits:CT1s2x3x32064[-4.781582355499268,4.740781307220459:A-0.000884814476446486],past_key_values:DynamicCache(key_cache=#2[CT1s2x32x33x96[-5.321568965911865,5.071047306060791:A-0.0010730019231920205],CT1s2x32x33x96[-4.981376647949219,4.765379905700684:A0.0025422255894368135]], value_cache=#2[CT1s2x32x33x96[-4.218534469604492,4.59930944442749:A-0.005605154250588178],CT1s2x32x33x96[-4.646946907043457,5.0260820388793945:A0.001844236126241237]])),)
< (dict(logits:CT1s3x4x32064[-5.196939945220947,5.246708869934082:A-0.0011729046776157495],past_key_values:DynamicCache(key_cache=#2[CT1s3x32x35x96[-5.2906494140625,4.7679948806762695:A-0.0028739331352421174],CT1s3x32x35x96[-5.3778486251831055,6.231717109680176:A-0.0017655372309230257]], value_cache=#2[CT1s3x32x35x96[-4.575429439544678,4.348228454589844:A0.0014538525100350908],CT1s3x32x35x96[-4.735829830169678,5.559142112731934:A0.0021215061804070365]])),)
<<<
[_untrace_forward_execution] M:__main__-Phi3ForCausalLM
[_untrace_forward_execution] .. M:model-Phi3Model
[_untrace_forward_execution] .... M:embed_tokens-Embedding
[_untrace_forward_execution] .... M:layers[0]-Phi3DecoderLayer
[_untrace_forward_execution] ...... M:self_attn-Phi3Attention
[_untrace_forward_execution] ........ M:o_proj-Linear
[_untrace_forward_execution] ........ M:qkv_proj-Linear
[_untrace_forward_execution] ...... M:mlp-Phi3MLP
[_untrace_forward_execution] ........ M:gate_up_proj-Linear
[_untrace_forward_execution] ........ M:down_proj-Linear
[_untrace_forward_execution] ........ M:activation_fn-SiLUActivation
[_untrace_forward_execution] ...... M:input_layernorm-Phi3RMSNorm
[_untrace_forward_execution] ...... M:post_attention_layernorm-Phi3RMSNorm
[_untrace_forward_execution] ...... M:resid_attn_dropout-Dropout
[_untrace_forward_execution] ...... M:resid_mlp_dropout-Dropout
[_untrace_forward_execution] .... M:layers[1]-Phi3DecoderLayer
[_untrace_forward_execution] ...... M:self_attn-Phi3Attention
[_untrace_forward_execution] ........ M:o_proj-Linear
[_untrace_forward_execution] ........ M:qkv_proj-Linear
[_untrace_forward_execution] ...... M:mlp-Phi3MLP
[_untrace_forward_execution] ........ M:gate_up_proj-Linear
[_untrace_forward_execution] ........ M:down_proj-Linear
[_untrace_forward_execution] ........ M:activation_fn-SiLUActivation
[_untrace_forward_execution] ...... M:input_layernorm-Phi3RMSNorm
[_untrace_forward_execution] ...... M:post_attention_layernorm-Phi3RMSNorm
[_untrace_forward_execution] ...... M:resid_attn_dropout-Dropout
[_untrace_forward_execution] ...... M:resid_mlp_dropout-Dropout
[_untrace_forward_execution] .... M:norm-Phi3RMSNorm
[_untrace_forward_execution] .... M:rotary_emb-Phi3RotaryEmbedding
[_untrace_forward_execution] .. M:lm_head-Linear
Now that we keep in memory every input/output of the submodules, we can guess the dynamic shapes for each of them. The final ones:
dynamic_shapes = diag.guess_dynamic_shapes()
print("The dynamic shapes are:")
pprint.pprint(dynamic_shapes)
The dynamic shapes are:
((),
{'attention_mask': {0: DimHint(DYNAMIC), 1: DimHint(DYNAMIC)},
'input_ids': {0: DimHint(DYNAMIC), 1: DimHint(DYNAMIC)},
'past_key_values': [{0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)},
{0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)},
{0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)},
{0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}]})
And here are the dynamic shapes for all the traced submodules.
print(
diag.pretty_text(
with_dynamic_shape=True,
with_shape=False,
with_min_max=False,
with_device=False,
with_inputs=False,
).replace("<_DimHint.DYNAMIC: 3>", "DYN")
)
>>> __main__: Phi3ForCausalLM
DS=((), {'attention_mask': {0: DimHint(DYNAMIC), 1: DimHint(DYNAMIC)}, 'input_ids': {0: DimHint(DYNAMIC), 1: DimHint(DYNAMIC)}, 'past_key_values': [{0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}]})
>>> model: Phi3Model
DS=((), {'attention_mask': {0: DimHint(DYNAMIC), 1: DimHint(DYNAMIC)}, 'cache_position': None, 'input_ids': {0: DimHint(DYNAMIC), 1: DimHint(DYNAMIC)}, 'inputs_embeds': None, 'past_key_values': [{0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}], 'position_ids': None, 'use_cache': None})
>>> embed_tokens: Embedding: DS=(({0: DimHint(DYNAMIC), 1: DimHint(DYNAMIC)},), {}) <<<
>>> layers[0]: Phi3DecoderLayer
DS=(({0: DimHint(DYNAMIC), 1: DimHint(DYNAMIC)},), {'attention_mask': {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC), 3: DimHint(DYNAMIC)}, 'cache_position': {0: DimHint(DYNAMIC)}, 'past_key_values': [{0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}], 'position_embeddings': ({1: DimHint(DYNAMIC)}, {1: DimHint(DYNAMIC)}), 'position_ids': {1: DimHint(DYNAMIC)}, 'use_cache': None})
>>> self_attn: Phi3Attention
DS=((), {'attention_mask': {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC), 3: DimHint(DYNAMIC)}, 'cache_position': {0: DimHint(DYNAMIC)}, 'hidden_states': {0: DimHint(DYNAMIC), 1: DimHint(DYNAMIC)}, 'past_key_values': [{0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}], 'position_embeddings': ({1: DimHint(DYNAMIC)}, {1: DimHint(DYNAMIC)}), 'position_ids': {1: DimHint(DYNAMIC)}, 'use_cache': None})
>>> o_proj: Linear: DS=(({0: DimHint(DYNAMIC), 1: DimHint(DYNAMIC)},), {}) <<<
>>> qkv_proj: Linear: DS=(({0: DimHint(DYNAMIC), 1: DimHint(DYNAMIC)},), {}) <<<
<<<
>>> mlp: Phi3MLP
DS=(({0: DimHint(DYNAMIC), 1: DimHint(DYNAMIC)},), {})
>>> gate_up_proj: Linear: DS=(({0: DimHint(DYNAMIC), 1: DimHint(DYNAMIC)},), {}) <<<
>>> down_proj: Linear: DS=(({0: DimHint(DYNAMIC), 1: DimHint(DYNAMIC)},), {}) <<<
>>> activation_fn: SiLUActivation: DS=(({0: DimHint(DYNAMIC), 1: DimHint(DYNAMIC)},), {}) <<<
<<<
>>> input_layernorm: Phi3RMSNorm: DS=(({0: DimHint(DYNAMIC), 1: DimHint(DYNAMIC)},), {}) <<<
>>> post_attention_layernorm: Phi3RMSNorm: DS=(({0: DimHint(DYNAMIC), 1: DimHint(DYNAMIC)},), {}) <<<
>>> resid_attn_dropout: Dropout: DS=(({0: DimHint(DYNAMIC), 1: DimHint(DYNAMIC)},), {}) <<<
>>> resid_mlp_dropout: Dropout: DS=(({0: DimHint(DYNAMIC), 1: DimHint(DYNAMIC)},), {}) <<<
<<<
>>> layers[1]: Phi3DecoderLayer
DS=(({0: DimHint(DYNAMIC), 1: DimHint(DYNAMIC)},), {'attention_mask': {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC), 3: DimHint(DYNAMIC)}, 'cache_position': {0: DimHint(DYNAMIC)}, 'past_key_values': [{0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}], 'position_embeddings': ({1: DimHint(DYNAMIC)}, {1: DimHint(DYNAMIC)}), 'position_ids': {1: DimHint(DYNAMIC)}, 'use_cache': None})
>>> self_attn: Phi3Attention
DS=((), {'attention_mask': {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC), 3: DimHint(DYNAMIC)}, 'cache_position': {0: DimHint(DYNAMIC)}, 'hidden_states': {0: DimHint(DYNAMIC), 1: DimHint(DYNAMIC)}, 'past_key_values': [{0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}], 'position_embeddings': ({1: DimHint(DYNAMIC)}, {1: DimHint(DYNAMIC)}), 'position_ids': {1: DimHint(DYNAMIC)}, 'use_cache': None})
>>> o_proj: Linear: DS=(({0: DimHint(DYNAMIC), 1: DimHint(DYNAMIC)},), {}) <<<
>>> qkv_proj: Linear: DS=(({0: DimHint(DYNAMIC), 1: DimHint(DYNAMIC)},), {}) <<<
<<<
>>> mlp: Phi3MLP
DS=(({0: DimHint(DYNAMIC), 1: DimHint(DYNAMIC)},), {})
>>> gate_up_proj: Linear: DS=(({0: DimHint(DYNAMIC), 1: DimHint(DYNAMIC)},), {}) <<<
>>> down_proj: Linear: DS=(({0: DimHint(DYNAMIC), 1: DimHint(DYNAMIC)},), {}) <<<
>>> activation_fn: SiLUActivation: DS=(({0: DimHint(DYNAMIC), 1: DimHint(DYNAMIC)},), {}) <<<
<<<
>>> input_layernorm: Phi3RMSNorm: DS=(({0: DimHint(DYNAMIC), 1: DimHint(DYNAMIC)},), {}) <<<
>>> post_attention_layernorm: Phi3RMSNorm: DS=(({0: DimHint(DYNAMIC), 1: DimHint(DYNAMIC)},), {}) <<<
>>> resid_attn_dropout: Dropout: DS=(({0: DimHint(DYNAMIC), 1: DimHint(DYNAMIC)},), {}) <<<
>>> resid_mlp_dropout: Dropout: DS=(({0: DimHint(DYNAMIC), 1: DimHint(DYNAMIC)},), {}) <<<
<<<
>>> norm: Phi3RMSNorm: DS=(({0: DimHint(DYNAMIC), 1: DimHint(DYNAMIC)},), {}) <<<
>>> rotary_emb: Phi3RotaryEmbedding: DS=(({0: DimHint(DYNAMIC), 1: DimHint(DYNAMIC)},), {'position_ids': {1: DimHint(DYNAMIC)}}) <<<
<<<
>>> lm_head: Linear: DS=(({0: DimHint(DYNAMIC), 1: DimHint(DYNAMIC)},), {}) <<<
<<<
Evaluate the export¶
In many cases, the export (to torch.fx.Graph, to ONNX)
does not work on the first try. We need a way to understand
how much of the model can be exported. It can be used to evaluate
how much code needs to be rewritten or patched to be exportable.
The verbosity can be increased to show the dynamic shapes and
the measured discrepancies.
Let’s display the module and its submodules first.
print(
diag.pretty_text(
with_dynamic_shape=False,
with_shape=False,
with_min_max=False,
with_device=False,
with_inputs=False,
)
)
>>> __main__: Phi3ForCausalLM
>>> model: Phi3Model
>>> embed_tokens: Embedding <<<
>>> layers[0]: Phi3DecoderLayer
>>> self_attn: Phi3Attention
>>> o_proj: Linear <<<
>>> qkv_proj: Linear <<<
<<<
>>> mlp: Phi3MLP
>>> gate_up_proj: Linear <<<
>>> down_proj: Linear <<<
>>> activation_fn: SiLUActivation <<<
<<<
>>> input_layernorm: Phi3RMSNorm <<<
>>> post_attention_layernorm: Phi3RMSNorm <<<
>>> resid_attn_dropout: Dropout <<<
>>> resid_mlp_dropout: Dropout <<<
<<<
>>> layers[1]: Phi3DecoderLayer
>>> self_attn: Phi3Attention
>>> o_proj: Linear <<<
>>> qkv_proj: Linear <<<
<<<
>>> mlp: Phi3MLP
>>> gate_up_proj: Linear <<<
>>> down_proj: Linear <<<
>>> activation_fn: SiLUActivation <<<
<<<
>>> input_layernorm: Phi3RMSNorm <<<
>>> post_attention_layernorm: Phi3RMSNorm <<<
>>> resid_attn_dropout: Dropout <<<
>>> resid_mlp_dropout: Dropout <<<
<<<
>>> norm: Phi3RMSNorm <<<
>>> rotary_emb: Phi3RotaryEmbedding <<<
<<<
>>> lm_head: Linear <<<
<<<
Then we try to export to see which submodules make the export of the whole model fail. We can pickle the failing model and restore it to speed up the refactoring needed to make it work.
print("----------------------")
ep = diag.try_export(
exporter="fx",
use_dynamic_shapes=True,
exporter_kwargs=dict(strict=False),
verbose=1,
)
----------------------
[torch_export] export starts with backed_size_oblivious=False
[try_export-FX] M:__main__-Phi3ForCausalLM --- FAIL, step=EXPORT, reason=Cannot associate shape [{0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}] specified at `dynamic_shapes['past_key_values']` to non-tensor type <class 'transformers.cache_utils.DynamicCache'> at `inputs['past_key_values']` (expected None) --- For more information about this error, see: https://pytorch.org/docs/main/generated/exportdb/index.html#dynamic-shapes-validation --- --- The error above occurred when calling torch.export.export. If you would like to view some more information about this error, and get a list of all other errors that may occur in your export call, you can replace your `export()` call with `draft_export()`.['Traceback (most recent call last):\n', ' File "~/github/experimental-experiment/experimental_experiment/torch_interpreter/piece_by_piece.py", line 1573, in _try_export_no_bypass_export\n ep = torch_export(\n ^^^^^^^^^^^^^\n', ' File "~/github/experimental-experiment/experimental_experiment/export_helpers.py", line 164, in torch_export\n return torch.export.export(\n ^^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/__init__.py", line 311, in export\n raise e\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/__init__.py", line 277, in export\n return _export(\n ^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/_trace.py", line 1272, in wrapper\n raise e\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/_trace.py", line 1238, in wrapper\n ep = fn(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/exported_program.py", line 124, in wrapper\n return fn(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/_trace.py", line 2379, in _export\n ep = 
_export_for_training(\n ^^^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/_trace.py", line 1272, in wrapper\n raise e\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/_trace.py", line 1238, in wrapper\n ep = fn(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/exported_program.py", line 124, in wrapper\n return fn(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/_trace.py", line 2187, in _export_for_training\n export_artifact = export_func(\n ^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/_trace.py", line 2071, in _non_strict_export\n ) = make_fake_inputs(\n ^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/_export/non_strict_utils.py", line 414, in make_fake_inputs\n _check_dynamic_shapes(combined_args, dynamic_shapes)\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/dynamic_shapes.py", line 1049, in _check_dynamic_shapes\n _tree_map_with_path(check_shape, combined_args, dynamic_shapes, tree_name="inputs")\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/dynamic_shapes.py", line 630, in _tree_map_with_path\n return tree_map_with_path(f, tree, *dynamic_shapes, is_leaf=is_leaf)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/utils/_pytree.py", line 2205, in tree_map_with_path\n return treespec.unflatten(func(*xs) for xs in zip(*all_keypath_leaves, strict=True))\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/utils/_pytree.py", line 1278, in unflatten\n leaves = list(leaves)\n ^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/utils/_pytree.py", line 2205, in <genexpr>\n return treespec.unflatten(func(*xs) for xs in 
zip(*all_keypath_leaves, strict=True))\n ^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/dynamic_shapes.py", line 627, in f\n return func(path, t, *dynamic_shapes)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/dynamic_shapes.py", line 1042, in check_shape\n raise UserError(\n', "torch._dynamo.exc.UserError: Cannot associate shape [{0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}] specified at `dynamic_shapes['past_key_values']` to non-tensor type <class 'transformers.cache_utils.DynamicCache'> at `inputs['past_key_values']` (expected None)\nFor more information about this error, see: https://pytorch.org/docs/main/generated/exportdb/index.html#dynamic-shapes-validation\n\nThe error above occurred when calling torch.export.export. If you would like to view some more information about this error, and get a list of all other errors that may occur in your export call, you can replace your `export()` call with `draft_export()`.\n"]
[torch_export] export starts with backed_size_oblivious=False
[try_export-FX] .. M:model-Phi3Model --- FAIL, step=EXPORT, reason=Cannot associate shape [{0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}] specified at `dynamic_shapes['past_key_values']` to non-tensor type <class 'transformers.cache_utils.DynamicCache'> at `inputs['past_key_values']` (expected None) --- For more information about this error, see: https://pytorch.org/docs/main/generated/exportdb/index.html#dynamic-shapes-validation --- --- The error above occurred when calling torch.export.export. If you would like to view some more information about this error, and get a list of all other errors that may occur in your export call, you can replace your `export()` call with `draft_export()`.['Traceback (most recent call last):\n', ' File "~/github/experimental-experiment/experimental_experiment/torch_interpreter/piece_by_piece.py", line 1573, in _try_export_no_bypass_export\n ep = torch_export(\n ^^^^^^^^^^^^^\n', ' File "~/github/experimental-experiment/experimental_experiment/export_helpers.py", line 164, in torch_export\n return torch.export.export(\n ^^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/__init__.py", line 311, in export\n raise e\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/__init__.py", line 277, in export\n return _export(\n ^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/_trace.py", line 1272, in wrapper\n raise e\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/_trace.py", line 1238, in wrapper\n ep = fn(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/exported_program.py", line 124, in wrapper\n return fn(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/_trace.py", line 2379, in _export\n ep = 
_export_for_training(\n ^^^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/_trace.py", line 1272, in wrapper\n raise e\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/_trace.py", line 1238, in wrapper\n ep = fn(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/exported_program.py", line 124, in wrapper\n return fn(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/_trace.py", line 2187, in _export_for_training\n export_artifact = export_func(\n ^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/_trace.py", line 2071, in _non_strict_export\n ) = make_fake_inputs(\n ^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/_export/non_strict_utils.py", line 414, in make_fake_inputs\n _check_dynamic_shapes(combined_args, dynamic_shapes)\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/dynamic_shapes.py", line 1049, in _check_dynamic_shapes\n _tree_map_with_path(check_shape, combined_args, dynamic_shapes, tree_name="inputs")\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/dynamic_shapes.py", line 630, in _tree_map_with_path\n return tree_map_with_path(f, tree, *dynamic_shapes, is_leaf=is_leaf)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/utils/_pytree.py", line 2205, in tree_map_with_path\n return treespec.unflatten(func(*xs) for xs in zip(*all_keypath_leaves, strict=True))\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/utils/_pytree.py", line 1278, in unflatten\n leaves = list(leaves)\n ^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/utils/_pytree.py", line 2205, in <genexpr>\n return treespec.unflatten(func(*xs) for xs in 
zip(*all_keypath_leaves, strict=True))\n ^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/dynamic_shapes.py", line 627, in f\n return func(path, t, *dynamic_shapes)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/dynamic_shapes.py", line 1042, in check_shape\n raise UserError(\n', "torch._dynamo.exc.UserError: Cannot associate shape [{0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}] specified at `dynamic_shapes['past_key_values']` to non-tensor type <class 'transformers.cache_utils.DynamicCache'> at `inputs['past_key_values']` (expected None)\nFor more information about this error, see: https://pytorch.org/docs/main/generated/exportdb/index.html#dynamic-shapes-validation\n\nThe error above occurred when calling torch.export.export. If you would like to view some more information about this error, and get a list of all other errors that may occur in your export call, you can replace your `export()` call with `draft_export()`.\n"]
[torch_export] export starts with backed_size_oblivious=False
[try_export-FX] .... M:embed_tokens-Embedding --- OK:
[torch_export] export starts with backed_size_oblivious=False
[try_export-FX] .... M:layers[0]-Phi3DecoderLayer --- FAIL, step=EXPORT, reason=Cannot associate shape [{0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}] specified at `dynamic_shapes['past_key_values']` to non-tensor type <class 'transformers.cache_utils.DynamicCache'> at `inputs['past_key_values']` (expected None) --- For more information about this error, see: https://pytorch.org/docs/main/generated/exportdb/index.html#dynamic-shapes-validation --- --- The error above occurred when calling torch.export.export. If you would like to view some more information about this error, and get a list of all other errors that may occur in your export call, you can replace your `export()` call with `draft_export()`.['Traceback (most recent call last):\n', ' File "~/github/experimental-experiment/experimental_experiment/torch_interpreter/piece_by_piece.py", line 1573, in _try_export_no_bypass_export\n ep = torch_export(\n ^^^^^^^^^^^^^\n', ' File "~/github/experimental-experiment/experimental_experiment/export_helpers.py", line 164, in torch_export\n return torch.export.export(\n ^^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/__init__.py", line 311, in export\n raise e\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/__init__.py", line 277, in export\n return _export(\n ^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/_trace.py", line 1272, in wrapper\n raise e\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/_trace.py", line 1238, in wrapper\n ep = fn(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/exported_program.py", line 124, in wrapper\n return fn(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/_trace.py", line 2379, in _export\n 
ep = _export_for_training(\n ^^^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/_trace.py", line 1272, in wrapper\n raise e\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/_trace.py", line 1238, in wrapper\n ep = fn(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/exported_program.py", line 124, in wrapper\n return fn(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/_trace.py", line 2187, in _export_for_training\n export_artifact = export_func(\n ^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/_trace.py", line 2071, in _non_strict_export\n ) = make_fake_inputs(\n ^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/_export/non_strict_utils.py", line 414, in make_fake_inputs\n _check_dynamic_shapes(combined_args, dynamic_shapes)\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/dynamic_shapes.py", line 1049, in _check_dynamic_shapes\n _tree_map_with_path(check_shape, combined_args, dynamic_shapes, tree_name="inputs")\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/dynamic_shapes.py", line 630, in _tree_map_with_path\n return tree_map_with_path(f, tree, *dynamic_shapes, is_leaf=is_leaf)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/utils/_pytree.py", line 2205, in tree_map_with_path\n return treespec.unflatten(func(*xs) for xs in zip(*all_keypath_leaves, strict=True))\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/utils/_pytree.py", line 1278, in unflatten\n leaves = list(leaves)\n ^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/utils/_pytree.py", line 2205, in <genexpr>\n return treespec.unflatten(func(*xs) for xs 
in zip(*all_keypath_leaves, strict=True))\n ^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/dynamic_shapes.py", line 627, in f\n return func(path, t, *dynamic_shapes)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/dynamic_shapes.py", line 1042, in check_shape\n raise UserError(\n', "torch._dynamo.exc.UserError: Cannot associate shape [{0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}] specified at `dynamic_shapes['past_key_values']` to non-tensor type <class 'transformers.cache_utils.DynamicCache'> at `inputs['past_key_values']` (expected None)\nFor more information about this error, see: https://pytorch.org/docs/main/generated/exportdb/index.html#dynamic-shapes-validation\n\nThe error above occurred when calling torch.export.export. If you would like to view some more information about this error, and get a list of all other errors that may occur in your export call, you can replace your `export()` call with `draft_export()`.\n"]
[torch_export] export starts with backed_size_oblivious=False
[try_export-FX] ...... M:self_attn-Phi3Attention --- FAIL, step=EXPORT, reason=Cannot associate shape [{0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}] specified at `dynamic_shapes['past_key_values']` to non-tensor type <class 'transformers.cache_utils.DynamicCache'> at `inputs['past_key_values']` (expected None) --- For more information about this error, see: https://pytorch.org/docs/main/generated/exportdb/index.html#dynamic-shapes-validation --- --- The error above occurred when calling torch.export.export. If you would like to view some more information about this error, and get a list of all other errors that may occur in your export call, you can replace your `export()` call with `draft_export()`.['Traceback (most recent call last):\n', ' File "~/github/experimental-experiment/experimental_experiment/torch_interpreter/piece_by_piece.py", line 1573, in _try_export_no_bypass_export\n ep = torch_export(\n ^^^^^^^^^^^^^\n', ' File "~/github/experimental-experiment/experimental_experiment/export_helpers.py", line 164, in torch_export\n return torch.export.export(\n ^^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/__init__.py", line 311, in export\n raise e\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/__init__.py", line 277, in export\n return _export(\n ^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/_trace.py", line 1272, in wrapper\n raise e\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/_trace.py", line 1238, in wrapper\n ep = fn(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/exported_program.py", line 124, in wrapper\n return fn(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/_trace.py", line 2379, in _export\n ep 
= _export_for_training(\n ^^^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/_trace.py", line 1272, in wrapper\n raise e\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/_trace.py", line 1238, in wrapper\n ep = fn(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/exported_program.py", line 124, in wrapper\n return fn(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/_trace.py", line 2187, in _export_for_training\n export_artifact = export_func(\n ^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/_trace.py", line 2071, in _non_strict_export\n ) = make_fake_inputs(\n ^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/_export/non_strict_utils.py", line 414, in make_fake_inputs\n _check_dynamic_shapes(combined_args, dynamic_shapes)\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/dynamic_shapes.py", line 1049, in _check_dynamic_shapes\n _tree_map_with_path(check_shape, combined_args, dynamic_shapes, tree_name="inputs")\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/dynamic_shapes.py", line 630, in _tree_map_with_path\n return tree_map_with_path(f, tree, *dynamic_shapes, is_leaf=is_leaf)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/utils/_pytree.py", line 2205, in tree_map_with_path\n return treespec.unflatten(func(*xs) for xs in zip(*all_keypath_leaves, strict=True))\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/utils/_pytree.py", line 1278, in unflatten\n leaves = list(leaves)\n ^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/utils/_pytree.py", line 2205, in <genexpr>\n return treespec.unflatten(func(*xs) for xs in 
zip(*all_keypath_leaves, strict=True))\n ^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/dynamic_shapes.py", line 627, in f\n return func(path, t, *dynamic_shapes)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/dynamic_shapes.py", line 1042, in check_shape\n raise UserError(\n', "torch._dynamo.exc.UserError: Cannot associate shape [{0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}] specified at `dynamic_shapes['past_key_values']` to non-tensor type <class 'transformers.cache_utils.DynamicCache'> at `inputs['past_key_values']` (expected None)\nFor more information about this error, see: https://pytorch.org/docs/main/generated/exportdb/index.html#dynamic-shapes-validation\n\nThe error above occurred when calling torch.export.export. If you would like to view some more information about this error, and get a list of all other errors that may occur in your export call, you can replace your `export()` call with `draft_export()`.\n"]
[torch_export] export starts with backed_size_oblivious=False
[try_export-FX] ........ M:o_proj-Linear --- OK:
[torch_export] export starts with backed_size_oblivious=False
[try_export-FX] ........ M:qkv_proj-Linear --- OK:
[torch_export] export starts with backed_size_oblivious=False
[try_export-FX] ...... M:mlp-Phi3MLP --- OK:
[torch_export] export starts with backed_size_oblivious=False
[try_export-FX] ...... M:input_layernorm-Phi3RMSNorm --- OK:
[torch_export] export starts with backed_size_oblivious=False
[try_export-FX] ...... M:post_attention_layernorm-Phi3RMSNorm --- OK:
[torch_export] export starts with backed_size_oblivious=False
[try_export-FX] ...... M:resid_attn_dropout-Dropout --- OK:
[torch_export] export starts with backed_size_oblivious=False
[try_export-FX] ...... M:resid_mlp_dropout-Dropout --- OK:
[torch_export] export starts with backed_size_oblivious=False
[try_export-FX] .... M:layers[1]-Phi3DecoderLayer --- FAIL, step=EXPORT, reason=Cannot associate shape [{0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}] specified at `dynamic_shapes['past_key_values']` to non-tensor type <class 'transformers.cache_utils.DynamicCache'> at `inputs['past_key_values']` (expected None) --- For more information about this error, see: https://pytorch.org/docs/main/generated/exportdb/index.html#dynamic-shapes-validation --- --- The error above occurred when calling torch.export.export. If you would like to view some more information about this error, and get a list of all other errors that may occur in your export call, you can replace your `export()` call with `draft_export()`.['Traceback (most recent call last):\n', ' File "~/github/experimental-experiment/experimental_experiment/torch_interpreter/piece_by_piece.py", line 1573, in _try_export_no_bypass_export\n ep = torch_export(\n ^^^^^^^^^^^^^\n', ' File "~/github/experimental-experiment/experimental_experiment/export_helpers.py", line 164, in torch_export\n return torch.export.export(\n ^^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/__init__.py", line 311, in export\n raise e\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/__init__.py", line 277, in export\n return _export(\n ^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/_trace.py", line 1272, in wrapper\n raise e\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/_trace.py", line 1238, in wrapper\n ep = fn(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/exported_program.py", line 124, in wrapper\n return fn(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/_trace.py", line 2379, in _export\n 
ep = _export_for_training(\n ^^^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/_trace.py", line 1272, in wrapper\n raise e\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/_trace.py", line 1238, in wrapper\n ep = fn(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/exported_program.py", line 124, in wrapper\n return fn(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/_trace.py", line 2187, in _export_for_training\n export_artifact = export_func(\n ^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/_trace.py", line 2071, in _non_strict_export\n ) = make_fake_inputs(\n ^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/_export/non_strict_utils.py", line 414, in make_fake_inputs\n _check_dynamic_shapes(combined_args, dynamic_shapes)\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/dynamic_shapes.py", line 1049, in _check_dynamic_shapes\n _tree_map_with_path(check_shape, combined_args, dynamic_shapes, tree_name="inputs")\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/dynamic_shapes.py", line 630, in _tree_map_with_path\n return tree_map_with_path(f, tree, *dynamic_shapes, is_leaf=is_leaf)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/utils/_pytree.py", line 2205, in tree_map_with_path\n return treespec.unflatten(func(*xs) for xs in zip(*all_keypath_leaves, strict=True))\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/utils/_pytree.py", line 1278, in unflatten\n leaves = list(leaves)\n ^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/utils/_pytree.py", line 2205, in <genexpr>\n return treespec.unflatten(func(*xs) for xs 
in zip(*all_keypath_leaves, strict=True))\n ^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/dynamic_shapes.py", line 627, in f\n return func(path, t, *dynamic_shapes)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/dynamic_shapes.py", line 1042, in check_shape\n raise UserError(\n', "torch._dynamo.exc.UserError: Cannot associate shape [{0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}] specified at `dynamic_shapes['past_key_values']` to non-tensor type <class 'transformers.cache_utils.DynamicCache'> at `inputs['past_key_values']` (expected None)\nFor more information about this error, see: https://pytorch.org/docs/main/generated/exportdb/index.html#dynamic-shapes-validation\n\nThe error above occurred when calling torch.export.export. If you would like to view some more information about this error, and get a list of all other errors that may occur in your export call, you can replace your `export()` call with `draft_export()`.\n"]
[torch_export] export starts with backed_size_oblivious=False
[try_export-FX] ...... M:self_attn-Phi3Attention --- FAIL, step=EXPORT, reason=Cannot associate shape [{0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}] specified at `dynamic_shapes['past_key_values']` to non-tensor type <class 'transformers.cache_utils.DynamicCache'> at `inputs['past_key_values']` (expected None) --- For more information about this error, see: https://pytorch.org/docs/main/generated/exportdb/index.html#dynamic-shapes-validation --- --- The error above occurred when calling torch.export.export. If you would like to view some more information about this error, and get a list of all other errors that may occur in your export call, you can replace your `export()` call with `draft_export()`.['Traceback (most recent call last):\n', ' File "~/github/experimental-experiment/experimental_experiment/torch_interpreter/piece_by_piece.py", line 1573, in _try_export_no_bypass_export\n ep = torch_export(\n ^^^^^^^^^^^^^\n', ' File "~/github/experimental-experiment/experimental_experiment/export_helpers.py", line 164, in torch_export\n return torch.export.export(\n ^^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/__init__.py", line 311, in export\n raise e\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/__init__.py", line 277, in export\n return _export(\n ^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/_trace.py", line 1272, in wrapper\n raise e\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/_trace.py", line 1238, in wrapper\n ep = fn(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/exported_program.py", line 124, in wrapper\n return fn(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/_trace.py", line 2379, in _export\n ep 
= _export_for_training(\n ^^^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/_trace.py", line 1272, in wrapper\n raise e\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/_trace.py", line 1238, in wrapper\n ep = fn(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/exported_program.py", line 124, in wrapper\n return fn(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/_trace.py", line 2187, in _export_for_training\n export_artifact = export_func(\n ^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/_trace.py", line 2071, in _non_strict_export\n ) = make_fake_inputs(\n ^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/_export/non_strict_utils.py", line 414, in make_fake_inputs\n _check_dynamic_shapes(combined_args, dynamic_shapes)\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/dynamic_shapes.py", line 1049, in _check_dynamic_shapes\n _tree_map_with_path(check_shape, combined_args, dynamic_shapes, tree_name="inputs")\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/dynamic_shapes.py", line 630, in _tree_map_with_path\n return tree_map_with_path(f, tree, *dynamic_shapes, is_leaf=is_leaf)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/utils/_pytree.py", line 2205, in tree_map_with_path\n return treespec.unflatten(func(*xs) for xs in zip(*all_keypath_leaves, strict=True))\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/utils/_pytree.py", line 1278, in unflatten\n leaves = list(leaves)\n ^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/utils/_pytree.py", line 2205, in <genexpr>\n return treespec.unflatten(func(*xs) for xs in 
zip(*all_keypath_leaves, strict=True))\n ^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/dynamic_shapes.py", line 627, in f\n return func(path, t, *dynamic_shapes)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/dynamic_shapes.py", line 1042, in check_shape\n raise UserError(\n', "torch._dynamo.exc.UserError: Cannot associate shape [{0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}] specified at `dynamic_shapes['past_key_values']` to non-tensor type <class 'transformers.cache_utils.DynamicCache'> at `inputs['past_key_values']` (expected None)\nFor more information about this error, see: https://pytorch.org/docs/main/generated/exportdb/index.html#dynamic-shapes-validation\n\nThe error above occurred when calling torch.export.export. If you would like to view some more information about this error, and get a list of all other errors that may occur in your export call, you can replace your `export()` call with `draft_export()`.\n"]
[torch_export] export starts with backed_size_oblivious=False
[try_export-FX] ........ M:o_proj-Linear --- OK:
[torch_export] export starts with backed_size_oblivious=False
[try_export-FX] ........ M:qkv_proj-Linear --- OK:
[torch_export] export starts with backed_size_oblivious=False
[try_export-FX] ...... M:mlp-Phi3MLP --- OK:
[torch_export] export starts with backed_size_oblivious=False
[try_export-FX] ...... M:input_layernorm-Phi3RMSNorm --- OK:
[torch_export] export starts with backed_size_oblivious=False
[try_export-FX] ...... M:post_attention_layernorm-Phi3RMSNorm --- OK:
[torch_export] export starts with backed_size_oblivious=False
[try_export-FX] ...... M:resid_attn_dropout-Dropout --- OK:
[torch_export] export starts with backed_size_oblivious=False
[try_export-FX] ...... M:resid_mlp_dropout-Dropout --- OK:
[torch_export] export starts with backed_size_oblivious=False
[try_export-FX] .... M:norm-Phi3RMSNorm --- OK:
[torch_export] export starts with backed_size_oblivious=False
def forward(self, arg0_1: "f32[48]", arg1_1: "f32[s77, s27, 3072]", arg2_1: "i64[1, s9]"):
# No stacktrace found for following nodes
_set_grad_enabled = torch._C._set_grad_enabled(False); _set_grad_enabled = None
max_1: "i64[]" = torch.ops.aten.max.default(arg2_1); arg2_1 = None
add: "i64[]" = torch.ops.aten.add.Tensor(max_1, 1); max_1 = None
gt: "b8[]" = torch.ops.aten.gt.Scalar(add, 4096); add = None
ne: "b8[]" = torch.ops.aten.ne.Scalar(gt, 0); gt = None
item: "Sym(Eq(u0, 1))" = torch.ops.aten.item.default(ne); ne = item = None
_set_grad_enabled_1 = torch._C._set_grad_enabled(True); _set_grad_enabled_1 = None
def forward(self, arg0_1: "f32[48]", arg1_1: "f32[s77, s27, 3072]", arg2_1: "i64[1, s9]"):
# No stacktrace found for following nodes
_set_grad_enabled = torch._C._set_grad_enabled(False); _set_grad_enabled = None
max_1: "i64[]" = torch.ops.aten.max.default(arg2_1); arg2_1 = None
add: "i64[]" = torch.ops.aten.add.Tensor(max_1, 1); max_1 = None
gt: "b8[]" = torch.ops.aten.gt.Scalar(add, 4096); add = None
ne: "b8[]" = torch.ops.aten.ne.Scalar(gt, 0); gt = None
item: "Sym(Eq(u0, 1))" = torch.ops.aten.item.default(ne); ne = item = None
_set_grad_enabled_1 = torch._C._set_grad_enabled(True); _set_grad_enabled_1 = None
[try_export-FX] .... M:rotary_emb-Phi3RotaryEmbedding --- FAIL, step=EXPORT, reason=Could not guard on data-dependent expression Eq(u0, 1) (unhinted: Eq(u0, 1)). (Size-like symbols: none) --- --- consider using data-dependent friendly APIs such as guard_or_false, guard_or_true and statically_known_true. --- Caused by: (_export/non_strict_utils.py:1140 in __torch_function__) --- For more information, run with TORCH_LOGS="dynamic" --- For extended logs when we create symbols, also add TORCHDYNAMO_EXTENDED_DEBUG_CREATE_SYMBOL="u0" --- If you suspect the guard was triggered from C++, add TORCHDYNAMO_EXTENDED_DEBUG_CPP=1 --- For more debugging help, see https://docs.google.com/document/d/1HSuTTVvYH1pTew89Rtpeu84Ht3nQEFTYhAX3Ypa_xJs/edit?usp=sharing --- --- For C++ stack trace, run with TORCHDYNAMO_EXTENDED_DEBUG_CPP=1 --- --- The following call raised this error: --- File "~/github/transformers/src/transformers/modeling_rope_utils.py", line 61, in longrope_frequency_update --- if seq_len > original_max_position_embeddings: --- --- --- The error above occurred when calling torch.export.export. 
If you would like to view some more information about this error, and get a list of all other errors that may occur in your export call, you can replace your `export()` call with `draft_export()`.['Traceback (most recent call last):\n', ' File "~/github/experimental-experiment/experimental_experiment/torch_interpreter/piece_by_piece.py", line 1573, in _try_export_no_bypass_export\n ep = torch_export(\n ^^^^^^^^^^^^^\n', ' File "~/github/experimental-experiment/experimental_experiment/export_helpers.py", line 164, in torch_export\n return torch.export.export(\n ^^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/__init__.py", line 311, in export\n raise e\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/__init__.py", line 277, in export\n return _export(\n ^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/_trace.py", line 1272, in wrapper\n raise e\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/_trace.py", line 1238, in wrapper\n ep = fn(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/exported_program.py", line 124, in wrapper\n return fn(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/_trace.py", line 2379, in _export\n ep = _export_for_training(\n ^^^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/_trace.py", line 1272, in wrapper\n raise e\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/_trace.py", line 1238, in wrapper\n ep = fn(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/exported_program.py", line 124, in wrapper\n return fn(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/_trace.py", line 2187, in _export_for_training\n export_artifact = export_func(\n ^^^^^^^^^^^^\n', ' File 
"~/vv/this312/lib/python3.12/site-packages/torch/export/_trace.py", line 2118, in _non_strict_export\n aten_export_artifact = _to_aten_func(\n ^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/_trace.py", line 1906, in _export_to_aten_ir_make_fx\n gm, graph_signature = transform(_make_fx_helper)(\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/_trace.py", line 2036, in _aot_export_non_strict\n gm, sig = aot_export(stack, wrapped_mod, args, kwargs=kwargs, **flags)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/_trace.py", line 1818, in _make_fx_helper\n gm = make_fx(\n ^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/fx/experimental/proxy_tensor.py", line 2722, in wrapped\n return make_fx_tracer.trace(f, *args)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/fx/experimental/proxy_tensor.py", line 2629, in trace\n return self._trace_inner(f, *args)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/fx/experimental/proxy_tensor.py", line 2591, in _trace_inner\n t = dispatch_trace(\n ^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/_compile.py", line 54, in inner\n return disable_fn(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py", line 1193, in _fn\n return fn(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/fx/experimental/proxy_tensor.py", line 1461, in dispatch_trace\n graph = tracer.trace(root, concrete_args) # type: ignore[arg-type]\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/fx/experimental/proxy_tensor.py", line 2179, in trace\n res = super().trace(root, concrete_args)\n 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py", line 1193, in _fn\n return fn(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/fx/_symbolic_trace.py", line 879, in trace\n (self.create_arg(fn(*args)),),\n ^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/fx/experimental/proxy_tensor.py", line 1527, in wrapped\n out = f(*tensors) # type:ignore[call-arg]\n ^^^^^^^^^^^\n', ' File "<string>", line 1, in <lambda>\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/_trace.py", line 1705, in wrapped_fn\n return tuple(flat_fn(*args))\n ^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/_functorch/_aot_autograd/utils.py", line 193, in flat_fn\n tree_out = fn(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/_functorch/_aot_autograd/graph_capture_wrappers.py", line 1378, in functional_call\n out = mod(*args[params_len:], **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/fx/_symbolic_trace.py", line 853, in module_call_wrapper\n return self.call_module(mod, forward, args, kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/fx/experimental/proxy_tensor.py", line 2267, in call_module\n return Tracer.call_module(self, m, forward, args, kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/fx/_symbolic_trace.py", line 569, in call_module\n ret_val = forward(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/fx/_symbolic_trace.py", line 846, in forward\n return _orig_module_call(mod, *args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n', ' File 
"~/vv/this312/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1787, in _call_impl\n return forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/export/_trace.py", line 2020, in forward\n tree_out = mod(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/fx/_symbolic_trace.py", line 853, in module_call_wrapper\n return self.call_module(mod, forward, args, kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/fx/experimental/proxy_tensor.py", line 2267, in call_module\n return Tracer.call_module(self, m, forward, args, kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/fx/_symbolic_trace.py", line 569, in call_module\n ret_val = forward(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/fx/_symbolic_trace.py", line 846, in forward\n return _orig_module_call(mod, *args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1787, in _call_impl\n return forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/utils/_contextlib.py", line 124, in decorate_context\n return func(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^\n', ' File "~/github/transformers/src/transformers/modeling_rope_utils.py", line 124, in wrapper\n 
longrope_frequency_update(self, position_ids, device=x.device, **kwargs)\n', ' File "~/github/transformers/src/transformers/modeling_rope_utils.py", line 61, in longrope_frequency_update\n if seq_len > original_max_position_embeddings:\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/fx/experimental/proxy_tensor.py", line 1578, in __torch_function__\n return func(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/fx/experimental/proxy_tensor.py", line 1649, in __torch_function__\n return func(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/_export/non_strict_utils.py", line 1140, in __torch_function__\n return func(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/fx/experimental/sym_node.py", line 543, in guard_bool\n r = self.evaluate()\n ^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/fx/experimental/sym_node.py", line 517, in evaluate\n return self.shape_env.evaluate_sym_node(self, size_oblivious)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/fx/experimental/symbolic_shapes.py", line 7326, in evaluate_sym_node\n return self.evaluate_expr(\n ^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/fx/experimental/symbolic_shapes.py", line 7421, in evaluate_expr\n return self._inner_evaluate_expr(\n ^^^^^^^^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/fx/experimental/recording.py", line 273, in wrapper\n return retlog(fn(*args, **kwargs))\n ^^^^^^^^^^^^^^^^^^^\n', ' File "~/vv/this312/lib/python3.12/site-packages/torch/fx/experimental/symbolic_shapes.py", line 7444, in _inner_evaluate_expr\n return self._evaluate_expr(\n ^^^^^^^^^^^^^^^^^^^^\n', ' File 
"~/vv/this312/lib/python3.12/site-packages/torch/fx/experimental/symbolic_shapes.py", line 7663, in _evaluate_expr\n raise self._make_data_dependent_error(\n', 'torch.fx.experimental.symbolic_shapes.GuardOnDataDependentSymNode: Could not guard on data-dependent expression Eq(u0, 1) (unhinted: Eq(u0, 1)). (Size-like symbols: none)\n\nconsider using data-dependent friendly APIs such as guard_or_false, guard_or_true and statically_known_true.\nCaused by: (_export/non_strict_utils.py:1140 in __torch_function__)\nFor more information, run with TORCH_LOGS="dynamic"\nFor extended logs when we create symbols, also add TORCHDYNAMO_EXTENDED_DEBUG_CREATE_SYMBOL="u0"\nIf you suspect the guard was triggered from C++, add TORCHDYNAMO_EXTENDED_DEBUG_CPP=1\nFor more debugging help, see https://docs.google.com/document/d/1HSuTTVvYH1pTew89Rtpeu84Ht3nQEFTYhAX3Ypa_xJs/edit?usp=sharing\n\nFor C++ stack trace, run with TORCHDYNAMO_EXTENDED_DEBUG_CPP=1\n\nThe following call raised this error:\n File "~/github/transformers/src/transformers/modeling_rope_utils.py", line 61, in longrope_frequency_update\n if seq_len > original_max_position_embeddings:\n\n\nThe error above occurred when calling torch.export.export. If you would like to view some more information about this error, and get a list of all other errors that may occur in your export call, you can replace your `export()` call with `draft_export()`.\n']
[try_export-FX] .... M:rotary_emb-Phi3RotaryEmbedding --- FAIL: Could not guard on data-depend...
[torch_export] export starts with backed_size_oblivious=False
[try_export-FX] .. M:lm_head-Linear --- OK:
Let’s display a report.
print(f"success: {ep.status}")
print(diag.get_export_report())
success: 2
__main__ Phi3ForCausalLM FAIL -- step=EXPORT, reason='Cannot associate shape [{0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint...'
..model Phi3Model FAIL -- step=EXPORT, reason='Cannot associate shape [{0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint...'
....embed_tokens Embedding OK -- ExportedProgram
....layers[0] Phi3DecoderLayer FAIL -- step=EXPORT, reason='Cannot associate shape [{0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint...'
......self_attn Phi3Attention FAIL -- step=EXPORT, reason='Cannot associate shape [{0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint...'
........o_proj Linear OK -- ExportedProgram
........qkv_proj Linear OK -- ExportedProgram
......mlp Phi3MLP OK -- ExportedProgram
........gate_up_proj Linear <OK-2i-0>
........down_proj Linear <OK-2i-0>
........activation_fn SiLUActivation <OK-2i-0>
......input_layernorm Phi3RMSNorm OK -- ExportedProgram
......post_attention_layernorm Phi3RMSNorm OK -- ExportedProgram
......resid_attn_dropout Dropout OK -- ExportedProgram
......resid_mlp_dropout Dropout OK -- ExportedProgram
....layers[1] Phi3DecoderLayer FAIL -- step=EXPORT, reason='Cannot associate shape [{0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint...'
......self_attn Phi3Attention FAIL -- step=EXPORT, reason='Cannot associate shape [{0: DimHint(DYNAMIC), 2: DimHint(DYNAMIC)}, {0: DimHint(DYNAMIC), 2: DimHint...'
........o_proj Linear OK -- ExportedProgram
........qkv_proj Linear OK -- ExportedProgram
......mlp Phi3MLP OK -- ExportedProgram
........gate_up_proj Linear <OK-2i-0>
........down_proj Linear <OK-2i-0>
........activation_fn SiLUActivation <OK-2i-0>
......input_layernorm Phi3RMSNorm OK -- ExportedProgram
......post_attention_layernorm Phi3RMSNorm OK -- ExportedProgram
......resid_attn_dropout Dropout OK -- ExportedProgram
......resid_mlp_dropout Dropout OK -- ExportedProgram
....norm Phi3RMSNorm OK -- ExportedProgram
....rotary_emb Phi3RotaryEmbedding FAIL -- step=EXPORT, reason='Could not guard on data-dependent expression Eq(u0, 1) (unhinted: Eq(u0, 1)). (Size-like symbols: n...'
..lm_head Linear OK -- ExportedProgram
Replace the failing module by a custom op¶
The main module is not exportable because one of its pieces cannot be exported. But if we assume that piece works, everything else may well export correctly. So let’s try to replace the failing class with a custom op. This will be the subject of another example.
Total running time of the script: (0 minutes 5.387 seconds)
Related examples
Export Phi-3.5-mini-instruct with report_exportability