-m onnx_diagnostic validate … validate a model id

Description

This command line validates a model id, typically (but not necessarily) one available on HuggingFace. It creates dummy inputs, runs the model on them, exports the model, and measures the discrepancies…

    usage: test [-h] [-m MID] [-t TASK] [-e EXPORT] [--opt OPT] [-r | --run | --no-run] [-q | --quiet | --no-quiet] [-p | --patch | --no-patch] [--trained | --no-trained] [-o DUMP_FOLDER]
                [-v VERBOSE] [--dtype DTYPE] [--device DEVICE]
    
    Prints out dummy inputs for a particular task or a model id. If both mid and task are empty, the command line displays the list of supported tasks.
    
    options:
      -h, --help            show this help message and exit
      -m MID, --mid MID     model id, usually <author>/<name>
      -t TASK, --task TASK  force the task to use
      -e EXPORT, --export EXPORT
                            export the model with this exporter
      --opt OPT             optimization to apply after the export
      -r, --run, --no-run   runs the model to check it runs
      -q, --quiet, --no-quiet
                            catches exception, report them in the summary
      -p, --patch, --no-patch
                            applies patches before exporting
      --trained, --no-trained
                            validate the trained model (requires downloading)
      -o DUMP_FOLDER, --dump-folder DUMP_FOLDER
                            if not empty, a folder is created to dumps statistics, exported program, onnx...
      -v VERBOSE, --verbose VERBOSE
                            verbosity
      --dtype DTYPE         changes dtype if necessary
      --device DEVICE       changes the device if necessary
    
    If the model id is specified, one untrained version of it is instantiated.

Get the list of supported tasks

The tasks are the same as those defined by HuggingFace. The tool only supports a subset of them.

python -m onnx_diagnostic validate
    -- list of supported tasks:
    image-classification
    image-text-to-text
    text-generation
    text2text-generation

Get the default inputs for a specific task

This returns the dummy inputs for a specific task. The task may define more inputs than a given model needs; only those defined by the model's forward method are kept.

python -m onnx_diagnostic validate -t text-generation
    -- inputs
      + input_ids       : T7s2x3
      + attention_mask  : T7s2x33
      + position_ids    : T7s2x3
      + past_key_values : DynamicCache(key_cache=#4[T1s2x24x30x16,T1s2x24x30x16,T1s2x24x30x16,T1s2x24x30x16], value_cache=#4[T1s2x24x30x16,T1s2x24x30x16,T1s2x24x30x16,T1s2x24x30x16])
    -- dynamic_shapes
      + input_ids       : {0: batch, 1: seq_length}
      + attention_mask  : {0: batch, 1: DYNAMIC}
      + position_ids    : {0: batch, 1: DYNAMIC}
      + past_key_values : [[{0: batch, 2: cache_length}, {0: batch, 2: cache_length}, {0: batch, 2: cache_length}, {0: batch, 2: cache_length}], [{0: batch, 2: cache_length}, {0: batch, 2: cache_length}, {0: batch, 2: cache_length}, {0: batch, 2: cache_length}]]

Validate dummy inputs for a model

The dummy inputs may not work for this model and this task. The following command line checks that. There is no point in exporting the model if this step fails.

python -m onnx_diagnostic validate -m arnir0/Tiny-LLM --run -v 1
    [validate_model] validate model id 'arnir0/Tiny-LLM'
    [validate_model] get dummy inputs...
    [get_untrained_model_with_inputs] model_id='arnir0/Tiny-LLM'
    [get_untrained_model_with_inputs] architecture='LlamaForCausalLM'
    [get_untrained_model_with_inputs] cls='LlamaConfig'
    [get_untrained_model_with_inputs] task='text-generation'
    [validate_model] task=text-generation
    [validate_model] size=49.549072265625 Mb
    [validate_model] n_weights=12.988992 millions parameters
    [validate_model] +INPUT input_ids=T7s2x3
    [validate_model] +INPUT attention_mask=T7s2x33
    [validate_model] +INPUT position_ids=T7s2x3
    [validate_model] +INPUT past_key_values=DynamicCache(key_cache=#1[T1s2x1x30x96], value_cache=#1[T1s2x1x30x96])
    [validate_model] +SHAPE input_ids={0: batch, 1: seq_length}
    [validate_model] +SHAPE attention_mask={0: batch, 1: DYNAMIC}
    [validate_model] +SHAPE position_ids={0: batch, 1: DYNAMIC}
    [validate_model] +SHAPE past_key_values=[[{0: batch, 2: cache_length}], [{0: batch, 2: cache_length}]]
    [validate_model] run the model...
    [validate_model] inputs=dict(input_ids:T7s2x3,attention_mask:T7s2x33,position_ids:T7s2x3,past_key_values:DynamicCache(key_cache=#1[T1s2x1x30x96], value_cache=#1[T1s2x1x30x96]))
    [validate_model] done (run)
    [validate_model] done (final)
    
    -- summary --
    :model_class,LlamaForCausalLM;
    :model_config,{'vocab_size':32000,'max_position_embeddings':1024,'hidden_size':192,'intermediate_size':1024,'num_hidden_layers':1,'num_attention_heads':2,'num_key_value_heads':1,'hidden_act':'silu','initializer_range':0.02,'rms_norm_eps':1e-05,'pretraining_tp':1,'use_cache':True,'rope_theta':10000.0,'rope_scaling':None,'attention_bias':False,'attention_dropout':0.0,'mlp_bias':False,'head_dim':96,'return_dict':True,'output_hidden_states':False,'output_attentions':False,'torchscript':False,'torch_dtype':'float32','use_bfloat16':False,'tf_legacy_loss':False,'pruned_heads':{},'tie_word_embeddings':False,'chunk_size_feed_forward':0,'is_encoder_decoder':False,'is_decoder':False,'cross_attention_hidden_size':None,'add_cross_attention':False,'tie_encoder_decoder':False,'max_length':20,'min_length':0,'do_sample':False,'early_stopping':False,'num_beams':1,'num_beam_groups':1,'diversity_penalty':0.0,'temperature':1.0,'top_k':50,'top_p':1.0,'typical_p':1.0,'repetition_penalty':1.0,'length_penalty':1.0,'no_repeat_ngram_size':0,'encoder_no_repeat_ngram_size':0,'bad_words_ids':None,'num_return_sequences':1,'output_scores':False,'return_dict_in_generate':False,'forced_bos_token_id':None,'forced_eos_token_id':None,'remove_invalid_values':False,'exponential_decay_length_penalty':None,'suppress_tokens':None,'begin_suppress_tokens':None,'architectures':['LlamaForCausalLM'],'finetuning_task':None,'id2label':{0:'LABEL_0',1:'LABEL_1'},'label2id':{'LABEL_0':0,'LABEL_1':1},'tokenizer_class':None,'prefix':None,'bos_token_id':1,'pad_token_id':None,'eos_token_id':2,'sep_token_id':None,'decoder_start_token_id':None,'task_specific_params':None,'problem_type':None,'_name_or_path':'','_attn_implementation_autoset':True,'transformers_version':'4.51.0.dev0','model_type':'llama'};
    :model_config_class,LlamaConfig;
    :model_expected,dict(logits:T1s2x3x32000,past_key_values:DynamicCache(key_cache=#1[T1s2x1x33x96], value_cache=#1[T1s2x1x33x96]));
    :model_id,arnir0/Tiny-LLM;
    :model_inputs,dict(input_ids:T7s2x3,attention_mask:T7s2x33,position_ids:T7s2x3,past_key_values:DynamicCache(key_cache=#1[T1s2x1x30x96], value_cache=#1[T1s2x1x30x96]));
    :model_nweights,12988992;
    :model_shapes,{'input_ids': {0: batch, 1: seq_length}, 'attention_mask': {0: batch, 1: DYNAMIC}, 'position_ids': {0: batch, 1: DYNAMIC}, 'past_key_values': [[{0: batch, 2: cache_length}], [{0: batch, 2: cache_length}]]};
    :model_size,51955968;
    :model_task,text-generation;
    :time_create,0.36135141500562895;
    :time_run,0.017487541001173668;

Validate and export a model

Exports a model for the given task and checks for discrepancies as well. The latencies reported are for a single run only. They indicate how long the benchmark takes, but they are far from the latency one would measure by running the same model multiple times.

python -m onnx_diagnostic validate -m arnir0/Tiny-LLM --run -v 1 --export export-nostrict -o dump_models --patch
    [validate_model] dump into 'arnir0-Tiny-LLM-export-nostrict-'
    [validate_model] validate model id 'arnir0/Tiny-LLM'
    [validate_model] get dummy inputs...
    [get_untrained_model_with_inputs] model_id='arnir0/Tiny-LLM'
    [get_untrained_model_with_inputs] architecture='LlamaForCausalLM'
    [get_untrained_model_with_inputs] cls='LlamaConfig'
    [get_untrained_model_with_inputs] task='text-generation'
    [validate_model] task=text-generation
    [validate_model] size=49.549072265625 Mb
    [validate_model] n_weights=12.988992 millions parameters
    [validate_model] +INPUT input_ids=T7s2x3
    [validate_model] +INPUT attention_mask=T7s2x33
    [validate_model] +INPUT position_ids=T7s2x3
    [validate_model] +INPUT past_key_values=DynamicCache(key_cache=#1[T1s2x1x30x96], value_cache=#1[T1s2x1x30x96])
    [validate_model] +SHAPE input_ids={0: batch, 1: seq_length}
    [validate_model] +SHAPE attention_mask={0: batch, 1: DYNAMIC}
    [validate_model] +SHAPE position_ids={0: batch, 1: DYNAMIC}
    [validate_model] +SHAPE past_key_values=[[{0: batch, 2: cache_length}], [{0: batch, 2: cache_length}]]
    [validate_model] run the model...
    [validate_model] inputs=dict(input_ids:T7s2x3,attention_mask:T7s2x33,position_ids:T7s2x3,past_key_values:DynamicCache(key_cache=#1[T1s2x1x30x96], value_cache=#1[T1s2x1x30x96]))
    [validate_model] done (run)
    [validate_model] export the model with 'export-nostrict', optimization=None
    [validate_model] applies patches before exporting
    [validate_model] run patched model...
    [validate_model] patched inputs=dict(input_ids:T7s2x3,attention_mask:T7s2x33,position_ids:T7s2x3,past_key_values:DynamicCache(key_cache=#1[T1s2x1x30x96], value_cache=#1[T1s2x1x30x96]))
    [validate_model] done (patched run)
    [validate_model] patched discrepancies=abs=0, rel=0
    [call_torch_export_export] exporter='export-nostrict', strict=False, optimization=None
    [call_torch_export_export] args=()
    [call_torch_export_export] kwargs=dict(input_ids:T7r2,attention_mask:T7r2,position_ids:T7r2,past_key_values:DynamicCache(key_cache=#1[T1r4], value_cache=#1[T1r4]))
    [call_torch_export_export] dynamic_shapes={'input_ids': {0: batch, 1: seq_length}, 'attention_mask': {0: batch, 1: DYNAMIC}, 'position_ids': {0: batch, 1: DYNAMIC}, 'past_key_values': [[{0: batch, 2: cache_length}], [{0: batch, 2: cache_length}]]}
    [call_torch_export_export] export...
    /home/xadupre/vv/this312/lib/python3.12/site-packages/torch/backends/mkldnn/__init__.py:78: UserWarning: TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support. (Triggered internally at /pytorch/aten/src/ATen/Context.cpp:148.)
      torch._C._set_onednn_allow_tf32(_allow_tf32)
    [call_torch_export_export] done (export) with 146 nodes
    [validate_model] run exported model...
    [validate_model] patched inputs=dict(input_ids:T7s2x3,attention_mask:T7s2x33,position_ids:T7s2x3,past_key_values:DynamicCache(key_cache=#1[T1s2x1x30x96], value_cache=#1[T1s2x1x30x96]))
    [validate_model] done (exported run)
    [validate_model] exported discrepancies=abs=0, rel=0
    [validate_model] dumps exported program in 'dump_models/arnir0-Tiny-LLM-export-nostrict-'...
    [validate_model] done (dump ep)
    [validate_model] dumps statistics in 'dump_models/arnir0-Tiny-LLM-export-nostrict-'...
    [validate_model] done (dump)
    [validate_model] done (final)
    
    -- summary --
    :disc_exported_abs,0;
    :disc_exported_dnan,0;
    :disc_exported_n,204672.0;
    :disc_exported_rel,0;
    :disc_exported_sum,0.0;
    :disc_patched_abs,0;
    :disc_patched_dnan,0;
    :disc_patched_n,204672.0;
    :disc_patched_rel,0;
    :disc_patched_sum,0.0;
    :dump_folder,dump_models/arnir0-Tiny-LLM-export-nostrict-;
    :dump_folder_name,arnir0-Tiny-LLM-export-nostrict-;
    :export_args,();
    :export_exporter,export-nostrict;
    :export_graph_nodes,146;
    :export_kwargs,dict(input_ids:T7s2x3,attention_mask:T7s2x33,position_ids:T7s2x3,past_key_values:DynamicCache(key_cache=#1[T1s2x1x30x96], value_cache=#1[T1s2x1x30x96]));
    :export_optimization,;
    :export_strict,False;
    :model_class,LlamaForCausalLM;
    :model_config,{'vocab_size':32000,'max_position_embeddings':1024,'hidden_size':192,'intermediate_size':1024,'num_hidden_layers':1,'num_attention_heads':2,'num_key_value_heads':1,'hidden_act':'silu','initializer_range':0.02,'rms_norm_eps':1e-05,'pretraining_tp':1,'use_cache':True,'rope_theta':10000.0,'rope_scaling':None,'attention_bias':False,'attention_dropout':0.0,'mlp_bias':False,'head_dim':96,'return_dict':True,'output_hidden_states':False,'output_attentions':False,'torchscript':False,'torch_dtype':'float32','use_bfloat16':False,'tf_legacy_loss':False,'pruned_heads':{},'tie_word_embeddings':False,'chunk_size_feed_forward':0,'is_encoder_decoder':False,'is_decoder':False,'cross_attention_hidden_size':None,'add_cross_attention':False,'tie_encoder_decoder':False,'max_length':20,'min_length':0,'do_sample':False,'early_stopping':False,'num_beams':1,'num_beam_groups':1,'diversity_penalty':0.0,'temperature':1.0,'top_k':50,'top_p':1.0,'typical_p':1.0,'repetition_penalty':1.0,'length_penalty':1.0,'no_repeat_ngram_size':0,'encoder_no_repeat_ngram_size':0,'bad_words_ids':None,'num_return_sequences':1,'output_scores':False,'return_dict_in_generate':False,'forced_bos_token_id':None,'forced_eos_token_id':None,'remove_invalid_values':False,'exponential_decay_length_penalty':None,'suppress_tokens':None,'begin_suppress_tokens':None,'architectures':['LlamaForCausalLM'],'finetuning_task':None,'id2label':{0:'LABEL_0',1:'LABEL_1'},'label2id':{'LABEL_0':0,'LABEL_1':1},'tokenizer_class':None,'prefix':None,'bos_token_id':1,'pad_token_id':None,'eos_token_id':2,'sep_token_id':None,'decoder_start_token_id':None,'task_specific_params':None,'problem_type':None,'_name_or_path':'','_attn_implementation_autoset':True,'transformers_version':'4.51.0.dev0','model_type':'llama'};
    :model_config_class,LlamaConfig;
    :model_expected,dict(logits:T1s2x3x32000,past_key_values:DynamicCache(key_cache=#1[T1s2x1x33x96], value_cache=#1[T1s2x1x33x96]));
    :model_id,arnir0/Tiny-LLM;
    :model_inputs,dict(input_ids:T7s2x3,attention_mask:T7s2x33,position_ids:T7s2x3,past_key_values:DynamicCache(key_cache=#1[T1s2x1x30x96], value_cache=#1[T1s2x1x30x96]));
    :model_nweights,12988992;
    :model_shapes,{'input_ids': {0: batch, 1: seq_length}, 'attention_mask': {0: batch, 1: DYNAMIC}, 'position_ids': {0: batch, 1: DYNAMIC}, 'past_key_values': [[{0: batch, 2: cache_length}], [{0: batch, 2: cache_length}]]};
    :model_size,51955968;
    :model_task,text-generation;
    :time_create,0.30678167000587564;
    :time_export_export,0.8954964470030973;
    :time_run,0.015602751009282656;
    :time_run_exported,0.05795702298928518;
    :time_run_patched,0.0037754869990749285;