llama config={'input_dims': [(2, 1024), (2, 1024), (2, 1024), (2, 1024), (2, 1024), (2, 1024), (2, 1024), (2, 1024)], 'hidden_size': 1024, 'num_hidden_layers': 1, 'vocab_size': 1024, 'intermediate_size': 1024, 'max_position_embeddings': 1024, 'num_attention_heads': 2, '_attn_implementation': 'eager'}
backend=custom
verbose=0
implementation=eager
mixed=1
disable_pattern=[]
enable_pattern=['default']
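For context, a minimal sketch of how a model matching the configuration above could be built and compiled. This is not the benchmark's own helper: the `LlamaConfig`/`LlamaModel` construction and the placeholder backend are assumptions; the actual "custom" backend lowers the FX graph to ONNX Runtime, as the call profile further down shows.

```python
# Hedged sketch: build a LlamaModel matching the configuration above and
# compile it with a custom dynamo backend. The backend below is a placeholder;
# the benchmark's custom backend runs the graph through ONNX Runtime.
import torch
from transformers import LlamaConfig, LlamaModel

config = LlamaConfig(
    hidden_size=1024,
    num_hidden_layers=1,
    vocab_size=1024,
    intermediate_size=1024,
    max_position_embeddings=1024,
    num_attention_heads=2,
)
config._attn_implementation = "eager"
model = LlamaModel(config).to("cuda")

def custom_backend(gm: torch.fx.GraphModule, example_inputs):
    # A real backend would export and execute the graph with ONNX Runtime;
    # returning gm.forward keeps this sketch self-contained and runnable.
    return gm.forward

compiled_model = torch.compile(model, backend=custom_backend, dynamic=False)
```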
CUDA no model: memory allocated=0, reserved=0
CUDA model loaded: memory allocated=37762048, reserved=44040192
Build the compile model with backend=custom
dynamic=False
warmup on device=cuda
CUDA memory allocated=37762048, reserved=44040192
warmup done in 2.9089201999995566s.
memory allocated=37786624, reserved=65011712
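The allocated/reserved figures above are presumably read from the standard CUDA memory counters, for example:

```python
# Sketch (assuming the standard torch.cuda API rather than the benchmark's
# exact helper): how the allocated/reserved figures above can be queried.
import torch

print("CUDA memory allocated =", torch.cuda.memory_allocated())
print("CUDA memory reserved  =", torch.cuda.memory_reserved())
```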
measures
measures done.
dynamic=0
mixed=1
backend=custom
num_hidden_layers=1
mixed=1
repeat=5
device=cuda
avg=0.05731345999993209
times=[0.05914619999930437, 0.05698019999999815, 0.05694430000039574, 0.05702159999964351, 0.05647500000031869]
warmup_times=[2.7931430999997247, 0.05779750000056083, 0.05797959999927116]
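The profile below indicates that each measured iteration runs the forward pass under autocast (mixed=1), drives backward through a sum of the output, and synchronizes the device before the timer is read. A hedged sketch of such a loop follows; the function and variable names are assumptions, not the benchmark's code:

```python
# Hedged sketch of one measured iteration and of the warmup/repeat loop,
# mirroring what the profile below shows (autocast, backward on output.sum(),
# torch.cuda.synchronize before reading the clock).
import time
import torch

def loop_iteration(model, inputs):
    with torch.autocast(device_type="cuda", dtype=torch.float16):  # mixed=1
        output = model(*inputs)
    output[0].sum().backward()   # backward driven through a .sum() of the output
    torch.cuda.synchronize()     # dominates wall time in the profile

def measure(model, inputs, warmup=3, repeat=5):
    warmup_times, times = [], []
    for _ in range(warmup):
        begin = time.perf_counter()
        loop_iteration(model, inputs)
        warmup_times.append(time.perf_counter() - begin)
    for _ in range(repeat):
        begin = time.perf_counter()
        loop_iteration(model, inputs)
        times.append(time.perf_counter() - begin)
    return warmup_times, times, sum(times) / len(times)
```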
-----------
:llama,2x1024-1024-1-1024-1024-1024-2-eager;
:config,medium;
:mixed,1;
:dynamic,False;
:backend,custom;
:repeat,5;
:warmup,3;
:torch,2.3.0.dev20240314+cu118;
:transformers,4.37.2;
:patterns,+default-;
:warmup_time,2.9089201999995566;
:time,0.05731345999993209;
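The call tree below appears to be a rendering of a cProfile run over the five measured iterations. A sketch of how such a profile can be collected is shown here; the hierarchical rendering itself comes from a helper not shown, and `loop_iteration`/`compiled_model`/`inputs` are the assumed names from the sketches above.

```python
# Sketch: collect a cProfile over the measured iterations. The tree rendering
# shown below comes from a separate helper; pstats' default tabular output
# is printed here instead.
import cProfile
import io
import pstats

profiler = cProfile.Profile()
profiler.enable()
for _ in range(5):
    loop_iteration(compiled_model, inputs)  # assumed helpers from the sketches above
profiler.disable()

buffer = io.StringIO()
pstats.Stats(profiler, stream=buffer).sort_stats("cumulative").print_stats(40)
print(buffer.getvalue())
```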
--------------------------------------------------------------------------
g -- 5 15 -- 0.00006 0.01478 -- _aot_autograd/utils.py:88:g (g)
runtime_wrapper -- 5 5 -- 0.00013 0.01476 -- _aot_autograd/runtime_wrappers.py:77:runtime_wrapper (runtime_wrapper)
call_func_at_runtime_with_args -- 5 5 -- 0.00007 0.01435 -- _aot_autograd/utils.py:105:call_func_at_runtime_with_args (call_func_at_runtime_with_args) +++
__init__ -- 5 5 -- 0.00002 0.00004 -- autograd/grad_mode.py:350:__init__ (__init__)
__enter__ -- 5 5 -- 0.00000 0.00000 -- autograd/grad_mode.py:355:__enter__ (__enter__)
__exit__ -- 5 5 -- 0.00001 0.00002 -- autograd/grad_mode.py:358:__exit__ (__exit__)
<built-in method builtins.isinstance> -- 20 20 -- 0.00001 0.00001 -- ~:0:<built-in method builtins.isinstance> (<built-in method builtins.isinstance>) +++
<built-in method torch.tensor> -- 5 5 -- 0.00014 0.00014 -- ~:0:<built-in method torch.tensor> (<built-in method torch.tensor>)
<method 'detach' of 'torch._C.TensorBase' objects> -- 20 20 -- 0.00007 0.00007 -- ~:0:<method 'detach' of 'torch._C.TensorBase' objects> (<method 'detach' of 'torch._C.TensorBase' objects>)
apply -- 5 5 -- 0.00006 0.01424 -- autograd/function.py:582:apply (apply)
unwrap_dead_wrappers -- 5 5 -- 0.00006 0.00030 -- _functorch/utils.py:19:unwrap_dead_wrappers (unwrap_dead_wrappers)
<genexpr> -- 80 80 -- 0.00013 0.00023 -- _functorch/utils.py:21:<genexpr> (<genexpr>)
<built-in method builtins.isinstance> -- 75 75 -- 0.00004 0.00004 -- ~:0:<built-in method builtins.isinstance> (<built-in method builtins.isinstance>) +++
<built-in method torc...orch.unwrap_if_dead> -- 75 75 -- 0.00007 0.00007 -- ~:0:<built-in method torch._C._functorch.unwrap_if_dead> (<built-in method torch._C._functorch.unwrap_if_dead>)
__getattribute__ -- 10 10 -- 0.00003 0.00003 -- autograd/function.py:346:__getattribute__ (__getattribute__) +++
<built-in method apply> -- 5 5 -- 0.00026 0.01385 -- ~:0:<built-in method apply> (<built-in method apply>)
forward -- 5 5 -- 0.00034 0.01356 -- _aot_autograd/jit_compile_runtime_wrappers.py:485:forward (forward)
<genexpr> -- 190 190 -- 0.00019 0.00030 -- _aot_autograd/jit_compile_runtime_wrappers.py:539:<genexpr> (<genexpr>)
<method '_is_view' ...nsorBase' objects> -- 185 185 -- 0.00010 0.00010 -- ~:0:<method '_is_view' of 'torch._C.TensorBase' objects> (<method '_is_view' of 'torch._C.TensorBase' objects>)
<listcomp> -- 5 5 -- 0.00005 0.00005 -- _aot_autograd/jit_compile_runtime_wrappers.py:604:<listcomp> (<listcomp>)
<listcomp> -- 5 5 -- 0.00002 0.00003 -- _aot_autograd/jit_compile_runtime_wrappers.py:610:<listcomp> (<listcomp>)
<built-in method builtins.isinstance> -- 15 15 -- 0.00001 0.00001 -- ~:0:<built-in method builtins.isinstance> (<built-in method builtins.isinstance>) +++
functionalized_rng_runtime_epilogue -- 5 5 -- 0.00000 0.00000 -- _aot_autograd/runtime_wrappers.py:287:functionalized_rng_runtime_epilogue (functionalized_rng_runtime_epilogue)
tensors_saved_for_backwards_slice -- 5 5 -- 0.00001 0.00001 -- _aot_autograd/schemas.py:410:tensors_saved_for_backwards_slice (tensors_saved_for_backwards_slice)
symints_saved_for_backwards_slice -- 5 5 -- 0.00001 0.00001 -- _aot_autograd/schemas.py:418:symints_saved_for_backwards_slice (symints_saved_for_backwards_slice)
call_func_at_runtime_with_args -- 5 5 -- 0.00006 0.01219 -- _aot_autograd/utils.py:105:call_func_at_runtime_with_args (call_func_at_runtime_with_args) +++
save_for_backward -- 5 5 -- 0.00001 0.00001 -- autograd/function.py:33:save_for_backward (save_for_backward)
mark_non_differentiable -- 5 5 -- 0.00001 0.00001 -- autograd/function.py:189:mark_non_differentiable (mark_non_differentiable)
__getattribute__ -- 80 80 -- 0.00007 0.00007 -- autograd/function.py:346:__getattribute__ (__getattribute__) +++
__call__ -- 5 5 -- 0.00002 0.00018 -- torch/_ops.py:852:__call__ (__call__)
<built-in method to...aten._unsafe_view> -- 5 5 -- 0.00017 0.00017 -- ~:0:<built-in method torch._ops.aten._unsafe_view> (<built-in method torch._ops.aten._unsafe_view>)
<built-in method builtins.all> -- 10 10 -- 0.00010 0.00036 -- ~:0:<built-in method builtins.all> (<built-in method builtins.all>)
<genexpr> -- 190 190 -- 0.00018 0.00026 -- _aot_autograd/jit_compile_runtime_wrappers.py:536:<genexpr> (<genexpr>)
<built-in method ...tins.isinstance> -- 185 185 -- 0.00008 0.00008 -- ~:0:<built-in method builtins.isinstance> (<built-in method builtins.isinstance>) +++
<genexpr> -- 5 5 -- 0.00000 0.00000 -- _aot_autograd/jit_compile_runtime_wrappers.py:547:<genexpr> (<genexpr>)
__getattribute__ -- 15 15 -- 0.00003 0.00003 -- autograd/function.py:346:__getattribute__ (__getattribute__) +++
__call__ -- 5 5 -- 0.00017 0.01209 -- torch_dynamo/fast_backend.py:171:__call__ (__call__)
_run_onnx_session_with_ortvaluevector -- 5 5 -- 0.00013 0.01182 -- torch_dynamo/fast_backend.py:264:_run_onnx_session_with_ortvaluevector (_run_onnx_session_with_ortvaluevector)
run_with_ortvaluevector -- 5 5 -- 0.00914 0.00914 -- capi/onnxruntime_inference_collection.py:339:run_with_ortvaluevector (run_with_ortvaluevector)
_get_ortvalues_from_torch_tensors -- 5 5 -- 0.00096 0.00154 -- torch_dynamo/fast_backend.py:195:_get_ortvalues_from_torch_tensors (_get_ortvalues_from_torch_tensors)
<method 'append' of 'list' objects> -- 575 575 -- 0.00026 0.00026 -- ~:0:<method 'append' of 'list' objects> (<method 'append' of 'list' objects>) +++
<built-in method builtins.isinstance> -- 80 80 -- 0.00005 0.00005 -- ~:0:<built-in method builtins.isinstance> (<built-in method builtins.isinstance>) +++
<built-in method builtins.max> -- 75 75 -- 0.00006 0.00006 -- ~:0:<built-in method builtins.max> (<built-in method builtins.max>)
<method 'data_ptr' of...TensorBase' objects> -- 75 75 -- 0.00006 0.00006 -- ~:0:<method 'data_ptr' of 'torch._C.TensorBase' objects> (<method 'data_ptr' of 'torch._C.TensorBase' objects>)
<method 'get_device' ...TensorBase' objects> -- 150 150 -- 0.00008 0.00008 -- ~:0:<method 'get_device' of 'torch._C.TensorBase' objects> (<method 'get_device' of 'torch._C.TensorBase' objects>)
<method 'size' of 'to...TensorBase' objects> -- 75 75 -- 0.00007 0.00007 -- ~:0:<method 'size' of 'torch._C.TensorBase' objects> (<method 'size' of 'torch._C.TensorBase' objects>)
_ortvalues_to_torch_tensor -- 5 5 -- 0.00036 0.00080 -- torch_dynamo/fast_backend.py:253:_ortvalues_to_torch_tensor (_ortvalues_to_torch_tensor)
<genexpr> -- 205 205 -- 0.00032 0.00042 -- torch_dynamo/fast_backend.py:260:<genexpr> (<genexpr>)
_post_process -- 200 200 -- 0.00010 0.00010 -- torch_dynamo/fast_backend.py:50:_post_process (_post_process)
<genexpr> -- 80 80 -- 0.00013 0.00021 -- torch_dynamo/fast_backend.py:268:<genexpr> (<genexpr>)
<built-in method builtins.isinstance> -- 75 75 -- 0.00004 0.00004 -- ~:0:<built-in method builtins.isinstance> (<built-in method builtins.isinstance>) +++
<method 'contiguous' ...TensorBase' objects> -- 75 75 -- 0.00005 0.00005 -- ~:0:<method 'contiguous' of 'torch._C.TensorBase' objects> (<method 'contiguous' of 'torch._C.TensorBase' objects>)
<built-in method builtins.isinstance> -- 200 200 -- 0.00010 0.00010 -- ~:0:<built-in method builtins.isinstance> (<built-in method builtins.isinstance>) +++
call_func_at_runtime_with_args -- 5 10 -- 0.00013 0.01435 -- _aot_autograd/utils.py:105:call_func_at_runtime_with_args (call_func_at_runtime_with_args)
normalize_as_list -- 10 10 -- 0.00002 0.00002 -- _aot_autograd/utils.py:69:normalize_as_list (normalize_as_list)
g -- 10 5 -- 0.00004 0.01426 -- _aot_autograd/utils.py:88:g (g) +++
_fn -- 5 10 -- 0.00013 0.01576 -- _dynamo/eval_frame.py:427:_fn (_fn)
revert -- 10 10 -- 0.00001 0.00001 -- _dynamo/eval_frame.py:148:revert (revert)
nothing -- 5 5 -- 0.00000 0.00000 -- _dynamo/eval_frame.py:256:nothing (nothing)
always_false -- 10 10 -- 0.00001 0.00001 -- _dynamo/eval_frame.py:260:always_false (always_false)
<listcomp> -- 10 10 -- 0.00004 0.00020 -- _dynamo/eval_frame.py:447:<listcomp> (<listcomp>)
change -- 10 10 -- 0.00005 0.00007 -- _dynamo/eval_frame.py:140:change (change)
call_on_enter -- 5 5 -- 0.00001 0.00005 -- _dynamo/eval_frame.py:317:call_on_enter (call_on_enter)
on_enter -- 5 5 -- 0.00001 0.00004 -- _dynamo/eval_frame.py:524:on_enter (on_enter)
install_generation_tagging_init -- 5 5 -- 0.00002 0.00003 -- _dynamo/mutation_guard.py:101:install_generation_tagging_init (install_generation_tagging_init)
change -- 5 5 -- 0.00002 0.00004 -- utils/_config_module.py:289:change (change)
<dictcomp> -- 5 5 -- 0.00001 0.00001 -- utils/_config_module.py:290:<dictcomp> (<dictcomp>)
inner -- 5 5 -- 0.00002 0.01483 -- _dynamo/external_utils.py:34:inner (inner)
forward -- 5 5 -- 0.00002 0.01482 -- _functorch/aot_autograd.py:913:forward (forward)
g -- 5 5 -- 0.00002 0.01478 -- _aot_autograd/utils.py:88:g (g) +++
is_fx_tracing -- 5 5 -- 0.00001 0.00001 -- fx/_symbolic_trace.py:46:is_fx_tracing (is_fx_tracing)
_wrapped_call_impl -- 5 5 -- 0.00002 0.01547 -- modules/module.py:1523:_wrapped_call_impl (_wrapped_call_impl) +++
revert -- 5 5 -- 0.00001 0.00002 -- utils/_config_module.py:293:revert (revert)
<built-in method torch._C._...eval_frame.set_eval_frame> -- 20 20 -- 0.00002 0.00002 -- ~:0:<built-in method torch._C._dynamo.eval_frame.set_eval_frame> (<built-in method torch._C._dynamo.eval_frame.set_eval_frame>)
__getattribute__ -- 105 105 -- 0.00012 0.00012 -- autograd/function.py:346:__getattribute__ (__getattribute__)
is_available -- 10 10 -- 0.00008 0.00033 -- cuda/__init__.py:105:is_available (is_available)
_is_compiled -- 10 10 -- 0.00002 0.00003 -- cuda/__init__.py:96:_is_compiled (_is_compiled)
_nvml_based_avail -- 10 10 -- 0.00003 0.00021 -- cuda/__init__.py:101:_nvml_based_avail (_nvml_based_avail)
getenv -- 10 10 -- 0.00003 0.00018 -- python3.10/os.py:772:getenv (getenv)
get -- 10 10 -- 0.00004 0.00015 -- python3.10/_collections_abc.py:821:get (get)
__getitem__ -- 10 10 -- 0.00006 0.00011 -- python3.10/os.py:675:__getitem__ (__getitem__)
encode -- 10 10 -- 0.00004 0.00005 -- python3.10/os.py:755:encode (encode)
_lazy_init -- 10 10 -- 0.00004 0.00007 -- cuda/__init__.py:263:_lazy_init (_lazy_init)
is_initialized -- 10 10 -- 0.00002 0.00003 -- cuda/__init__.py:216:is_initialized (is_initialized)
_wrapped_call_impl -- 5 10 -- 0.00004 0.01586 -- modules/module.py:1523:_wrapped_call_impl (_wrapped_call_impl)
_call_impl -- 5 10 -- 0.00014 0.01583 -- modules/module.py:1529:_call_impl (_call_impl)
guard -- 5 5 -- 0.00023 0.00039 -- <string>:2:guard (guard)
check_current_backend -- 5 5 -- 0.00001 0.00001 -- _dynamo/eval_frame.py:86:check_current_backend (check_current_backend)
__getattr__ -- 5 5 -- 0.00002 0.00002 -- modules/module.py:1691:__getattr__ (__getattr__)
<method 'keys' of 'coll....OrderedDict' objects> -- 20 20 -- 0.00001 0.00001 -- ~:0:<method 'keys' of 'collections.OrderedDict' objects> (<method 'keys' of 'collections.OrderedDict' objects>)
<built-in method torch.....guards.check_type_id> -- 40 40 -- 0.00002 0.00002 -- ~:0:<built-in method torch._C._dynamo.guards.check_type_id> (<built-in method torch._C._dynamo.guards.check_type_id>)
<built-in method torch....o.guards.check_obj_id> -- 75 75 -- 0.00005 0.00005 -- ~:0:<built-in method torch._C._dynamo.guards.check_obj_id> (<built-in method torch._C._dynamo.guards.check_obj_id>)
_fn -- 5 5 -- 0.00007 0.01576 -- _dynamo/eval_frame.py:427:_fn (_fn) +++
forward -- 5 5 -- 0.00002 0.01496 -- torch_models/llama_helper.py:179:forward (forward)
_fn -- 5 5 -- 0.00005 0.01494 -- _dynamo/eval_frame.py:427:_fn (_fn) +++
is_scripting -- 25 25 -- 0.00002 0.00002 -- torch/_jit_internal.py:1120:is_scripting (is_scripting)
main_loop -- 1 1 -- 0.00037 0.28833 -- torch_bench/dort_profile.py:174:main_loop (main_loop)
loop_iteration -- 5 5 -- 0.00014 0.28625 -- torch_bench/dort_profile.py:101:loop_iteration (loop_iteration)
__init__ -- 5 5 -- 0.00009 0.00029 -- amp/autocast_mode.py:187:__init__ (__init__)
amp_definitely_not_available -- 5 5 -- 0.00001 0.00017 -- amp/common.py:8:amp_definitely_not_available (amp_definitely_not_available)
is_available -- 5 5 -- 0.00002 0.00016 -- cuda/__init__.py:105:is_available (is_available) +++
is_scripting -- 5 5 -- 0.00000 0.00000 -- torch/_jit_internal.py:1120:is_scripting (is_scripting) +++
__enter__ -- 5 5 -- 0.00005 0.00009 -- amp/autocast_mode.py:320:__enter__ (__enter__)
is_scripting -- 5 5 -- 0.00000 0.00000 -- torch/_jit_internal.py:1120:is_scripting (is_scripting) +++
__exit__ -- 5 5 -- 0.00005 0.00008 -- amp/autocast_mode.py:370:__exit__ (__exit__)
is_scripting -- 5 5 -- 0.00000 0.00000 -- torch/_jit_internal.py:1120:is_scripting (is_scripting) +++
synchronize -- 5 5 -- 0.00008 0.25052 -- cuda/__init__.py:782:synchronize (synchronize)
_lazy_init -- 5 5 -- 0.00003 0.00006 -- cuda/__init__.py:263:_lazy_init (_lazy_init) +++
__init__ -- 5 5 -- 0.00002 0.00044 -- cuda/__init__.py:360:__init__ (__init__)
_get_device_index -- 5 5 -- 0.00004 0.00043 -- cuda/_utils.py:9:_get_device_index (_get_device_index)
is_scripting -- 5 5 -- 0.00001 0.00001 -- torch/_jit_internal.py:1120:is_scripting (is_scripting) +++
_get_device_index -- 5 5 -- 0.00003 0.00037 -- torch/_utils.py:759:_get_device_index (_get_device_index)
is_scripting -- 5 5 -- 0.00000 0.00000 -- torch/_jit_internal.py:1120:is_scripting (is_scripting) +++
_get_current_device_index -- 5 5 -- 0.00001 0.00033 -- torch/_utils.py:733:_get_current_device_index (_get_current_device_index)
_get_device_attr -- 5 5 -- 0.00007 0.00032 -- torch/_utils.py:721:_get_device_attr (_get_device_attr)
_get_available_device_type -- 5 5 -- 0.00001 0.00019 -- torch/_utils.py:708:_get_available_device_type (_get_available_device_type)
is_available -- 5 5 -- 0.00006 0.00018 -- cuda/__init__.py:105:is_available (is_available) +++
<lambda> -- 5 5 -- 0.00001 0.00005 -- torch/_utils.py:735:<lambda> (<lambda>)
current_device -- 5 5 -- 0.00001 0.00004 -- cuda/__init__.py:776:current_device (current_device)
_lazy_init -- 5 5 -- 0.00001 0.00001 -- cuda/__init__.py:263:_lazy_init (_lazy_init) +++
<built-in method ...tins.isinstance> -- 15 15 -- 0.00001 0.00001 -- ~:0:<built-in method builtins.isinstance> (<built-in method builtins.isinstance>) +++
<built-in method builtins.isinstance> -- 20 20 -- 0.00001 0.00001 -- ~:0:<built-in method builtins.isinstance> (<built-in method builtins.isinstance>) +++
__enter__ -- 5 5 -- 0.00002 0.00002 -- cuda/__init__.py:364:__enter__ (__enter__)
__exit__ -- 5 5 -- 0.00003 0.00006 -- cuda/__init__.py:367:__exit__ (__exit__)
<built-in method torch._C._cuda_synchronize> -- 5 5 -- 0.24986 0.24986 -- ~:0:<built-in method torch._C._cuda_synchronize> (<built-in method torch._C._cuda_synchronize>)
_wrapped_call_impl -- 5 5 -- 0.00002 0.01586 -- modules/module.py:1523:_wrapped_call_impl (_wrapped_call_impl) +++
backward -- 5 5 -- 0.00012 0.01879 -- torch/_tensor.py:466:backward (backward)
backward -- 5 5 -- 0.00010 0.01867 -- autograd/__init__.py:165:backward (backward)
_make_grads -- 5 5 -- 0.00006 0.00028 -- autograd/__init__.py:60:_make_grads (_make_grads)
<built-in method torch.ones_like> -- 5 5 -- 0.00021 0.00021 -- ~:0:<built-in method torch.ones_like> (<built-in method torch.ones_like>)
_tensor_or_tensors_to_tuple -- 5 5 -- 0.00001 0.00001 -- autograd/__init__.py:155:_tensor_or_tensors_to_tuple (_tensor_or_tensors_to_tuple)
_engine_run_backward -- 5 5 -- 0.00007 0.01825 -- autograd/graph.py:739:_engine_run_backward (_engine_run_backward)
getEffectiveLevel -- 5 5 -- 0.00002 0.00002 -- logging/__init__.py:1710:getEffectiveLevel (getEffectiveLevel)
<method 'run_backwa...gineBase' objects> -- 5 5 -- 0.01816 0.01816 -- ~:0:<method 'run_backward' of 'torch._C._EngineBase' objects> (<method 'run_backward' of 'torch._C._EngineBase' objects>)
<method 'sum' of 'torch._C.TensorBase' objects> -- 5 5 -- 0.00049 0.00049 -- ~:0:<method 'sum' of 'torch._C.TensorBase' objects> (<method 'sum' of 'torch._C.TensorBase' objects>)
<listcomp> -- 5 5 -- 0.00006 0.00169 -- torch_bench/dort_profile.py:176:<listcomp> (<listcomp>)
<method 'to' of 'torch._C.TensorBase' objects> -- 10 10 -- 0.00162 0.00162 -- ~:0:<method 'to' of 'torch._C.TensorBase' objects> (<method 'to' of 'torch._C.TensorBase' objects>)
<method 'append' of 'list' objects> -- 585 585 -- 0.00027 0.00027 -- ~:0:<method 'append' of 'list' objects> (<method 'append' of 'list' objects>)
<built-in method builtins.isinstance> -- 730 730 -- 0.00036 0.00036 -- ~:0:<built-in method builtins.isinstance> (<built-in method builtins.isinstance>)
<built-in method builtins.len> -- 25 25 -- 0.00003 0.00003 -- ~:0:<built-in method builtins.len> (<built-in method builtins.len>)
<built-in method builtins.hasattr> -- 30 30 -- 0.00003 0.00003 -- ~:0:<built-in method builtins.hasattr> (<built-in method builtins.hasattr>)
<built-in method builtins.id> -- 15 15 -- 0.00001 0.00001 -- ~:0:<built-in method builtins.id> (<built-in method builtins.id>)