Source code for experimental_experiment.torch_bench.dort_profile

"""
Profile python execution for DORT
=================================

The script runs a few iterations of a dummy llama model.

::

    python -m experimental_experiment.torch_bench.dort_profile --help

Example: run the llama model with the onnxrt backend on CUDA.

::

    python -m experimental_experiment.torch_bench.dort_profile --backend ort --device cuda
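
Another example: use the custom backend and dump the exported ONNX models.
The flag names below are taken from the argument parser wired in ``dort_args``
(run ``--help`` to confirm them); the value passed to ``--export`` is only a
prefix for the dumped files.

::

    python -m experimental_experiment.torch_bench.dort_profile --backend custom --device cuda --export model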

"""


def main():
    """
    Main function for command line
    ``python -m experimental_experiment.torch_bench.dort_profile``.
    """
    from experimental_experiment.torch_bench._dort_cmd_common import dort_args

    args = dort_args("experimental_experiment.torch_bench.dort_profile", description=__doc__)

    import os
    import time
    import numpy as np
    from onnx_array_api.profiling import profile, profile2graph
    import torch
    import torch._dynamo.backends.registry
    import transformers
    from experimental_experiment.convert.convert_helper import ort_optimize
    from experimental_experiment.torch_bench import BOOLEAN_VALUES
    from experimental_experiment.torch_models.llama_helper import get_llama_model
    from experimental_experiment.torch_models.dump_helper import dump_onnx
    from experimental_experiment.torch_bench._dort_cmd_common import (
        create_compiled_model,
        create_configuration_for_benchmark,
    )

    # Build the benchmark configuration for the dummy llama model.
    config_dict = create_configuration_for_benchmark(
        model="llama",
        config=args.config,
        repeat=args.repeat,
        warmup=args.warmup,
        num_hidden_layers=args.num_hidden_layers,
        implementation=args.implementation,
    )

    verbose = int(args.verbose)
    disable_pattern = [_ for _ in args.disable_pattern.split("+") if _]
    enable_pattern = [_ for _ in args.enable_pattern.split("+") if _]
    print(f"model config={config_dict}")
    print(f"backend={args.backend}")
    print(f"verbose={args.verbose}")
    print(f"implementation={args.implementation}")
    print(f"mixed={args.mixed}")
    if args.backend == "custom":
        print(f"disable_pattern={disable_pattern!r}")
        print(f"enable_pattern={enable_pattern!r}")

    is_cuda = args.device.startswith("cuda")
    if is_cuda:
        print(
            f"CUDA no model: memory allocated={torch.cuda.memory_allocated(0)}, "
            f"reserved={torch.cuda.memory_reserved(0)}"
        )

    model, example_args_collection = get_llama_model(**config_dict)

    device = args.device
    model = model.eval().to(device)

    if is_cuda:
        print(
            f"CUDA model loaded: memory allocated={torch.cuda.memory_allocated(0)}, "
            f"reserved={torch.cuda.memory_reserved(0)}"
        )

    print(f"Build the compile model with backend={args.backend}")
    use_dynamic = args.dynamic in BOOLEAN_VALUES
    print(f"dynamic={use_dynamic}")
    if verbose:
        print(f"-- debug backend, opset={args.target_opset}")
        for a in example_args_collection[0]:
            print(f"  input: {a.dtype}:{a.shape}")

    compiled_model = create_compiled_model(
        model,
        backend=args.backend,
        use_dynamic=use_dynamic,
        target_opset=args.target_opset,
        verbose=verbose,
        enable_pattern=enable_pattern,
        disable_pattern=disable_pattern,
        ort_optimize=args.ort_optimize,
    )

    # One training iteration: forward pass (optionally under autocast) + backward.
    def loop_iteration(is_cuda, inputs, compiled_model, loss):
        if args.mixed in BOOLEAN_VALUES and is_cuda:
            with torch.autocast(device_type="cuda", dtype=torch.float16):
                result = compiled_model(*inputs)
        else:
            assert (
                args.mixed not in BOOLEAN_VALUES
            ), f"not implemented with is_cuda={is_cuda}, mixed={args.mixed}"
            result = compiled_model(*inputs)

        # dummy_target = torch.ones_like(result[0], memory_format=torch.contiguous_format)
        error = result[0].sum()  # loss(result[0], dummy_target)
        error.backward()
        if is_cuda:
            torch.cuda.synchronize()

    print(f"warmup on device={args.device}")
    if is_cuda:
        print(
            f"CUDA memory allocated={torch.cuda.memory_allocated(0)}, "
            f"reserved={torch.cuda.memory_reserved(0)}"
        )

    # Warmup iterations; the first one triggers the export and, if requested,
    # dumps the ONNX models and optimizes them with onnxruntime.
    warmup_times = []
    loss = torch.nn.MSELoss()
    for i in range(args.warmup):
        example_inputs = example_args_collection[i]
        inputs = [t.to("cuda") for t in example_inputs] if is_cuda else example_inputs
        if is_cuda:
            torch.cuda.synchronize()
        start_time = time.perf_counter()
        if args.backend in ("ort", "custom", "debug", "plug") and i == 0 and args.export:
            with dump_onnx(
                f"dort-{args.export}-{args.backend}", folder="dump_dort_bench", clean=True
            ):
                loop_iteration(is_cuda, inputs, compiled_model, loss)

            for onx in os.listdir("dump_dort_bench"):
                if not onx.endswith(".onnx"):
                    continue
                new_onx = onx.replace(".onnx", ".opt.onnx")
                print(f"  ort_optimize {onx} -> {new_onx}")
                ort_optimize(
                    os.path.join("dump_dort_bench", onx),
                    output=os.path.join("dump_dort_bench", new_onx),
                    providers=(
                        [("CUDAExecutionProvider", {}), ("CPUExecutionProvider", {})]
                        if is_cuda
                        else ["CPUExecutionProvider"]
                    ),
                )
        else:
            loop_iteration(is_cuda, inputs, compiled_model, loss)

        warmup_times.append(time.perf_counter() - start_time)

    warmup_time = sum(warmup_times)
    print(f"warmup done in {warmup_time}s.")
    if is_cuda:
        print(
            f"memory allocated={torch.cuda.memory_allocated(0)}, "
            f"reserved={torch.cuda.memory_reserved(0)}"
        )

    print("measures")
    times = []

    # Measured iterations, run under the profiler.
    def main_loop():
        for example_inputs in example_args_collection[args.warmup :]:
            inputs = [t.to("cuda") for t in example_inputs] if is_cuda else example_inputs
            start_time = time.perf_counter()
            loop_iteration(is_cuda, inputs, compiled_model, loss)
            times.append(time.perf_counter() - start_time)

    ps = profile(main_loop)[0]

    print("measures done.")
    print(f"dynamic={args.dynamic}")
    print(f"mixed={args.mixed}")
    print(f"backend={args.backend}")
    print(f"num_hidden_layers={args.num_hidden_layers}")
    print(f"mixed={args.mixed}")
    print(f"repeat={args.repeat}")
    print(f"device={args.device}")
    print(f"avg={np.mean(times)}")
    print(f"times={times}")
    print(f"warmup_times={warmup_times}")
    print("-----------")

    # Machine-readable summary lines (":key,value;").
    idims = "x".join(map(str, config_dict["input_dims"][0]))
    del config_dict["input_dims"]
    vals = "-".join(map(str, config_dict.values()))
    print(f":llama,{idims}-{vals};")
    print(f":config,{args.config};")
    print(f":mixed,{args.mixed};")
    print(f":dynamic,{use_dynamic};")
    print(f":backend,{args.backend};")
    print(f":repeat,{args.repeat};")
    print(f":warmup,{args.warmup};")
    print(f":torch,{torch.__version__};")
    print(f":transformers,{transformers.__version__};")
    if args.backend in {"custom"}:
        print(f":patterns,+{args.enable_pattern}-{args.disable_pattern};")
    print(f":warmup_time,{sum(warmup_times)};")
    print(f":time,{np.mean(times)};")
    print("--------------------------------------------------------------------------")
    root, nodes = profile2graph(ps, clean_text=lambda x: "/".join(x.split("/")[-2:]))
    text = root.to_text()
    print(text)

if __name__ == "__main__":
    main()
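
The measured loop is wrapped in ``onnx_array_api.profiling.profile`` and rendered
with ``profile2graph``. Below is a minimal standalone sketch of that pattern,
assuming only the calls already used in ``main`` above; ``toy_workload`` is a
made-up stand-in for the benchmark loop.

::

    import numpy as np
    from onnx_array_api.profiling import profile, profile2graph


    def toy_workload():
        # dummy computation standing in for the measured llama iterations
        for _ in range(10):
            np.linalg.inv(np.eye(64))


    # profile() returns the stats first; profile2graph turns them into a call tree
    ps = profile(toy_workload)[0]
    root, nodes = profile2graph(ps, clean_text=lambda x: "/".join(x.split("/")[-2:]))
    print(root.to_text())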