# Source code for experimental_experiment.torch_bench.dort_bench

"""
Run llama model with DORT
=========================

The script runs a few iterations of a dummy llama model.

::

    python -m experimental_experiment.torch_bench.dort_bench --help

Example, run llama model with onnxrt backend on cuda.

::

    python -m experimental_experiment.torch_bench.dort_bench \\
           --backend ort --device cuda --config medium

To export the models:

::

    python -m experimental_experiment.torch_bench.dort_bench \\
           --backend custom --device cuda --export a -w 3


Profiling:

::

    nsys profile python -m experimental_experiment.torch_bench.dort_bench \\
                        --device cuda -w 3 -r 5 --mixed 1 --config large \\
                        --backend eager --enable_pattern=default+onnxruntime

With experimental optimizers:

::

    python -m experimental_experiment.torch_bench.dort_bench --backend custom \\
           --device cuda --mixed=1 --export model -w 3 \\
           --enable_pattern=default+onnxruntime+experimental

Or:

::

    python -m experimental_experiment.torch_bench.dort_bench --backend ort+ \\
          --device cuda --mixed=1 --export model -w 3 \\
          --enable_pattern=default+onnxruntime+experimental
"""

import os
import pprint


def main(args=None):
    """
    Main function for command line
    ``python -m experimental_experiment.torch_bench.dort_bench``.

    The parsed options select one of two modes:

    * if ``multi_run(args)`` is true, a list of configurations is built with
      ``make_configs`` and handed to ``run_benchmark``; the collected data is
      optionally written to ``args.output_data`` (csv) and
      ``args.output_data + ".xlsx"``,
    * otherwise a single benchmark runs in this process: the model is created,
      compiled with the requested backend, then ``args.warmup`` warmup
      iterations are followed by the measured iterations (forward + backward).
      The results are printed on the standard output as ``:key,value;`` lines.

    :param args: forwarded as ``new_args`` to ``dort_args``; presumably the
        list of command line arguments, ``None`` meaning they are taken from
        the process command line (to be confirmed against ``dort_args``)
    """
    from experimental_experiment.torch_bench._dort_cmd_common import dort_args

    args = dort_args(
        "experimental_experiment.torch_bench.dort_bench",
        description=__doc__,
        new_args=args,
    )

    from ..bench_run import (
        multi_run,
        make_configs,
        make_dataframe_from_benchmark_data,
        run_benchmark,
    )

    if multi_run(args):
        # Several configurations were requested: delegate every run to
        # run_benchmark and only aggregate the results here.
        configs = make_configs(args)
        data = run_benchmark(
            "experimental_experiment.torch_bench.dort_bench",
            configs,
            args.verbose,
            stop_if_exception=False,
        )
        if args.verbose > 2:
            pprint.pprint(data if args.verbose > 3 else data[:2])
        if args.output_data:
            df = make_dataframe_from_benchmark_data(data, detailed=False)
            df.to_csv(args.output_data, index=False, errors="ignore")
            df.to_excel(args.output_data + ".xlsx", index=False)
            if args.verbose:
                print(df)
        return

    # Single run. Heavy dependencies are imported lazily so that the
    # multi-run branch above does not need them.
    import logging
    import time
    import numpy as np
    import torch
    import torch._dynamo.backends.registry
    import transformers
    from experimental_experiment.torch_bench import BOOLEAN_VALUES
    from experimental_experiment.convert.convert_helper import (
        ort_optimize as run_ort_optimize,
    )
    from experimental_experiment.torch_models.dump_helper import dump_onnx
    from experimental_experiment.torch_bench._dort_cmd_common import (
        create_compiled_model,
        create_configuration_for_benchmark,
        create_model,
    )
    from experimental_experiment.memory_peak import start_spying_on, flatten

    config_dict = create_configuration_for_benchmark(
        model=args.model,
        config=args.config,
        repeat=args.repeat,
        warmup=args.warmup,
        num_hidden_layers=args.num_hidden_layers,
        implementation=args.implementation,
        with_mask=args.with_mask,
        shape_scenario=args.shape_scenario,
    )

    # Options are converted once into python values.
    verbose = int(args.verbose)
    optimize = args.optimize in BOOLEAN_VALUES
    ort_optimize = args.ort_optimize in BOOLEAN_VALUES
    with_mask = args.with_mask in BOOLEAN_VALUES
    mixed = args.mixed in BOOLEAN_VALUES
    use_dynamic = args.dynamic in BOOLEAN_VALUES
    dump_patterns = args.dump_patterns in BOOLEAN_VALUES
    # Pattern lists are '+' separated, empty names are dropped.
    disable_pattern = [_ for _ in args.disable_pattern.split("+") if _]
    enable_pattern = [_ for _ in args.enable_pattern.split("+") if _]

    print(f"model={args.model}")
    print(f"model config={config_dict}")
    print(f"backend={args.backend}")
    print(f"verbose={verbose}")
    print(f"optimize={args.optimize}")
    print(f"ort_optimize={ort_optimize}")
    print(f"order_algorithm={args.order}")
    print(f"with_mask={with_mask}")
    print(f"implementation={args.implementation}")
    print(f"mixed={args.mixed}")
    print(f"shape_scenario={args.shape_scenario}")

    if args.backend == "custom":
        print(f"disable_pattern={disable_pattern!r}")
        print(f"enable_pattern={enable_pattern!r}")
    assert not dump_patterns or args.export, (
        f"optimization patterns cannot be dumped if export is not set "
        f"dump_patterns={dump_patterns!r}, export={args.export}"
    )

    device = args.device
    is_cuda = device.startswith("cuda")
    if is_cuda:
        print(
            f"CUDA no model: memory allocated={torch.cuda.memory_allocated(0)}, "
            f"reserved={torch.cuda.memory_reserved(0)}"
        )

    model, example_args_collection = create_model(args.model, config_dict)

    if args.backend != "ortmodule":
        model = model.eval()
    model = model.to(device)

    if is_cuda:
        print(
            f"CUDA model loaded: memory allocated={torch.cuda.memory_allocated(0)}, "
            f"reserved={torch.cuda.memory_reserved(0)}"
        )

    print(f"Build the compile model with backend={args.backend}")
    print(f"dynamic={use_dynamic}")
    if verbose:
        print(f"-- debug backend, opset={args.target_opset}")
        for a in example_args_collection[0]:
            print(f" input: {a.dtype}:{a.shape}")

    dump_folder = args.dump_folder
    if args.export and dump_folder and not os.path.exists(dump_folder):
        os.makedirs(dump_folder)

    if dump_patterns:
        dump_patterns_folder = os.path.join(dump_folder, "patterns")
        if os.path.exists(dump_patterns_folder):
            # Removes the onnx files left by a previous run.
            for _ in os.listdir(dump_patterns_folder):
                if _.endswith(".onnx"):
                    os.remove(os.path.join(dump_patterns_folder, _))
    else:
        dump_patterns_folder = None

    if verbose:
        if dump_patterns:
            print(
                f"dump models and patterns in {dump_folder!r} "
                f"and {dump_patterns_folder!r}"
            )
        else:
            print(f"dump models in {dump_folder!r}")

    # Only errors from the constant folding optimizer are logged.
    logger = logging.getLogger("onnxscript.optimizer.constant_folding")
    logger.setLevel(logging.ERROR)

    compiled_model = create_compiled_model(
        model,
        backend=args.backend,
        use_dynamic=use_dynamic,
        target_opset=args.target_opset,
        verbose=verbose,
        enable_pattern=enable_pattern,
        disable_pattern=disable_pattern,
        optimize=optimize,
        ort_optimize=ort_optimize,
        use_fused_aten_ops=args.implementation == "sdpa",
        dump_prefix=(
            f"{dump_folder}/{args.export}-{args.model}-{args.backend}"
            if args.export
            else None
        ),
        dump_patterns=dump_patterns_folder,
        processor=device.upper() if device.upper() == "CPU" else "CPU,CUDA",
        order_algorithm=args.order,
    )
    del model
    print(f"type of compiled_model={type(compiled_model)}")

    def to_device(example_inputs):
        # Inputs must land on the device the model was moved to
        # (args.device), not on the default cuda device.
        if is_cuda:
            return [t.to(device) for t in example_inputs]
        return example_inputs

    def loop_iteration(is_cuda, inputs, compiled_model, loss):
        # One forward + backward pass. The argument ``loss`` is not used:
        # the differentiated scalar is the sum of the first output.
        # On cuda, every step is wrapped into an NVTX range.
        torch.set_grad_enabled(True)

        if mixed and is_cuda:
            with torch.autocast(device_type="cuda", dtype=torch.float16):
                torch.cuda.nvtx.range_push("DORT-FORWARD-MIXED")
                result = compiled_model(*inputs)
                torch.cuda.synchronize()
                torch.cuda.nvtx.range_pop()
        elif is_cuda:
            torch.cuda.nvtx.range_push("DORT-FORWARD")
            result = compiled_model(*inputs)
            torch.cuda.synchronize()
            torch.cuda.nvtx.range_pop()
        else:
            result = compiled_model(*inputs)

        if mixed and is_cuda:
            with torch.autocast(device_type="cuda", dtype=torch.float16):
                torch.cuda.nvtx.range_push("DORT-ERROR-MIXED")
                error = result[0].sum()  # loss(result[0], dummy_target)
                torch.cuda.nvtx.range_pop()
                torch.cuda.nvtx.range_push("DORT-BACKWARD-MIXED")
                error.backward()
                torch.cuda.synchronize()
                torch.cuda.nvtx.range_pop()
        elif is_cuda:
            torch.cuda.nvtx.range_push("DORT-ERROR")
            error = result[0].sum()  # loss(result[0], dummy_target)
            torch.cuda.nvtx.range_pop()
            torch.cuda.nvtx.range_push("DORT-BACKWARD")
            error.backward()
            torch.cuda.synchronize()
            torch.cuda.nvtx.range_pop()
        else:
            error = result[0].sum()  # loss(result[0], dummy_target)
            error.backward()

    print(f"warmup on device={args.device}")
    if is_cuda:
        print(
            f"CUDA memory allocated={torch.cuda.memory_allocated(0)}, "
            f"reserved={torch.cuda.memory_reserved(0)}"
        )

    if args.memory_spy in ("1", 1, "True", "true", True):
        memory = start_spying_on(cuda=is_cuda)
    else:
        memory = None

    warmup_times = []
    loss = torch.nn.MSELoss()
    for i in range(args.warmup):
        inputs = to_device(example_args_collection[i])
        if is_cuda:
            torch.cuda.synchronize()
        start_time = time.perf_counter()

        if (
            args.backend in ("ort", "custom", "debug", "plug", "ort+")
            and i == 0
            and args.export
        ):
            # The first warmup iteration runs inside dump_onnx when an
            # export was requested.
            with dump_onnx(
                f"dort-{args.export}-{args.model}-{args.backend}",
                folder=dump_folder,
                clean=True,
            ):
                loop_iteration(is_cuda, inputs, compiled_model, loss)

            # Every onnx file found in the dump folder gets an optimized
            # twin (*.opt.onnx); already optimized files are skipped.
            for onx in os.listdir(dump_folder):
                if not onx.endswith(".onnx"):
                    continue
                if ".opt." in onx:
                    continue
                new_onx = onx.replace(".onnx", ".opt.onnx")
                print(f" ort_optimize {onx} -> {new_onx}")
                run_ort_optimize(
                    os.path.join(dump_folder, onx),
                    output=os.path.join(dump_folder, new_onx),
                    providers=(
                        [
                            ("CUDAExecutionProvider", {}),
                            ("CPUExecutionProvider", {}),
                        ]
                        if is_cuda
                        else ["CPUExecutionProvider"]
                    ),
                )
        else:
            if is_cuda:
                torch.cuda.nvtx.range_push("DORT-ITERATION")
            loop_iteration(is_cuda, inputs, compiled_model, loss)
            if is_cuda:
                torch.cuda.nvtx.range_pop()

        warmup_times.append(time.perf_counter() - start_time)

    warmup_time = sum(warmup_times)
    print(f"warmup done in {warmup_time}s.")
    if is_cuda:
        print(
            f"memory allocated={torch.cuda.memory_allocated(0)}, "
            f"reserved={torch.cuda.memory_reserved(0)}"
        )

    print("measures")
    times = []
    # The inputs left after the warmup ones are used for the measures.
    for example_inputs in example_args_collection[args.warmup :]:
        inputs = to_device(example_inputs)
        start_time = time.perf_counter()
        loop_iteration(is_cuda, inputs, compiled_model, loss)
        times.append(time.perf_counter() - start_time)

    print("measures done.")
    print(f"dynamic={args.dynamic}")
    print(f"mixed={args.mixed}")
    print(f"backend={args.backend}")
    print(f"num_hidden_layers={args.num_hidden_layers}")
    print(f"repeat={args.repeat}")
    print(f"device={args.device}")
    print(f"avg={np.mean(times)}")
    print(f"times={times}")
    print(f"warmup_times={warmup_times}")
    print("-----------")

    if memory is not None:
        stat_memory = flatten(memory.stop(), prefix="memory_")
        print(stat_memory)
        print("-----------")
    else:
        stat_memory = None

    i_shapes = set(config_dict["input_dims"])
    if len(i_shapes) == 1:
        # NOTE(review): with a single shape, the shape itself is converted
        # with str(...) whereas the branch below joins its dimensions with
        # 'x'. Kept as is because the output is parsed elsewhere -- confirm.
        idims = "x".join(str(i) for i in i_shapes)
    else:
        idims = "|".join("x".join(map(str, shs)) for shs in list(i_shapes)[:2])
    del config_dict["input_dims"]
    vals = "-".join(map(str, config_dict.values()))

    # Summary printed as ':key,value;' lines.
    print(f":{args.model},{idims}-{vals};")
    print(f":config,{args.config};")
    print(f":mixed,{args.mixed};")
    print(f":dynamic,{use_dynamic};")
    print(f":optimize,{optimize};")
    print(f":order,{args.order};")
    print(f":ort_optimize,{ort_optimize};")
    print(f":backend,{args.backend};")
    print(f":repeat,{args.repeat};")
    print(f":warmup,{args.warmup};")
    print(f":with_mask,{args.with_mask};")
    print(f":implementation,{args.implementation};")
    print(f":torch,{torch.__version__};")
    print(f":transformers,{transformers.__version__};")
    if stat_memory:
        for k, v in stat_memory.items():
            print(f":{k},{v};")
    if args.backend in {"custom", "ort+", "debug"}:
        # The parsed boolean is used: the raw option may be a non empty
        # string (truthy) even when the optimization is disabled.
        suffix = "+oo" if ort_optimize else ""
        print(f":patterns,+{args.enable_pattern}-{args.disable_pattern}{suffix};")
    print(f":warmup_time,{sum(warmup_times)};")
    print(f":time,{np.mean(times)};")
# Runs the benchmark when the module is executed as a script
# (``python -m experimental_experiment.torch_bench.dort_bench``).
if __name__ == "__main__":
    main()