"""
Run llama model with DORT
=========================

The script runs a few iterations of a dummy llama model.

::

    python -m experimental_experiment.torch_bench.dort_bench --help

Example, run llama model with onnxrt backend on cuda.

::

    python -m experimental_experiment.torch_bench.dort_bench \\
           --backend ort --device cuda --config medium

To export the models:

::

    python -m experimental_experiment.torch_bench.dort_bench \\
           --backend custom --device cuda --export a -w 3

Profiling:

::

    nsys profile python -m experimental_experiment.torch_bench.dort_bench \\
                        --device cuda -w 3 -r 5 --mixed 1 --config large \\
                        --backend eager --enable_pattern=default+onnxruntime

With experimental optimizers:

::

    python -m experimental_experiment.torch_bench.dort_bench --backend custom \\
           --device cuda --mixed=1 --export model -w 3 \\
           --enable_pattern=default+onnxruntime+experimental

Or:

::

    python -m experimental_experiment.torch_bench.dort_bench --backend ort+ \\
           --device cuda --mixed=1 --export model -w 3 \\
           --enable_pattern=default+onnxruntime+experimental
"""
import os
import pprint
def main(args=None):
    """
    Main function for command line
    ``python -m experimental_experiment.torch_bench.dort_bench``.

    The command line is parsed by ``dort_args``. Two modes follow:

    * if ``multi_run(args)`` is true, the configurations returned by
      ``make_configs`` are handed to ``run_benchmark`` and, when
      ``--output_data`` is set, the collected data is saved as a CSV file
      and as an Excel file (same name with ``.xlsx`` appended),
    * otherwise a single configuration is benchmarked in this process:
      the model is created and compiled with the requested backend,
      ``args.warmup`` warmup iterations are run (the first one optionally
      dumps the onnx models), then the remaining example inputs are timed
      and the results are printed as ``:key,value;`` lines.

    :param args: value forwarded to ``dort_args`` as ``new_args``
    """
    from experimental_experiment.torch_bench._dort_cmd_common import dort_args

    args = dort_args(
        "experimental_experiment.torch_bench.dort_bench",
        description=__doc__,
        new_args=args,
    )

    # Absolute import, like every other import of this function, so that the
    # file also works when it is executed directly as a script.
    from experimental_experiment.bench_run import (
        multi_run,
        make_configs,
        make_dataframe_from_benchmark_data,
        run_benchmark,
    )

    if multi_run(args):
        # Several configurations: delegate every run to run_benchmark.
        configs = make_configs(args)
        data = run_benchmark(
            "experimental_experiment.torch_bench.dort_bench",
            configs,
            args.verbose,
            stop_if_exception=False,
        )
        if args.verbose > 2:
            pprint.pprint(data if args.verbose > 3 else data[:2])
        if args.output_data:
            df = make_dataframe_from_benchmark_data(data, detailed=False)
            df.to_csv(args.output_data, index=False, errors="ignore")
            df.to_excel(args.output_data + ".xlsx", index=False)
            if args.verbose:
                print(df)
    else:
        # Heavy dependencies are only imported when a benchmark really runs.
        import logging
        import time
        import numpy as np
        import torch
        import torch._dynamo.backends.registry
        import transformers
        from experimental_experiment.torch_bench import BOOLEAN_VALUES
        from experimental_experiment.convert.convert_helper import (
            ort_optimize as run_ort_optimize,
        )
        from experimental_experiment.torch_models.dump_helper import dump_onnx
        from experimental_experiment.torch_bench._dort_cmd_common import (
            create_compiled_model,
            create_configuration_for_benchmark,
            create_model,
        )
        from experimental_experiment.memory_peak import start_spying_on, flatten

        config_dict = create_configuration_for_benchmark(
            model=args.model,
            config=args.config,
            repeat=args.repeat,
            warmup=args.warmup,
            num_hidden_layers=args.num_hidden_layers,
            implementation=args.implementation,
            with_mask=args.with_mask,
            shape_scenario=args.shape_scenario,
        )

        # Command line flags are normalized into booleans once.
        verbose = int(args.verbose)
        optimize = args.optimize in BOOLEAN_VALUES
        ort_optimize = args.ort_optimize in BOOLEAN_VALUES
        with_mask = args.with_mask in BOOLEAN_VALUES
        disable_pattern = [_ for _ in args.disable_pattern.split("+") if _]
        enable_pattern = [_ for _ in args.enable_pattern.split("+") if _]
        print(f"model={args.model}")
        print(f"model config={config_dict}")
        print(f"backend={args.backend}")
        print(f"verbose={verbose}")
        print(f"optimize={args.optimize}")
        print(f"ort_optimize={ort_optimize}")
        print(f"order_algorithm={args.order}")
        print(f"with_mask={with_mask}")
        print(f"implementation={args.implementation}")
        print(f"mixed={args.mixed}")
        print(f"shape_scenario={args.shape_scenario}")
        dump_patterns = args.dump_patterns in BOOLEAN_VALUES

        if args.backend == "custom":
            print(f"disable_pattern={disable_pattern!r}")
            print(f"enable_pattern={enable_pattern!r}")
        assert not dump_patterns or args.export, (
            f"optimization patterns cannot be dumped if export is not set "
            f"dump_patterns={dump_patterns!r}, export={args.export}"
        )

        is_cuda = args.device.startswith("cuda")
        if is_cuda:
            print(
                f"CUDA no model: memory allocated={torch.cuda.memory_allocated(0)}, "
                f"reserved={torch.cuda.memory_reserved(0)}"
            )

        device = args.device
        model, example_args_collection = create_model(args.model, config_dict)

        # Every backend but "ortmodule" runs the model in evaluation mode.
        if args.backend != "ortmodule":
            model = model.eval()

        model = model.to(device)

        if is_cuda:
            print(
                f"CUDA model loaded: memory allocated={torch.cuda.memory_allocated(0)}, "
                f"reserved={torch.cuda.memory_reserved(0)}"
            )

        print(f"Build the compile model with backend={args.backend}")
        use_dynamic = args.dynamic in BOOLEAN_VALUES
        print(f"dynamic={use_dynamic}")
        if verbose:
            print(f"-- debug backend, opset={args.target_opset}")
            for a in example_args_collection[0]:
                print(f" input: {a.dtype}:{a.shape}")

        dump_folder = args.dump_folder
        if args.export and dump_folder and not os.path.exists(dump_folder):
            os.makedirs(dump_folder)

        if dump_patterns:
            dump_patterns_folder = os.path.join(dump_folder, "patterns")
            if os.path.exists(dump_patterns_folder):
                # Removes the onnx files left by a previous run.
                for _ in os.listdir(dump_patterns_folder):
                    if _.endswith(".onnx"):
                        os.remove(os.path.join(dump_patterns_folder, _))
        else:
            dump_patterns_folder = None

        if verbose:
            if dump_patterns:
                print(
                    f"dump models and patterns in {dump_folder!r} "
                    f"and {dump_patterns_folder!r}"
                )
            else:
                print(f"dump models in {dump_folder!r}")

        # Only errors are reported by the constant folding logger.
        logger = logging.getLogger("onnxscript.optimizer.constant_folding")
        logger.setLevel(logging.ERROR)

        compiled_model = create_compiled_model(
            model,
            backend=args.backend,
            use_dynamic=use_dynamic,
            target_opset=args.target_opset,
            verbose=verbose,
            enable_pattern=enable_pattern,
            disable_pattern=disable_pattern,
            optimize=optimize,
            ort_optimize=ort_optimize,
            use_fused_aten_ops=args.implementation == "sdpa",
            dump_prefix=(
                f"{dump_folder}/{args.export}-{args.model}-{args.backend}"
                if args.export
                else None
            ),
            dump_patterns=dump_patterns_folder,
            processor=device.upper() if device.upper() == "CPU" else "CPU,CUDA",
            order_algorithm=args.order,
        )
        del model
        print(f"type of compiled_model={type(compiled_model)}")

        # Constant for the whole run, computed once instead of once
        # per iteration.
        mixed = args.mixed in BOOLEAN_VALUES

        def loop_iteration(is_cuda, inputs, compiled_model, loss):
            """
            Runs one forward and one backward pass of *compiled_model*.

            The backward pass starts from the sum of the first output.
            On CUDA, every step is wrapped into a NVTX range and followed
            by a synchronization; with mixed precision, the steps run under
            ``torch.autocast`` with ``torch.float16``.

            :param is_cuda: True if the benchmark runs on CUDA
            :param inputs: positional inputs of the model
            :param compiled_model: the model to run
            :param loss: unused, the error is the sum of the first output
            """
            torch.set_grad_enabled(True)

            # forward
            if mixed and is_cuda:
                with torch.autocast(device_type="cuda", dtype=torch.float16):
                    torch.cuda.nvtx.range_push("DORT-FORWARD-MIXED")
                    result = compiled_model(*inputs)
                    torch.cuda.synchronize()
                    torch.cuda.nvtx.range_pop()
            elif is_cuda:
                torch.cuda.nvtx.range_push("DORT-FORWARD")
                result = compiled_model(*inputs)
                torch.cuda.synchronize()
                torch.cuda.nvtx.range_pop()
            else:
                result = compiled_model(*inputs)

            # backward: a plain sum replaces loss(result[0], dummy_target)
            # so that no dummy target has to be built.
            if mixed and is_cuda:
                with torch.autocast(device_type="cuda", dtype=torch.float16):
                    torch.cuda.nvtx.range_push("DORT-ERROR-MIXED")
                    error = result[0].sum()
                    torch.cuda.nvtx.range_pop()
                    torch.cuda.nvtx.range_push("DORT-BACKWARD-MIXED")
                    error.backward()
                    torch.cuda.synchronize()
                    torch.cuda.nvtx.range_pop()
            elif is_cuda:
                torch.cuda.nvtx.range_push("DORT-ERROR")
                error = result[0].sum()
                torch.cuda.nvtx.range_pop()
                torch.cuda.nvtx.range_push("DORT-BACKWARD")
                error.backward()
                torch.cuda.synchronize()
                torch.cuda.nvtx.range_pop()
            else:
                error = result[0].sum()
                error.backward()

        print(f"warmup on device={args.device}")
        if is_cuda:
            print(
                f"CUDA memory allocated={torch.cuda.memory_allocated(0)}, "
                f"reserved={torch.cuda.memory_reserved(0)}"
            )

        # NOTE(review): this flag is compared to its own tuple of values
        # whereas the other flags rely on BOOLEAN_VALUES.
        if args.memory_spy in ("1", 1, "True", "true", True):
            memory = start_spying_on(cuda=is_cuda)
        else:
            memory = None

        warmup_times = []
        loss = torch.nn.MSELoss()
        for i in range(args.warmup):
            example_inputs = example_args_collection[i]
            # The inputs go to the device the model was moved to,
            # "cuda" alone would ignore an explicit index such as "cuda:1".
            inputs = (
                [t.to(device) for t in example_inputs] if is_cuda else example_inputs
            )
            if is_cuda:
                torch.cuda.synchronize()
            start_time = time.perf_counter()

            if (
                args.backend in ("ort", "custom", "debug", "plug", "ort+")
                and i == 0
                and args.export
            ):
                # The first warmup iteration dumps the onnx models.
                with dump_onnx(
                    f"dort-{args.export}-{args.model}-{args.backend}",
                    folder=dump_folder,
                    clean=True,
                ):
                    loop_iteration(is_cuda, inputs, compiled_model, loss)

                # Every dumped model is optimized by onnxruntime and
                # saved with extension .opt.onnx.
                for onx in os.listdir(dump_folder):
                    if not onx.endswith(".onnx"):
                        continue
                    if ".opt." in onx:
                        continue
                    new_onx = onx.replace(".onnx", ".opt.onnx")
                    print(f" ort_optimize {onx} -> {new_onx}")
                    run_ort_optimize(
                        os.path.join(dump_folder, onx),
                        output=os.path.join(dump_folder, new_onx),
                        providers=(
                            [
                                ("CUDAExecutionProvider", {}),
                                ("CPUExecutionProvider", {}),
                            ]
                            if is_cuda
                            else ["CPUExecutionProvider"]
                        ),
                    )
            else:
                if is_cuda:
                    torch.cuda.nvtx.range_push("DORT-ITERATION")
                loop_iteration(is_cuda, inputs, compiled_model, loss)
                if is_cuda:
                    torch.cuda.nvtx.range_pop()

            warmup_times.append(time.perf_counter() - start_time)

        warmup_time = sum(warmup_times)
        print(f"warmup done in {warmup_time}s.")
        if is_cuda:
            print(
                f"memory allocated={torch.cuda.memory_allocated(0)}, "
                f"reserved={torch.cuda.memory_reserved(0)}"
            )

        print("measures")
        times = []
        # The example inputs not consumed by the warmup are the measured ones.
        for example_inputs in example_args_collection[args.warmup :]:
            inputs = (
                [t.to(device) for t in example_inputs] if is_cuda else example_inputs
            )
            start_time = time.perf_counter()
            loop_iteration(is_cuda, inputs, compiled_model, loss)
            times.append(time.perf_counter() - start_time)

        print("measures done.")
        print(f"dynamic={args.dynamic}")
        print(f"mixed={args.mixed}")
        print(f"backend={args.backend}")
        print(f"num_hidden_layers={args.num_hidden_layers}")
        print(f"mixed={args.mixed}")
        print(f"repeat={args.repeat}")
        print(f"device={args.device}")
        print(f"avg={np.mean(times)}")
        print(f"times={times}")
        print(f"warmup_times={warmup_times}")
        print("-----------")

        if memory is not None:
            stat_memory = flatten(memory.stop(), prefix="memory_")
            print(stat_memory)
            print("-----------")
        else:
            stat_memory = None

        # NOTE(review): with a single distinct entry, the first branch joins
        # str() of the entry itself whereas the second branch joins the
        # components of every entry, confirm this difference is intended.
        i_shapes = set(config_dict["input_dims"])
        if len(i_shapes) == 1:
            idims = "x".join(str(i) for i in i_shapes)
        else:
            idims = "|".join("x".join(map(str, shs)) for shs in list(i_shapes)[:2])
        del config_dict["input_dims"]
        vals = "-".join(map(str, config_dict.values()))

        # Final report, one ":key,value;" line per metric.
        print(f":{args.model},{idims}-{vals};")
        print(f":config,{args.config};")
        print(f":mixed,{args.mixed};")
        print(f":dynamic,{use_dynamic};")
        print(f":optimize,{optimize};")
        print(f":order,{args.order};")
        print(f":ort_optimize,{ort_optimize};")
        print(f":backend,{args.backend};")
        print(f":repeat,{args.repeat};")
        print(f":warmup,{args.warmup};")
        print(f":with_mask,{args.with_mask};")
        print(f":implementation,{args.implementation};")
        print(f":torch,{torch.__version__};")
        print(f":transformers,{transformers.__version__};")
        if stat_memory:
            for k, v in stat_memory.items():
                print(f":{k},{v};")
        if args.backend in {"custom", "ort+", "debug"}:
            # The normalized boolean is used: the raw command line value
            # may be a non-empty string such as "0", which is always true.
            suffix = "+oo" if ort_optimize else ""
            print(f":patterns,+{args.enable_pattern}-{args.disable_pattern}{suffix};")
        print(f":warmup_time,{sum(warmup_times)};")
        print(f":time,{np.mean(times)};")
if __name__ == "__main__":
    # Runs the benchmark only when the module is executed as a script.
    main()