Note
Go to the end to download the full example code.
102: Measure LLAMA speed
The script calls the script experimental_experiment.torch_bench.dort_bench.py many times.
python _doc/examples/plot_llama_bench_102.py --help
For example, to check mixed precision on multiple backends:
python _doc/examples/plot_llama_bench_102.py --device=cuda --num_hidden_layers=2 --mixed=1
python _doc/examples/plot_llama_bench_102.py --device=cuda --num_hidden_layers=2 --mixed=1 --backend=eager,dynger,ortmodule,inductor,ort+,custom --config=large
With 32Gb GPU memory, the script runs with 6 layers.
python _doc/examples/plot_llama_bench_102.py --device=cuda --num_hidden_layers=6 --mixed=1 --backend=eager,dynger,ortmodule,inductor,trt,ort+,custom --config=large
python _doc/examples/plot_llama_bench_102.py --device=cuda --num_hidden_layers=2 --mixed=1 --backend=eager,ort+,custom --config=large
Run the following command to run one experiment and get the available options:
python -m experimental_experiment.torch_bench.dort_bench --help
# Parse the command-line options driving the benchmark matrix.
# Each keyword is either a plain default value or a (default, help) pair
# understood by get_parsed_args.
from experimental_experiment.args import get_parsed_args, check_cuda_availability

parsed_args = get_parsed_args(
    "plot_llama_bench",
    description=__doc__,
    warmup=5,
    repeat=10,
    model=("llama", "model to benchmark"),
    backend=(
        "eager,dynger,inductor,ort,ort+,custom,ortmodule",
        "backend to test, among eager,dynger,inductor,ort,ort+,custom,plug,ortmodule,backort",
    ),
    device=("cuda" if check_cuda_availability() else "cpu", "device to test"),
    num_hidden_layers=("1", "hidden layers to test"),
    mixed=("0", "boolean value to test (mixed precision or not)"),
    dynamic=("0", "boolean value to test dynamic shapes or not"),
    script_name=("experimental_experiment.torch_bench.dort_bench", "script to run"),
    dump=(0, "dump the models with env ONNXRT_DUMP_PATH"),
    check=(0, "just check the script is working, ignores all other parameters"),
    config=("medium", "configuration to use, default or medium"),
    patterns=(
        "none,default,default+onnxruntime," "default+onnxruntime+experimental",
        "optimization patterns to use",
    ),
    implementation=("eager", "eager or sdpa or both values comma separated value"),
    # fixed: the help text had an unbalanced parenthesis
    with_mask=(1, "with or without a second input (mask)"),
    disable_pattern=("none", "pattern or patterns to disable"),
    ort_optimize=(
        "0,1",
        "enable or disable onnxruntime optimization, " "by default, tries both",
    ),
    order=("none", "optimization order see class OrderAlgorithm, none by default"),
    verbose=(1, "verbosity"),
    # fixed two bugs in the exposed-option list:
    # - typo "scipt_name" -> "script_name" (the option was never exposed)
    # - missing comma after "model", which fused it with "implementation"
    #   into the bogus name "modelimplementation"
    expose="backend,device,num_hidden_layers,mixed,script_name,repeat,"
    "warmup,dump,check,config,patterns,dynamic,disable_pattern,model,"
    "implementation,with_mask,ort_optimize,verbose,order",
)
import onnxruntime  # noqa: F401  # deliberately unused; presumably imported for its side effects — TODO confirm
import numpy as np
import pandas
import matplotlib.pyplot as plt
import itertools
import torch
from experimental_experiment.ext_test_case import unit_test_going
from experimental_experiment.bench_run import run_benchmark, get_machine, BenchmarkError

# Module executed once per configuration by run_benchmark.
script_name = "experimental_experiment.torch_bench.dort_bench"
# Machine description (CPU/GPU capabilities); skipped during unit tests.
machine = {} if unit_test_going() else get_machine()
repeat = parsed_args.repeat
warmup = parsed_args.warmup
def make_config(
    model,
    backend,
    device,
    num_hidden_layers,
    repeat,
    mixed,
    dynamic,
    config,
    warmup,
    pattern,
    disable_pattern,
    implementation,
    with_mask,
    ort_optimize,
    order,
    verbose,
    existing=None,
):
    """Build one benchmark configuration dictionary.

    Returns ``None`` when the configuration should be skipped: it duplicates
    one already present in *existing*, it needs :mod:`onnx_extended` and that
    package is missing, or it disables onnxruntime optimization for a backend
    where that combination is not interesting.

    :param pattern: optimization patterns to enable; only meaningful for the
        ``custom`` and ``ort+`` backends
    :param existing: previously built configurations, used to drop duplicates
    :return: a dictionary of options or ``None``
    """
    if backend not in ("custom", "ort+"):
        # Optimization options only apply to these two backends.
        ort_optimize = None
        pattern = None
        disable_pattern = None
    cf = dict(
        model=model,
        backend=backend,
        device=device,
        num_hidden_layers=num_hidden_layers,
        repeat=repeat,
        mixed=mixed,
        dynamic=dynamic,
        config=config,
        warmup=warmup,
        implementation=implementation,
        with_mask=with_mask,
        ort_optimize=ort_optimize,
        order=order,
        verbose=verbose,
    )
    cf = {k: v for k, v in cf.items() if v is not None}

    if existing and backend not in ("custom", "ort+"):
        # Pattern options were dropped above, so several pattern values
        # collapse into the same configuration: skip duplicates.
        for ex in existing:
            if not ex:
                continue
            # ex.get avoids a KeyError when a previous configuration does not
            # define one of cf's keys (the original code raised in that case).
            if all(ex.get(k) == v for k, v in cf.items()):
                return None

    if pattern is None:
        opt = {}
    elif pattern == "none":
        # Explicitly disable the default patterns.
        opt = dict(enable_pattern="default", disable_pattern="default")
    elif pattern == "default" or "+" in pattern:
        # was `pattern in "default"`: a substring test that wrongly accepted
        # any substring of "default" such as "d" or "fault".
        opt = dict(enable_pattern=pattern)
    else:
        raise AssertionError(f"unexpected value for pattern={pattern!r}")
    cf.update(opt)

    if disable_pattern not in ("none", None):
        if "disable_pattern" in cf:
            cf["disable_pattern"] += f",{disable_pattern}"
        else:
            cf["disable_pattern"] = disable_pattern

    if "enable_pattern" in cf and "+experimental" in cf["enable_pattern"]:
        # Experimental patterns require onnx_extended.
        try:
            import onnx_extended  # noqa: F401
        except ImportError:
            return None
    elif not ort_optimize and backend in ("custom", "ort+"):
        # Disabling onnxruntime optimization is only tested together with
        # the experimental patterns.
        return None
    assert (
        cf["backend"] != "eager" or cf.get("ort_optimize", None) is None
    ), f"Wrong configuration {cf}"
    return cf
# Build the list of configurations to benchmark, unless --check=1 in which
# case a single minimal smoke-test configuration is used.
if parsed_args.check not in (1, "1"):
    verbose = parsed_args.verbose
    configs = []
    # Cartesian product of every tested dimension; each comma-separated
    # command-line option contributes one axis.
    for (
        backend,
        device,
        num_hidden_layers,
        mixed,
        dynamic,
        pattern,
        impl,
        ort_optimize,
    ) in itertools.product(
        parsed_args.backend.split(","),
        parsed_args.device.split(","),
        list(map(int, parsed_args.num_hidden_layers.split(","))),
        list(map(int, parsed_args.mixed.split(","))),
        list(map(int, parsed_args.dynamic.split(","))),
        parsed_args.patterns.split(","),
        parsed_args.implementation.split(","),
        list(map(int, parsed_args.ort_optimize.split(","))),
    ):
        if mixed == 1 and device == "cpu":
            # Mixed precision is not benchmarked on CPU.
            continue
        if machine.get("capability", (0, 0)) < (7, 0) and backend == "inductor":
            # inductor is skipped on GPUs with CUDA capability < 7.0.
            continue
        # make_config may return None (duplicate or unsupported combination);
        # run_benchmark is expected to ignore those entries.
        configs.append(
            make_config(
                model=parsed_args.model,
                backend=backend,
                device=device,
                num_hidden_layers=num_hidden_layers,
                repeat=repeat,
                mixed=mixed,
                dynamic=dynamic,
                config=parsed_args.config,
                warmup=warmup,
                pattern=pattern,
                disable_pattern=parsed_args.disable_pattern,
                existing=configs,
                implementation=impl,
                with_mask=parsed_args.with_mask,
                ort_optimize=ort_optimize,
                order=parsed_args.order,
                verbose=verbose,
            )
        )
else:
    # --check=1: one small configuration just to verify the script works.
    verbose = 5
    device = "cuda" if torch.cuda.is_available() else "cpu"
    configs = [
        dict(
            model=parsed_args.model,
            backend="custom",
            device=device,
            num_hidden_layers=1,
            repeat=1,
            mixed=0,
            dynamic=0,
            warmup=1,
            config="small",
        ),
    ]
[2024-05-08 14:04:05,281] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)
All configurations to consider.
config 1: {'model': 'llama', 'backend': 'eager', 'device': 'cuda', 'num_hidden_layers': 1, 'repeat': 10, 'mixed': 0, 'dynamic': 0, 'config': 'medium', 'warmup': 5, 'implementation': 'eager', 'with_mask': 1, 'order': 'none', 'verbose': 1}
config 2: {'model': 'llama', 'backend': 'dynger', 'device': 'cuda', 'num_hidden_layers': 1, 'repeat': 10, 'mixed': 0, 'dynamic': 0, 'config': 'medium', 'warmup': 5, 'implementation': 'eager', 'with_mask': 1, 'order': 'none', 'verbose': 1}
config 3: {'model': 'llama', 'backend': 'ort', 'device': 'cuda', 'num_hidden_layers': 1, 'repeat': 10, 'mixed': 0, 'dynamic': 0, 'config': 'medium', 'warmup': 5, 'implementation': 'eager', 'with_mask': 1, 'order': 'none', 'verbose': 1}
config 4: {'model': 'llama', 'backend': 'ort+', 'device': 'cuda', 'num_hidden_layers': 1, 'repeat': 10, 'mixed': 0, 'dynamic': 0, 'config': 'medium', 'warmup': 5, 'implementation': 'eager', 'with_mask': 1, 'ort_optimize': 1, 'order': 'none', 'verbose': 1, 'enable_pattern': 'default', 'disable_pattern': 'default'}
config 5: {'model': 'llama', 'backend': 'ort+', 'device': 'cuda', 'num_hidden_layers': 1, 'repeat': 10, 'mixed': 0, 'dynamic': 0, 'config': 'medium', 'warmup': 5, 'implementation': 'eager', 'with_mask': 1, 'ort_optimize': 1, 'order': 'none', 'verbose': 1, 'enable_pattern': 'default'}
config 6: {'model': 'llama', 'backend': 'ort+', 'device': 'cuda', 'num_hidden_layers': 1, 'repeat': 10, 'mixed': 0, 'dynamic': 0, 'config': 'medium', 'warmup': 5, 'implementation': 'eager', 'with_mask': 1, 'ort_optimize': 1, 'order': 'none', 'verbose': 1, 'enable_pattern': 'default+onnxruntime'}
config 7: {'model': 'llama', 'backend': 'ort+', 'device': 'cuda', 'num_hidden_layers': 1, 'repeat': 10, 'mixed': 0, 'dynamic': 0, 'config': 'medium', 'warmup': 5, 'implementation': 'eager', 'with_mask': 1, 'ort_optimize': 0, 'order': 'none', 'verbose': 1, 'enable_pattern': 'default+onnxruntime+experimental'}
config 8: {'model': 'llama', 'backend': 'ort+', 'device': 'cuda', 'num_hidden_layers': 1, 'repeat': 10, 'mixed': 0, 'dynamic': 0, 'config': 'medium', 'warmup': 5, 'implementation': 'eager', 'with_mask': 1, 'ort_optimize': 1, 'order': 'none', 'verbose': 1, 'enable_pattern': 'default+onnxruntime+experimental'}
config 9: {'model': 'llama', 'backend': 'custom', 'device': 'cuda', 'num_hidden_layers': 1, 'repeat': 10, 'mixed': 0, 'dynamic': 0, 'config': 'medium', 'warmup': 5, 'implementation': 'eager', 'with_mask': 1, 'ort_optimize': 1, 'order': 'none', 'verbose': 1, 'enable_pattern': 'default', 'disable_pattern': 'default'}
config 10: {'model': 'llama', 'backend': 'custom', 'device': 'cuda', 'num_hidden_layers': 1, 'repeat': 10, 'mixed': 0, 'dynamic': 0, 'config': 'medium', 'warmup': 5, 'implementation': 'eager', 'with_mask': 1, 'ort_optimize': 1, 'order': 'none', 'verbose': 1, 'enable_pattern': 'default'}
config 11: {'model': 'llama', 'backend': 'custom', 'device': 'cuda', 'num_hidden_layers': 1, 'repeat': 10, 'mixed': 0, 'dynamic': 0, 'config': 'medium', 'warmup': 5, 'implementation': 'eager', 'with_mask': 1, 'ort_optimize': 1, 'order': 'none', 'verbose': 1, 'enable_pattern': 'default+onnxruntime'}
config 12: {'model': 'llama', 'backend': 'custom', 'device': 'cuda', 'num_hidden_layers': 1, 'repeat': 10, 'mixed': 0, 'dynamic': 0, 'config': 'medium', 'warmup': 5, 'implementation': 'eager', 'with_mask': 1, 'ort_optimize': 0, 'order': 'none', 'verbose': 1, 'enable_pattern': 'default+onnxruntime+experimental'}
config 13: {'model': 'llama', 'backend': 'custom', 'device': 'cuda', 'num_hidden_layers': 1, 'repeat': 10, 'mixed': 0, 'dynamic': 0, 'config': 'medium', 'warmup': 5, 'implementation': 'eager', 'with_mask': 1, 'ort_optimize': 1, 'order': 'none', 'verbose': 1, 'enable_pattern': 'default+onnxruntime+experimental'}
config 14: {'model': 'llama', 'backend': 'ortmodule', 'device': 'cuda', 'num_hidden_layers': 1, 'repeat': 10, 'mixed': 0, 'dynamic': 0, 'config': 'medium', 'warmup': 5, 'implementation': 'eager', 'with_mask': 1, 'order': 'none', 'verbose': 1}
Running configuration.
# Run every configuration; run_benchmark launches the target script once per
# configuration and collects its output.
try:
    data = run_benchmark(
        parsed_args.script_name,
        configs,
        verbose=verbose,
        stop_if_exception=False,
        dump=parsed_args.dump in ("1", 1),
    )
    data_collected = True
except BenchmarkError as e:
    # A benchmark failure is reported but does not abort the whole script;
    # the plotting sections below are guarded by data_collected.
    if verbose:
        print(e)
    data_collected = False
0%| | 0/14 [00:00<?, ?it/s]
7%|▋ | 1/14 [00:09<02:03, 9.50s/it]
14%|█▍ | 2/14 [00:19<01:54, 9.53s/it]
21%|██▏ | 3/14 [00:31<01:57, 10.67s/it]
29%|██▊ | 4/14 [00:43<01:52, 11.23s/it]
36%|███▌ | 5/14 [00:53<01:38, 10.98s/it]
43%|████▎ | 6/14 [01:04<01:26, 10.77s/it]
50%|█████ | 7/14 [01:14<01:13, 10.53s/it]
57%|█████▋ | 8/14 [01:25<01:04, 10.73s/it]
64%|██████▍ | 9/14 [01:33<00:50, 10.09s/it]
71%|███████▏ | 10/14 [01:42<00:38, 9.61s/it]
79%|███████▊ | 11/14 [01:50<00:27, 9.21s/it]
86%|████████▌ | 12/14 [01:59<00:17, 8.93s/it]
93%|█████████▎| 13/14 [02:11<00:10, 10.05s/it]
100%|██████████| 14/14 [02:21<00:00, 9.89s/it]
100%|██████████| 14/14 [02:21<00:00, 10.09s/it]
Let’s process the data.
# Common file-name prefix encoding the benchmark parameters:
# plot_<model>-<with_mask>-m<mixed>d<dynamic>h<layers>-<implementation>
prefix = "plot_{}-{}-m{}d{}h{}-{}".format(
    parsed_args.model,
    parsed_args.with_mask,
    parsed_args.mixed,
    parsed_args.dynamic,
    parsed_args.num_hidden_layers,
    parsed_args.implementation,
)
if data_collected:
def clean_pattern(s):
    """Drop the redundant ``+default-default`` marker from a pattern string."""
    return s.replace("+default-default", "")
def make_legend(row):
    """Build a short legend for one benchmark row.

    Shape: ``<device>-h<layers>-<impl>-<backend>[-mix][-dyn][-(patterns)]``.
    """
    d = row.to_dict()
    parts = [
        d["device"],
        f"h{d['num_hidden_layers']}",
        d["implementation"],
        d["backend"],
    ]
    if d["mixed"]:
        parts.append("mix")
    if d["dynamic"]:
        parts.append("dyn")
    if "patterns" in d and d["patterns"] and "nan" not in str(d["patterns"]):
        parts.append(f"({clean_pattern(d['patterns'])})")
    legend = "-".join(str(p) for p in parts)
    assert "nan" not in legend, f"Legend {legend!r} is wrong, row={d}"
    return legend
# Turn the collected results into a DataFrame, compute the relative increase
# over the fastest eager run and save csv/xlsx files for later inspection.
df = pandas.DataFrame(data)
df = df.drop(["OUTPUT", "ERROR"], axis=1)
df["legend"] = df.apply(make_legend, axis=1)
df["time"] = df["time"].astype(float)

# Baseline: fastest pure eager measurement (implementation and backend).
df_eager = df[(df["implementation"] == "eager") & (df["backend"] == "eager")][
    "time"
].dropna()
if df_eager.shape[0] > 0:
    min_eager = df_eager.min()
    # increase = 0 for the baseline, >0 means slower than eager.
    df["increase"] = df["time"] / min_eager - 1
# df["ERROR"] = df["ERROR"].apply(lambda s: s.replace("\n", " "))
filename = f"plot_{prefix}_bench_with_cmd.csv"
df.to_csv(filename, index=False)
filename = f"plot_{prefix}_bench_with_cmd.xlsx"
df.to_excel(filename, index=False)  # NOTE(review): needs an Excel engine (openpyxl) — confirm it is installed

df = df.drop(["CMD"], axis=1)
filename = f"plot_{prefix}_bench.csv"
df.to_csv(filename, index=False)
# Round-trip through csv so pandas re-infers the column dtypes.
df = pandas.read_csv(filename)  # to cast type
print(df)

# summary: keep only the columns that actually exist in the results.
cs = [
    c
    for c in ["backend", "patterns", "warmup_time", "time", "increase"]
    if c in df.columns
]
dfs = df[cs]
filename = f"plot_{prefix}_summary.xlsx"
dfs.to_excel(filename, index=False)
filename = f"plot_{prefix}_summary.csv"
dfs.to_csv(filename, index=False)
print(dfs)
llama config mixed dynamic ... enable_pattern disable_pattern legend increase
0 2x1024-1024-1-1024-1024-1024-2-eager-1 medium 0 0 ... NaN NaN cuda-h1-eager-eager 0.000000
1 2x1024-1024-1-1024-1024-1024-2-eager-1 medium 0 0 ... NaN NaN cuda-h1-eager-dynger 0.048698
2 2x1024-1024-1-1024-1024-1024-2-eager-1 medium 0 0 ... NaN NaN cuda-h1-eager-ort 0.253650
3 2x1024-1024-1-1024-1024-1024-2-eager-1 medium 0 0 ... default default cuda-h1-eager-ort+-(+oo) 0.251130
4 2x1024-1024-1-1024-1024-1024-2-eager-1 medium 0 0 ... default NaN cuda-h1-eager-ort+-(+default-+oo) 0.061634
5 2x1024-1024-1-1024-1024-1024-2-eager-1 medium 0 0 ... default+onnxruntime NaN cuda-h1-eager-ort+-(+default+onnxruntime-+oo) 0.021459
6 2x1024-1024-1-1024-1024-1024-2-eager-1 medium 0 0 ... default+onnxruntime+experimental NaN cuda-h1-eager-ort+-(+default+onnxruntime+exper... 0.016192
7 2x1024-1024-1-1024-1024-1024-2-eager-1 medium 0 0 ... default+onnxruntime+experimental NaN cuda-h1-eager-ort+-(+default+onnxruntime+exper... 0.010914
8 2x1024-1024-1-1024-1024-1024-2-eager-1 medium 0 0 ... default default cuda-h1-eager-custom-(+oo) 0.069429
9 2x1024-1024-1-1024-1024-1024-2-eager-1 medium 0 0 ... default NaN cuda-h1-eager-custom-(+default-+oo) 0.047844
10 2x1024-1024-1-1024-1024-1024-2-eager-1 medium 0 0 ... default+onnxruntime NaN cuda-h1-eager-custom-(+default+onnxruntime-+oo) 0.017915
11 2x1024-1024-1-1024-1024-1024-2-eager-1 medium 0 0 ... default+onnxruntime+experimental NaN cuda-h1-eager-custom-(+default+onnxruntime+exp... 0.000395
12 2x1024-1024-1-1024-1024-1024-2-eager-1 medium 0 0 ... default+onnxruntime+experimental NaN cuda-h1-eager-custom-(+default+onnxruntime+exp... 0.013700
13 NaN medium 0 0 ... NaN NaN cuda-h1-eager-ortmodule NaN
[14 rows x 25 columns]
backend patterns warmup_time time increase
0 eager NaN 0.656028 0.057782 0.000000
1 dynger NaN 2.221376 0.060596 0.048698
2 ort NaN 4.558247 0.072439 0.253650
3 ort+ +default-default+oo 4.465371 0.072293 0.251130
4 ort+ +default-+oo 3.702030 0.061344 0.061634
5 ort+ +default+onnxruntime-+oo 3.785458 0.059022 0.021459
6 ort+ +default+onnxruntime+experimental- 3.669494 0.058718 0.016192
7 ort+ +default+onnxruntime+experimental-+oo 4.152905 0.058413 0.010914
8 custom +default-default+oo 1.735806 0.061794 0.069429
9 custom +default-+oo 1.925915 0.060547 0.047844
10 custom +default+onnxruntime-+oo 1.819260 0.058817 0.017915
11 custom +default+onnxruntime+experimental- 1.842596 0.057805 0.000395
12 custom +default+onnxruntime+experimental-+oo 4.649349 0.058574 0.013700
13 ortmodule NaN NaN NaN NaN
First lines.
print(df.head(2).T)
0 1
llama 2x1024-1024-1-1024-1024-1024-2-eager-1 2x1024-1024-1-1024-1024-1024-2-eager-1
config medium medium
mixed 0 0
dynamic 0 0
optimize True True
order none none
ort_optimize True True
backend eager dynger
repeat 10 10
warmup 5 5
with_mask 1 1
implementation eager eager
torch 2.4.0.dev20240425+cu118 2.4.0.dev20240425+cu118
transformers 4.39.3 4.39.3
warmup_time 0.656028 2.221376
time 0.057782 0.060596
model llama llama
device cuda cuda
num_hidden_layers 1 1
verbose 1 1
patterns NaN NaN
enable_pattern NaN NaN
disable_pattern NaN NaN
legend cuda-h1-eager-eager cuda-h1-eager-dynger
increase 0.0 0.048698
More simple
Simplified data
print(df.sort_values("legend"))
llama config mixed dynamic ... enable_pattern disable_pattern legend increase
11 2x1024-1024-1-1024-1024-1024-2-eager-1 medium 0 0 ... default+onnxruntime+experimental NaN cuda-h1-eager-custom-(+default+onnxruntime+exp... 0.000395
12 2x1024-1024-1-1024-1024-1024-2-eager-1 medium 0 0 ... default+onnxruntime+experimental NaN cuda-h1-eager-custom-(+default+onnxruntime+exp... 0.013700
10 2x1024-1024-1-1024-1024-1024-2-eager-1 medium 0 0 ... default+onnxruntime NaN cuda-h1-eager-custom-(+default+onnxruntime-+oo) 0.017915
9 2x1024-1024-1-1024-1024-1024-2-eager-1 medium 0 0 ... default NaN cuda-h1-eager-custom-(+default-+oo) 0.047844
8 2x1024-1024-1-1024-1024-1024-2-eager-1 medium 0 0 ... default default cuda-h1-eager-custom-(+oo) 0.069429
1 2x1024-1024-1-1024-1024-1024-2-eager-1 medium 0 0 ... NaN NaN cuda-h1-eager-dynger 0.048698
0 2x1024-1024-1-1024-1024-1024-2-eager-1 medium 0 0 ... NaN NaN cuda-h1-eager-eager 0.000000
2 2x1024-1024-1-1024-1024-1024-2-eager-1 medium 0 0 ... NaN NaN cuda-h1-eager-ort 0.253650
6 2x1024-1024-1-1024-1024-1024-2-eager-1 medium 0 0 ... default+onnxruntime+experimental NaN cuda-h1-eager-ort+-(+default+onnxruntime+exper... 0.016192
7 2x1024-1024-1-1024-1024-1024-2-eager-1 medium 0 0 ... default+onnxruntime+experimental NaN cuda-h1-eager-ort+-(+default+onnxruntime+exper... 0.010914
5 2x1024-1024-1-1024-1024-1024-2-eager-1 medium 0 0 ... default+onnxruntime NaN cuda-h1-eager-ort+-(+default+onnxruntime-+oo) 0.021459
4 2x1024-1024-1-1024-1024-1024-2-eager-1 medium 0 0 ... default NaN cuda-h1-eager-ort+-(+default-+oo) 0.061634
3 2x1024-1024-1-1024-1024-1024-2-eager-1 medium 0 0 ... default default cuda-h1-eager-ort+-(+oo) 0.251130
13 NaN medium 0 0 ... NaN NaN cuda-h1-eager-ortmodule NaN
[14 rows x 25 columns]
Plot warmup time.
# Library versions shown in the plot titles (assumes a single torch and a
# single transformers version across all runs — only the first is shown).
torch_version = list(set(df["torch"].dropna()))
transformers_version = list(set(df["transformers"].dropna()))
ver = f"{torch_version[0]} - {transformers_version[0]}"
model = parsed_args.model
# Model description string stored in the column named after the model.
modeldf = list(set(df[model].dropna()))[0]
title_prefix = (
    f"lower better\n"
    f"{parsed_args.model} - {ver} - mask{parsed_args.with_mask}"
    f"\n<device>-h<hidden-layers>-<implementation>-<backend>-(optimization)"
)
# Plot the warmup time; rows are sorted by computation time and indexed by
# legend (this reindexing is reused by the following plots).
if data_collected:
    fig, ax = plt.subplots(1, 1, figsize=(12, df.shape[0] // 3 + 1))
    df = df.sort_values("time").set_index("legend")
    df[["warmup_time"]].plot.barh(ax=ax, title=f"warmup time\n{title_prefix}")
    ax.grid(True)
    fig.tight_layout()
    fig.savefig(f"plot_{prefix}_bench_warmup_time.png")
Plot time.
# Plot the computation time; the x-axis starts slightly below the minimum
# to magnify the differences between backends.
if data_collected:
    fig, ax = plt.subplots(1, 1, figsize=(12, df.shape[0] // 3 + 1))
    df[["time"]].plot.barh(ax=ax, title=f"computation time\n{title_prefix}")
    mi, ma = df["time"].min(), df["time"].max()
    mi = mi - (ma - mi) / 10
    ax.set_xlim(left=mi)
    ax.grid(True)
    fig.tight_layout()
    fig.savefig(f"plot_{prefix}_bench_time.png")
Plot increase.
# Plot the relative slowdown with respect to the best eager run.
if data_collected:
    figure, axis = plt.subplots(1, 1, figsize=(12, df.shape[0] // 3 + 1))
    df[["increase"]].plot.barh(ax=axis, title=f"comparison to eager %\n{title_prefix}")
    axis.grid(True)
    figure.tight_layout()
    figure.savefig(f"plot_{prefix}_bench_relative.png")
Total running time of the script: (2 minutes 30.437 seconds)