Note
Go to the end to download the full example code
Measuring CPU performance with a parallelized vector sum and AVX#
The example compares the time spent computing the sum of all coefficients of a matrix when the function walks through the coefficients by rows or by columns, when the computation is parallelized, or when it uses AVX instructions.
Vector Sum#
from tqdm import tqdm
import numpy
import matplotlib.pyplot as plt
from pandas import DataFrame
from onnx_extended.ext_test_case import measure_time, unit_test_going
from onnx_extended.validation.cpu._validation import (
vector_sum_array as vector_sum,
vector_sum_array_parallel as vector_sum_parallel,
vector_sum_array_avx as vector_sum_avx,
vector_sum_array_avx_parallel as vector_sum_avx_parallel,
)
# Benchmark four implementations of "sum of all coefficients of a square
# matrix" for a range of matrix sizes, and collect one observation per
# (dimension, implementation) pair.
obs = []
dims = [500, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600, 1700, 1800, 2000]
if unit_test_going():
    # Only keep the first two sizes when unit_test_going() reports a test run.
    dims = dims[:2]

# (label, function, extra positional arguments appended after (dim, values)).
# The order is the order in which observations are appended for each dim.
# NOTE(review): the trailing True passed to vector_sum / vector_sum_parallel
# presumably selects the row-wise traversal (hence the "rows" label) --
# confirm against the extension's signature.
implementations = [
    ("rows", vector_sum, (True,)),
    ("rows//", vector_sum_parallel, (True,)),
    ("avx", vector_sum_avx, ()),
    ("avx//", vector_sum_avx_parallel, ()),
]

for dim in tqdm(dims):
    # A flattened dim x dim matrix of ones: the exact sum is dim**2.
    values = numpy.ones((dim, dim), dtype=numpy.float32).ravel()
    expected = dim**2

    for direction, fct, extra in implementations:
        # The error is computed with the implementation being measured, so
        # every row of the results reports its own discrepancy (previously
        # "rows//" silently reused the value obtained with "rows").
        diff = abs(fct(dim, values, *extra) - expected)

        # The lambda is invoked by measure_time before the loop variables
        # change, so capturing them by name is safe here.
        res = measure_time(lambda: fct(dim, values, *extra), max_time=0.5)

        obs.append(
            dict(
                dim=dim,
                size=values.size,
                time=res["average"],
                direction=direction,
                time_per_element=res["average"] / dim**2,
                diff=diff,
            )
        )

# One row per dimension, one column per implementation.
df = DataFrame(obs)
piv = df.pivot(index="dim", columns="direction", values="time_per_element")
print(piv)
0%| | 0/14 [00:00<?, ?it/s]
7%|7 | 1/14 [00:06<01:25, 6.60s/it]
14%|#4 | 2/14 [00:22<02:23, 11.93s/it]
21%|##1 | 3/14 [00:38<02:32, 13.85s/it]
29%|##8 | 4/14 [00:43<01:43, 10.32s/it]
36%|###5 | 5/14 [00:50<01:23, 9.30s/it]
43%|####2 | 6/14 [00:59<01:13, 9.19s/it]
50%|##### | 7/14 [01:05<00:56, 8.06s/it]
57%|#####7 | 8/14 [01:13<00:47, 7.90s/it]
64%|######4 | 9/14 [01:15<00:31, 6.29s/it]
71%|#######1 | 10/14 [01:18<00:21, 5.26s/it]
79%|#######8 | 11/14 [01:21<00:13, 4.34s/it]
86%|########5 | 12/14 [01:23<00:07, 3.75s/it]
93%|#########2| 13/14 [01:28<00:04, 4.10s/it]
100%|##########| 14/14 [01:32<00:00, 4.16s/it]
100%|##########| 14/14 [01:32<00:00, 6.62s/it]
direction avx avx// rows rows//
dim
500 2.124341e-10 2.970035e-09 1.415554e-09 8.919709e-09
700 1.472835e-10 2.119565e-09 1.403033e-09 6.208487e-09
800 1.821257e-10 2.282421e-09 1.513690e-09 5.536130e-09
900 4.354594e-10 1.834429e-09 2.839234e-09 6.489717e-09
1000 6.261898e-10 1.790831e-09 3.901196e-09 5.061805e-09
1100 8.991476e-10 1.502097e-09 2.668395e-09 4.909074e-09
1200 1.228392e-09 1.332833e-09 3.610501e-09 4.859514e-09
1300 1.006782e-09 2.078401e-09 2.328450e-09 4.445881e-09
1400 6.657746e-10 5.622565e-09 2.554591e-09 3.542308e-09
1500 7.966057e-10 4.039738e-09 2.743586e-09 3.729478e-09
1600 3.986791e-10 1.298015e-09 3.199657e-09 2.493390e-09
1700 4.674480e-10 1.629045e-09 1.810117e-09 1.461178e-09
1800 5.678262e-10 2.122750e-09 2.106633e-09 2.333242e-09
2000 1.260421e-09 2.016133e-09 2.736361e-09 4.461474e-09
Plots#
# Reshape the observations so that every implementation ("direction")
# becomes one column, i.e. one curve per implementation on each plot.
piv_diff = df.pivot(index="dim", columns="direction", values="diff")
piv_time = df.pivot(index="dim", columns="direction", values="time")

# Three side-by-side plots sharing the matrix dimension as x axis (log scale):
# time per element, absolute summation error, and total time.
fig, ax = plt.subplots(1, 3, figsize=(12, 6))
# The title names the plotted quantity; the previous one ("Comparison between
# two summation") was inherited from a two-curve example although four
# implementations are compared here.
piv.plot(ax=ax[0], logx=True, title="Comparison of summations (time per element)")
# NOTE(review): errors that are exactly zero cannot be drawn on a log-scaled
# y axis -- confirm this plot is not empty for the measured values.
piv_diff.plot(ax=ax[1], logx=True, logy=True, title="Summation errors")
piv_time.plot(ax=ax[2], logx=True, logy=True, title="Total time")
fig.savefig("plot_bench_cpu_vector_sum_avx_parallel.png")
AVX is faster.
Total running time of the script: ( 1 minutes 36.823 seconds)