Measuring CPU performance with a parallelized vector sum#
The example compares the time spent computing the sum of all coefficients of a matrix when the function walks through the coefficients by rows or by columns, and when the computation is parallelized.
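As background, a C-contiguous matrix is stored row by row, so walking it by rows reads memory contiguously while walking it by columns jumps dim elements between consecutive reads. The short pure-NumPy sketch below is not part of the benchmark itself; it only makes that layout explicit.

# Pure-NumPy illustration (not part of the benchmark): element (i, j) of a
# C-contiguous matrix sits at offset i * dim + j in the flattened buffer,
# so a row-wise traversal is contiguous and a column-wise traversal is strided.
import numpy

dim = 4
m = numpy.arange(dim * dim, dtype=numpy.float32).reshape(dim, dim)
flat = m.ravel()  # the underlying row-major buffer

i, j = 2, 3
assert m[i, j] == flat[i * dim + j]  # row-major addressing
print(m.strides)  # (16, 4): the next row is 16 bytes away, the next column only 4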
Vector Sum#
from tqdm import tqdm
import numpy
import matplotlib.pyplot as plt
from pandas import DataFrame
from onnx_extended.ext_test_case import measure_time, unit_test_going
from onnx_extended.validation.cpu._validation import (
    vector_sum_array as vector_sum,
    vector_sum_array_parallel as vector_sum_parallel,
)

obs = []
dims = [500, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600, 1700, 1800, 2000]
if unit_test_going():
    dims = dims[:2]

for dim in tqdm(dims):
    values = numpy.ones((dim, dim), dtype=numpy.float32).ravel()
    diff = abs(vector_sum(dim, values, True) - dim**2)

    res = measure_time(lambda: vector_sum(dim, values, True), max_time=0.5)
    obs.append(
        dict(
            dim=dim,
            size=values.size,
            time=res["average"],
            direction="rows",
            time_per_element=res["average"] / dim**2,
            diff=diff,
        )
    )

    res = measure_time(lambda: vector_sum_parallel(dim, values, True), max_time=0.5)
    obs.append(
        dict(
            dim=dim,
            size=values.size,
            time=res["average"],
            direction="rows//",
            time_per_element=res["average"] / dim**2,
            diff=diff,
        )
    )

    diff = abs(vector_sum(dim, values, False) - dim**2)
    res = measure_time(lambda: vector_sum_parallel(dim, values, False), max_time=0.5)
    obs.append(
        dict(
            dim=dim,
            size=values.size,
            time=res["average"],
            direction="cols//",
            time_per_element=res["average"] / dim**2,
            diff=diff,
        )
    )

df = DataFrame(obs)
piv = df.pivot(index="dim", columns="direction", values="time_per_element")
print(piv)
direction cols// rows rows//
dim
500 4.501106e-10 1.143559e-09 4.139325e-10
700 6.155812e-10 1.073944e-09 3.556286e-10
800 6.970523e-10 1.117020e-09 4.847324e-10
900 4.924685e-09 1.859755e-09 4.701204e-09
1000 6.969506e-09 1.727890e-09 4.129575e-09
1100 7.630074e-09 2.125762e-09 3.901288e-09
1200 7.776375e-09 1.905790e-09 4.013687e-09
1300 5.484795e-09 1.815760e-09 2.989803e-09
1400 6.822384e-09 1.657764e-09 2.738828e-09
1500 7.283323e-09 3.763968e-09 4.062766e-09
1600 7.517995e-09 2.326358e-09 3.914371e-09
1700 7.771251e-09 3.978246e-09 3.644829e-09
1800 8.769196e-09 4.425172e-09 3.465385e-09
2000 7.528083e-09 3.870298e-09 3.345342e-09
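A possible follow-up, not in the original script, is to read the table as a speedup: dividing the sequential row-wise time per element by its parallel counterpart shows for which dimensions parallelization pays off.

# Extra step (not in the original script): speedup of the parallel row-wise
# sum over the sequential one, computed from the pivot table above.
# Values above 1 mean parallelization helps for that dimension.
print((piv["rows"] / piv["rows//"]).round(2))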
Plots#
piv_diff = df.pivot(index="dim", columns="direction", values="diff")
piv_time = df.pivot(index="dim", columns="direction", values="time")
fig, ax = plt.subplots(1, 3, figsize=(12, 6))
piv.plot(ax=ax[0], logx=True, title="Comparison between summations")
piv_diff.plot(ax=ax[1], logx=True, logy=True, title="Summation errors")
piv_time.plot(ax=ax[2], logx=True, logy=True, title="Total time")
fig.savefig("plot_bench_cpu_vector_sum_parallel.png")
pandas emits a UserWarning while drawing the second plot: the summation errors are all zero, so the data has no positive values and cannot be log-scaled.
The summation by rows is much faster, as expected, since the coefficients are read contiguously in memory. That explains why it is usually more efficient to transpose the first matrix before a matrix multiplication. Parallelization speeds it up further, although the gain depends on the matrix size.
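To isolate the memory-access effect from parallelization, one extra measurement (not in the original script) can time the sequential kernel in both directions for a single size, reusing the functions imported above.

# Extra check (not in the original script): sequential summation by rows vs
# by columns for one size, reusing vector_sum and measure_time from above.
dim = 2000
values = numpy.ones((dim, dim), dtype=numpy.float32).ravel()
by_rows = measure_time(lambda: vector_sum(dim, values, True), max_time=0.5)
by_cols = measure_time(lambda: vector_sum(dim, values, False), max_time=0.5)
print("rows:", by_rows["average"], "cols:", by_cols["average"])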
Total running time of the script: ( 0 minutes 37.957 seconds)