Note
Go to the end to download the full example code.
Compares dot implementations (numpy, c++, sse, openmp)
numpy has a very fast implementation of the dot product. It is difficult to do better and very easy to do worse. This example looks into a couple of slower implementations written with cython. The functions under test are imported below and benchmarked one after another.
import numpy
import matplotlib.pyplot as plt
from pandas import DataFrame, concat
from teachcompute.validation.cython.dot_cython import ddot_array_16_sse, ddot_array
from teachcompute.validation.cython.dot_cython_omp import (
ddot_cython_array_omp,
ddot_array_openmp,
get_omp_max_threads,
ddot_array_openmp_16,
)
from teachcompute.ext_test_case import measure_time_dim, unit_test_going
def get_vectors(fct, n, h=250, dtype=numpy.float64):
    """Build one benchmark context per vector size.

    Sizes run over ``range(10, n, h)``. For every size, two independent
    random vectors are drawn with ``numpy.random.randn`` and cast to
    *dtype*.

    :param fct: callable stored under the ``"dot"`` key of every context
    :param n: exclusive upper bound of the vector sizes
    :param h: step between two consecutive sizes
    :param dtype: element type of the generated vectors
    :return: list of dictionaries with the keys ``va``, ``vb``, ``dot``
        and ``x_name`` (the vector size)
    """
    contexts = []
    for size in range(10, n, h):
        # Draw ``va`` before ``vb`` so the random stream is consumed in a
        # fixed order for a given seed.
        left = numpy.random.randn(size).astype(dtype)
        right = numpy.random.randn(size).astype(dtype)
        contexts.append({"va": left, "vb": right, "dot": fct, "x_name": size})
    return contexts
Number of threads
# Print the value returned by get_omp_max_threads() (presumably the maximum
# number of threads OpenMP may use -- confirm in dot_cython_omp).
print(get_omp_max_threads())
10
Several cython dot implementations
def numpy_dot(va, vb):
    """Return the dot product of *va* and *vb* computed by ``numpy.dot``.

    This thin wrapper gives the numpy baseline its own ``__name__``.
    """
    product = numpy.dot(va, vb)
    return product
def ddot_omp(va, vb):
    """Return ``ddot_cython_array_omp(va, vb)``.

    No ``schedule`` argument is passed, so the callee's default applies.
    """
    product = ddot_cython_array_omp(va, vb)
    return product
def ddot_omp_static(va, vb):
    """Return ``ddot_cython_array_omp(va, vb, schedule=1)``.

    NOTE(review): ``schedule=1`` presumably selects a static OpenMP
    schedule, judging by this wrapper's name -- confirm in dot_cython_omp.
    """
    product = ddot_cython_array_omp(va, vb, schedule=1)
    return product
def ddot_omp_dyn(va, vb):
    """Return ``ddot_cython_array_omp(va, vb, schedule=2)``.

    NOTE(review): ``schedule=2`` presumably selects a dynamic OpenMP
    schedule, judging by this wrapper's name -- confirm in dot_cython_omp.
    """
    product = ddot_cython_array_omp(va, vb, schedule=2)
    return product
def ddot_omp_cpp(va, vb):
    """Return ``ddot_array_openmp(va, vb)``.

    This thin wrapper gives the function its own ``__name__``.
    """
    product = ddot_array_openmp(va, vb)
    return product
def ddot_omp_cpp_16(va, vb):
    """Return ``ddot_array_openmp_16(va, vb)``.

    This thin wrapper gives the function its own ``__name__``.
    """
    product = ddot_array_openmp_16(va, vb)
    return product
# Time every implementation over the same grid of vector sizes and keep one
# DataFrame of measurements per implementation in ``dfs``.
dfs = []
implementations = (
    numpy_dot,
    ddot_array,
    ddot_array_16_sse,
    ddot_omp,
    ddot_omp_static,
    ddot_omp_dyn,
    ddot_omp_cpp,
    ddot_omp_cpp_16,
)
for fct in implementations:
    label = fct.__name__
    # A much smaller upper bound is used when unit tests are running.
    upper = 400 if unit_test_going() else 40000
    ctxs = get_vectors(fct, upper)
    print(label)
    # The statement is evaluated in each context, where ``dot``, ``va`` and
    # ``vb`` are the keys prepared by get_vectors.
    timings = measure_time_dim("dot(va, vb)", ctxs, verbose=1)
    df = DataFrame(list(timings))
    # Tag the rows so the frames can be concatenated and pivoted later.
    df["fct"] = label
    dfs.append(df)
    print(df.tail(n=3))
numpy_dot
0%| | 0/160 [00:00<?, ?it/s]
26%|██▌ | 41/160 [00:00<00:00, 196.18it/s]
38%|███▊ | 61/160 [00:00<00:01, 67.49it/s]
45%|████▌ | 72/160 [00:01<00:01, 59.17it/s]
50%|█████ | 80/160 [00:01<00:01, 50.86it/s]
54%|█████▍ | 86/160 [00:03<00:05, 14.17it/s]
56%|█████▋ | 90/160 [00:03<00:05, 13.08it/s]
65%|██████▌ | 104/160 [00:03<00:02, 20.48it/s]
69%|██████▉ | 111/160 [00:05<00:04, 11.20it/s]
72%|███████▏ | 115/160 [00:08<00:08, 5.05it/s]
74%|███████▍ | 118/160 [00:08<00:08, 5.18it/s]
76%|███████▋ | 122/160 [00:09<00:06, 6.27it/s]
78%|███████▊ | 125/160 [00:09<00:05, 6.76it/s]
81%|████████ | 129/160 [00:09<00:03, 8.37it/s]
82%|████████▏ | 131/160 [00:13<00:11, 2.62it/s]
83%|████████▎ | 133/160 [00:13<00:09, 2.87it/s]
86%|████████▌ | 137/160 [00:13<00:05, 4.15it/s]
87%|████████▋ | 139/160 [00:13<00:04, 4.78it/s]
91%|█████████▏| 146/160 [00:14<00:01, 7.84it/s]
93%|█████████▎| 149/160 [00:14<00:01, 8.79it/s]
95%|█████████▌| 152/160 [00:16<00:02, 3.82it/s]
96%|█████████▋| 154/160 [00:17<00:02, 2.87it/s]
97%|█████████▋| 155/160 [00:19<00:02, 2.11it/s]
98%|█████████▊| 156/160 [00:19<00:01, 2.27it/s]
99%|█████████▉| 159/160 [00:19<00:00, 3.18it/s]
100%|██████████| 160/160 [00:20<00:00, 3.20it/s]
100%|██████████| 160/160 [00:20<00:00, 7.97it/s]
average deviation min_exec max_exec repeat number ttime context_size warmup_time x_name fct
157 0.000031 0.000072 0.000006 0.000248 10 50 0.000308 184 0.000020 39260 numpy_dot
158 0.000624 0.001273 0.000006 0.004053 10 50 0.006240 184 0.000029 39510 numpy_dot
159 0.000602 0.001374 0.000005 0.004639 10 50 0.006015 184 0.000023 39760 numpy_dot
ddot_array
0%| | 0/160 [00:00<?, ?it/s]
24%|██▍ | 38/160 [00:00<00:00, 375.55it/s]
48%|████▊ | 76/160 [00:00<00:00, 193.02it/s]
62%|██████▎ | 100/160 [00:00<00:00, 142.27it/s]
74%|███████▍ | 118/160 [00:00<00:00, 116.80it/s]
82%|████████▎ | 132/160 [00:01<00:00, 99.85it/s]
90%|█████████ | 144/160 [00:01<00:00, 88.45it/s]
96%|█████████▋| 154/160 [00:01<00:00, 79.97it/s]
100%|██████████| 160/160 [00:01<00:00, 105.17it/s]
average deviation min_exec max_exec repeat number ttime context_size warmup_time x_name fct
157 0.000034 5.379273e-07 0.000033 0.000035 10 50 0.000340 184 0.000043 39260 ddot_array
158 0.000034 5.291191e-07 0.000034 0.000035 10 50 0.000342 184 0.000054 39510 ddot_array
159 0.000035 1.512740e-06 0.000033 0.000039 10 50 0.000352 184 0.000050 39760 ddot_array
ddot_array_16_sse
0%| | 0/160 [00:00<?, ?it/s]
36%|███▌ | 57/160 [00:00<00:00, 562.15it/s]
71%|███████▏ | 114/160 [00:00<00:00, 312.63it/s]
95%|█████████▌| 152/160 [00:00<00:00, 235.72it/s]
100%|██████████| 160/160 [00:00<00:00, 253.21it/s]
average deviation min_exec max_exec repeat number ttime context_size warmup_time x_name fct
157 0.000014 0.000002 0.000011 0.000018 10 50 0.000143 184 0.000035 39260 ddot_array_16_sse
158 0.000017 0.000003 0.000013 0.000023 10 50 0.000171 184 0.000065 39510 ddot_array_16_sse
159 0.000014 0.000002 0.000012 0.000017 10 50 0.000139 184 0.000042 39760 ddot_array_16_sse
ddot_omp
0%| | 0/160 [00:00<?, ?it/s]
14%|█▍ | 23/160 [00:00<00:00, 223.63it/s]
29%|██▉ | 46/160 [00:00<00:00, 211.05it/s]
42%|████▎ | 68/160 [00:00<00:00, 191.45it/s]
55%|█████▌ | 88/160 [00:00<00:00, 172.16it/s]
66%|██████▋ | 106/160 [00:00<00:00, 165.64it/s]
77%|███████▋ | 123/160 [00:00<00:00, 161.20it/s]
88%|████████▊ | 140/160 [00:00<00:00, 159.37it/s]
98%|█████████▊| 156/160 [00:00<00:00, 154.64it/s]
100%|██████████| 160/160 [00:00<00:00, 166.52it/s]
average deviation min_exec max_exec repeat number ttime context_size warmup_time x_name fct
157 0.000014 1.649187e-06 0.000013 0.000017 10 50 0.000139 184 0.000024 39260 ddot_omp
158 0.000014 1.319730e-06 0.000012 0.000017 10 50 0.000143 184 0.000051 39510 ddot_omp
159 0.000014 7.013416e-07 0.000013 0.000015 10 50 0.000140 184 0.000025 39760 ddot_omp
ddot_omp_static
0%| | 0/160 [00:00<?, ?it/s]
16%|█▌ | 25/160 [00:00<00:00, 249.66it/s]
31%|███▏ | 50/160 [00:00<00:00, 225.81it/s]
46%|████▌ | 73/160 [00:00<00:00, 208.47it/s]
59%|█████▉ | 95/160 [00:00<00:00, 189.12it/s]
72%|███████▏ | 115/160 [00:00<00:00, 177.75it/s]
83%|████████▎ | 133/160 [00:00<00:00, 156.76it/s]
94%|█████████▍| 150/160 [00:00<00:00, 147.63it/s]
100%|██████████| 160/160 [00:00<00:00, 165.81it/s]
average deviation min_exec max_exec repeat number ttime context_size warmup_time x_name fct
157 0.000015 0.000002 0.000013 0.000020 10 50 0.000153 184 0.000033 39260 ddot_omp_static
158 0.000015 0.000001 0.000013 0.000017 10 50 0.000148 184 0.000033 39510 ddot_omp_static
159 0.000016 0.000002 0.000014 0.000020 10 50 0.000165 184 0.000039 39760 ddot_omp_static
ddot_omp_dyn
0%| | 0/160 [00:00<?, ?it/s]
10%|█ | 16/160 [00:00<00:00, 149.65it/s]
19%|█▉ | 31/160 [00:00<00:01, 122.15it/s]
28%|██▊ | 44/160 [00:00<00:01, 103.84it/s]
34%|███▍ | 55/160 [00:00<00:01, 91.38it/s]
41%|████ | 65/160 [00:00<00:01, 81.08it/s]
46%|████▋ | 74/160 [00:00<00:01, 75.12it/s]
51%|█████▏ | 82/160 [00:01<00:01, 65.67it/s]
56%|█████▌ | 89/160 [00:01<00:01, 60.71it/s]
60%|██████ | 96/160 [00:01<00:01, 55.27it/s]
64%|██████▍ | 102/160 [00:01<00:01, 51.01it/s]
68%|██████▊ | 108/160 [00:01<00:01, 48.16it/s]
71%|███████ | 113/160 [00:01<00:01, 45.92it/s]
74%|███████▍ | 118/160 [00:01<00:00, 43.47it/s]
77%|███████▋ | 123/160 [00:01<00:00, 41.74it/s]
80%|████████ | 128/160 [00:02<00:00, 38.48it/s]
82%|████████▎ | 132/160 [00:02<00:00, 37.86it/s]
85%|████████▌ | 136/160 [00:02<00:00, 36.38it/s]
88%|████████▊ | 140/160 [00:02<00:00, 35.29it/s]
90%|█████████ | 144/160 [00:02<00:00, 34.93it/s]
92%|█████████▎| 148/160 [00:02<00:00, 34.55it/s]
95%|█████████▌| 152/160 [00:02<00:00, 32.26it/s]
98%|█████████▊| 156/160 [00:03<00:00, 31.37it/s]
100%|██████████| 160/160 [00:03<00:00, 29.50it/s]
100%|██████████| 160/160 [00:03<00:00, 50.33it/s]
average deviation min_exec max_exec repeat number ttime context_size warmup_time x_name fct
157 0.000064 0.000005 0.000058 0.000075 10 50 0.000638 184 0.000084 39260 ddot_omp_dyn
158 0.000076 0.000010 0.000060 0.000093 10 50 0.000759 184 0.000119 39510 ddot_omp_dyn
159 0.000088 0.000003 0.000082 0.000092 10 50 0.000877 184 0.000097 39760 ddot_omp_dyn
ddot_omp_cpp
0%| | 0/160 [00:00<?, ?it/s]
19%|█▉ | 31/160 [00:00<00:00, 305.47it/s]
39%|███▉ | 62/160 [00:00<00:00, 267.23it/s]
56%|█████▋ | 90/160 [00:00<00:00, 224.22it/s]
71%|███████▏ | 114/160 [00:00<00:00, 203.36it/s]
84%|████████▍ | 135/160 [00:00<00:00, 188.38it/s]
97%|█████████▋| 155/160 [00:00<00:00, 172.96it/s]
100%|██████████| 160/160 [00:00<00:00, 194.62it/s]
average deviation min_exec max_exec repeat number ttime context_size warmup_time x_name fct
157 0.000012 9.622883e-07 0.000011 0.000014 10 50 0.000120 184 0.000020 39260 ddot_omp_cpp
158 0.000016 2.446743e-06 0.000011 0.000020 10 50 0.000159 184 0.000019 39510 ddot_omp_cpp
159 0.000013 1.154813e-06 0.000012 0.000016 10 50 0.000133 184 0.000025 39760 ddot_omp_cpp
ddot_omp_cpp_16
0%| | 0/160 [00:00<?, ?it/s]
17%|█▋ | 27/160 [00:00<00:00, 261.68it/s]
37%|███▋ | 59/160 [00:00<00:00, 292.84it/s]
56%|█████▌ | 89/160 [00:00<00:00, 295.85it/s]
74%|███████▍ | 119/160 [00:00<00:00, 261.14it/s]
91%|█████████▏| 146/160 [00:00<00:00, 236.48it/s]
100%|██████████| 160/160 [00:00<00:00, 245.37it/s]
average deviation min_exec max_exec repeat number ttime context_size warmup_time x_name fct
157 0.000010 0.000001 0.000009 0.000013 10 50 0.000097 184 0.000026 39260 ddot_omp_cpp_16
158 0.000011 0.000001 0.000009 0.000012 10 50 0.000105 184 0.000047 39510 ddot_omp_cpp_16
159 0.000011 0.000001 0.000009 0.000013 10 50 0.000106 184 0.000024 39760 ddot_omp_cpp_16
Let’s display the results
# Gather every measurement in one frame; ``N`` is the vector size.
cc = concat(dfs)
cc["N"] = cc["x_name"]

# One column per implementation, one row per vector size. Computed once and
# reused by the two subplots that show every size.
averages = cc.pivot(index="N", columns="fct", values="average")

fig, ax = plt.subplots(2, 2, figsize=(10, 10))

# Top-left: small vectors only.
cc[cc.N <= 1000].pivot(index="N", columns="fct", values="average").plot(
    logy=True, ax=ax[0, 0]
)

# Top-right: every implementation, every size, logarithmic y axis.
averages.plot(logy=True, ax=ax[0, 1])

# Bottom-right: same data on a log-log scale.
averages.plot(logy=True, logx=True, ax=ax[1, 1])

# Bottom-left: functions whose name contains "omp" plus ddot_array, leaving
# out any function whose name contains "dyn".
omp_or_plain = cc.fct.str.contains("omp") | (cc.fct == "ddot_array")
not_dynamic = ~cc.fct.str.contains("dyn")
cc[omp_or_plain & not_dynamic].pivot(
    index="N", columns="fct", values="average"
).plot(logy=True, ax=ax[1, 0])

# Each title describes what its own subplot shows. The former top-right title
# mentioned "dot_product", a function that is not part of this benchmark.
ax[0, 0].set_title("Comparison of cython ddot implementations\nN <= 1000")
ax[0, 1].set_title("Comparison of cython ddot implementations\nall sizes")
ax[1, 0].set_title("OpenMP implementations and ddot_array\nwithout ddot_omp_dyn")
ax[1, 1].set_title(
    "Comparison of cython ddot implementations\nall sizes, log-log scale"
)

Text(0.5, 1.0, 'Comparison of cython ddot implementations\nwithout dot_product')
Total running time of the script: (0 minutes 31.023 seconds)