Note
Go to the end to download the full example code.
Compares dot implementations (numpy, c++, sse, openmp)¶
numpy has a very fast implementation of the dot product. It is difficult to be better and very easy to be slower. This example looks into a couple of slower implementations with cython. The tested functions are the following:
import numpy
import matplotlib.pyplot as plt
from pandas import DataFrame, concat
from teachcompute.validation.cython.dot_cython import ddot_array_16_sse, ddot_array
from teachcompute.validation.cython.dot_cython_omp import (
ddot_cython_array_omp,
ddot_array_openmp,
get_omp_max_threads,
ddot_array_openmp_16,
)
from teachcompute.ext_test_case import measure_time_dim, unit_test_going
def get_vectors(fct, n, h=250, dtype=numpy.float64):
ctxs = [
dict(
va=numpy.random.randn(n).astype(dtype),
vb=numpy.random.randn(n).astype(dtype),
dot=fct,
x_name=n,
)
for n in range(10, n, h)
]
return ctxs
Number of threads¶
print(get_omp_max_threads())
10
Several cython dot¶
def numpy_dot(va, vb):
return numpy.dot(va, vb)
def ddot_omp(va, vb):
return ddot_cython_array_omp(va, vb)
def ddot_omp_static(va, vb):
return ddot_cython_array_omp(va, vb, schedule=1)
def ddot_omp_dyn(va, vb):
return ddot_cython_array_omp(va, vb, schedule=2)
def ddot_omp_cpp(va, vb):
return ddot_array_openmp(va, vb)
def ddot_omp_cpp_16(va, vb):
return ddot_array_openmp_16(va, vb)
dfs = []
for fct in [
numpy_dot,
ddot_array,
ddot_array_16_sse,
ddot_omp,
ddot_omp_static,
ddot_omp_dyn,
ddot_omp_cpp,
ddot_omp_cpp_16,
]:
ctxs = get_vectors(fct, 400 if unit_test_going() else 40000)
print(fct.__name__)
df = DataFrame(list(measure_time_dim("dot(va, vb)", ctxs, verbose=1)))
df["fct"] = fct.__name__
dfs.append(df)
print(df.tail(n=3))
numpy_dot
0%| | 0/160 [00:00<?, ?it/s]
26%|██▌ | 41/160 [00:00<00:01, 70.36it/s]
39%|███▉ | 63/160 [00:00<00:00, 99.59it/s]
49%|████▉ | 79/160 [00:01<00:01, 54.71it/s]
56%|█████▌ | 89/160 [00:01<00:01, 52.64it/s]
69%|██████▉ | 110/160 [00:01<00:00, 74.39it/s]
82%|████████▎ | 132/160 [00:01<00:00, 96.18it/s]
99%|█████████▉| 159/160 [00:01<00:00, 128.89it/s]
100%|██████████| 160/160 [00:01<00:00, 88.52it/s]
average deviation min_exec ... warmup_time x_name fct
157 0.000006 0.000002 0.000005 ... 0.000015 39260 numpy_dot
158 0.000006 0.000001 0.000005 ... 0.000016 39510 numpy_dot
159 0.000007 0.000002 0.000005 ... 0.000276 39760 numpy_dot
[3 rows x 11 columns]
ddot_array
0%| | 0/160 [00:00<?, ?it/s]
21%|██▏ | 34/160 [00:00<00:00, 336.85it/s]
42%|████▎ | 68/160 [00:00<00:00, 211.28it/s]
57%|█████▊ | 92/160 [00:00<00:00, 150.89it/s]
69%|██████▉ | 110/160 [00:00<00:00, 124.33it/s]
78%|███████▊ | 125/160 [00:00<00:00, 104.56it/s]
86%|████████▌ | 137/160 [00:01<00:00, 93.39it/s]
92%|█████████▏| 147/160 [00:01<00:00, 83.89it/s]
98%|█████████▊| 156/160 [00:01<00:00, 68.81it/s]
100%|██████████| 160/160 [00:01<00:00, 100.49it/s]
average deviation min_exec ... warmup_time x_name fct
157 0.000036 0.000003 0.000032 ... 0.000052 39260 ddot_array
158 0.000035 0.000002 0.000033 ... 0.000043 39510 ddot_array
159 0.000036 0.000005 0.000033 ... 0.000041 39760 ddot_array
[3 rows x 11 columns]
ddot_array_16_sse
0%| | 0/160 [00:00<?, ?it/s]
32%|███▎ | 52/160 [00:00<00:00, 507.14it/s]
64%|██████▍ | 103/160 [00:00<00:00, 320.80it/s]
87%|████████▋ | 139/160 [00:00<00:00, 248.20it/s]
100%|██████████| 160/160 [00:00<00:00, 244.18it/s]
average deviation min_exec ... warmup_time x_name fct
157 0.000028 0.000028 0.000013 ... 0.000057 39260 ddot_array_16_sse
158 0.000015 0.000003 0.000013 ... 0.000037 39510 ddot_array_16_sse
159 0.000018 0.000007 0.000014 ... 0.000037 39760 ddot_array_16_sse
[3 rows x 11 columns]
ddot_omp
0%| | 0/160 [00:00<?, ?it/s]
1%| | 1/160 [00:00<00:36, 4.42it/s]
11%|█▏ | 18/160 [00:00<00:02, 67.86it/s]
28%|██▊ | 44/160 [00:00<00:00, 134.81it/s]
44%|████▍ | 70/160 [00:00<00:00, 175.43it/s]
58%|█████▊ | 93/160 [00:00<00:00, 191.79it/s]
72%|███████▏ | 115/160 [00:00<00:00, 145.18it/s]
83%|████████▎ | 133/160 [00:01<00:00, 95.75it/s]
92%|█████████▏| 147/160 [00:01<00:00, 102.21it/s]
100%|██████████| 160/160 [00:01<00:00, 111.98it/s]
average deviation min_exec ... warmup_time x_name fct
157 0.000009 8.693929e-07 0.000009 ... 0.000025 39260 ddot_omp
158 0.000014 2.401780e-06 0.000009 ... 0.000021 39510 ddot_omp
159 0.000017 1.338579e-06 0.000015 ... 0.000025 39760 ddot_omp
[3 rows x 11 columns]
ddot_omp_static
0%| | 0/160 [00:00<?, ?it/s]
1%| | 1/160 [00:00<00:44, 3.60it/s]
12%|█▎ | 20/160 [00:00<00:02, 66.43it/s]
28%|██▊ | 45/160 [00:00<00:00, 124.99it/s]
39%|███▉ | 63/160 [00:00<00:00, 141.84it/s]
51%|█████ | 81/160 [00:00<00:00, 131.34it/s]
65%|██████▌ | 104/160 [00:00<00:00, 156.27it/s]
79%|███████▉ | 126/160 [00:00<00:00, 171.99it/s]
91%|█████████ | 145/160 [00:01<00:00, 174.57it/s]
100%|██████████| 160/160 [00:01<00:00, 136.22it/s]
average deviation min_exec ... warmup_time x_name fct
157 0.000012 0.000004 0.000010 ... 0.000037 39260 ddot_omp_static
158 0.000019 0.000011 0.000010 ... 0.000077 39510 ddot_omp_static
159 0.000011 0.000003 0.000009 ... 0.000055 39760 ddot_omp_static
[3 rows x 11 columns]
ddot_omp_dyn
0%| | 0/160 [00:00<?, ?it/s]
1%| | 1/160 [00:00<00:38, 4.13it/s]
6%|▌ | 9/160 [00:00<00:04, 32.07it/s]
9%|▉ | 15/160 [00:00<00:03, 41.47it/s]
14%|█▍ | 23/160 [00:00<00:02, 52.42it/s]
20%|██ | 32/160 [00:00<00:02, 62.06it/s]
25%|██▌ | 40/160 [00:00<00:01, 67.04it/s]
30%|███ | 48/160 [00:00<00:01, 69.05it/s]
35%|███▌ | 56/160 [00:01<00:01, 67.07it/s]
39%|███▉ | 63/160 [00:01<00:01, 63.45it/s]
44%|████▍ | 70/160 [00:01<00:01, 54.68it/s]
48%|████▊ | 76/160 [00:01<00:01, 46.73it/s]
51%|█████▏ | 82/160 [00:01<00:01, 42.18it/s]
54%|█████▍ | 87/160 [00:01<00:01, 39.42it/s]
57%|█████▊ | 92/160 [00:01<00:01, 39.91it/s]
61%|██████ | 97/160 [00:02<00:01, 41.61it/s]
64%|██████▍ | 102/160 [00:02<00:01, 43.07it/s]
67%|██████▋ | 107/160 [00:02<00:01, 44.60it/s]
70%|███████ | 112/160 [00:02<00:01, 44.56it/s]
73%|███████▎ | 117/160 [00:02<00:00, 44.17it/s]
76%|███████▋ | 122/160 [00:02<00:00, 43.41it/s]
79%|███████▉ | 127/160 [00:02<00:00, 43.16it/s]
82%|████████▎ | 132/160 [00:02<00:00, 39.70it/s]
86%|████████▌ | 137/160 [00:03<00:00, 37.03it/s]
88%|████████▊ | 141/160 [00:03<00:00, 37.53it/s]
91%|█████████ | 145/160 [00:03<00:00, 33.99it/s]
93%|█████████▎| 149/160 [00:03<00:00, 33.54it/s]
96%|█████████▌| 153/160 [00:03<00:00, 34.82it/s]
98%|█████████▊| 157/160 [00:03<00:00, 33.30it/s]
100%|██████████| 160/160 [00:03<00:00, 42.88it/s]
average deviation min_exec ... warmup_time x_name fct
157 0.000066 0.000014 0.000052 ... 0.000069 39260 ddot_omp_dyn
158 0.000064 0.000009 0.000050 ... 0.000112 39510 ddot_omp_dyn
159 0.000070 0.000021 0.000053 ... 0.000349 39760 ddot_omp_dyn
[3 rows x 11 columns]
ddot_omp_cpp
0%| | 0/160 [00:00<?, ?it/s]
1%| | 1/160 [00:00<00:29, 5.34it/s]
25%|██▌ | 40/160 [00:00<00:00, 170.03it/s]
46%|████▋ | 74/160 [00:00<00:00, 233.26it/s]
64%|██████▍ | 102/160 [00:00<00:00, 226.67it/s]
80%|████████ | 128/160 [00:00<00:00, 229.42it/s]
96%|█████████▋| 154/160 [00:00<00:00, 238.21it/s]
100%|██████████| 160/160 [00:00<00:00, 211.32it/s]
average deviation min_exec ... warmup_time x_name fct
157 0.000008 0.000001 0.000006 ... 0.000020 39260 ddot_omp_cpp
158 0.000009 0.000003 0.000007 ... 0.000032 39510 ddot_omp_cpp
159 0.000008 0.000002 0.000007 ... 0.000018 39760 ddot_omp_cpp
[3 rows x 11 columns]
ddot_omp_cpp_16
0%| | 0/160 [00:00<?, ?it/s]
1%|▏ | 2/160 [00:00<00:19, 8.27it/s]
28%|██▊ | 45/160 [00:00<00:00, 163.83it/s]
45%|████▌ | 72/160 [00:00<00:00, 199.44it/s]
66%|██████▌ | 105/160 [00:00<00:00, 241.28it/s]
85%|████████▌ | 136/160 [00:00<00:00, 262.45it/s]
100%|██████████| 160/160 [00:00<00:00, 215.61it/s]
average deviation min_exec ... warmup_time x_name fct
157 0.000007 3.779736e-07 0.000006 ... 0.000028 39260 ddot_omp_cpp_16
158 0.000007 8.829744e-07 0.000006 ... 0.000028 39510 ddot_omp_cpp_16
159 0.000007 6.191916e-07 0.000006 ... 0.000237 39760 ddot_omp_cpp_16
[3 rows x 11 columns]
Let’s display the results¶
cc = concat(dfs)
cc["N"] = cc["x_name"]
fig, ax = plt.subplots(2, 2, figsize=(10, 10))
cc[cc.N <= 1000].pivot(index="N", columns="fct", values="average").plot(
logy=True, ax=ax[0, 0]
)
cc.pivot(index="N", columns="fct", values="average").plot(logy=True, ax=ax[0, 1])
cc.pivot(index="N", columns="fct", values="average").plot(
logy=True, logx=True, ax=ax[1, 1]
)
cc[
(
(cc.fct.str.contains("omp") | (cc.fct == "ddot_array"))
& ~cc.fct.str.contains("dyn")
)
].pivot(index="N", columns="fct", values="average").plot(logy=True, ax=ax[1, 0])
ax[0, 0].set_title("Comparison of cython ddot implementations")
ax[0, 1].set_title("Comparison of cython ddot implementations\nwithout dot_product")
Text(0.5, 1.0, 'Comparison of cython ddot implementations\nwithout dot_product')
Total running time of the script: (0 minutes 14.336 seconds)