Note
Go to the end to download the full example code.
Compares dot implementations (numpy, C++, SSE, OpenMP)
numpy has a very fast implementation of the dot product. It is difficult to do better and very easy to do worse. This example looks into a few slower implementations written with Cython. The tested functions are the following:
import numpy
import matplotlib.pyplot as plt
from pandas import DataFrame, concat
from teachcompute.validation.cython.dot_cython import ddot_array_16_sse, ddot_array
from teachcompute.validation.cython.dot_cython_omp import (
ddot_cython_array_omp,
ddot_array_openmp,
get_omp_max_threads,
ddot_array_openmp_16,
)
from teachcompute.ext_test_case import measure_time_dim, unit_test_going
def get_vectors(fct, n, h=250, dtype=numpy.float64):
    """Build one benchmark context per vector size.

    Sizes are taken from ``range(10, n, h)``. For every size, two random
    vectors (standard normal draws cast to *dtype*) are generated.

    :param fct: callable stored unchanged under the key ``"dot"``
    :param n: exclusive upper bound of the vector sizes
    :param h: step between two consecutive sizes
    :param dtype: numpy dtype both vectors are cast to
    :return: list of dictionaries with the keys ``"va"``, ``"vb"``,
        ``"dot"`` and ``"x_name"`` (the vector size)
    """
    contexts = []
    for size in range(10, n, h):
        # Draw ``va`` before ``vb`` so the random stream is consumed in a
        # fixed order for every size.
        left = numpy.random.randn(size).astype(dtype)
        right = numpy.random.randn(size).astype(dtype)
        contexts.append({"va": left, "vb": right, "dot": fct, "x_name": size})
    return contexts
Number of threads
# Print the value returned by get_omp_max_threads() (presumably the maximum
# number of threads OpenMP may use on this machine; confirm in dot_cython_omp).
print(get_omp_max_threads())
10
Several Cython dot implementations
def numpy_dot(va, vb):
    """Return the dot product of *va* and *vb* computed by ``numpy.dot``.

    Used as the reference implementation in the benchmark.
    """
    product = numpy.dot(va, vb)
    return product
def ddot_omp(va, vb):
    """Return ``ddot_cython_array_omp(va, vb)`` with its default schedule."""
    product = ddot_cython_array_omp(va, vb)
    return product
def ddot_omp_static(va, vb):
    """Return ``ddot_cython_array_omp(va, vb, schedule=1)``.

    NOTE(review): ``schedule=1`` presumably selects the static OpenMP
    schedule, as the function name suggests; confirm in dot_cython_omp.
    """
    product = ddot_cython_array_omp(va, vb, schedule=1)
    return product
def ddot_omp_dyn(va, vb):
    """Return ``ddot_cython_array_omp(va, vb, schedule=2)``.

    NOTE(review): ``schedule=2`` presumably selects the dynamic OpenMP
    schedule, as the function name suggests; confirm in dot_cython_omp.
    """
    product = ddot_cython_array_omp(va, vb, schedule=2)
    return product
def ddot_omp_cpp(va, vb):
    """Return the result of ``ddot_array_openmp(va, vb)``."""
    product = ddot_array_openmp(va, vb)
    return product
def ddot_omp_cpp_16(va, vb):
    """Return the result of ``ddot_array_openmp_16(va, vb)``."""
    product = ddot_array_openmp_16(va, vb)
    return product
# Time every implementation over the same range of vector sizes and keep
# one DataFrame of measurements per implementation in ``dfs``.
implementations = [
    numpy_dot,
    ddot_array,
    ddot_array_16_sse,
    ddot_omp,
    ddot_omp_static,
    ddot_omp_dyn,
    ddot_omp_cpp,
    ddot_omp_cpp_16,
]
dfs = []
for fct in implementations:
    # A much smaller upper bound is used when unit_test_going() is true
    # (presumably to keep test runs short).
    upper_bound = 400 if unit_test_going() else 40000
    ctxs = get_vectors(fct, upper_bound)
    print(fct.__name__)
    # "dot(va, vb)" is the statement being timed; each context dictionary
    # provides the names ``dot``, ``va`` and ``vb``.
    measures = list(measure_time_dim("dot(va, vb)", ctxs, verbose=1))
    df = DataFrame(measures)
    df["fct"] = fct.__name__
    dfs.append(df)
    print(df.tail(n=3))
numpy_dot
0%| | 0/160 [00:00<?, ?it/s]
26%|██▋ | 42/160 [00:00<00:00, 378.41it/s]
50%|█████ | 80/160 [00:00<00:00, 370.65it/s]
74%|███████▍ | 118/160 [00:00<00:00, 187.71it/s]
90%|█████████ | 144/160 [00:00<00:00, 187.30it/s]
100%|██████████| 160/160 [00:00<00:00, 211.50it/s]
average deviation min_exec max_exec repeat number ttime context_size warmup_time x_name fct
157 0.000008 0.000008 0.000004 0.000031 10 50 0.000076 184 0.000030 39260 numpy_dot
158 0.000010 0.000013 0.000004 0.000049 10 50 0.000098 184 0.000999 39510 numpy_dot
159 0.000009 0.000012 0.000004 0.000043 10 50 0.000089 184 0.000018 39760 numpy_dot
ddot_array
0%| | 0/160 [00:00<?, ?it/s]
27%|██▋ | 43/160 [00:00<00:00, 415.78it/s]
53%|█████▎ | 85/160 [00:00<00:00, 216.85it/s]
70%|███████ | 112/160 [00:00<00:00, 151.22it/s]
82%|████████▏ | 131/160 [00:00<00:00, 125.67it/s]
91%|█████████▏| 146/160 [00:01<00:00, 106.55it/s]
99%|█████████▉| 158/160 [00:01<00:00, 90.74it/s]
100%|██████████| 160/160 [00:01<00:00, 120.92it/s]
average deviation min_exec max_exec repeat number ttime context_size warmup_time x_name fct
157 0.000034 0.000007 0.000027 0.000051 10 50 0.000344 184 0.000048 39260 ddot_array
158 0.000036 0.000005 0.000029 0.000044 10 50 0.000355 184 0.000044 39510 ddot_array
159 0.000030 0.000003 0.000027 0.000034 10 50 0.000299 184 0.000054 39760 ddot_array
ddot_array_16_sse
0%| | 0/160 [00:00<?, ?it/s]
46%|████▌ | 73/160 [00:00<00:00, 714.43it/s]
91%|█████████ | 145/160 [00:00<00:00, 349.69it/s]
100%|██████████| 160/160 [00:00<00:00, 329.90it/s]
average deviation min_exec max_exec repeat number ttime context_size warmup_time x_name fct
157 0.000010 4.870777e-07 0.000009 0.000011 10 50 0.000101 184 0.000055 39260 ddot_array_16_sse
158 0.000011 1.692256e-06 0.000009 0.000015 10 50 0.000105 184 0.000035 39510 ddot_array_16_sse
159 0.000010 1.071967e-06 0.000009 0.000013 10 50 0.000101 184 0.000038 39760 ddot_array_16_sse
ddot_omp
0%| | 0/160 [00:00<?, ?it/s]
5%|▌ | 8/160 [00:00<00:01, 78.09it/s]
19%|█▉ | 31/160 [00:00<00:00, 164.12it/s]
36%|███▌ | 57/160 [00:00<00:00, 206.69it/s]
49%|████▉ | 78/160 [00:00<00:00, 203.36it/s]
62%|██████▏ | 99/160 [00:00<00:00, 198.76it/s]
74%|███████▍ | 119/160 [00:00<00:00, 189.97it/s]
87%|████████▋ | 139/160 [00:00<00:00, 179.03it/s]
99%|█████████▉| 158/160 [00:00<00:00, 169.76it/s]
100%|██████████| 160/160 [00:00<00:00, 177.53it/s]
average deviation min_exec max_exec repeat number ttime context_size warmup_time x_name fct
157 0.000012 0.000001 0.000011 0.000016 10 50 0.000125 184 0.000023 39260 ddot_omp
158 0.000014 0.000002 0.000012 0.000020 10 50 0.000139 184 0.000027 39510 ddot_omp
159 0.000013 0.000001 0.000011 0.000016 10 50 0.000127 184 0.000058 39760 ddot_omp
ddot_omp_static
0%| | 0/160 [00:00<?, ?it/s]
13%|█▎ | 21/160 [00:00<00:00, 203.31it/s]
26%|██▋ | 42/160 [00:00<00:00, 200.00it/s]
41%|████▏ | 66/160 [00:00<00:00, 216.56it/s]
55%|█████▌ | 88/160 [00:00<00:00, 213.98it/s]
69%|██████▉ | 110/160 [00:00<00:00, 211.15it/s]
82%|████████▎ | 132/160 [00:00<00:00, 188.91it/s]
95%|█████████▌| 152/160 [00:00<00:00, 173.66it/s]
100%|██████████| 160/160 [00:00<00:00, 186.85it/s]
average deviation min_exec max_exec repeat number ttime context_size warmup_time x_name fct
157 0.000013 0.000002 0.000011 0.000016 10 50 0.000134 184 0.000028 39260 ddot_omp_static
158 0.000014 0.000002 0.000011 0.000017 10 50 0.000137 184 0.000034 39510 ddot_omp_static
159 0.000015 0.000004 0.000012 0.000024 10 50 0.000155 184 0.000029 39760 ddot_omp_static
ddot_omp_dyn
0%| | 0/160 [00:00<?, ?it/s]
7%|▋ | 11/160 [00:00<00:01, 98.38it/s]
13%|█▎ | 21/160 [00:00<00:01, 71.38it/s]
22%|██▎ | 36/160 [00:00<00:01, 97.36it/s]
29%|██▉ | 47/160 [00:00<00:01, 97.43it/s]
36%|███▋ | 58/160 [00:00<00:01, 86.48it/s]
42%|████▎ | 68/160 [00:00<00:01, 82.03it/s]
48%|████▊ | 77/160 [00:00<00:01, 73.28it/s]
53%|█████▎ | 85/160 [00:01<00:01, 66.51it/s]
57%|█████▊ | 92/160 [00:01<00:01, 65.78it/s]
62%|██████▏ | 99/160 [00:01<00:00, 63.07it/s]
66%|██████▋ | 106/160 [00:01<00:00, 57.64it/s]
70%|███████ | 112/160 [00:01<00:00, 55.20it/s]
74%|███████▍ | 118/160 [00:01<00:00, 52.08it/s]
78%|███████▊ | 124/160 [00:01<00:00, 52.37it/s]
81%|████████▏ | 130/160 [00:01<00:00, 49.08it/s]
84%|████████▍ | 135/160 [00:02<00:00, 47.09it/s]
88%|████████▊ | 140/160 [00:02<00:00, 43.81it/s]
91%|█████████ | 145/160 [00:02<00:00, 40.07it/s]
94%|█████████▍| 150/160 [00:02<00:00, 37.55it/s]
96%|█████████▋| 154/160 [00:02<00:00, 37.78it/s]
99%|█████████▉| 158/160 [00:02<00:00, 36.60it/s]
100%|██████████| 160/160 [00:02<00:00, 56.43it/s]
average deviation min_exec max_exec repeat number ttime context_size warmup_time x_name fct
157 0.000053 1.783173e-06 0.000052 0.000059 10 50 0.000534 184 0.000068 39260 ddot_omp_dyn
158 0.000055 6.390969e-07 0.000054 0.000056 10 50 0.000546 184 0.000083 39510 ddot_omp_dyn
159 0.000054 5.796942e-06 0.000045 0.000066 10 50 0.000544 184 0.000071 39760 ddot_omp_dyn
ddot_omp_cpp
0%| | 0/160 [00:00<?, ?it/s]
10%|█ | 16/160 [00:00<00:00, 158.91it/s]
20%|██ | 32/160 [00:00<00:00, 128.99it/s]
43%|████▎ | 69/160 [00:00<00:00, 225.69it/s]
61%|██████▏ | 98/160 [00:00<00:00, 246.15it/s]
78%|███████▊ | 124/160 [00:00<00:00, 234.27it/s]
93%|█████████▎| 149/160 [00:00<00:00, 213.37it/s]
100%|██████████| 160/160 [00:00<00:00, 209.45it/s]
average deviation min_exec max_exec repeat number ttime context_size warmup_time x_name fct
157 0.00001 6.273831e-07 0.000009 0.000011 10 50 0.000098 184 0.000178 39260 ddot_omp_cpp
158 0.00001 1.323552e-06 0.000009 0.000013 10 50 0.000105 184 0.000018 39510 ddot_omp_cpp
159 0.00001 1.097687e-06 0.000009 0.000013 10 50 0.000103 184 0.000018 39760 ddot_omp_cpp
ddot_omp_cpp_16
0%| | 0/160 [00:00<?, ?it/s]
19%|█▉ | 30/160 [00:00<00:00, 293.18it/s]
38%|███▊ | 60/160 [00:00<00:00, 195.47it/s]
61%|██████ | 97/160 [00:00<00:00, 256.68it/s]
81%|████████ | 129/160 [00:00<00:00, 277.59it/s]
99%|█████████▉| 159/160 [00:00<00:00, 282.12it/s]
100%|██████████| 160/160 [00:00<00:00, 266.90it/s]
average deviation min_exec max_exec repeat number ttime context_size warmup_time x_name fct
157 0.000010 2.392966e-06 0.000008 0.000017 10 50 0.000096 184 0.000042 39260 ddot_omp_cpp_16
158 0.000007 6.434547e-07 0.000006 0.000008 10 50 0.000067 184 0.000021 39510 ddot_omp_cpp_16
159 0.000007 8.910203e-07 0.000006 0.000009 10 50 0.000067 184 0.000037 39760 ddot_omp_cpp_16
Let’s display the results
# Stack the per-implementation measurements; "N" (a copy of "x_name",
# the vector size) is used as the x axis of every plot.
cc = concat(dfs)
cc["N"] = cc["x_name"]
fig, ax = plt.subplots(2, 2, figsize=(10, 10))

# Top-left: every implementation, restricted to sizes N <= 1000.
small_sizes = cc[cc.N <= 1000]
small_sizes.pivot(index="N", columns="fct", values="average").plot(
    logy=True, ax=ax[0, 0]
)

# Right column: every implementation over all sizes, first with a log
# y axis only (top), then with both axes in log scale (bottom).
all_averages = cc.pivot(index="N", columns="fct", values="average")
all_averages.plot(logy=True, ax=ax[0, 1])
all_averages.plot(logy=True, logx=True, ax=ax[1, 1])

# Bottom-left: the "omp" implementations plus ddot_array, leaving out
# the ones whose name contains "dyn".
name_has_omp = cc.fct.str.contains("omp")
is_ddot_array = cc.fct == "ddot_array"
name_has_dyn = cc.fct.str.contains("dyn")
selection = cc[(name_has_omp | is_ddot_array) & ~name_has_dyn]
selection.pivot(index="N", columns="fct", values="average").plot(
    logy=True, ax=ax[1, 0]
)

ax[0, 0].set_title("Comparison of cython ddot implementations")
# NOTE(review): this title says "without dot_product", yet the ax[0, 1]
# plot above contains every implementation; confirm the intended subplot.
ax[0, 1].set_title("Comparison of cython ddot implementations\nwithout dot_product")

Text(0.5, 1.0, 'Comparison of cython ddot implementations\nwithout dot_product')
Total running time of the script: (0 minutes 10.261 seconds)