Note
Go to the end to download the full example code
Compares dot implementations (numpy, c++, sse, openmp)¶
NumPy has a very fast implementation of the dot product. It is difficult to do better and very easy to do worse. This example looks at several alternative implementations written with Cython, most of which are slower. The tested functions are the following:
import numpy
import matplotlib.pyplot as plt
from pandas import DataFrame, concat
from teachcompute.validation.cython.dot_cython import ddot_array_16_sse, ddot_array
from teachcompute.validation.cython.dot_cython_omp import (
ddot_cython_array_omp,
ddot_array_openmp,
get_omp_max_threads,
ddot_array_openmp_16,
)
from teachcompute.ext_test_case import measure_time_dim, unit_test_going
def get_vectors(fct, n, h=250, dtype=numpy.float64):
    """Build one benchmark context per vector size.

    Sizes run over ``range(10, n, h)``. For each size, two random vectors
    are drawn with ``numpy.random.randn`` and cast to *dtype*.

    :param fct: object stored unchanged under the ``"dot"`` key
    :param n: exclusive upper bound of the sizes
    :param h: step between two consecutive sizes
    :param dtype: dtype the random vectors are cast to
    :return: list of dictionaries with keys ``va``, ``vb``, ``dot``
        and ``x_name`` (the vector size)
    """
    contexts = []
    for size in range(10, n, h):
        # Draw ``va`` before ``vb`` so the random stream is consumed
        # in a fixed order.
        left = numpy.random.randn(size).astype(dtype)
        right = numpy.random.randn(size).astype(dtype)
        contexts.append({"va": left, "vb": right, "dot": fct, "x_name": size})
    return contexts
Number of threads¶
print(get_omp_max_threads())
8
Several cython dot¶
def numpy_dot(va, vb):
    """Return the dot product of *va* and *vb* computed by ``numpy.dot``."""
    product = numpy.dot(va, vb)
    return product
def ddot_omp(va, vb):
    """Forward *va* and *vb* to ``ddot_cython_array_omp`` with its default arguments."""
    product = ddot_cython_array_omp(va, vb)
    return product
def ddot_omp_static(va, vb):
    """Forward *va* and *vb* to ``ddot_cython_array_omp`` with ``schedule=1``.

    NOTE(review): the wrapper name suggests ``schedule=1`` selects a static
    schedule -- confirm against ``dot_cython_omp``.
    """
    product = ddot_cython_array_omp(va, vb, schedule=1)
    return product
def ddot_omp_dyn(va, vb):
    """Forward *va* and *vb* to ``ddot_cython_array_omp`` with ``schedule=2``.

    NOTE(review): the wrapper name suggests ``schedule=2`` selects a dynamic
    schedule -- confirm against ``dot_cython_omp``.
    """
    product = ddot_cython_array_omp(va, vb, schedule=2)
    return product
def ddot_omp_cpp(va, vb):
    """Forward *va* and *vb* to ``ddot_array_openmp`` and return its result."""
    product = ddot_array_openmp(va, vb)
    return product
def ddot_omp_cpp_16(va, vb):
    """Forward *va* and *vb* to ``ddot_array_openmp_16`` and return its result."""
    product = ddot_array_openmp_16(va, vb)
    return product
# Time every implementation on the same range of vector sizes and keep
# one DataFrame of measurements per implementation in ``dfs``.
dfs = []
benchmarked = (
    numpy_dot,
    ddot_array,
    ddot_array_16_sse,
    ddot_omp,
    ddot_omp_static,
    ddot_omp_dyn,
    ddot_omp_cpp,
    ddot_omp_cpp_16,
)
for fct in benchmarked:
    # A much smaller upper bound is used when ``unit_test_going()`` is true.
    if unit_test_going():
        upper = 400
    else:
        upper = 40000
    ctxs = get_vectors(fct, upper)
    label = fct.__name__
    print(label)
    # The timed statement refers to the keys of each context built by
    # ``get_vectors`` (``dot``, ``va``, ``vb``).
    timings = measure_time_dim("dot(va, vb)", ctxs, verbose=1)
    df = DataFrame(list(timings))
    df["fct"] = label
    dfs.append(df)
    print(df.tail(n=3))
numpy_dot
0%| | 0/160 [00:00<?, ?it/s]
32%|███▎ | 52/160 [00:00<00:00, 510.92it/s]
65%|██████▌ | 104/160 [00:00<00:00, 229.71it/s]
84%|████████▍ | 135/160 [00:00<00:00, 195.89it/s]
99%|█████████▉| 159/160 [00:00<00:00, 152.11it/s]
100%|██████████| 160/160 [00:00<00:00, 182.06it/s]
average deviation min_exec ... warmup_time x_name fct
157 0.000007 6.590977e-07 0.000007 ... 0.000044 39260 numpy_dot
158 0.000024 4.850721e-05 0.000007 ... 0.000048 39510 numpy_dot
159 0.000008 1.848197e-06 0.000007 ... 0.000057 39760 numpy_dot
[3 rows x 11 columns]
ddot_array
0%| | 0/160 [00:00<?, ?it/s]
22%|██▏ | 35/160 [00:00<00:00, 337.07it/s]
43%|████▎ | 69/160 [00:00<00:00, 170.93it/s]
57%|█████▋ | 91/160 [00:00<00:00, 118.66it/s]
67%|██████▋ | 107/160 [00:00<00:00, 93.02it/s]
74%|███████▍ | 119/160 [00:01<00:00, 78.74it/s]
81%|████████ | 129/160 [00:01<00:00, 69.82it/s]
86%|████████▌ | 137/160 [00:01<00:00, 64.98it/s]
90%|█████████ | 144/160 [00:01<00:00, 61.02it/s]
94%|█████████▍| 151/160 [00:01<00:00, 56.73it/s]
98%|█████████▊| 157/160 [00:01<00:00, 53.92it/s]
100%|██████████| 160/160 [00:02<00:00, 78.53it/s]
average deviation min_exec ... warmup_time x_name fct
157 0.000046 0.000004 0.000042 ... 0.000057 39260 ddot_array
158 0.000045 0.000003 0.000042 ... 0.000059 39510 ddot_array
159 0.000047 0.000006 0.000042 ... 0.000155 39760 ddot_array
[3 rows x 11 columns]
ddot_array_16_sse
0%| | 0/160 [00:00<?, ?it/s]
38%|███▊ | 60/160 [00:00<00:00, 583.63it/s]
74%|███████▍ | 119/160 [00:00<00:00, 286.71it/s]
98%|█████████▊| 156/160 [00:00<00:00, 213.79it/s]
100%|██████████| 160/160 [00:00<00:00, 235.80it/s]
average deviation min_exec ... warmup_time x_name fct
157 0.000020 5.677125e-06 0.000016 ... 0.000047 39260 ddot_array_16_sse
158 0.000017 9.999452e-07 0.000016 ... 0.000069 39510 ddot_array_16_sse
159 0.000018 2.049265e-06 0.000016 ... 0.000052 39760 ddot_array_16_sse
[3 rows x 11 columns]
ddot_omp
0%| | 0/160 [00:00<?, ?it/s]
2%|▎ | 4/160 [00:00<00:04, 34.93it/s]
5%|▌ | 8/160 [00:00<00:14, 10.39it/s]
21%|██ | 33/160 [00:00<00:02, 53.18it/s]
34%|███▍ | 55/160 [00:00<00:01, 87.07it/s]
45%|████▌ | 72/160 [00:01<00:00, 102.14it/s]
55%|█████▌ | 88/160 [00:01<00:00, 102.59it/s]
66%|██████▌ | 105/160 [00:01<00:00, 117.07it/s]
75%|███████▌ | 120/160 [00:01<00:00, 120.38it/s]
84%|████████▍ | 135/160 [00:01<00:00, 122.95it/s]
93%|█████████▎| 149/160 [00:01<00:00, 106.96it/s]
100%|██████████| 160/160 [00:01<00:00, 90.12it/s]
average deviation min_exec ... warmup_time x_name fct
157 0.000013 2.520171e-06 0.000011 ... 0.000093 39260 ddot_omp
158 0.000011 3.950575e-07 0.000011 ... 0.000039 39510 ddot_omp
159 0.000022 2.188830e-05 0.000011 ... 0.000038 39760 ddot_omp
[3 rows x 11 columns]
ddot_omp_static
0%| | 0/160 [00:00<?, ?it/s]
18%|█▊ | 28/160 [00:00<00:00, 279.22it/s]
35%|███▌ | 56/160 [00:00<00:00, 249.77it/s]
51%|█████▏ | 82/160 [00:00<00:00, 217.08it/s]
66%|██████▌ | 105/160 [00:00<00:00, 185.71it/s]
78%|███████▊ | 125/160 [00:00<00:00, 162.75it/s]
89%|████████▉ | 142/160 [00:00<00:00, 132.89it/s]
98%|█████████▊| 157/160 [00:01<00:00, 123.51it/s]
100%|██████████| 160/160 [00:01<00:00, 152.78it/s]
average deviation min_exec ... warmup_time x_name fct
157 0.000014 7.931527e-07 0.000013 ... 0.000052 39260 ddot_omp_static
158 0.000033 4.043829e-05 0.000014 ... 0.000057 39510 ddot_omp_static
159 0.000021 1.071811e-05 0.000015 ... 0.000049 39760 ddot_omp_static
[3 rows x 11 columns]
ddot_omp_dyn
0%| | 0/160 [00:00<?, ?it/s]
14%|█▍ | 23/160 [00:00<00:00, 228.00it/s]
29%|██▉ | 46/160 [00:00<00:00, 139.38it/s]
39%|███▉ | 62/160 [00:00<00:00, 117.99it/s]
47%|████▋ | 75/160 [00:00<00:00, 97.51it/s]
54%|█████▍ | 86/160 [00:00<00:00, 87.18it/s]
60%|██████ | 96/160 [00:00<00:00, 80.16it/s]
66%|██████▌ | 105/160 [00:01<00:00, 74.94it/s]
71%|███████ | 113/160 [00:01<00:00, 69.07it/s]
75%|███████▌ | 120/160 [00:01<00:00, 65.53it/s]
79%|███████▉ | 127/160 [00:01<00:00, 60.60it/s]
84%|████████▍ | 134/160 [00:01<00:00, 37.46it/s]
87%|████████▋ | 139/160 [00:02<00:00, 38.99it/s]
90%|█████████ | 144/160 [00:02<00:00, 39.66it/s]
93%|█████████▎| 149/160 [00:02<00:00, 39.51it/s]
96%|█████████▋| 154/160 [00:02<00:00, 37.98it/s]
99%|█████████▉| 159/160 [00:02<00:00, 38.43it/s]
100%|██████████| 160/160 [00:02<00:00, 61.84it/s]
average deviation min_exec ... warmup_time x_name fct
157 0.000056 0.000029 0.000036 ... 0.000051 39260 ddot_omp_dyn
158 0.000043 0.000013 0.000037 ... 0.000066 39510 ddot_omp_dyn
159 0.000049 0.000024 0.000036 ... 0.004085 39760 ddot_omp_dyn
[3 rows x 11 columns]
ddot_omp_cpp
0%| | 0/160 [00:00<?, ?it/s]
19%|█▉ | 30/160 [00:00<00:00, 299.87it/s]
38%|███▊ | 60/160 [00:00<00:00, 248.61it/s]
54%|█████▍ | 86/160 [00:00<00:00, 248.97it/s]
70%|███████ | 112/160 [00:00<00:00, 177.22it/s]
82%|████████▎ | 132/160 [00:00<00:00, 174.44it/s]
94%|█████████▍| 151/160 [00:00<00:00, 166.51it/s]
100%|██████████| 160/160 [00:00<00:00, 184.60it/s]
average deviation min_exec ... warmup_time x_name fct
157 0.000014 8.102669e-06 0.000010 ... 0.000044 39260 ddot_omp_cpp
158 0.000010 4.264122e-07 0.000010 ... 0.000038 39510 ddot_omp_cpp
159 0.000020 1.931403e-05 0.000009 ... 0.000036 39760 ddot_omp_cpp
[3 rows x 11 columns]
ddot_omp_cpp_16
0%| | 0/160 [00:00<?, ?it/s]
26%|██▋ | 42/160 [00:00<00:00, 415.80it/s]
52%|█████▎ | 84/160 [00:00<00:00, 296.15it/s]
72%|███████▎ | 116/160 [00:00<00:00, 255.17it/s]
89%|████████▉ | 143/160 [00:00<00:00, 231.46it/s]
100%|██████████| 160/160 [00:00<00:00, 228.03it/s]
average deviation min_exec ... warmup_time x_name fct
157 0.000012 7.478398e-06 0.000008 ... 0.000087 39260 ddot_omp_cpp_16
158 0.000009 5.406901e-07 0.000008 ... 0.000047 39510 ddot_omp_cpp_16
159 0.000009 1.903409e-06 0.000008 ... 0.000033 39760 ddot_omp_cpp_16
[3 rows x 11 columns]
Let’s display the results¶
# Stack the per-implementation measurements and expose the vector size
# under the shorter column name ``N``.
cc = concat(dfs)
cc["N"] = cc["x_name"]
fig, ax = plt.subplots(2, 2, figsize=(10, 10))

# Top-left: average time for sizes up to 1000 only.
small_sizes = cc[cc.N <= 1000]
small_sizes.pivot(index="N", columns="fct", values="average").plot(
    logy=True, ax=ax[0, 0]
)

# Right column: every implementation over the full range of sizes,
# first with a log y-axis (top), then with both axes in log scale (bottom).
averages = cc.pivot(index="N", columns="fct", values="average")
averages.plot(logy=True, ax=ax[0, 1])
averages.plot(logy=True, logx=True, ax=ax[1, 1])

# Bottom-left: functions whose name contains "omp", plus "ddot_array",
# excluding those whose name contains "dyn".
is_omp = cc.fct.str.contains("omp")
is_plain_cython = cc.fct == "ddot_array"
is_dynamic = cc.fct.str.contains("dyn")
selection = cc[(is_omp | is_plain_cython) & ~is_dynamic]
selection.pivot(index="N", columns="fct", values="average").plot(
    logy=True, ax=ax[1, 0]
)

ax[0, 0].set_title("Comparison of cython ddot implementations")
ax[0, 1].set_title("Comparison of cython ddot implementations\nwithout dot_product")
Text(0.5, 1.0, 'Comparison of cython ddot implementations\nwithout dot_product')
Total running time of the script: (0 minutes 12.893 seconds)