Note
Go to the end to download the full example code.
Compares dot implementations (numpy, cython, c++, sse)¶
numpy has a very fast implementation of the dot product. It is hard to beat and very easy to be slower. This example looks into a couple of slower implementations written with cython. The tested functions are the following:
import numpy
import matplotlib.pyplot as plt
from pandas import DataFrame, concat
from teachcompute.validation.cython.dot_cython import (
dot_product,
ddot_cython_array,
ddot_cython_array_optim,
ddot_array,
ddot_array_16,
ddot_array_16_sse,
)
from teachcompute.validation.cython.dot_cython import (
sdot_cython_array,
sdot_cython_array_optim,
sdot_array,
sdot_array_16,
sdot_array_16_sse,
)
from teachcompute.ext_test_case import measure_time_dim
def get_vectors(fct, n, h=100, dtype=numpy.float64):
    """Build one benchmark context per vector size.

    A context is created for every size in ``range(10, n, h)``; each one
    holds two freshly drawn random vectors of that size and the function
    to benchmark.

    :param fct: object stored as is under the ``"dot"`` key of every context
    :param n: exclusive upper bound of the vector sizes
    :param h: step between two consecutive sizes
    :param dtype: dtype the random vectors are cast to
    :return: list of dictionaries with keys ``va``, ``vb``, ``dot`` and
        ``x_name`` (the vector size); empty when ``n <= 10``
    """
    # The loop variable is deliberately not called ``n``: reusing the
    # parameter name inside the comprehension works but hides the bound.
    return [
        dict(
            va=numpy.random.randn(size).astype(dtype),
            vb=numpy.random.randn(size).astype(dtype),
            dot=fct,
            x_name=size,
        )
        for size in range(10, n, h)
    ]
numpy dot¶
0%| | 0/100 [00:00<?, ?it/s]
73%|███████▎ | 73/100 [00:00<00:00, 729.74it/s]
100%|██████████| 100/100 [00:00<00:00, 697.86it/s]
average deviation min_exec ... warmup_time x_name fct
97 0.000004 5.619979e-07 0.000003 ... 0.000014 9710 numpy.dot
98 0.000004 4.847581e-07 0.000003 ... 0.000012 9810 numpy.dot
99 0.000004 1.200792e-06 0.000003 ... 0.000011 9910 numpy.dot
[3 rows x 11 columns]
Several cython dot¶
# Benchmark every double-precision implementation and collect one timing
# table per function.
# NOTE(review): ``dfs`` is appended to but not created in this block; it
# must already exist (presumably initialised by the numpy.dot section,
# whose code is not visible here) -- confirm.
for fct in [
    dot_product,
    ddot_cython_array,
    ddot_cython_array_optim,
    ddot_array,
    ddot_array_16,
    ddot_array_16_sse,
]:
    # dot_product is only measured on sizes below 1000 (the recorded
    # timings show it is far slower); every other function goes up to 10000.
    upper = 1000 if fct.__name__ == "dot_product" else 10000
    ctxs = get_vectors(fct, upper)
    df = DataFrame(list(measure_time_dim("dot(va, vb)", ctxs, verbose=1)))
    # Tag the rows with the function name so the tables can be pivoted later.
    df["fct"] = fct.__name__
    dfs.append(df)
    print(df.tail(n=3))
0%| | 0/10 [00:00<?, ?it/s]
50%|█████ | 5/10 [00:00<00:00, 37.43it/s]
90%|█████████ | 9/10 [00:00<00:00, 15.98it/s]
100%|██████████| 10/10 [00:00<00:00, 15.53it/s]
average deviation min_exec ... warmup_time x_name fct
7 0.000226 0.000061 0.000151 ... 0.000223 710 dot_product
8 0.000228 0.000042 0.000173 ... 0.000237 810 dot_product
9 0.000266 0.000056 0.000191 ... 0.000199 910 dot_product
[3 rows x 11 columns]
0%| | 0/100 [00:00<?, ?it/s]
44%|████▍ | 44/100 [00:00<00:00, 431.56it/s]
88%|████████▊ | 88/100 [00:00<00:00, 283.18it/s]
100%|██████████| 100/100 [00:00<00:00, 275.06it/s]
average deviation min_exec ... warmup_time x_name fct
97 0.000012 3.422998e-06 0.000009 ... 0.000020 9710 ddot_cython_array
98 0.000010 1.483251e-06 0.000009 ... 0.000022 9810 ddot_cython_array
99 0.000009 8.385466e-07 0.000009 ... 0.000021 9910 ddot_cython_array
[3 rows x 11 columns]
0%| | 0/100 [00:00<?, ?it/s]
57%|█████▋ | 57/100 [00:00<00:00, 561.78it/s]
100%|██████████| 100/100 [00:00<00:00, 342.23it/s]
average deviation ... x_name fct
97 0.000009 1.964734e-07 ... 9710 ddot_cython_array_optim
98 0.000011 2.940348e-06 ... 9810 ddot_cython_array_optim
99 0.000010 1.142207e-06 ... 9910 ddot_cython_array_optim
[3 rows x 11 columns]
0%| | 0/100 [00:00<?, ?it/s]
50%|█████ | 50/100 [00:00<00:00, 492.02it/s]
100%|██████████| 100/100 [00:00<00:00, 288.84it/s]
100%|██████████| 100/100 [00:00<00:00, 307.58it/s]
average deviation min_exec ... warmup_time x_name fct
97 0.000017 0.000009 0.000012 ... 0.000018 9710 ddot_array
98 0.000014 0.000001 0.000012 ... 0.000034 9810 ddot_array
99 0.000011 0.000001 0.000010 ... 0.000027 9910 ddot_array
[3 rows x 11 columns]
0%| | 0/100 [00:00<?, ?it/s]
67%|██████▋ | 67/100 [00:00<00:00, 664.74it/s]
100%|██████████| 100/100 [00:00<00:00, 509.76it/s]
average deviation min_exec ... warmup_time x_name fct
97 0.000007 2.034337e-06 0.000006 ... 0.000017 9710 ddot_array_16
98 0.000008 8.386058e-07 0.000007 ... 0.000021 9810 ddot_array_16
99 0.000009 3.665782e-07 0.000008 ... 0.000024 9910 ddot_array_16
[3 rows x 11 columns]
0%| | 0/100 [00:00<?, ?it/s]
82%|████████▏ | 82/100 [00:00<00:00, 795.20it/s]
100%|██████████| 100/100 [00:00<00:00, 693.75it/s]
average deviation min_exec ... warmup_time x_name fct
97 0.000003 2.172306e-07 0.000003 ... 0.000014 9710 ddot_array_16_sse
98 0.000004 6.489630e-07 0.000003 ... 0.000015 9810 ddot_array_16_sse
99 0.000003 1.767073e-07 0.000003 ... 0.000023 9910 ddot_array_16_sse
[3 rows x 11 columns]
Let’s display the results¶
# Stack the per-function timing tables and expose the vector size as ``N``.
cc = concat(dfs)
cc["N"] = cc["x_name"]
fig, ax = plt.subplots(2, 2, figsize=(10, 10))

# Top-left: every function, restricted to N <= 1100, on log-log axes.
cc[cc.N <= 1100].pivot(index="N", columns="fct", values="average").plot(
    logy=True, logx=True, ax=ax[0, 0]
)

# Both right-hand plots show the same table (everything but dot_product),
# so it is pivoted once and drawn twice: log-y on top, log-log below.
without_dot_product = cc[cc.fct != "dot_product"].pivot(
    index="N", columns="fct", values="average"
)
without_dot_product.plot(logy=True, ax=ax[0, 1])
without_dot_product.plot(logy=True, logx=True, ax=ax[1, 1])

# numpy is faster but we are able to catch up.
ax[0, 0].set_title("Comparison of cython ddot implementations")
ax[0, 1].set_title("Comparison of cython ddot implementations\nwithout dot_product")
Text(0.5, 1.0, 'Comparison of cython ddot implementations\nwithout dot_product')
Same for floats¶
Let’s do the same for single-precision floats.
# Benchmark numpy.dot and every single-precision implementation, starting
# from a fresh list of timing tables.
dfs = []
for fct in [
    numpy.dot,
    sdot_cython_array,
    sdot_cython_array_optim,
    sdot_array,
    sdot_array_16,
    sdot_array_16_sse,
]:
    # No function in this list is named "dot_product", so the upper bound
    # is always 10000; the vectors are cast to float32.
    ctxs = get_vectors(fct, 10000, dtype=numpy.float32)
    df = DataFrame(list(measure_time_dim("dot(va, vb)", ctxs, verbose=1)))
    # Tag the rows with the function name so the tables can be pivoted later.
    df["fct"] = fct.__name__
    dfs.append(df)
    print(df.tail(n=3))
# Stack the per-function timing tables and expose the vector size as ``N``.
cc = concat(dfs)
cc["N"] = cc["x_name"]
fig, ax = plt.subplots(1, 2, figsize=(10, 4))

# Both plots show the same table, so it is pivoted once and drawn twice:
# log-y on the left, log-log on the right.
averages = cc.pivot(index="N", columns="fct", values="average")
averages.plot(logy=True, ax=ax[0])
averages.plot(logy=True, logx=True, ax=ax[1])

ax[0].set_title("Comparison of cython sdot implementations")
ax[1].set_title("Comparison of cython sdot implementations")
0%| | 0/100 [00:00<?, ?it/s]
100%|██████████| 100/100 [00:00<00:00, 1015.88it/s]
average deviation min_exec ... warmup_time x_name fct
97 0.000003 1.160534e-06 0.000002 ... 0.000110 9710 dot
98 0.000002 1.534211e-07 0.000002 ... 0.000015 9810 dot
99 0.000002 4.696813e-07 0.000002 ... 0.000008 9910 dot
[3 rows x 11 columns]
0%| | 0/100 [00:00<?, ?it/s]
54%|█████▍ | 54/100 [00:00<00:00, 530.75it/s]
100%|██████████| 100/100 [00:00<00:00, 336.73it/s]
average deviation min_exec ... warmup_time x_name fct
97 0.000010 2.593688e-06 0.000008 ... 0.000013 9710 sdot_cython_array
98 0.000009 6.537024e-07 0.000008 ... 0.000013 9810 sdot_cython_array
99 0.000009 1.271740e-06 0.000008 ... 0.000014 9910 sdot_cython_array
[3 rows x 11 columns]
0%| | 0/100 [00:00<?, ?it/s]
57%|█████▋ | 57/100 [00:00<00:00, 560.74it/s]
100%|██████████| 100/100 [00:00<00:00, 347.06it/s]
average deviation ... x_name fct
97 0.000009 4.817641e-07 ... 9710 sdot_cython_array_optim
98 0.000009 1.792688e-06 ... 9810 sdot_cython_array_optim
99 0.000009 8.121792e-07 ... 9910 sdot_cython_array_optim
[3 rows x 11 columns]
0%| | 0/100 [00:00<?, ?it/s]
56%|█████▌ | 56/100 [00:00<00:00, 542.80it/s]
100%|██████████| 100/100 [00:00<00:00, 345.13it/s]
average deviation min_exec ... warmup_time x_name fct
97 0.000009 7.199072e-07 0.000008 ... 0.000013 9710 sdot_array
98 0.000009 3.365492e-07 0.000009 ... 0.000017 9810 sdot_array
99 0.000009 1.962220e-07 0.000008 ... 0.000012 9910 sdot_array
[3 rows x 11 columns]
0%| | 0/100 [00:00<?, ?it/s]
70%|███████ | 70/100 [00:00<00:00, 691.94it/s]
100%|██████████| 100/100 [00:00<00:00, 516.70it/s]
average deviation min_exec ... warmup_time x_name fct
97 0.000010 8.293641e-06 0.000007 ... 0.000014 9710 sdot_array_16
98 0.000009 2.818870e-06 0.000007 ... 0.000018 9810 sdot_array_16
99 0.000007 2.819661e-07 0.000007 ... 0.000016 9910 sdot_array_16
[3 rows x 11 columns]
0%| | 0/100 [00:00<?, ?it/s]
83%|████████▎ | 83/100 [00:00<00:00, 828.34it/s]
100%|██████████| 100/100 [00:00<00:00, 797.59it/s]
average deviation min_exec ... warmup_time x_name fct
97 0.000003 2.282910e-07 0.000002 ... 0.000008 9710 sdot_array_16_sse
98 0.000003 2.415674e-07 0.000003 ... 0.000008 9810 sdot_array_16_sse
99 0.000003 8.935738e-09 0.000003 ... 0.000009 9910 sdot_array_16_sse
[3 rows x 11 columns]
Text(0.5, 1.0, 'Comparison of cython sdot implementations')
Total running time of the script: (0 minutes 5.423 seconds)