Compares dot implementations (numpy, cython, c++, sse)

numpy has a very fast implementation of the dot product. It is difficult to be better and very easy to be slower. This example looks into a couple of slower implementations with cython. The tested functions are the following:

import numpy
import matplotlib.pyplot as plt
from pandas import DataFrame, concat
from teachcompute.validation.cython.dot_cython import (
    dot_product,
    ddot_cython_array,
    ddot_cython_array_optim,
    ddot_array,
    ddot_array_16,
    ddot_array_16_sse,
)
from teachcompute.validation.cython.dot_cython import (
    sdot_cython_array,
    sdot_cython_array_optim,
    sdot_array,
    sdot_array_16,
    sdot_array_16_sse,
)
from teachcompute.ext_test_case import measure_time_dim


def get_vectors(fct, n, h=100, dtype=numpy.float64):
    """Build one benchmark context per vector size.

    A context is created for every size in ``range(10, n, h)``. Each one
    holds two freshly drawn random vectors of that size, converted to
    *dtype*, together with the function to benchmark.

    :param fct: function stored under the key ``dot`` in every context
    :param n: exclusive upper bound of the vector sizes
    :param h: step between two consecutive sizes
    :param dtype: element type the random vectors are cast to
    :return: list of dictionaries with the keys ``va``, ``vb``, ``dot``
        and ``x_name`` (the vector size)
    """
    ctxs = []
    for size in range(10, n, h):
        # va is drawn before vb so the random stream is consumed in a
        # fixed order for a given seed.
        va = numpy.random.randn(size).astype(dtype)
        vb = numpy.random.randn(size).astype(dtype)
        ctxs.append(dict(va=va, vb=vb, dot=fct, x_name=size))
    return ctxs

numpy dot

# Reference measurement: numpy.dot on vectors of size 10, 110, ..., 9910.
ctxs = get_vectors(numpy.dot, 10000)
# The statement "dot(va, vb)" is timed once per context; the results are
# tagged with the name of the implementation they belong to.
df = DataFrame(list(measure_time_dim("dot(va, vb)", ctxs, verbose=1))).assign(
    fct="numpy.dot"
)
print(df.tail(n=3))
# dfs collects one DataFrame per benchmarked implementation.
dfs = [df]
  0%|          | 0/100 [00:00<?, ?it/s]
 98%|█████████▊| 98/100 [00:00<00:00, 979.11it/s]
100%|██████████| 100/100 [00:00<00:00, 966.28it/s]
     average     deviation  min_exec  ...  warmup_time  x_name        fct
97  0.000003  6.093308e-08  0.000002  ...     0.000013    9710  numpy.dot
98  0.000003  2.499603e-07  0.000003  ...     0.000020    9810  numpy.dot
99  0.000002  1.052162e-07  0.000002  ...     0.000014    9910  numpy.dot

[3 rows x 11 columns]

Several cython dot

# Run the same measurement for every double precision implementation.
for fct in (
    dot_product,
    ddot_cython_array,
    ddot_cython_array_optim,
    ddot_array,
    ddot_array_16,
    ddot_array_16_sse,
):
    # dot_product is only measured on sizes below 1000, the other
    # implementations go up to 10000.
    size_limit = 1000 if fct.__name__ == "dot_product" else 10000
    ctxs = get_vectors(fct, size_limit)

    timings = list(measure_time_dim("dot(va, vb)", ctxs, verbose=1))
    df = DataFrame(timings)
    df["fct"] = fct.__name__
    dfs.append(df)
    print(df.tail(n=3))
  0%|          | 0/10 [00:00<?, ?it/s]
 50%|█████     | 5/10 [00:00<00:00, 48.46it/s]
100%|██████████| 10/10 [00:00<00:00, 19.64it/s]
100%|██████████| 10/10 [00:00<00:00, 21.55it/s]
    average  deviation  min_exec  ...  warmup_time  x_name          fct
7  0.000140   0.000003  0.000136  ...     0.000142     710  dot_product
8  0.000177   0.000038  0.000156  ...     0.000158     810  dot_product
9  0.000179   0.000008  0.000175  ...     0.000177     910  dot_product

[3 rows x 11 columns]

  0%|          | 0/100 [00:00<?, ?it/s]
 52%|█████▏    | 52/100 [00:00<00:00, 507.30it/s]
100%|██████████| 100/100 [00:00<00:00, 284.65it/s]
     average     deviation  min_exec  ...  warmup_time  x_name                fct
97  0.000012  3.919590e-07  0.000012  ...     0.000028    9710  ddot_cython_array
98  0.000014  1.615427e-06  0.000013  ...     0.000029    9810  ddot_cython_array
99  0.000013  2.936753e-07  0.000013  ...     0.000021    9910  ddot_cython_array

[3 rows x 11 columns]

  0%|          | 0/100 [00:00<?, ?it/s]
 51%|█████     | 51/100 [00:00<00:00, 505.30it/s]
100%|██████████| 100/100 [00:00<00:00, 287.57it/s]
     average     deviation  ...  x_name                      fct
97  0.000012  3.260690e-07  ...    9710  ddot_cython_array_optim
98  0.000012  2.699690e-07  ...    9810  ddot_cython_array_optim
99  0.000012  2.400454e-07  ...    9910  ddot_cython_array_optim

[3 rows x 11 columns]

  0%|          | 0/100 [00:00<?, ?it/s]
 51%|█████     | 51/100 [00:00<00:00, 503.65it/s]
100%|██████████| 100/100 [00:00<00:00, 283.57it/s]
     average     deviation  min_exec  ...  warmup_time  x_name         fct
97  0.000012  4.664114e-07  0.000012  ...     0.000018    9710  ddot_array
98  0.000012  2.514716e-07  0.000012  ...     0.000018    9810  ddot_array
99  0.000013  1.607867e-06  0.000012  ...     0.000022    9910  ddot_array

[3 rows x 11 columns]

  0%|          | 0/100 [00:00<?, ?it/s]
 67%|██████▋   | 67/100 [00:00<00:00, 665.03it/s]
100%|██████████| 100/100 [00:00<00:00, 490.87it/s]
     average     deviation  min_exec  ...  warmup_time  x_name            fct
97  0.000007  1.865304e-07  0.000006  ...     0.000021    9710  ddot_array_16
98  0.000006  1.038200e-07  0.000006  ...     0.000018    9810  ddot_array_16
99  0.000007  4.855401e-07  0.000007  ...     0.000024    9910  ddot_array_16

[3 rows x 11 columns]

  0%|          | 0/100 [00:00<?, ?it/s]
 87%|████████▋ | 87/100 [00:00<00:00, 861.90it/s]
100%|██████████| 100/100 [00:00<00:00, 780.12it/s]
     average     deviation  min_exec  ...  warmup_time  x_name                fct
97  0.000004  1.391546e-08  0.000004  ...     0.000024    9710  ddot_array_16_sse
98  0.000004  1.292169e-07  0.000004  ...     0.000020    9810  ddot_array_16_sse
99  0.000004  7.972453e-09  0.000004  ...     0.000014    9910  ddot_array_16_sse

[3 rows x 11 columns]

Let’s display the results

# Stack every measurement; N is an alias of x_name (the vector size).
cc = concat(dfs)
cc["N"] = cc["x_name"]

# One column per implementation, one row per vector size.
pivot_args = dict(index="N", columns="fct", values="average")
small_sizes = cc[cc.N <= 1100].pivot(**pivot_args)
without_dot_product = cc[cc.fct != "dot_product"].pivot(**pivot_args)

fig, ax = plt.subplots(2, 2, figsize=(10, 10))
# Top left: all implementations on small vectors, log-log scale.
small_sizes.plot(logy=True, logx=True, ax=ax[0, 0])
# Right column: everything but dot_product, semi-log then log-log scale.
without_dot_product.plot(logy=True, ax=ax[0, 1])
without_dot_product.plot(logy=True, logx=True, ax=ax[1, 1])
ax[0, 0].set_title("Comparison of cython ddot implementations")
ax[0, 1].set_title("Comparison of cython ddot implementations\nwithout dot_product")

###################
# :epkg:`numpy` is faster but we are able to catch up.
Comparison of cython ddot implementations, Comparison of cython ddot implementations without dot_product
Text(0.5, 1.0, 'Comparison of cython ddot implementations\nwithout dot_product')

Same for floats

Let’s do the same for single-precision floats.

# Single precision benchmark: numpy.dot and the float32 cython
# implementations, measured on float32 vectors of size 10, 110, ..., 9910.
dfs = []
for fct in [
    numpy.dot,
    sdot_cython_array,
    sdot_cython_array_optim,
    sdot_array,
    sdot_array_16,
    sdot_array_16_sse,
]:
    # Every function of this list is measured on the full range of sizes:
    # dot_product is not part of it, so no reduced range is needed.
    ctxs = get_vectors(fct, 10000, dtype=numpy.float32)

    df = DataFrame(list(measure_time_dim("dot(va, vb)", ctxs, verbose=1)))
    # numpy.dot.__name__ is "dot", which is the label it gets here.
    df["fct"] = fct.__name__
    dfs.append(df)
    print(df.tail(n=3))


# Stack every measurement; N is an alias of x_name (the vector size).
cc = concat(dfs)
cc["N"] = cc["x_name"]

# One column per implementation, one row per vector size.
averages = cc.pivot(index="N", columns="fct", values="average")

fig, ax = plt.subplots(1, 2, figsize=(10, 4))
# Left: semi-log scale, right: log-log scale.
averages.plot(logy=True, ax=ax[0])
averages.plot(logy=True, logx=True, ax=ax[1])
for axis in ax:
    axis.set_title("Comparison of cython sdot implementations")
Comparison of cython sdot implementations, Comparison of cython sdot implementations
  0%|          | 0/100 [00:00<?, ?it/s]
100%|██████████| 100/100 [00:00<00:00, 1349.82it/s]
     average     deviation  min_exec  ...  warmup_time  x_name  fct
97  0.000002  3.370458e-09  0.000002  ...     0.000010    9710  dot
98  0.000002  7.255342e-09  0.000002  ...     0.000008    9810  dot
99  0.000002  5.059644e-09  0.000002  ...     0.000010    9910  dot

[3 rows x 11 columns]

  0%|          | 0/100 [00:00<?, ?it/s]
 51%|█████     | 51/100 [00:00<00:00, 509.74it/s]
100%|██████████| 100/100 [00:00<00:00, 288.94it/s]
     average     deviation  min_exec  ...  warmup_time  x_name                fct
97  0.000012  4.738001e-07  0.000012  ...     0.000017    9710  sdot_cython_array
98  0.000013  1.219627e-06  0.000012  ...     0.000022    9810  sdot_cython_array
99  0.000012  4.456777e-07  0.000012  ...     0.000020    9910  sdot_cython_array

[3 rows x 11 columns]

  0%|          | 0/100 [00:00<?, ?it/s]
 52%|█████▏    | 52/100 [00:00<00:00, 502.44it/s]
100%|██████████| 100/100 [00:00<00:00, 286.29it/s]
     average     deviation  ...  x_name                      fct
97  0.000012  1.560405e-07  ...    9710  sdot_cython_array_optim
98  0.000013  5.861514e-07  ...    9810  sdot_cython_array_optim
99  0.000012  2.063391e-07  ...    9910  sdot_cython_array_optim

[3 rows x 11 columns]

  0%|          | 0/100 [00:00<?, ?it/s]
 51%|█████     | 51/100 [00:00<00:00, 496.85it/s]
100%|██████████| 100/100 [00:00<00:00, 281.93it/s]
     average     deviation  min_exec  ...  warmup_time  x_name         fct
97  0.000012  3.187071e-07  0.000012  ...     0.000016    9710  sdot_array
98  0.000012  2.984842e-07  0.000012  ...     0.000020    9810  sdot_array
99  0.000013  3.354402e-07  0.000012  ...     0.000017    9910  sdot_array

[3 rows x 11 columns]

  0%|          | 0/100 [00:00<?, ?it/s]
 63%|██████▎   | 63/100 [00:00<00:00, 628.94it/s]
100%|██████████| 100/100 [00:00<00:00, 424.37it/s]
     average     deviation  min_exec  ...  warmup_time  x_name            fct
97  0.000007  1.391853e-06  0.000006  ...     0.000054    9710  sdot_array_16
98  0.000007  2.197745e-07  0.000006  ...     0.000014    9810  sdot_array_16
99  0.000007  3.757621e-07  0.000007  ...     0.000023    9910  sdot_array_16

[3 rows x 11 columns]

  0%|          | 0/100 [00:00<?, ?it/s]
 86%|████████▌ | 86/100 [00:00<00:00, 847.48it/s]
100%|██████████| 100/100 [00:00<00:00, 806.03it/s]
     average     deviation  min_exec  ...  warmup_time  x_name                fct
97  0.000003  6.378326e-07  0.000003  ...     0.000041    9710  sdot_array_16_sse
98  0.000003  6.358742e-08  0.000003  ...     0.000016    9810  sdot_array_16_sse
99  0.000003  5.147139e-07  0.000003  ...     0.000011    9910  sdot_array_16_sse

[3 rows x 11 columns]

Text(0.5, 1.0, 'Comparison of cython sdot implementations')

Total running time of the script: (0 minutes 5.147 seconds)

Gallery generated by Sphinx-Gallery