Compares dot implementations (numpy, c++, sse, openmp)

numpy has a very fast implementation of the dot product. It is difficult to beat and very easy to be slower. This example looks at several alternative implementations written with cython (plain C++, SSE, OpenMP) and compares them against numpy. The tested functions are imported and defined below:

import numpy
import matplotlib.pyplot as plt
from pandas import DataFrame, concat
from teachcompute.validation.cython.dot_cython import ddot_array_16_sse, ddot_array
from teachcompute.validation.cython.dot_cython_omp import (
    ddot_cython_array_omp,
    ddot_array_openmp,
    get_omp_max_threads,
    ddot_array_openmp_16,
)
from teachcompute.ext_test_case import measure_time_dim, unit_test_going


def get_vectors(fct, n, h=250, dtype=numpy.float64):
    """Build one benchmark context per vector size.

    The sizes are ``range(10, n, h)``. For every size, two random vectors
    are drawn with ``numpy.random.randn`` and cast to *dtype*.

    :param fct: callable stored under the ``"dot"`` key of every context
    :param n: exclusive upper bound of the vector sizes
    :param h: step between two consecutive sizes
    :param dtype: type the random vectors are cast to
    :return: list of dictionaries with keys ``va``, ``vb``, ``dot`` and
        ``x_name`` (the vector size)
    """
    contexts = []
    for size in range(10, n, h):
        # ``va`` is drawn before ``vb`` so the random stream is consumed
        # in a fixed order for a given seed.
        left = numpy.random.randn(size).astype(dtype)
        right = numpy.random.randn(size).astype(dtype)
        contexts.append({"va": left, "vb": right, "dot": fct, "x_name": size})
    return contexts

Number of threads

# Display the value returned by get_omp_max_threads()
# (presumably the maximum number of OpenMP threads - confirm in dot_cython_omp).
print(get_omp_max_threads())
10

Several cython dot implementations

def numpy_dot(va, vb):
    """Reference implementation: dot product of *va* and *vb* with ``numpy.dot``."""
    product = numpy.dot(va, vb)
    return product


def ddot_omp(va, vb):
    """Call ``ddot_cython_array_omp(va, vb)`` without an explicit ``schedule``.

    NOTE(review): the schedule used by default is defined in
    ``dot_cython_omp`` and is not visible here.
    """
    return ddot_cython_array_omp(va, vb)


def ddot_omp_static(va, vb):
    """Call ``ddot_cython_array_omp(va, vb, schedule=1)``.

    NOTE(review): ``schedule=1`` presumably selects OpenMP static
    scheduling, as the function name suggests - confirm in ``dot_cython_omp``.
    """
    return ddot_cython_array_omp(va, vb, schedule=1)


def ddot_omp_dyn(va, vb):
    """Call ``ddot_cython_array_omp(va, vb, schedule=2)``.

    NOTE(review): ``schedule=2`` presumably selects OpenMP dynamic
    scheduling, as the function name suggests - confirm in ``dot_cython_omp``.
    """
    return ddot_cython_array_omp(va, vb, schedule=2)


def ddot_omp_cpp(va, vb):
    """Call ``ddot_array_openmp(va, vb)`` imported from ``dot_cython_omp``."""
    return ddot_array_openmp(va, vb)


def ddot_omp_cpp_16(va, vb):
    """Call ``ddot_array_openmp_16(va, vb)`` imported from ``dot_cython_omp``.

    NOTE(review): the ``_16`` suffix presumably refers to processing the
    vectors in blocks of 16 elements - confirm in ``dot_cython_omp``.
    """
    return ddot_array_openmp_16(va, vb)


# One DataFrame of timings per implementation; they are merged for plotting.
dfs = []
implementations = [
    numpy_dot,
    ddot_array,
    ddot_array_16_sse,
    ddot_omp,
    ddot_omp_static,
    ddot_omp_dyn,
    ddot_omp_cpp,
    ddot_omp_cpp_16,
]
for fct in implementations:
    # Smaller upper bound on the vector size when unit_test_going() is true.
    max_size = 400 if unit_test_going() else 40000
    ctxs = get_vectors(fct, max_size)

    print(fct.__name__)
    # Every context provides the names used by the statement: dot, va, vb.
    measurements = measure_time_dim("dot(va, vb)", ctxs, verbose=1)
    df = DataFrame(list(measurements))
    df["fct"] = fct.__name__
    dfs.append(df)
    print(df.tail(n=3))
numpy_dot

  0%|          | 0/160 [00:00<?, ?it/s]
 26%|██▌       | 41/160 [00:00<00:01, 70.36it/s]
 39%|███▉      | 63/160 [00:00<00:00, 99.59it/s]
 49%|████▉     | 79/160 [00:01<00:01, 54.71it/s]
 56%|█████▌    | 89/160 [00:01<00:01, 52.64it/s]
 69%|██████▉   | 110/160 [00:01<00:00, 74.39it/s]
 82%|████████▎ | 132/160 [00:01<00:00, 96.18it/s]
 99%|█████████▉| 159/160 [00:01<00:00, 128.89it/s]
100%|██████████| 160/160 [00:01<00:00, 88.52it/s]
      average  deviation  min_exec  ...  warmup_time  x_name        fct
157  0.000006   0.000002  0.000005  ...     0.000015   39260  numpy_dot
158  0.000006   0.000001  0.000005  ...     0.000016   39510  numpy_dot
159  0.000007   0.000002  0.000005  ...     0.000276   39760  numpy_dot

[3 rows x 11 columns]
ddot_array

  0%|          | 0/160 [00:00<?, ?it/s]
 21%|██▏       | 34/160 [00:00<00:00, 336.85it/s]
 42%|████▎     | 68/160 [00:00<00:00, 211.28it/s]
 57%|█████▊    | 92/160 [00:00<00:00, 150.89it/s]
 69%|██████▉   | 110/160 [00:00<00:00, 124.33it/s]
 78%|███████▊  | 125/160 [00:00<00:00, 104.56it/s]
 86%|████████▌ | 137/160 [00:01<00:00, 93.39it/s]
 92%|█████████▏| 147/160 [00:01<00:00, 83.89it/s]
 98%|█████████▊| 156/160 [00:01<00:00, 68.81it/s]
100%|██████████| 160/160 [00:01<00:00, 100.49it/s]
      average  deviation  min_exec  ...  warmup_time  x_name         fct
157  0.000036   0.000003  0.000032  ...     0.000052   39260  ddot_array
158  0.000035   0.000002  0.000033  ...     0.000043   39510  ddot_array
159  0.000036   0.000005  0.000033  ...     0.000041   39760  ddot_array

[3 rows x 11 columns]
ddot_array_16_sse

  0%|          | 0/160 [00:00<?, ?it/s]
 32%|███▎      | 52/160 [00:00<00:00, 507.14it/s]
 64%|██████▍   | 103/160 [00:00<00:00, 320.80it/s]
 87%|████████▋ | 139/160 [00:00<00:00, 248.20it/s]
100%|██████████| 160/160 [00:00<00:00, 244.18it/s]
      average  deviation  min_exec  ...  warmup_time  x_name                fct
157  0.000028   0.000028  0.000013  ...     0.000057   39260  ddot_array_16_sse
158  0.000015   0.000003  0.000013  ...     0.000037   39510  ddot_array_16_sse
159  0.000018   0.000007  0.000014  ...     0.000037   39760  ddot_array_16_sse

[3 rows x 11 columns]
ddot_omp

  0%|          | 0/160 [00:00<?, ?it/s]
  1%|          | 1/160 [00:00<00:36,  4.42it/s]
 11%|█▏        | 18/160 [00:00<00:02, 67.86it/s]
 28%|██▊       | 44/160 [00:00<00:00, 134.81it/s]
 44%|████▍     | 70/160 [00:00<00:00, 175.43it/s]
 58%|█████▊    | 93/160 [00:00<00:00, 191.79it/s]
 72%|███████▏  | 115/160 [00:00<00:00, 145.18it/s]
 83%|████████▎ | 133/160 [00:01<00:00, 95.75it/s]
 92%|█████████▏| 147/160 [00:01<00:00, 102.21it/s]
100%|██████████| 160/160 [00:01<00:00, 111.98it/s]
      average     deviation  min_exec  ...  warmup_time  x_name       fct
157  0.000009  8.693929e-07  0.000009  ...     0.000025   39260  ddot_omp
158  0.000014  2.401780e-06  0.000009  ...     0.000021   39510  ddot_omp
159  0.000017  1.338579e-06  0.000015  ...     0.000025   39760  ddot_omp

[3 rows x 11 columns]
ddot_omp_static

  0%|          | 0/160 [00:00<?, ?it/s]
  1%|          | 1/160 [00:00<00:44,  3.60it/s]
 12%|█▎        | 20/160 [00:00<00:02, 66.43it/s]
 28%|██▊       | 45/160 [00:00<00:00, 124.99it/s]
 39%|███▉      | 63/160 [00:00<00:00, 141.84it/s]
 51%|█████     | 81/160 [00:00<00:00, 131.34it/s]
 65%|██████▌   | 104/160 [00:00<00:00, 156.27it/s]
 79%|███████▉  | 126/160 [00:00<00:00, 171.99it/s]
 91%|█████████ | 145/160 [00:01<00:00, 174.57it/s]
100%|██████████| 160/160 [00:01<00:00, 136.22it/s]
      average  deviation  min_exec  ...  warmup_time  x_name              fct
157  0.000012   0.000004  0.000010  ...     0.000037   39260  ddot_omp_static
158  0.000019   0.000011  0.000010  ...     0.000077   39510  ddot_omp_static
159  0.000011   0.000003  0.000009  ...     0.000055   39760  ddot_omp_static

[3 rows x 11 columns]
ddot_omp_dyn

  0%|          | 0/160 [00:00<?, ?it/s]
  1%|          | 1/160 [00:00<00:38,  4.13it/s]
  6%|▌         | 9/160 [00:00<00:04, 32.07it/s]
  9%|▉         | 15/160 [00:00<00:03, 41.47it/s]
 14%|█▍        | 23/160 [00:00<00:02, 52.42it/s]
 20%|██        | 32/160 [00:00<00:02, 62.06it/s]
 25%|██▌       | 40/160 [00:00<00:01, 67.04it/s]
 30%|███       | 48/160 [00:00<00:01, 69.05it/s]
 35%|███▌      | 56/160 [00:01<00:01, 67.07it/s]
 39%|███▉      | 63/160 [00:01<00:01, 63.45it/s]
 44%|████▍     | 70/160 [00:01<00:01, 54.68it/s]
 48%|████▊     | 76/160 [00:01<00:01, 46.73it/s]
 51%|█████▏    | 82/160 [00:01<00:01, 42.18it/s]
 54%|█████▍    | 87/160 [00:01<00:01, 39.42it/s]
 57%|█████▊    | 92/160 [00:01<00:01, 39.91it/s]
 61%|██████    | 97/160 [00:02<00:01, 41.61it/s]
 64%|██████▍   | 102/160 [00:02<00:01, 43.07it/s]
 67%|██████▋   | 107/160 [00:02<00:01, 44.60it/s]
 70%|███████   | 112/160 [00:02<00:01, 44.56it/s]
 73%|███████▎  | 117/160 [00:02<00:00, 44.17it/s]
 76%|███████▋  | 122/160 [00:02<00:00, 43.41it/s]
 79%|███████▉  | 127/160 [00:02<00:00, 43.16it/s]
 82%|████████▎ | 132/160 [00:02<00:00, 39.70it/s]
 86%|████████▌ | 137/160 [00:03<00:00, 37.03it/s]
 88%|████████▊ | 141/160 [00:03<00:00, 37.53it/s]
 91%|█████████ | 145/160 [00:03<00:00, 33.99it/s]
 93%|█████████▎| 149/160 [00:03<00:00, 33.54it/s]
 96%|█████████▌| 153/160 [00:03<00:00, 34.82it/s]
 98%|█████████▊| 157/160 [00:03<00:00, 33.30it/s]
100%|██████████| 160/160 [00:03<00:00, 42.88it/s]
      average  deviation  min_exec  ...  warmup_time  x_name           fct
157  0.000066   0.000014  0.000052  ...     0.000069   39260  ddot_omp_dyn
158  0.000064   0.000009  0.000050  ...     0.000112   39510  ddot_omp_dyn
159  0.000070   0.000021  0.000053  ...     0.000349   39760  ddot_omp_dyn

[3 rows x 11 columns]
ddot_omp_cpp

  0%|          | 0/160 [00:00<?, ?it/s]
  1%|          | 1/160 [00:00<00:29,  5.34it/s]
 25%|██▌       | 40/160 [00:00<00:00, 170.03it/s]
 46%|████▋     | 74/160 [00:00<00:00, 233.26it/s]
 64%|██████▍   | 102/160 [00:00<00:00, 226.67it/s]
 80%|████████  | 128/160 [00:00<00:00, 229.42it/s]
 96%|█████████▋| 154/160 [00:00<00:00, 238.21it/s]
100%|██████████| 160/160 [00:00<00:00, 211.32it/s]
      average  deviation  min_exec  ...  warmup_time  x_name           fct
157  0.000008   0.000001  0.000006  ...     0.000020   39260  ddot_omp_cpp
158  0.000009   0.000003  0.000007  ...     0.000032   39510  ddot_omp_cpp
159  0.000008   0.000002  0.000007  ...     0.000018   39760  ddot_omp_cpp

[3 rows x 11 columns]
ddot_omp_cpp_16

  0%|          | 0/160 [00:00<?, ?it/s]
  1%|▏         | 2/160 [00:00<00:19,  8.27it/s]
 28%|██▊       | 45/160 [00:00<00:00, 163.83it/s]
 45%|████▌     | 72/160 [00:00<00:00, 199.44it/s]
 66%|██████▌   | 105/160 [00:00<00:00, 241.28it/s]
 85%|████████▌ | 136/160 [00:00<00:00, 262.45it/s]
100%|██████████| 160/160 [00:00<00:00, 215.61it/s]
      average     deviation  min_exec  ...  warmup_time  x_name              fct
157  0.000007  3.779736e-07  0.000006  ...     0.000028   39260  ddot_omp_cpp_16
158  0.000007  8.829744e-07  0.000006  ...     0.000028   39510  ddot_omp_cpp_16
159  0.000007  6.191916e-07  0.000006  ...     0.000237   39760  ddot_omp_cpp_16

[3 rows x 11 columns]

Let’s display the results

cc = concat(dfs)
cc["N"] = cc["x_name"]

# One row per vector size, one column per implementation.
averages = cc.pivot(index="N", columns="fct", values="average")

fig, ax = plt.subplots(2, 2, figsize=(10, 10))

# Top-left: only the sizes up to 1000.
small_sizes = cc[cc["N"] <= 1000]
small_sizes.pivot(index="N", columns="fct", values="average").plot(
    logy=True, ax=ax[0, 0]
)

# Top-right (log y) and bottom-right (log x and log y): every implementation.
averages.plot(logy=True, ax=ax[0, 1])
averages.plot(logy=True, logx=True, ax=ax[1, 1])

# Bottom-left: functions whose name contains "omp" plus ddot_array,
# excluding the ones whose name contains "dyn".
names = cc["fct"]
is_omp_or_plain = names.str.contains("omp") | (names == "ddot_array")
is_dynamic = names.str.contains("dyn")
cc[is_omp_or_plain & ~is_dynamic].pivot(
    index="N", columns="fct", values="average"
).plot(logy=True, ax=ax[1, 0])

ax[0, 0].set_title("Comparison of cython ddot implementations")
# NOTE(review): ax[0, 1] plots every implementation, numpy_dot included, and
# no function named dot_product is benchmarked here - this title may be stale
# or meant for another subplot; ax[1, 0] and ax[1, 1] have no title.
ax[0, 1].set_title("Comparison of cython ddot implementations\nwithout dot_product")
Comparison of cython ddot implementations, Comparison of cython ddot implementations without dot_product
Text(0.5, 1.0, 'Comparison of cython ddot implementations\nwithout dot_product')

Total running time of the script: (0 minutes 14.336 seconds)

Gallery generated by Sphinx-Gallery