Compares dot implementations (numpy, c++, sse, openmp)

NumPy has a very fast implementation of the dot product. It is difficult to do better and very easy to do worse. This example looks at several alternative implementations written with Cython, most of them slower. The tested functions are the following:

import numpy
import matplotlib.pyplot as plt
from pandas import DataFrame, concat
from teachcompute.validation.cython.dot_cython import ddot_array_16_sse, ddot_array
from teachcompute.validation.cython.dot_cython_omp import (
    ddot_cython_array_omp,
    ddot_array_openmp,
    get_omp_max_threads,
    ddot_array_openmp_16,
)
from teachcompute.ext_test_case import measure_time_dim, unit_test_going


def get_vectors(fct, n, h=250, dtype=numpy.float64):
    """Build one benchmark context per vector size.

    Sizes start at 10 and grow by *h* for as long as they stay below *n*.

    :param fct: object stored under the ``dot`` key of every context
    :param n: exclusive upper bound of the vector sizes
    :param h: step between two consecutive sizes
    :param dtype: type the random vectors are cast to
    :return: list of dictionaries with the keys ``va``, ``vb``, ``dot``
        and ``x_name`` (the vector size)
    """
    contexts = []
    for size in range(10, n, h):
        # va is drawn before vb so the random stream is consumed in a fixed order.
        left = numpy.random.randn(size).astype(dtype)
        right = numpy.random.randn(size).astype(dtype)
        contexts.append({"va": left, "vb": right, "dot": fct, "x_name": size})
    return contexts

Number of threads

# Display the value returned by get_omp_max_threads().
print(get_omp_max_threads())
10

Several Cython dot implementations

def numpy_dot(va, vb):
    """Return ``numpy.dot(va, vb)``.

    :param va: first operand
    :param vb: second operand
    :return: result of :func:`numpy.dot` applied to both operands
    """
    product = numpy.dot(va, vb)
    return product


def ddot_omp(va, vb):
    """Call ``ddot_cython_array_omp`` on both vectors with its default options.

    :param va: first vector
    :param vb: second vector
    :return: whatever ``ddot_cython_array_omp(va, vb)`` returns
    """
    result = ddot_cython_array_omp(va, vb)
    return result


def ddot_omp_static(va, vb):
    """Call ``ddot_cython_array_omp`` on both vectors with ``schedule=1``.

    NOTE(review): the wrapper's name suggests ``schedule=1`` selects a static
    OpenMP schedule -- confirm against ``dot_cython_omp``.

    :param va: first vector
    :param vb: second vector
    :return: whatever ``ddot_cython_array_omp(va, vb, schedule=1)`` returns
    """
    result = ddot_cython_array_omp(va, vb, schedule=1)
    return result


def ddot_omp_dyn(va, vb):
    """Call ``ddot_cython_array_omp`` on both vectors with ``schedule=2``.

    NOTE(review): the wrapper's name suggests ``schedule=2`` selects a dynamic
    OpenMP schedule -- confirm against ``dot_cython_omp``.

    :param va: first vector
    :param vb: second vector
    :return: whatever ``ddot_cython_array_omp(va, vb, schedule=2)`` returns
    """
    result = ddot_cython_array_omp(va, vb, schedule=2)
    return result


def ddot_omp_cpp(va, vb):
    """Call ``ddot_array_openmp`` on both vectors.

    :param va: first vector
    :param vb: second vector
    :return: whatever ``ddot_array_openmp(va, vb)`` returns
    """
    result = ddot_array_openmp(va, vb)
    return result


def ddot_omp_cpp_16(va, vb):
    """Call ``ddot_array_openmp_16`` on both vectors.

    :param va: first vector
    :param vb: second vector
    :return: whatever ``ddot_array_openmp_16(va, vb)`` returns
    """
    result = ddot_array_openmp_16(va, vb)
    return result


# Every implementation below is timed on vectors of increasing size and
# contributes one DataFrame to ``dfs``.
dfs = []
candidates = [
    numpy_dot,
    ddot_array,
    ddot_array_16_sse,
    ddot_omp,
    ddot_omp_static,
    ddot_omp_dyn,
    ddot_omp_cpp,
    ddot_omp_cpp_16,
]
for fct in candidates:
    # A much smaller upper bound is used when unit_test_going() is true.
    max_size = 400 if unit_test_going() else 40000
    ctxs = get_vectors(fct, max_size)

    print(fct.__name__)
    # Each context supplies the names ``dot``, ``va`` and ``vb`` used by the
    # measured statement.
    measures = list(measure_time_dim("dot(va, vb)", ctxs, verbose=1))
    df = DataFrame(measures)
    # Tag the rows so the frames can be told apart once concatenated.
    df["fct"] = fct.__name__
    dfs.append(df)
    print(df.tail(n=3))
numpy_dot

  0%|          | 0/160 [00:00<?, ?it/s]
 26%|██▋       | 42/160 [00:00<00:00, 378.41it/s]
 50%|█████     | 80/160 [00:00<00:00, 370.65it/s]
 74%|███████▍  | 118/160 [00:00<00:00, 187.71it/s]
 90%|█████████ | 144/160 [00:00<00:00, 187.30it/s]
100%|██████████| 160/160 [00:00<00:00, 211.50it/s]
      average  deviation  min_exec  max_exec  repeat  number     ttime  context_size  warmup_time  x_name        fct
157  0.000008   0.000008  0.000004  0.000031      10      50  0.000076           184     0.000030   39260  numpy_dot
158  0.000010   0.000013  0.000004  0.000049      10      50  0.000098           184     0.000999   39510  numpy_dot
159  0.000009   0.000012  0.000004  0.000043      10      50  0.000089           184     0.000018   39760  numpy_dot
ddot_array

  0%|          | 0/160 [00:00<?, ?it/s]
 27%|██▋       | 43/160 [00:00<00:00, 415.78it/s]
 53%|█████▎    | 85/160 [00:00<00:00, 216.85it/s]
 70%|███████   | 112/160 [00:00<00:00, 151.22it/s]
 82%|████████▏ | 131/160 [00:00<00:00, 125.67it/s]
 91%|█████████▏| 146/160 [00:01<00:00, 106.55it/s]
 99%|█████████▉| 158/160 [00:01<00:00, 90.74it/s]
100%|██████████| 160/160 [00:01<00:00, 120.92it/s]
      average  deviation  min_exec  max_exec  repeat  number     ttime  context_size  warmup_time  x_name         fct
157  0.000034   0.000007  0.000027  0.000051      10      50  0.000344           184     0.000048   39260  ddot_array
158  0.000036   0.000005  0.000029  0.000044      10      50  0.000355           184     0.000044   39510  ddot_array
159  0.000030   0.000003  0.000027  0.000034      10      50  0.000299           184     0.000054   39760  ddot_array
ddot_array_16_sse

  0%|          | 0/160 [00:00<?, ?it/s]
 46%|████▌     | 73/160 [00:00<00:00, 714.43it/s]
 91%|█████████ | 145/160 [00:00<00:00, 349.69it/s]
100%|██████████| 160/160 [00:00<00:00, 329.90it/s]
      average     deviation  min_exec  max_exec  repeat  number     ttime  context_size  warmup_time  x_name                fct
157  0.000010  4.870777e-07  0.000009  0.000011      10      50  0.000101           184     0.000055   39260  ddot_array_16_sse
158  0.000011  1.692256e-06  0.000009  0.000015      10      50  0.000105           184     0.000035   39510  ddot_array_16_sse
159  0.000010  1.071967e-06  0.000009  0.000013      10      50  0.000101           184     0.000038   39760  ddot_array_16_sse
ddot_omp

  0%|          | 0/160 [00:00<?, ?it/s]
  5%|▌         | 8/160 [00:00<00:01, 78.09it/s]
 19%|█▉        | 31/160 [00:00<00:00, 164.12it/s]
 36%|███▌      | 57/160 [00:00<00:00, 206.69it/s]
 49%|████▉     | 78/160 [00:00<00:00, 203.36it/s]
 62%|██████▏   | 99/160 [00:00<00:00, 198.76it/s]
 74%|███████▍  | 119/160 [00:00<00:00, 189.97it/s]
 87%|████████▋ | 139/160 [00:00<00:00, 179.03it/s]
 99%|█████████▉| 158/160 [00:00<00:00, 169.76it/s]
100%|██████████| 160/160 [00:00<00:00, 177.53it/s]
      average  deviation  min_exec  max_exec  repeat  number     ttime  context_size  warmup_time  x_name       fct
157  0.000012   0.000001  0.000011  0.000016      10      50  0.000125           184     0.000023   39260  ddot_omp
158  0.000014   0.000002  0.000012  0.000020      10      50  0.000139           184     0.000027   39510  ddot_omp
159  0.000013   0.000001  0.000011  0.000016      10      50  0.000127           184     0.000058   39760  ddot_omp
ddot_omp_static

  0%|          | 0/160 [00:00<?, ?it/s]
 13%|█▎        | 21/160 [00:00<00:00, 203.31it/s]
 26%|██▋       | 42/160 [00:00<00:00, 200.00it/s]
 41%|████▏     | 66/160 [00:00<00:00, 216.56it/s]
 55%|█████▌    | 88/160 [00:00<00:00, 213.98it/s]
 69%|██████▉   | 110/160 [00:00<00:00, 211.15it/s]
 82%|████████▎ | 132/160 [00:00<00:00, 188.91it/s]
 95%|█████████▌| 152/160 [00:00<00:00, 173.66it/s]
100%|██████████| 160/160 [00:00<00:00, 186.85it/s]
      average  deviation  min_exec  max_exec  repeat  number     ttime  context_size  warmup_time  x_name              fct
157  0.000013   0.000002  0.000011  0.000016      10      50  0.000134           184     0.000028   39260  ddot_omp_static
158  0.000014   0.000002  0.000011  0.000017      10      50  0.000137           184     0.000034   39510  ddot_omp_static
159  0.000015   0.000004  0.000012  0.000024      10      50  0.000155           184     0.000029   39760  ddot_omp_static
ddot_omp_dyn

  0%|          | 0/160 [00:00<?, ?it/s]
  7%|▋         | 11/160 [00:00<00:01, 98.38it/s]
 13%|█▎        | 21/160 [00:00<00:01, 71.38it/s]
 22%|██▎       | 36/160 [00:00<00:01, 97.36it/s]
 29%|██▉       | 47/160 [00:00<00:01, 97.43it/s]
 36%|███▋      | 58/160 [00:00<00:01, 86.48it/s]
 42%|████▎     | 68/160 [00:00<00:01, 82.03it/s]
 48%|████▊     | 77/160 [00:00<00:01, 73.28it/s]
 53%|█████▎    | 85/160 [00:01<00:01, 66.51it/s]
 57%|█████▊    | 92/160 [00:01<00:01, 65.78it/s]
 62%|██████▏   | 99/160 [00:01<00:00, 63.07it/s]
 66%|██████▋   | 106/160 [00:01<00:00, 57.64it/s]
 70%|███████   | 112/160 [00:01<00:00, 55.20it/s]
 74%|███████▍  | 118/160 [00:01<00:00, 52.08it/s]
 78%|███████▊  | 124/160 [00:01<00:00, 52.37it/s]
 81%|████████▏ | 130/160 [00:01<00:00, 49.08it/s]
 84%|████████▍ | 135/160 [00:02<00:00, 47.09it/s]
 88%|████████▊ | 140/160 [00:02<00:00, 43.81it/s]
 91%|█████████ | 145/160 [00:02<00:00, 40.07it/s]
 94%|█████████▍| 150/160 [00:02<00:00, 37.55it/s]
 96%|█████████▋| 154/160 [00:02<00:00, 37.78it/s]
 99%|█████████▉| 158/160 [00:02<00:00, 36.60it/s]
100%|██████████| 160/160 [00:02<00:00, 56.43it/s]
      average     deviation  min_exec  max_exec  repeat  number     ttime  context_size  warmup_time  x_name           fct
157  0.000053  1.783173e-06  0.000052  0.000059      10      50  0.000534           184     0.000068   39260  ddot_omp_dyn
158  0.000055  6.390969e-07  0.000054  0.000056      10      50  0.000546           184     0.000083   39510  ddot_omp_dyn
159  0.000054  5.796942e-06  0.000045  0.000066      10      50  0.000544           184     0.000071   39760  ddot_omp_dyn
ddot_omp_cpp

  0%|          | 0/160 [00:00<?, ?it/s]
 10%|█         | 16/160 [00:00<00:00, 158.91it/s]
 20%|██        | 32/160 [00:00<00:00, 128.99it/s]
 43%|████▎     | 69/160 [00:00<00:00, 225.69it/s]
 61%|██████▏   | 98/160 [00:00<00:00, 246.15it/s]
 78%|███████▊  | 124/160 [00:00<00:00, 234.27it/s]
 93%|█████████▎| 149/160 [00:00<00:00, 213.37it/s]
100%|██████████| 160/160 [00:00<00:00, 209.45it/s]
     average     deviation  min_exec  max_exec  repeat  number     ttime  context_size  warmup_time  x_name           fct
157  0.00001  6.273831e-07  0.000009  0.000011      10      50  0.000098           184     0.000178   39260  ddot_omp_cpp
158  0.00001  1.323552e-06  0.000009  0.000013      10      50  0.000105           184     0.000018   39510  ddot_omp_cpp
159  0.00001  1.097687e-06  0.000009  0.000013      10      50  0.000103           184     0.000018   39760  ddot_omp_cpp
ddot_omp_cpp_16

  0%|          | 0/160 [00:00<?, ?it/s]
 19%|█▉        | 30/160 [00:00<00:00, 293.18it/s]
 38%|███▊      | 60/160 [00:00<00:00, 195.47it/s]
 61%|██████    | 97/160 [00:00<00:00, 256.68it/s]
 81%|████████  | 129/160 [00:00<00:00, 277.59it/s]
 99%|█████████▉| 159/160 [00:00<00:00, 282.12it/s]
100%|██████████| 160/160 [00:00<00:00, 266.90it/s]
      average     deviation  min_exec  max_exec  repeat  number     ttime  context_size  warmup_time  x_name              fct
157  0.000010  2.392966e-06  0.000008  0.000017      10      50  0.000096           184     0.000042   39260  ddot_omp_cpp_16
158  0.000007  6.434547e-07  0.000006  0.000008      10      50  0.000067           184     0.000021   39510  ddot_omp_cpp_16
159  0.000007  8.910203e-07  0.000006  0.000009      10      50  0.000067           184     0.000037   39760  ddot_omp_cpp_16

Let’s display the results

# Stack the per-function measures into one frame; N duplicates ``x_name``.
cc = concat(dfs)
cc["N"] = cc["x_name"]

fig, ax = plt.subplots(2, 2, figsize=(10, 10))

# Every graph uses the same layout: one row per N, one column per function,
# values taken from the ``average`` column.
pivot_args = dict(index="N", columns="fct", values="average")

# Top left: rows with N <= 1000 only.
up_to_1000 = cc[cc.N <= 1000]
up_to_1000.pivot(**pivot_args).plot(logy=True, ax=ax[0, 0])

# Top right: every row, logarithmic y axis.
cc.pivot(**pivot_args).plot(logy=True, ax=ax[0, 1])

# Bottom right: every row, both axes logarithmic.
cc.pivot(**pivot_args).plot(logy=True, logx=True, ax=ax[1, 1])

# Bottom left: functions whose name contains "omp" plus ddot_array,
# excluding the names containing "dyn".
omp_or_plain = cc.fct.str.contains("omp") | (cc.fct == "ddot_array")
dynamic = cc.fct.str.contains("dyn")
selection = cc[omp_or_plain & ~dynamic]
selection.pivot(**pivot_args).plot(logy=True, ax=ax[1, 0])

ax[0, 0].set_title("Comparison of cython ddot implementations")
# NOTE(review): the graph on ax[0, 1] is built from every row of ``cc``, so
# the "without dot_product" part of this title may not match it -- confirm.
ax[0, 1].set_title("Comparison of cython ddot implementations\nwithout dot_product")
Comparison of cython ddot implementations, Comparison of cython ddot implementations without dot_product
Text(0.5, 1.0, 'Comparison of cython ddot implementations\nwithout dot_product')

Total running time of the script: (0 minutes 10.261 seconds)

Gallery generated by Sphinx-Gallery