Compares dot implementations (numpy, c++, sse, openmp)

NumPy has a very fast implementation of the dot product. It is difficult to do better and very easy to do worse. This example looks into a couple of slower implementations written with Cython. The tested functions are imported below:

import numpy
import matplotlib.pyplot as plt
from pandas import DataFrame, concat
from teachcompute.validation.cython.dot_cython import ddot_array_16_sse, ddot_array
from teachcompute.validation.cython.dot_cython_omp import (
    ddot_cython_array_omp,
    ddot_array_openmp,
    get_omp_max_threads,
    ddot_array_openmp_16,
)
from teachcompute.ext_test_case import measure_time_dim, unit_test_going


def get_vectors(fct, n, h=250, dtype=numpy.float64):
    """Build one benchmark context per vector size.

    Sizes start at 10 and grow by steps of *h*, stopping before *n*.

    :param fct: callable stored under the ``"dot"`` key of every context
    :param n: exclusive upper bound of the vector sizes
    :param h: step between two consecutive sizes
    :param dtype: dtype the random vectors are cast to
    :return: list of dictionaries with keys ``va``, ``vb`` (random vectors
        of the current size), ``dot`` (*fct*) and ``x_name`` (the size)
    """
    contexts = []
    for size in range(10, n, h):
        # Draw va before vb so the random stream is consumed in a fixed order.
        left = numpy.random.randn(size).astype(dtype)
        right = numpy.random.randn(size).astype(dtype)
        contexts.append({"va": left, "vb": right, "dot": fct, "x_name": size})
    return contexts

Number of threads

# Show the value reported by get_omp_max_threads() before running the benchmarks.
max_threads = get_omp_max_threads()
print(max_threads)
10

Several Cython dot implementations

def numpy_dot(va, vb):
    """Return the dot product of *va* and *vb* computed by ``numpy.dot``."""
    product = numpy.dot(va, vb)
    return product


def ddot_omp(va, vb):
    """Call ``ddot_cython_array_omp`` on *va* and *vb* without a ``schedule`` argument."""
    product = ddot_cython_array_omp(va, vb)
    return product


def ddot_omp_static(va, vb):
    """Call ``ddot_cython_array_omp`` on *va* and *vb* with ``schedule=1``.

    NOTE(review): going by this wrapper's name, 1 presumably selects a
    static OpenMP schedule -- confirm in ``dot_cython_omp``.
    """
    schedule = 1
    return ddot_cython_array_omp(va, vb, schedule=schedule)


def ddot_omp_dyn(va, vb):
    """Call ``ddot_cython_array_omp`` on *va* and *vb* with ``schedule=2``.

    NOTE(review): going by this wrapper's name, 2 presumably selects a
    dynamic OpenMP schedule -- confirm in ``dot_cython_omp``.
    """
    schedule = 2
    return ddot_cython_array_omp(va, vb, schedule=schedule)


def ddot_omp_cpp(va, vb):
    """Return the result of ``ddot_array_openmp`` applied to *va* and *vb*."""
    product = ddot_array_openmp(va, vb)
    return product


def ddot_omp_cpp_16(va, vb):
    """Return the result of ``ddot_array_openmp_16`` applied to *va* and *vb*."""
    product = ddot_array_openmp_16(va, vb)
    return product


# Time every implementation on vectors of growing size and keep one
# DataFrame of measurements per implementation in ``dfs``.
dfs = []
implementations = (
    numpy_dot,
    ddot_array,
    ddot_array_16_sse,
    ddot_omp,
    ddot_omp_static,
    ddot_omp_dyn,
    ddot_omp_cpp,
    ddot_omp_cpp_16,
)
for fct in implementations:
    # A much smaller size range is used when unit_test_going() is true.
    max_size = 400 if unit_test_going() else 40000
    # New random vectors are drawn for each implementation.
    ctxs = get_vectors(fct, max_size)

    print(fct.__name__)
    timings = measure_time_dim("dot(va, vb)", ctxs, verbose=1)
    df = DataFrame(list(timings))
    # Tag the rows so the frames can be told apart once concatenated.
    df["fct"] = fct.__name__
    dfs.append(df)
    print(df.tail(n=3))
numpy_dot

  0%|          | 0/160 [00:00<?, ?it/s]
 26%|██▌       | 41/160 [00:00<00:00, 196.18it/s]
 38%|███▊      | 61/160 [00:00<00:01, 67.49it/s]
 45%|████▌     | 72/160 [00:01<00:01, 59.17it/s]
 50%|█████     | 80/160 [00:01<00:01, 50.86it/s]
 54%|█████▍    | 86/160 [00:03<00:05, 14.17it/s]
 56%|█████▋    | 90/160 [00:03<00:05, 13.08it/s]
 65%|██████▌   | 104/160 [00:03<00:02, 20.48it/s]
 69%|██████▉   | 111/160 [00:05<00:04, 11.20it/s]
 72%|███████▏  | 115/160 [00:08<00:08,  5.05it/s]
 74%|███████▍  | 118/160 [00:08<00:08,  5.18it/s]
 76%|███████▋  | 122/160 [00:09<00:06,  6.27it/s]
 78%|███████▊  | 125/160 [00:09<00:05,  6.76it/s]
 81%|████████  | 129/160 [00:09<00:03,  8.37it/s]
 82%|████████▏ | 131/160 [00:13<00:11,  2.62it/s]
 83%|████████▎ | 133/160 [00:13<00:09,  2.87it/s]
 86%|████████▌ | 137/160 [00:13<00:05,  4.15it/s]
 87%|████████▋ | 139/160 [00:13<00:04,  4.78it/s]
 91%|█████████▏| 146/160 [00:14<00:01,  7.84it/s]
 93%|█████████▎| 149/160 [00:14<00:01,  8.79it/s]
 95%|█████████▌| 152/160 [00:16<00:02,  3.82it/s]
 96%|█████████▋| 154/160 [00:17<00:02,  2.87it/s]
 97%|█████████▋| 155/160 [00:19<00:02,  2.11it/s]
 98%|█████████▊| 156/160 [00:19<00:01,  2.27it/s]
 99%|█████████▉| 159/160 [00:19<00:00,  3.18it/s]
100%|██████████| 160/160 [00:20<00:00,  3.20it/s]
100%|██████████| 160/160 [00:20<00:00,  7.97it/s]
      average  deviation  min_exec  max_exec  repeat  number     ttime  context_size  warmup_time  x_name        fct
157  0.000031   0.000072  0.000006  0.000248      10      50  0.000308           184     0.000020   39260  numpy_dot
158  0.000624   0.001273  0.000006  0.004053      10      50  0.006240           184     0.000029   39510  numpy_dot
159  0.000602   0.001374  0.000005  0.004639      10      50  0.006015           184     0.000023   39760  numpy_dot
ddot_array

  0%|          | 0/160 [00:00<?, ?it/s]
 24%|██▍       | 38/160 [00:00<00:00, 375.55it/s]
 48%|████▊     | 76/160 [00:00<00:00, 193.02it/s]
 62%|██████▎   | 100/160 [00:00<00:00, 142.27it/s]
 74%|███████▍  | 118/160 [00:00<00:00, 116.80it/s]
 82%|████████▎ | 132/160 [00:01<00:00, 99.85it/s]
 90%|█████████ | 144/160 [00:01<00:00, 88.45it/s]
 96%|█████████▋| 154/160 [00:01<00:00, 79.97it/s]
100%|██████████| 160/160 [00:01<00:00, 105.17it/s]
      average     deviation  min_exec  max_exec  repeat  number     ttime  context_size  warmup_time  x_name         fct
157  0.000034  5.379273e-07  0.000033  0.000035      10      50  0.000340           184     0.000043   39260  ddot_array
158  0.000034  5.291191e-07  0.000034  0.000035      10      50  0.000342           184     0.000054   39510  ddot_array
159  0.000035  1.512740e-06  0.000033  0.000039      10      50  0.000352           184     0.000050   39760  ddot_array
ddot_array_16_sse

  0%|          | 0/160 [00:00<?, ?it/s]
 36%|███▌      | 57/160 [00:00<00:00, 562.15it/s]
 71%|███████▏  | 114/160 [00:00<00:00, 312.63it/s]
 95%|█████████▌| 152/160 [00:00<00:00, 235.72it/s]
100%|██████████| 160/160 [00:00<00:00, 253.21it/s]
      average  deviation  min_exec  max_exec  repeat  number     ttime  context_size  warmup_time  x_name                fct
157  0.000014   0.000002  0.000011  0.000018      10      50  0.000143           184     0.000035   39260  ddot_array_16_sse
158  0.000017   0.000003  0.000013  0.000023      10      50  0.000171           184     0.000065   39510  ddot_array_16_sse
159  0.000014   0.000002  0.000012  0.000017      10      50  0.000139           184     0.000042   39760  ddot_array_16_sse
ddot_omp

  0%|          | 0/160 [00:00<?, ?it/s]
 14%|█▍        | 23/160 [00:00<00:00, 223.63it/s]
 29%|██▉       | 46/160 [00:00<00:00, 211.05it/s]
 42%|████▎     | 68/160 [00:00<00:00, 191.45it/s]
 55%|█████▌    | 88/160 [00:00<00:00, 172.16it/s]
 66%|██████▋   | 106/160 [00:00<00:00, 165.64it/s]
 77%|███████▋  | 123/160 [00:00<00:00, 161.20it/s]
 88%|████████▊ | 140/160 [00:00<00:00, 159.37it/s]
 98%|█████████▊| 156/160 [00:00<00:00, 154.64it/s]
100%|██████████| 160/160 [00:00<00:00, 166.52it/s]
      average     deviation  min_exec  max_exec  repeat  number     ttime  context_size  warmup_time  x_name       fct
157  0.000014  1.649187e-06  0.000013  0.000017      10      50  0.000139           184     0.000024   39260  ddot_omp
158  0.000014  1.319730e-06  0.000012  0.000017      10      50  0.000143           184     0.000051   39510  ddot_omp
159  0.000014  7.013416e-07  0.000013  0.000015      10      50  0.000140           184     0.000025   39760  ddot_omp
ddot_omp_static

  0%|          | 0/160 [00:00<?, ?it/s]
 16%|█▌        | 25/160 [00:00<00:00, 249.66it/s]
 31%|███▏      | 50/160 [00:00<00:00, 225.81it/s]
 46%|████▌     | 73/160 [00:00<00:00, 208.47it/s]
 59%|█████▉    | 95/160 [00:00<00:00, 189.12it/s]
 72%|███████▏  | 115/160 [00:00<00:00, 177.75it/s]
 83%|████████▎ | 133/160 [00:00<00:00, 156.76it/s]
 94%|█████████▍| 150/160 [00:00<00:00, 147.63it/s]
100%|██████████| 160/160 [00:00<00:00, 165.81it/s]
      average  deviation  min_exec  max_exec  repeat  number     ttime  context_size  warmup_time  x_name              fct
157  0.000015   0.000002  0.000013  0.000020      10      50  0.000153           184     0.000033   39260  ddot_omp_static
158  0.000015   0.000001  0.000013  0.000017      10      50  0.000148           184     0.000033   39510  ddot_omp_static
159  0.000016   0.000002  0.000014  0.000020      10      50  0.000165           184     0.000039   39760  ddot_omp_static
ddot_omp_dyn

  0%|          | 0/160 [00:00<?, ?it/s]
 10%|█         | 16/160 [00:00<00:00, 149.65it/s]
 19%|█▉        | 31/160 [00:00<00:01, 122.15it/s]
 28%|██▊       | 44/160 [00:00<00:01, 103.84it/s]
 34%|███▍      | 55/160 [00:00<00:01, 91.38it/s]
 41%|████      | 65/160 [00:00<00:01, 81.08it/s]
 46%|████▋     | 74/160 [00:00<00:01, 75.12it/s]
 51%|█████▏    | 82/160 [00:01<00:01, 65.67it/s]
 56%|█████▌    | 89/160 [00:01<00:01, 60.71it/s]
 60%|██████    | 96/160 [00:01<00:01, 55.27it/s]
 64%|██████▍   | 102/160 [00:01<00:01, 51.01it/s]
 68%|██████▊   | 108/160 [00:01<00:01, 48.16it/s]
 71%|███████   | 113/160 [00:01<00:01, 45.92it/s]
 74%|███████▍  | 118/160 [00:01<00:00, 43.47it/s]
 77%|███████▋  | 123/160 [00:01<00:00, 41.74it/s]
 80%|████████  | 128/160 [00:02<00:00, 38.48it/s]
 82%|████████▎ | 132/160 [00:02<00:00, 37.86it/s]
 85%|████████▌ | 136/160 [00:02<00:00, 36.38it/s]
 88%|████████▊ | 140/160 [00:02<00:00, 35.29it/s]
 90%|█████████ | 144/160 [00:02<00:00, 34.93it/s]
 92%|█████████▎| 148/160 [00:02<00:00, 34.55it/s]
 95%|█████████▌| 152/160 [00:02<00:00, 32.26it/s]
 98%|█████████▊| 156/160 [00:03<00:00, 31.37it/s]
100%|██████████| 160/160 [00:03<00:00, 29.50it/s]
100%|██████████| 160/160 [00:03<00:00, 50.33it/s]
      average  deviation  min_exec  max_exec  repeat  number     ttime  context_size  warmup_time  x_name           fct
157  0.000064   0.000005  0.000058  0.000075      10      50  0.000638           184     0.000084   39260  ddot_omp_dyn
158  0.000076   0.000010  0.000060  0.000093      10      50  0.000759           184     0.000119   39510  ddot_omp_dyn
159  0.000088   0.000003  0.000082  0.000092      10      50  0.000877           184     0.000097   39760  ddot_omp_dyn
ddot_omp_cpp

  0%|          | 0/160 [00:00<?, ?it/s]
 19%|█▉        | 31/160 [00:00<00:00, 305.47it/s]
 39%|███▉      | 62/160 [00:00<00:00, 267.23it/s]
 56%|█████▋    | 90/160 [00:00<00:00, 224.22it/s]
 71%|███████▏  | 114/160 [00:00<00:00, 203.36it/s]
 84%|████████▍ | 135/160 [00:00<00:00, 188.38it/s]
 97%|█████████▋| 155/160 [00:00<00:00, 172.96it/s]
100%|██████████| 160/160 [00:00<00:00, 194.62it/s]
      average     deviation  min_exec  max_exec  repeat  number     ttime  context_size  warmup_time  x_name           fct
157  0.000012  9.622883e-07  0.000011  0.000014      10      50  0.000120           184     0.000020   39260  ddot_omp_cpp
158  0.000016  2.446743e-06  0.000011  0.000020      10      50  0.000159           184     0.000019   39510  ddot_omp_cpp
159  0.000013  1.154813e-06  0.000012  0.000016      10      50  0.000133           184     0.000025   39760  ddot_omp_cpp
ddot_omp_cpp_16

  0%|          | 0/160 [00:00<?, ?it/s]
 17%|█▋        | 27/160 [00:00<00:00, 261.68it/s]
 37%|███▋      | 59/160 [00:00<00:00, 292.84it/s]
 56%|█████▌    | 89/160 [00:00<00:00, 295.85it/s]
 74%|███████▍  | 119/160 [00:00<00:00, 261.14it/s]
 91%|█████████▏| 146/160 [00:00<00:00, 236.48it/s]
100%|██████████| 160/160 [00:00<00:00, 245.37it/s]
      average  deviation  min_exec  max_exec  repeat  number     ttime  context_size  warmup_time  x_name              fct
157  0.000010   0.000001  0.000009  0.000013      10      50  0.000097           184     0.000026   39260  ddot_omp_cpp_16
158  0.000011   0.000001  0.000009  0.000012      10      50  0.000105           184     0.000047   39510  ddot_omp_cpp_16
159  0.000011   0.000001  0.000009  0.000013      10      50  0.000106           184     0.000024   39760  ddot_omp_cpp_16

Let’s display the results

# Stack all measurements; N duplicates the x_name column, which holds the
# vector size set in get_vectors.
cc = concat(dfs)
cc["N"] = cc["x_name"]

fig, ax = plt.subplots(2, 2, figsize=(10, 10))

# Top-left: only the sizes up to 1000.
small_sizes = cc[cc.N <= 1000]
small_sizes.pivot(index="N", columns="fct", values="average").plot(
    logy=True, ax=ax[0, 0]
)

# Top-right (log y) and bottom-right (log x and y): every implementation.
average_by_size = cc.pivot(index="N", columns="fct", values="average")
average_by_size.plot(logy=True, ax=ax[0, 1])
average_by_size.plot(logy=True, logx=True, ax=ax[1, 1])

# Bottom-left: functions whose name contains "omp" plus ddot_array,
# leaving out those whose name contains "dyn".
name_has_omp = cc.fct.str.contains("omp")
is_ddot_array = cc.fct == "ddot_array"
name_has_dyn = cc.fct.str.contains("dyn")
selection = cc[(name_has_omp | is_ddot_array) & ~name_has_dyn]
selection.pivot(index="N", columns="fct", values="average").plot(
    logy=True, ax=ax[1, 0]
)

# NOTE(review): the second title mentions "without dot_product", yet the
# ax[0, 1] panel plots every implementation -- confirm the intended title.
ax[0, 0].set_title("Comparison of cython ddot implementations")
ax[0, 1].set_title("Comparison of cython ddot implementations\nwithout dot_product")
Comparison of cython ddot implementations, Comparison of cython ddot implementations without dot_product
Text(0.5, 1.0, 'Comparison of cython ddot implementations\nwithout dot_product')

Total running time of the script: (0 minutes 31.023 seconds)

Gallery generated by Sphinx-Gallery