Compares dot implementations (numpy, c++, sse, openmp)

numpy has a very fast implementation of the dot product. It is difficult to do better and very easy to do worse. This example looks into several slower implementations written with cython (C++, SSE, OpenMP). The tested functions are imported below:

import numpy
import matplotlib.pyplot as plt
from pandas import DataFrame, concat
from teachcompute.validation.cython.dot_cython import ddot_array_16_sse, ddot_array
from teachcompute.validation.cython.dot_cython_omp import (
    ddot_cython_array_omp,
    ddot_array_openmp,
    get_omp_max_threads,
    ddot_array_openmp_16,
)
from teachcompute.ext_test_case import measure_time_dim, unit_test_going


def get_vectors(fct, n, h=250, dtype=numpy.float64):
    """Build one timing context per vector size.

    Sizes start at 10 and increase by ``h`` while staying strictly below ``n``.

    :param fct: object stored under the key ``"dot"`` of every context
    :param n: exclusive upper bound for the vector size
    :param h: step between two consecutive sizes
    :param dtype: dtype the random vectors are cast to
    :return: list of dictionaries with the keys ``va``, ``vb``, ``dot``
        and ``x_name`` (the vector size)
    """
    contexts = []
    for size in range(10, n, h):
        # ``va`` is drawn before ``vb`` so the random stream is consumed
        # in a fixed order.
        left = numpy.random.randn(size).astype(dtype)
        right = numpy.random.randn(size).astype(dtype)
        contexts.append({"va": left, "vb": right, "dot": fct, "x_name": size})
    return contexts

Number of threads

# Show the value reported by get_omp_max_threads().
max_threads = get_omp_max_threads()
print(max_threads)
8

Several cython dot implementations

def numpy_dot(va, vb):
    """Return ``numpy.dot(va, vb)``; this is the numpy entry of the comparison."""
    product = numpy.dot(va, vb)
    return product


def ddot_omp(va, vb):
    """Return ``ddot_cython_array_omp(va, vb)`` without passing ``schedule``."""
    product = ddot_cython_array_omp(va, vb)
    return product


def ddot_omp_static(va, vb):
    """Return ``ddot_cython_array_omp(va, vb, schedule=1)``.

    NOTE(review): ``schedule=1`` presumably selects a static OpenMP schedule,
    as this wrapper's name suggests -- confirm in ``dot_cython_omp``.
    """
    product = ddot_cython_array_omp(va, vb, schedule=1)
    return product


def ddot_omp_dyn(va, vb):
    """Return ``ddot_cython_array_omp(va, vb, schedule=2)``.

    NOTE(review): ``schedule=2`` presumably selects a dynamic OpenMP schedule,
    as this wrapper's name suggests -- confirm in ``dot_cython_omp``.
    """
    product = ddot_cython_array_omp(va, vb, schedule=2)
    return product


def ddot_omp_cpp(va, vb):
    """Return ``ddot_array_openmp(va, vb)``."""
    product = ddot_array_openmp(va, vb)
    return product


def ddot_omp_cpp_16(va, vb):
    """Return ``ddot_array_openmp_16(va, vb)``."""
    product = ddot_array_openmp_16(va, vb)
    return product


# Time every implementation on the same range of vector sizes and keep one
# DataFrame per function in ``dfs``.
dfs = []
for fct in (
    numpy_dot,
    ddot_array,
    ddot_array_16_sse,
    ddot_omp,
    ddot_omp_static,
    ddot_omp_dyn,
    ddot_omp_cpp,
    ddot_omp_cpp_16,
):
    # A much smaller size range is used when unit_test_going() is true.
    max_size = 400 if unit_test_going() else 40000
    ctxs = get_vectors(fct, max_size)

    fct_name = fct.__name__
    print(fct_name)
    # The statement "dot(va, vb)" is measured once per context of ``ctxs``.
    measures = list(measure_time_dim("dot(va, vb)", ctxs, verbose=1))
    df = DataFrame(measures)
    # Tag the rows with the function name so the frames can be concatenated.
    df["fct"] = fct_name
    dfs.append(df)
    print(df.tail(n=3))
numpy_dot

  0%|          | 0/160 [00:00<?, ?it/s]
 32%|███▎      | 52/160 [00:00<00:00, 510.92it/s]
 65%|██████▌   | 104/160 [00:00<00:00, 229.71it/s]
 84%|████████▍ | 135/160 [00:00<00:00, 195.89it/s]
 99%|█████████▉| 159/160 [00:00<00:00, 152.11it/s]
100%|██████████| 160/160 [00:00<00:00, 182.06it/s]
      average     deviation  min_exec  ...  warmup_time  x_name        fct
157  0.000007  6.590977e-07  0.000007  ...     0.000044   39260  numpy_dot
158  0.000024  4.850721e-05  0.000007  ...     0.000048   39510  numpy_dot
159  0.000008  1.848197e-06  0.000007  ...     0.000057   39760  numpy_dot

[3 rows x 11 columns]
ddot_array

  0%|          | 0/160 [00:00<?, ?it/s]
 22%|██▏       | 35/160 [00:00<00:00, 337.07it/s]
 43%|████▎     | 69/160 [00:00<00:00, 170.93it/s]
 57%|█████▋    | 91/160 [00:00<00:00, 118.66it/s]
 67%|██████▋   | 107/160 [00:00<00:00, 93.02it/s]
 74%|███████▍  | 119/160 [00:01<00:00, 78.74it/s]
 81%|████████  | 129/160 [00:01<00:00, 69.82it/s]
 86%|████████▌ | 137/160 [00:01<00:00, 64.98it/s]
 90%|█████████ | 144/160 [00:01<00:00, 61.02it/s]
 94%|█████████▍| 151/160 [00:01<00:00, 56.73it/s]
 98%|█████████▊| 157/160 [00:01<00:00, 53.92it/s]
100%|██████████| 160/160 [00:02<00:00, 78.53it/s]
      average  deviation  min_exec  ...  warmup_time  x_name         fct
157  0.000046   0.000004  0.000042  ...     0.000057   39260  ddot_array
158  0.000045   0.000003  0.000042  ...     0.000059   39510  ddot_array
159  0.000047   0.000006  0.000042  ...     0.000155   39760  ddot_array

[3 rows x 11 columns]
ddot_array_16_sse

  0%|          | 0/160 [00:00<?, ?it/s]
 38%|███▊      | 60/160 [00:00<00:00, 583.63it/s]
 74%|███████▍  | 119/160 [00:00<00:00, 286.71it/s]
 98%|█████████▊| 156/160 [00:00<00:00, 213.79it/s]
100%|██████████| 160/160 [00:00<00:00, 235.80it/s]
      average     deviation  min_exec  ...  warmup_time  x_name                fct
157  0.000020  5.677125e-06  0.000016  ...     0.000047   39260  ddot_array_16_sse
158  0.000017  9.999452e-07  0.000016  ...     0.000069   39510  ddot_array_16_sse
159  0.000018  2.049265e-06  0.000016  ...     0.000052   39760  ddot_array_16_sse

[3 rows x 11 columns]
ddot_omp

  0%|          | 0/160 [00:00<?, ?it/s]
  2%|▎         | 4/160 [00:00<00:04, 34.93it/s]
  5%|▌         | 8/160 [00:00<00:14, 10.39it/s]
 21%|██        | 33/160 [00:00<00:02, 53.18it/s]
 34%|███▍      | 55/160 [00:00<00:01, 87.07it/s]
 45%|████▌     | 72/160 [00:01<00:00, 102.14it/s]
 55%|█████▌    | 88/160 [00:01<00:00, 102.59it/s]
 66%|██████▌   | 105/160 [00:01<00:00, 117.07it/s]
 75%|███████▌  | 120/160 [00:01<00:00, 120.38it/s]
 84%|████████▍ | 135/160 [00:01<00:00, 122.95it/s]
 93%|█████████▎| 149/160 [00:01<00:00, 106.96it/s]
100%|██████████| 160/160 [00:01<00:00, 90.12it/s]
      average     deviation  min_exec  ...  warmup_time  x_name       fct
157  0.000013  2.520171e-06  0.000011  ...     0.000093   39260  ddot_omp
158  0.000011  3.950575e-07  0.000011  ...     0.000039   39510  ddot_omp
159  0.000022  2.188830e-05  0.000011  ...     0.000038   39760  ddot_omp

[3 rows x 11 columns]
ddot_omp_static

  0%|          | 0/160 [00:00<?, ?it/s]
 18%|█▊        | 28/160 [00:00<00:00, 279.22it/s]
 35%|███▌      | 56/160 [00:00<00:00, 249.77it/s]
 51%|█████▏    | 82/160 [00:00<00:00, 217.08it/s]
 66%|██████▌   | 105/160 [00:00<00:00, 185.71it/s]
 78%|███████▊  | 125/160 [00:00<00:00, 162.75it/s]
 89%|████████▉ | 142/160 [00:00<00:00, 132.89it/s]
 98%|█████████▊| 157/160 [00:01<00:00, 123.51it/s]
100%|██████████| 160/160 [00:01<00:00, 152.78it/s]
      average     deviation  min_exec  ...  warmup_time  x_name              fct
157  0.000014  7.931527e-07  0.000013  ...     0.000052   39260  ddot_omp_static
158  0.000033  4.043829e-05  0.000014  ...     0.000057   39510  ddot_omp_static
159  0.000021  1.071811e-05  0.000015  ...     0.000049   39760  ddot_omp_static

[3 rows x 11 columns]
ddot_omp_dyn

  0%|          | 0/160 [00:00<?, ?it/s]
 14%|█▍        | 23/160 [00:00<00:00, 228.00it/s]
 29%|██▉       | 46/160 [00:00<00:00, 139.38it/s]
 39%|███▉      | 62/160 [00:00<00:00, 117.99it/s]
 47%|████▋     | 75/160 [00:00<00:00, 97.51it/s]
 54%|█████▍    | 86/160 [00:00<00:00, 87.18it/s]
 60%|██████    | 96/160 [00:00<00:00, 80.16it/s]
 66%|██████▌   | 105/160 [00:01<00:00, 74.94it/s]
 71%|███████   | 113/160 [00:01<00:00, 69.07it/s]
 75%|███████▌  | 120/160 [00:01<00:00, 65.53it/s]
 79%|███████▉  | 127/160 [00:01<00:00, 60.60it/s]
 84%|████████▍ | 134/160 [00:01<00:00, 37.46it/s]
 87%|████████▋ | 139/160 [00:02<00:00, 38.99it/s]
 90%|█████████ | 144/160 [00:02<00:00, 39.66it/s]
 93%|█████████▎| 149/160 [00:02<00:00, 39.51it/s]
 96%|█████████▋| 154/160 [00:02<00:00, 37.98it/s]
 99%|█████████▉| 159/160 [00:02<00:00, 38.43it/s]
100%|██████████| 160/160 [00:02<00:00, 61.84it/s]
      average  deviation  min_exec  ...  warmup_time  x_name           fct
157  0.000056   0.000029  0.000036  ...     0.000051   39260  ddot_omp_dyn
158  0.000043   0.000013  0.000037  ...     0.000066   39510  ddot_omp_dyn
159  0.000049   0.000024  0.000036  ...     0.004085   39760  ddot_omp_dyn

[3 rows x 11 columns]
ddot_omp_cpp

  0%|          | 0/160 [00:00<?, ?it/s]
 19%|█▉        | 30/160 [00:00<00:00, 299.87it/s]
 38%|███▊      | 60/160 [00:00<00:00, 248.61it/s]
 54%|█████▍    | 86/160 [00:00<00:00, 248.97it/s]
 70%|███████   | 112/160 [00:00<00:00, 177.22it/s]
 82%|████████▎ | 132/160 [00:00<00:00, 174.44it/s]
 94%|█████████▍| 151/160 [00:00<00:00, 166.51it/s]
100%|██████████| 160/160 [00:00<00:00, 184.60it/s]
      average     deviation  min_exec  ...  warmup_time  x_name           fct
157  0.000014  8.102669e-06  0.000010  ...     0.000044   39260  ddot_omp_cpp
158  0.000010  4.264122e-07  0.000010  ...     0.000038   39510  ddot_omp_cpp
159  0.000020  1.931403e-05  0.000009  ...     0.000036   39760  ddot_omp_cpp

[3 rows x 11 columns]
ddot_omp_cpp_16

  0%|          | 0/160 [00:00<?, ?it/s]
 26%|██▋       | 42/160 [00:00<00:00, 415.80it/s]
 52%|█████▎    | 84/160 [00:00<00:00, 296.15it/s]
 72%|███████▎  | 116/160 [00:00<00:00, 255.17it/s]
 89%|████████▉ | 143/160 [00:00<00:00, 231.46it/s]
100%|██████████| 160/160 [00:00<00:00, 228.03it/s]
      average     deviation  min_exec  ...  warmup_time  x_name              fct
157  0.000012  7.478398e-06  0.000008  ...     0.000087   39260  ddot_omp_cpp_16
158  0.000009  5.406901e-07  0.000008  ...     0.000047   39510  ddot_omp_cpp_16
159  0.000009  1.903409e-06  0.000008  ...     0.000033   39760  ddot_omp_cpp_16

[3 rows x 11 columns]

Let’s display the results

# Gather every measure in a single frame; ``N`` is a copy of ``x_name``,
# the vector size stored by get_vectors.
cc = concat(dfs)
cc["N"] = cc["x_name"]

# One row per size, one column per function, average time as value.
# The two panels plotting every function share this pivot.
all_functions = cc.pivot(index="N", columns="fct", values="average")

# Rows for ddot_array and the functions whose name contains "omp",
# except the ones whose name contains "dyn".
omp_rows = (
    cc.fct.str.contains("omp") | (cc.fct == "ddot_array")
) & ~cc.fct.str.contains("dyn")

fig, ax = plt.subplots(2, 2, figsize=(10, 10))
cc[cc.N <= 1000].pivot(index="N", columns="fct", values="average").plot(
    logy=True, ax=ax[0, 0]
)
all_functions.plot(logy=True, ax=ax[0, 1])
all_functions.plot(logy=True, logx=True, ax=ax[1, 1])
cc[omp_rows].pivot(index="N", columns="fct", values="average").plot(
    logy=True, ax=ax[1, 0]
)

# Each title states what its panel shows. The former title of ax[0, 1]
# claimed "without dot_product" although that panel plots every function
# and no function of that name is measured in this script.
ax[0, 0].set_title("Comparison of cython ddot implementations\nN <= 1000")
ax[0, 1].set_title("Comparison of cython ddot implementations\nall N")
ax[1, 0].set_title("ddot_array and omp implementations\nwithout ddot_omp_dyn")
ax[1, 1].set_title("Comparison of cython ddot implementations\nlog-log scale")
Comparison of cython ddot implementations, Comparison of cython ddot implementations without dot_product
Text(0.5, 1.0, 'Comparison of cython ddot implementations\nwithout dot_product')

Total running time of the script: (0 minutes 12.893 seconds)

Gallery generated by Sphinx-Gallery