Compares matrix multiplication implementations

numpy has a very fast implementation of matrix multiplication. There are many ways to be slower.

Compared implementations:

import pprint
import numpy
from numpy.testing import assert_almost_equal
import matplotlib.pyplot as plt
from pandas import DataFrame, concat
from teachcompute.validation.cython.mul_cython_omp import dmul_cython_omp
from teachcompute.ext_test_case import measure_time_dim, unit_test_going

dfs = []
if unit_test_going():
    sets = [2, 4, 6]
else:
    sets = list(range(2, 145, 20))

numpy mul

ctxs = [
    dict(
        va=numpy.random.randn(n, n).astype(numpy.float64),
        vb=numpy.random.randn(n, n).astype(numpy.float64),
        mul=lambda x, y: x @ y,
        x_name=n,
    )
    for n in sets
]

res = list(measure_time_dim("mul(va, vb)", ctxs, verbose=1))
dfs.append(DataFrame(res))
dfs[-1]["fct"] = "numpy"
pprint.pprint(dfs[-1].tail(n=2))
  0%|          | 0/8 [00:00<?, ?it/s]
100%|██████████| 8/8 [00:00<00:00, 46.31it/s]
100%|██████████| 8/8 [00:00<00:00, 46.21it/s]
    average  deviation  min_exec  max_exec  repeat  number     ttime  context_size  warmup_time  x_name    fct
6  0.000053   0.000018  0.000036  0.000084      10      50  0.000531           184     0.000161     122  numpy
7  0.000134   0.000044  0.000104  0.000257      10      50  0.001341           184     0.006012     142  numpy

Simple multiplication

ctxs = [
    dict(
        va=numpy.random.randn(n, n).astype(numpy.float64),
        vb=numpy.random.randn(n, n).astype(numpy.float64),
        mul=dmul_cython_omp,
        x_name=n,
    )
    for n in sets
]

res = list(measure_time_dim("mul(va, vb)", ctxs, verbose=1))
pprint.pprint(res[-1])
  0%|          | 0/8 [00:00<?, ?it/s]
 62%|██████▎   | 5/8 [00:00<00:00, 23.48it/s]
100%|██████████| 8/8 [00:01<00:00,  3.68it/s]
100%|██████████| 8/8 [00:01<00:00,  4.37it/s]
{'average': np.float64(0.0015088314280001214),
 'context_size': 184,
 'deviation': np.float64(6.891646911459498e-05),
 'max_exec': np.float64(0.0016632056000025841),
 'min_exec': np.float64(0.001450596519998726),
 'number': 50,
 'repeat': 10,
 'ttime': np.float64(0.015088314280001213),
 'warmup_time': 0.0014077939999879163,
 'x_name': 142}

Other scenarios

3 differents algorithms, each of them parallelized. See dmul_cython_omp.

for algo in range(2):
    for parallel in (0, 1):
        print("algo=%d parallel=%d" % (algo, parallel))
        ctxs = [
            dict(
                va=numpy.random.randn(n, n).astype(numpy.float64),
                vb=numpy.random.randn(n, n).astype(numpy.float64),
                mul=lambda x, y, algo=algo, parallel=parallel: dmul_cython_omp(
                    x, y, algo=algo, parallel=parallel
                ),
                x_name=n,
            )
            for n in sets
        ]

        res = list(measure_time_dim("mul(va, vb)", ctxs, verbose=1))
        dfs.append(DataFrame(res))
        dfs[-1]["fct"] = "a=%d-p=%d" % (algo, parallel)
        pprint.pprint(dfs[-1].tail(n=2))
algo=0 parallel=0

  0%|          | 0/8 [00:00<?, ?it/s]
 62%|██████▎   | 5/8 [00:00<00:00, 23.07it/s]
100%|██████████| 8/8 [00:01<00:00,  3.80it/s]
100%|██████████| 8/8 [00:01<00:00,  4.50it/s]
    average  deviation  min_exec  max_exec  repeat  number     ttime  context_size  warmup_time  x_name      fct
6  0.000954   0.000181  0.000849  0.001461      10      50  0.009538           184     0.000969     122  a=0-p=0
7  0.001685   0.000330  0.001438  0.002535      10      50  0.016847           184     0.002133     142  a=0-p=0
algo=0 parallel=1

  0%|          | 0/8 [00:00<?, ?it/s]
 12%|█▎        | 1/8 [00:00<00:04,  1.70it/s]
 25%|██▌       | 2/8 [00:01<00:03,  1.74it/s]
 38%|███▊      | 3/8 [00:01<00:03,  1.50it/s]
 50%|█████     | 4/8 [00:02<00:02,  1.39it/s]
 62%|██████▎   | 5/8 [00:03<00:02,  1.24it/s]
 75%|███████▌  | 6/8 [00:04<00:01,  1.07it/s]
 88%|████████▊ | 7/8 [00:05<00:00,  1.07it/s]
100%|██████████| 8/8 [00:06<00:00,  1.09it/s]
100%|██████████| 8/8 [00:06<00:00,  1.20it/s]
    average  deviation  min_exec  max_exec  repeat  number     ttime  context_size  warmup_time  x_name      fct
6  0.001846   0.000424  0.001267  0.002829      10      50  0.018456           184     0.004603     122  a=0-p=1
7  0.001751   0.000484  0.001182  0.002715      10      50  0.017514           184     0.001130     142  a=0-p=1
algo=1 parallel=0

  0%|          | 0/8 [00:00<?, ?it/s]
 62%|██████▎   | 5/8 [00:00<00:00, 26.17it/s]
100%|██████████| 8/8 [00:01<00:00,  3.86it/s]
100%|██████████| 8/8 [00:01<00:00,  4.59it/s]
   average  deviation  min_exec  max_exec  repeat  number     ttime  context_size  warmup_time  x_name      fct
6  0.00105   0.000390  0.000848  0.002208      10      50  0.010503           184     0.006923     122  a=1-p=0
7  0.00153   0.000112  0.001431  0.001803      10      50  0.015296           184     0.001477     142  a=1-p=0
algo=1 parallel=1

  0%|          | 0/8 [00:00<?, ?it/s]
 12%|█▎        | 1/8 [00:00<00:06,  1.12it/s]
 25%|██▌       | 2/8 [00:01<00:05,  1.03it/s]
 38%|███▊      | 3/8 [00:02<00:04,  1.04it/s]
 50%|█████     | 4/8 [00:03<00:04,  1.01s/it]
 62%|██████▎   | 5/8 [00:05<00:03,  1.10s/it]
 75%|███████▌  | 6/8 [00:06<00:02,  1.13s/it]
 88%|████████▊ | 7/8 [00:07<00:01,  1.09s/it]
100%|██████████| 8/8 [00:08<00:00,  1.10s/it]
100%|██████████| 8/8 [00:08<00:00,  1.07s/it]
    average  deviation  min_exec  max_exec  repeat  number     ttime  context_size  warmup_time  x_name      fct
6  0.002010   0.000566  0.001093  0.003070      10      50  0.020097           184     0.003395     122  a=1-p=1
7  0.002242   0.000596  0.001131  0.002956      10      50  0.022424           184     0.001155     142  a=1-p=1

One left issue

Will you find it in dmul_cython_omp.

va = numpy.random.randn(3, 4).astype(numpy.float64)
vb = numpy.random.randn(4, 5).astype(numpy.float64)
numpy_mul = va @ vb

try:
    for a in range(50):
        wrong_mul = dmul_cython_omp(va, vb, algo=2, parallel=1)
        assert_almost_equal(numpy_mul, wrong_mul)
        print("Iteration %d is Ok" % a)
    print("All iterations are unexpectedly Ok. Don't push your luck.")
except AssertionError as e:
    print(e)
Arrays are not almost equal to 7 decimals

Mismatched elements: 3 / 15 (20%)
Max absolute difference among violations: 1.5377744
Max relative difference among violations: inf
 ACTUAL: array([[-0.1116808,  2.404237 ,  0.9328312, -1.5881412, -0.4303665],
       [-5.541893 , -1.8682949, -1.3837282,  0.9456719, -0.8593078],
       [-0.1463528,  3.4096553, -0.1803634,  1.1345345,  1.5377744]])
 DESIRED: array([[-0.1116808,  2.404237 ,  0.9328312, -1.5881412,  0.       ],
       [-5.541893 , -1.8682949, -1.3837282,  0.9456719,  0.       ],
       [-0.1463528,  3.4096553, -0.1803634,  1.1345345,  0.       ]])

Other scenarios but transposed

Same differents algorithms but the second matrix is transposed first: b_trans=1.

for algo in range(2):
    for parallel in (0, 1):
        print("algo=%d parallel=%d transposed" % (algo, parallel))
        ctxs = [
            dict(
                va=numpy.random.randn(n, n).astype(numpy.float64),
                vb=numpy.random.randn(n, n).astype(numpy.float64),
                mul=lambda x, y, parallel=parallel, algo=algo: dmul_cython_omp(
                    x, y, algo=algo, parallel=parallel, b_trans=1
                ),
                x_name=n,
            )
            for n in sets
        ]

        res = list(measure_time_dim("mul(va, vb)", ctxs, verbose=2))
        dfs.append(DataFrame(res))
        dfs[-1]["fct"] = "a=%d-p=%d-T" % (algo, parallel)
        pprint.pprint(dfs[-1].tail(n=2))
algo=0 parallel=0 transposed

  0%|          | 0/8 [00:00<?, ?it/s]
 62%|██████▎   | 5/8 [00:00<00:00, 36.12it/s]
100%|██████████| 8/8 [00:01<00:00,  7.76it/s]
    average  deviation  min_exec  max_exec  repeat  number     ttime  context_size  warmup_time  x_name        fct
6  0.000605   0.000108  0.000468  0.000833      10      50  0.006055           184     0.000539     122  a=0-p=0-T
7  0.000840   0.000111  0.000687  0.001031      10      50  0.008404           184     0.000729     142  a=0-p=0-T
algo=0 parallel=1 transposed

  0%|          | 0/8 [00:00<?, ?it/s]
 75%|███████▌  | 6/8 [00:00<00:00, 34.06it/s]
100%|██████████| 8/8 [00:00<00:00, 21.80it/s]
    average  deviation  min_exec  max_exec  repeat  number     ttime  context_size  warmup_time  x_name        fct
6  0.000133   0.000009  0.000126  0.000158      10      50  0.001334           184     0.000165     122  a=0-p=1-T
7  0.000245   0.000008  0.000226  0.000259      10      50  0.002445           184     0.000246     142  a=0-p=1-T
algo=1 parallel=0 transposed

  0%|          | 0/8 [00:00<?, ?it/s]
 62%|██████▎   | 5/8 [00:00<00:00, 33.27it/s]
100%|██████████| 8/8 [00:01<00:00,  7.28it/s]
    average  deviation  min_exec  max_exec  repeat  number     ttime  context_size  warmup_time  x_name        fct
6  0.000690   0.000154  0.000501  0.000953      10      50  0.006898           184     0.001148     122  a=1-p=0-T
7  0.000767   0.000073  0.000693  0.000916      10      50  0.007674           184     0.000701     142  a=1-p=0-T
algo=1 parallel=1 transposed

  0%|          | 0/8 [00:00<?, ?it/s]
 62%|██████▎   | 5/8 [00:00<00:00, 38.43it/s]
100%|██████████| 8/8 [00:00<00:00, 19.67it/s]
    average  deviation  min_exec  max_exec  repeat  number     ttime  context_size  warmup_time  x_name        fct
6  0.000137   0.000006  0.000129  0.000146      10      50  0.001373           184     0.000161     122  a=1-p=1-T
7  0.000215   0.000019  0.000189  0.000246      10      50  0.002150           184     0.000207     142  a=1-p=1-T

Let’s display the results

cc = concat(dfs)
cc["N"] = cc["x_name"]

fig, ax = plt.subplots(3, 2, figsize=(10, 8), sharex=True, sharey=True)
ccnp = cc.fct == "numpy"
cct = cc.fct.str.contains("-T")
cca0 = cc.fct.str.contains("a=0")
cc[ccnp | (~cct & cca0)].pivot(index="N", columns="fct", values="average").plot(
    logy=True, logx=True, ax=ax[0, 0]
)
cc[ccnp | (~cct & ~cca0)].pivot(index="N", columns="fct", values="average").plot(
    logy=True, logx=True, ax=ax[0, 1]
)
cc[ccnp | (cct & cca0)].pivot(index="N", columns="fct", values="average").plot(
    logy=True, logx=True, ax=ax[1, 0]
)
cc[ccnp | (~cct & ~cca0)].pivot(index="N", columns="fct", values="average").plot(
    logy=True, logx=True, ax=ax[1, 1]
)
cc[ccnp | cca0].pivot(index="N", columns="fct", values="average").plot(
    logy=True, logx=True, ax=ax[2, 0]
)
cc[ccnp | ~cca0].pivot(index="N", columns="fct", values="average").plot(
    logy=True, logx=True, ax=ax[2, 1]
)
fig.suptitle("Comparison of matrix multiplication implementations")
Comparison of matrix multiplication implementations
Text(0.5, 0.98, 'Comparison of matrix multiplication implementations')

The results depends on the machine, its number of cores, the compilation settings of numpy or this module.

Total running time of the script: (0 minutes 24.732 seconds)

Gallery generated by Sphinx-Gallery