Compares matrix multiplication implementations

numpy has a very fast implementation of matrix multiplication. There are many ways to be slower.

Compared implementations:

import pprint
import numpy
from numpy.testing import assert_almost_equal
import matplotlib.pyplot as plt
from pandas import DataFrame, concat
from teachcompute.validation.cython.mul_cython_omp import dmul_cython_omp
from teachcompute.ext_test_case import measure_time_dim, unit_test_going

dfs = []
if unit_test_going():
    sets = [2, 4, 6]
else:
    sets = list(range(2, 145, 20))

numpy mul

ctxs = [
    dict(
        va=numpy.random.randn(n, n).astype(numpy.float64),
        vb=numpy.random.randn(n, n).astype(numpy.float64),
        mul=lambda x, y: x @ y,
        x_name=n,
    )
    for n in sets
]

res = list(measure_time_dim("mul(va, vb)", ctxs, verbose=1))
dfs.append(DataFrame(res))
dfs[-1]["fct"] = "numpy"
pprint.pprint(dfs[-1].tail(n=2))
  0%|          | 0/8 [00:00<?, ?it/s]
 62%|██████▎   | 5/8 [00:00<00:00, 35.20it/s]
100%|██████████| 8/8 [00:04<00:00,  1.84it/s]
    average  deviation  min_exec  ...  warmup_time  x_name    fct
6  0.002392   0.002224  0.000062  ...     0.003768     122  numpy
7  0.003237   0.003153  0.000350  ...     0.014093     142  numpy

[2 rows x 11 columns]

Simple multiplication

ctxs = [
    dict(
        va=numpy.random.randn(n, n).astype(numpy.float64),
        vb=numpy.random.randn(n, n).astype(numpy.float64),
        mul=dmul_cython_omp,
        x_name=n,
    )
    for n in sets
]

res = list(measure_time_dim("mul(va, vb)", ctxs, verbose=1))
pprint.pprint(res[-1])
  0%|          | 0/8 [00:00<?, ?it/s]
 50%|█████     | 4/8 [00:00<00:00, 16.14it/s]
 75%|███████▌  | 6/8 [00:01<00:00,  4.28it/s]
 88%|████████▊ | 7/8 [00:02<00:00,  2.49it/s]
100%|██████████| 8/8 [00:03<00:00,  1.40it/s]
100%|██████████| 8/8 [00:03<00:00,  2.08it/s]
{'average': 0.003347606799999994,
 'context_size': 232,
 'deviation': 0.0001002722410258464,
 'max_exec': 0.003453190000000177,
 'min_exec': 0.0030643179999970014,
 'number': 50,
 'repeat': 10,
 'ttime': 0.03347606799999994,
 'warmup_time': 0.0032301000001098146,
 'x_name': 142}

Other scenarios

3 differents algorithms, each of them parallelized. See dmul_cython_omp.

for algo in range(0, 2):
    for parallel in (0, 1):
        print("algo=%d parallel=%d" % (algo, parallel))
        ctxs = [
            dict(
                va=numpy.random.randn(n, n).astype(numpy.float64),
                vb=numpy.random.randn(n, n).astype(numpy.float64),
                mul=lambda x, y: dmul_cython_omp(x, y, algo=algo, parallel=parallel),
                x_name=n,
            )
            for n in sets
        ]

        res = list(measure_time_dim("mul(va, vb)", ctxs, verbose=1))
        dfs.append(DataFrame(res))
        dfs[-1]["fct"] = "a=%d-p=%d" % (algo, parallel)
        pprint.pprint(dfs[-1].tail(n=2))
algo=0 parallel=0

  0%|          | 0/8 [00:00<?, ?it/s]
 50%|█████     | 4/8 [00:00<00:00, 24.74it/s]
 88%|████████▊ | 7/8 [00:02<00:00,  2.86it/s]
100%|██████████| 8/8 [00:03<00:00,  2.09it/s]
   average  deviation  min_exec  ...  warmup_time  x_name      fct
6  0.00201   0.000069  0.001909  ...     0.002264     122  a=0-p=0
7  0.00350   0.000275  0.003198  ...     0.003187     142  a=0-p=0

[2 rows x 11 columns]
algo=0 parallel=1

  0%|          | 0/8 [00:00<?, ?it/s]
 12%|█▎        | 1/8 [00:02<00:16,  2.33s/it]
 25%|██▌       | 2/8 [00:04<00:14,  2.45s/it]
 38%|███▊      | 3/8 [00:07<00:13,  2.67s/it]
 50%|█████     | 4/8 [00:11<00:12,  3.11s/it]
 62%|██████▎   | 5/8 [00:14<00:09,  3.18s/it]
 75%|███████▌  | 6/8 [00:17<00:06,  3.00s/it]
 88%|████████▊ | 7/8 [00:21<00:03,  3.17s/it]
100%|██████████| 8/8 [00:23<00:00,  2.80s/it]
100%|██████████| 8/8 [00:23<00:00,  2.88s/it]
    average  deviation  min_exec  ...  warmup_time  x_name      fct
6  0.007013   0.001674  0.003740  ...     0.006802     122  a=0-p=1
7  0.003995   0.002629  0.001525  ...     0.000819     142  a=0-p=1

[2 rows x 11 columns]
algo=1 parallel=0

  0%|          | 0/8 [00:00<?, ?it/s]
 50%|█████     | 4/8 [00:00<00:00, 27.83it/s]
 88%|████████▊ | 7/8 [00:01<00:00,  3.04it/s]
100%|██████████| 8/8 [00:03<00:00,  2.27it/s]
    average  deviation  min_exec  ...  warmup_time  x_name      fct
6  0.001901   0.000141  0.001733  ...     0.001777     122  a=1-p=0
7  0.003147   0.000174  0.002844  ...     0.002572     142  a=1-p=0

[2 rows x 11 columns]
algo=1 parallel=1

  0%|          | 0/8 [00:00<?, ?it/s]
 12%|█▎        | 1/8 [00:00<00:05,  1.37it/s]
 25%|██▌       | 2/8 [00:01<00:03,  1.63it/s]
 38%|███▊      | 3/8 [00:01<00:02,  1.95it/s]
 50%|█████     | 4/8 [00:02<00:01,  2.08it/s]
 62%|██████▎   | 5/8 [00:02<00:01,  1.96it/s]
 75%|███████▌  | 6/8 [00:03<00:01,  1.51it/s]
 88%|████████▊ | 7/8 [00:04<00:00,  1.47it/s]
100%|██████████| 8/8 [00:05<00:00,  1.34it/s]
100%|██████████| 8/8 [00:05<00:00,  1.54it/s]
    average  deviation  min_exec  ...  warmup_time  x_name      fct
6  0.001437   0.000166  0.001162  ...     0.000865     122  a=1-p=1
7  0.001775   0.000353  0.001520  ...     0.000963     142  a=1-p=1

[2 rows x 11 columns]

One left issue

Will you find it in dmul_cython_omp.

va = numpy.random.randn(3, 4).astype(numpy.float64)
vb = numpy.random.randn(4, 5).astype(numpy.float64)
numpy_mul = va @ vb

try:
    for a in range(0, 50):
        wrong_mul = dmul_cython_omp(va, vb, algo=2, parallel=1)
        assert_almost_equal(numpy_mul, wrong_mul)
        print("Iteration %d is Ok" % a)
    print("All iterations are unexpectedly Ok. Don't push your luck.")
except AssertionError as e:
    print(e)
Arrays are not almost equal to 7 decimals

Mismatched elements: 3 / 15 (20%)
Max absolute difference: 7.92277843
Max relative difference: 1.88696701e-16
 x: array([[ 0.7306257,  3.4077594, -1.9691805,  3.2485642, -1.4901333],
       [ 5.1295693,  0.8935477, -1.1767275, -2.6285682, -7.9227784],
       [-1.6442586, -1.434684 , -0.0978686, -0.6146182,  1.8069683]])
 y: array([[ 0.7306257,  3.4077594, -1.9691805,  3.2485642,  0.       ],
       [ 5.1295693,  0.8935477, -1.1767275, -2.6285682,  0.       ],
       [-1.6442586, -1.434684 , -0.0978686, -0.6146182,  0.       ]])

Other scenarios but transposed

Same differents algorithms but the second matrix is transposed first: b_trans=1.

for algo in range(0, 2):
    for parallel in (0, 1):
        print("algo=%d parallel=%d transposed" % (algo, parallel))
        ctxs = [
            dict(
                va=numpy.random.randn(n, n).astype(numpy.float64),
                vb=numpy.random.randn(n, n).astype(numpy.float64),
                mul=lambda x, y: dmul_cython_omp(
                    x, y, algo=algo, parallel=parallel, b_trans=1
                ),
                x_name=n,
            )
            for n in sets
        ]

        res = list(measure_time_dim("mul(va, vb)", ctxs, verbose=2))
        dfs.append(DataFrame(res))
        dfs[-1]["fct"] = "a=%d-p=%d-T" % (algo, parallel)
        pprint.pprint(dfs[-1].tail(n=2))
algo=0 parallel=0 transposed

  0%|          | 0/8 [00:00<?, ?it/s]
 62%|██████▎   | 5/8 [00:00<00:00, 21.14it/s]
100%|██████████| 8/8 [00:01<00:00,  3.86it/s]
100%|██████████| 8/8 [00:01<00:00,  4.56it/s]
    average  deviation  min_exec  ...  warmup_time  x_name        fct
6  0.000867   0.000089  0.000740  ...     0.001660     122  a=0-p=0-T
7  0.001602   0.000201  0.001322  ...     0.001921     142  a=0-p=0-T

[2 rows x 11 columns]
algo=0 parallel=1 transposed

  0%|          | 0/8 [00:00<?, ?it/s]
 25%|██▌       | 2/8 [00:00<00:01,  3.04it/s]
 38%|███▊      | 3/8 [00:01<00:01,  2.99it/s]
 75%|███████▌  | 6/8 [00:01<00:00,  5.54it/s]
 88%|████████▊ | 7/8 [00:01<00:00,  5.69it/s]
100%|██████████| 8/8 [00:01<00:00,  5.32it/s]
100%|██████████| 8/8 [00:01<00:00,  4.79it/s]
    average  deviation  min_exec  ...  warmup_time  x_name        fct
6  0.000316   0.000066  0.000196  ...     0.000197     122  a=0-p=1-T
7  0.000448   0.000082  0.000314  ...     0.000477     142  a=0-p=1-T

[2 rows x 11 columns]
algo=1 parallel=0 transposed

  0%|          | 0/8 [00:00<?, ?it/s]
 62%|██████▎   | 5/8 [00:00<00:00, 24.13it/s]
100%|██████████| 8/8 [00:01<00:00,  4.44it/s]
100%|██████████| 8/8 [00:01<00:00,  5.24it/s]
    average  deviation  min_exec  ...  warmup_time  x_name        fct
6  0.000846   0.000053  0.000787  ...     0.000754     122  a=1-p=0-T
7  0.001304   0.000049  0.001255  ...     0.001317     142  a=1-p=0-T

[2 rows x 11 columns]
algo=1 parallel=1 transposed

  0%|          | 0/8 [00:00<?, ?it/s]
 38%|███▊      | 3/8 [00:00<00:00,  7.59it/s]
 50%|█████     | 4/8 [00:02<00:02,  1.65it/s]
 62%|██████▎   | 5/8 [00:03<00:02,  1.19it/s]
 75%|███████▌  | 6/8 [00:03<00:01,  1.64it/s]
 88%|████████▊ | 7/8 [00:03<00:00,  2.04it/s]
100%|██████████| 8/8 [00:04<00:00,  2.25it/s]
100%|██████████| 8/8 [00:04<00:00,  2.00it/s]
    average  deviation  min_exec  ...  warmup_time  x_name        fct
6  0.000431   0.000132  0.000254  ...     0.000255     122  a=1-p=1-T
7  0.000672   0.000175  0.000408  ...     0.000373     142  a=1-p=1-T

[2 rows x 11 columns]

Let’s display the results

cc = concat(dfs)
cc["N"] = cc["x_name"]

fig, ax = plt.subplots(3, 2, figsize=(10, 8), sharex=True, sharey=True)
ccnp = cc.fct == "numpy"
cct = cc.fct.str.contains("-T")
cca0 = cc.fct.str.contains("a=0")
cc[ccnp | (~cct & cca0)].pivot(index="N", columns="fct", values="average").plot(
    logy=True, logx=True, ax=ax[0, 0]
)
cc[ccnp | (~cct & ~cca0)].pivot(index="N", columns="fct", values="average").plot(
    logy=True, logx=True, ax=ax[0, 1]
)
cc[ccnp | (cct & cca0)].pivot(index="N", columns="fct", values="average").plot(
    logy=True, logx=True, ax=ax[1, 0]
)
cc[ccnp | (~cct & ~cca0)].pivot(index="N", columns="fct", values="average").plot(
    logy=True, logx=True, ax=ax[1, 1]
)
cc[ccnp | cca0].pivot(index="N", columns="fct", values="average").plot(
    logy=True, logx=True, ax=ax[2, 0]
)
cc[ccnp | ~cca0].pivot(index="N", columns="fct", values="average").plot(
    logy=True, logx=True, ax=ax[2, 1]
)
fig.suptitle("Comparison of matrix multiplication implementations")
Comparison of matrix multiplication implementations
Text(0.5, 0.98, 'Comparison of matrix multiplication implementations')

The results depends on the machine, its number of cores, the compilation settings of numpy or this module.

Total running time of the script: (0 minutes 55.111 seconds)

Gallery generated by Sphinx-Gallery