Compares matrix multiplication implementations

numpy has a very fast implementation of matrix multiplication. There are many ways to be slower.

Compared implementations:

import pprint
import numpy
from numpy.testing import assert_almost_equal
import matplotlib.pyplot as plt
from pandas import DataFrame, concat
from teachcompute.validation.cython.mul_cython_omp import dmul_cython_omp
from teachcompute.ext_test_case import measure_time_dim, unit_test_going

dfs = []
if unit_test_going():
    sets = [2, 4, 6]
else:
    sets = list(range(2, 145, 20))

numpy mul

ctxs = [
    dict(
        va=numpy.random.randn(n, n).astype(numpy.float64),
        vb=numpy.random.randn(n, n).astype(numpy.float64),
        mul=lambda x, y: x @ y,
        x_name=n,
    )
    for n in sets
]

res = list(measure_time_dim("mul(va, vb)", ctxs, verbose=1))
dfs.append(DataFrame(res))
dfs[-1]["fct"] = "numpy"
pprint.pprint(dfs[-1].tail(n=2))
  0%|          | 0/8 [00:00<?, ?it/s]
 88%|████████▊ | 7/8 [00:00<00:00, 30.26it/s]
100%|██████████| 8/8 [00:00<00:00, 21.91it/s]
    average  deviation  min_exec  ...  warmup_time  x_name    fct
6  0.000247   0.000280  0.000071  ...     0.020874     122  numpy
7  0.000258   0.000267  0.000086  ...     0.003845     142  numpy

[2 rows x 11 columns]

Simple multiplication

ctxs = [
    dict(
        va=numpy.random.randn(n, n).astype(numpy.float64),
        vb=numpy.random.randn(n, n).astype(numpy.float64),
        mul=dmul_cython_omp,
        x_name=n,
    )
    for n in sets
]

res = list(measure_time_dim("mul(va, vb)", ctxs, verbose=1))
pprint.pprint(res[-1])
  0%|          | 0/8 [00:00<?, ?it/s]
 50%|█████     | 4/8 [00:00<00:00, 24.07it/s]
 88%|████████▊ | 7/8 [00:01<00:00,  3.55it/s]
100%|██████████| 8/8 [00:02<00:00,  2.73it/s]
{'average': np.float64(0.002499470811992069),
 'context_size': 232,
 'deviation': np.float64(0.00029008705400920576),
 'max_exec': np.float64(0.0033559929399052633),
 'min_exec': np.float64(0.002329104199889116),
 'number': 50,
 'repeat': 10,
 'ttime': np.float64(0.02499470811992069),
 'warmup_time': 0.002028964998316951,
 'x_name': 142}

Other scenarios

3 differents algorithms, each of them parallelized. See dmul_cython_omp.

for algo in range(2):
    for parallel in (0, 1):
        print("algo=%d parallel=%d" % (algo, parallel))
        ctxs = [
            dict(
                va=numpy.random.randn(n, n).astype(numpy.float64),
                vb=numpy.random.randn(n, n).astype(numpy.float64),
                mul=lambda x, y, algo=algo, parallel=parallel: dmul_cython_omp(
                    x, y, algo=algo, parallel=parallel
                ),
                x_name=n,
            )
            for n in sets
        ]

        res = list(measure_time_dim("mul(va, vb)", ctxs, verbose=1))
        dfs.append(DataFrame(res))
        dfs[-1]["fct"] = "a=%d-p=%d" % (algo, parallel)
        pprint.pprint(dfs[-1].tail(n=2))
algo=0 parallel=0

  0%|          | 0/8 [00:00<?, ?it/s]
 50%|█████     | 4/8 [00:00<00:00, 23.34it/s]
 88%|████████▊ | 7/8 [00:01<00:00,  3.64it/s]
100%|██████████| 8/8 [00:02<00:00,  2.79it/s]
    average  deviation  min_exec  ...  warmup_time  x_name      fct
6  0.001536   0.000053  0.001451  ...     0.001489     122  a=0-p=0
7  0.002442   0.000105  0.002280  ...     0.002643     142  a=0-p=0

[2 rows x 11 columns]
algo=0 parallel=1

  0%|          | 0/8 [00:00<?, ?it/s]
 12%|█▎        | 1/8 [00:00<00:04,  1.49it/s]
 25%|██▌       | 2/8 [00:01<00:03,  1.87it/s]
 38%|███▊      | 3/8 [00:01<00:02,  1.93it/s]
 50%|█████     | 4/8 [00:02<00:02,  1.93it/s]
 62%|██████▎   | 5/8 [00:02<00:01,  2.08it/s]
 75%|███████▌  | 6/8 [00:02<00:00,  2.20it/s]
 88%|████████▊ | 7/8 [00:03<00:00,  2.15it/s]
100%|██████████| 8/8 [00:03<00:00,  2.02it/s]
100%|██████████| 8/8 [00:03<00:00,  2.01it/s]
    average  deviation  min_exec  ...  warmup_time  x_name      fct
6  0.000967   0.000090  0.000816  ...     0.001222     122  a=0-p=1
7  0.001111   0.000097  0.000982  ...     0.000898     142  a=0-p=1

[2 rows x 11 columns]
algo=1 parallel=0

  0%|          | 0/8 [00:00<?, ?it/s]
 62%|██████▎   | 5/8 [00:00<00:00, 20.41it/s]
100%|██████████| 8/8 [00:02<00:00,  2.86it/s]
100%|██████████| 8/8 [00:02<00:00,  3.41it/s]
    average  deviation  min_exec  ...  warmup_time  x_name      fct
6  0.001274   0.000061  0.001173  ...     0.001101     122  a=1-p=0
7  0.002212   0.000125  0.002033  ...     0.002472     142  a=1-p=0

[2 rows x 11 columns]
algo=1 parallel=1

  0%|          | 0/8 [00:00<?, ?it/s]
 12%|█▎        | 1/8 [00:00<00:04,  1.46it/s]
 25%|██▌       | 2/8 [00:01<00:03,  1.78it/s]
 38%|███▊      | 3/8 [00:01<00:02,  2.16it/s]
 50%|█████     | 4/8 [00:01<00:01,  2.29it/s]
 62%|██████▎   | 5/8 [00:02<00:01,  2.17it/s]
 75%|███████▌  | 6/8 [00:02<00:00,  2.18it/s]
 88%|████████▊ | 7/8 [00:03<00:00,  2.01it/s]
100%|██████████| 8/8 [00:04<00:00,  1.86it/s]
100%|██████████| 8/8 [00:04<00:00,  1.97it/s]
    average  deviation  min_exec  ...  warmup_time  x_name      fct
6  0.001159   0.000264  0.000957  ...     0.001201     122  a=1-p=1
7  0.001234   0.000092  0.001110  ...     0.003584     142  a=1-p=1

[2 rows x 11 columns]

One left issue

Will you find it in dmul_cython_omp.

va = numpy.random.randn(3, 4).astype(numpy.float64)
vb = numpy.random.randn(4, 5).astype(numpy.float64)
numpy_mul = va @ vb

try:
    for a in range(50):
        wrong_mul = dmul_cython_omp(va, vb, algo=2, parallel=1)
        assert_almost_equal(numpy_mul, wrong_mul)
        print("Iteration %d is Ok" % a)
    print("All iterations are unexpectedly Ok. Don't push your luck.")
except AssertionError as e:
    print(e)
Arrays are not almost equal to 7 decimals

Mismatched elements: 3 / 15 (20%)
Max absolute difference among violations: 2.88258814
Max relative difference among violations: inf
 ACTUAL: array([[-5.1077327,  4.1230599,  3.3204918, -1.1129511,  2.8825881],
       [ 0.6988875,  0.8754879, -2.8989605,  1.2284433,  0.4185662],
       [-1.7983699,  3.1489788, -1.1362253, -0.509742 ,  2.7538266]])
 DESIRED: array([[-5.1077327,  4.1230599,  3.3204918, -1.1129511,  0.       ],
       [ 0.6988875,  0.8754879, -2.8989605,  1.2284433,  0.       ],
       [-1.7983699,  3.1489788, -1.1362253, -0.509742 ,  0.       ]])

Other scenarios but transposed

Same differents algorithms but the second matrix is transposed first: b_trans=1.

for algo in range(2):
    for parallel in (0, 1):
        print("algo=%d parallel=%d transposed" % (algo, parallel))
        ctxs = [
            dict(
                va=numpy.random.randn(n, n).astype(numpy.float64),
                vb=numpy.random.randn(n, n).astype(numpy.float64),
                mul=lambda x, y, parallel=parallel, algo=algo: dmul_cython_omp(
                    x, y, algo=algo, parallel=parallel, b_trans=1
                ),
                x_name=n,
            )
            for n in sets
        ]

        res = list(measure_time_dim("mul(va, vb)", ctxs, verbose=2))
        dfs.append(DataFrame(res))
        dfs[-1]["fct"] = "a=%d-p=%d-T" % (algo, parallel)
        pprint.pprint(dfs[-1].tail(n=2))
algo=0 parallel=0 transposed

  0%|          | 0/8 [00:00<?, ?it/s]
 62%|██████▎   | 5/8 [00:00<00:00, 28.42it/s]
100%|██████████| 8/8 [00:01<00:00,  5.73it/s]
100%|██████████| 8/8 [00:01<00:00,  6.73it/s]
    average  deviation  min_exec  ...  warmup_time  x_name        fct
6  0.000641   0.000067  0.000534  ...     0.000651     122  a=0-p=0-T
7  0.001041   0.000072  0.000943  ...     0.000847     142  a=0-p=0-T

[2 rows x 11 columns]
algo=0 parallel=1 transposed

  0%|          | 0/8 [00:00<?, ?it/s]
 12%|█▎        | 1/8 [00:00<00:01,  4.99it/s]
 75%|███████▌  | 6/8 [00:00<00:00, 18.80it/s]
100%|██████████| 8/8 [00:00<00:00, 13.51it/s]
    average  deviation  min_exec  ...  warmup_time  x_name        fct
6  0.000184   0.000037  0.000150  ...     0.000168     122  a=0-p=1-T
7  0.000269   0.000047  0.000217  ...     0.000285     142  a=0-p=1-T

[2 rows x 11 columns]
algo=1 parallel=0 transposed

  0%|          | 0/8 [00:00<?, ?it/s]
 62%|██████▎   | 5/8 [00:00<00:00, 36.05it/s]
100%|██████████| 8/8 [00:01<00:00,  7.00it/s]
    average  deviation  min_exec  ...  warmup_time  x_name        fct
6  0.000640   0.000077  0.000561  ...     0.000521     122  a=1-p=0-T
7  0.000983   0.000077  0.000863  ...     0.001273     142  a=1-p=0-T

[2 rows x 11 columns]
algo=1 parallel=1 transposed

  0%|          | 0/8 [00:00<?, ?it/s]
 12%|█▎        | 1/8 [00:00<00:01,  4.46it/s]
 75%|███████▌  | 6/8 [00:00<00:00, 17.10it/s]
100%|██████████| 8/8 [00:00<00:00, 12.97it/s]
100%|██████████| 8/8 [00:00<00:00, 12.71it/s]
    average  deviation  min_exec  ...  warmup_time  x_name        fct
6  0.000183   0.000028  0.000155  ...     0.000191     122  a=1-p=1-T
7  0.000270   0.000034  0.000224  ...     0.000330     142  a=1-p=1-T

[2 rows x 11 columns]

Let’s display the results

cc = concat(dfs)
cc["N"] = cc["x_name"]

fig, ax = plt.subplots(3, 2, figsize=(10, 8), sharex=True, sharey=True)
ccnp = cc.fct == "numpy"
cct = cc.fct.str.contains("-T")
cca0 = cc.fct.str.contains("a=0")
cc[ccnp | (~cct & cca0)].pivot(index="N", columns="fct", values="average").plot(
    logy=True, logx=True, ax=ax[0, 0]
)
cc[ccnp | (~cct & ~cca0)].pivot(index="N", columns="fct", values="average").plot(
    logy=True, logx=True, ax=ax[0, 1]
)
cc[ccnp | (cct & cca0)].pivot(index="N", columns="fct", values="average").plot(
    logy=True, logx=True, ax=ax[1, 0]
)
cc[ccnp | (~cct & ~cca0)].pivot(index="N", columns="fct", values="average").plot(
    logy=True, logx=True, ax=ax[1, 1]
)
cc[ccnp | cca0].pivot(index="N", columns="fct", values="average").plot(
    logy=True, logx=True, ax=ax[2, 0]
)
cc[ccnp | ~cca0].pivot(index="N", columns="fct", values="average").plot(
    logy=True, logx=True, ax=ax[2, 1]
)
fig.suptitle("Comparison of matrix multiplication implementations")
Comparison of matrix multiplication implementations
Text(0.5, 0.98, 'Comparison of matrix multiplication implementations')

The results depends on the machine, its number of cores, the compilation settings of numpy or this module.

Total running time of the script: (0 minutes 22.283 seconds)

Gallery generated by Sphinx-Gallery