Measuring CPU performance with a parallelized vector sum and AVX

The example compares the time spent computing the sum of all coefficients of a matrix when the function walks through the coefficients by rows or by columns, and when the computation is parallelized or uses AVX instructions.

Vector Sum

from tqdm import tqdm
import numpy
import matplotlib.pyplot as plt
from pandas import DataFrame
from teachcompute.ext_test_case import measure_time, unit_test_going
from teachcompute.validation.cpu._validation import (
    vector_sum_array as vector_sum,
    vector_sum_array_parallel as vector_sum_parallel,
    vector_sum_array_avx as vector_sum_avx,
    vector_sum_array_avx_parallel as vector_sum_avx_parallel,
)

# Benchmark the four summation implementations on square matrices of
# increasing size. One observation (a dict) is recorded per
# (dimension, implementation) pair.
obs = []
dims = [500, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600, 1700, 1800, 2000]
if unit_test_going():
    # Keep the run short when the script is executed by the unit tests.
    dims = [10, 20]

for dim in tqdm(dims):
    # A flattened dim x dim matrix of ones: the exact sum is dim**2.
    values = numpy.ones((dim, dim), dtype=numpy.float32).ravel()
    expected = dim**2

    # (label, zero-argument callable) for every implementation to compare.
    # The lambdas are built and consumed inside the same iteration, so they
    # always see the current `dim` and `values`.
    # NOTE(review): the third argument of vector_sum / vector_sum_parallel is
    # presumably a "by rows" flag (the label is "rows") -- confirm in the
    # extension's documentation.
    candidates = [
        ("rows", lambda: vector_sum(dim, values, True)),
        ("rows//", lambda: vector_sum_parallel(dim, values, True)),
        ("avx", lambda: vector_sum_avx(dim, values)),
        ("avx//", lambda: vector_sum_avx_parallel(dim, values)),
    ]

    for direction, compute in candidates:
        # The error is measured for *this* implementation. The previous
        # version reused the sequential result for the "rows//" entry.
        diff = abs(compute() - expected)
        res = measure_time(compute, max_time=0.5)

        obs.append(
            dict(
                dim=dim,
                size=values.size,
                time=res["average"],
                direction=direction,
                time_per_element=res["average"] / dim**2,
                diff=diff,
            )
        )


# One row per dimension, one column per implementation.
df = DataFrame(obs)
piv = df.pivot(index="dim", columns="direction", values="time_per_element")
print(piv)
  0%|          | 0/14 [00:00<?, ?it/s]
  7%|▋         | 1/14 [00:18<03:59, 18.44s/it]
 14%|█▍        | 2/14 [00:32<03:10, 15.86s/it]
 21%|██▏       | 3/14 [00:45<02:42, 14.74s/it]
 29%|██▊       | 4/14 [00:54<02:03, 12.39s/it]
 36%|███▌      | 5/14 [00:58<01:22,  9.12s/it]
 43%|████▎     | 6/14 [01:00<00:54,  6.77s/it]
 50%|█████     | 7/14 [01:04<00:42,  6.00s/it]
 57%|█████▋    | 8/14 [01:10<00:36,  6.00s/it]
 64%|██████▍   | 9/14 [01:13<00:25,  5.09s/it]
 71%|███████▏  | 10/14 [01:18<00:19,  4.91s/it]
 79%|███████▊  | 11/14 [01:22<00:13,  4.62s/it]
 86%|████████▌ | 12/14 [01:25<00:08,  4.17s/it]
 93%|█████████▎| 13/14 [01:28<00:03,  3.81s/it]
100%|██████████| 14/14 [01:31<00:00,  3.51s/it]
100%|██████████| 14/14 [01:31<00:00,  6.51s/it]
direction           avx         avx//          rows        rows//
dim
500        1.292148e-10  2.554969e-09  1.462529e-09  8.035897e-09
700        4.268824e-10  3.070183e-09  2.488519e-09  4.196956e-09
800        8.060550e-10  1.926704e-09  3.743734e-09  8.999025e-09
900        4.352112e-10  1.555108e-09  2.135214e-09  8.146449e-09
1000       6.925653e-10  1.184284e-09  1.997798e-09  4.239076e-09
1100       5.881012e-10  1.779311e-09  2.073639e-09  3.763300e-09
1200       6.775749e-10  1.418281e-09  2.415427e-09  3.531702e-09
1300       6.490562e-10  1.510975e-09  3.308405e-09  4.086904e-09
1400       4.259379e-10  1.474191e-09  1.565825e-09  2.193004e-09
1500       3.355112e-10  1.496973e-09  1.572115e-09  1.423634e-09
1600       3.394400e-10  1.247542e-09  1.975824e-09  1.779570e-09
1700       3.190017e-10  9.375555e-10  1.777310e-09  1.295866e-09
1800       3.503798e-10  8.778256e-10  1.777263e-09  1.493882e-09
2000       3.463488e-10  8.235314e-10  1.784440e-09  1.380858e-09

Plots

# Reshape the observations: one table per metric, indexed by dimension with
# one column per implementation.
piv_diff = df.pivot(index="dim", columns="direction", values="diff")
piv_time = df.pivot(index="dim", columns="direction", values="time")

# Three side-by-side panels: time per element, summation error, total time.
fig, ax = plt.subplots(1, 3, figsize=(12, 6))
panels = [
    (piv, dict(title="Comparison between two summation")),
    (piv_diff, dict(logy=True, title="Summation errors")),
    (piv_time, dict(logy=True, title="Total time")),
]
for axis, (table, options) in zip(ax, panels):
    # Every panel uses a logarithmic x axis; only the last two also use a
    # logarithmic y axis.
    table.plot(ax=axis, logx=True, **options)
fig.tight_layout()
fig.savefig("plot_bench_cpu_vector_sum_avx_parallel.png")
Comparison between two summation, Summation errors, Total time

AVX is faster.

Total running time of the script: (1 minutes 33.560 seconds)

Gallery generated by Sphinx-Gallery