Measuring CUDA performance with a vector sum

The objective is to measure the time needed to sum all the elements of a tensor.

nsys profile python _doc/examples/plot_bench_cuda_vector_sum.py

Vector Sum

from tqdm import tqdm
import numpy
import matplotlib.pyplot as plt
from pandas import DataFrame
from teachcompute.ext_test_case import measure_time, unit_test_going
import torch

has_cuda = torch.cuda.is_available()

try:
    from teachcompute.validation.cuda.cuda_example_py import (
        vector_sum0,
        vector_sum_atomic,
        vector_sum6,
    )
except ImportError:
    has_cuda = False


def wrap_cuda_call(f, values):
    """Call ``f(values)`` inside a named NVTX range.

    The range label contains ``f.__name__`` and ``values.size`` so that each
    call can be told apart in a profiler trace.

    :param f: callable taking *values* as its only argument
    :param values: object exposing a ``size`` attribute
        (a numpy array in this script)
    :return: whatever ``f(values)`` returns
    """
    torch.cuda.nvtx.range_push(f"CUDA f={f.__name__} dim={values.size}")
    try:
        return f(values)
    finally:
        # Close the range even when f raises, otherwise the push/pop pairs
        # become unbalanced for every following call.
        torch.cuda.nvtx.range_pop()


# Benchmark every available implementation on vectors of increasing size and
# collect one observation (a dict) per (dimension, implementation) pair.
obs = []
dims = [2**10, 2**15, 2**20, 2**25, 2**28]
if unit_test_going():
    # Tiny dimensions keep the example fast when run as a unit test.
    dims = [10, 20, 30]
for dim in tqdm(dims):
    values = numpy.ones((dim,), dtype=numpy.float32)
    # Reference result, computed once per dimension instead of once per
    # CUDA implementation.
    expected = values.sum()

    if has_cuda:
        for f in [vector_sum0, vector_sum_atomic, vector_sum6]:
            # The atomic version is not measured on the largest vectors
            # (presumably because it is too slow -- TODO confirm).
            if f is vector_sum_atomic and dim > 2**20:
                continue
            diff = numpy.abs(wrap_cuda_call(f, values) - expected)
            res = measure_time(lambda: wrap_cuda_call(f, values), max_time=0.5)

            obs.append(
                dict(
                    dim=dim,
                    size=values.size,
                    time=res["average"],
                    fct=f"CUDA-{f.__name__}",
                    time_per_element=res["average"] / dim,
                    diff=diff,
                )
            )

    # numpy is the baseline: its error is 0 by definition since it also
    # provides the reference result.
    res = measure_time(lambda: values.sum(), max_time=0.5)

    obs.append(
        dict(
            dim=dim,
            size=values.size,
            time=res["average"],
            fct="numpy",
            time_per_element=res["average"] / dim,
            diff=0,
        )
    )


# One row per dimension, one column per implementation.
df = DataFrame(obs)
piv = df.pivot(index="dim", columns="fct", values="time_per_element")
print(piv)
  0%|          | 0/5 [00:00<?, ?it/s]
 20%|██        | 1/5 [00:02<00:10,  2.52s/it]
 40%|████      | 2/5 [00:04<00:07,  2.45s/it]
 60%|██████    | 3/5 [00:11<00:08,  4.25s/it]
 80%|████████  | 4/5 [00:14<00:03,  3.95s/it]
100%|██████████| 5/5 [00:31<00:00,  8.37s/it]
100%|██████████| 5/5 [00:31<00:00,  6.20s/it]
fct        CUDA-vector_sum0  CUDA-vector_sum6  CUDA-vector_sum_atomic         numpy
dim
1024           1.356106e-06      9.753873e-07                0.000003  1.531289e-09
32768          6.525890e-08      5.338412e-08                0.000002  3.312474e-10
1048576        1.191298e-08      9.133097e-09                0.000001  4.214887e-10
33554432       1.027293e-08      6.433465e-09                     NaN  3.360749e-10
268435456      9.087000e-09      6.409552e-09                     NaN  4.604212e-10

Plots

# Reshape the observations: one row per dimension, one column per function.
piv_diff = df.pivot(index="dim", columns="fct", values="diff")
piv_time = df.pivot(index="dim", columns="fct", values="time")

# Three side-by-side charts: time per element, summation error, total time.
fig, ax = plt.subplots(1, 3, figsize=(12, 6))
piv.plot(ax=ax[0], logx=True, title="Comparison between two summation")
# The errors can be exactly zero (the numpy column always is), and a plain
# log scale cannot display them: matplotlib then warns that the data has no
# positive values. A symmetric log scale accepts zeros.
piv_diff.plot(ax=ax[1], logx=True, logy="sym", title="Summation errors")
piv_time.plot(ax=ax[2], logx=True, logy=True, title="Total time")
fig.tight_layout()
fig.savefig("plot_bench_cuda_vector_sum.png")
Comparison between two summation, Summation errors, Total time
/home/xadupre/.local/lib/python3.10/site-packages/pandas/plotting/_matplotlib/core.py:822: UserWarning: Data has no positive values, and therefore cannot be log-scaled.
  labels = axis.get_majorticklabels() + axis.get_minorticklabels()

CUDA seems very slow, but in fact all the time is spent moving the data from the CPU memory (host) to the GPU memory (device).

../_images/nsight_vector_sum.png

Total running time of the script: (0 minutes 33.883 seconds)

Gallery generated by Sphinx-Gallery