Measuring CUDA performance with a vector addition¶
Measures and compares the execution time of two vector additions, one running with CUDA, the other with numpy. The script can be profiled with Nsight:
nsys profile python _doc/examples/plot_bench_cuda_vector_add.py
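Once the run is finished, nsys can print a summary of the collected data. A sketch, assuming a recent nsys that writes its report to report1.nsys-rep by default:

nsys stats report1.nsys-rep

The NVTX ranges pushed in cuda_vector_add below make the measured CUDA calls easy to locate in the Nsight timeline.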
Vector Add¶
from tqdm import tqdm
import numpy
import matplotlib.pyplot as plt
from pandas import DataFrame
from teachcompute.ext_test_case import measure_time, unit_test_going
import torch

has_cuda = torch.cuda.is_available()
try:
    from teachcompute.validation.cuda.cuda_example_py import vector_add
except ImportError:
    # The CUDA extension is not built; the CUDA measurements are skipped.
    has_cuda = False


def cuda_vector_add(values):
    # The NVTX range makes this call visible in the Nsight timeline.
    torch.cuda.nvtx.range_push(f"CUDA dim={values.size}")
    res = vector_add(values, values, 0)
    torch.cuda.nvtx.range_pop()
    return res


obs = []
dims = [2**10, 2**15, 2**20, 2**25]
if unit_test_going():
    dims = [10, 20, 30]
for dim in tqdm(dims):
    values = numpy.ones((dim,), dtype=numpy.float32).ravel()

    if has_cuda:
        # Check both implementations agree before timing anything.
        diff = numpy.abs(vector_add(values, values, 0) - (values + values)).max()
        res = measure_time(lambda values=values: cuda_vector_add(values), max_time=0.5)
        obs.append(
            dict(
                dim=dim,
                size=values.size,
                time=res["average"],
                fct="CUDA",
                time_per_element=res["average"] / dim,
                diff=diff,
            )
        )

    res = measure_time(lambda values=values: values + values, max_time=0.5)
    obs.append(
        dict(
            dim=dim,
            size=values.size,
            time=res["average"],
            fct="numpy",
            time_per_element=res["average"] / dim,
            diff=0,
        )
    )

df = DataFrame(obs)
piv = df.pivot(index="dim", columns="fct", values="time_per_element")
print(piv)
100%|██████████| 4/4 [00:05<00:00, 1.43s/it]
fct               CUDA         numpy
dim
1024      3.878022e-06  8.271555e-10
32768     9.170654e-08  3.417430e-10
1048576   9.097408e-09  3.034462e-10
33554432  3.975486e-09  8.811870e-10
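The numpy time per element stays roughly constant, while the CUDA time per element drops by three orders of magnitude as the vector grows: the fixed cost of every call (kernel launch and host/device copies) is amortized over more elements. A minimal sketch of that model, fitting time ≈ overhead + slope · dim on the CUDA rows of the dataframe computed above (an ordinary least-squares fit, so the largest sizes dominate it):

# Affine model time = overhead + slope * dim fitted on the CUDA rows.
# The intercept estimates the fixed per-call cost (launch + transfers).
cuda_rows = df[df.fct == "CUDA"]
if not cuda_rows.empty:
    slope, overhead = numpy.polyfit(cuda_rows["dim"], cuda_rows["time"], 1)
    print(f"estimated fixed overhead: {overhead:.3e} s")
    print(f"estimated marginal time per element: {slope:.3e} s")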
Plots¶
piv_diff = df.pivot(index="dim", columns="fct", values="diff")
piv_time = df.pivot(index="dim", columns="fct", values="time")
fig, ax = plt.subplots(1, 3, figsize=(12, 6))
piv.plot(ax=ax[0], logx=True, title="Comparison between two summations")
piv_diff.plot(ax=ax[1], logx=True, logy=True, title="Summation errors")
piv_time.plot(ax=ax[2], logx=True, logy=True, title="Total time")
fig.tight_layout()
fig.savefig("plot_bench_cuda_vector_add.png")

/home/xadupre/vv/this/lib/python3.10/site-packages/pandas/plotting/_matplotlib/core.py:822: UserWarning: Data has no positive values, and therefore cannot be log-scaled.
  labels = axis.get_majorticklabels() + axis.get_minorticklabels()

The warning concerns the middle plot: both implementations return identical results, so the error column contains only zeros and cannot be drawn on a logarithmic scale.
CUDA seems very slow, but in fact almost all of the time is spent moving the data from CPU memory (host) to GPU memory (device).
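To confirm this, one can time the addition once the data already resides on the GPU. A minimal sketch, assuming torch was installed with CUDA support; it times torch's own addition with CUDA events instead of calling vector_add, so the numbers are only indicative:

import torch

if torch.cuda.is_available():
    # Copy the data to the device once, outside the measured section.
    x = torch.ones(2**25, dtype=torch.float32, device="cuda")
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    torch.cuda.synchronize()
    start.record()
    y = x + x
    end.record()
    # elapsed_time is only valid once both events have completed.
    torch.cuda.synchronize()
    print(f"device-only addition: {start.elapsed_time(end):.3f} ms")

The elapsed time should come out far smaller than the per-call time reported in the table above for the same size, the difference being mostly transfer cost.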

Total running time of the script: (0 minutes 12.192 seconds)