Evaluating random access for sparse tensors

When computing the prediction of a tree on a sparse tensor, is it faster to densify the tensor first and then compute the prediction, or to keep the sparse representation and look the required features up? The parameter nrnd can be seen as the depth of a tree.
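
To make the two strategies concrete, here is a minimal sketch in NumPy/SciPy (an illustration only, assuming scipy is installed; it is not the compiled kernel benchmarked below). It reads a handful of features from one row, once by densifying the row first, once by binary search in its CSR representation.

import numpy as np
from scipy.sparse import csr_matrix

row = np.zeros(100_000, dtype=np.float32)
row[np.random.randint(0, row.size, 100)] = 1.0  # roughly 99.9% sparse
sp_row = csr_matrix(row)
features = np.random.randint(0, row.size, 10)  # features a tree of depth 10 may read


def sparse_lookup(sp, col):
    # binary search in the sorted column indices of the single CSR row
    pos = np.searchsorted(sp.indices, col)
    if pos < len(sp.indices) and sp.indices[pos] == col:
        return sp.data[pos]
    return 0.0


# strategy 1: densify first, then index directly
dense_values = sp_row.toarray().ravel()[features]
# strategy 2: stay sparse and look every feature up
sparse_values = np.array([sparse_lookup(sp_row, f) for f in features], dtype=np.float32)
assert np.allclose(dense_values, sparse_values)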

import itertools
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
from pandas import DataFrame
from onnx_extended.ext_test_case import unit_test_going
from onnx_extended.args import get_parsed_args
from onnx_extended.validation.cpu._validation import evaluate_sparse


expose = "repeat,warmup,nrows,ncols,sparsity,nrnd,ntimes"
script_args = get_parsed_args(
    "plot_bench_sparse_access",
    description=__doc__,
    nrows=(10 if unit_test_going() else 100, "number of rows"),
    ncols=(10 if unit_test_going() else 100000, "number of columns"),
    ntimes=(
        "1" if unit_test_going() else "2,4,8",
        "number of times to do nrnd random accesses per row",
    ),
    sparsity=(
        "0.1,0.2" if unit_test_going() else "0.75,0.8,0.9,0.95,0.99,0.999,0.9999",
        "sparsities to try",
    ),
    repeat=2 if unit_test_going() else 5,
    warmup=2 if unit_test_going() else 2,
    nrnd=(10, "number of random features to access"),
    expose=expose,
)

for att in sorted(expose.split(",")):
    print(f"{att}={getattr(script_args, att)}")
ncols=100000
nrnd=10
nrows=100
ntimes=2,4,8
repeat=5
sparsity=0.75,0.8,0.9,0.95,0.99,0.999,0.9999
warmup=2

Sparse tensor

def make_sparse_random_tensor(n_rows: int, n_cols: int, sparsity: float):
    # random dense values, then zero out a fraction `sparsity` of them
    t = np.random.rand(n_rows, n_cols).astype(np.float32)
    m = np.random.rand(n_rows, n_cols).astype(np.float32)
    t[m <= sparsity] = 0
    return t
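
As a quick sanity check (an aside, not part of the benchmark), the fraction of zeros in the generated tensor should be close to the requested sparsity, since the mask m is uniform on [0, 1).

check = make_sparse_random_tensor(100, 1000, 0.9)
print(f"requested sparsity: 0.9, measured: {(check == 0).mean():.3f}")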


sparsity = list(map(float, script_args.sparsity.split(",")))
ntimes = list(map(int, script_args.ntimes.split(",")))
t = make_sparse_random_tensor(script_args.nrows, script_args.ncols, sparsity[0])
ev = evaluate_sparse(t, script_args.nrnd, ntimes[0], script_args.repeat, 3)
print(f"dense:  initialization:{ev[0][0]:1.3g}")
print(f"                access:{ev[0][1]:1.3g}")
print(f"sparse: initialization:{ev[1][0]:1.3g}")
print(f"                access:{ev[1][1]:1.3g}")
print(f"Ratio sparse/dense: {ev[1][1] / ev[0][1]}")
dense:  initialization:0.00732
                access:2.81e-05
sparse: initialization:0.0102
                access:0.00043
Ratio sparse/dense: 15.290184921763867

If the ratio is greater than 1, the sparse representation is slower.
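
The comparison can be reproduced approximately in pure Python (a rough sketch relying on scipy, not the compiled evaluate_sparse kernel; absolute timings are far higher but the trend is the same): densifying pays a large initialization cost and then accesses elements directly, while the CSR path skips the conversion but pays a lookup for every access.

from timeit import timeit
from scipy.sparse import csr_matrix

sp_t = csr_matrix(t)
rows = np.arange(t.shape[0])
cols = np.random.randint(0, t.shape[1], size=(t.shape[0], script_args.nrnd))


def dense_access():
    dense = sp_t.toarray()  # initialization: densification
    return dense[rows[:, None], cols]  # then direct random access


def sparse_access():
    # no densification, but every access searches the CSR structure
    return [[sp_t[i, j] for j in cols[i]] for i in rows]


print("dense :", timeit(dense_access, number=5))
print("sparse:", timeit(sparse_access, number=5))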

Trying several sparsities

tries = list(itertools.product(ntimes, sparsity))

data = []
for nt, sp in tqdm(tries):
    t = make_sparse_random_tensor(script_args.nrows, script_args.ncols, sp)
    ev = evaluate_sparse(t, script_args.nrnd, nt, script_args.repeat, 3)
    obs = dict(
        dense0=ev[0][0],
        dense1=ev[0][1],
        dense=ev[0][0] + ev[0][1],
        sparse0=ev[1][0],
        sparse1=ev[1][1],
        sparse=ev[1][0] + ev[1][1],
        sparsity=sp,
        rows=t.shape[0],
        cols=t.shape[1],
        repeat=script_args.repeat,
        random=script_args.nrnd,
        ntimes=nt,
    )
    data.append(obs)

df = DataFrame(data)
print(df)
100%|██████████| 21/21 [00:05<00:00,  4.18it/s]
      dense0    dense1     dense   sparse0   sparse1    sparse  sparsity  rows    cols  repeat  random  ntimes
0   0.008539  0.000033  0.008571  0.010443  0.000415  0.010858    0.7500   100  100000       5      10       2
1   0.007293  0.000028  0.007321  0.007318  0.000290  0.007608    0.8000   100  100000       5      10       2
2   0.006298  0.000031  0.006329  0.003685  0.000215  0.003900    0.9000   100  100000       5      10       2
3   0.006052  0.000036  0.006087  0.001938  0.000167  0.002105    0.9500   100  100000       5      10       2
4   0.003819  0.000039  0.003859  0.000462  0.000132  0.000594    0.9900   100  100000       5      10       2
5   0.002728  0.000025  0.002753  0.000040  0.000080  0.000120    0.9990   100  100000       5      10       2
6   0.002728  0.000029  0.002757  0.000007  0.000035  0.000042    0.9999   100  100000       5      10       2
7   0.006708  0.000045  0.006753  0.008967  0.000585  0.009552    0.7500   100  100000       5      10       4
8   0.007725  0.000078  0.007803  0.008212  0.000670  0.008883    0.8000   100  100000       5      10       4
9   0.005805  0.000050  0.005856  0.003578  0.000352  0.003930    0.9000   100  100000       5      10       4
10  0.005253  0.000049  0.005302  0.001743  0.000286  0.002029    0.9500   100  100000       5      10       4
11  0.006534  0.000093  0.006627  0.001228  0.000478  0.001706    0.9900   100  100000       5      10       4
12  0.002450  0.000047  0.002497  0.000038  0.000131  0.000168    0.9990   100  100000       5      10       4
13  0.002333  0.000047  0.002380  0.000007  0.000074  0.000080    0.9999   100  100000       5      10       4
14  0.006716  0.000081  0.006797  0.009080  0.001011  0.010091    0.7500   100  100000       5      10       8
15  0.006299  0.000082  0.006381  0.007108  0.000899  0.008007    0.8000   100  100000       5      10       8
16  0.005854  0.000084  0.005937  0.003494  0.000641  0.004135    0.9000   100  100000       5      10       8
17  0.005178  0.000082  0.005260  0.001745  0.000532  0.002277    0.9500   100  100000       5      10       8
18  0.003623  0.000107  0.003730  0.000385  0.000457  0.000842    0.9900   100  100000       5      10       8
19  0.002295  0.000084  0.002380  0.000037  0.000270  0.000308    0.9990   100  100000       5      10       8
20  0.002347  0.000083  0.002431  0.000007  0.000144  0.000150    0.9999   100  100000       5      10       8
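
From this table one can extract, for each value of ntimes, the first sparsity level at which the sparse representation becomes faster, both for the total time and for the access time alone (a small helper reusing the df computed above):

for nt in sorted(set(df.ntimes)):
    sub = df[df.ntimes == nt].sort_values("sparsity")
    win_total = sub[sub.sparse < sub.dense].sparsity.min()  # NaN if sparse never wins
    win_access = sub[sub.sparse1 < sub.dense1].sparsity.min()
    print(f"ntimes={nt}: total wins from sparsity={win_total}, access wins from sparsity={win_access}")

On this run, the access time alone always favours the dense representation, but the much cheaper initialization of the sparse path makes its total time lower from a sparsity of 0.9 onwards.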

Plots

nts = list(sorted(set(df.ntimes)))

fig, ax = plt.subplots(len(nts), 2, figsize=(3 * len(nts), 10))
for i, nt in enumerate(nts):
    sub = df[df.ntimes == nt]
    sub[["sparsity", "dense", "sparse"]].set_index("sparsity").plot(
        title=f"Dense vs Sparsity, ntimes={nt}",
        logy=True,
        ax=ax[0] if len(ax.shape) == 1 else ax[i, 0],
    )
    sub[["sparsity", "dense1", "sparse1"]].set_index("sparsity").plot(
        title="Dense vs Sparsity (access only)",
        logy=True,
        ax=ax[1] if len(ax.shape) == 1 else ax[i, 1],
    )
fig.tight_layout()
fig.savefig("plot_bench_sparse_access.png")
[figure plot_bench_sparse_access.png: "Dense vs Sparsity" and "Dense vs Sparsity (access only)", one row of plots per value of ntimes]

Total running time of the script: (0 minutes 6.679 seconds)

Gallery generated by Sphinx-Gallery