Evaluating random access for sparse#

Whenever computing the prediction of a tree with a sparse tensor, is it faster to density first and then to compute the prediction or to keep the tensor in its sparse representation and do look up? The parameter nrnd can be seen as the depth of a tree.

import itertools
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
from pandas import DataFrame
from onnx_extended.ext_test_case import unit_test_going
from onnx_extended.args import get_parsed_args
from onnx_extended.validation.cpu._validation import evaluate_sparse


expose = "repeat,warmup,nrows,ncols,sparsity,nrnd,ntimes"
script_args = get_parsed_args(
    "plot_bench_sparse_access",
    description=__doc__,
    nrows=(10 if unit_test_going() else 100, "number of rows"),
    ncols=(10 if unit_test_going() else 100000, "number of columns"),
    ntimes=(
        "1" if unit_test_going() else "2,4,8",
        "number of times to do nrnd random accesses per row",
    ),
    sparsity=(
        "0.1,0.2" if unit_test_going() else "0.75,0.8,0.9,0.95,0.99,0.999,0.9999",
        "sparsities to try",
    ),
    repeat=2 if unit_test_going() else 5,
    warmup=2 if unit_test_going() else 2,
    nrnd=(10, "number of random features to access"),
    expose=expose,
)

for att in sorted(expose.split(",")):
    print(f"{att}={getattr(script_args, att)}")

ncols=100000
nrnd=10
nrows=100
ntimes=2,4,8
repeat=5
sparsity=0.75,0.8,0.9,0.95,0.99,0.999,0.9999
warmup=2

Sparse tensor#

def make_sparse_random_tensor(n_rows: int, n_cols: int, sparsity: float):
    t = np.random.rand(n_rows, n_cols).astype(np.float32)
    m = np.random.rand(n_rows, n_cols).astype(np.float32)
    t[m <= sparsity] = 0
    return t


sparsity = list(map(float, script_args.sparsity.split(",")))
ntimes = list(map(int, script_args.ntimes.split(",")))
t = make_sparse_random_tensor(script_args.nrows, script_args.ncols, sparsity[0])
ev = evaluate_sparse(t, script_args.nrnd, ntimes[0], script_args.repeat, 3)
print(f"dense:  initialization:{ev[0][0]:1.3g}")
print(f"                access:{ev[0][1]:1.3g}")
print(f"sparse: initialization:{ev[1][0]:1.3g}")
print(f"                access:{ev[1][1]:1.3g}")
print(f"Ratio sparse/dense: {ev[1][1] / ev[0][1]}")

dense:  initialization:0.0115
                access:7.86e-05
sparse: initialization:0.0126
                access:0.000579
Ratio sparse/dense: 7.363613231552162

If > 1, sparse is slower.

Try sparsity#

tries = list(itertools.product(ntimes, sparsity))

data = []
for nt, sp in tqdm(tries):
    t = make_sparse_random_tensor(script_args.nrows, script_args.ncols, sp)
    ev = evaluate_sparse(t, script_args.nrnd, nt, script_args.repeat, 3)
    obs = dict(
        dense0=ev[0][0],
        dense1=ev[0][1],
        dense=ev[0][0] + ev[0][1],
        sparse0=ev[1][0],
        sparse1=ev[1][1],
        sparse=ev[1][0] + ev[1][1],
        sparsity=sp,
        rows=t.shape[0],
        cols=t.shape[1],
        repeat=script_args.repeat,
        random=script_args.nrnd,
        ntimes=nt,
    )
    data.append(obs)

df = DataFrame(data)
print(df)

  0%|          | 0/21 [00:00<?, ?it/s]
  5%|▍         | 1/21 [00:00<00:10,  1.97it/s]
 10%|▉         | 2/21 [00:00<00:08,  2.29it/s]
 14%|█▍        | 3/21 [00:01<00:06,  2.70it/s]
 19%|█▉        | 4/21 [00:01<00:05,  3.07it/s]
 24%|██▍       | 5/21 [00:01<00:04,  3.36it/s]
 29%|██▊       | 6/21 [00:01<00:04,  3.74it/s]
 33%|███▎      | 7/21 [00:02<00:03,  4.02it/s]
 38%|███▊      | 8/21 [00:02<00:03,  3.60it/s]
 43%|████▎     | 9/21 [00:02<00:03,  3.44it/s]
 48%|████▊     | 10/21 [00:03<00:03,  3.53it/s]
 52%|█████▏    | 11/21 [00:03<00:02,  3.73it/s]
 57%|█████▋    | 12/21 [00:03<00:02,  4.03it/s]
 62%|██████▏   | 13/21 [00:03<00:01,  4.27it/s]
 67%|██████▋   | 14/21 [00:03<00:01,  4.51it/s]
 71%|███████▏  | 15/21 [00:04<00:01,  3.92it/s]
 76%|███████▌  | 16/21 [00:04<00:01,  3.67it/s]
 81%|████████  | 17/21 [00:04<00:01,  3.73it/s]
 86%|████████▌ | 18/21 [00:05<00:00,  3.88it/s]
 90%|█████████ | 19/21 [00:05<00:00,  4.01it/s]
 95%|█████████▌| 20/21 [00:05<00:00,  4.15it/s]
100%|██████████| 21/21 [00:05<00:00,  4.29it/s]
100%|██████████| 21/21 [00:05<00:00,  3.70it/s]
      dense0    dense1     dense   sparse0   sparse1    sparse  sparsity  rows    cols  repeat  random  ntimes
0   0.007894  0.000033  0.007926  0.011227  0.000422  0.011650    0.7500   100  100000       5      10       2
1   0.007839  0.000030  0.007869  0.008657  0.000381  0.009038    0.8000   100  100000       5      10       2
2   0.006501  0.000031  0.006532  0.004088  0.000245  0.004333    0.9000   100  100000       5      10       2
3   0.005541  0.000037  0.005578  0.002129  0.000188  0.002317    0.9500   100  100000       5      10       2
4   0.003747  0.000032  0.003779  0.000391  0.000125  0.000516    0.9900   100  100000       5      10       2
5   0.002479  0.000027  0.002506  0.000043  0.000076  0.000119    0.9990   100  100000       5      10       2
6   0.002905  0.000027  0.002932  0.000007  0.000041  0.000048    0.9999   100  100000       5      10       2
7   0.006894  0.000052  0.006946  0.009924  0.000656  0.010580    0.7500   100  100000       5      10       4
8   0.007062  0.000050  0.007112  0.007804  0.000582  0.008386    0.8000   100  100000       5      10       4
9   0.006250  0.000052  0.006302  0.003893  0.000447  0.004340    0.9000   100  100000       5      10       4
10  0.005859  0.000052  0.005911  0.001995  0.000322  0.002316    0.9500   100  100000       5      10       4
11  0.003394  0.000052  0.003446  0.000397  0.000237  0.000634    0.9900   100  100000       5      10       4
12  0.002858  0.000056  0.002914  0.000051  0.000141  0.000193    0.9990   100  100000       5      10       4
13  0.002318  0.000047  0.002365  0.000008  0.000075  0.000083    0.9999   100  100000       5      10       4
14  0.007497  0.000094  0.007591  0.010031  0.001132  0.011164    0.7500   100  100000       5      10       8
15  0.006721  0.000091  0.006811  0.007839  0.001005  0.008845    0.8000   100  100000       5      10       8
16  0.006311  0.000092  0.006403  0.003879  0.000708  0.004587    0.9000   100  100000       5      10       8
17  0.005430  0.000105  0.005535  0.001929  0.000617  0.002546    0.9500   100  100000       5      10       8
18  0.004344  0.000117  0.004462  0.000489  0.000527  0.001016    0.9900   100  100000       5      10       8
19  0.002696  0.000096  0.002792  0.000045  0.000336  0.000381    0.9990   100  100000       5      10       8
20  0.002998  0.000100  0.003098  0.000008  0.000169  0.000177    0.9999   100  100000       5      10       8

Plots

nts = list(sorted(set(df.ntimes)))

fig, ax = plt.subplots(len(nts), 2, figsize=(3 * len(nts), 10))
for i, nt in enumerate(nts):
    sub = df[df.ntimes == nt]
    sub[["sparsity", "dense", "sparse"]].set_index("sparsity").plot(
        title=f"Dense vs Sparsity, ntimes={nt}",
        logy=True,
        ax=ax[0] if len(ax.shape) == 1 else ax[i, 0],
    )
    sub[["sparsity", "dense1", "sparse1"]].set_index("sparsity").plot(
        title="Dense vs Sparsity (access only)",
        logy=True,
        ax=ax[1] if len(ax.shape) == 1 else ax[i, 0],
    )
fig.tight_layout()
fig.savefig("plot_bench_sparse_access.png")

Dense vs Sparsity (access only), Dense vs Sparsity (access only), Dense vs Sparsity (access only)

Total running time of the script: (0 minutes 8.013 seconds)

Gallery generated by Sphinx-Gallery