Measuring performance of TfIdfVectorizer¶
The benchmark measures the performance of a TfIdfVectorizer along two parameters: the vocabulary size and the batch size. It measures the benefit of using a sparse implementation through the computation time and the memory peak.
A simple model¶
We start with a model containing a single TfIdfVectorizer node. It only counts unigrams. The model processes sequences of 10 integers, so the sparsity of the results is 10 divided by the vocabulary size.
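For instance (a quick illustrative computation, not part of the original benchmark), the expected output density for the vocabulary sizes used below is:

# At most 10 distinct integers per input row, hence at most 10 non-zero counts.
for voc_size in [100, 1000, 5000, 10000]:
    print(f"voc_size={voc_size:>6} -> density={10 / voc_size:.2%}")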
import gc
import time
import itertools
from typing import Tuple
import numpy as np
import pandas
from onnx import ModelProto
from onnx.helper import make_attribute
from tqdm import tqdm
from onnxruntime import InferenceSession, SessionOptions
from onnx_extended.ext_test_case import measure_time, unit_test_going
from onnx_extended.memory_peak import start_spying_on
from onnx_extended.reference import CReferenceEvaluator
from onnx_extended.ortops.optim.cpu import get_ort_ext_libs
from onnx_extended.plotting.benchmark import vhistograms
def make_onnx(n_words: int) -> ModelProto:
from skl2onnx.common.data_types import Int64TensorType, FloatTensorType
from skl2onnx.algebra.onnx_ops import OnnxTfIdfVectorizer
# from onnx_array_api.light_api import start
# onx = (
# start(opset=19, opsets={"ai.onnx.ml": 3})
# .vin("X", elem_type=TensorProto.INT64)
# .ai.onnx.TfIdfVectorizer(
# ...
# )
# .rename(Y)
# .vout(elem_type=TensorProto.FLOAT)
# .to_onnx()
# )
onx = OnnxTfIdfVectorizer(
"X",
mode="TF",
min_gram_length=1,
max_gram_length=1,
max_skip_count=0,
ngram_counts=[0],
ngram_indexes=np.arange(n_words).tolist(),
pool_int64s=np.arange(n_words).tolist(),
output_names=["Y"],
).to_onnx(inputs=[("X", Int64TensorType())], outputs=[("Y", FloatTensorType())])
return onx
onx = make_onnx(7)
ref = CReferenceEvaluator(onx)
got = ref.run(None, {"X": np.array([[0, 1], [2, 3]], dtype=np.int64)})
print(got)
[array([[1., 1., 0., 0., 0., 0., 0.],
[0., 0., 1., 1., 0., 0., 0.]], dtype=float32)]
It works as expected. Let’s now compare the execution with onnxruntime for different batch sizes and vocabulary sizes.
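As a quick cross-check (a small sketch, assuming onnxruntime is available in the environment), the same toy input can be fed to an InferenceSession and compared with the reference output:

# Cross-check: onnxruntime should return the same dense counts as CReferenceEvaluator.
sess = InferenceSession(onx.SerializeToString(), providers=["CPUExecutionProvider"])
got_ort = sess.run(None, {"X": np.array([[0, 1], [2, 3]], dtype=np.int64)})
np.testing.assert_allclose(got[0], got_ort[0])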
Benchmark¶
def make_sessions(
onx: ModelProto,
) -> Tuple[InferenceSession, InferenceSession, InferenceSession]:
# first: onnxruntime
ref = InferenceSession(onx.SerializeToString(), providers=["CPUExecutionProvider"])
# second: custom kernel equivalent to the onnxruntime implementation
for node in onx.graph.node:
if node.op_type == "TfIdfVectorizer":
node.domain = "onnx_extended.ortops.optim.cpu"
# new_add = make_attribute("sparse", 1)
# node.attribute.append(new_add)
d = onx.opset_import.add()
d.domain = "onnx_extended.ortops.optim.cpu"
d.version = 1
r = get_ort_ext_libs()
opts = SessionOptions()
opts.register_custom_ops_library(r[0])
cus = InferenceSession(
onx.SerializeToString(), opts, providers=["CPUExecutionProvider"]
)
# third: with sparse
for node in onx.graph.node:
if node.op_type == "TfIdfVectorizer":
new_add = make_attribute("sparse", 1)
node.attribute.append(new_add)
cussp = InferenceSession(
onx.SerializeToString(), opts, providers=["CPUExecutionProvider"]
)
return ref, cus, cussp
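Before benchmarking, it can be useful to verify that the three sessions agree on a small input. The sketch below is illustrative only and assumes the custom and sparse kernels return the same dense tensor as onnxruntime:

# Illustrative sanity check: all three kernels should agree on a small input.
onx_check = make_onnx(100)
ref_s, cus_s, sparse_s = make_sessions(onx_check)
x_check = (np.arange(50) % 100).reshape((5, -1)).astype(np.int64)
expected = ref_s.run(None, {"X": x_check})[0]
np.testing.assert_allclose(expected, cus_s.run(None, {"X": x_check})[0])
# Assumption: the sparse kernel also outputs a dense tensor with the same values.
np.testing.assert_allclose(expected, sparse_s.run(None, {"X": x_check})[0])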
if unit_test_going():
vocabulary_sizes = [10, 20]
batch_sizes = [5, 10]
else:
vocabulary_sizes = [100, 1000, 5000, 10000]
batch_sizes = [1, 10, 500, 1000, 2000]
confs = list(itertools.product(vocabulary_sizes, batch_sizes))
data = []
for voc_size, batch_size in tqdm(confs):
onx = make_onnx(voc_size)
ref, cus, sparse = make_sessions(onx)
gc.collect()
feeds = dict(
X=(np.arange(batch_size * 10) % voc_size)
.reshape((batch_size, -1))
.astype(np.int64)
)
# sparse
p = start_spying_on(delay=0.0001)
sparse.run(None, feeds)
obs = measure_time(
lambda sparse=sparse, feeds=feeds: sparse.run(None, feeds), max_time=1
)
mem = p.stop()
obs["peak"] = mem["cpu"].max_peak - mem["cpu"].begin
obs["name"] = "sparse"
obs.update(dict(voc_size=voc_size, batch_size=batch_size))
data.append(obs)
time.sleep(0.1)
# reference
p = start_spying_on(delay=0.0001)
ref.run(None, feeds)
obs = measure_time(lambda ref=ref, feeds=feeds: ref.run(None, feeds), max_time=1)
mem = p.stop()
obs["peak"] = mem["cpu"].max_peak - mem["cpu"].begin
obs["name"] = "ref"
obs.update(dict(voc_size=voc_size, batch_size=batch_size))
data.append(obs)
time.sleep(0.1)
# custom
p = start_spying_on(delay=0.0001)
cus.run(None, feeds)
obs = measure_time(lambda cus=cus, feeds=feeds: cus.run(None, feeds), max_time=1)
mem = p.stop()
obs["peak"] = mem["cpu"].max_peak - mem["cpu"].begin
obs["name"] = "custom"
obs.update(dict(voc_size=voc_size, batch_size=batch_size))
data.append(obs)
time.sleep(0.1)
del sparse
del cus
del ref
del feeds
df = pandas.DataFrame(data)
df["time"] = df["average"]
df.to_csv("plot_op_tfidfvectorizer_sparse.csv", index=False)
print(df.head())
100%|██████████| 20/20 [01:17<00:00, 3.88s/it]
average deviation min_exec max_exec repeat number ttime context_size warmup_time peak name voc_size batch_size time
0 0.000009 1.925226e-06 0.000007 0.000155 1 130268.0 1.112059 64 0.000296 0 sparse 100 1 0.000009
1 0.000005 1.922921e-07 0.000005 0.000040 1 234006.0 1.235137 64 0.000109 0 ref 100 1 0.000005
2 0.000006 4.807136e-07 0.000005 0.000128 1 193376.0 1.086917 64 0.000105 0 custom 100 1 0.000006
3 0.000019 1.178412e-05 0.000012 0.000418 1 52922.0 1.030980 64 0.000253 0 sparse 100 10 0.000019
4 0.000015 2.246243e-06 0.000013 0.000032 1 72425.0 1.116466 64 0.000369 0 ref 100 10 0.000015
Processing time¶
piv = pandas.pivot_table(
df, index=["voc_size", "name"], columns="batch_size", values="average"
)
print(piv)
batch_size 1 10 500 1000 2000
voc_size name
100 custom 0.000006 0.000007 0.002576 0.004193 0.000110
ref 0.000005 0.000015 0.000032 0.000049 0.000079
sparse 0.000009 0.000019 0.020261 0.010807 0.015054
1000 custom 0.000006 0.000007 0.003343 0.004693 0.004949
ref 0.000006 0.000016 0.000086 0.000158 0.000293
sparse 0.000007 0.000017 0.013015 0.021382 0.008598
5000 custom 0.000006 0.000010 0.004730 0.004188 0.004321
ref 0.000008 0.000024 0.000313 0.000602 0.001269
sparse 0.000010 0.000017 0.018696 0.014958 0.013342
10000 custom 0.000006 0.000015 0.000997 0.003041 0.004751
ref 0.000011 0.000028 0.000568 0.001238 0.002511
sparse 0.000010 0.000015 0.005802 0.003670 0.005919
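The same averages can be expressed relative to the onnxruntime baseline to make the comparison easier to read (a small post-processing sketch, not part of the original script):

# Ratios above 1 mean the implementation is slower than onnxruntime's kernel ("ref").
ratio = pandas.pivot_table(
    df, index=["voc_size", "batch_size"], columns="name", values="average"
)
ratio["custom/ref"] = ratio["custom"] / ratio["ref"]
ratio["sparse/ref"] = ratio["sparse"] / ratio["ref"]
print(ratio[["custom/ref", "sparse/ref"]])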
Memory peak¶
Memory peak is always difficult to estimate. A second process is started to measure the physical memory peak during the execution every millisecond. The reported figure is the difference between this peak and the memory level when the measurement began.
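As a standalone illustration of how the measurement works (a minimal sketch using the same API as the benchmark loop; the reported peak may be zero if the allocation is too short-lived for the sampling delay):

# Illustration only: spy on the peak caused by a temporary allocation.
p = start_spying_on(delay=0.0001)
tmp = np.full((1024, 1024), 1.0)  # roughly 8 MiB
mem = p.stop()
print("peak in MiB:", (mem["cpu"].max_peak - mem["cpu"].begin) / 2**20)
del tmp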
piv = pandas.pivot_table(
df, index=["voc_size", "name"], columns="batch_size", values="peak"
)
print(piv / 2**20)
batch_size 1 10 500 1000 2000
voc_size name
100 custom 0.00000 0.00000 0.00000 0.00000 0.00000
ref 0.00000 0.00000 0.00000 0.00000 0.00000
sparse 0.00000 0.00000 0.15625 0.15625 0.15625
1000 custom 0.00000 0.00000 0.00000 0.62500 7.18750
ref 0.00000 0.00000 0.00000 0.00000 3.90625
sparse 0.00000 0.00000 0.00000 0.00000 0.00000
5000 custom 0.00000 0.00000 5.78125 19.06250 37.96875
ref 0.00000 0.00000 0.00000 19.21875 37.81250
sparse 0.00000 0.00000 0.15625 0.15625 0.00000
10000 custom 0.00000 0.00000 19.37500 38.28125 76.40625
ref 0.00000 0.00000 18.90625 38.28125 75.93750
sparse 0.15625 0.15625 0.00000 0.15625 0.00000
Graphs¶
ax = vhistograms(df)
fig = ax[0, 0].get_figure()
fig.savefig("plot_op_tfidfvectorizer_sparse.png")
[figure: plot_op_tfidfvectorizer_sparse.png]
Take away¶
The sparse implementation works better when both the sparsity and the batch size are large enough.
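To quantify the memory side of that statement, the dense and sparse peaks can be compared directly (an illustrative sketch based on the table above):

# Memory saved by the sparse kernel compared to the dense custom kernel, in MiB.
peaks = pandas.pivot_table(
    df, index=["voc_size", "name"], columns="batch_size", values="peak"
)
saved = peaks.xs("custom", level="name") - peaks.xs("sparse", level="name")
print(saved / 2**20)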
Total running time of the script: (1 minute 23.417 seconds)