Quantization#

Quantization consists of discretizing the parameters of a neural network in order to reduce memory usage and computation time, at the cost of a loss of accuracy. How should the quantization parameters be estimated to minimize that loss?
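
As a minimal illustration (the weights, the scale and the zero point below are made up, using the same linear scheme as the QuantizeLinear and DequantizeLinear operators used later): each weight is replaced by a small integer, and only an approximation of the original value can be recovered.

[ ]:
import numpy

# hypothetical weights and a hypothetical scale, only to illustrate the principle
w = numpy.array([-0.031, 0.002, 0.017], dtype=numpy.float32)
scale, zero_point = 0.0003, 0

q = numpy.clip(numpy.rint(w / scale) + zero_point, -128, 127)  # values stored on 8 bits
w_approx = (q - zero_point) * scale                            # dequantized values
print(q)
print(numpy.abs(w - w_approx))  # the loss introduced by quantization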

A coefficient matrix#

The coefficients come from a MobileNet deep learning model.

[15]:
import os
import urllib
import urllib.request

url = "https://github.com/onnx/models/raw/refs/heads/main/validated/vision/classification/mobilenet/model/mobilenetv2-12.onnx"
destination = "mobilenetv2-12.onnx"

if not os.path.exists(destination) or os.stat(destination).st_size < 10000:
    print(f"download {destination!r}")
    g = urllib.request.urlopen(url)
    with open(destination, "wb") as f:
        f.write(g.read())
    print("done")
else:
    print(f"already downloaded {destination!r}")
print(f"model size {os.stat(destination).st_size} bytes")
already downloaded 'mobilenetv2-12.onnx'
model size 13964571 bytes
[16]:
from onnx import load

with open(destination, "rb") as f:
    onx = load(f)
    print(f"model size: {len(onx.SerializeToString())}")
model size: 13964571

We pick one of the largest coefficient matrices.

[17]:
initializers = []
for init in onx.graph.initializer:
    initializers.append((len(init.SerializeToString()), init.name, init))

initializers.sort()

for init in initializers[-5:]:
    print(init[:2])
(614421, '616')
(614421, '619')
(1228821, '625')
(1638421, '628')
(5120034, 'classifier.1.weight')
[18]:
from onnx.numpy_helper import to_array

coef = to_array(initializers[-4][-1])
coef.shape, coef.dtype
[18]:
((960, 160, 1, 1), dtype('float32'))

Distributions#

[19]:
import matplotlib.pyplot as plt

cf = coef.ravel()
cf01 = cf[(cf >= 0) & (cf <= 0.001)]

fig, axs = plt.subplots(1, 2, figsize=(10, 4))
axs[0].hist(cf, bins=2048)
axs[0].set_title(
    f"Distribution of the coefficients\nof one coefficient matrix\n{cf.size} elements"
)
axs[1].hist(cf01, bins=2048)
title = f"Same distribution between 0 and {cf01.max():.4f}\n{cf01.size} elements"
axs[1].set_title(title);
../../_images/notebooks_dsgarden_quantization_f8_8_0.png

And now the distribution of the float 8 values.

[20]:
import numpy
from onnx.numpy_helper import float8e4m3_to_float32

float8 = [float8e4m3_to_float32(i) for i in range(256)]
no_nan8 = [f for f in float8 if not numpy.isnan(f)]
len(no_nan8)
[20]:
254
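
The float 8 E4M3FN format has 256 bit patterns; two of them (one per sign bit) encode NaN and the format has no infinities, which leaves the 254 finite values counted above.
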
[21]:
p = 3
gauss = numpy.random.normal(size=len(no_nan8) * 20)
scale1 = numpy.std(no_nan8) / numpy.std(gauss)
scalep = numpy.std(no_nan8) / numpy.std(gauss**p)


fig, axs = plt.subplots(1, 2, figsize=(10, 4))
axs[0].hist(float8, bins=50, alpha=0.5, label="f8", density=True)
axs[0].hist(gauss * scale1, bins=50, alpha=0.5, label="N", density=True)
axs[0].hist(gauss**p * scalep, bins=50, alpha=0.5, label=f"N^{p}", density=True)
axs[0].set_xlim([-200, 200])
axs[0].set_title("Distribution des float 8")
axs[0].legend()

axs[1].hist(float8, bins=2000, alpha=0.5, label="f8", density=True)
axs[1].hist(gauss * scale1, bins=2000, alpha=0.5, label="N", density=True)
axs[1].hist(gauss**p * scalep, bins=2000, alpha=0.5, label=f"N^{p}", density=True)
axs[1].set_xlim([-50, 50])
axs[1].set_title("Même distribution avec plus de bins")
axs[1].legend();
../../_images/notebooks_dsgarden_quantization_f8_11_0.png

The coefficients look like they are distributed according to a Gaussian law. The float 8 values look a bit more like a Gaussian raised to the power 3. We use this observation to estimate the scale parameter \(\lambda\). However, the chosen normalization can only change the scale.
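
For the naive method, matching standard deviations gives the scale directly: if \(\sigma_{coef}\) is the standard deviation of the coefficients and \(\sigma_{f8}\) the standard deviation of the 254 finite float 8 values, the scale passed to QuantizeLinear is \(\lambda = \sigma_{coef} / \sigma_{f8}\), so that \(coef / \lambda\) has roughly the same spread as the float 8 grid. The power variant applies the same formula after replacing every coefficient by its signed cube root \(sign(w)\,|w|^{1/3}\), following the observation above.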

Estimating the scale#

We also want to compare with a classical quantization to 8-bit integers.

[22]:
from onnx import TensorProto


def estimation_quantization_scale(
    coef: numpy.array,
    to: int = TensorProto.FLOAT8E4M3FN,
    method: str = "naive",
    threshold: float = 0.99999,
) -> tuple[float, float]:
    """
    Estimates the scale parameter for the quantization to float 8 assuming
    the distribution of the coefficients is gaussian.
    """
    if to == TensorProto.FLOAT8E4M3FN:
        float8 = [float8e4m3_to_float32(i) for i in range(256)]
        quant_float = [f for f in float8 if not numpy.isnan(f)]
        if method == "naive":
            std_coef = numpy.std(coef.ravel())
            std_quant = numpy.std(numpy.array(quant_float, dtype=numpy.float32))
        elif method == "power":
            cr = coef.ravel()
            # signed cube root, written so as to avoid a division by zero
            std_coef = numpy.std(numpy.sign(cr) * numpy.abs(cr) ** (1.0 / 3.0))
            std_quant = numpy.std(numpy.array(quant_float, dtype=numpy.float32))
        else:
            raise ValueError(f"Unexpected quantization method {method!r}.")
        zero = 0.0
        scale = std_quant / std_coef
    elif to == TensorProto.UINT8:
        qu = numpy.quantile(coef.ravel(), [1 - threshold, threshold])
        scale = 255 / (qu[1] - qu[0])
        zero = qu[0] * scale
    else:
        raise ValueError(f"Unexpected quantization type for to={to}.")

    return 1.0 / scale, -zero


scale_f8, zero_f8 = estimation_quantization_scale(coef)
scale_f8, zero_f8
[22]:
(np.float32(0.00014669707), -0.0)
[23]:
scale_f8p, zero_f8p = estimation_quantization_scale(coef, method="power")
scale_f8p, zero_f8p
[23]:
(np.float32(0.002160199), -0.0)
[24]:
scale_u8, zero_u8 = estimation_quantization_scale(coef, to=TensorProto.UINT8)
scale_u8, zero_u8
[24]:
(np.float64(0.0007863875484906944), np.float64(123.14246096563787))

Verification with a plot.

[25]:
fig, axs = plt.subplots(1, 3, figsize=(12, 4))
axs[0].hist(coef.ravel() / scale_f8, bins=512, density=True, label="coef_f8", alpha=0.5)
axs[0].hist(no_nan8, bins=512, alpha=0.5, label="f8", density=True)
axs[0].legend()
axs[0].set_xlim([-200, 200])
axs[0].set_title("Distribution des coefficients à l'échelle\nfloat 8")

axs[1].hist(
    coef.ravel() / scale_f8p, bins=512, density=True, label="coef_f8", alpha=0.5
)
axs[1].hist(no_nan8, bins=512, alpha=0.5, label="f8", density=True)
axs[1].legend()
axs[1].set_xlim([-200, 200])
axs[1].set_title("Distribution des coefficients à l'échelle\nfloat 8 method='power'")

axs[2].hist(
    coef.ravel() / scale_u8 + zero_u8,
    bins=512,
    density=True,
    label="coef_u8",
    alpha=0.5,
)
axs[2].hist(list(range(256)), bins=100, alpha=0.5, label="u8", density=True)
axs[2].legend()
axs[2].set_title("Distribution des coefficients à l'échelle\nuint 8");
../../_images/notebooks_dsgarden_quantization_f8_18_0.png

Choosing the right parameters is not obvious.

QDQ#

We measure the loss introduced by a pair of QuantizeLinear + DequantizeLinear operations (QDQ): the coefficients are quantized, dequantized back to float 32, and compared to the original values through the Euclidean norm of the difference.

[26]:
from onnx.helper import (
    make_node,
    make_graph,
    make_model,
    make_tensor_value_info,
    make_operatorsetid,
    make_tensor,
)
from onnx.reference import ReferenceEvaluator

X = make_tensor_value_info("X", TensorProto.FLOAT, [None])
Scale = make_tensor_value_info("Scale", TensorProto.FLOAT, [1])
Y = make_tensor_value_info("Y", TensorProto.FLOAT, [None])

model_f8 = make_model(
    make_graph(
        [
            make_node(
                "Constant",
                [],
                ["Zero"],
                value=make_tensor("Zero", TensorProto.FLOAT8E4M3FN, [1], [0.0]),
            ),
            make_node("QuantizeLinear", ["X", "Scale", "Zero"], ["Q"], axis=0),
            make_node("DequantizeLinear", ["Q", "Scale"], ["Y"], axis=0),
        ],
        "quf8",
        [X, Scale],
        [Y],
    ),
    opset_imports=[make_operatorsetid("", 19)],
)

ref_f8 = ReferenceEvaluator(model_f8)
qu_f8 = ref_f8.run(
    None, {"X": coef.ravel(), "Scale": numpy.array([scale_f8], dtype=numpy.float32)}
)[0]
qu_f8
[26]:
array([-0.00821504,  0.00381412, -0.00010085, ...,  0.00880182,
       -0.02112438, -0.00410752], dtype=float32)
[27]:
qu_f8p = ref_f8.run(
    None, {"X": coef.ravel(), "Scale": numpy.array([scale_f8p], dtype=numpy.float32)}
)[0]
qu_f8p
[27]:
array([-0.0086408 ,  0.00378035, -0.00010126, ...,  0.0086408 ,
       -0.02160199, -0.0043204 ], dtype=float32)
[28]:
model_u8 = make_model(
    make_graph(
        [
            make_node(
                "Constant",
                [],
                ["Zero"],
                value=make_tensor("Zero", TensorProto.UINT8, [1], [int(zero_u8)]),
            ),
            make_node("QuantizeLinear", ["X", "Scale", "Zero"], ["Q"], axis=0),
            make_node("DequantizeLinear", ["Q", "Scale", "Zero"], ["Y"], axis=0),
        ],
        "quu8",
        [X, Scale],
        [Y],
    ),
    opset_imports=[make_operatorsetid("", 19)],
)

ref_u8 = ReferenceEvaluator(model_u8)
qu_u8 = ref_u8.run(
    None, {"X": coef.ravel(), "Scale": numpy.array([scale_u8], dtype=numpy.float32)}
)[0]
qu_u8
[28]:
array([-0.00865026,  0.00393194,  0.        , ...,  0.00865026,
       -0.02123246, -0.00393194], dtype=float32)
[29]:
err_f8 = ((coef.ravel() - qu_f8) ** 2).sum() ** 0.5
err_f8p = ((coef.ravel() - qu_f8p) ** 2).sum() ** 0.5
err_u8 = ((coef.ravel() - qu_u8) ** 2).sum() ** 0.5
err_f8, err_f8p, err_u8
[29]:
(np.float32(0.21044867), np.float32(0.15230674), np.float32(0.09929135))

Quantization with float 8 works less well than quantization with 8-bit integers.
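
One hedged way to quantify the difference: count how many distinct levels each scheme actually produces after the quantize/dequantize round trip, reusing the arrays computed above.

[ ]:
print("float 8        :", numpy.unique(qu_f8).size)
print("float 8 (power):", numpy.unique(qu_f8p).size)
print("uint 8         :", numpy.unique(qu_u8).size)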

[30]:
fig, axs = plt.subplots(2, 3, figsize=(12, 6))
for i, bins in [(0, 64), (1, 512)]:
    axs[i, 0].hist(coef.ravel(), bins=bins, density=True, label="coef", alpha=0.5)
    axs[i, 0].hist(qu_f8, bins=bins, alpha=0.5, label="qdq_f8", density=True)
    axs[i, 0].legend()
    axs[i, 0].set_title(f"QDQ float 8\nerr={err_f8:1.3g}")

    axs[i, 1].hist(coef.ravel(), bins=bins, density=True, label="coef", alpha=0.5)
    axs[i, 1].hist(qu_f8p, bins=bins, alpha=0.5, label="qdq_f8p", density=True)
    axs[i, 1].legend()
    axs[i, 1].set_title(f"QDQ float 8 (power)\nerr={err_f8p:1.3g}")

    axs[i, 2].hist(coef.ravel(), bins=bins, density=True, label="coef", alpha=0.5)
    axs[i, 2].hist(qu_u8, bins=bins, alpha=0.5, label="qdq_u8", density=True)
    axs[i, 2].legend()
    axs[i, 2].set_title(f"QDQ uint 8\nerr={err_u8:1.3g}")
../../_images/notebooks_dsgarden_quantization_f8_26_0.png

And with several scale values#

[31]:
from pandas import DataFrame
from tqdm import tqdm

a = 0.00014669707383747942
h = 0.00014669707383747942 * 2

data = []
for scale in tqdm([a + h * i for i in range(5)]):
    got = ref_f8.run(
        None, {"X": coef.ravel(), "Scale": numpy.array([scale], dtype=numpy.float32)}
    )[0]
    err = ((coef.ravel() - got) ** 2).sum() ** 0.5
    obs = dict(scale=scale, err=err, scale_f8p=scale_f8p, scale_f8=scale_f8)
    data.append(obs)

df = DataFrame(data)
df
100%|██████████| 5/5 [00:05<00:00,  1.12s/it]
[31]:
      scale       err  scale_f8p  scale_f8
0  0.000147  0.210449    0.00216  0.000147
1  0.000440  0.152721    0.00216  0.000147
2  0.000733  0.152714    0.00216  0.000147
3  0.001027  0.151731    0.00216  0.000147
4  0.001320  0.152831    0.00216  0.000147
[32]:
df.plot(x="scale", y="err", logy=True);
../../_images/notebooks_dsgarden_quantization_f8_29_0.png

Not any better.

Optimization#
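
A possible continuation, left as a sketch (not executed here, and assuming scipy is available): minimize the QDQ reconstruction error as a function of the scale with scipy.optimize.minimize_scalar, reusing ref_f8 and coef defined above. The bounds are arbitrary, and each evaluation runs the reference implementation, so it is slow.

[ ]:
from scipy.optimize import minimize_scalar


def qdq_error(scale: float) -> float:
    # quantize then dequantize with the given scale and return the reconstruction error
    got = ref_f8.run(
        None, {"X": coef.ravel(), "Scale": numpy.array([scale], dtype=numpy.float32)}
    )[0]
    return float(((coef.ravel() - got) ** 2).sum() ** 0.5)


res = minimize_scalar(
    qdq_error, bounds=(float(scale_f8), float(scale_f8) * 100), method="bounded"
)
res.x, res.fun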

[ ]: