yobx.reference.torch_evaluator#

yobx.reference.torch_evaluator.get_kernels() Dict[Tuple[str, str, int], type[OpRunKernel]][source]#

Retrieves all the available kernels the class TorchReferenceEvaluator can use. The full list is the following.

<<<

from yobx.reference.torch_evaluator import get_kernels

for k, v in sorted(get_kernels().items()):
    domain, name, version = k
    f = f"{name}({version})" if domain == "" else f"{name}[{domain}]({version})"
    add = " " * max(25 - len(f), 0)
    dd = " -- device dependent" if v.device_dependent() else ""
    print(f"{f}{add} -- {v.__name__}{dd}")

>>>

    Abs(1)                    -- Abs_1
    Add(1)                    -- Add_1
    And(1)                    -- And_1
    AveragePool(11)           -- AveragePool_11
    Cast(6)                   -- Cast_6
    CastLike(15)              -- CastLike_15
    Concat(1)                 -- Concat_1
    ConcatFromSequence(11)    -- ConcatFromSequence_11
    ConstantOfShape(9)        -- ConstantOfShape_9 -- device dependent
    Conv(11)                  -- Conv_11
    Cos(1)                    -- Cos_1
    Div(1)                    -- Div_1
    Equal(1)                  -- Equal_1
    Erf(9)                    -- Erf_9
    Exp(1)                    -- Exp_1
    Expand(8)                 -- Expand_8
    Gather(1)                 -- Gather_1
    Greater(1)                -- Greater_1
    GreaterOrEqual(1)         -- GreaterOrEqual_1
    Identity(1)               -- Identity_1
    If(1)                     -- If_1
    IsNaN(9)                  -- IsNaN_9
    LayerNormalization(17)    -- LayerNormalization_17
    Less(1)                   -- Less_1
    LessOrEqual(1)            -- LessOrEqual_1
    Log(1)                    -- Log_1
    Loop(16)                  -- Loop_16
    MatMul(1)                 -- MatMul_1
    Mul(1)                    -- Mul_1
    Neg(1)                    -- Neg_1
    NonZero(13)               -- NonZero_13
    Not(1)                    -- Not_1
    Or(1)                     -- Or_1
    Pow(12)                   -- Pow_12
    Range(11)                 -- Range_11 -- device dependent
    Reciprocal(1)             -- Reciprocal_1
    ReduceMax(18)             -- ReduceMax_18
    ReduceMean(18)            -- ReduceMean_18
    ReduceMin(17)             -- ReduceMin_17
    ReduceMin(18)             -- ReduceMin_18
    ReduceSum(13)             -- ReduceSum_13
    Reshape(14)               -- Reshape_14
    ScatterND(16)             -- ScatterND_16
    SequenceEmpty(11)         -- SequenceEmpty_11
    SequenceInsert(11)        -- SequenceInsert_11
    Shape(15)                 -- Shape_15
    Sigmoid(6)                -- Sigmoid_6
    Sin(1)                    -- Sin_1
    Slice(13)                 -- Slice_13
    Softmax(13)               -- Softmax_13
    Split(18)                 -- Split_18
    Sqrt(1)                   -- Sqrt_1
    Squeeze(13)               -- Squeeze_13
    Sub(1)                    -- Sub_1
    Tanh(6)                   -- Tanh_6
    Tile(6)                   -- Tile_6
    Transpose(1)              -- Transpose_1
    Trilu(14)                 -- Trilu_14
    Unsqueeze(13)             -- Unsqueeze_13
    Where(9)                  -- Where_9

TorchReferenceEvaluator#

class yobx.reference.torch_evaluator.TorchReferenceEvaluator(proto: FunctionProto | GraphProto | ModelProto | ExportArtifact, providers: Tuple[str, ...] = ('CPUExecutionProvider',), opsets: Dict[str, int] | None = None, local_functions: Dict[Tuple[str, str], TorchReferenceEvaluator] | None = None, verbose: int = 0, custom_kernels: Dict[Tuple[str, str], type[OpRunKernel]] | None = None)[source]#

Torch evaluator for onnx models. The model does not store the original proto it evaluates in order to avoid unnecessary memory usage and potential side effects from mutating a shared object.

Parameters:
  • proto – a proto

  • providers – where to run the model

  • opsets – needed if proto is a graph

  • local_functions – known local functions

  • verbose – verbosity level

  • custom_kernels – dictionary of kernels the user can define to override a specific implementation: ("", "LayerNormalization"): CustomKernel

The class holds the following attributes:

  • providers: providers

  • default_device: default torch device

  • constants: all initializers or constants

  • kernels: kernels

  • runtime_info: produced by first_used_last_used

  • last_used: contains the list of intermediate results to remove after every node execution; this prevents memory from growing too much

  • functions: local functions

The class is not multithreaded. runtime_info gets updated by the class. The list of available kernels is returned by function yobx.reference.torch_evaluator.get_kernels(). Example:

<<<

import onnx
import onnx.helper as oh
import torch
from yobx.helpers import string_type
from yobx.reference.torch_evaluator import TorchReferenceEvaluator

TFLOAT = onnx.TensorProto.FLOAT

proto = oh.make_model(
    oh.make_graph(
        [
            oh.make_node("Sigmoid", ["Y"], ["sy"]),
            oh.make_node("Mul", ["Y", "sy"], ["ysy"]),
            oh.make_node("Mul", ["X", "ysy"], ["final"]),
        ],
        "-nd-",
        [
            oh.make_tensor_value_info("X", TFLOAT, [1, "b", "c"]),
            oh.make_tensor_value_info("Y", TFLOAT, ["a", "b", "c"]),
        ],
        [oh.make_tensor_value_info("final", TFLOAT, ["a", "b", "c"])],
    ),
    opset_imports=[oh.make_opsetid("", 18)],
    ir_version=9,
)

sess = TorchReferenceEvaluator(proto)
feeds = dict(X=torch.rand((4, 5)), Y=torch.rand((4, 5)))
result = sess.run(None, feeds)
print(string_type(result, with_shape=True, with_min_max=True))

>>>

    #1[T1s4x5[0.0007596650393679738,0.660194456577301:A0.18426050830748864]]

With verbose=1, the class prints out every kernel run and every result deleted along the run. It shows when a result is not needed anymore. In that case, it is deleted to free the memory it takes.

<<<

import onnx
import onnx.helper as oh
import torch
from yobx.helpers import string_type
from yobx.reference.torch_evaluator import TorchReferenceEvaluator

TFLOAT = onnx.TensorProto.FLOAT

proto = oh.make_model(
    oh.make_graph(
        [
            oh.make_node("Sigmoid", ["Y"], ["sy"]),
            oh.make_node("Mul", ["Y", "sy"], ["ysy"]),
            oh.make_node("Mul", ["X", "ysy"], ["final"]),
        ],
        "-nd-",
        [
            oh.make_tensor_value_info("X", TFLOAT, [1, "b", "c"]),
            oh.make_tensor_value_info("Y", TFLOAT, ["a", "b", "c"]),
        ],
        [oh.make_tensor_value_info("final", TFLOAT, ["a", "b", "c"])],
    ),
    opset_imports=[oh.make_opsetid("", 18)],
    ir_version=9,
)

sess = TorchReferenceEvaluator(proto, verbose=1)
feeds = dict(X=torch.rand((4, 5)), Y=torch.rand((4, 5)))
result = sess.run(None, feeds)
print(string_type(result, with_shape=True, with_min_max=True))

>>>

    +I X: RuntimeValue(name='X', kind=5, shape=(4, 5), value=CT1s4x5[0.0019821524620056152,0.9329335689544678:A0.44820278584957124])
    +I Y: RuntimeValue(name='Y', kind=5, shape=(4, 5), value=CT1s4x5[0.01164865493774414,0.9826557040214539:A0.5178548455238342])
    Sigmoid_6(Y) -> sy
    +R sy: RuntimeValue(name='sy', kind=1, shape=(4, 5), is_shape=False, value=CT1s4x5[0.5029121041297913,0.727634847164154:A0.6245394647121429])
    Mul_1(Y, sy) -> ysy
    +R ysy: RuntimeValue(name='ysy', kind=1, shape=(4, 5), is_shape=False, value=CT1s4x5[0.005858249496668577,0.7150145173072815:A0.33991979171987624])
    - clean Y
    - clean sy
    Mul_1(X, ysy) -> final
    +R final: RuntimeValue(name='final', kind=9, shape=(4, 5), is_shape=False, value=CT1s4x5[0.00014279052265919745,0.4736403524875641:A0.15676576342229964])
    - clean X
    - clean ysy
    ++ outputs final
    - clean X
    - clean Y
    - clean final
    #1[T1s4x5[0.00014279052265919745,0.4736403524875641:A0.15676576342229964]]

The runtime can also execute the kernels of the onnx model on CUDA. It follows the same logic as onnxruntime.InferenceSession: providers=["CUDAExecutionProvider"]. It is better in that case to move the inputs to CUDA. The class tries to move every weight to CUDA but tries to keep any tensor identified as a shape on CPU. Some bugs may remain, as torch raises an exception when tensors expected to be on the same device are not. The runtime was validated with model arnir0/Tiny-LLM. The next example shows how to replace a kernel with a different one based on onnxruntime.

<<<

import numpy as np
import onnx
import onnx.helper as oh
import onnxruntime
import torch
from yobx.helpers import string_type
from yobx.torch.torch_helper import onnx_dtype_to_torch_dtype
from yobx.reference.torch_evaluator import TorchReferenceEvaluator
from yobx.reference.torch_ops import OpRunKernel, OpRunTensor

TFLOAT16 = onnx.TensorProto.FLOAT16


class LayerNormalizationOrt(OpRunKernel):
    "LayerNormalization based on onnxruntime"

    def __init__(self, node: onnx.NodeProto, version=None, verbose=0):
        super().__init__(node, version, verbose=verbose)
        self.axis = self.get_attribute_int(node, "axis", -1)
        self.epsilon = self.get_attribute_float(node, "epsilon", 1e-5)
        self.stash_type = onnx_dtype_to_torch_dtype(
            self.get_attribute_int(node, "stash_type", onnx.TensorProto.FLOAT)
        )
        self.compute_std = len(node.output) > 1
        assert not self.compute_std, "The keren only computes the first output."
        layer_model = oh.make_model(
            oh.make_graph(
                [
                    oh.make_node(
                        "LayerNormalization",
                        ["X", "W", "B"],
                        ["Z"],
                        axis=-1,
                        epsilon=9.999999974752427e-7,
                    )
                ],
                "dummy",
                [
                    oh.make_tensor_value_info("X", TFLOAT16, ["b", "c", "d"]),
                    oh.make_tensor_value_info("W", TFLOAT16, ["d"]),
                    oh.make_tensor_value_info("B", TFLOAT16, ["d"]),
                ],
                [oh.make_tensor_value_info("Z", TFLOAT16, ["b", "c", "d"])],
            ),
            ir_version=9,
            opset_imports=[oh.make_opsetid("", 17)],
        )
        self.ort_sess = onnxruntime.InferenceSession(
            layer_model.SerializeToString(), providers=["CUDAExecutionProvider"]
        )

    def run(self, x, scale, bias=None):
        print(f"-- running {self.__class__.__name__}")
        feeds = dict(X=x, W=scale)
        if bias is not None:
            feeds["B"] = bias
        feeds = {k: v.tensor.detach().cpu().numpy() for k, v in feeds.items()}
        got = self.ort_sess.run(None, feeds)[0]
        return OpRunTensor(torch.from_numpy(got).to(x.dtype).to(x.device))


# This kernel is tested on this model.
model = oh.make_model(
    oh.make_graph(
        [
            oh.make_node(
                "LayerNormalization",
                ["X", "W", "B"],
                ["ln"],
                axis=-1,
                epsilon=9.999999974752427e-7,
            ),
            oh.make_node(
                "Add", ["ln", "W"], ["Z"], axis=-1, epsilon=9.999999974752427e-7
            ),
        ],
        "dummy",
        [
            oh.make_tensor_value_info("X", TFLOAT16, ["b", "c", "d"]),
            oh.make_tensor_value_info("W", TFLOAT16, ["d"]),
            oh.make_tensor_value_info("B", TFLOAT16, ["d"]),
        ],
        [oh.make_tensor_value_info("Z", TFLOAT16, ["b", "c", "d"])],
    ),
    ir_version=9,
    opset_imports=[oh.make_opsetid("", 17)],
)

torch_sess = TorchReferenceEvaluator(
    model,
    custom_kernels={("", "LayerNormalization"): LayerNormalizationOrt},
    verbose=1,
)
feeds = dict(
    zip(
        torch_sess.input_names,
        [
            torch.rand(3, 4, 5, dtype=torch.float16),
            torch.abs(torch.rand(5, dtype=torch.float16)),
            torch.rand(5, dtype=torch.float16),
        ],
    )
)
res = torch_sess.run(None, feeds)
print(string_type(res, with_shape=True, with_min_max=True))

>>>

    /home/xadupre/vv/this312/lib/python3.12/site-packages/onnxruntime/capi/onnxruntime_inference_collection.py:123: UserWarning: Specified provider 'CUDAExecutionProvider' is not in available provider names.Available providers: 'AzureExecutionProvider, CPUExecutionProvider'
      warnings.warn(
    +I X: RuntimeValue(name='X', kind=5, shape=(3, 4, 5), value=CT10s3x4x5[0.01275634765625,0.98388671875:A0.46500625610351565])
    +I W: RuntimeValue(name='W', kind=5, shape=(5,), value=CT10s5[0.0858154296875,0.783203125:A0.4722900390625])
    +I B: RuntimeValue(name='B', kind=5, shape=(5,), value=CT10s5[0.1287841796875,0.97607421875:A0.7562255859375])
    LayerNormalizationOrt(X, W, B) -> ln
    -- running LayerNormalizationOrt
    +R ln: RuntimeValue(name='ln', kind=1, shape=(3, 4, 5), is_shape=False, value=CT10s3x4x5[-0.130859375,2.408203125:A0.7238398869832356])
    - clean X
    - clean B
    Add_1(ln, W) -> Z
    +R Z: RuntimeValue(name='Z', kind=9, shape=(3, 4, 5), is_shape=False, value=CT10s3x4x5[0.080322265625,3.19140625:A1.1961639404296875])
    - clean W
    - clean ln
    ++ outputs Z
    - clean X
    - clean W
    - clean B
    - clean Z
    #1[T10s3x4x5[0.080322265625,3.19140625:A1.1961639404296875]]
class IO[source]#
get_inputs()[source]#

Same API as onnxruntime.

get_outputs()[source]#

Same API as onnxruntime.

property on_cuda: bool#

Tells if the default device is CUDA.

run(outputs: List[str] | None, feeds: Dict[str, Tensor] | Dict[str, ndarray], report_cmp: ReportResultComparison | None = None) List[Tensor | None] | List[ndarray | None][source]#

Runs the ONNX model.

Parameters:
  • outputs – outputs required

  • feeds – inputs

  • report_cmp – used as a reference, every intermediate result is compared to every existing one; if not empty, it is an instance of yobx.reference.ReportResultComparison

Returns:

output tensors.

run_with_values(*args: OpRunTensor | None, context: Dict[str, RuntimeValue] | None = None) OpRunValue | Tuple[OpRunValue, ...][source]#

Runs the ONNX model. The signature is different. This method is called by every kernel holding a subgraph. The local variables are stored in context.

Parameters:
  • args – inputs

  • context – local context for the execution of subgraphs

Returns:

output OpRunTensor