yobx.reference.torch_evaluator#

yobx.reference.torch_evaluator.get_kernels() Dict[Tuple[str, str, int], type[OpRunKernel]][source]#

Retrieves all the available kernels the class TorchReferenceEvaluator can use. The full list is the following.

<<<

from yobx.reference.torch_evaluator import get_kernels

for k, v in sorted(get_kernels().items()):
    domain, name, version = k
    f = f"{name}({version})" if domain == "" else f"{name}[{domain}]({version})"
    add = " " * max(25 - len(f), 0)
    dd = " -- device dependent" if v.device_dependent() else ""
    print(f"{f}{add} -- {v.__name__}{dd}")

>>>

    Abs(1)                    -- Abs_1
    Add(1)                    -- Add_1
    And(1)                    -- And_1
    AveragePool(11)           -- AveragePool_11
    Cast(6)                   -- Cast_6
    CastLike(15)              -- CastLike_15
    Concat(1)                 -- Concat_1
    ConcatFromSequence(11)    -- ConcatFromSequence_11
    ConstantOfShape(9)        -- ConstantOfShape_9 -- device dependent
    Conv(11)                  -- Conv_11
    Cos(1)                    -- Cos_1
    Div(1)                    -- Div_1
    Equal(1)                  -- Equal_1
    Erf(9)                    -- Erf_9
    Exp(1)                    -- Exp_1
    Expand(8)                 -- Expand_8
    Gather(1)                 -- Gather_1
    Greater(1)                -- Greater_1
    GreaterOrEqual(1)         -- GreaterOrEqual_1
    Identity(1)               -- Identity_1
    If(1)                     -- If_1
    IsNaN(9)                  -- IsNaN_9
    LayerNormalization(17)    -- LayerNormalization_17
    Less(1)                   -- Less_1
    LessOrEqual(1)            -- LessOrEqual_1
    Log(1)                    -- Log_1
    Loop(16)                  -- Loop_16
    MatMul(1)                 -- MatMul_1
    Mul(1)                    -- Mul_1
    Neg(1)                    -- Neg_1
    NonZero(13)               -- NonZero_13
    Not(1)                    -- Not_1
    Or(1)                     -- Or_1
    Pow(12)                   -- Pow_12
    Range(11)                 -- Range_11 -- device dependent
    Reciprocal(1)             -- Reciprocal_1
    ReduceMax(18)             -- ReduceMax_18
    ReduceMean(18)            -- ReduceMean_18
    ReduceMin(17)             -- ReduceMin_17
    ReduceMin(18)             -- ReduceMin_18
    ReduceSum(13)             -- ReduceSum_13
    Reshape(14)               -- Reshape_14
    ScatterND(16)             -- ScatterND_16
    SequenceEmpty(11)         -- SequenceEmpty_11
    SequenceInsert(11)        -- SequenceInsert_11
    Shape(15)                 -- Shape_15
    Sigmoid(6)                -- Sigmoid_6
    Sin(1)                    -- Sin_1
    Slice(13)                 -- Slice_13
    Softmax(13)               -- Softmax_13
    Split(18)                 -- Split_18
    Sqrt(1)                   -- Sqrt_1
    Squeeze(13)               -- Squeeze_13
    Sub(1)                    -- Sub_1
    Tanh(6)                   -- Tanh_6
    Tile(6)                   -- Tile_6
    Transpose(1)              -- Transpose_1
    Trilu(14)                 -- Trilu_14
    Unsqueeze(13)             -- Unsqueeze_13
    Where(9)                  -- Where_9

TorchReferenceEvaluator#

class yobx.reference.torch_evaluator.TorchReferenceEvaluator(proto: FunctionProto | GraphProto | ModelProto | ExportArtifact, providers: Tuple[str, ...] = ('CPUExecutionProvider',), opsets: Dict[str, int] | None = None, local_functions: Dict[Tuple[str, str], TorchReferenceEvaluator] | None = None, verbose: int = 0, custom_kernels: Dict[Tuple[str, str], type[OpRunKernel]] | None = None)[source]#

Torch evaluator for onnx models. The model does not store the original proto it evaluates in order to avoid unnecessary memory usage and potential side effects from mutating a shared object.

Parameters:
  • proto – a proto

  • providers – where to run the model

  • opsets – needed if proto is a graph

  • local_functions – known local functions

  • verbose – verbosity level

  • custom_kernels – dictionary of kernels the user can define to override a specific implementation: ("", "LayerNormalization"): CustomKernel

The class holds the following attributes:

  • providers: providers

  • default_device: default torch device

  • constants: all initializers or constants

  • kernels: kernels

  • runtime_info: produced by first_used_last_used

  • last_used: contains the list of intermediate results to remove after every node execution; this prevents memory from growing too much

  • functions: local functions

The class is not multithreaded. runtime_info gets updated by the class. The list of available kernels is returned by function yobx.reference.torch_evaluator.get_kernels(). Example:

<<<

import onnx
import onnx.helper as oh
import torch
from yobx.helpers import string_type
from yobx.reference.torch_evaluator import TorchReferenceEvaluator

TFLOAT = onnx.TensorProto.FLOAT

proto = oh.make_model(
    oh.make_graph(
        [
            oh.make_node("Sigmoid", ["Y"], ["sy"]),
            oh.make_node("Mul", ["Y", "sy"], ["ysy"]),
            oh.make_node("Mul", ["X", "ysy"], ["final"]),
        ],
        "-nd-",
        [
            oh.make_tensor_value_info("X", TFLOAT, [1, "b", "c"]),
            oh.make_tensor_value_info("Y", TFLOAT, ["a", "b", "c"]),
        ],
        [oh.make_tensor_value_info("final", TFLOAT, ["a", "b", "c"])],
    ),
    opset_imports=[oh.make_opsetid("", 18)],
    ir_version=9,
)

sess = TorchReferenceEvaluator(proto)
feeds = dict(X=torch.rand((4, 5)), Y=torch.rand((4, 5)))
result = sess.run(None, feeds)
print(string_type(result, with_shape=True, with_min_max=True))

>>>

    #1[T1s4x5[0.0007596650393679738,0.660194456577301:A0.18426050830748864]]

With verbose=1, the class prints out every kernel run and every result deleted along the run. It shows when a result is not needed anymore. In that case, it is deleted to free the memory it takes.

<<<

import onnx
import onnx.helper as oh
import torch
from yobx.helpers import string_type
from yobx.reference.torch_evaluator import TorchReferenceEvaluator

TFLOAT = onnx.TensorProto.FLOAT

proto = oh.make_model(
    oh.make_graph(
        [
            oh.make_node("Sigmoid", ["Y"], ["sy"]),
            oh.make_node("Mul", ["Y", "sy"], ["ysy"]),
            oh.make_node("Mul", ["X", "ysy"], ["final"]),
        ],
        "-nd-",
        [
            oh.make_tensor_value_info("X", TFLOAT, [1, "b", "c"]),
            oh.make_tensor_value_info("Y", TFLOAT, ["a", "b", "c"]),
        ],
        [oh.make_tensor_value_info("final", TFLOAT, ["a", "b", "c"])],
    ),
    opset_imports=[oh.make_opsetid("", 18)],
    ir_version=9,
)

sess = TorchReferenceEvaluator(proto, verbose=1)
feeds = dict(X=torch.rand((4, 5)), Y=torch.rand((4, 5)))
result = sess.run(None, feeds)
print(string_type(result, with_shape=True, with_min_max=True))

>>>

    +I X: RuntimeValue(name='X', kind=5, shape=(4, 5), value=CT1s4x5[0.0019821524620056152,0.9329335689544678:A0.44820278584957124])
    +I Y: RuntimeValue(name='Y', kind=5, shape=(4, 5), value=CT1s4x5[0.01164865493774414,0.9826557040214539:A0.5178548455238342])
    Sigmoid_6(Y) -> sy
    +R sy: RuntimeValue(name='sy', kind=1, shape=(4, 5), is_shape=False, value=CT1s4x5[0.5029121041297913,0.727634847164154:A0.6245394647121429])
    Mul_1(Y, sy) -> ysy
    +R ysy: RuntimeValue(name='ysy', kind=1, shape=(4, 5), is_shape=False, value=CT1s4x5[0.005858249496668577,0.7150145173072815:A0.33991979171987624])
    - clean Y
    - clean sy
    Mul_1(X, ysy) -> final
    +R final: RuntimeValue(name='final', kind=9, shape=(4, 5), is_shape=False, value=CT1s4x5[0.00014279052265919745,0.4736403524875641:A0.15676576342229964])
    - clean X
    - clean ysy
    ++ outputs final
    - clean X
    - clean Y
    - clean final
    #1[T1s4x5[0.00014279052265919745,0.4736403524875641:A0.15676576342229964]]

The runtime can also execute the kernels of the onnx model on CUDA. It follows the same logic as onnxruntime.InferenceSession: providers=["CUDAExecutionProvider"]. It is better in that case to move the inputs to CUDA. The class tries to move every weight to CUDA but tries to keep any tensor identified as a shape on CPU. Some bugs may remain, as torch raises an exception when tensors expected to be on the same device are not. The runtime was validated with model arnir0/Tiny-LLM. The next example shows how to replace a kernel with a different one based on onnxruntime.

<<<

import numpy as np
import onnx
import onnx.helper as oh
import onnxruntime
import torch
from yobx.helpers import string_type
from yobx.torch.torch_helper import onnx_dtype_to_torch_dtype
from yobx.reference.torch_evaluator import TorchReferenceEvaluator
from yobx.reference.torch_ops import OpRunKernel, OpRunTensor

TFLOAT16 = onnx.TensorProto.FLOAT16


class LayerNormalizationOrt(OpRunKernel):
    "LayerNormalization based on onnxruntime"

    def __init__(self, node: onnx.NodeProto, version=None, verbose=0):
        super().__init__(node, version, verbose=verbose)
        self.axis = self.get_attribute_int(node, "axis", -1)
        self.epsilon = self.get_attribute_float(node, "epsilon", 1e-5)
        self.stash_type = onnx_dtype_to_torch_dtype(
            self.get_attribute_int(node, "stash_type", onnx.TensorProto.FLOAT)
        )
        self.compute_std = len(node.output) > 1
        assert not self.compute_std, "The keren only computes the first output."
        layer_model = oh.make_model(
            oh.make_graph(
                [
                    oh.make_node(
                        "LayerNormalization",
                        ["X", "W", "B"],
                        ["Z"],
                        axis=-1,
                        epsilon=9.999999974752427e-7,
                    )
                ],
                "dummy",
                [
                    oh.make_tensor_value_info("X", TFLOAT16, ["b", "c", "d"]),
                    oh.make_tensor_value_info("W", TFLOAT16, ["d"]),
                    oh.make_tensor_value_info("B", TFLOAT16, ["d"]),
                ],
                [oh.make_tensor_value_info("Z", TFLOAT16, ["b", "c", "d"])],
            ),
            ir_version=9,
            opset_imports=[oh.make_opsetid("", 17)],
        )
        self.ort_sess = onnxruntime.InferenceSession(
            layer_model.SerializeToString(), providers=["CUDAExecutionProvider"]
        )

    def run(self, x, scale, bias=None):
        print(f"-- running {self.__class__.__name__}")
        feeds = dict(X=x, W=scale)
        if bias is not None:
            feeds["B"] = bias
        feeds = {k: v.tensor.detach().cpu().numpy() for k, v in feeds.items()}
        got = self.ort_sess.run(None, feeds)[0]
        return OpRunTensor(torch.from_numpy(got).to(x.dtype).to(x.device))


# This kernel is tested on this model.
model = oh.make_model(
    oh.make_graph(
        [
            oh.make_node(
                "LayerNormalization",
                ["X", "W", "B"],
                ["ln"],
                axis=-1,
                epsilon=9.999999974752427e-7,
            ),
            oh.make_node(
                "Add", ["ln", "W"], ["Z"], axis=-1, epsilon=9.999999974752427e-7
            ),
        ],
        "dummy",
        [
            oh.make_tensor_value_info("X", TFLOAT16, ["b", "c", "d"]),
            oh.make_tensor_value_info("W", TFLOAT16, ["d"]),
            oh.make_tensor_value_info("B", TFLOAT16, ["d"]),
        ],
        [oh.make_tensor_value_info("Z", TFLOAT16, ["b", "c", "d"])],
    ),
    ir_version=9,
    opset_imports=[oh.make_opsetid("", 17)],
)

torch_sess = TorchReferenceEvaluator(
    model,
    custom_kernels={("", "LayerNormalization"): LayerNormalizationOrt},
    verbose=1,
)
feeds = dict(
    zip(
        torch_sess.input_names,
        [
            torch.rand(3, 4, 5, dtype=torch.float16),
            torch.abs(torch.rand(5, dtype=torch.float16)),
            torch.rand(5, dtype=torch.float16),
        ],
    )
)
res = torch_sess.run(None, feeds)
print(string_type(res, with_shape=True, with_min_max=True))

>>>

    /home/xadupre/vv/this312/lib/python3.12/site-packages/onnxruntime/capi/onnxruntime_inference_collection.py:123: UserWarning: Specified provider 'CUDAExecutionProvider' is not in available provider names.Available providers: 'AzureExecutionProvider, CPUExecutionProvider'
      warnings.warn(
    +I X: RuntimeValue(name='X', kind=5, shape=(3, 4, 5), value=CT10s3x4x5[0.01275634765625,0.98388671875:A0.46500625610351565])
    +I W: RuntimeValue(name='W', kind=5, shape=(5,), value=CT10s5[0.0858154296875,0.783203125:A0.4722900390625])
    +I B: RuntimeValue(name='B', kind=5, shape=(5,), value=CT10s5[0.1287841796875,0.97607421875:A0.7562255859375])
    LayerNormalizationOrt(X, W, B) -> ln
    -- running LayerNormalizationOrt
    +R ln: RuntimeValue(name='ln', kind=1, shape=(3, 4, 5), is_shape=False, value=CT10s3x4x5[-0.130859375,2.408203125:A0.7238398869832356])
    - clean X
    - clean B
    Add_1(ln, W) -> Z
    +R Z: RuntimeValue(name='Z', kind=9, shape=(3, 4, 5), is_shape=False, value=CT10s3x4x5[0.080322265625,3.19140625:A1.1961639404296875])
    - clean W
    - clean ln
    ++ outputs Z
    - clean X
    - clean W
    - clean B
    - clean Z
    #1[T10s3x4x5[0.080322265625,3.19140625:A1.1961639404296875]]
class IO[source]#
get_inputs()[source]#

Same API as onnxruntime.

get_outputs()[source]#

Same API as onnxruntime.

property on_cuda: bool#

Tells if the default device is CUDA.

run(outputs: List[str] | None, feeds: Dict[str, Tensor] | Dict[str, ndarray], report_cmp: ReportResultComparison | None = None) List[Tensor | None] | List[ndarray | None][source]#

Runs the ONNX model.

Parameters:
  • outputs – outputs required

  • feeds – inputs

  • report_cmp – used as a reference, every intermediate result is compared to every existing one; if not empty, it is an instance of yobx.reference.ReportResultComparison

Returns:

output tensors.

run_with_values(*args: OpRunTensor | None, context: Dict[str, RuntimeValue] | None = None) OpRunValue | Tuple[OpRunValue, ...][source]#

Runs the ONNX model. The signature is different. This method is called by every kernel holding a subgraph. The local variables are stored in context.

Parameters:
  • args – inputs

  • context – local context for the execution of subgraphs

Returns:

output OpRunTensor