Source code for mlinsights.mltree.tree_digitize

import numpy
from sklearn.tree._tree import Tree
from sklearn.tree import DecisionTreeRegressor
from ._tree_digitize import tree_add_node


def digitize2tree(bins, right=False):
    """
    Builds a decision tree which returns the same result as
    `lambda x: numpy.digitize(x, bins, right=right)`
    (see :epkg:`numpy:digitize`).

    :param bins: array of bins. It has to be 1-dimensional and monotonic.
    :param right: Indicating whether the intervals include the right
        or the left bin edge. Default behavior is (right==False)
        indicating that the interval does not include the right edge.
        The left bin end is open in this case, i.e.,
        `bins[i-1] <= x < bins[i]` is the default behavior for
        monotonically increasing bins.
    :return: decision tree

    .. note::
        The implementation of decision trees in :epkg:`scikit-learn`
        only allows one type of decision (`<=`). That's why the
        function throws an exception when `right=False`. However,
        this could be overcome by using :epkg:`ONNX`, where all kinds
        of decision rules are implemented. The default value for
        *right* is still *False* to follow the *numpy* API, even
        though this value raises an exception in *digitize2tree*.

    The following example shows what the tree looks like.

    .. runpython::
        :showcode:

        import numpy
        from sklearn.tree import export_text
        from mlinsights.mltree import digitize2tree

        x = numpy.array([0.2, 6.4, 3.0, 1.6])
        bins = numpy.array([0.0, 1.0, 2.5, 4.0, 7.0])
        expected = numpy.digitize(x, bins, right=True)
        tree = digitize2tree(bins, right=True)
        pred = tree.predict(x.reshape((-1, 1)))
        print("Comparison with numpy:")
        print(expected, pred)
        print("Tree:")
        print(export_text(tree, feature_names=['x']))
    """
    if not right:
        raise RuntimeError(f"right must be True, not right={right!r}")
    ascending = len(bins) <= 1 or bins[0] < bins[1]

    if not ascending:
        # Build the tree for the reversed (ascending) bins, then remap
        # every leaf value k to len(bins) - k.
        bins2 = bins[::-1]
        cl = digitize2tree(bins2, right=right)
        n = len(bins)
        for i in range(cl.tree_.value.shape[0]):
            cl.tree_.value[i, 0, 0] = n - cl.tree_.value[i, 0, 0]
        return cl

    tree = Tree(1, numpy.array([1], dtype=numpy.intp), 1)
    values = []
    UNUSED = numpy.nan
    n_nodes = []

    def add_root(index):
        assert 0 <= index < len(bins), "Unexpected index %d / len(bins)=%d." % (
            index,
            len(bins),
        )
        parent = -1
        is_left = False
        is_leaf = False
        threshold = bins[index]
        n = tree_add_node(tree, parent, is_left, is_leaf, 0, threshold, 0, 1, 1.0, 0)
        values.append(UNUSED)
        n_nodes.append(n)
        return n

    def add_nodes(parent, i, j, is_left):
        # adds the nodes covering bins[i:j] (j excluded)
        if is_left:
            # it means j is the parent split
            if i == j:
                # leaf
                n = tree_add_node(tree, parent, is_left, True, 0, 0, 0, 1, 1.0, 0)
                n_nodes.append(n)
                values.append(i)
                return n
            if i + 1 == j:
                # split
                values.append(UNUSED)
                th = bins[i]
                n = tree_add_node(tree, parent, is_left, False, 0, th, 0, 1, 1.0, 0)
                n_nodes.append(n)
                add_nodes(n, i, i, True)
                add_nodes(n, i, j, False)
                return n
            if i + 1 < j:
                # split
                values.append(UNUSED)
                index = (i + j) // 2
                th = bins[index]
                n = tree_add_node(tree, parent, is_left, False, 0, th, 0, 1, 1.0, 0)
                n_nodes.append(n)
                add_nodes(n, i, index, True)
                add_nodes(n, index, j, False)
                return n
        else:
            # it means i is the parent split
            if i + 1 == j:
                # leaf
                values.append(j)
                n = tree_add_node(tree, parent, is_left, True, 0, 0, 0, 1, 1.0, 0)
                n_nodes.append(n)
                return n
            if i + 1 < j:
                # split
                values.append(UNUSED)
                index = (i + j) // 2
                th = bins[index]
                n = tree_add_node(tree, parent, is_left, False, 0, th, 0, 1, 1.0, 0)
                n_nodes.append(n)
                add_nodes(n, i, index, True)
                add_nodes(n, index, j, False)
                return n
        raise NotImplementedError(
            f"Unexpected case where i={i!r}, j={j!r}, is_left={is_left!r}."
        )
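    # The tree is built by recursive bisection: the root tests
    # x <= bins[index] for the middle bin, add_nodes then covers
    # bins[0:index] in the left subtree and bins[index:len(bins)]
    # in the right one. Leaves store the index numpy.digitize would
    # return for their interval; internal nodes keep UNUSED (NaN)
    # because only leaf values are ever returned by predict.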
    index = len(bins) // 2
    add_root(index)
    add_nodes(0, 0, index, True)
    add_nodes(0, index, len(bins), False)

    cl = DecisionTreeRegressor()
    cl.tree_ = tree
    cl.tree_.value[:, 0, 0] = numpy.array(values, dtype=numpy.float64)
    cl.n_outputs = 1
    cl.n_outputs_ = 1
    cl.n_features_in_ = 1
    return cl
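As a usage sketch, assuming *mlinsights* and its compiled `_tree_digitize` extension are installed, the returned tree can be checked against `numpy.digitize` directly; the second part exercises the descending-bins branch handled above.

    import numpy
    from mlinsights.mltree import digitize2tree

    x = numpy.array([0.2, 6.4, 3.0, 1.6])

    # ascending bins: the tree reproduces numpy.digitize(..., right=True)
    bins = numpy.array([0.0, 1.0, 2.5, 4.0, 7.0])
    tree = digitize2tree(bins, right=True)
    assert numpy.array_equal(
        numpy.digitize(x, bins, right=True), tree.predict(x.reshape((-1, 1)))
    )

    # descending bins: digitize2tree builds the tree on bins[::-1] and
    # remaps every leaf value k to len(bins) - k, as done above
    tree_desc = digitize2tree(bins[::-1], right=True)
    assert numpy.array_equal(
        numpy.digitize(x, bins[::-1], right=True),
        tree_desc.predict(x.reshape((-1, 1))),
    )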