Source code for mlinsights.mltree.tree_digitize

import numpy
from sklearn.tree._tree import Tree
from sklearn.tree import DecisionTreeRegressor
from ._tree_digitize import tree_add_node


def digitize2tree(bins, right=False):
    """
    Builds a decision tree which returns the same result as
    `lambda x: numpy.digitize(x, bins, right=right)`
    (see :epkg:`numpy:digitize`).

    :param bins: array of bins. It has to be 1-dimensional and monotonic.
    :param right: Indicating whether the intervals include the right
        or the left bin edge. Default behavior is (right==False)
        indicating that the interval does not include the right edge.
        The left bin end is open in this case, i.e.,
        `bins[i-1] <= x < bins[i]` is the default behavior for
        monotonically increasing bins.
    :return: decision tree

    .. note::
        The implementation of decision trees in :epkg:`scikit-learn`
        only allows one type of decision (`<=`). That's why the
        function throws an exception when `right=False`. However,
        this could be overcome by using :epkg:`ONNX`, where all kinds
        of decision rules are implemented. The default value for
        *right* is still *False* to follow the *numpy* API, even
        though this value raises an exception in *digitize2tree*.

    The following example shows what the tree looks like.

    .. runpython::
        :showcode:

        import numpy
        from sklearn.tree import export_text
        from mlinsights.mltree import digitize2tree

        x = numpy.array([0.2, 6.4, 3.0, 1.6])
        bins = numpy.array([0.0, 1.0, 2.5, 4.0, 7.0])
        expected = numpy.digitize(x, bins, right=True)
        tree = digitize2tree(bins, right=True)
        pred = tree.predict(x.reshape((-1, 1)))
        print("Comparison with numpy:")
        print(expected, pred)
        print("Tree:")
        print(export_text(tree, feature_names=['x']))
    """
    if not right:
        raise RuntimeError(f"right must be True, not right={right!r}")
    ascending = len(bins) <= 1 or bins[0] < bins[1]

    if not ascending:
        # Build the tree for the reversed (ascending) bins, then remap
        # every leaf value k to len(bins) - k.
        bins2 = bins[::-1]
        cl = digitize2tree(bins2, right=right)
        n = len(bins)
        for i in range(cl.tree_.value.shape[0]):
            cl.tree_.value[i, 0, 0] = n - cl.tree_.value[i, 0, 0]
        return cl

    tree = Tree(1, numpy.array([1], dtype=numpy.intp), 1)
    values = []
    UNUSED = numpy.nan
    n_nodes = []

    def add_root(index):
        assert 0 <= index < len(bins), "Unexpected index %d / len(bins)=%d." % (
            index,
            len(bins),
        )
        parent = -1
        is_left = False
        is_leaf = False
        threshold = bins[index]
        n = tree_add_node(tree, parent, is_left, is_leaf, 0, threshold, 0, 1, 1.0, 0)
        values.append(UNUSED)
        n_nodes.append(n)
        return n

    def add_nodes(parent, i, j, is_left):
        # adds the nodes covering bins[i:j] (j excluded)
        if is_left:
            # it means j is the parent split
            if i == j:
                # leaf
                n = tree_add_node(tree, parent, is_left, True, 0, 0, 0, 1, 1.0, 0)
                n_nodes.append(n)
                values.append(i)
                return n
            if i + 1 == j:
                # split
                values.append(UNUSED)
                th = bins[i]
                n = tree_add_node(tree, parent, is_left, False, 0, th, 0, 1, 1.0, 0)
                n_nodes.append(n)
                add_nodes(n, i, i, True)
                add_nodes(n, i, j, False)
                return n
            if i + 1 < j:
                # split
                values.append(UNUSED)
                index = (i + j) // 2
                th = bins[index]
                n = tree_add_node(tree, parent, is_left, False, 0, th, 0, 1, 1.0, 0)
                n_nodes.append(n)
                add_nodes(n, i, index, True)
                add_nodes(n, index, j, False)
                return n
        else:
            # it means i is the parent split
            if i + 1 == j:
                # leaf
                values.append(j)
                n = tree_add_node(tree, parent, is_left, True, 0, 0, 0, 1, 1.0, 0)
                n_nodes.append(n)
                return n
            if i + 1 < j:
                # split
                values.append(UNUSED)
                index = (i + j) // 2
                th = bins[index]
                n = tree_add_node(tree, parent, is_left, False, 0, th, 0, 1, 1.0, 0)
                n_nodes.append(n)
                add_nodes(n, i, index, True)
                add_nodes(n, index, j, False)
                return n
        raise NotImplementedError(
            f"Unexpected case where i={i!r}, j={j!r}, is_left={is_left!r}."
        )
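    # The tree is built by recursive bisection: the root tests
    # x <= bins[index] for the middle bin, add_nodes then covers
    # bins[0:index] in the left subtree and bins[index:len(bins)]
    # in the right one. Leaves store the index numpy.digitize would
    # return for their interval; internal nodes keep UNUSED (NaN)
    # because only leaf values are ever returned by predict.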
    index = len(bins) // 2
    add_root(index)
    add_nodes(0, 0, index, True)
    add_nodes(0, index, len(bins), False)

    cl = DecisionTreeRegressor()
    cl.tree_ = tree
    cl.tree_.value[:, 0, 0] = numpy.array(values, dtype=numpy.float64)
    cl.n_outputs = 1
    cl.n_outputs_ = 1
    cl.n_features_in_ = 1
    return cl
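As a usage sketch, assuming *mlinsights* and its compiled `_tree_digitize` extension are installed, the returned tree can be checked against `numpy.digitize` directly; the second part exercises the descending-bins branch handled above.

    import numpy
    from mlinsights.mltree import digitize2tree

    x = numpy.array([0.2, 6.4, 3.0, 1.6])

    # ascending bins: the tree reproduces numpy.digitize(..., right=True)
    bins = numpy.array([0.0, 1.0, 2.5, 4.0, 7.0])
    tree = digitize2tree(bins, right=True)
    assert numpy.array_equal(
        numpy.digitize(x, bins, right=True), tree.predict(x.reshape((-1, 1)))
    )

    # descending bins: digitize2tree builds the tree on bins[::-1] and
    # remaps every leaf value k to len(bins) - k, as done above
    tree_desc = digitize2tree(bins[::-1], right=True)
    assert numpy.array_equal(
        numpy.digitize(x, bins[::-1], right=True),
        tree_desc.predict(x.reshape((-1, 1))),
    )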