import numpy
import numpy.random
from sklearn.base import RegressorMixin, clone, BaseEstimator
from joblib import Parallel, delayed
try:
    from tqdm import tqdm
except ImportError:
    # tqdm is optional; it is only needed when verbose == 'tqdm'.
    tqdm = None
class IntervalRegressor(BaseEstimator, RegressorMixin):
"""
Trains multiple regressors to provide a confidence
interval on prediction. It only works for
single regression. Every training is made with a new
sample of the training data, parameter *alpha*
let the user choose the size of this sample.
A smaller *alpha* increases the variance
of the predictions. The current implementation
draws sample by random but keeps the weight associated
to each of them. Another way could be to draw
a weighted sample but give them uniform weights.
:param estimator: predictor trained on every bucket
:param n_estimators: number of estimators to train
:param n_jobs: number of parallel jobs (for training and predicting)
:param alpha: proportion of samples resampled for each training
:param verbose: boolean or use ``'tqdm'`` to use :epkg:`tqdm`
to fit the estimators
"""
def __init__(
self, estimator=None, n_estimators=10, n_jobs=None, alpha=1.0, verbose=False
):
BaseEstimator.__init__(self)
RegressorMixin.__init__(self)
        assert estimator is not None, "estimator cannot be None."
self.estimator = estimator
self.n_jobs = n_jobs
self.alpha = alpha
self.verbose = verbose
self.n_estimators = n_estimators
@property
def n_estimators_(self):
"""
Returns the number of estimators = the number of buckets
the data was split in.
"""
return len(self.estimators_)
def fit(self, X, y, sample_weight=None):
"""
Trains the binner and an estimator on every
bucket.
:param X: features, *X* is converted into an array if *X* is a dataframe
:param y: target
:param sample_weight: sample weights
:return: self: returns an instance of self.
Fitted attributes:
* `binner_`: binner
* `estimators_`: dictionary of estimators, each of them
mapped to a leave to the tree
* `mean_estimator_`: estimator trained on the whole
datasets in case the binner can find a bucket for
a new observation
* `dim_`: dimension of the output
* `mean_`: average targets
"""
        estimators = [clone(self.estimator) for _ in range(self.n_estimators)]
        if self.verbose == "tqdm":
            if tqdm is None:
                raise ImportError("tqdm is not installed but verbose is 'tqdm'.")
            loop = tqdm(range(len(estimators)))
        else:
            loop = range(len(estimators))
        verbose = 1 if self.verbose else 0
        def _fit_piecewise_estimator(i, est, X, y, sample_weight, alpha):
            # Draw a resample (with replacement) of size alpha * n_samples.
            new_size = int(X.shape[0] * alpha + 0.5)
            # randint's upper bound is exclusive: X.shape[0] keeps the
            # last row reachable (X.shape[0] - 1 would exclude it).
            rnd = numpy.random.randint(0, X.shape[0], new_size)
            Xr = X[rnd]
            yr = y[rnd]
            sr = sample_weight[rnd] if sample_weight is not None else None
            return est.fit(Xr, yr, sr)
        # Fit every clone in parallel; prefer="threads" avoids
        # copying X into subprocesses.
        self.estimators_ = Parallel(
            n_jobs=self.n_jobs, verbose=verbose, prefer="threads"
        )(
delayed(_fit_piecewise_estimator)(
i, estimators[i], X, y, sample_weight, self.alpha
)
for i in loop
)
return self
def predict_all(self, X):
"""
Computes the predictions for all estimators.
:param X: features, *X* is converted into an array if *X* is a dataframe
:return: predictions
"""
container = numpy.empty((X.shape[0], len(self.estimators_)))
for i, est in enumerate(self.estimators_):
pred = est.predict(X)
container[:, i] = pred
return container
def predict(self, X):
"""
Computes the average predictions.
:param X: features, *X* is converted into an array if *X* is a dataframe
:return: predictions
"""
preds = self.predict_all(X)
return preds.mean(axis=1)
def predict_sorted(self, X):
"""
Computes the predictions for all estimators.
Sorts them for all observations.
:param X: features, *X* is converted into an array if *X* is a dataframe
:return: predictions sorted for each observation
"""
preds = self.predict_all(X)
for i in range(preds.shape[0]):
preds[i, :] = numpy.sort(preds[i, :])
return preds
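

# A minimal usage sketch, not part of the original module: it pairs
# IntervalRegressor with scikit-learn's DecisionTreeRegressor (an
# assumed choice of base estimator) on synthetic data and reads a
# rough 90% interval from the sorted per-estimator predictions.
if __name__ == "__main__":
    from sklearn.tree import DecisionTreeRegressor

    rng = numpy.random.RandomState(0)
    X = rng.uniform(0, 10, (200, 1))
    y = numpy.sin(X[:, 0]) + rng.normal(0, 0.2, 200)

    model = IntervalRegressor(
        estimator=DecisionTreeRegressor(max_depth=3),
        n_estimators=50,
        alpha=0.8,
    )
    model.fit(X, y)

    X_test = numpy.array([[2.5], [7.5]])
    sorted_preds = model.predict_sorted(X_test)
    # Columns near the 5% and 95% empirical quantiles of the sorted
    # predictions give an approximate 90% confidence interval.
    lower = sorted_preds[:, int(0.05 * model.n_estimators_)]
    upper = sorted_preds[:, int(0.95 * model.n_estimators_) - 1]
    print("mean prediction:", model.predict(X_test))
    print("90% interval:", list(zip(lower, upper)))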