"""
|
||||
Various bayesian regression
|
||||
"""
|
||||
|
||||
# Authors: V. Michel, F. Pedregosa, A. Gramfort
|
||||
# License: BSD 3 clause
|
||||
|
||||
from math import log
|
||||
from numbers import Integral, Real
|
||||
|
||||
import numpy as np
|
||||
from scipy import linalg
|
||||
from scipy.linalg import pinvh
|
||||
|
||||
from ..base import RegressorMixin, _fit_context
|
||||
from ..utils import _safe_indexing
|
||||
from ..utils._param_validation import Interval
|
||||
from ..utils.extmath import fast_logdet
|
||||
from ..utils.validation import _check_sample_weight
|
||||
from ._base import LinearModel, _preprocess_data, _rescale_data

###############################################################################
# BayesianRidge regression


class BayesianRidge(RegressorMixin, LinearModel):
    """Bayesian ridge regression.

    Fit a Bayesian ridge model. See the Notes section for details on this
    implementation and the optimization of the regularization parameters
    lambda (precision of the weights) and alpha (precision of the noise).

    Read more in the :ref:`User Guide <bayesian_regression>`.
    For an intuitive visualization of how the sinusoid is approximated by
    a polynomial using different pairs of initial values, see
    :ref:`sphx_glr_auto_examples_linear_model_plot_bayesian_ridge_curvefit.py`.

    Parameters
    ----------
    max_iter : int, default=300
        Maximum number of iterations over the complete dataset before
        stopping independently of any early stopping criterion.

        .. versionchanged:: 1.3

    tol : float, default=1e-3
        Stop the algorithm if w has converged.

    alpha_1 : float, default=1e-6
        Hyper-parameter : shape parameter for the Gamma distribution prior
        over the alpha parameter.

    alpha_2 : float, default=1e-6
        Hyper-parameter : inverse scale parameter (rate parameter) for the
        Gamma distribution prior over the alpha parameter.

    lambda_1 : float, default=1e-6
        Hyper-parameter : shape parameter for the Gamma distribution prior
        over the lambda parameter.

    lambda_2 : float, default=1e-6
        Hyper-parameter : inverse scale parameter (rate parameter) for the
        Gamma distribution prior over the lambda parameter.

    alpha_init : float, default=None
        Initial value for alpha (precision of the noise).
        If not set, alpha_init is 1/Var(y).

        .. versionadded:: 0.22

    lambda_init : float, default=None
        Initial value for lambda (precision of the weights).
        If not set, lambda_init is 1.

        .. versionadded:: 0.22

    compute_score : bool, default=False
        If True, compute the log marginal likelihood at each iteration of the
        optimization.

    fit_intercept : bool, default=True
        Whether to calculate the intercept for this model.
        The intercept is not treated as a probabilistic parameter
        and thus has no associated variance. If set
        to False, no intercept will be used in calculations
        (i.e. data is expected to be centered).

    copy_X : bool, default=True
        If True, X will be copied; else, it may be overwritten.

    verbose : bool, default=False
        Verbose mode when fitting the model.

    Attributes
    ----------
    coef_ : array-like of shape (n_features,)
        Coefficients of the regression model (mean of distribution)

    intercept_ : float
        Independent term in decision function. Set to 0.0 if
        `fit_intercept = False`.

    alpha_ : float
        Estimated precision of the noise.

    lambda_ : float
        Estimated precision of the weights.

    sigma_ : array-like of shape (n_features, n_features)
        Estimated variance-covariance matrix of the weights

    scores_ : array-like of shape (n_iter_+1,)
        If compute_score is True, value of the log marginal likelihood (to be
        maximized) at each iteration of the optimization. The array starts
        with the value of the log marginal likelihood obtained for the initial
        values of alpha and lambda and ends with the value obtained for the
        estimated alpha and lambda.

    n_iter_ : int
        The actual number of iterations to reach the stopping criterion.

    X_offset_ : ndarray of shape (n_features,)
        If `fit_intercept=True`, offset subtracted for centering data to a
        zero mean. Set to np.zeros(n_features) otherwise.

    X_scale_ : ndarray of shape (n_features,)
        Set to np.ones(n_features).

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    ARDRegression : Bayesian ARD regression.

    Notes
    -----
    There exist several strategies to perform Bayesian ridge regression. This
    implementation is based on the algorithm described in Appendix A of
    (Tipping, 2001) where updates of the regularization parameters are done as
    suggested in (MacKay, 1992). Note that according to A New
    View of Automatic Relevance Determination (Wipf and Nagarajan, 2008) these
    update rules do not guarantee that the marginal likelihood is increasing
    between two consecutive iterations of the optimization.

    References
    ----------
    D. J. C. MacKay, Bayesian Interpolation, Computation and Neural Systems,
    Vol. 4, No. 3, 1992.

    M. E. Tipping, Sparse Bayesian Learning and the Relevance Vector Machine,
    Journal of Machine Learning Research, Vol. 1, 2001.

    Examples
    --------
    >>> from sklearn import linear_model
    >>> clf = linear_model.BayesianRidge()
    >>> clf.fit([[0,0], [1, 1], [2, 2]], [0, 1, 2])
    BayesianRidge()
    >>> clf.predict([[1, 1]])
    array([1.])
    """

    _parameter_constraints: dict = {
        "max_iter": [Interval(Integral, 1, None, closed="left")],
        "tol": [Interval(Real, 0, None, closed="neither")],
        "alpha_1": [Interval(Real, 0, None, closed="left")],
        "alpha_2": [Interval(Real, 0, None, closed="left")],
        "lambda_1": [Interval(Real, 0, None, closed="left")],
        "lambda_2": [Interval(Real, 0, None, closed="left")],
        "alpha_init": [None, Interval(Real, 0, None, closed="left")],
        "lambda_init": [None, Interval(Real, 0, None, closed="left")],
        "compute_score": ["boolean"],
        "fit_intercept": ["boolean"],
        "copy_X": ["boolean"],
        "verbose": ["verbose"],
    }

    def __init__(
        self,
        *,
        max_iter=300,
        tol=1.0e-3,
        alpha_1=1.0e-6,
        alpha_2=1.0e-6,
        lambda_1=1.0e-6,
        lambda_2=1.0e-6,
        alpha_init=None,
        lambda_init=None,
        compute_score=False,
        fit_intercept=True,
        copy_X=True,
        verbose=False,
    ):
        self.max_iter = max_iter
        self.tol = tol
        self.alpha_1 = alpha_1
        self.alpha_2 = alpha_2
        self.lambda_1 = lambda_1
        self.lambda_2 = lambda_2
        self.alpha_init = alpha_init
        self.lambda_init = lambda_init
        self.compute_score = compute_score
        self.fit_intercept = fit_intercept
        self.copy_X = copy_X
        self.verbose = verbose

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y, sample_weight=None):
        """Fit the model.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            Training data.
        y : ndarray of shape (n_samples,)
            Target values. Will be cast to X's dtype if necessary.

        sample_weight : ndarray of shape (n_samples,), default=None
            Individual weights for each sample.

            .. versionadded:: 0.20
               parameter *sample_weight* support to BayesianRidge.

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        X, y = self._validate_data(
            X, y, dtype=[np.float64, np.float32], force_writeable=True, y_numeric=True
        )
        dtype = X.dtype

        if sample_weight is not None:
            sample_weight = _check_sample_weight(sample_weight, X, dtype=dtype)

        X, y, X_offset_, y_offset_, X_scale_ = _preprocess_data(
            X,
            y,
            fit_intercept=self.fit_intercept,
            copy=self.copy_X,
            sample_weight=sample_weight,
        )

        if sample_weight is not None:
            # Sample weight can be implemented via a simple rescaling.
            X, y, _ = _rescale_data(X, y, sample_weight)

        self.X_offset_ = X_offset_
        self.X_scale_ = X_scale_
        n_samples, n_features = X.shape

        # Initialization of the values of the parameters
        eps = np.finfo(np.float64).eps
        # Add `eps` in the denominator to avoid division by zero if `np.var(y)`
        # is zero
        alpha_ = self.alpha_init
        lambda_ = self.lambda_init
        if alpha_ is None:
            alpha_ = 1.0 / (np.var(y) + eps)
        if lambda_ is None:
            lambda_ = 1.0

        # Avoid unintended type promotion to float64 with numpy 2
        alpha_ = np.asarray(alpha_, dtype=dtype)
        lambda_ = np.asarray(lambda_, dtype=dtype)

        verbose = self.verbose
        lambda_1 = self.lambda_1
        lambda_2 = self.lambda_2
        alpha_1 = self.alpha_1
        alpha_2 = self.alpha_2

        self.scores_ = list()
        coef_old_ = None

        XT_y = np.dot(X.T, y)
        U, S, Vh = linalg.svd(X, full_matrices=False)
        eigen_vals_ = S**2
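        # X is factorized once; every iteration below reuses U, S (through
        # eigen_vals_ = S**2) and Vh to update the posterior mean and the
        # regularization parameters without refactorizing X.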

        # Convergence loop of the bayesian ridge regression
        for iter_ in range(self.max_iter):
            # update posterior mean coef_ based on alpha_ and lambda_ and
            # compute corresponding rmse
            coef_, rmse_ = self._update_coef_(
                X, y, n_samples, n_features, XT_y, U, Vh, eigen_vals_, alpha_, lambda_
            )
            if self.compute_score:
                # compute the log marginal likelihood
                s = self._log_marginal_likelihood(
                    n_samples, n_features, eigen_vals_, alpha_, lambda_, coef_, rmse_
                )
                self.scores_.append(s)

            # Update alpha and lambda according to (MacKay, 1992)
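            # gamma_ is the effective number of well-determined parameters:
            #   gamma_ = sum_i alpha_ * s_i**2 / (lambda_ + alpha_ * s_i**2)
            # and the re-estimation rules, including the Gamma hyperpriors, are
            #   lambda_ <- (gamma_ + 2 * lambda_1) / (sum(coef_**2) + 2 * lambda_2)
            #   alpha_  <- (n_samples - gamma_ + 2 * alpha_1) / (rmse_ + 2 * alpha_2)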
            gamma_ = np.sum((alpha_ * eigen_vals_) / (lambda_ + alpha_ * eigen_vals_))
            lambda_ = (gamma_ + 2 * lambda_1) / (np.sum(coef_**2) + 2 * lambda_2)
            alpha_ = (n_samples - gamma_ + 2 * alpha_1) / (rmse_ + 2 * alpha_2)

            # Check for convergence
            if iter_ != 0 and np.sum(np.abs(coef_old_ - coef_)) < self.tol:
                if verbose:
                    print("Convergence after ", str(iter_), " iterations")
                break
            coef_old_ = np.copy(coef_)

        self.n_iter_ = iter_ + 1

        # return regularization parameters and corresponding posterior mean,
        # log marginal likelihood and posterior covariance
        self.alpha_ = alpha_
        self.lambda_ = lambda_
        self.coef_, rmse_ = self._update_coef_(
            X, y, n_samples, n_features, XT_y, U, Vh, eigen_vals_, alpha_, lambda_
        )
        if self.compute_score:
            # compute the log marginal likelihood
            s = self._log_marginal_likelihood(
                n_samples, n_features, eigen_vals_, alpha_, lambda_, coef_, rmse_
            )
            self.scores_.append(s)
            self.scores_ = np.array(self.scores_)

        # posterior covariance is given by 1/alpha_ * scaled_sigma_
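        # In terms of the SVD X = U * diag(S) * Vh, this is
        #   scaled_sigma_ = Vh.T @ diag(1 / (S**2 + lambda_ / alpha_)) @ Vh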
        scaled_sigma_ = np.dot(
            Vh.T, Vh / (eigen_vals_ + lambda_ / alpha_)[:, np.newaxis]
        )
        self.sigma_ = (1.0 / alpha_) * scaled_sigma_

        self._set_intercept(X_offset_, y_offset_, X_scale_)

        return self

    def predict(self, X, return_std=False):
        """Predict using the linear model.

        In addition to the mean of the predictive distribution, its
        standard deviation can also be returned.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Samples.

        return_std : bool, default=False
            Whether to return the standard deviation of posterior prediction.

        Returns
        -------
        y_mean : array-like of shape (n_samples,)
            Mean of predictive distribution of query points.

        y_std : array-like of shape (n_samples,)
            Standard deviation of predictive distribution of query points.
        """
        y_mean = self._decision_function(X)
        if not return_std:
            return y_mean
        else:
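            # Predictive variance: diag(X @ sigma_ @ X.T) accounts for the
            # uncertainty in the weights, 1 / alpha_ adds the noise variance.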
            sigmas_squared_data = (np.dot(X, self.sigma_) * X).sum(axis=1)
            y_std = np.sqrt(sigmas_squared_data + (1.0 / self.alpha_))
            return y_mean, y_std

    def _update_coef_(
        self, X, y, n_samples, n_features, XT_y, U, Vh, eigen_vals_, alpha_, lambda_
    ):
        """Update posterior mean and compute corresponding rmse.

        Posterior mean is given by coef_ = scaled_sigma_ * X.T * y where
        scaled_sigma_ = (lambda_/alpha_ * np.eye(n_features)
                         + np.dot(X.T, X))^-1
        """
        if n_samples > n_features:
            coef_ = np.linalg.multi_dot(
                [Vh.T, Vh / (eigen_vals_ + lambda_ / alpha_)[:, np.newaxis], XT_y]
            )
        else:
            coef_ = np.linalg.multi_dot(
                [X.T, U / (eigen_vals_ + lambda_ / alpha_)[None, :], U.T, y]
            )

        rmse_ = np.sum((y - np.dot(X, coef_)) ** 2)

        return coef_, rmse_

    def _log_marginal_likelihood(
        self, n_samples, n_features, eigen_vals, alpha_, lambda_, coef, rmse
    ):
        """Log marginal likelihood."""
        alpha_1 = self.alpha_1
        alpha_2 = self.alpha_2
        lambda_1 = self.lambda_1
        lambda_2 = self.lambda_2

        # compute the log of the determinant of the posterior covariance.
        # posterior covariance is given by
        # sigma = (lambda_ * np.eye(n_features) + alpha_ * np.dot(X.T, X))^-1
        if n_samples > n_features:
            logdet_sigma = -np.sum(np.log(lambda_ + alpha_ * eigen_vals))
        else:
            logdet_sigma = np.full(n_features, lambda_, dtype=np.array(lambda_).dtype)
            logdet_sigma[:n_samples] += alpha_ * eigen_vals
            logdet_sigma = -np.sum(np.log(logdet_sigma))
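
        # Log marginal likelihood (Tipping 2001, Appendix A), including the
        # contributions of the Gamma hyperpriors on alpha_ and lambda_:
        #   score = lambda_1 * log(lambda_) - lambda_2 * lambda_
        #           + alpha_1 * log(alpha_) - alpha_2 * alpha_
        #           + 0.5 * (n_features * log(lambda_) + n_samples * log(alpha_)
        #                    - alpha_ * rmse - lambda_ * sum(coef**2)
        #                    + logdet_sigma - n_samples * log(2 * pi))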
        score = lambda_1 * log(lambda_) - lambda_2 * lambda_
        score += alpha_1 * log(alpha_) - alpha_2 * alpha_
        score += 0.5 * (
            n_features * log(lambda_)
            + n_samples * log(alpha_)
            - alpha_ * rmse
            - lambda_ * np.sum(coef**2)
            + logdet_sigma
            - n_samples * log(2 * np.pi)
        )

        return score


###############################################################################
# ARD (Automatic Relevance Determination) regression


class ARDRegression(RegressorMixin, LinearModel):
    """Bayesian ARD regression.

    Fit the weights of a regression model, using an ARD prior. The weights of
    the regression model are assumed to follow Gaussian distributions.
    Also estimate the parameters lambda (precisions of the distributions of the
    weights) and alpha (precision of the distribution of the noise).
    The estimation is done by an iterative procedure (evidence maximization).

    Read more in the :ref:`User Guide <bayesian_regression>`.

    Parameters
    ----------
    max_iter : int, default=300
        Maximum number of iterations.

        .. versionchanged:: 1.3

    tol : float, default=1e-3
        Stop the algorithm if w has converged.

    alpha_1 : float, default=1e-6
        Hyper-parameter : shape parameter for the Gamma distribution prior
        over the alpha parameter.

    alpha_2 : float, default=1e-6
        Hyper-parameter : inverse scale parameter (rate parameter) for the
        Gamma distribution prior over the alpha parameter.

    lambda_1 : float, default=1e-6
        Hyper-parameter : shape parameter for the Gamma distribution prior
        over the lambda parameter.

    lambda_2 : float, default=1e-6
        Hyper-parameter : inverse scale parameter (rate parameter) for the
        Gamma distribution prior over the lambda parameter.

    compute_score : bool, default=False
        If True, compute the objective function at each step of the model.

    threshold_lambda : float, default=10000.0
        Threshold for removing (pruning) weights with high precision from
        the computation.

    fit_intercept : bool, default=True
        Whether to calculate the intercept for this model. If set
        to false, no intercept will be used in calculations
        (i.e. data is expected to be centered).

    copy_X : bool, default=True
        If True, X will be copied; else, it may be overwritten.

    verbose : bool, default=False
        Verbose mode when fitting the model.

    Attributes
    ----------
    coef_ : array-like of shape (n_features,)
        Coefficients of the regression model (mean of distribution)

    alpha_ : float
        Estimated precision of the noise.

    lambda_ : array-like of shape (n_features,)
        Estimated precisions of the weights.

    sigma_ : array-like of shape (n_features, n_features)
        Estimated variance-covariance matrix of the weights

    scores_ : float
        If computed, value of the objective function (to be maximized).

    n_iter_ : int
        The actual number of iterations to reach the stopping criterion.

        .. versionadded:: 1.3

    intercept_ : float
        Independent term in decision function. Set to 0.0 if
        ``fit_intercept = False``.

    X_offset_ : ndarray of shape (n_features,)
        If `fit_intercept=True`, offset subtracted for centering data to a
        zero mean. Set to np.zeros(n_features) otherwise.

    X_scale_ : ndarray of shape (n_features,)
        Set to np.ones(n_features).

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    BayesianRidge : Bayesian ridge regression.

    Notes
    -----
    For an example, see :ref:`examples/linear_model/plot_ard.py
    <sphx_glr_auto_examples_linear_model_plot_ard.py>`.

    References
    ----------
    D. J. C. MacKay, Bayesian nonlinear modeling for the prediction
    competition, ASHRAE Transactions, 1994.

    R. Salakhutdinov, Lecture notes on Statistical Machine Learning,
    http://www.utstat.toronto.edu/~rsalakhu/sta4273/notes/Lecture2.pdf#page=15
    Their beta is our ``self.alpha_``
    Their alpha is our ``self.lambda_``
    ARD is a little different than the slide: only dimensions/features for
    which ``self.lambda_ < self.threshold_lambda`` are kept and the rest are
    discarded.

    Examples
    --------
    >>> from sklearn import linear_model
    >>> clf = linear_model.ARDRegression()
    >>> clf.fit([[0,0], [1, 1], [2, 2]], [0, 1, 2])
    ARDRegression()
    >>> clf.predict([[1, 1]])
    array([1.])
    """

    _parameter_constraints: dict = {
        "max_iter": [Interval(Integral, 1, None, closed="left")],
        "tol": [Interval(Real, 0, None, closed="left")],
        "alpha_1": [Interval(Real, 0, None, closed="left")],
        "alpha_2": [Interval(Real, 0, None, closed="left")],
        "lambda_1": [Interval(Real, 0, None, closed="left")],
        "lambda_2": [Interval(Real, 0, None, closed="left")],
        "compute_score": ["boolean"],
        "threshold_lambda": [Interval(Real, 0, None, closed="left")],
        "fit_intercept": ["boolean"],
        "copy_X": ["boolean"],
        "verbose": ["verbose"],
    }

    def __init__(
        self,
        *,
        max_iter=300,
        tol=1.0e-3,
        alpha_1=1.0e-6,
        alpha_2=1.0e-6,
        lambda_1=1.0e-6,
        lambda_2=1.0e-6,
        compute_score=False,
        threshold_lambda=1.0e4,
        fit_intercept=True,
        copy_X=True,
        verbose=False,
    ):
        self.max_iter = max_iter
        self.tol = tol
        self.fit_intercept = fit_intercept
        self.alpha_1 = alpha_1
        self.alpha_2 = alpha_2
        self.lambda_1 = lambda_1
        self.lambda_2 = lambda_2
        self.compute_score = compute_score
        self.threshold_lambda = threshold_lambda
        self.copy_X = copy_X
        self.verbose = verbose

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y):
        """Fit the model according to the given training data and parameters.

        Iterative procedure to maximize the evidence.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training vector, where `n_samples` is the number of samples and
            `n_features` is the number of features.
        y : array-like of shape (n_samples,)
            Target values. Will be cast to X's dtype if necessary.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        X, y = self._validate_data(
            X,
            y,
            dtype=[np.float64, np.float32],
            force_writeable=True,
            y_numeric=True,
            ensure_min_samples=2,
        )
        dtype = X.dtype

        n_samples, n_features = X.shape
        coef_ = np.zeros(n_features, dtype=dtype)

        X, y, X_offset_, y_offset_, X_scale_ = _preprocess_data(
            X, y, fit_intercept=self.fit_intercept, copy=self.copy_X
        )

        self.X_offset_ = X_offset_
        self.X_scale_ = X_scale_

        # Launch the convergence loop
        keep_lambda = np.ones(n_features, dtype=bool)

        lambda_1 = self.lambda_1
        lambda_2 = self.lambda_2
        alpha_1 = self.alpha_1
        alpha_2 = self.alpha_2
        verbose = self.verbose

        # Initialization of the values of the parameters
        eps = np.finfo(np.float64).eps
        # Add `eps` in the denominator to avoid division by zero if `np.var(y)`
        # is zero.
        # Explicitly set dtype to avoid unintended type promotion with numpy 2.
        alpha_ = np.asarray(1.0 / (np.var(y) + eps), dtype=dtype)
        lambda_ = np.ones(n_features, dtype=dtype)

        self.scores_ = list()
        coef_old_ = None

        def update_coeff(X, y, coef_, alpha_, keep_lambda, sigma_):
            coef_[keep_lambda] = alpha_ * np.linalg.multi_dot(
                [sigma_, X[:, keep_lambda].T, y]
            )
            return coef_

        update_sigma = (
            self._update_sigma
            if n_samples >= n_features
            else self._update_sigma_woodbury
        )
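        # The Woodbury variant inverts an (n_samples, n_samples) matrix instead
        # of an (n_features, n_features) one, which is cheaper when
        # n_samples < n_features.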
        # Iterative procedure of ARDRegression
        for iter_ in range(self.max_iter):
            sigma_ = update_sigma(X, alpha_, lambda_, keep_lambda)
            coef_ = update_coeff(X, y, coef_, alpha_, keep_lambda, sigma_)

            # Update alpha and lambda
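            # For each kept feature i,
            #   gamma_i = 1 - lambda_i * sigma_ii
            # measures how well that weight is determined by the data, and
            #   lambda_i <- (gamma_i + 2 * lambda_1) / (coef_i**2 + 2 * lambda_2)
            #   alpha_   <- (n_samples - sum(gamma_) + 2 * alpha_1)
            #               / (rmse_ + 2 * alpha_2)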
            rmse_ = np.sum((y - np.dot(X, coef_)) ** 2)
            gamma_ = 1.0 - lambda_[keep_lambda] * np.diag(sigma_)
            lambda_[keep_lambda] = (gamma_ + 2.0 * lambda_1) / (
                (coef_[keep_lambda]) ** 2 + 2.0 * lambda_2
            )
            alpha_ = (n_samples - gamma_.sum() + 2.0 * alpha_1) / (
                rmse_ + 2.0 * alpha_2
            )

            # Prune the weights with a precision over a threshold
            keep_lambda = lambda_ < self.threshold_lambda
            coef_[~keep_lambda] = 0

            # Compute the objective function
            if self.compute_score:
                s = (lambda_1 * np.log(lambda_) - lambda_2 * lambda_).sum()
                s += alpha_1 * log(alpha_) - alpha_2 * alpha_
                s += 0.5 * (
                    fast_logdet(sigma_)
                    + n_samples * log(alpha_)
                    + np.sum(np.log(lambda_))
                )
                s -= 0.5 * (alpha_ * rmse_ + (lambda_ * coef_**2).sum())
                self.scores_.append(s)

            # Check for convergence
            if iter_ > 0 and np.sum(np.abs(coef_old_ - coef_)) < self.tol:
                if verbose:
                    print("Converged after %s iterations" % iter_)
                break
            coef_old_ = np.copy(coef_)

            if not keep_lambda.any():
                break

        self.n_iter_ = iter_ + 1

        if keep_lambda.any():
            # update sigma and mu using updated params from the last iteration
            sigma_ = update_sigma(X, alpha_, lambda_, keep_lambda)
            coef_ = update_coeff(X, y, coef_, alpha_, keep_lambda, sigma_)
        else:
            sigma_ = np.array([]).reshape(0, 0)

        self.coef_ = coef_
        self.alpha_ = alpha_
        self.sigma_ = sigma_
        self.lambda_ = lambda_
        self._set_intercept(X_offset_, y_offset_, X_scale_)
        return self

    def _update_sigma_woodbury(self, X, alpha_, lambda_, keep_lambda):
        # See slides as referenced in the docstring note
        # this function is used when n_samples < n_features and will invert
        # a matrix of shape (n_samples, n_samples) making use of the
        # woodbury formula:
        # https://en.wikipedia.org/wiki/Woodbury_matrix_identity
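        # With Lambda = diag(lambda_[keep_lambda]) the identity gives
        #   sigma_ = (Lambda + alpha_ * X_keep.T @ X_keep)^-1
        #          = Lambda^-1 - Lambda^-1 @ X_keep.T
        #            @ (I / alpha_ + X_keep @ Lambda^-1 @ X_keep.T)^-1
        #            @ X_keep @ Lambda^-1
        # which is what the lines below compute.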
        n_samples = X.shape[0]
        X_keep = X[:, keep_lambda]
        inv_lambda = 1 / lambda_[keep_lambda].reshape(1, -1)
        sigma_ = pinvh(
            np.eye(n_samples, dtype=X.dtype) / alpha_
            + np.dot(X_keep * inv_lambda, X_keep.T)
        )
        sigma_ = np.dot(sigma_, X_keep * inv_lambda)
        sigma_ = -np.dot(inv_lambda.reshape(-1, 1) * X_keep.T, sigma_)
        sigma_[np.diag_indices(sigma_.shape[1])] += 1.0 / lambda_[keep_lambda]
        return sigma_

    def _update_sigma(self, X, alpha_, lambda_, keep_lambda):
        # See slides as referenced in the docstring note
        # this function is used when n_samples >= n_features and will
        # invert a matrix of shape (n_features, n_features)
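        # Directly form and invert the posterior precision of the kept weights:
        #   sigma_ = (diag(lambda_[keep_lambda]) + alpha_ * X_keep.T @ X_keep)^-1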
        X_keep = X[:, keep_lambda]
        gram = np.dot(X_keep.T, X_keep)
        eye = np.eye(gram.shape[0], dtype=X.dtype)
        sigma_inv = lambda_[keep_lambda] * eye + alpha_ * gram
        sigma_ = pinvh(sigma_inv)
        return sigma_

    def predict(self, X, return_std=False):
        """Predict using the linear model.

        In addition to the mean of the predictive distribution, its
        standard deviation can also be returned.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Samples.

        return_std : bool, default=False
            Whether to return the standard deviation of posterior prediction.

        Returns
        -------
        y_mean : array-like of shape (n_samples,)
            Mean of predictive distribution of query points.

        y_std : array-like of shape (n_samples,)
            Standard deviation of predictive distribution of query points.
        """
        y_mean = self._decision_function(X)
        if not return_std:
            return y_mean
        else:
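            # Restrict X to the features kept by the pruning step so that it
            # matches the shape of sigma_, which was computed on the kept
            # features only.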
            col_index = self.lambda_ < self.threshold_lambda
            X = _safe_indexing(X, indices=col_index, axis=1)
            sigmas_squared_data = (np.dot(X, self.sigma_) * X).sum(axis=1)
            y_std = np.sqrt(sigmas_squared_data + (1.0 / self.alpha_))
            return y_mean, y_std