"""
|
||||
Various bayesian regression
|
||||
"""
|
||||
|
||||
# Authors: V. Michel, F. Pedregosa, A. Gramfort
|
||||
# License: BSD 3 clause
|
||||
|
||||
from math import log
|
||||
from numbers import Integral, Real
|
||||
|
||||
import numpy as np
|
||||
from scipy import linalg
|
||||
from scipy.linalg import pinvh
|
||||
|
||||
from ..base import RegressorMixin, _fit_context
|
||||
from ..utils import _safe_indexing
|
||||
from ..utils._param_validation import Interval
|
||||
from ..utils.extmath import fast_logdet
|
||||
from ..utils.validation import _check_sample_weight
|
||||
from ._base import LinearModel, _preprocess_data, _rescale_data

###############################################################################
# BayesianRidge regression


class BayesianRidge(RegressorMixin, LinearModel):
    """Bayesian ridge regression.

    Fit a Bayesian ridge model. See the Notes section for details on this
    implementation and the optimization of the regularization parameters
    lambda (precision of the weights) and alpha (precision of the noise).

    Read more in the :ref:`User Guide <bayesian_regression>`.
    For an intuitive visualization of how the sinusoid is approximated by
    a polynomial using different pairs of initial values, see
    :ref:`sphx_glr_auto_examples_linear_model_plot_bayesian_ridge_curvefit.py`.

    Parameters
    ----------
    max_iter : int, default=300
        Maximum number of iterations over the complete dataset before
        stopping independently of any early stopping criterion.

        .. versionchanged:: 1.3

    tol : float, default=1e-3
        Stop the algorithm if w has converged.

    alpha_1 : float, default=1e-6
        Hyper-parameter : shape parameter for the Gamma distribution prior
        over the alpha parameter.

    alpha_2 : float, default=1e-6
        Hyper-parameter : inverse scale parameter (rate parameter) for the
        Gamma distribution prior over the alpha parameter.

    lambda_1 : float, default=1e-6
        Hyper-parameter : shape parameter for the Gamma distribution prior
        over the lambda parameter.

    lambda_2 : float, default=1e-6
        Hyper-parameter : inverse scale parameter (rate parameter) for the
        Gamma distribution prior over the lambda parameter.

    alpha_init : float, default=None
        Initial value for alpha (precision of the noise).
        If not set, alpha_init is 1/Var(y).

        .. versionadded:: 0.22

    lambda_init : float, default=None
        Initial value for lambda (precision of the weights).
        If not set, lambda_init is 1.

        .. versionadded:: 0.22

    compute_score : bool, default=False
        If True, compute the log marginal likelihood at each iteration of the
        optimization.

    fit_intercept : bool, default=True
        Whether to calculate the intercept for this model.
        The intercept is not treated as a probabilistic parameter
        and thus has no associated variance. If set
        to False, no intercept will be used in calculations
        (i.e. data is expected to be centered).

    copy_X : bool, default=True
        If True, X will be copied; else, it may be overwritten.

    verbose : bool, default=False
        Verbose mode when fitting the model.

    Attributes
    ----------
    coef_ : array-like of shape (n_features,)
        Coefficients of the regression model (mean of distribution)

    intercept_ : float
        Independent term in decision function. Set to 0.0 if
        `fit_intercept = False`.

    alpha_ : float
        Estimated precision of the noise.

    lambda_ : float
        Estimated precision of the weights.

    sigma_ : array-like of shape (n_features, n_features)
        Estimated variance-covariance matrix of the weights

    scores_ : array-like of shape (n_iter_+1,)
        If compute_score is True, value of the log marginal likelihood (to be
        maximized) at each iteration of the optimization. The array starts
        with the value of the log marginal likelihood obtained for the initial
        values of alpha and lambda and ends with the value obtained for the
        estimated alpha and lambda.

    n_iter_ : int
        The actual number of iterations to reach the stopping criterion.

    X_offset_ : ndarray of shape (n_features,)
        If `fit_intercept=True`, offset subtracted for centering data to a
        zero mean. Set to np.zeros(n_features) otherwise.

    X_scale_ : ndarray of shape (n_features,)
        Set to np.ones(n_features).

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    ARDRegression : Bayesian ARD regression.

    Notes
    -----
    There exist several strategies to perform Bayesian ridge regression. This
    implementation is based on the algorithm described in Appendix A of
    (Tipping, 2001) where updates of the regularization parameters are done as
    suggested in (MacKay, 1992). Note that according to A New
    View of Automatic Relevance Determination (Wipf and Nagarajan, 2008) these
    update rules do not guarantee that the marginal likelihood is increasing
    between two consecutive iterations of the optimization.

    References
    ----------
    D. J. C. MacKay, Bayesian Interpolation, Computation and Neural Systems,
    Vol. 4, No. 3, 1992.

    M. E. Tipping, Sparse Bayesian Learning and the Relevance Vector Machine,
    Journal of Machine Learning Research, Vol. 1, 2001.

    Examples
    --------
    >>> from sklearn import linear_model
    >>> clf = linear_model.BayesianRidge()
    >>> clf.fit([[0,0], [1, 1], [2, 2]], [0, 1, 2])
    BayesianRidge()
    >>> clf.predict([[1, 1]])
    array([1.])
    """

    _parameter_constraints: dict = {
        "max_iter": [Interval(Integral, 1, None, closed="left")],
        "tol": [Interval(Real, 0, None, closed="neither")],
        "alpha_1": [Interval(Real, 0, None, closed="left")],
        "alpha_2": [Interval(Real, 0, None, closed="left")],
        "lambda_1": [Interval(Real, 0, None, closed="left")],
        "lambda_2": [Interval(Real, 0, None, closed="left")],
        "alpha_init": [None, Interval(Real, 0, None, closed="left")],
        "lambda_init": [None, Interval(Real, 0, None, closed="left")],
        "compute_score": ["boolean"],
        "fit_intercept": ["boolean"],
        "copy_X": ["boolean"],
        "verbose": ["verbose"],
    }

    def __init__(
        self,
        *,
        max_iter=300,
        tol=1.0e-3,
        alpha_1=1.0e-6,
        alpha_2=1.0e-6,
        lambda_1=1.0e-6,
        lambda_2=1.0e-6,
        alpha_init=None,
        lambda_init=None,
        compute_score=False,
        fit_intercept=True,
        copy_X=True,
        verbose=False,
    ):
        self.max_iter = max_iter
        self.tol = tol
        self.alpha_1 = alpha_1
        self.alpha_2 = alpha_2
        self.lambda_1 = lambda_1
        self.lambda_2 = lambda_2
        self.alpha_init = alpha_init
        self.lambda_init = lambda_init
        self.compute_score = compute_score
        self.fit_intercept = fit_intercept
        self.copy_X = copy_X
        self.verbose = verbose

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y, sample_weight=None):
        """Fit the model.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            Training data.
        y : ndarray of shape (n_samples,)
            Target values. Will be cast to X's dtype if necessary.

        sample_weight : ndarray of shape (n_samples,), default=None
            Individual weights for each sample.

            .. versionadded:: 0.20
               parameter *sample_weight* support to BayesianRidge.

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        X, y = self._validate_data(
            X, y, dtype=[np.float64, np.float32], force_writeable=True, y_numeric=True
        )
        dtype = X.dtype

        if sample_weight is not None:
            sample_weight = _check_sample_weight(sample_weight, X, dtype=dtype)

        X, y, X_offset_, y_offset_, X_scale_ = _preprocess_data(
            X,
            y,
            fit_intercept=self.fit_intercept,
            copy=self.copy_X,
            sample_weight=sample_weight,
        )

        if sample_weight is not None:
            # Sample weight can be implemented via a simple rescaling.
            X, y, _ = _rescale_data(X, y, sample_weight)

        self.X_offset_ = X_offset_
        self.X_scale_ = X_scale_
        n_samples, n_features = X.shape

        # Initialization of the values of the parameters
        eps = np.finfo(np.float64).eps
        # Add `eps` in the denominator to avoid division by zero if `np.var(y)`
        # is zero
        alpha_ = self.alpha_init
        lambda_ = self.lambda_init
        if alpha_ is None:
            alpha_ = 1.0 / (np.var(y) + eps)
        if lambda_ is None:
            lambda_ = 1.0

        # Avoid unintended type promotion to float64 with numpy 2
        alpha_ = np.asarray(alpha_, dtype=dtype)
        lambda_ = np.asarray(lambda_, dtype=dtype)

        verbose = self.verbose
        lambda_1 = self.lambda_1
        lambda_2 = self.lambda_2
        alpha_1 = self.alpha_1
        alpha_2 = self.alpha_2

        self.scores_ = list()
        coef_old_ = None

        XT_y = np.dot(X.T, y)
        U, S, Vh = linalg.svd(X, full_matrices=False)
        eigen_vals_ = S**2
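        # X is factorized once; every iteration below reuses U, S (through
        # eigen_vals_ = S**2) and Vh to update the posterior mean and the
        # regularization parameters without refactorizing X.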

        # Convergence loop of the bayesian ridge regression
        for iter_ in range(self.max_iter):
            # update posterior mean coef_ based on alpha_ and lambda_ and
            # compute corresponding rmse
            coef_, rmse_ = self._update_coef_(
                X, y, n_samples, n_features, XT_y, U, Vh, eigen_vals_, alpha_, lambda_
            )
            if self.compute_score:
                # compute the log marginal likelihood
                s = self._log_marginal_likelihood(
                    n_samples, n_features, eigen_vals_, alpha_, lambda_, coef_, rmse_
                )
                self.scores_.append(s)

            # Update alpha and lambda according to (MacKay, 1992)
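            # gamma_ is the effective number of well-determined parameters:
            #   gamma_ = sum_i alpha_ * s_i**2 / (lambda_ + alpha_ * s_i**2)
            # and the re-estimation rules, including the Gamma hyperpriors, are
            #   lambda_ <- (gamma_ + 2 * lambda_1) / (sum(coef_**2) + 2 * lambda_2)
            #   alpha_  <- (n_samples - gamma_ + 2 * alpha_1) / (rmse_ + 2 * alpha_2)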
            gamma_ = np.sum((alpha_ * eigen_vals_) / (lambda_ + alpha_ * eigen_vals_))
            lambda_ = (gamma_ + 2 * lambda_1) / (np.sum(coef_**2) + 2 * lambda_2)
            alpha_ = (n_samples - gamma_ + 2 * alpha_1) / (rmse_ + 2 * alpha_2)

            # Check for convergence
            if iter_ != 0 and np.sum(np.abs(coef_old_ - coef_)) < self.tol:
                if verbose:
                    print("Convergence after ", str(iter_), " iterations")
                break
            coef_old_ = np.copy(coef_)

        self.n_iter_ = iter_ + 1

        # return regularization parameters and corresponding posterior mean,
        # log marginal likelihood and posterior covariance
        self.alpha_ = alpha_
        self.lambda_ = lambda_
        self.coef_, rmse_ = self._update_coef_(
            X, y, n_samples, n_features, XT_y, U, Vh, eigen_vals_, alpha_, lambda_
        )
        if self.compute_score:
            # compute the log marginal likelihood
            s = self._log_marginal_likelihood(
                n_samples, n_features, eigen_vals_, alpha_, lambda_, coef_, rmse_
            )
            self.scores_.append(s)
            self.scores_ = np.array(self.scores_)

        # posterior covariance is given by 1/alpha_ * scaled_sigma_
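        # In terms of the SVD X = U * diag(S) * Vh, this is
        #   scaled_sigma_ = Vh.T @ diag(1 / (S**2 + lambda_ / alpha_)) @ Vh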
        scaled_sigma_ = np.dot(
            Vh.T, Vh / (eigen_vals_ + lambda_ / alpha_)[:, np.newaxis]
        )
        self.sigma_ = (1.0 / alpha_) * scaled_sigma_

        self._set_intercept(X_offset_, y_offset_, X_scale_)

        return self

    def predict(self, X, return_std=False):
        """Predict using the linear model.

        In addition to the mean of the predictive distribution, its
        standard deviation can also be returned.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Samples.

        return_std : bool, default=False
            Whether to return the standard deviation of posterior prediction.

        Returns
        -------
        y_mean : array-like of shape (n_samples,)
            Mean of predictive distribution of query points.

        y_std : array-like of shape (n_samples,)
            Standard deviation of predictive distribution of query points.
        """
        y_mean = self._decision_function(X)
        if not return_std:
            return y_mean
        else:
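            # Predictive variance: diag(X @ sigma_ @ X.T) accounts for the
            # uncertainty in the weights, 1 / alpha_ adds the noise variance.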
            sigmas_squared_data = (np.dot(X, self.sigma_) * X).sum(axis=1)
            y_std = np.sqrt(sigmas_squared_data + (1.0 / self.alpha_))
            return y_mean, y_std

    def _update_coef_(
        self, X, y, n_samples, n_features, XT_y, U, Vh, eigen_vals_, alpha_, lambda_
    ):
        """Update posterior mean and compute corresponding rmse.

        Posterior mean is given by coef_ = scaled_sigma_ * X.T * y where
        scaled_sigma_ = (lambda_/alpha_ * np.eye(n_features)
                         + np.dot(X.T, X))^-1
        """
        if n_samples > n_features:
            coef_ = np.linalg.multi_dot(
                [Vh.T, Vh / (eigen_vals_ + lambda_ / alpha_)[:, np.newaxis], XT_y]
            )
        else:
            coef_ = np.linalg.multi_dot(
                [X.T, U / (eigen_vals_ + lambda_ / alpha_)[None, :], U.T, y]
            )

        rmse_ = np.sum((y - np.dot(X, coef_)) ** 2)

        return coef_, rmse_

    def _log_marginal_likelihood(
        self, n_samples, n_features, eigen_vals, alpha_, lambda_, coef, rmse
    ):
        """Log marginal likelihood."""
        alpha_1 = self.alpha_1
        alpha_2 = self.alpha_2
        lambda_1 = self.lambda_1
        lambda_2 = self.lambda_2

        # compute the log of the determinant of the posterior covariance.
        # posterior covariance is given by
        # sigma = (lambda_ * np.eye(n_features) + alpha_ * np.dot(X.T, X))^-1
        if n_samples > n_features:
            logdet_sigma = -np.sum(np.log(lambda_ + alpha_ * eigen_vals))
        else:
            logdet_sigma = np.full(n_features, lambda_, dtype=np.array(lambda_).dtype)
            logdet_sigma[:n_samples] += alpha_ * eigen_vals
            logdet_sigma = -np.sum(np.log(logdet_sigma))
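
        # Log marginal likelihood (Tipping 2001, Appendix A), including the
        # contributions of the Gamma hyperpriors on alpha_ and lambda_:
        #   score = lambda_1 * log(lambda_) - lambda_2 * lambda_
        #           + alpha_1 * log(alpha_) - alpha_2 * alpha_
        #           + 0.5 * (n_features * log(lambda_) + n_samples * log(alpha_)
        #                    - alpha_ * rmse - lambda_ * sum(coef**2)
        #                    + logdet_sigma - n_samples * log(2 * pi))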
        score = lambda_1 * log(lambda_) - lambda_2 * lambda_
        score += alpha_1 * log(alpha_) - alpha_2 * alpha_
        score += 0.5 * (
            n_features * log(lambda_)
            + n_samples * log(alpha_)
            - alpha_ * rmse
            - lambda_ * np.sum(coef**2)
            + logdet_sigma
            - n_samples * log(2 * np.pi)
        )

        return score


###############################################################################
# ARD (Automatic Relevance Determination) regression


class ARDRegression(RegressorMixin, LinearModel):
    """Bayesian ARD regression.

    Fit the weights of a regression model, using an ARD prior. The weights of
    the regression model are assumed to follow Gaussian distributions.
    Also estimate the parameters lambda (precisions of the distributions of the
    weights) and alpha (precision of the distribution of the noise).
    The estimation is done by an iterative procedure (evidence maximization).

    Read more in the :ref:`User Guide <bayesian_regression>`.

    Parameters
    ----------
    max_iter : int, default=300
        Maximum number of iterations.

        .. versionchanged:: 1.3

    tol : float, default=1e-3
        Stop the algorithm if w has converged.

    alpha_1 : float, default=1e-6
        Hyper-parameter : shape parameter for the Gamma distribution prior
        over the alpha parameter.

    alpha_2 : float, default=1e-6
        Hyper-parameter : inverse scale parameter (rate parameter) for the
        Gamma distribution prior over the alpha parameter.

    lambda_1 : float, default=1e-6
        Hyper-parameter : shape parameter for the Gamma distribution prior
        over the lambda parameter.

    lambda_2 : float, default=1e-6
        Hyper-parameter : inverse scale parameter (rate parameter) for the
        Gamma distribution prior over the lambda parameter.

    compute_score : bool, default=False
        If True, compute the objective function at each step of the model.

    threshold_lambda : float, default=10000.0
        Threshold for removing (pruning) weights with high precision from
        the computation.

    fit_intercept : bool, default=True
        Whether to calculate the intercept for this model. If set
        to false, no intercept will be used in calculations
        (i.e. data is expected to be centered).

    copy_X : bool, default=True
        If True, X will be copied; else, it may be overwritten.

    verbose : bool, default=False
        Verbose mode when fitting the model.

    Attributes
    ----------
    coef_ : array-like of shape (n_features,)
        Coefficients of the regression model (mean of distribution)

    alpha_ : float
        Estimated precision of the noise.

    lambda_ : array-like of shape (n_features,)
        Estimated precisions of the weights.

    sigma_ : array-like of shape (n_features, n_features)
        Estimated variance-covariance matrix of the weights

    scores_ : float
        If computed, value of the objective function (to be maximized).

    n_iter_ : int
        The actual number of iterations to reach the stopping criterion.

        .. versionadded:: 1.3

    intercept_ : float
        Independent term in decision function. Set to 0.0 if
        ``fit_intercept = False``.

    X_offset_ : ndarray of shape (n_features,)
        If `fit_intercept=True`, offset subtracted for centering data to a
        zero mean. Set to np.zeros(n_features) otherwise.

    X_scale_ : ndarray of shape (n_features,)
        Set to np.ones(n_features).

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    BayesianRidge : Bayesian ridge regression.

    Notes
    -----
    For an example, see :ref:`examples/linear_model/plot_ard.py
    <sphx_glr_auto_examples_linear_model_plot_ard.py>`.

    References
    ----------
    D. J. C. MacKay, Bayesian nonlinear modeling for the prediction
    competition, ASHRAE Transactions, 1994.

    R. Salakhutdinov, Lecture notes on Statistical Machine Learning,
    http://www.utstat.toronto.edu/~rsalakhu/sta4273/notes/Lecture2.pdf#page=15
    Their beta is our ``self.alpha_``
    Their alpha is our ``self.lambda_``
    ARD is a little different than the slide: only dimensions/features for
    which ``self.lambda_ < self.threshold_lambda`` are kept and the rest are
    discarded.

    Examples
    --------
    >>> from sklearn import linear_model
    >>> clf = linear_model.ARDRegression()
    >>> clf.fit([[0,0], [1, 1], [2, 2]], [0, 1, 2])
    ARDRegression()
    >>> clf.predict([[1, 1]])
    array([1.])
    """

    _parameter_constraints: dict = {
        "max_iter": [Interval(Integral, 1, None, closed="left")],
        "tol": [Interval(Real, 0, None, closed="left")],
        "alpha_1": [Interval(Real, 0, None, closed="left")],
        "alpha_2": [Interval(Real, 0, None, closed="left")],
        "lambda_1": [Interval(Real, 0, None, closed="left")],
        "lambda_2": [Interval(Real, 0, None, closed="left")],
        "compute_score": ["boolean"],
        "threshold_lambda": [Interval(Real, 0, None, closed="left")],
        "fit_intercept": ["boolean"],
        "copy_X": ["boolean"],
        "verbose": ["verbose"],
    }

    def __init__(
        self,
        *,
        max_iter=300,
        tol=1.0e-3,
        alpha_1=1.0e-6,
        alpha_2=1.0e-6,
        lambda_1=1.0e-6,
        lambda_2=1.0e-6,
        compute_score=False,
        threshold_lambda=1.0e4,
        fit_intercept=True,
        copy_X=True,
        verbose=False,
    ):
        self.max_iter = max_iter
        self.tol = tol
        self.fit_intercept = fit_intercept
        self.alpha_1 = alpha_1
        self.alpha_2 = alpha_2
        self.lambda_1 = lambda_1
        self.lambda_2 = lambda_2
        self.compute_score = compute_score
        self.threshold_lambda = threshold_lambda
        self.copy_X = copy_X
        self.verbose = verbose

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y):
        """Fit the model according to the given training data and parameters.

        Iterative procedure to maximize the evidence.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training vector, where `n_samples` is the number of samples and
            `n_features` is the number of features.
        y : array-like of shape (n_samples,)
            Target values. Will be cast to X's dtype if necessary.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        X, y = self._validate_data(
            X,
            y,
            dtype=[np.float64, np.float32],
            force_writeable=True,
            y_numeric=True,
            ensure_min_samples=2,
        )
        dtype = X.dtype

        n_samples, n_features = X.shape
        coef_ = np.zeros(n_features, dtype=dtype)

        X, y, X_offset_, y_offset_, X_scale_ = _preprocess_data(
            X, y, fit_intercept=self.fit_intercept, copy=self.copy_X
        )

        self.X_offset_ = X_offset_
        self.X_scale_ = X_scale_

        # Launch the convergence loop
        keep_lambda = np.ones(n_features, dtype=bool)

        lambda_1 = self.lambda_1
        lambda_2 = self.lambda_2
        alpha_1 = self.alpha_1
        alpha_2 = self.alpha_2
        verbose = self.verbose

        # Initialization of the values of the parameters
        eps = np.finfo(np.float64).eps
        # Add `eps` in the denominator to avoid division by zero if `np.var(y)`
        # is zero.
        # Explicitly set dtype to avoid unintended type promotion with numpy 2.
        alpha_ = np.asarray(1.0 / (np.var(y) + eps), dtype=dtype)
        lambda_ = np.ones(n_features, dtype=dtype)

        self.scores_ = list()
        coef_old_ = None

        def update_coeff(X, y, coef_, alpha_, keep_lambda, sigma_):
            coef_[keep_lambda] = alpha_ * np.linalg.multi_dot(
                [sigma_, X[:, keep_lambda].T, y]
            )
            return coef_

        update_sigma = (
            self._update_sigma
            if n_samples >= n_features
            else self._update_sigma_woodbury
        )
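        # The Woodbury variant inverts an (n_samples, n_samples) matrix instead
        # of an (n_features, n_features) one, which is cheaper when
        # n_samples < n_features.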
        # Iterative procedure of ARDRegression
        for iter_ in range(self.max_iter):
            sigma_ = update_sigma(X, alpha_, lambda_, keep_lambda)
            coef_ = update_coeff(X, y, coef_, alpha_, keep_lambda, sigma_)

            # Update alpha and lambda
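            # For each kept feature i,
            #   gamma_i = 1 - lambda_i * sigma_ii
            # measures how well that weight is determined by the data, and
            #   lambda_i <- (gamma_i + 2 * lambda_1) / (coef_i**2 + 2 * lambda_2)
            #   alpha_   <- (n_samples - sum(gamma_) + 2 * alpha_1)
            #               / (rmse_ + 2 * alpha_2)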
            rmse_ = np.sum((y - np.dot(X, coef_)) ** 2)
            gamma_ = 1.0 - lambda_[keep_lambda] * np.diag(sigma_)
            lambda_[keep_lambda] = (gamma_ + 2.0 * lambda_1) / (
                (coef_[keep_lambda]) ** 2 + 2.0 * lambda_2
            )
            alpha_ = (n_samples - gamma_.sum() + 2.0 * alpha_1) / (
                rmse_ + 2.0 * alpha_2
            )

            # Prune the weights with a precision over a threshold
            keep_lambda = lambda_ < self.threshold_lambda
            coef_[~keep_lambda] = 0

            # Compute the objective function
            if self.compute_score:
                s = (lambda_1 * np.log(lambda_) - lambda_2 * lambda_).sum()
                s += alpha_1 * log(alpha_) - alpha_2 * alpha_
                s += 0.5 * (
                    fast_logdet(sigma_)
                    + n_samples * log(alpha_)
                    + np.sum(np.log(lambda_))
                )
                s -= 0.5 * (alpha_ * rmse_ + (lambda_ * coef_**2).sum())
                self.scores_.append(s)

            # Check for convergence
            if iter_ > 0 and np.sum(np.abs(coef_old_ - coef_)) < self.tol:
                if verbose:
                    print("Converged after %s iterations" % iter_)
                break
            coef_old_ = np.copy(coef_)

            if not keep_lambda.any():
                break

        self.n_iter_ = iter_ + 1

        if keep_lambda.any():
            # update sigma and mu using updated params from the last iteration
            sigma_ = update_sigma(X, alpha_, lambda_, keep_lambda)
            coef_ = update_coeff(X, y, coef_, alpha_, keep_lambda, sigma_)
        else:
            sigma_ = np.array([]).reshape(0, 0)

        self.coef_ = coef_
        self.alpha_ = alpha_
        self.sigma_ = sigma_
        self.lambda_ = lambda_
        self._set_intercept(X_offset_, y_offset_, X_scale_)
        return self

    def _update_sigma_woodbury(self, X, alpha_, lambda_, keep_lambda):
        # See slides as referenced in the docstring note
        # this function is used when n_samples < n_features and will invert
        # a matrix of shape (n_samples, n_samples) making use of the
        # woodbury formula:
        # https://en.wikipedia.org/wiki/Woodbury_matrix_identity
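        # With Lambda = diag(lambda_[keep_lambda]) the identity gives
        #   sigma_ = (Lambda + alpha_ * X_keep.T @ X_keep)^-1
        #          = Lambda^-1 - Lambda^-1 @ X_keep.T
        #            @ (I / alpha_ + X_keep @ Lambda^-1 @ X_keep.T)^-1
        #            @ X_keep @ Lambda^-1
        # which is what the lines below compute.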
        n_samples = X.shape[0]
        X_keep = X[:, keep_lambda]
        inv_lambda = 1 / lambda_[keep_lambda].reshape(1, -1)
        sigma_ = pinvh(
            np.eye(n_samples, dtype=X.dtype) / alpha_
            + np.dot(X_keep * inv_lambda, X_keep.T)
        )
        sigma_ = np.dot(sigma_, X_keep * inv_lambda)
        sigma_ = -np.dot(inv_lambda.reshape(-1, 1) * X_keep.T, sigma_)
        sigma_[np.diag_indices(sigma_.shape[1])] += 1.0 / lambda_[keep_lambda]
        return sigma_

    def _update_sigma(self, X, alpha_, lambda_, keep_lambda):
        # See slides as referenced in the docstring note
        # this function is used when n_samples >= n_features and will
        # invert a matrix of shape (n_features, n_features)
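        # Directly form and invert the posterior precision of the kept weights:
        #   sigma_ = (diag(lambda_[keep_lambda]) + alpha_ * X_keep.T @ X_keep)^-1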
        X_keep = X[:, keep_lambda]
        gram = np.dot(X_keep.T, X_keep)
        eye = np.eye(gram.shape[0], dtype=X.dtype)
        sigma_inv = lambda_[keep_lambda] * eye + alpha_ * gram
        sigma_ = pinvh(sigma_inv)
        return sigma_

    def predict(self, X, return_std=False):
        """Predict using the linear model.

        In addition to the mean of the predictive distribution, its
        standard deviation can also be returned.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Samples.

        return_std : bool, default=False
            Whether to return the standard deviation of posterior prediction.

        Returns
        -------
        y_mean : array-like of shape (n_samples,)
            Mean of predictive distribution of query points.

        y_std : array-like of shape (n_samples,)
            Standard deviation of predictive distribution of query points.
        """
        y_mean = self._decision_function(X)
        if not return_std:
            return y_mean
        else:
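            # Restrict X to the features kept by the pruning step so that it
            # matches the shape of sigma_, which was computed on the kept
            # features only.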
            col_index = self.lambda_ < self.threshold_lambda
            X = _safe_indexing(X, indices=col_index, axis=1)
            sigmas_squared_data = (np.dot(X, self.sigma_) * X).sum(axis=1)
            y_std = np.sqrt(sigmas_squared_data + (1.0 / self.alpha_))
            return y_mean, y_std