reconnect moved files to git repo

root
2025-08-01 04:33:03 -04:00
commit 5d3c35492d
23190 changed files with 4750716 additions and 0 deletions


@@ -0,0 +1,12 @@
"""Gaussian process based regression and classification."""
# Author: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
# Vincent Dubourg <vincent.dubourg@gmail.com>
# (mostly translation, see implementation details)
# License: BSD 3 clause
from . import kernels
from ._gpc import GaussianProcessClassifier
from ._gpr import GaussianProcessRegressor
__all__ = ["GaussianProcessRegressor", "GaussianProcessClassifier", "kernels"]
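The package surface re-exported above can be exercised with a minimal sketch; the toy data and the ConstantKernel * RBF choice below are illustrative assumptions, not part of this commit:

import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor, kernels

# Toy 1-D regression problem (illustrative only).
X = np.linspace(0, 10, 30)[:, np.newaxis]
y = np.sin(X).ravel()

# Mirrors the "1.0 * RBF(1.0)" default kernel described in the files below.
kernel = kernels.ConstantKernel(1.0) * kernels.RBF(length_scale=1.0)
gpr = GaussianProcessRegressor(kernel=kernel, alpha=1e-6).fit(X, y)
y_mean, y_std = gpr.predict(X, return_std=True)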


@@ -0,0 +1,902 @@
"""Gaussian processes classification."""
# Authors: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
#
# License: BSD 3 clause
from numbers import Integral
from operator import itemgetter
import numpy as np
import scipy.optimize
from scipy.linalg import cho_solve, cholesky, solve
from scipy.special import erf, expit
from ..base import BaseEstimator, ClassifierMixin, _fit_context, clone
from ..multiclass import OneVsOneClassifier, OneVsRestClassifier
from ..preprocessing import LabelEncoder
from ..utils import check_random_state
from ..utils._param_validation import Interval, StrOptions
from ..utils.optimize import _check_optimize_result
from ..utils.validation import check_is_fitted
from .kernels import RBF, CompoundKernel, Kernel
from .kernels import ConstantKernel as C
# Values required for approximating the logistic sigmoid by
# error functions. coefs are obtained via:
# x = np.array([0, 0.6, 2, 3.5, 4.5, np.inf])
# b = logistic(x)
# A = (erf(np.dot(x, self.lambdas)) + 1) / 2
# coefs = lstsq(A, b)[0]
LAMBDAS = np.array([0.41, 0.4, 0.37, 0.44, 0.39])[:, np.newaxis]
COEFS = np.array(
[-1854.8214151, 3516.89893646, 221.29346712, 128.12323805, -2010.49422654]
)[:, np.newaxis]
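# Illustrative sketch (an assumption for exposition, not executed by this
# module) of the least-squares fit described in the comment above, using the
# `erf` and `expit` functions already imported:
#
#     x = np.array([0, 0.6, 2, 3.5, 4.5, np.inf])
#     b = expit(x)                                  # logistic(x)
#     A = (erf(np.outer(x, LAMBDAS.ravel())) + 1) / 2
#     coefs = np.linalg.lstsq(A, b, rcond=None)[0]  # ~= COEFS.ravel()
#
# The fitted combination yields the pointwise approximation
#     expit(f) ~= (COEFS * (erf(LAMBDAS * f) + 1) / 2).sum()
# which predict_proba below integrates in closed form against a Gaussian.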
class _BinaryGaussianProcessClassifierLaplace(BaseEstimator):
"""Binary Gaussian process classification based on Laplace approximation.
The implementation is based on Algorithm 3.1, 3.2, and 5.1 from [RW2006]_.
Internally, the Laplace approximation is used for approximating the
non-Gaussian posterior by a Gaussian.
Currently, the implementation is restricted to using the logistic link
function.
.. versionadded:: 0.18
Parameters
----------
kernel : kernel instance, default=None
The kernel specifying the covariance function of the GP. If None is
passed, the kernel "1.0 * RBF(1.0)" is used as default. Note that
the kernel's hyperparameters are optimized during fitting.
optimizer : 'fmin_l_bfgs_b' or callable, default='fmin_l_bfgs_b'
Can either be one of the internally supported optimizers for optimizing
the kernel's parameters, specified by a string, or an externally
defined optimizer passed as a callable. If a callable is passed, it
must have the signature::
def optimizer(obj_func, initial_theta, bounds):
# * 'obj_func' is the objective function to be maximized, which
# takes the hyperparameters theta as parameter and an
# optional flag eval_gradient, which determines if the
# gradient is returned additionally to the function value
# * 'initial_theta': the initial value for theta, which can be
# used by local optimizers
# * 'bounds': the bounds on the values of theta
....
# Returned are the best found hyperparameters theta and
# the corresponding value of the target function.
return theta_opt, func_min
By default, the 'L-BFGS-B' algorithm from scipy.optimize.minimize
is used. If None is passed, the kernel's parameters are kept fixed.
Available internal optimizers are::
'fmin_l_bfgs_b'
n_restarts_optimizer : int, default=0
The number of restarts of the optimizer for finding the kernel's
parameters which maximize the log-marginal likelihood. The first run
of the optimizer is performed from the kernel's initial parameters,
the remaining ones (if any) from thetas sampled log-uniform randomly
from the space of allowed theta-values. If greater than 0, all bounds
must be finite. Note that n_restarts_optimizer=0 implies that one
run is performed.
max_iter_predict : int, default=100
The maximum number of iterations in Newton's method for approximating
the posterior during predict. Smaller values will reduce computation
time at the cost of worse results.
warm_start : bool, default=False
If warm-starts are enabled, the solution of the last Newton iteration
on the Laplace approximation of the posterior mode is used as
initialization for the next call of _posterior_mode(). This can speed
up convergence when _posterior_mode is called several times on similar
problems as in hyperparameter optimization. See :term:`the Glossary
<warm_start>`.
copy_X_train : bool, default=True
If True, a persistent copy of the training data is stored in the
object. Otherwise, just a reference to the training data is stored,
which might cause predictions to change if the data is modified
externally.
random_state : int, RandomState instance or None, default=None
Determines random number generation used to draw the initial
hyperparameter values for the optimizer restarts.
Pass an int for reproducible results across multiple function calls.
See :term:`Glossary <random_state>`.
Attributes
----------
X_train_ : array-like of shape (n_samples, n_features) or list of object
Feature vectors or other representations of training data (also
required for prediction).
y_train_ : array-like of shape (n_samples,)
Target values in training data (also required for prediction)
classes_ : array-like of shape (n_classes,)
Unique class labels.
kernel_ : kernel instance
The kernel used for prediction. The structure of the kernel is the
same as the one passed as parameter but with optimized hyperparameters.
L_ : array-like of shape (n_samples, n_samples)
Lower-triangular Cholesky decomposition of the kernel in X_train_
pi_ : array-like of shape (n_samples,)
The probabilities of the positive class for the training points
X_train_
W_sr_ : array-like of shape (n_samples,)
Square root of W, the Hessian of log-likelihood of the latent function
values for the observed labels. Since W is diagonal, only the diagonal
of sqrt(W) is stored.
log_marginal_likelihood_value_ : float
The log-marginal-likelihood of ``self.kernel_.theta``
References
----------
.. [RW2006] `Carl E. Rasmussen and Christopher K.I. Williams,
"Gaussian Processes for Machine Learning",
MIT Press 2006 <https://www.gaussianprocess.org/gpml/chapters/RW.pdf>`_
"""
def __init__(
self,
kernel=None,
*,
optimizer="fmin_l_bfgs_b",
n_restarts_optimizer=0,
max_iter_predict=100,
warm_start=False,
copy_X_train=True,
random_state=None,
):
self.kernel = kernel
self.optimizer = optimizer
self.n_restarts_optimizer = n_restarts_optimizer
self.max_iter_predict = max_iter_predict
self.warm_start = warm_start
self.copy_X_train = copy_X_train
self.random_state = random_state
def fit(self, X, y):
"""Fit Gaussian process classification model.
Parameters
----------
X : array-like of shape (n_samples, n_features) or list of object
Feature vectors or other representations of training data.
y : array-like of shape (n_samples,)
Target values, must be binary.
Returns
-------
self : returns an instance of self.
"""
if self.kernel is None: # Use an RBF kernel as default
self.kernel_ = C(1.0, constant_value_bounds="fixed") * RBF(
1.0, length_scale_bounds="fixed"
)
else:
self.kernel_ = clone(self.kernel)
self.rng = check_random_state(self.random_state)
self.X_train_ = np.copy(X) if self.copy_X_train else X
# Encode class labels and check that it is a binary classification
# problem
label_encoder = LabelEncoder()
self.y_train_ = label_encoder.fit_transform(y)
self.classes_ = label_encoder.classes_
if self.classes_.size > 2:
raise ValueError(
"%s supports only binary classification. y contains classes %s"
% (self.__class__.__name__, self.classes_)
)
elif self.classes_.size == 1:
raise ValueError(
"{0:s} requires 2 classes; got {1:d} class".format(
self.__class__.__name__, self.classes_.size
)
)
if self.optimizer is not None and self.kernel_.n_dims > 0:
# Choose hyperparameters based on maximizing the log-marginal
# likelihood (potentially starting from several initial values)
def obj_func(theta, eval_gradient=True):
if eval_gradient:
lml, grad = self.log_marginal_likelihood(
theta, eval_gradient=True, clone_kernel=False
)
return -lml, -grad
else:
return -self.log_marginal_likelihood(theta, clone_kernel=False)
# First optimize starting from theta specified in kernel
optima = [
self._constrained_optimization(
obj_func, self.kernel_.theta, self.kernel_.bounds
)
]
# Additional runs are performed from log-uniform chosen initial
# theta
if self.n_restarts_optimizer > 0:
if not np.isfinite(self.kernel_.bounds).all():
raise ValueError(
"Multiple optimizer restarts (n_restarts_optimizer>0) "
"requires that all bounds are finite."
)
bounds = self.kernel_.bounds
for iteration in range(self.n_restarts_optimizer):
theta_initial = np.exp(self.rng.uniform(bounds[:, 0], bounds[:, 1]))
optima.append(
self._constrained_optimization(obj_func, theta_initial, bounds)
)
# Select result from run with minimal (negative) log-marginal
# likelihood
lml_values = list(map(itemgetter(1), optima))
self.kernel_.theta = optima[np.argmin(lml_values)][0]
self.kernel_._check_bounds_params()
self.log_marginal_likelihood_value_ = -np.min(lml_values)
else:
self.log_marginal_likelihood_value_ = self.log_marginal_likelihood(
self.kernel_.theta
)
# Precompute quantities required for predictions which are independent
# of actual query points
K = self.kernel_(self.X_train_)
_, (self.pi_, self.W_sr_, self.L_, _, _) = self._posterior_mode(
K, return_temporaries=True
)
return self
def predict(self, X):
"""Perform classification on an array of test vectors X.
Parameters
----------
X : array-like of shape (n_samples, n_features) or list of object
Query points where the GP is evaluated for classification.
Returns
-------
C : ndarray of shape (n_samples,)
Predicted target values for X, values are from ``classes_``
"""
check_is_fitted(self)
# As discussed on Section 3.4.2 of GPML, for making hard binary
# decisions, it is enough to compute the MAP of the posterior and
# pass it through the link function
K_star = self.kernel_(self.X_train_, X) # K_star = k(x_star)
f_star = K_star.T.dot(self.y_train_ - self.pi_) # Algorithm 3.2, Line 4
return np.where(f_star > 0, self.classes_[1], self.classes_[0])
def predict_proba(self, X):
"""Return probability estimates for the test vector X.
Parameters
----------
X : array-like of shape (n_samples, n_features) or list of object
Query points where the GP is evaluated for classification.
Returns
-------
C : array-like of shape (n_samples, n_classes)
Returns the probability of the samples for each class in
the model. The columns correspond to the classes in sorted
order, as they appear in the attribute ``classes_``.
"""
check_is_fitted(self)
# Based on Algorithm 3.2 of GPML
K_star = self.kernel_(self.X_train_, X) # K_star = k(x_star)
f_star = K_star.T.dot(self.y_train_ - self.pi_) # Line 4
v = solve(self.L_, self.W_sr_[:, np.newaxis] * K_star) # Line 5
# Line 6 (compute np.diag(v.T.dot(v)) via einsum)
var_f_star = self.kernel_.diag(X) - np.einsum("ij,ij->j", v, v)
# Line 7:
# Approximate \int log(z) * N(z | f_star, var_f_star)
# Approximation is due to Williams & Barber, "Bayesian Classification
# with Gaussian Processes", Appendix A: Approximate the logistic
# sigmoid by a linear combination of 5 error functions.
# For information on how this integral can be computed see
# blitiri.blogspot.de/2012/11/gaussian-integral-of-error-function.html
alpha = 1 / (2 * var_f_star)
gamma = LAMBDAS * f_star
integrals = (
np.sqrt(np.pi / alpha)
* erf(gamma * np.sqrt(alpha / (alpha + LAMBDAS**2)))
/ (2 * np.sqrt(var_f_star * 2 * np.pi))
)
pi_star = (COEFS * integrals).sum(axis=0) + 0.5 * COEFS.sum()
return np.vstack((1 - pi_star, pi_star)).T
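# Sanity-check sketch (illustrative only, not part of the estimator): the
# closed-form value above approximates the Gaussian average of the logistic
# sigmoid and can be compared against a Monte-Carlo estimate, e.g. for the
# first query point:
#
#     z = np.random.default_rng(0).normal(
#         f_star[0], np.sqrt(var_f_star[0]), size=200_000
#     )
#     expit(z).mean()   # close to pi_star[0]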
def log_marginal_likelihood(
self, theta=None, eval_gradient=False, clone_kernel=True
):
"""Returns log-marginal likelihood of theta for training data.
Parameters
----------
theta : array-like of shape (n_kernel_params,), default=None
Kernel hyperparameters for which the log-marginal likelihood is
evaluated. If None, the precomputed log_marginal_likelihood
of ``self.kernel_.theta`` is returned.
eval_gradient : bool, default=False
If True, the gradient of the log-marginal likelihood with respect
to the kernel hyperparameters at position theta is returned
additionally. If True, theta must not be None.
clone_kernel : bool, default=True
If True, the kernel attribute is copied. If False, the kernel
attribute is modified, but may result in a performance improvement.
Returns
-------
log_likelihood : float
Log-marginal likelihood of theta for training data.
log_likelihood_gradient : ndarray of shape (n_kernel_params,), \
optional
Gradient of the log-marginal likelihood with respect to the kernel
hyperparameters at position theta.
Only returned when `eval_gradient` is True.
"""
if theta is None:
if eval_gradient:
raise ValueError("Gradient can only be evaluated for theta!=None")
return self.log_marginal_likelihood_value_
if clone_kernel:
kernel = self.kernel_.clone_with_theta(theta)
else:
kernel = self.kernel_
kernel.theta = theta
if eval_gradient:
K, K_gradient = kernel(self.X_train_, eval_gradient=True)
else:
K = kernel(self.X_train_)
# Compute log-marginal-likelihood Z and also store some temporaries
# which can be reused for computing Z's gradient
Z, (pi, W_sr, L, b, a) = self._posterior_mode(K, return_temporaries=True)
if not eval_gradient:
return Z
# Compute gradient based on Algorithm 5.1 of GPML
d_Z = np.empty(theta.shape[0])
# XXX: Get rid of the np.diag() in the next line
R = W_sr[:, np.newaxis] * cho_solve((L, True), np.diag(W_sr)) # Line 7
C = solve(L, W_sr[:, np.newaxis] * K) # Line 8
# Line 9: (use einsum to compute np.diag(C.T.dot(C)))
s_2 = (
-0.5
* (np.diag(K) - np.einsum("ij, ij -> j", C, C))
* (pi * (1 - pi) * (1 - 2 * pi))
) # third derivative
for j in range(d_Z.shape[0]):
C = K_gradient[:, :, j] # Line 11
# Line 12: (R.T.ravel().dot(C.ravel()) = np.trace(R.dot(C)))
s_1 = 0.5 * a.T.dot(C).dot(a) - 0.5 * R.T.ravel().dot(C.ravel())
b = C.dot(self.y_train_ - pi) # Line 13
s_3 = b - K.dot(R.dot(b)) # Line 14
d_Z[j] = s_1 + s_2.T.dot(s_3) # Line 15
return Z, d_Z
def _posterior_mode(self, K, return_temporaries=False):
"""Mode-finding for binary Laplace GPC and fixed kernel.
This approximates the posterior of the latent function values for given
inputs and target observations with a Gaussian approximation and uses
Newton's iteration to find the mode of this approximation.
"""
# Based on Algorithm 3.1 of GPML
# If warm_start is enabled, we reuse the last solution for the
# posterior mode as initialization; otherwise, we initialize with 0
if (
self.warm_start
and hasattr(self, "f_cached")
and self.f_cached.shape == self.y_train_.shape
):
f = self.f_cached
else:
f = np.zeros_like(self.y_train_, dtype=np.float64)
# Use Newton's iteration method to find mode of Laplace approximation
log_marginal_likelihood = -np.inf
for _ in range(self.max_iter_predict):
# Line 4
pi = expit(f)
W = pi * (1 - pi)
# Line 5
W_sr = np.sqrt(W)
W_sr_K = W_sr[:, np.newaxis] * K
B = np.eye(W.shape[0]) + W_sr_K * W_sr
L = cholesky(B, lower=True)
# Line 6
b = W * f + (self.y_train_ - pi)
# Line 7
a = b - W_sr * cho_solve((L, True), W_sr_K.dot(b))
# Line 8
f = K.dot(a)
# Line 10: Compute log marginal likelihood in loop and use as
# convergence criterion
lml = (
-0.5 * a.T.dot(f)
- np.log1p(np.exp(-(self.y_train_ * 2 - 1) * f)).sum()
- np.log(np.diag(L)).sum()
)
# Check if we have converged (log marginal likelihood no longer
# increases significantly)
# XXX: more complex convergence criterion
if lml - log_marginal_likelihood < 1e-10:
break
log_marginal_likelihood = lml
self.f_cached = f # Remember solution for later warm-starts
if return_temporaries:
return log_marginal_likelihood, (pi, W_sr, L, b, a)
else:
return log_marginal_likelihood
def _constrained_optimization(self, obj_func, initial_theta, bounds):
if self.optimizer == "fmin_l_bfgs_b":
opt_res = scipy.optimize.minimize(
obj_func, initial_theta, method="L-BFGS-B", jac=True, bounds=bounds
)
_check_optimize_result("lbfgs", opt_res)
theta_opt, func_min = opt_res.x, opt_res.fun
elif callable(self.optimizer):
theta_opt, func_min = self.optimizer(obj_func, initial_theta, bounds=bounds)
else:
raise ValueError("Unknown optimizer %s." % self.optimizer)
return theta_opt, func_min
class GaussianProcessClassifier(ClassifierMixin, BaseEstimator):
"""Gaussian process classification (GPC) based on Laplace approximation.
The implementation is based on Algorithm 3.1, 3.2, and 5.1 from [RW2006]_.
Internally, the Laplace approximation is used for approximating the
non-Gaussian posterior by a Gaussian.
Currently, the implementation is restricted to using the logistic link
function. For multi-class classification, several binary one-versus-rest
classifiers are fitted. Note that this class thus does not implement
a true multi-class Laplace approximation.
Read more in the :ref:`User Guide <gaussian_process>`.
.. versionadded:: 0.18
Parameters
----------
kernel : kernel instance, default=None
The kernel specifying the covariance function of the GP. If None is
passed, the kernel "1.0 * RBF(1.0)" is used as default. Note that
the kernel's hyperparameters are optimized during fitting. Also kernel
cannot be a `CompoundKernel`.
optimizer : 'fmin_l_bfgs_b', callable or None, default='fmin_l_bfgs_b'
Can either be one of the internally supported optimizers for optimizing
the kernel's parameters, specified by a string, or an externally
defined optimizer passed as a callable. If a callable is passed, it
must have the signature::
def optimizer(obj_func, initial_theta, bounds):
# * 'obj_func' is the objective function to be maximized, which
# takes the hyperparameters theta as parameter and an
# optional flag eval_gradient, which determines if the
# gradient is returned additionally to the function value
# * 'initial_theta': the initial value for theta, which can be
# used by local optimizers
# * 'bounds': the bounds on the values of theta
....
# Returned are the best found hyperparameters theta and
# the corresponding value of the target function.
return theta_opt, func_min
By default, the 'L-BFGS-B' algorithm from scipy.optimize.minimize
is used. If None is passed, the kernel's parameters are kept fixed.
Available internal optimizers are::
'fmin_l_bfgs_b'
n_restarts_optimizer : int, default=0
The number of restarts of the optimizer for finding the kernel's
parameters which maximize the log-marginal likelihood. The first run
of the optimizer is performed from the kernel's initial parameters,
the remaining ones (if any) from thetas sampled log-uniform randomly
from the space of allowed theta-values. If greater than 0, all bounds
must be finite. Note that n_restarts_optimizer=0 implies that one
run is performed.
max_iter_predict : int, default=100
The maximum number of iterations in Newton's method for approximating
the posterior during predict. Smaller values will reduce computation
time at the cost of worse results.
warm_start : bool, default=False
If warm-starts are enabled, the solution of the last Newton iteration
on the Laplace approximation of the posterior mode is used as
initialization for the next call of _posterior_mode(). This can speed
up convergence when _posterior_mode is called several times on similar
problems as in hyperparameter optimization. See :term:`the Glossary
<warm_start>`.
copy_X_train : bool, default=True
If True, a persistent copy of the training data is stored in the
object. Otherwise, just a reference to the training data is stored,
which might cause predictions to change if the data is modified
externally.
random_state : int, RandomState instance or None, default=None
Determines random number generation used to draw the initial
hyperparameter values for the optimizer restarts.
Pass an int for reproducible results across multiple function calls.
See :term:`Glossary <random_state>`.
multi_class : {'one_vs_rest', 'one_vs_one'}, default='one_vs_rest'
Specifies how multi-class classification problems are handled.
Supported are 'one_vs_rest' and 'one_vs_one'. In 'one_vs_rest',
one binary Gaussian process classifier is fitted for each class, which
is trained to separate this class from the rest. In 'one_vs_one', one
binary Gaussian process classifier is fitted for each pair of classes,
which is trained to separate these two classes. The predictions of
these binary predictors are combined into multi-class predictions.
Note that 'one_vs_one' does not support predicting probability
estimates.
n_jobs : int, default=None
The number of jobs to use for the computation: the specified
multiclass problems are computed in parallel.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
Attributes
----------
base_estimator_ : ``Estimator`` instance
The estimator instance that defines the likelihood function
using the observed data.
kernel_ : kernel instance
The kernel used for prediction. In case of binary classification,
the structure of the kernel is the same as the one passed as parameter
but with optimized hyperparameters. In case of multi-class
classification, a CompoundKernel is returned which consists of the
different kernels used in the one-versus-rest classifiers.
log_marginal_likelihood_value_ : float
The log-marginal-likelihood of ``self.kernel_.theta``
classes_ : array-like of shape (n_classes,)
Unique class labels.
n_classes_ : int
The number of classes in the training data
n_features_in_ : int
Number of features seen during :term:`fit`.
.. versionadded:: 0.24
feature_names_in_ : ndarray of shape (`n_features_in_`,)
Names of features seen during :term:`fit`. Defined only when `X`
has feature names that are all strings.
.. versionadded:: 1.0
See Also
--------
GaussianProcessRegressor : Gaussian process regression (GPR).
References
----------
.. [RW2006] `Carl E. Rasmussen and Christopher K.I. Williams,
"Gaussian Processes for Machine Learning",
MIT Press 2006 <https://www.gaussianprocess.org/gpml/chapters/RW.pdf>`_
Examples
--------
>>> from sklearn.datasets import load_iris
>>> from sklearn.gaussian_process import GaussianProcessClassifier
>>> from sklearn.gaussian_process.kernels import RBF
>>> X, y = load_iris(return_X_y=True)
>>> kernel = 1.0 * RBF(1.0)
>>> gpc = GaussianProcessClassifier(kernel=kernel,
... random_state=0).fit(X, y)
>>> gpc.score(X, y)
0.9866...
>>> gpc.predict_proba(X[:2,:])
array([[0.83548752, 0.03228706, 0.13222543],
[0.79064206, 0.06525643, 0.14410151]])
"""
_parameter_constraints: dict = {
"kernel": [Kernel, None],
"optimizer": [StrOptions({"fmin_l_bfgs_b"}), callable, None],
"n_restarts_optimizer": [Interval(Integral, 0, None, closed="left")],
"max_iter_predict": [Interval(Integral, 1, None, closed="left")],
"warm_start": ["boolean"],
"copy_X_train": ["boolean"],
"random_state": ["random_state"],
"multi_class": [StrOptions({"one_vs_rest", "one_vs_one"})],
"n_jobs": [Integral, None],
}
def __init__(
self,
kernel=None,
*,
optimizer="fmin_l_bfgs_b",
n_restarts_optimizer=0,
max_iter_predict=100,
warm_start=False,
copy_X_train=True,
random_state=None,
multi_class="one_vs_rest",
n_jobs=None,
):
self.kernel = kernel
self.optimizer = optimizer
self.n_restarts_optimizer = n_restarts_optimizer
self.max_iter_predict = max_iter_predict
self.warm_start = warm_start
self.copy_X_train = copy_X_train
self.random_state = random_state
self.multi_class = multi_class
self.n_jobs = n_jobs
@_fit_context(prefer_skip_nested_validation=True)
def fit(self, X, y):
"""Fit Gaussian process classification model.
Parameters
----------
X : array-like of shape (n_samples, n_features) or list of object
Feature vectors or other representations of training data.
y : array-like of shape (n_samples,)
Target values.
Returns
-------
self : object
Returns an instance of self.
"""
if isinstance(self.kernel, CompoundKernel):
raise ValueError("kernel cannot be a CompoundKernel")
if self.kernel is None or self.kernel.requires_vector_input:
X, y = self._validate_data(
X, y, multi_output=False, ensure_2d=True, dtype="numeric"
)
else:
X, y = self._validate_data(
X, y, multi_output=False, ensure_2d=False, dtype=None
)
self.base_estimator_ = _BinaryGaussianProcessClassifierLaplace(
kernel=self.kernel,
optimizer=self.optimizer,
n_restarts_optimizer=self.n_restarts_optimizer,
max_iter_predict=self.max_iter_predict,
warm_start=self.warm_start,
copy_X_train=self.copy_X_train,
random_state=self.random_state,
)
self.classes_ = np.unique(y)
self.n_classes_ = self.classes_.size
if self.n_classes_ == 1:
raise ValueError(
"GaussianProcessClassifier requires 2 or more "
"distinct classes; got %d class (only class %s "
"is present)" % (self.n_classes_, self.classes_[0])
)
if self.n_classes_ > 2:
if self.multi_class == "one_vs_rest":
self.base_estimator_ = OneVsRestClassifier(
self.base_estimator_, n_jobs=self.n_jobs
)
elif self.multi_class == "one_vs_one":
self.base_estimator_ = OneVsOneClassifier(
self.base_estimator_, n_jobs=self.n_jobs
)
else:
raise ValueError("Unknown multi-class mode %s" % self.multi_class)
self.base_estimator_.fit(X, y)
if self.n_classes_ > 2:
self.log_marginal_likelihood_value_ = np.mean(
[
estimator.log_marginal_likelihood()
for estimator in self.base_estimator_.estimators_
]
)
else:
self.log_marginal_likelihood_value_ = (
self.base_estimator_.log_marginal_likelihood()
)
return self
def predict(self, X):
"""Perform classification on an array of test vectors X.
Parameters
----------
X : array-like of shape (n_samples, n_features) or list of object
Query points where the GP is evaluated for classification.
Returns
-------
C : ndarray of shape (n_samples,)
Predicted target values for X, values are from ``classes_``.
"""
check_is_fitted(self)
if self.kernel is None or self.kernel.requires_vector_input:
X = self._validate_data(X, ensure_2d=True, dtype="numeric", reset=False)
else:
X = self._validate_data(X, ensure_2d=False, dtype=None, reset=False)
return self.base_estimator_.predict(X)
def predict_proba(self, X):
"""Return probability estimates for the test vector X.
Parameters
----------
X : array-like of shape (n_samples, n_features) or list of object
Query points where the GP is evaluated for classification.
Returns
-------
C : array-like of shape (n_samples, n_classes)
Returns the probability of the samples for each class in
the model. The columns correspond to the classes in sorted
order, as they appear in the attribute :term:`classes_`.
"""
check_is_fitted(self)
if self.n_classes_ > 2 and self.multi_class == "one_vs_one":
raise ValueError(
"one_vs_one multi-class mode does not support "
"predicting probability estimates. Use "
"one_vs_rest mode instead."
)
if self.kernel is None or self.kernel.requires_vector_input:
X = self._validate_data(X, ensure_2d=True, dtype="numeric", reset=False)
else:
X = self._validate_data(X, ensure_2d=False, dtype=None, reset=False)
return self.base_estimator_.predict_proba(X)
@property
def kernel_(self):
"""Return the kernel of the base estimator."""
if self.n_classes_ == 2:
return self.base_estimator_.kernel_
else:
return CompoundKernel(
[estimator.kernel_ for estimator in self.base_estimator_.estimators_]
)
def log_marginal_likelihood(
self, theta=None, eval_gradient=False, clone_kernel=True
):
"""Return log-marginal likelihood of theta for training data.
In the case of multi-class classification, the mean log-marginal
likelihood of the one-versus-rest classifiers is returned.
Parameters
----------
theta : array-like of shape (n_kernel_params,), default=None
Kernel hyperparameters for which the log-marginal likelihood is
evaluated. In the case of multi-class classification, theta may
be the hyperparameters of the compound kernel or of an individual
kernel. In the latter case, all individual kernels get assigned the
same theta values. If None, the precomputed log_marginal_likelihood
of ``self.kernel_.theta`` is returned.
eval_gradient : bool, default=False
If True, the gradient of the log-marginal likelihood with respect
to the kernel hyperparameters at position theta is returned
additionally. Note that gradient computation is not supported
for non-binary classification. If True, theta must not be None.
clone_kernel : bool, default=True
If True, the kernel attribute is copied. If False, the kernel
attribute is modified, but may result in a performance improvement.
Returns
-------
log_likelihood : float
Log-marginal likelihood of theta for training data.
log_likelihood_gradient : ndarray of shape (n_kernel_params,), optional
Gradient of the log-marginal likelihood with respect to the kernel
hyperparameters at position theta.
Only returned when `eval_gradient` is True.
"""
check_is_fitted(self)
if theta is None:
if eval_gradient:
raise ValueError("Gradient can only be evaluated for theta!=None")
return self.log_marginal_likelihood_value_
theta = np.asarray(theta)
if self.n_classes_ == 2:
return self.base_estimator_.log_marginal_likelihood(
theta, eval_gradient, clone_kernel=clone_kernel
)
else:
if eval_gradient:
raise NotImplementedError(
"Gradient of log-marginal-likelihood not implemented for "
"multi-class GPC."
)
estimators = self.base_estimator_.estimators_
n_dims = estimators[0].kernel_.n_dims
if theta.shape[0] == n_dims: # use same theta for all sub-kernels
return np.mean(
[
estimator.log_marginal_likelihood(
theta, clone_kernel=clone_kernel
)
for i, estimator in enumerate(estimators)
]
)
elif theta.shape[0] == n_dims * self.classes_.shape[0]:
# theta for compound kernel
return np.mean(
[
estimator.log_marginal_likelihood(
theta[n_dims * i : n_dims * (i + 1)],
clone_kernel=clone_kernel,
)
for i, estimator in enumerate(estimators)
]
)
else:
raise ValueError(
"Shape of theta must be either %d or %d. "
"Obtained theta with shape %d."
% (n_dims, n_dims * self.classes_.shape[0], theta.shape[0])
)


@@ -0,0 +1,669 @@
"""Gaussian processes regression."""
# Authors: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
# Modified by: Pete Green <p.l.green@liverpool.ac.uk>
# License: BSD 3 clause
import warnings
from numbers import Integral, Real
from operator import itemgetter
import numpy as np
import scipy.optimize
from scipy.linalg import cho_solve, cholesky, solve_triangular
from ..base import BaseEstimator, MultiOutputMixin, RegressorMixin, _fit_context, clone
from ..preprocessing._data import _handle_zeros_in_scale
from ..utils import check_random_state
from ..utils._param_validation import Interval, StrOptions
from ..utils.optimize import _check_optimize_result
from .kernels import RBF, Kernel
from .kernels import ConstantKernel as C
GPR_CHOLESKY_LOWER = True
class GaussianProcessRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
"""Gaussian process regression (GPR).
The implementation is based on Algorithm 2.1 of [RW2006]_.
In addition to standard scikit-learn estimator API,
:class:`GaussianProcessRegressor`:
* allows prediction without prior fitting (based on the GP prior)
* provides an additional method `sample_y(X)`, which evaluates samples
drawn from the GPR (prior or posterior) at given inputs
* exposes a method `log_marginal_likelihood(theta)`, which can be used
externally for other ways of selecting hyperparameters, e.g., via
Markov chain Monte Carlo.
To learn the difference between a point-estimate approach vs. a more
Bayesian modelling approach, refer to the example entitled
:ref:`sphx_glr_auto_examples_gaussian_process_plot_compare_gpr_krr.py`.
Read more in the :ref:`User Guide <gaussian_process>`.
.. versionadded:: 0.18
Parameters
----------
kernel : kernel instance, default=None
The kernel specifying the covariance function of the GP. If None is
passed, the kernel ``ConstantKernel(1.0, constant_value_bounds="fixed")
* RBF(1.0, length_scale_bounds="fixed")`` is used as default. Note that
the kernel hyperparameters are optimized during fitting unless the
bounds are marked as "fixed".
alpha : float or ndarray of shape (n_samples,), default=1e-10
Value added to the diagonal of the kernel matrix during fitting.
This can prevent a potential numerical issue during fitting, by
ensuring that the calculated values form a positive definite matrix.
It can also be interpreted as the variance of additional Gaussian
measurement noise on the training observations. Note that this is
different from using a `WhiteKernel`. If an array is passed, it must
have the same number of entries as the data used for fitting and is
used as a datapoint-dependent noise level. Specifying the noise
level directly as a parameter is mainly for convenience and
for consistency with :class:`~sklearn.linear_model.Ridge`.
optimizer : "fmin_l_bfgs_b", callable or None, default="fmin_l_bfgs_b"
Can either be one of the internally supported optimizers for optimizing
the kernel's parameters, specified by a string, or an externally
defined optimizer passed as a callable. If a callable is passed, it
must have the signature::
def optimizer(obj_func, initial_theta, bounds):
# * 'obj_func': the objective function to be minimized, which
# takes the hyperparameters theta as a parameter and an
# optional flag eval_gradient, which determines if the
# gradient is returned additionally to the function value
# * 'initial_theta': the initial value for theta, which can be
# used by local optimizers
# * 'bounds': the bounds on the values of theta
....
# Returned are the best found hyperparameters theta and
# the corresponding value of the target function.
return theta_opt, func_min
By default, the L-BFGS-B algorithm from `scipy.optimize.minimize`
is used. If None is passed, the kernel's parameters are kept fixed.
Available internal optimizers are: `{'fmin_l_bfgs_b'}`.
n_restarts_optimizer : int, default=0
The number of restarts of the optimizer for finding the kernel's
parameters which maximize the log-marginal likelihood. The first run
of the optimizer is performed from the kernel's initial parameters,
the remaining ones (if any) from thetas sampled log-uniform randomly
from the space of allowed theta-values. If greater than 0, all bounds
must be finite. Note that `n_restarts_optimizer == 0` implies that one
run is performed.
normalize_y : bool, default=False
Whether or not to normalize the target values `y` by removing the mean
and scaling to unit-variance. This is recommended for cases where
zero-mean, unit-variance priors are used. Note that, in this
implementation, the normalisation is reversed before the GP predictions
are reported.
.. versionchanged:: 0.23
copy_X_train : bool, default=True
If True, a persistent copy of the training data is stored in the
object. Otherwise, just a reference to the training data is stored,
which might cause predictions to change if the data is modified
externally.
n_targets : int, default=None
The number of dimensions of the target values. Used to decide the number
of outputs when sampling from the prior distributions (i.e. calling
:meth:`sample_y` before :meth:`fit`). This parameter is ignored once
:meth:`fit` has been called.
.. versionadded:: 1.3
random_state : int, RandomState instance or None, default=None
Determines random number generation used to draw the initial
hyperparameter values for the optimizer restarts.
Pass an int for reproducible results across multiple function calls.
See :term:`Glossary <random_state>`.
Attributes
----------
X_train_ : array-like of shape (n_samples, n_features) or list of object
Feature vectors or other representations of training data (also
required for prediction).
y_train_ : array-like of shape (n_samples,) or (n_samples, n_targets)
Target values in training data (also required for prediction).
kernel_ : kernel instance
The kernel used for prediction. The structure of the kernel is the
same as the one passed as parameter but with optimized hyperparameters.
L_ : array-like of shape (n_samples, n_samples)
Lower-triangular Cholesky decomposition of the kernel in ``X_train_``.
alpha_ : array-like of shape (n_samples,)
Dual coefficients of training data points in kernel space.
log_marginal_likelihood_value_ : float
The log-marginal-likelihood of ``self.kernel_.theta``.
n_features_in_ : int
Number of features seen during :term:`fit`.
.. versionadded:: 0.24
feature_names_in_ : ndarray of shape (`n_features_in_`,)
Names of features seen during :term:`fit`. Defined only when `X`
has feature names that are all strings.
.. versionadded:: 1.0
See Also
--------
GaussianProcessClassifier : Gaussian process classification (GPC)
based on Laplace approximation.
References
----------
.. [RW2006] `Carl E. Rasmussen and Christopher K.I. Williams,
"Gaussian Processes for Machine Learning",
MIT Press 2006 <https://www.gaussianprocess.org/gpml/chapters/RW.pdf>`_
Examples
--------
>>> from sklearn.datasets import make_friedman2
>>> from sklearn.gaussian_process import GaussianProcessRegressor
>>> from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel
>>> X, y = make_friedman2(n_samples=500, noise=0, random_state=0)
>>> kernel = DotProduct() + WhiteKernel()
>>> gpr = GaussianProcessRegressor(kernel=kernel,
... random_state=0).fit(X, y)
>>> gpr.score(X, y)
0.3680...
>>> gpr.predict(X[:2,:], return_std=True)
(array([653.0..., 592.1...]), array([316.6..., 316.6...]))
"""
_parameter_constraints: dict = {
"kernel": [None, Kernel],
"alpha": [Interval(Real, 0, None, closed="left"), np.ndarray],
"optimizer": [StrOptions({"fmin_l_bfgs_b"}), callable, None],
"n_restarts_optimizer": [Interval(Integral, 0, None, closed="left")],
"normalize_y": ["boolean"],
"copy_X_train": ["boolean"],
"n_targets": [Interval(Integral, 1, None, closed="left"), None],
"random_state": ["random_state"],
}
def __init__(
self,
kernel=None,
*,
alpha=1e-10,
optimizer="fmin_l_bfgs_b",
n_restarts_optimizer=0,
normalize_y=False,
copy_X_train=True,
n_targets=None,
random_state=None,
):
self.kernel = kernel
self.alpha = alpha
self.optimizer = optimizer
self.n_restarts_optimizer = n_restarts_optimizer
self.normalize_y = normalize_y
self.copy_X_train = copy_X_train
self.n_targets = n_targets
self.random_state = random_state
@_fit_context(prefer_skip_nested_validation=True)
def fit(self, X, y):
"""Fit Gaussian process regression model.
Parameters
----------
X : array-like of shape (n_samples, n_features) or list of object
Feature vectors or other representations of training data.
y : array-like of shape (n_samples,) or (n_samples, n_targets)
Target values.
Returns
-------
self : object
GaussianProcessRegressor class instance.
"""
if self.kernel is None: # Use an RBF kernel as default
self.kernel_ = C(1.0, constant_value_bounds="fixed") * RBF(
1.0, length_scale_bounds="fixed"
)
else:
self.kernel_ = clone(self.kernel)
self._rng = check_random_state(self.random_state)
if self.kernel_.requires_vector_input:
dtype, ensure_2d = "numeric", True
else:
dtype, ensure_2d = None, False
X, y = self._validate_data(
X,
y,
multi_output=True,
y_numeric=True,
ensure_2d=ensure_2d,
dtype=dtype,
)
n_targets_seen = y.shape[1] if y.ndim > 1 else 1
if self.n_targets is not None and n_targets_seen != self.n_targets:
raise ValueError(
"The number of targets seen in `y` is different from the parameter "
f"`n_targets`. Got {n_targets_seen} != {self.n_targets}."
)
# Normalize target value
if self.normalize_y:
self._y_train_mean = np.mean(y, axis=0)
self._y_train_std = _handle_zeros_in_scale(np.std(y, axis=0), copy=False)
# Remove mean and make unit variance
y = (y - self._y_train_mean) / self._y_train_std
else:
shape_y_stats = (y.shape[1],) if y.ndim == 2 else 1
self._y_train_mean = np.zeros(shape=shape_y_stats)
self._y_train_std = np.ones(shape=shape_y_stats)
if np.iterable(self.alpha) and self.alpha.shape[0] != y.shape[0]:
if self.alpha.shape[0] == 1:
self.alpha = self.alpha[0]
else:
raise ValueError(
"alpha must be a scalar or an array with same number of "
f"entries as y. ({self.alpha.shape[0]} != {y.shape[0]})"
)
self.X_train_ = np.copy(X) if self.copy_X_train else X
self.y_train_ = np.copy(y) if self.copy_X_train else y
if self.optimizer is not None and self.kernel_.n_dims > 0:
# Choose hyperparameters based on maximizing the log-marginal
# likelihood (potentially starting from several initial values)
def obj_func(theta, eval_gradient=True):
if eval_gradient:
lml, grad = self.log_marginal_likelihood(
theta, eval_gradient=True, clone_kernel=False
)
return -lml, -grad
else:
return -self.log_marginal_likelihood(theta, clone_kernel=False)
# First optimize starting from theta specified in kernel
optima = [
(
self._constrained_optimization(
obj_func, self.kernel_.theta, self.kernel_.bounds
)
)
]
# Additional runs are performed from log-uniform chosen initial
# theta
if self.n_restarts_optimizer > 0:
if not np.isfinite(self.kernel_.bounds).all():
raise ValueError(
"Multiple optimizer restarts (n_restarts_optimizer>0) "
"requires that all bounds are finite."
)
bounds = self.kernel_.bounds
for iteration in range(self.n_restarts_optimizer):
theta_initial = self._rng.uniform(bounds[:, 0], bounds[:, 1])
optima.append(
self._constrained_optimization(obj_func, theta_initial, bounds)
)
# Select result from run with minimal (negative) log-marginal
# likelihood
lml_values = list(map(itemgetter(1), optima))
self.kernel_.theta = optima[np.argmin(lml_values)][0]
self.kernel_._check_bounds_params()
self.log_marginal_likelihood_value_ = -np.min(lml_values)
else:
self.log_marginal_likelihood_value_ = self.log_marginal_likelihood(
self.kernel_.theta, clone_kernel=False
)
# Precompute quantities required for predictions which are independent
# of actual query points
# Alg. 2.1, page 19, line 2 -> L = cholesky(K + sigma^2 I)
K = self.kernel_(self.X_train_)
K[np.diag_indices_from(K)] += self.alpha
try:
self.L_ = cholesky(K, lower=GPR_CHOLESKY_LOWER, check_finite=False)
except np.linalg.LinAlgError as exc:
exc.args = (
(
f"The kernel, {self.kernel_}, is not returning a positive "
"definite matrix. Try gradually increasing the 'alpha' "
"parameter of your GaussianProcessRegressor estimator."
),
) + exc.args
raise
# Alg 2.1, page 19, line 3 -> alpha = L^T \ (L \ y)
self.alpha_ = cho_solve(
(self.L_, GPR_CHOLESKY_LOWER),
self.y_train_,
check_finite=False,
)
return self
def predict(self, X, return_std=False, return_cov=False):
"""Predict using the Gaussian process regression model.
We can also predict based on an unfitted model by using the GP prior.
In addition to the mean of the predictive distribution, optionally also
returns its standard deviation (`return_std=True`) or covariance
(`return_cov=True`). Note that at most one of the two can be requested.
Parameters
----------
X : array-like of shape (n_samples, n_features) or list of object
Query points where the GP is evaluated.
return_std : bool, default=False
If True, the standard-deviation of the predictive distribution at
the query points is returned along with the mean.
return_cov : bool, default=False
If True, the covariance of the joint predictive distribution at
the query points is returned along with the mean.
Returns
-------
y_mean : ndarray of shape (n_samples,) or (n_samples, n_targets)
Mean of predictive distribution at query points.
y_std : ndarray of shape (n_samples,) or (n_samples, n_targets), optional
Standard deviation of predictive distribution at query points.
Only returned when `return_std` is True.
y_cov : ndarray of shape (n_samples, n_samples) or \
(n_samples, n_samples, n_targets), optional
Covariance of joint predictive distribution at query points.
Only returned when `return_cov` is True.
"""
if return_std and return_cov:
raise RuntimeError(
"At most one of return_std or return_cov can be requested."
)
if self.kernel is None or self.kernel.requires_vector_input:
dtype, ensure_2d = "numeric", True
else:
dtype, ensure_2d = None, False
X = self._validate_data(X, ensure_2d=ensure_2d, dtype=dtype, reset=False)
if not hasattr(self, "X_train_"): # Unfitted; predict based on GP prior
if self.kernel is None:
kernel = C(1.0, constant_value_bounds="fixed") * RBF(
1.0, length_scale_bounds="fixed"
)
else:
kernel = self.kernel
n_targets = self.n_targets if self.n_targets is not None else 1
y_mean = np.zeros(shape=(X.shape[0], n_targets)).squeeze()
if return_cov:
y_cov = kernel(X)
if n_targets > 1:
y_cov = np.repeat(
np.expand_dims(y_cov, -1), repeats=n_targets, axis=-1
)
return y_mean, y_cov
elif return_std:
y_var = kernel.diag(X)
if n_targets > 1:
y_var = np.repeat(
np.expand_dims(y_var, -1), repeats=n_targets, axis=-1
)
return y_mean, np.sqrt(y_var)
else:
return y_mean
else: # Predict based on GP posterior
# Alg 2.1, page 19, line 4 -> f*_bar = K(X_test, X_train) . alpha
K_trans = self.kernel_(X, self.X_train_)
y_mean = K_trans @ self.alpha_
# undo normalisation
y_mean = self._y_train_std * y_mean + self._y_train_mean
# if y_mean has shape (n_samples, 1), reshape to (n_samples,)
if y_mean.ndim > 1 and y_mean.shape[1] == 1:
y_mean = np.squeeze(y_mean, axis=1)
# Alg 2.1, page 19, line 5 -> v = L \ K(X_test, X_train)^T
V = solve_triangular(
self.L_, K_trans.T, lower=GPR_CHOLESKY_LOWER, check_finite=False
)
if return_cov:
# Alg 2.1, page 19, line 6 -> K(X_test, X_test) - v^T. v
y_cov = self.kernel_(X) - V.T @ V
# undo normalisation
y_cov = np.outer(y_cov, self._y_train_std**2).reshape(*y_cov.shape, -1)
# if y_cov has shape (n_samples, n_samples, 1), reshape to
# (n_samples, n_samples)
if y_cov.shape[2] == 1:
y_cov = np.squeeze(y_cov, axis=2)
return y_mean, y_cov
elif return_std:
# Compute variance of predictive distribution
# Use einsum to avoid explicitly forming the large matrix
# V^T @ V just to extract its diagonal afterward.
y_var = self.kernel_.diag(X).copy()
y_var -= np.einsum("ij,ji->i", V.T, V)
# Check if any of the variances is negative because of
# numerical issues. If yes: set the variance to 0.
y_var_negative = y_var < 0
if np.any(y_var_negative):
warnings.warn(
"Predicted variances smaller than 0. "
"Setting those variances to 0."
)
y_var[y_var_negative] = 0.0
# undo normalisation
y_var = np.outer(y_var, self._y_train_std**2).reshape(*y_var.shape, -1)
# if y_var has shape (n_samples, 1), reshape to (n_samples,)
if y_var.shape[1] == 1:
y_var = np.squeeze(y_var, axis=1)
return y_mean, np.sqrt(y_var)
else:
return y_mean
def sample_y(self, X, n_samples=1, random_state=0):
"""Draw samples from Gaussian process and evaluate at X.
Parameters
----------
X : array-like of shape (n_samples_X, n_features) or list of object
Query points where the GP is evaluated.
n_samples : int, default=1
Number of samples drawn from the Gaussian process per query point.
random_state : int, RandomState instance or None, default=0
Determines random number generation to randomly draw samples.
Pass an int for reproducible results across multiple function
calls.
See :term:`Glossary <random_state>`.
Returns
-------
y_samples : ndarray of shape (n_samples_X, n_samples), or \
(n_samples_X, n_targets, n_samples)
Values of n_samples samples drawn from Gaussian process and
evaluated at query points.
"""
rng = check_random_state(random_state)
y_mean, y_cov = self.predict(X, return_cov=True)
if y_mean.ndim == 1:
y_samples = rng.multivariate_normal(y_mean, y_cov, n_samples).T
else:
y_samples = [
rng.multivariate_normal(
y_mean[:, target], y_cov[..., target], n_samples
).T[:, np.newaxis]
for target in range(y_mean.shape[1])
]
y_samples = np.hstack(y_samples)
return y_samples
def log_marginal_likelihood(
self, theta=None, eval_gradient=False, clone_kernel=True
):
"""Return log-marginal likelihood of theta for training data.
Parameters
----------
theta : array-like of shape (n_kernel_params,) default=None
Kernel hyperparameters for which the log-marginal likelihood is
evaluated. If None, the precomputed log_marginal_likelihood
of ``self.kernel_.theta`` is returned.
eval_gradient : bool, default=False
If True, the gradient of the log-marginal likelihood with respect
to the kernel hyperparameters at position theta is returned
additionally. If True, theta must not be None.
clone_kernel : bool, default=True
If True, the kernel attribute is copied. If False, the kernel
attribute is modified, but may result in a performance improvement.
Returns
-------
log_likelihood : float
Log-marginal likelihood of theta for training data.
log_likelihood_gradient : ndarray of shape (n_kernel_params,), optional
Gradient of the log-marginal likelihood with respect to the kernel
hyperparameters at position theta.
Only returned when eval_gradient is True.
"""
if theta is None:
if eval_gradient:
raise ValueError("Gradient can only be evaluated for theta!=None")
return self.log_marginal_likelihood_value_
if clone_kernel:
kernel = self.kernel_.clone_with_theta(theta)
else:
kernel = self.kernel_
kernel.theta = theta
if eval_gradient:
K, K_gradient = kernel(self.X_train_, eval_gradient=True)
else:
K = kernel(self.X_train_)
# Alg. 2.1, page 19, line 2 -> L = cholesky(K + sigma^2 I)
K[np.diag_indices_from(K)] += self.alpha
try:
L = cholesky(K, lower=GPR_CHOLESKY_LOWER, check_finite=False)
except np.linalg.LinAlgError:
return (-np.inf, np.zeros_like(theta)) if eval_gradient else -np.inf
# Support multi-dimensional output of self.y_train_
y_train = self.y_train_
if y_train.ndim == 1:
y_train = y_train[:, np.newaxis]
# Alg 2.1, page 19, line 3 -> alpha = L^T \ (L \ y)
alpha = cho_solve((L, GPR_CHOLESKY_LOWER), y_train, check_finite=False)
# Alg 2.1, page 19, line 7
# -0.5 . y^T . alpha - sum(log(diag(L))) - n_samples / 2 log(2*pi)
# y is originally thought to be a (1, n_samples) row vector. However,
# in multioutputs, y is of shape (n_samples, n_outputs) and we need to compute
# y^T . alpha for each output, independently using einsum. Thus, it
# is equivalent to:
# for output_idx in range(n_outputs):
# log_likelihood_dims[output_idx] = (
# y_train[:, [output_idx]] @ alpha[:, [output_idx]]
# )
log_likelihood_dims = -0.5 * np.einsum("ik,ik->k", y_train, alpha)
log_likelihood_dims -= np.log(np.diag(L)).sum()
log_likelihood_dims -= K.shape[0] / 2 * np.log(2 * np.pi)
# the log likelihood is summed up across the outputs
log_likelihood = log_likelihood_dims.sum(axis=-1)
if eval_gradient:
# Eq. 5.9, p. 114, and footnote 5 in p. 114
# 0.5 * trace((alpha . alpha^T - K^-1) . K_gradient)
# alpha is supposed to be a vector of (n_samples,) elements. With
# multioutputs, alpha is a matrix of size (n_samples, n_outputs).
# Therefore, we want to construct a matrix of
# (n_samples, n_samples, n_outputs) equivalent to
# for output_idx in range(n_outputs):
# output_alpha = alpha[:, [output_idx]]
# inner_term[..., output_idx] = output_alpha @ output_alpha.T
inner_term = np.einsum("ik,jk->ijk", alpha, alpha)
# compute K^-1 of shape (n_samples, n_samples)
K_inv = cho_solve(
(L, GPR_CHOLESKY_LOWER), np.eye(K.shape[0]), check_finite=False
)
# create a new axis to use broadcasting between inner_term and
# K_inv
inner_term -= K_inv[..., np.newaxis]
# Since we are interested in the trace of
# inner_term @ K_gradient, we don't explicitly compute the
# matrix-by-matrix operation and instead use an einsum. Therefore
# it is equivalent to:
# for param_idx in range(n_kernel_params):
# for output_idx in range(n_outputs):
# log_likelihood_gradient_dims[param_idx, output_idx] = (
# inner_term[..., output_idx] @
# K_gradient[..., param_idx]
# )
log_likelihood_gradient_dims = 0.5 * np.einsum(
"ijl,jik->kl", inner_term, K_gradient
)
# the log likelihood gradient is summed up across the outputs
log_likelihood_gradient = log_likelihood_gradient_dims.sum(axis=-1)
if eval_gradient:
return log_likelihood, log_likelihood_gradient
else:
return log_likelihood
def _constrained_optimization(self, obj_func, initial_theta, bounds):
if self.optimizer == "fmin_l_bfgs_b":
opt_res = scipy.optimize.minimize(
obj_func,
initial_theta,
method="L-BFGS-B",
jac=True,
bounds=bounds,
)
_check_optimize_result("lbfgs", opt_res)
theta_opt, func_min = opt_res.x, opt_res.fun
elif callable(self.optimizer):
theta_opt, func_min = self.optimizer(obj_func, initial_theta, bounds=bounds)
else:
raise ValueError(f"Unknown optimizer {self.optimizer}.")
return theta_opt, func_min
def _more_tags(self):
return {"requires_fit": False}

File diff suppressed because it is too large.


@@ -0,0 +1,54 @@
import numpy as np
from sklearn.base import clone
from sklearn.gaussian_process.kernels import (
GenericKernelMixin,
Hyperparameter,
Kernel,
StationaryKernelMixin,
)
class MiniSeqKernel(GenericKernelMixin, StationaryKernelMixin, Kernel):
"""
A minimal (but valid) convolutional kernel for sequences of variable
length.
"""
def __init__(self, baseline_similarity=0.5, baseline_similarity_bounds=(1e-5, 1)):
self.baseline_similarity = baseline_similarity
self.baseline_similarity_bounds = baseline_similarity_bounds
@property
def hyperparameter_baseline_similarity(self):
return Hyperparameter(
"baseline_similarity", "numeric", self.baseline_similarity_bounds
)
def _f(self, s1, s2):
return sum(
[1.0 if c1 == c2 else self.baseline_similarity for c1 in s1 for c2 in s2]
)
def _g(self, s1, s2):
return sum([0.0 if c1 == c2 else 1.0 for c1 in s1 for c2 in s2])
def __call__(self, X, Y=None, eval_gradient=False):
if Y is None:
Y = X
if eval_gradient:
return (
np.array([[self._f(x, y) for y in Y] for x in X]),
np.array([[[self._g(x, y)] for y in Y] for x in X]),
)
else:
return np.array([[self._f(x, y) for y in Y] for x in X])
def diag(self, X):
return np.array([self._f(x, x) for x in X])
def clone_with_theta(self, theta):
cloned = clone(self)
cloned.theta = theta
return cloned
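A short usage sketch of the kernel defined above on raw strings; the sequences are the same toy inputs used by test_predict_consistent_structured in the test file below:

from sklearn.gaussian_process.tests._mini_sequence_kernel import MiniSeqKernel

X = ["A", "AB", "B"]
kernel = MiniSeqKernel(baseline_similarity=0.5)

K = kernel(X)        # 3x3 Gram matrix: matching characters contribute 1.0,
                     # mismatches contribute baseline_similarity
d = kernel.diag(X)   # self-similarity of each variable-length sequence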


@@ -0,0 +1,284 @@
"""Testing for Gaussian process classification"""
# Author: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
# License: BSD 3 clause
import warnings
import numpy as np
import pytest
from scipy.optimize import approx_fprime
from sklearn.exceptions import ConvergenceWarning
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import (
RBF,
CompoundKernel,
WhiteKernel,
)
from sklearn.gaussian_process.kernels import (
ConstantKernel as C,
)
from sklearn.gaussian_process.tests._mini_sequence_kernel import MiniSeqKernel
from sklearn.utils._testing import assert_almost_equal, assert_array_equal
def f(x):
return np.sin(x)
X = np.atleast_2d(np.linspace(0, 10, 30)).T
X2 = np.atleast_2d([2.0, 4.0, 5.5, 6.5, 7.5]).T
y = np.array(f(X).ravel() > 0, dtype=int)
fX = f(X).ravel()
y_mc = np.empty(y.shape, dtype=int) # multi-class
y_mc[fX < -0.35] = 0
y_mc[(fX >= -0.35) & (fX < 0.35)] = 1
y_mc[fX > 0.35] = 2
fixed_kernel = RBF(length_scale=1.0, length_scale_bounds="fixed")
kernels = [
RBF(length_scale=0.1),
fixed_kernel,
RBF(length_scale=1.0, length_scale_bounds=(1e-3, 1e3)),
C(1.0, (1e-2, 1e2)) * RBF(length_scale=1.0, length_scale_bounds=(1e-3, 1e3)),
]
non_fixed_kernels = [kernel for kernel in kernels if kernel != fixed_kernel]
@pytest.mark.parametrize("kernel", kernels)
def test_predict_consistent(kernel):
# Check binary predict decision has also predicted probability above 0.5.
gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y)
assert_array_equal(gpc.predict(X), gpc.predict_proba(X)[:, 1] >= 0.5)
def test_predict_consistent_structured():
# Check binary predict decision has also predicted probability above 0.5.
X = ["A", "AB", "B"]
y = np.array([True, False, True])
kernel = MiniSeqKernel(baseline_similarity_bounds="fixed")
gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y)
assert_array_equal(gpc.predict(X), gpc.predict_proba(X)[:, 1] >= 0.5)
@pytest.mark.parametrize("kernel", non_fixed_kernels)
def test_lml_improving(kernel):
# Test that hyperparameter-tuning improves log-marginal likelihood.
gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y)
assert gpc.log_marginal_likelihood(gpc.kernel_.theta) > gpc.log_marginal_likelihood(
kernel.theta
)
@pytest.mark.parametrize("kernel", kernels)
def test_lml_precomputed(kernel):
# Test that lml of optimized kernel is stored correctly.
gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y)
assert_almost_equal(
gpc.log_marginal_likelihood(gpc.kernel_.theta), gpc.log_marginal_likelihood(), 7
)
@pytest.mark.parametrize("kernel", kernels)
def test_lml_without_cloning_kernel(kernel):
# Test that clone_kernel=False has side effects on kernel.theta.
gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y)
input_theta = np.ones(gpc.kernel_.theta.shape, dtype=np.float64)
gpc.log_marginal_likelihood(input_theta, clone_kernel=False)
assert_almost_equal(gpc.kernel_.theta, input_theta, 7)
@pytest.mark.parametrize("kernel", non_fixed_kernels)
def test_converged_to_local_maximum(kernel):
# Test that we are in local maximum after hyperparameter-optimization.
gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y)
lml, lml_gradient = gpc.log_marginal_likelihood(gpc.kernel_.theta, True)
assert np.all(
(np.abs(lml_gradient) < 1e-4)
| (gpc.kernel_.theta == gpc.kernel_.bounds[:, 0])
| (gpc.kernel_.theta == gpc.kernel_.bounds[:, 1])
)
@pytest.mark.parametrize("kernel", kernels)
def test_lml_gradient(kernel):
# Compare analytic and numeric gradient of log marginal likelihood.
gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y)
lml, lml_gradient = gpc.log_marginal_likelihood(kernel.theta, True)
lml_gradient_approx = approx_fprime(
kernel.theta, lambda theta: gpc.log_marginal_likelihood(theta, False), 1e-10
)
assert_almost_equal(lml_gradient, lml_gradient_approx, 3)
def test_random_starts(global_random_seed):
# Test that an increasing number of random-starts of GP fitting only
# increases the log marginal likelihood of the chosen theta.
n_samples, n_features = 25, 2
rng = np.random.RandomState(global_random_seed)
X = rng.randn(n_samples, n_features) * 2 - 1
y = (np.sin(X).sum(axis=1) + np.sin(3 * X).sum(axis=1)) > 0
kernel = C(1.0, (1e-2, 1e2)) * RBF(
length_scale=[1e-3] * n_features, length_scale_bounds=[(1e-4, 1e2)] * n_features
)
last_lml = -np.inf
for n_restarts_optimizer in range(5):
gp = GaussianProcessClassifier(
kernel=kernel,
n_restarts_optimizer=n_restarts_optimizer,
random_state=global_random_seed,
).fit(X, y)
lml = gp.log_marginal_likelihood(gp.kernel_.theta)
assert lml > last_lml - np.finfo(np.float32).eps
last_lml = lml
@pytest.mark.parametrize("kernel", non_fixed_kernels)
def test_custom_optimizer(kernel, global_random_seed):
# Test that GPC can use externally defined optimizers.
# Define a dummy optimizer that simply tests 10 random hyperparameters
def optimizer(obj_func, initial_theta, bounds):
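# obj_func is the quantity scikit-learn asks the optimizer to minimize (the
# negative log-marginal likelihood), so keeping the smallest value found
# maximizes the LML.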
rng = np.random.RandomState(global_random_seed)
theta_opt, func_min = initial_theta, obj_func(
initial_theta, eval_gradient=False
)
for _ in range(10):
theta = np.atleast_1d(
rng.uniform(np.maximum(-2, bounds[:, 0]), np.minimum(1, bounds[:, 1]))
)
f = obj_func(theta, eval_gradient=False)
if f < func_min:
theta_opt, func_min = theta, f
return theta_opt, func_min
gpc = GaussianProcessClassifier(kernel=kernel, optimizer=optimizer)
gpc.fit(X, y_mc)
# Check that the custom optimizer improved the log-marginal likelihood
assert gpc.log_marginal_likelihood(
gpc.kernel_.theta
) >= gpc.log_marginal_likelihood(kernel.theta)
@pytest.mark.parametrize("kernel", kernels)
def test_multi_class(kernel):
# Test GPC for multi-class classification problems.
gpc = GaussianProcessClassifier(kernel=kernel)
gpc.fit(X, y_mc)
y_prob = gpc.predict_proba(X2)
assert_almost_equal(y_prob.sum(1), 1)
y_pred = gpc.predict(X2)
assert_array_equal(np.argmax(y_prob, 1), y_pred)
@pytest.mark.parametrize("kernel", kernels)
def test_multi_class_n_jobs(kernel):
# Test that multi-class GPC produces identical results with n_jobs>1.
gpc = GaussianProcessClassifier(kernel=kernel)
gpc.fit(X, y_mc)
gpc_2 = GaussianProcessClassifier(kernel=kernel, n_jobs=2)
gpc_2.fit(X, y_mc)
y_prob = gpc.predict_proba(X2)
y_prob_2 = gpc_2.predict_proba(X2)
assert_almost_equal(y_prob, y_prob_2)
def test_warning_bounds():
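# fit() emits a ConvergenceWarning when an optimized hyperparameter ends up
# close to one of its bounds; the bounds below are deliberately tight so that
# the warnings are triggered.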
kernel = RBF(length_scale_bounds=[1e-5, 1e-3])
gpc = GaussianProcessClassifier(kernel=kernel)
warning_message = (
"The optimal value found for dimension 0 of parameter "
"length_scale is close to the specified upper bound "
"0.001. Increasing the bound and calling fit again may "
"find a better value."
)
with pytest.warns(ConvergenceWarning, match=warning_message):
gpc.fit(X, y)
kernel_sum = WhiteKernel(noise_level_bounds=[1e-5, 1e-3]) + RBF(
length_scale_bounds=[1e3, 1e5]
)
gpc_sum = GaussianProcessClassifier(kernel=kernel_sum)
with warnings.catch_warnings(record=True) as record:
warnings.simplefilter("always")
gpc_sum.fit(X, y)
assert len(record) == 2
assert issubclass(record[0].category, ConvergenceWarning)
assert (
record[0].message.args[0] == "The optimal value found for "
"dimension 0 of parameter "
"k1__noise_level is close to the "
"specified upper bound 0.001. "
"Increasing the bound and calling "
"fit again may find a better value."
)
assert issubclass(record[1].category, ConvergenceWarning)
assert (
record[1].message.args[0] == "The optimal value found for "
"dimension 0 of parameter "
"k2__length_scale is close to the "
"specified lower bound 1000.0. "
"Decreasing the bound and calling "
"fit again may find a better value."
)
X_tile = np.tile(X, 2)
kernel_dims = RBF(length_scale=[1.0, 2.0], length_scale_bounds=[1e1, 1e2])
gpc_dims = GaussianProcessClassifier(kernel=kernel_dims)
with warnings.catch_warnings(record=True) as record:
warnings.simplefilter("always")
gpc_dims.fit(X_tile, y)
assert len(record) == 2
assert issubclass(record[0].category, ConvergenceWarning)
assert (
record[0].message.args[0] == "The optimal value found for "
"dimension 0 of parameter "
"length_scale is close to the "
"specified upper bound 100.0. "
"Increasing the bound and calling "
"fit again may find a better value."
)
assert issubclass(record[1].category, ConvergenceWarning)
assert (
record[1].message.args[0] == "The optimal value found for "
"dimension 1 of parameter "
"length_scale is close to the "
"specified upper bound 100.0. "
"Increasing the bound and calling "
"fit again may find a better value."
)
@pytest.mark.parametrize(
"params, error_type, err_msg",
[
(
{"kernel": CompoundKernel(0)},
ValueError,
"kernel cannot be a CompoundKernel",
)
],
)
def test_gpc_fit_error(params, error_type, err_msg):
"""Check that expected error are raised during fit."""
gpc = GaussianProcessClassifier(**params)
with pytest.raises(error_type, match=err_msg):
gpc.fit(X, y)

View File

@ -0,0 +1,849 @@
"""Testing for Gaussian process regression"""
# Author: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
# Modified by: Pete Green <p.l.green@liverpool.ac.uk>
# License: BSD 3 clause
import re
import sys
import warnings
import numpy as np
import pytest
from scipy.optimize import approx_fprime
from sklearn.exceptions import ConvergenceWarning
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import (
RBF,
DotProduct,
ExpSineSquared,
WhiteKernel,
)
from sklearn.gaussian_process.kernels import (
ConstantKernel as C,
)
from sklearn.gaussian_process.tests._mini_sequence_kernel import MiniSeqKernel
from sklearn.utils._testing import (
assert_allclose,
assert_almost_equal,
assert_array_almost_equal,
assert_array_less,
)
def f(x):
return x * np.sin(x)
X = np.atleast_2d([1.0, 3.0, 5.0, 6.0, 7.0, 8.0]).T
X2 = np.atleast_2d([2.0, 4.0, 5.5, 6.5, 7.5]).T
y = f(X).ravel()
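# Noise-free training data from f(x) = x * sin(x); X2 holds test locations
# that are not part of the training set.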
fixed_kernel = RBF(length_scale=1.0, length_scale_bounds="fixed")
kernels = [
RBF(length_scale=1.0),
fixed_kernel,
RBF(length_scale=1.0, length_scale_bounds=(1e-3, 1e3)),
C(1.0, (1e-2, 1e2)) * RBF(length_scale=1.0, length_scale_bounds=(1e-3, 1e3)),
C(1.0, (1e-2, 1e2)) * RBF(length_scale=1.0, length_scale_bounds=(1e-3, 1e3))
+ C(1e-5, (1e-5, 1e2)),
C(0.1, (1e-2, 1e2)) * RBF(length_scale=1.0, length_scale_bounds=(1e-3, 1e3))
+ C(1e-5, (1e-5, 1e2)),
]
non_fixed_kernels = [kernel for kernel in kernels if kernel != fixed_kernel]
@pytest.mark.parametrize("kernel", kernels)
def test_gpr_interpolation(kernel):
if sys.maxsize <= 2**32:
pytest.xfail("This test may fail on 32 bit Python")
# Test the interpolating property for different kernels.
gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)
y_pred, y_cov = gpr.predict(X, return_cov=True)
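# With (essentially) no observation noise, the GP posterior mean interpolates
# the training targets and the posterior variance vanishes at the training
# inputs.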
assert_almost_equal(y_pred, y)
assert_almost_equal(np.diag(y_cov), 0.0)
def test_gpr_interpolation_structured():
# Test the interpolating property for different kernels.
kernel = MiniSeqKernel(baseline_similarity_bounds="fixed")
X = ["A", "B", "C"]
y = np.array([1, 2, 3])
gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)
y_pred, y_cov = gpr.predict(X, return_cov=True)
assert_almost_equal(
kernel(X, eval_gradient=True)[1].ravel(), (1 - np.eye(len(X))).ravel()
)
assert_almost_equal(y_pred, y)
assert_almost_equal(np.diag(y_cov), 0.0)
@pytest.mark.parametrize("kernel", non_fixed_kernels)
def test_lml_improving(kernel):
if sys.maxsize <= 2**32:
pytest.xfail("This test may fail on 32 bit Python")
# Test that hyperparameter-tuning improves log-marginal likelihood.
gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)
assert gpr.log_marginal_likelihood(gpr.kernel_.theta) > gpr.log_marginal_likelihood(
kernel.theta
)
@pytest.mark.parametrize("kernel", kernels)
def test_lml_precomputed(kernel):
# Test that lml of optimized kernel is stored correctly.
gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)
assert gpr.log_marginal_likelihood(gpr.kernel_.theta) == pytest.approx(
gpr.log_marginal_likelihood()
)
@pytest.mark.parametrize("kernel", kernels)
def test_lml_without_cloning_kernel(kernel):
# Test that clone_kernel=False has side-effects on kernel.theta (it is modified in place).
gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)
input_theta = np.ones(gpr.kernel_.theta.shape, dtype=np.float64)
gpr.log_marginal_likelihood(input_theta, clone_kernel=False)
assert_almost_equal(gpr.kernel_.theta, input_theta, 7)
@pytest.mark.parametrize("kernel", non_fixed_kernels)
def test_converged_to_local_maximum(kernel):
# Test that we are in local maximum after hyperparameter-optimization.
gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)
lml, lml_gradient = gpr.log_marginal_likelihood(gpr.kernel_.theta, True)
assert np.all(
(np.abs(lml_gradient) < 1e-4)
| (gpr.kernel_.theta == gpr.kernel_.bounds[:, 0])
| (gpr.kernel_.theta == gpr.kernel_.bounds[:, 1])
)
@pytest.mark.parametrize("kernel", non_fixed_kernels)
def test_solution_inside_bounds(kernel):
# Test that the optimized hyperparameters remain within the specified bounds.
gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)
bounds = gpr.kernel_.bounds
max_ = np.finfo(gpr.kernel_.theta.dtype).max
tiny = 1e-10
bounds[~np.isfinite(bounds[:, 1]), 1] = max_
assert_array_less(bounds[:, 0], gpr.kernel_.theta + tiny)
assert_array_less(gpr.kernel_.theta, bounds[:, 1] + tiny)
@pytest.mark.parametrize("kernel", kernels)
def test_lml_gradient(kernel):
# Compare analytic and numeric gradient of log marginal likelihood.
gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)
lml, lml_gradient = gpr.log_marginal_likelihood(kernel.theta, True)
lml_gradient_approx = approx_fprime(
kernel.theta, lambda theta: gpr.log_marginal_likelihood(theta, False), 1e-10
)
assert_almost_equal(lml_gradient, lml_gradient_approx, 3)
@pytest.mark.parametrize("kernel", kernels)
def test_prior(kernel):
# Test that GP prior has mean 0 and identical variances.
gpr = GaussianProcessRegressor(kernel=kernel)
y_mean, y_cov = gpr.predict(X, return_cov=True)
assert_almost_equal(y_mean, 0, 5)
if len(gpr.kernel.theta) > 1:
# XXX: quite hacky, works only for current kernels
assert_almost_equal(np.diag(y_cov), np.exp(kernel.theta[0]), 5)
else:
assert_almost_equal(np.diag(y_cov), 1, 5)
@pytest.mark.parametrize("kernel", kernels)
def test_sample_statistics(kernel):
# Test that statistics of samples drawn from GP are correct.
gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)
y_mean, y_cov = gpr.predict(X2, return_cov=True)
samples = gpr.sample_y(X2, 300000)
# More digits accuracy would require many more samples
assert_almost_equal(y_mean, np.mean(samples, 1), 1)
assert_almost_equal(
np.diag(y_cov) / np.diag(y_cov).max(),
np.var(samples, 1) / np.diag(y_cov).max(),
1,
)
def test_no_optimizer():
# Test that kernel parameters are unmodified when optimizer is None.
kernel = RBF(1.0)
gpr = GaussianProcessRegressor(kernel=kernel, optimizer=None).fit(X, y)
assert np.exp(gpr.kernel_.theta) == 1.0
@pytest.mark.parametrize("kernel", kernels)
@pytest.mark.parametrize("target", [y, np.ones(X.shape[0], dtype=np.float64)])
def test_predict_cov_vs_std(kernel, target):
if sys.maxsize <= 2**32:
pytest.xfail("This test may fail on 32 bit Python")
# Test that predicted std.-dev. is consistent with cov's diagonal.
gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)
y_mean, y_cov = gpr.predict(X2, return_cov=True)
y_mean, y_std = gpr.predict(X2, return_std=True)
assert_almost_equal(np.sqrt(np.diag(y_cov)), y_std)
def test_anisotropic_kernel():
# Test that GPR can identify meaningful anisotropic length-scales.
# We learn a function which varies ten times more slowly in one dimension
# than in the other; the corresponding length-scales should differ by at
# least a factor of 5.
rng = np.random.RandomState(0)
X = rng.uniform(-1, 1, (50, 2))
y = X[:, 0] + 0.1 * X[:, 1]
kernel = RBF([1.0, 1.0])
gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)
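# The target depends only weakly on X[:, 1], so its learned length-scale
# should be much larger; kernel_.theta stores log(length_scale) per dimension.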
assert np.exp(gpr.kernel_.theta[1]) > np.exp(gpr.kernel_.theta[0]) * 5
def test_random_starts():
# Test that an increasing number of random-starts of GP fitting only
# increases the log marginal likelihood of the chosen theta.
n_samples, n_features = 25, 2
rng = np.random.RandomState(0)
X = rng.randn(n_samples, n_features) * 2 - 1
y = (
np.sin(X).sum(axis=1)
+ np.sin(3 * X).sum(axis=1)
+ rng.normal(scale=0.1, size=n_samples)
)
kernel = C(1.0, (1e-2, 1e2)) * RBF(
length_scale=[1.0] * n_features, length_scale_bounds=[(1e-4, 1e2)] * n_features
) + WhiteKernel(noise_level=1e-5, noise_level_bounds=(1e-5, 1e1))
last_lml = -np.inf
for n_restarts_optimizer in range(5):
gp = GaussianProcessRegressor(
kernel=kernel,
n_restarts_optimizer=n_restarts_optimizer,
random_state=0,
).fit(X, y)
lml = gp.log_marginal_likelihood(gp.kernel_.theta)
assert lml > last_lml - np.finfo(np.float32).eps
last_lml = lml
@pytest.mark.parametrize("kernel", kernels)
def test_y_normalization(kernel):
"""
Test normalization of the target values in GP
Fitting non-normalizing GP on normalized y and fitting normalizing GP
on unnormalized y should yield identical results. Note that, here,
'normalized y' refers to y that has been made zero mean and unit
variance.
"""
y_mean = np.mean(y)
y_std = np.std(y)
y_norm = (y - y_mean) / y_std
# Fit non-normalizing GP on normalized y
gpr = GaussianProcessRegressor(kernel=kernel)
gpr.fit(X, y_norm)
# Fit normalizing GP on unnormalized y
gpr_norm = GaussianProcessRegressor(kernel=kernel, normalize_y=True)
gpr_norm.fit(X, y)
# Compare predicted mean, std-devs and covariances
y_pred, y_pred_std = gpr.predict(X2, return_std=True)
y_pred = y_pred * y_std + y_mean
y_pred_std = y_pred_std * y_std
y_pred_norm, y_pred_std_norm = gpr_norm.predict(X2, return_std=True)
assert_almost_equal(y_pred, y_pred_norm)
assert_almost_equal(y_pred_std, y_pred_std_norm)
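# The covariance predicted in normalized units (by gpr) must be rescaled by
# y_std**2 before comparing, since variances scale with the square of the
# standard deviation.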
_, y_cov = gpr.predict(X2, return_cov=True)
y_cov = y_cov * y_std**2
_, y_cov_norm = gpr_norm.predict(X2, return_cov=True)
assert_almost_equal(y_cov, y_cov_norm)
def test_large_variance_y():
"""
Here we test that, when normalize_y=True, our GP can produce a
sensible fit to training data whose variance is significantly
larger than unity. This test was made in response to issue #15612.
GP predictions are verified against predictions that were made
using GPy which, here, is treated as the 'gold standard'. Note that we
only investigate the RBF kernel here, as that is what was used in the
GPy implementation.
The following code can be used to recreate the GPy data:
--------------------------------------------------------------------------
import GPy
kernel_gpy = GPy.kern.RBF(input_dim=1, lengthscale=1.)
gpy = GPy.models.GPRegression(X, np.vstack(y_large), kernel_gpy)
gpy.optimize()
y_pred_gpy, y_var_gpy = gpy.predict(X2)
y_pred_std_gpy = np.sqrt(y_var_gpy)
--------------------------------------------------------------------------
"""
# Here we utilise a larger variance version of the training data
y_large = 10 * y
# Standard GP with normalize_y=True
RBF_params = {"length_scale": 1.0}
kernel = RBF(**RBF_params)
gpr = GaussianProcessRegressor(kernel=kernel, normalize_y=True)
gpr.fit(X, y_large)
y_pred, y_pred_std = gpr.predict(X2, return_std=True)
# 'Gold standard' mean predictions from GPy
y_pred_gpy = np.array(
[15.16918303, -27.98707845, -39.31636019, 14.52605515, 69.18503589]
)
# 'Gold standard' std predictions from GPy
y_pred_std_gpy = np.array(
[7.78860962, 3.83179178, 0.63149951, 0.52745188, 0.86170042]
)
# Based on numerical experiments, it's reasonable to expect our
# GP's mean predictions to get within 7% of predictions of those
# made by GPy.
assert_allclose(y_pred, y_pred_gpy, rtol=0.07, atol=0)
# Based on numerical experiments, it's reasonable to expect our
# GP's std predictions to get within 15% of predictions of those
# made by GPy.
assert_allclose(y_pred_std, y_pred_std_gpy, rtol=0.15, atol=0)
def test_y_multioutput():
# Test that GPR can deal with multi-dimensional target values
y_2d = np.vstack((y, y * 2)).T
# Test for fixed kernel that first dimension of 2d GP equals the output
# of 1d GP and that second dimension is twice as large
kernel = RBF(length_scale=1.0)
gpr = GaussianProcessRegressor(kernel=kernel, optimizer=None, normalize_y=False)
gpr.fit(X, y)
gpr_2d = GaussianProcessRegressor(kernel=kernel, optimizer=None, normalize_y=False)
gpr_2d.fit(X, y_2d)
y_pred_1d, y_std_1d = gpr.predict(X2, return_std=True)
y_pred_2d, y_std_2d = gpr_2d.predict(X2, return_std=True)
_, y_cov_1d = gpr.predict(X2, return_cov=True)
_, y_cov_2d = gpr_2d.predict(X2, return_cov=True)
assert_almost_equal(y_pred_1d, y_pred_2d[:, 0])
assert_almost_equal(y_pred_1d, y_pred_2d[:, 1] / 2)
# Standard deviation and covariance do not depend on output
for target in range(y_2d.shape[1]):
assert_almost_equal(y_std_1d, y_std_2d[..., target])
assert_almost_equal(y_cov_1d, y_cov_2d[..., target])
y_sample_1d = gpr.sample_y(X2, n_samples=10)
y_sample_2d = gpr_2d.sample_y(X2, n_samples=10)
assert y_sample_1d.shape == (5, 10)
assert y_sample_2d.shape == (5, 2, 10)
# Only the first target will be equal
assert_almost_equal(y_sample_1d, y_sample_2d[:, 0, :])
# Test hyperparameter optimization
for kernel in kernels:
gpr = GaussianProcessRegressor(kernel=kernel, normalize_y=True)
gpr.fit(X, y)
gpr_2d = GaussianProcessRegressor(kernel=kernel, normalize_y=True)
gpr_2d.fit(X, np.vstack((y, y)).T)
assert_almost_equal(gpr.kernel_.theta, gpr_2d.kernel_.theta, 4)
@pytest.mark.parametrize("kernel", non_fixed_kernels)
def test_custom_optimizer(kernel):
# Test that GPR can use externally defined optimizers.
# Define a dummy optimizer that simply tests 50 random hyperparameters
def optimizer(obj_func, initial_theta, bounds):
rng = np.random.RandomState(0)
theta_opt, func_min = initial_theta, obj_func(
initial_theta, eval_gradient=False
)
for _ in range(50):
theta = np.atleast_1d(
rng.uniform(np.maximum(-2, bounds[:, 0]), np.minimum(1, bounds[:, 1]))
)
f = obj_func(theta, eval_gradient=False)
if f < func_min:
theta_opt, func_min = theta, f
return theta_opt, func_min
gpr = GaussianProcessRegressor(kernel=kernel, optimizer=optimizer)
gpr.fit(X, y)
# Check that the custom optimizer improved the log-marginal likelihood
assert gpr.log_marginal_likelihood(gpr.kernel_.theta) > gpr.log_marginal_likelihood(
gpr.kernel.theta
)
def test_gpr_correct_error_message():
X = np.arange(12).reshape(6, -1)
y = np.ones(6)
kernel = DotProduct()
gpr = GaussianProcessRegressor(kernel=kernel, alpha=0.0)
message = (
"The kernel, %s, is not returning a "
"positive definite matrix. Try gradually increasing "
"the 'alpha' parameter of your "
"GaussianProcessRegressor estimator." % kernel
)
with pytest.raises(np.linalg.LinAlgError, match=re.escape(message)):
gpr.fit(X, y)
@pytest.mark.parametrize("kernel", kernels)
def test_duplicate_input(kernel):
# Test GPR can handle two different output-values for the same input.
gpr_equal_inputs = GaussianProcessRegressor(kernel=kernel, alpha=1e-2)
gpr_similar_inputs = GaussianProcessRegressor(kernel=kernel, alpha=1e-2)
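# alpha adds a diagonal term to the kernel matrix, which keeps the fit
# well-posed even when two (near-)identical inputs have different targets.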
X_ = np.vstack((X, X[0]))
y_ = np.hstack((y, y[0] + 1))
gpr_equal_inputs.fit(X_, y_)
X_ = np.vstack((X, X[0] + 1e-15))
y_ = np.hstack((y, y[0] + 1))
gpr_similar_inputs.fit(X_, y_)
X_test = np.linspace(0, 10, 100)[:, None]
y_pred_equal, y_std_equal = gpr_equal_inputs.predict(X_test, return_std=True)
y_pred_similar, y_std_similar = gpr_similar_inputs.predict(X_test, return_std=True)
assert_almost_equal(y_pred_equal, y_pred_similar)
assert_almost_equal(y_std_equal, y_std_similar)
def test_no_fit_default_predict():
# Test that GPR predictions without fit do not break by default.
default_kernel = C(1.0, constant_value_bounds="fixed") * RBF(
1.0, length_scale_bounds="fixed"
)
gpr1 = GaussianProcessRegressor()
_, y_std1 = gpr1.predict(X, return_std=True)
_, y_cov1 = gpr1.predict(X, return_cov=True)
gpr2 = GaussianProcessRegressor(kernel=default_kernel)
_, y_std2 = gpr2.predict(X, return_std=True)
_, y_cov2 = gpr2.predict(X, return_cov=True)
assert_array_almost_equal(y_std1, y_std2)
assert_array_almost_equal(y_cov1, y_cov2)
def test_warning_bounds():
kernel = RBF(length_scale_bounds=[1e-5, 1e-3])
gpr = GaussianProcessRegressor(kernel=kernel)
warning_message = (
"The optimal value found for dimension 0 of parameter "
"length_scale is close to the specified upper bound "
"0.001. Increasing the bound and calling fit again may "
"find a better value."
)
with pytest.warns(ConvergenceWarning, match=warning_message):
gpr.fit(X, y)
kernel_sum = WhiteKernel(noise_level_bounds=[1e-5, 1e-3]) + RBF(
length_scale_bounds=[1e3, 1e5]
)
gpr_sum = GaussianProcessRegressor(kernel=kernel_sum)
with warnings.catch_warnings(record=True) as record:
warnings.simplefilter("always")
gpr_sum.fit(X, y)
assert len(record) == 2
assert issubclass(record[0].category, ConvergenceWarning)
assert (
record[0].message.args[0] == "The optimal value found for "
"dimension 0 of parameter "
"k1__noise_level is close to the "
"specified upper bound 0.001. "
"Increasing the bound and calling "
"fit again may find a better value."
)
assert issubclass(record[1].category, ConvergenceWarning)
assert (
record[1].message.args[0] == "The optimal value found for "
"dimension 0 of parameter "
"k2__length_scale is close to the "
"specified lower bound 1000.0. "
"Decreasing the bound and calling "
"fit again may find a better value."
)
X_tile = np.tile(X, 2)
kernel_dims = RBF(length_scale=[1.0, 2.0], length_scale_bounds=[1e1, 1e2])
gpr_dims = GaussianProcessRegressor(kernel=kernel_dims)
with warnings.catch_warnings(record=True) as record:
warnings.simplefilter("always")
gpr_dims.fit(X_tile, y)
assert len(record) == 2
assert issubclass(record[0].category, ConvergenceWarning)
assert (
record[0].message.args[0] == "The optimal value found for "
"dimension 0 of parameter "
"length_scale is close to the "
"specified lower bound 10.0. "
"Decreasing the bound and calling "
"fit again may find a better value."
)
assert issubclass(record[1].category, ConvergenceWarning)
assert (
record[1].message.args[0] == "The optimal value found for "
"dimension 1 of parameter "
"length_scale is close to the "
"specified lower bound 10.0. "
"Decreasing the bound and calling "
"fit again may find a better value."
)
def test_bound_check_fixed_hyperparameter():
# Regression test for issue #17943
# Check that having a hyperparameter with fixed bounds doesn't cause an
# error
k1 = 50.0**2 * RBF(length_scale=50.0) # long term smooth rising trend
k2 = ExpSineSquared(
length_scale=1.0, periodicity=1.0, periodicity_bounds="fixed"
) # seasonal component
kernel = k1 + k2
GaussianProcessRegressor(kernel=kernel).fit(X, y)
@pytest.mark.parametrize("kernel", kernels)
def test_constant_target(kernel):
"""Check that the std. dev. is affected to 1 when normalizing a constant
feature.
Non-regression test for:
https://github.com/scikit-learn/scikit-learn/issues/18318
NaN where affected to the target when scaling due to null std. dev. with
constant target.
"""
y_constant = np.ones(X.shape[0], dtype=np.float64)
gpr = GaussianProcessRegressor(kernel=kernel, normalize_y=True)
gpr.fit(X, y_constant)
assert gpr._y_train_std == pytest.approx(1.0)
y_pred, y_cov = gpr.predict(X, return_cov=True)
assert_allclose(y_pred, y_constant)
# set atol because we compare to zero
assert_allclose(np.diag(y_cov), 0.0, atol=1e-9)
# Test multi-target data
n_samples, n_targets = X.shape[0], 2
rng = np.random.RandomState(0)
y = np.concatenate(
[
rng.normal(size=(n_samples, 1)), # non-constant target
np.full(shape=(n_samples, 1), fill_value=2), # constant target
],
axis=1,
)
gpr.fit(X, y)
Y_pred, Y_cov = gpr.predict(X, return_cov=True)
assert_allclose(Y_pred[:, 1], 2)
assert_allclose(np.diag(Y_cov[..., 1]), 0.0, atol=1e-9)
assert Y_pred.shape == (n_samples, n_targets)
assert Y_cov.shape == (n_samples, n_samples, n_targets)
def test_gpr_consistency_std_cov_non_invertible_kernel():
"""Check the consistency between the returned std. dev. and the covariance.
Non-regression test for:
https://github.com/scikit-learn/scikit-learn/issues/19936
Inconsistencies were observed when the kernel matrix cannot be inverted (or
is not numerically stable).
"""
kernel = C(8.98576054e05, (1e-12, 1e12)) * RBF(
[5.91326520e02, 1.32584051e03], (1e-12, 1e12)
) + WhiteKernel(noise_level=1e-5)
gpr = GaussianProcessRegressor(kernel=kernel, alpha=0, optimizer=None)
X_train = np.array(
[
[0.0, 0.0],
[1.54919334, -0.77459667],
[-1.54919334, 0.0],
[0.0, -1.54919334],
[0.77459667, 0.77459667],
[-0.77459667, 1.54919334],
]
)
y_train = np.array(
[
[-2.14882017e-10],
[-4.66975823e00],
[4.01823986e00],
[-1.30303674e00],
[-1.35760156e00],
[3.31215668e00],
]
)
gpr.fit(X_train, y_train)
X_test = np.array(
[
[-1.93649167, -1.93649167],
[1.93649167, -1.93649167],
[-1.93649167, 1.93649167],
[1.93649167, 1.93649167],
]
)
pred1, std = gpr.predict(X_test, return_std=True)
pred2, cov = gpr.predict(X_test, return_cov=True)
assert_allclose(std, np.sqrt(np.diagonal(cov)), rtol=1e-5)
@pytest.mark.parametrize(
"params, TypeError, err_msg",
[
(
{"alpha": np.zeros(100)},
ValueError,
"alpha must be a scalar or an array with same number of entries as y",
),
(
{
"kernel": WhiteKernel(noise_level_bounds=(-np.inf, np.inf)),
"n_restarts_optimizer": 2,
},
ValueError,
"requires that all bounds are finite",
),
],
)
def test_gpr_fit_error(params, TypeError, err_msg):
"""Check that expected error are raised during fit."""
gpr = GaussianProcessRegressor(**params)
with pytest.raises(TypeError, match=err_msg):
gpr.fit(X, y)
def test_gpr_lml_error():
"""Check that we raise the proper error in the LML method."""
gpr = GaussianProcessRegressor(kernel=RBF()).fit(X, y)
err_msg = "Gradient can only be evaluated for theta!=None"
with pytest.raises(ValueError, match=err_msg):
gpr.log_marginal_likelihood(eval_gradient=True)
def test_gpr_predict_error():
"""Check that we raise the proper error during predict."""
gpr = GaussianProcessRegressor(kernel=RBF()).fit(X, y)
err_msg = "At most one of return_std or return_cov can be requested."
with pytest.raises(RuntimeError, match=err_msg):
gpr.predict(X, return_cov=True, return_std=True)
@pytest.mark.parametrize("normalize_y", [True, False])
@pytest.mark.parametrize("n_targets", [None, 1, 10])
def test_predict_shapes(normalize_y, n_targets):
"""Check the shapes of y_mean, y_std, and y_cov in single-output
(n_targets=None) and multi-output settings, including the edge case when
n_targets=1, where the sklearn convention is to squeeze the predictions.
Non-regression test for:
https://github.com/scikit-learn/scikit-learn/issues/17394
https://github.com/scikit-learn/scikit-learn/issues/18065
https://github.com/scikit-learn/scikit-learn/issues/22174
"""
rng = np.random.RandomState(1234)
n_features, n_samples_train, n_samples_test = 6, 9, 7
y_train_shape = (n_samples_train,)
if n_targets is not None:
y_train_shape = y_train_shape + (n_targets,)
# By convention single-output data is squeezed upon prediction
y_test_shape = (n_samples_test,)
if n_targets is not None and n_targets > 1:
y_test_shape = y_test_shape + (n_targets,)
X_train = rng.randn(n_samples_train, n_features)
X_test = rng.randn(n_samples_test, n_features)
y_train = rng.randn(*y_train_shape)
model = GaussianProcessRegressor(normalize_y=normalize_y)
model.fit(X_train, y_train)
y_pred, y_std = model.predict(X_test, return_std=True)
_, y_cov = model.predict(X_test, return_cov=True)
assert y_pred.shape == y_test_shape
assert y_std.shape == y_test_shape
assert y_cov.shape == (n_samples_test,) + y_test_shape
@pytest.mark.parametrize("normalize_y", [True, False])
@pytest.mark.parametrize("n_targets", [None, 1, 10])
def test_sample_y_shapes(normalize_y, n_targets):
"""Check the shapes of y_samples in single-output (n_targets=0) and
multi-output settings, including the edge case when n_targets=1, where the
sklearn convention is to squeeze the predictions.
Non-regression test for:
https://github.com/scikit-learn/scikit-learn/issues/22175
"""
rng = np.random.RandomState(1234)
n_features, n_samples_train = 6, 9
# Number of spatial locations to predict at
n_samples_X_test = 7
# Number of sample predictions per test point
n_samples_y_test = 5
y_train_shape = (n_samples_train,)
if n_targets is not None:
y_train_shape = y_train_shape + (n_targets,)
# By convention single-output data is squeezed upon prediction
if n_targets is not None and n_targets > 1:
y_test_shape = (n_samples_X_test, n_targets, n_samples_y_test)
else:
y_test_shape = (n_samples_X_test, n_samples_y_test)
X_train = rng.randn(n_samples_train, n_features)
X_test = rng.randn(n_samples_X_test, n_features)
y_train = rng.randn(*y_train_shape)
model = GaussianProcessRegressor(normalize_y=normalize_y)
# FIXME: before fitting, the estimator does not have information regarding
# the number of targets and defaults to 1. This is inconsistent with the shape
# provided after `fit`. This assert should be made once the following issue
# is fixed:
# https://github.com/scikit-learn/scikit-learn/issues/22430
# y_samples = model.sample_y(X_test, n_samples=n_samples_y_test)
# assert y_samples.shape == y_test_shape
model.fit(X_train, y_train)
y_samples = model.sample_y(X_test, n_samples=n_samples_y_test)
assert y_samples.shape == y_test_shape
@pytest.mark.parametrize("n_targets", [None, 1, 2, 3])
@pytest.mark.parametrize("n_samples", [1, 5])
def test_sample_y_shape_with_prior(n_targets, n_samples):
"""Check the output shape of `sample_y` is consistent before and after `fit`."""
rng = np.random.RandomState(1024)
X = rng.randn(10, 3)
y = rng.randn(10, n_targets if n_targets is not None else 1)
model = GaussianProcessRegressor(n_targets=n_targets)
shape_before_fit = model.sample_y(X, n_samples=n_samples).shape
model.fit(X, y)
shape_after_fit = model.sample_y(X, n_samples=n_samples).shape
assert shape_before_fit == shape_after_fit
@pytest.mark.parametrize("n_targets", [None, 1, 2, 3])
def test_predict_shape_with_prior(n_targets):
"""Check the output shape of `predict` with prior distribution."""
rng = np.random.RandomState(1024)
n_sample = 10
X = rng.randn(n_sample, 3)
y = rng.randn(n_sample, n_targets if n_targets is not None else 1)
model = GaussianProcessRegressor(n_targets=n_targets)
mean_prior, cov_prior = model.predict(X, return_cov=True)
_, std_prior = model.predict(X, return_std=True)
model.fit(X, y)
mean_post, cov_post = model.predict(X, return_cov=True)
_, std_post = model.predict(X, return_std=True)
assert mean_prior.shape == mean_post.shape
assert cov_prior.shape == cov_post.shape
assert std_prior.shape == std_post.shape
def test_n_targets_error():
"""Check that an error is raised when the number of targets seen at fit is
inconsistent with n_targets.
"""
rng = np.random.RandomState(0)
X = rng.randn(10, 3)
y = rng.randn(10, 2)
model = GaussianProcessRegressor(n_targets=1)
with pytest.raises(ValueError, match="The number of targets seen in `y`"):
model.fit(X, y)
class CustomKernel(C):
"""
A custom kernel that has a diag method that returns the first column of the
input matrix X. This is a helper for the test to check that the input
matrix X is not mutated.
"""
def diag(self, X):
return X[:, 0]
def test_gpr_predict_input_not_modified():
"""
Check that the input X is not modified by the predict method of the
GaussianProcessRegressor when setting return_std=True.
Non-regression test for:
https://github.com/scikit-learn/scikit-learn/issues/24340
"""
gpr = GaussianProcessRegressor(kernel=CustomKernel()).fit(X, y)
X2_copy = np.copy(X2)
_, _ = gpr.predict(X2, return_std=True)
assert_allclose(X2, X2_copy)

View File

@ -0,0 +1,388 @@
"""Testing for kernels for Gaussian processes."""
# Author: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
# License: BSD 3 clause
from inspect import signature
import numpy as np
import pytest
from sklearn.base import clone
from sklearn.gaussian_process.kernels import (
RBF,
CompoundKernel,
ConstantKernel,
DotProduct,
Exponentiation,
ExpSineSquared,
KernelOperator,
Matern,
PairwiseKernel,
RationalQuadratic,
WhiteKernel,
_approx_fprime,
)
from sklearn.metrics.pairwise import (
PAIRWISE_KERNEL_FUNCTIONS,
euclidean_distances,
pairwise_kernels,
)
from sklearn.utils._testing import (
assert_allclose,
assert_almost_equal,
assert_array_almost_equal,
assert_array_equal,
)
X = np.random.RandomState(0).normal(0, 1, (5, 2))
Y = np.random.RandomState(0).normal(0, 1, (6, 2))
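# X and Y have different numbers of samples (5 vs. 6) so that cross-kernel
# evaluations k(X, Y) with non-square output are exercised.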
kernel_rbf_plus_white = RBF(length_scale=2.0) + WhiteKernel(noise_level=3.0)
kernels = [
RBF(length_scale=2.0),
RBF(length_scale_bounds=(0.5, 2.0)),
ConstantKernel(constant_value=10.0),
2.0 * RBF(length_scale=0.33, length_scale_bounds="fixed"),
2.0 * RBF(length_scale=0.5),
kernel_rbf_plus_white,
2.0 * RBF(length_scale=[0.5, 2.0]),
2.0 * Matern(length_scale=0.33, length_scale_bounds="fixed"),
2.0 * Matern(length_scale=0.5, nu=0.5),
2.0 * Matern(length_scale=1.5, nu=1.5),
2.0 * Matern(length_scale=2.5, nu=2.5),
2.0 * Matern(length_scale=[0.5, 2.0], nu=0.5),
3.0 * Matern(length_scale=[2.0, 0.5], nu=1.5),
4.0 * Matern(length_scale=[0.5, 0.5], nu=2.5),
RationalQuadratic(length_scale=0.5, alpha=1.5),
ExpSineSquared(length_scale=0.5, periodicity=1.5),
DotProduct(sigma_0=2.0),
DotProduct(sigma_0=2.0) ** 2,
RBF(length_scale=[2.0]),
Matern(length_scale=[2.0]),
]
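# additive_chi2 and chi2 are skipped since they assume non-negative input
# features, which the Gaussian-distributed X above does not provide.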
for metric in PAIRWISE_KERNEL_FUNCTIONS:
if metric in ["additive_chi2", "chi2"]:
continue
kernels.append(PairwiseKernel(gamma=1.0, metric=metric))
@pytest.mark.parametrize("kernel", kernels)
def test_kernel_gradient(kernel):
# Compare analytic and numeric gradient of kernels.
K, K_gradient = kernel(X, eval_gradient=True)
assert K_gradient.shape[0] == X.shape[0]
assert K_gradient.shape[1] == X.shape[0]
assert K_gradient.shape[2] == kernel.theta.shape[0]
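# The gradient returned with eval_gradient=True is taken with respect to
# theta, i.e. the log-transformed hyperparameters; compare it against a
# finite-difference approximation.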
def eval_kernel_for_theta(theta):
kernel_clone = kernel.clone_with_theta(theta)
K = kernel_clone(X, eval_gradient=False)
return K
K_gradient_approx = _approx_fprime(kernel.theta, eval_kernel_for_theta, 1e-10)
assert_almost_equal(K_gradient, K_gradient_approx, 4)
@pytest.mark.parametrize(
"kernel",
[
kernel
for kernel in kernels
# skip non-basic kernels
if not (isinstance(kernel, (KernelOperator, Exponentiation)))
],
)
def test_kernel_theta(kernel):
# Check that parameter vector theta of kernel is set correctly.
theta = kernel.theta
_, K_gradient = kernel(X, eval_gradient=True)
# Determine kernel parameters that contribute to theta
init_sign = signature(kernel.__class__.__init__).parameters.values()
args = [p.name for p in init_sign if p.name != "self"]
theta_vars = map(
lambda s: s[0 : -len("_bounds")], filter(lambda s: s.endswith("_bounds"), args)
)
assert set(hyperparameter.name for hyperparameter in kernel.hyperparameters) == set(
theta_vars
)
# Check that values returned in theta are consistent with
# hyperparameter values (being their logarithms)
for i, hyperparameter in enumerate(kernel.hyperparameters):
assert theta[i] == np.log(getattr(kernel, hyperparameter.name))
# Fixed kernel parameters must be excluded from theta and gradient.
for i, hyperparameter in enumerate(kernel.hyperparameters):
# create copy with certain hyperparameter fixed
params = kernel.get_params()
params[hyperparameter.name + "_bounds"] = "fixed"
kernel_class = kernel.__class__
new_kernel = kernel_class(**params)
# Check that theta and K_gradient are identical with the fixed
# dimension left out
_, K_gradient_new = new_kernel(X, eval_gradient=True)
assert theta.shape[0] == new_kernel.theta.shape[0] + 1
assert K_gradient.shape[2] == K_gradient_new.shape[2] + 1
if i > 0:
assert theta[:i] == new_kernel.theta[:i]
assert_array_equal(K_gradient[..., :i], K_gradient_new[..., :i])
if i + 1 < len(kernel.hyperparameters):
assert theta[i + 1 :] == new_kernel.theta[i:]
assert_array_equal(K_gradient[..., i + 1 :], K_gradient_new[..., i:])
# Check that values of theta are modified correctly
for i, hyperparameter in enumerate(kernel.hyperparameters):
theta[i] = np.log(42)
kernel.theta = theta
assert_almost_equal(getattr(kernel, hyperparameter.name), 42)
setattr(kernel, hyperparameter.name, 43)
assert_almost_equal(kernel.theta[i], np.log(43))
@pytest.mark.parametrize(
"kernel",
[
kernel
for kernel in kernels
# For the kernel containing a WhiteKernel, k(X) != k(X, X) on the diagonal
if kernel != kernel_rbf_plus_white
],
)
def test_auto_vs_cross(kernel):
# Auto-correlation and cross-correlation should be consistent.
K_auto = kernel(X)
K_cross = kernel(X, X)
assert_almost_equal(K_auto, K_cross, 5)
@pytest.mark.parametrize("kernel", kernels)
def test_kernel_diag(kernel):
# Test that diag method of kernel returns consistent results.
K_call_diag = np.diag(kernel(X))
K_diag = kernel.diag(X)
assert_almost_equal(K_call_diag, K_diag, 5)
def test_kernel_operator_commutative():
# Adding kernels and multiplying kernels should be commutative.
# Check addition
assert_almost_equal((RBF(2.0) + 1.0)(X), (1.0 + RBF(2.0))(X))
# Check multiplication
assert_almost_equal((3.0 * RBF(2.0))(X), (RBF(2.0) * 3.0)(X))
def test_kernel_anisotropic():
# Anisotropic kernel should be consistent with isotropic kernels.
kernel = 3.0 * RBF([0.5, 2.0])
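# The RBF kernel depends on the inputs only through x / length_scale, so
# scaling a feature and its length-scale by the same factor leaves K
# unchanged.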
K = kernel(X)
X1 = np.array(X)
X1[:, 0] *= 4
K1 = 3.0 * RBF(2.0)(X1)
assert_almost_equal(K, K1)
X2 = np.array(X)
X2[:, 1] /= 4
K2 = 3.0 * RBF(0.5)(X2)
assert_almost_equal(K, K2)
# Check getting and setting via theta
kernel.theta = kernel.theta + np.log(2)
assert_array_equal(kernel.theta, np.log([6.0, 1.0, 4.0]))
assert_array_equal(kernel.k2.length_scale, [1.0, 4.0])
@pytest.mark.parametrize(
"kernel", [kernel for kernel in kernels if kernel.is_stationary()]
)
def test_kernel_stationary(kernel):
# Test stationarity of kernels.
K = kernel(X, X + 1)
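# For a stationary kernel, k(x, x') depends only on x - x', so every diagonal
# entry of k(X, X + 1) equals the same constant.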
assert_almost_equal(K[0, 0], np.diag(K))
@pytest.mark.parametrize("kernel", kernels)
def test_kernel_input_type(kernel):
# Test whether the kernel is defined on vectors or on structured data
if isinstance(kernel, Exponentiation):
assert kernel.requires_vector_input == kernel.kernel.requires_vector_input
if isinstance(kernel, KernelOperator):
assert kernel.requires_vector_input == (
kernel.k1.requires_vector_input or kernel.k2.requires_vector_input
)
def test_compound_kernel_input_type():
kernel = CompoundKernel([WhiteKernel(noise_level=3.0)])
assert not kernel.requires_vector_input
kernel = CompoundKernel([WhiteKernel(noise_level=3.0), RBF(length_scale=2.0)])
assert kernel.requires_vector_input
def check_hyperparameters_equal(kernel1, kernel2):
# Check that hyperparameters of two kernels are equal
for attr in set(dir(kernel1) + dir(kernel2)):
if attr.startswith("hyperparameter_"):
attr_value1 = getattr(kernel1, attr)
attr_value2 = getattr(kernel2, attr)
assert attr_value1 == attr_value2
@pytest.mark.parametrize("kernel", kernels)
def test_kernel_clone(kernel):
# Test that sklearn's clone works correctly on kernels.
kernel_cloned = clone(kernel)
# XXX: Should this be fixed?
# This differs from the sklearn's estimators equality check.
assert kernel == kernel_cloned
assert id(kernel) != id(kernel_cloned)
# Check that all constructor parameters are equal.
assert kernel.get_params() == kernel_cloned.get_params()
# Check that all hyperparameters are equal.
check_hyperparameters_equal(kernel, kernel_cloned)
@pytest.mark.parametrize("kernel", kernels)
def test_kernel_clone_after_set_params(kernel):
# This test is to verify that using set_params does not
# break clone on kernels.
# This used to break because in kernels such as the RBF, non-trivial
# logic that modified the length scale used to be in the constructor
# See https://github.com/scikit-learn/scikit-learn/issues/6961
# for more details.
bounds = (1e-5, 1e5)
kernel_cloned = clone(kernel)
params = kernel.get_params()
# RationalQuadratic kernel is isotropic.
isotropic_kernels = (ExpSineSquared, RationalQuadratic)
if "length_scale" in params and not isinstance(kernel, isotropic_kernels):
length_scale = params["length_scale"]
if np.iterable(length_scale):
# XXX unreached code as of v0.22
params["length_scale"] = length_scale[0]
params["length_scale_bounds"] = bounds
else:
params["length_scale"] = [length_scale] * 2
params["length_scale_bounds"] = bounds * 2
kernel_cloned.set_params(**params)
kernel_cloned_clone = clone(kernel_cloned)
assert kernel_cloned_clone.get_params() == kernel_cloned.get_params()
assert id(kernel_cloned_clone) != id(kernel_cloned)
check_hyperparameters_equal(kernel_cloned, kernel_cloned_clone)
def test_matern_kernel():
# Test consistency of Matern kernel for special values of nu.
K = Matern(nu=1.5, length_scale=1.0)(X)
# the diagonal elements of a matern kernel are 1
assert_array_almost_equal(np.diag(K), np.ones(X.shape[0]))
# the matern kernel with nu=0.5 is equal to the absolute exponential kernel
K_absexp = np.exp(-euclidean_distances(X, X, squared=False))
K = Matern(nu=0.5, length_scale=1.0)(X)
assert_array_almost_equal(K, K_absexp)
# the matern kernel with nu=inf is equal to the RBF kernel
K_rbf = RBF(length_scale=1.0)(X)
K = Matern(nu=np.inf, length_scale=1.0)(X)
assert_array_almost_equal(K, K_rbf)
assert_allclose(K, K_rbf)
# test that special cases of the matern kernel (nu in [0.5, 1.5, 2.5])
# give nearly identical results to the general case for nu in
# [0.5 + tiny, 1.5 + tiny, 2.5 + tiny]
tiny = 1e-10
for nu in [0.5, 1.5, 2.5]:
K1 = Matern(nu=nu, length_scale=1.0)(X)
K2 = Matern(nu=nu + tiny, length_scale=1.0)(X)
assert_array_almost_equal(K1, K2)
# test that the matern kernel with a large nu is close to the RBF kernel
large = 100
K1 = Matern(nu=large, length_scale=1.0)(X)
K2 = RBF(length_scale=1.0)(X)
assert_array_almost_equal(K1, K2, decimal=2)
@pytest.mark.parametrize("kernel", kernels)
def test_kernel_versus_pairwise(kernel):
# Check that GP kernels can also be used as pairwise kernels.
# Test auto-kernel
if kernel != kernel_rbf_plus_white:
# For WhiteKernel: k(X) != k(X,X). This is assumed by
# pairwise_kernels
K1 = kernel(X)
K2 = pairwise_kernels(X, metric=kernel)
assert_array_almost_equal(K1, K2)
# Test cross-kernel
K1 = kernel(X, Y)
K2 = pairwise_kernels(X, Y, metric=kernel)
assert_array_almost_equal(K1, K2)
@pytest.mark.parametrize("kernel", kernels)
def test_set_get_params(kernel):
# Check that set_params()/get_params() is consistent with kernel.theta.
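# kernel.theta stores the log of the non-fixed hyperparameter values, so
# np.exp(theta) should round-trip with get_params()/set_params().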
# Test get_params()
index = 0
params = kernel.get_params()
for hyperparameter in kernel.hyperparameters:
if isinstance("string", type(hyperparameter.bounds)):
if hyperparameter.bounds == "fixed":
continue
size = hyperparameter.n_elements
if size > 1: # anisotropic kernels
assert_almost_equal(
np.exp(kernel.theta[index : index + size]), params[hyperparameter.name]
)
index += size
else:
assert_almost_equal(
np.exp(kernel.theta[index]), params[hyperparameter.name]
)
index += 1
# Test set_params()
index = 0
value = 10 # arbitrary value
for hyperparameter in kernel.hyperparameters:
if isinstance("string", type(hyperparameter.bounds)):
if hyperparameter.bounds == "fixed":
continue
size = hyperparameter.n_elements
if size > 1: # anisotropic kernels
kernel.set_params(**{hyperparameter.name: [value] * size})
assert_almost_equal(
np.exp(kernel.theta[index : index + size]), [value] * size
)
index += size
else:
kernel.set_params(**{hyperparameter.name: value})
assert_almost_equal(np.exp(kernel.theta[index]), value)
index += 1
@pytest.mark.parametrize("kernel", kernels)
def test_repr_kernels(kernel):
# Smoke-test for repr in kernels.
repr(kernel)
def test_rational_quadratic_kernel():
kernel = RationalQuadratic(length_scale=[1.0, 1.0])
message = (
"RationalQuadratic kernel only supports isotropic "
"version, please use a single "
"scalar for length_scale"
)
with pytest.raises(AttributeError, match=message):
kernel(X)