reconnect moved files to git repo
This commit is contained in:
@ -0,0 +1,12 @@
|
||||
"""Gaussian process based regression and classification."""
|
||||
|
||||
# Author: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
|
||||
# Vincent Dubourg <vincent.dubourg@gmail.com>
|
||||
# (mostly translation, see implementation details)
|
||||
# License: BSD 3 clause
|
||||
|
||||
from . import kernels
|
||||
from ._gpc import GaussianProcessClassifier
|
||||
from ._gpr import GaussianProcessRegressor
|
||||
|
||||
__all__ = ["GaussianProcessRegressor", "GaussianProcessClassifier", "kernels"]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,902 @@
|
||||
"""Gaussian processes classification."""
|
||||
|
||||
# Authors: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
|
||||
#
|
||||
# License: BSD 3 clause
|
||||
|
||||
from numbers import Integral
|
||||
from operator import itemgetter
|
||||
|
||||
import numpy as np
|
||||
import scipy.optimize
|
||||
from scipy.linalg import cho_solve, cholesky, solve
|
||||
from scipy.special import erf, expit
|
||||
|
||||
from ..base import BaseEstimator, ClassifierMixin, _fit_context, clone
|
||||
from ..multiclass import OneVsOneClassifier, OneVsRestClassifier
|
||||
from ..preprocessing import LabelEncoder
|
||||
from ..utils import check_random_state
|
||||
from ..utils._param_validation import Interval, StrOptions
|
||||
from ..utils.optimize import _check_optimize_result
|
||||
from ..utils.validation import check_is_fitted
|
||||
from .kernels import RBF, CompoundKernel, Kernel
|
||||
from .kernels import ConstantKernel as C
|
||||
|
||||
# Values required for approximating the logistic sigmoid by
|
||||
# error functions. coefs are obtained via:
|
||||
# x = np.array([0, 0.6, 2, 3.5, 4.5, np.inf])
|
||||
# b = logistic(x)
|
||||
# A = (erf(np.dot(x, self.lambdas)) + 1) / 2
|
||||
# coefs = lstsq(A, b)[0]
|
||||
LAMBDAS = np.array([0.41, 0.4, 0.37, 0.44, 0.39])[:, np.newaxis]
|
||||
COEFS = np.array(
|
||||
[-1854.8214151, 3516.89893646, 221.29346712, 128.12323805, -2010.49422654]
|
||||
)[:, np.newaxis]
|
||||
|
||||
|
||||
class _BinaryGaussianProcessClassifierLaplace(BaseEstimator):
|
||||
"""Binary Gaussian process classification based on Laplace approximation.
|
||||
|
||||
The implementation is based on Algorithm 3.1, 3.2, and 5.1 from [RW2006]_.
|
||||
|
||||
Internally, the Laplace approximation is used for approximating the
|
||||
non-Gaussian posterior by a Gaussian.
|
||||
|
||||
Currently, the implementation is restricted to using the logistic link
|
||||
function.
|
||||
|
||||
.. versionadded:: 0.18
|
||||
|
||||
Parameters
|
||||
----------
|
||||
kernel : kernel instance, default=None
|
||||
The kernel specifying the covariance function of the GP. If None is
|
||||
passed, the kernel "1.0 * RBF(1.0)" is used as default. Note that
|
||||
the kernel's hyperparameters are optimized during fitting.
|
||||
|
||||
optimizer : 'fmin_l_bfgs_b' or callable, default='fmin_l_bfgs_b'
|
||||
Can either be one of the internally supported optimizers for optimizing
|
||||
the kernel's parameters, specified by a string, or an externally
|
||||
defined optimizer passed as a callable. If a callable is passed, it
|
||||
must have the signature::
|
||||
|
||||
def optimizer(obj_func, initial_theta, bounds):
|
||||
# * 'obj_func' is the objective function to be maximized, which
|
||||
# takes the hyperparameters theta as parameter and an
|
||||
# optional flag eval_gradient, which determines if the
|
||||
# gradient is returned additionally to the function value
|
||||
# * 'initial_theta': the initial value for theta, which can be
|
||||
# used by local optimizers
|
||||
# * 'bounds': the bounds on the values of theta
|
||||
....
|
||||
# Returned are the best found hyperparameters theta and
|
||||
# the corresponding value of the target function.
|
||||
return theta_opt, func_min
|
||||
|
||||
Per default, the 'L-BFGS-B' algorithm from scipy.optimize.minimize
|
||||
is used. If None is passed, the kernel's parameters are kept fixed.
|
||||
Available internal optimizers are::
|
||||
|
||||
'fmin_l_bfgs_b'
|
||||
|
||||
n_restarts_optimizer : int, default=0
|
||||
The number of restarts of the optimizer for finding the kernel's
|
||||
parameters which maximize the log-marginal likelihood. The first run
|
||||
of the optimizer is performed from the kernel's initial parameters,
|
||||
the remaining ones (if any) from thetas sampled log-uniform randomly
|
||||
from the space of allowed theta-values. If greater than 0, all bounds
|
||||
must be finite. Note that n_restarts_optimizer=0 implies that one
|
||||
run is performed.
|
||||
|
||||
max_iter_predict : int, default=100
|
||||
The maximum number of iterations in Newton's method for approximating
|
||||
the posterior during predict. Smaller values will reduce computation
|
||||
time at the cost of worse results.
|
||||
|
||||
warm_start : bool, default=False
|
||||
If warm-starts are enabled, the solution of the last Newton iteration
|
||||
on the Laplace approximation of the posterior mode is used as
|
||||
initialization for the next call of _posterior_mode(). This can speed
|
||||
up convergence when _posterior_mode is called several times on similar
|
||||
problems as in hyperparameter optimization. See :term:`the Glossary
|
||||
<warm_start>`.
|
||||
|
||||
copy_X_train : bool, default=True
|
||||
If True, a persistent copy of the training data is stored in the
|
||||
object. Otherwise, just a reference to the training data is stored,
|
||||
which might cause predictions to change if the data is modified
|
||||
externally.
|
||||
|
||||
random_state : int, RandomState instance or None, default=None
|
||||
Determines random number generation used to initialize the centers.
|
||||
Pass an int for reproducible results across multiple function calls.
|
||||
See :term:`Glossary <random_state>`.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
X_train_ : array-like of shape (n_samples, n_features) or list of object
|
||||
Feature vectors or other representations of training data (also
|
||||
required for prediction).
|
||||
|
||||
y_train_ : array-like of shape (n_samples,)
|
||||
Target values in training data (also required for prediction)
|
||||
|
||||
classes_ : array-like of shape (n_classes,)
|
||||
Unique class labels.
|
||||
|
||||
kernel_ : kernl instance
|
||||
The kernel used for prediction. The structure of the kernel is the
|
||||
same as the one passed as parameter but with optimized hyperparameters
|
||||
|
||||
L_ : array-like of shape (n_samples, n_samples)
|
||||
Lower-triangular Cholesky decomposition of the kernel in X_train_
|
||||
|
||||
pi_ : array-like of shape (n_samples,)
|
||||
The probabilities of the positive class for the training points
|
||||
X_train_
|
||||
|
||||
W_sr_ : array-like of shape (n_samples,)
|
||||
Square root of W, the Hessian of log-likelihood of the latent function
|
||||
values for the observed labels. Since W is diagonal, only the diagonal
|
||||
of sqrt(W) is stored.
|
||||
|
||||
log_marginal_likelihood_value_ : float
|
||||
The log-marginal-likelihood of ``self.kernel_.theta``
|
||||
|
||||
References
|
||||
----------
|
||||
.. [RW2006] `Carl E. Rasmussen and Christopher K.I. Williams,
|
||||
"Gaussian Processes for Machine Learning",
|
||||
MIT Press 2006 <https://www.gaussianprocess.org/gpml/chapters/RW.pdf>`_
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
kernel=None,
|
||||
*,
|
||||
optimizer="fmin_l_bfgs_b",
|
||||
n_restarts_optimizer=0,
|
||||
max_iter_predict=100,
|
||||
warm_start=False,
|
||||
copy_X_train=True,
|
||||
random_state=None,
|
||||
):
|
||||
self.kernel = kernel
|
||||
self.optimizer = optimizer
|
||||
self.n_restarts_optimizer = n_restarts_optimizer
|
||||
self.max_iter_predict = max_iter_predict
|
||||
self.warm_start = warm_start
|
||||
self.copy_X_train = copy_X_train
|
||||
self.random_state = random_state
|
||||
|
||||
def fit(self, X, y):
|
||||
"""Fit Gaussian process classification model.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like of shape (n_samples, n_features) or list of object
|
||||
Feature vectors or other representations of training data.
|
||||
|
||||
y : array-like of shape (n_samples,)
|
||||
Target values, must be binary.
|
||||
|
||||
Returns
|
||||
-------
|
||||
self : returns an instance of self.
|
||||
"""
|
||||
if self.kernel is None: # Use an RBF kernel as default
|
||||
self.kernel_ = C(1.0, constant_value_bounds="fixed") * RBF(
|
||||
1.0, length_scale_bounds="fixed"
|
||||
)
|
||||
else:
|
||||
self.kernel_ = clone(self.kernel)
|
||||
|
||||
self.rng = check_random_state(self.random_state)
|
||||
|
||||
self.X_train_ = np.copy(X) if self.copy_X_train else X
|
||||
|
||||
# Encode class labels and check that it is a binary classification
|
||||
# problem
|
||||
label_encoder = LabelEncoder()
|
||||
self.y_train_ = label_encoder.fit_transform(y)
|
||||
self.classes_ = label_encoder.classes_
|
||||
if self.classes_.size > 2:
|
||||
raise ValueError(
|
||||
"%s supports only binary classification. y contains classes %s"
|
||||
% (self.__class__.__name__, self.classes_)
|
||||
)
|
||||
elif self.classes_.size == 1:
|
||||
raise ValueError(
|
||||
"{0:s} requires 2 classes; got {1:d} class".format(
|
||||
self.__class__.__name__, self.classes_.size
|
||||
)
|
||||
)
|
||||
|
||||
if self.optimizer is not None and self.kernel_.n_dims > 0:
|
||||
# Choose hyperparameters based on maximizing the log-marginal
|
||||
# likelihood (potentially starting from several initial values)
|
||||
def obj_func(theta, eval_gradient=True):
|
||||
if eval_gradient:
|
||||
lml, grad = self.log_marginal_likelihood(
|
||||
theta, eval_gradient=True, clone_kernel=False
|
||||
)
|
||||
return -lml, -grad
|
||||
else:
|
||||
return -self.log_marginal_likelihood(theta, clone_kernel=False)
|
||||
|
||||
# First optimize starting from theta specified in kernel
|
||||
optima = [
|
||||
self._constrained_optimization(
|
||||
obj_func, self.kernel_.theta, self.kernel_.bounds
|
||||
)
|
||||
]
|
||||
|
||||
# Additional runs are performed from log-uniform chosen initial
|
||||
# theta
|
||||
if self.n_restarts_optimizer > 0:
|
||||
if not np.isfinite(self.kernel_.bounds).all():
|
||||
raise ValueError(
|
||||
"Multiple optimizer restarts (n_restarts_optimizer>0) "
|
||||
"requires that all bounds are finite."
|
||||
)
|
||||
bounds = self.kernel_.bounds
|
||||
for iteration in range(self.n_restarts_optimizer):
|
||||
theta_initial = np.exp(self.rng.uniform(bounds[:, 0], bounds[:, 1]))
|
||||
optima.append(
|
||||
self._constrained_optimization(obj_func, theta_initial, bounds)
|
||||
)
|
||||
# Select result from run with minimal (negative) log-marginal
|
||||
# likelihood
|
||||
lml_values = list(map(itemgetter(1), optima))
|
||||
self.kernel_.theta = optima[np.argmin(lml_values)][0]
|
||||
self.kernel_._check_bounds_params()
|
||||
|
||||
self.log_marginal_likelihood_value_ = -np.min(lml_values)
|
||||
else:
|
||||
self.log_marginal_likelihood_value_ = self.log_marginal_likelihood(
|
||||
self.kernel_.theta
|
||||
)
|
||||
|
||||
# Precompute quantities required for predictions which are independent
|
||||
# of actual query points
|
||||
K = self.kernel_(self.X_train_)
|
||||
|
||||
_, (self.pi_, self.W_sr_, self.L_, _, _) = self._posterior_mode(
|
||||
K, return_temporaries=True
|
||||
)
|
||||
|
||||
return self
|
||||
|
||||
def predict(self, X):
|
||||
"""Perform classification on an array of test vectors X.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like of shape (n_samples, n_features) or list of object
|
||||
Query points where the GP is evaluated for classification.
|
||||
|
||||
Returns
|
||||
-------
|
||||
C : ndarray of shape (n_samples,)
|
||||
Predicted target values for X, values are from ``classes_``
|
||||
"""
|
||||
check_is_fitted(self)
|
||||
|
||||
# As discussed on Section 3.4.2 of GPML, for making hard binary
|
||||
# decisions, it is enough to compute the MAP of the posterior and
|
||||
# pass it through the link function
|
||||
K_star = self.kernel_(self.X_train_, X) # K_star =k(x_star)
|
||||
f_star = K_star.T.dot(self.y_train_ - self.pi_) # Algorithm 3.2,Line 4
|
||||
|
||||
return np.where(f_star > 0, self.classes_[1], self.classes_[0])
|
||||
|
||||
def predict_proba(self, X):
|
||||
"""Return probability estimates for the test vector X.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like of shape (n_samples, n_features) or list of object
|
||||
Query points where the GP is evaluated for classification.
|
||||
|
||||
Returns
|
||||
-------
|
||||
C : array-like of shape (n_samples, n_classes)
|
||||
Returns the probability of the samples for each class in
|
||||
the model. The columns correspond to the classes in sorted
|
||||
order, as they appear in the attribute ``classes_``.
|
||||
"""
|
||||
check_is_fitted(self)
|
||||
|
||||
# Based on Algorithm 3.2 of GPML
|
||||
K_star = self.kernel_(self.X_train_, X) # K_star =k(x_star)
|
||||
f_star = K_star.T.dot(self.y_train_ - self.pi_) # Line 4
|
||||
v = solve(self.L_, self.W_sr_[:, np.newaxis] * K_star) # Line 5
|
||||
# Line 6 (compute np.diag(v.T.dot(v)) via einsum)
|
||||
var_f_star = self.kernel_.diag(X) - np.einsum("ij,ij->j", v, v)
|
||||
|
||||
# Line 7:
|
||||
# Approximate \int log(z) * N(z | f_star, var_f_star)
|
||||
# Approximation is due to Williams & Barber, "Bayesian Classification
|
||||
# with Gaussian Processes", Appendix A: Approximate the logistic
|
||||
# sigmoid by a linear combination of 5 error functions.
|
||||
# For information on how this integral can be computed see
|
||||
# blitiri.blogspot.de/2012/11/gaussian-integral-of-error-function.html
|
||||
alpha = 1 / (2 * var_f_star)
|
||||
gamma = LAMBDAS * f_star
|
||||
integrals = (
|
||||
np.sqrt(np.pi / alpha)
|
||||
* erf(gamma * np.sqrt(alpha / (alpha + LAMBDAS**2)))
|
||||
/ (2 * np.sqrt(var_f_star * 2 * np.pi))
|
||||
)
|
||||
pi_star = (COEFS * integrals).sum(axis=0) + 0.5 * COEFS.sum()
|
||||
|
||||
return np.vstack((1 - pi_star, pi_star)).T
|
||||
|
||||
def log_marginal_likelihood(
|
||||
self, theta=None, eval_gradient=False, clone_kernel=True
|
||||
):
|
||||
"""Returns log-marginal likelihood of theta for training data.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
theta : array-like of shape (n_kernel_params,), default=None
|
||||
Kernel hyperparameters for which the log-marginal likelihood is
|
||||
evaluated. If None, the precomputed log_marginal_likelihood
|
||||
of ``self.kernel_.theta`` is returned.
|
||||
|
||||
eval_gradient : bool, default=False
|
||||
If True, the gradient of the log-marginal likelihood with respect
|
||||
to the kernel hyperparameters at position theta is returned
|
||||
additionally. If True, theta must not be None.
|
||||
|
||||
clone_kernel : bool, default=True
|
||||
If True, the kernel attribute is copied. If False, the kernel
|
||||
attribute is modified, but may result in a performance improvement.
|
||||
|
||||
Returns
|
||||
-------
|
||||
log_likelihood : float
|
||||
Log-marginal likelihood of theta for training data.
|
||||
|
||||
log_likelihood_gradient : ndarray of shape (n_kernel_params,), \
|
||||
optional
|
||||
Gradient of the log-marginal likelihood with respect to the kernel
|
||||
hyperparameters at position theta.
|
||||
Only returned when `eval_gradient` is True.
|
||||
"""
|
||||
if theta is None:
|
||||
if eval_gradient:
|
||||
raise ValueError("Gradient can only be evaluated for theta!=None")
|
||||
return self.log_marginal_likelihood_value_
|
||||
|
||||
if clone_kernel:
|
||||
kernel = self.kernel_.clone_with_theta(theta)
|
||||
else:
|
||||
kernel = self.kernel_
|
||||
kernel.theta = theta
|
||||
|
||||
if eval_gradient:
|
||||
K, K_gradient = kernel(self.X_train_, eval_gradient=True)
|
||||
else:
|
||||
K = kernel(self.X_train_)
|
||||
|
||||
# Compute log-marginal-likelihood Z and also store some temporaries
|
||||
# which can be reused for computing Z's gradient
|
||||
Z, (pi, W_sr, L, b, a) = self._posterior_mode(K, return_temporaries=True)
|
||||
|
||||
if not eval_gradient:
|
||||
return Z
|
||||
|
||||
# Compute gradient based on Algorithm 5.1 of GPML
|
||||
d_Z = np.empty(theta.shape[0])
|
||||
# XXX: Get rid of the np.diag() in the next line
|
||||
R = W_sr[:, np.newaxis] * cho_solve((L, True), np.diag(W_sr)) # Line 7
|
||||
C = solve(L, W_sr[:, np.newaxis] * K) # Line 8
|
||||
# Line 9: (use einsum to compute np.diag(C.T.dot(C))))
|
||||
s_2 = (
|
||||
-0.5
|
||||
* (np.diag(K) - np.einsum("ij, ij -> j", C, C))
|
||||
* (pi * (1 - pi) * (1 - 2 * pi))
|
||||
) # third derivative
|
||||
|
||||
for j in range(d_Z.shape[0]):
|
||||
C = K_gradient[:, :, j] # Line 11
|
||||
# Line 12: (R.T.ravel().dot(C.ravel()) = np.trace(R.dot(C)))
|
||||
s_1 = 0.5 * a.T.dot(C).dot(a) - 0.5 * R.T.ravel().dot(C.ravel())
|
||||
|
||||
b = C.dot(self.y_train_ - pi) # Line 13
|
||||
s_3 = b - K.dot(R.dot(b)) # Line 14
|
||||
|
||||
d_Z[j] = s_1 + s_2.T.dot(s_3) # Line 15
|
||||
|
||||
return Z, d_Z
|
||||
|
||||
def _posterior_mode(self, K, return_temporaries=False):
|
||||
"""Mode-finding for binary Laplace GPC and fixed kernel.
|
||||
|
||||
This approximates the posterior of the latent function values for given
|
||||
inputs and target observations with a Gaussian approximation and uses
|
||||
Newton's iteration to find the mode of this approximation.
|
||||
"""
|
||||
# Based on Algorithm 3.1 of GPML
|
||||
|
||||
# If warm_start are enabled, we reuse the last solution for the
|
||||
# posterior mode as initialization; otherwise, we initialize with 0
|
||||
if (
|
||||
self.warm_start
|
||||
and hasattr(self, "f_cached")
|
||||
and self.f_cached.shape == self.y_train_.shape
|
||||
):
|
||||
f = self.f_cached
|
||||
else:
|
||||
f = np.zeros_like(self.y_train_, dtype=np.float64)
|
||||
|
||||
# Use Newton's iteration method to find mode of Laplace approximation
|
||||
log_marginal_likelihood = -np.inf
|
||||
for _ in range(self.max_iter_predict):
|
||||
# Line 4
|
||||
pi = expit(f)
|
||||
W = pi * (1 - pi)
|
||||
# Line 5
|
||||
W_sr = np.sqrt(W)
|
||||
W_sr_K = W_sr[:, np.newaxis] * K
|
||||
B = np.eye(W.shape[0]) + W_sr_K * W_sr
|
||||
L = cholesky(B, lower=True)
|
||||
# Line 6
|
||||
b = W * f + (self.y_train_ - pi)
|
||||
# Line 7
|
||||
a = b - W_sr * cho_solve((L, True), W_sr_K.dot(b))
|
||||
# Line 8
|
||||
f = K.dot(a)
|
||||
|
||||
# Line 10: Compute log marginal likelihood in loop and use as
|
||||
# convergence criterion
|
||||
lml = (
|
||||
-0.5 * a.T.dot(f)
|
||||
- np.log1p(np.exp(-(self.y_train_ * 2 - 1) * f)).sum()
|
||||
- np.log(np.diag(L)).sum()
|
||||
)
|
||||
# Check if we have converged (log marginal likelihood does
|
||||
# not decrease)
|
||||
# XXX: more complex convergence criterion
|
||||
if lml - log_marginal_likelihood < 1e-10:
|
||||
break
|
||||
log_marginal_likelihood = lml
|
||||
|
||||
self.f_cached = f # Remember solution for later warm-starts
|
||||
if return_temporaries:
|
||||
return log_marginal_likelihood, (pi, W_sr, L, b, a)
|
||||
else:
|
||||
return log_marginal_likelihood
|
||||
|
||||
def _constrained_optimization(self, obj_func, initial_theta, bounds):
|
||||
if self.optimizer == "fmin_l_bfgs_b":
|
||||
opt_res = scipy.optimize.minimize(
|
||||
obj_func, initial_theta, method="L-BFGS-B", jac=True, bounds=bounds
|
||||
)
|
||||
_check_optimize_result("lbfgs", opt_res)
|
||||
theta_opt, func_min = opt_res.x, opt_res.fun
|
||||
elif callable(self.optimizer):
|
||||
theta_opt, func_min = self.optimizer(obj_func, initial_theta, bounds=bounds)
|
||||
else:
|
||||
raise ValueError("Unknown optimizer %s." % self.optimizer)
|
||||
|
||||
return theta_opt, func_min
|
||||
|
||||
|
||||
class GaussianProcessClassifier(ClassifierMixin, BaseEstimator):
|
||||
"""Gaussian process classification (GPC) based on Laplace approximation.
|
||||
|
||||
The implementation is based on Algorithm 3.1, 3.2, and 5.1 from [RW2006]_.
|
||||
|
||||
Internally, the Laplace approximation is used for approximating the
|
||||
non-Gaussian posterior by a Gaussian.
|
||||
|
||||
Currently, the implementation is restricted to using the logistic link
|
||||
function. For multi-class classification, several binary one-versus rest
|
||||
classifiers are fitted. Note that this class thus does not implement
|
||||
a true multi-class Laplace approximation.
|
||||
|
||||
Read more in the :ref:`User Guide <gaussian_process>`.
|
||||
|
||||
.. versionadded:: 0.18
|
||||
|
||||
Parameters
|
||||
----------
|
||||
kernel : kernel instance, default=None
|
||||
The kernel specifying the covariance function of the GP. If None is
|
||||
passed, the kernel "1.0 * RBF(1.0)" is used as default. Note that
|
||||
the kernel's hyperparameters are optimized during fitting. Also kernel
|
||||
cannot be a `CompoundKernel`.
|
||||
|
||||
optimizer : 'fmin_l_bfgs_b', callable or None, default='fmin_l_bfgs_b'
|
||||
Can either be one of the internally supported optimizers for optimizing
|
||||
the kernel's parameters, specified by a string, or an externally
|
||||
defined optimizer passed as a callable. If a callable is passed, it
|
||||
must have the signature::
|
||||
|
||||
def optimizer(obj_func, initial_theta, bounds):
|
||||
# * 'obj_func' is the objective function to be maximized, which
|
||||
# takes the hyperparameters theta as parameter and an
|
||||
# optional flag eval_gradient, which determines if the
|
||||
# gradient is returned additionally to the function value
|
||||
# * 'initial_theta': the initial value for theta, which can be
|
||||
# used by local optimizers
|
||||
# * 'bounds': the bounds on the values of theta
|
||||
....
|
||||
# Returned are the best found hyperparameters theta and
|
||||
# the corresponding value of the target function.
|
||||
return theta_opt, func_min
|
||||
|
||||
Per default, the 'L-BFGS-B' algorithm from scipy.optimize.minimize
|
||||
is used. If None is passed, the kernel's parameters are kept fixed.
|
||||
Available internal optimizers are::
|
||||
|
||||
'fmin_l_bfgs_b'
|
||||
|
||||
n_restarts_optimizer : int, default=0
|
||||
The number of restarts of the optimizer for finding the kernel's
|
||||
parameters which maximize the log-marginal likelihood. The first run
|
||||
of the optimizer is performed from the kernel's initial parameters,
|
||||
the remaining ones (if any) from thetas sampled log-uniform randomly
|
||||
from the space of allowed theta-values. If greater than 0, all bounds
|
||||
must be finite. Note that n_restarts_optimizer=0 implies that one
|
||||
run is performed.
|
||||
|
||||
max_iter_predict : int, default=100
|
||||
The maximum number of iterations in Newton's method for approximating
|
||||
the posterior during predict. Smaller values will reduce computation
|
||||
time at the cost of worse results.
|
||||
|
||||
warm_start : bool, default=False
|
||||
If warm-starts are enabled, the solution of the last Newton iteration
|
||||
on the Laplace approximation of the posterior mode is used as
|
||||
initialization for the next call of _posterior_mode(). This can speed
|
||||
up convergence when _posterior_mode is called several times on similar
|
||||
problems as in hyperparameter optimization. See :term:`the Glossary
|
||||
<warm_start>`.
|
||||
|
||||
copy_X_train : bool, default=True
|
||||
If True, a persistent copy of the training data is stored in the
|
||||
object. Otherwise, just a reference to the training data is stored,
|
||||
which might cause predictions to change if the data is modified
|
||||
externally.
|
||||
|
||||
random_state : int, RandomState instance or None, default=None
|
||||
Determines random number generation used to initialize the centers.
|
||||
Pass an int for reproducible results across multiple function calls.
|
||||
See :term:`Glossary <random_state>`.
|
||||
|
||||
multi_class : {'one_vs_rest', 'one_vs_one'}, default='one_vs_rest'
|
||||
Specifies how multi-class classification problems are handled.
|
||||
Supported are 'one_vs_rest' and 'one_vs_one'. In 'one_vs_rest',
|
||||
one binary Gaussian process classifier is fitted for each class, which
|
||||
is trained to separate this class from the rest. In 'one_vs_one', one
|
||||
binary Gaussian process classifier is fitted for each pair of classes,
|
||||
which is trained to separate these two classes. The predictions of
|
||||
these binary predictors are combined into multi-class predictions.
|
||||
Note that 'one_vs_one' does not support predicting probability
|
||||
estimates.
|
||||
|
||||
n_jobs : int, default=None
|
||||
The number of jobs to use for the computation: the specified
|
||||
multiclass problems are computed in parallel.
|
||||
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
|
||||
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
|
||||
for more details.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
base_estimator_ : ``Estimator`` instance
|
||||
The estimator instance that defines the likelihood function
|
||||
using the observed data.
|
||||
|
||||
kernel_ : kernel instance
|
||||
The kernel used for prediction. In case of binary classification,
|
||||
the structure of the kernel is the same as the one passed as parameter
|
||||
but with optimized hyperparameters. In case of multi-class
|
||||
classification, a CompoundKernel is returned which consists of the
|
||||
different kernels used in the one-versus-rest classifiers.
|
||||
|
||||
log_marginal_likelihood_value_ : float
|
||||
The log-marginal-likelihood of ``self.kernel_.theta``
|
||||
|
||||
classes_ : array-like of shape (n_classes,)
|
||||
Unique class labels.
|
||||
|
||||
n_classes_ : int
|
||||
The number of classes in the training data
|
||||
|
||||
n_features_in_ : int
|
||||
Number of features seen during :term:`fit`.
|
||||
|
||||
.. versionadded:: 0.24
|
||||
|
||||
feature_names_in_ : ndarray of shape (`n_features_in_`,)
|
||||
Names of features seen during :term:`fit`. Defined only when `X`
|
||||
has feature names that are all strings.
|
||||
|
||||
.. versionadded:: 1.0
|
||||
|
||||
See Also
|
||||
--------
|
||||
GaussianProcessRegressor : Gaussian process regression (GPR).
|
||||
|
||||
References
|
||||
----------
|
||||
.. [RW2006] `Carl E. Rasmussen and Christopher K.I. Williams,
|
||||
"Gaussian Processes for Machine Learning",
|
||||
MIT Press 2006 <https://www.gaussianprocess.org/gpml/chapters/RW.pdf>`_
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> from sklearn.datasets import load_iris
|
||||
>>> from sklearn.gaussian_process import GaussianProcessClassifier
|
||||
>>> from sklearn.gaussian_process.kernels import RBF
|
||||
>>> X, y = load_iris(return_X_y=True)
|
||||
>>> kernel = 1.0 * RBF(1.0)
|
||||
>>> gpc = GaussianProcessClassifier(kernel=kernel,
|
||||
... random_state=0).fit(X, y)
|
||||
>>> gpc.score(X, y)
|
||||
0.9866...
|
||||
>>> gpc.predict_proba(X[:2,:])
|
||||
array([[0.83548752, 0.03228706, 0.13222543],
|
||||
[0.79064206, 0.06525643, 0.14410151]])
|
||||
"""
|
||||
|
||||
_parameter_constraints: dict = {
|
||||
"kernel": [Kernel, None],
|
||||
"optimizer": [StrOptions({"fmin_l_bfgs_b"}), callable, None],
|
||||
"n_restarts_optimizer": [Interval(Integral, 0, None, closed="left")],
|
||||
"max_iter_predict": [Interval(Integral, 1, None, closed="left")],
|
||||
"warm_start": ["boolean"],
|
||||
"copy_X_train": ["boolean"],
|
||||
"random_state": ["random_state"],
|
||||
"multi_class": [StrOptions({"one_vs_rest", "one_vs_one"})],
|
||||
"n_jobs": [Integral, None],
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
kernel=None,
|
||||
*,
|
||||
optimizer="fmin_l_bfgs_b",
|
||||
n_restarts_optimizer=0,
|
||||
max_iter_predict=100,
|
||||
warm_start=False,
|
||||
copy_X_train=True,
|
||||
random_state=None,
|
||||
multi_class="one_vs_rest",
|
||||
n_jobs=None,
|
||||
):
|
||||
self.kernel = kernel
|
||||
self.optimizer = optimizer
|
||||
self.n_restarts_optimizer = n_restarts_optimizer
|
||||
self.max_iter_predict = max_iter_predict
|
||||
self.warm_start = warm_start
|
||||
self.copy_X_train = copy_X_train
|
||||
self.random_state = random_state
|
||||
self.multi_class = multi_class
|
||||
self.n_jobs = n_jobs
|
||||
|
||||
@_fit_context(prefer_skip_nested_validation=True)
|
||||
def fit(self, X, y):
|
||||
"""Fit Gaussian process classification model.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like of shape (n_samples, n_features) or list of object
|
||||
Feature vectors or other representations of training data.
|
||||
|
||||
y : array-like of shape (n_samples,)
|
||||
Target values, must be binary.
|
||||
|
||||
Returns
|
||||
-------
|
||||
self : object
|
||||
Returns an instance of self.
|
||||
"""
|
||||
if isinstance(self.kernel, CompoundKernel):
|
||||
raise ValueError("kernel cannot be a CompoundKernel")
|
||||
|
||||
if self.kernel is None or self.kernel.requires_vector_input:
|
||||
X, y = self._validate_data(
|
||||
X, y, multi_output=False, ensure_2d=True, dtype="numeric"
|
||||
)
|
||||
else:
|
||||
X, y = self._validate_data(
|
||||
X, y, multi_output=False, ensure_2d=False, dtype=None
|
||||
)
|
||||
|
||||
self.base_estimator_ = _BinaryGaussianProcessClassifierLaplace(
|
||||
kernel=self.kernel,
|
||||
optimizer=self.optimizer,
|
||||
n_restarts_optimizer=self.n_restarts_optimizer,
|
||||
max_iter_predict=self.max_iter_predict,
|
||||
warm_start=self.warm_start,
|
||||
copy_X_train=self.copy_X_train,
|
||||
random_state=self.random_state,
|
||||
)
|
||||
|
||||
self.classes_ = np.unique(y)
|
||||
self.n_classes_ = self.classes_.size
|
||||
if self.n_classes_ == 1:
|
||||
raise ValueError(
|
||||
"GaussianProcessClassifier requires 2 or more "
|
||||
"distinct classes; got %d class (only class %s "
|
||||
"is present)" % (self.n_classes_, self.classes_[0])
|
||||
)
|
||||
if self.n_classes_ > 2:
|
||||
if self.multi_class == "one_vs_rest":
|
||||
self.base_estimator_ = OneVsRestClassifier(
|
||||
self.base_estimator_, n_jobs=self.n_jobs
|
||||
)
|
||||
elif self.multi_class == "one_vs_one":
|
||||
self.base_estimator_ = OneVsOneClassifier(
|
||||
self.base_estimator_, n_jobs=self.n_jobs
|
||||
)
|
||||
else:
|
||||
raise ValueError("Unknown multi-class mode %s" % self.multi_class)
|
||||
|
||||
self.base_estimator_.fit(X, y)
|
||||
|
||||
if self.n_classes_ > 2:
|
||||
self.log_marginal_likelihood_value_ = np.mean(
|
||||
[
|
||||
estimator.log_marginal_likelihood()
|
||||
for estimator in self.base_estimator_.estimators_
|
||||
]
|
||||
)
|
||||
else:
|
||||
self.log_marginal_likelihood_value_ = (
|
||||
self.base_estimator_.log_marginal_likelihood()
|
||||
)
|
||||
|
||||
return self
|
||||
|
||||
def predict(self, X):
|
||||
"""Perform classification on an array of test vectors X.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like of shape (n_samples, n_features) or list of object
|
||||
Query points where the GP is evaluated for classification.
|
||||
|
||||
Returns
|
||||
-------
|
||||
C : ndarray of shape (n_samples,)
|
||||
Predicted target values for X, values are from ``classes_``.
|
||||
"""
|
||||
check_is_fitted(self)
|
||||
|
||||
if self.kernel is None or self.kernel.requires_vector_input:
|
||||
X = self._validate_data(X, ensure_2d=True, dtype="numeric", reset=False)
|
||||
else:
|
||||
X = self._validate_data(X, ensure_2d=False, dtype=None, reset=False)
|
||||
|
||||
return self.base_estimator_.predict(X)
|
||||
|
||||
def predict_proba(self, X):
|
||||
"""Return probability estimates for the test vector X.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like of shape (n_samples, n_features) or list of object
|
||||
Query points where the GP is evaluated for classification.
|
||||
|
||||
Returns
|
||||
-------
|
||||
C : array-like of shape (n_samples, n_classes)
|
||||
Returns the probability of the samples for each class in
|
||||
the model. The columns correspond to the classes in sorted
|
||||
order, as they appear in the attribute :term:`classes_`.
|
||||
"""
|
||||
check_is_fitted(self)
|
||||
if self.n_classes_ > 2 and self.multi_class == "one_vs_one":
|
||||
raise ValueError(
|
||||
"one_vs_one multi-class mode does not support "
|
||||
"predicting probability estimates. Use "
|
||||
"one_vs_rest mode instead."
|
||||
)
|
||||
|
||||
if self.kernel is None or self.kernel.requires_vector_input:
|
||||
X = self._validate_data(X, ensure_2d=True, dtype="numeric", reset=False)
|
||||
else:
|
||||
X = self._validate_data(X, ensure_2d=False, dtype=None, reset=False)
|
||||
|
||||
return self.base_estimator_.predict_proba(X)
|
||||
|
||||
@property
|
||||
def kernel_(self):
|
||||
"""Return the kernel of the base estimator."""
|
||||
if self.n_classes_ == 2:
|
||||
return self.base_estimator_.kernel_
|
||||
else:
|
||||
return CompoundKernel(
|
||||
[estimator.kernel_ for estimator in self.base_estimator_.estimators_]
|
||||
)
|
||||
|
||||
def log_marginal_likelihood(
|
||||
self, theta=None, eval_gradient=False, clone_kernel=True
|
||||
):
|
||||
"""Return log-marginal likelihood of theta for training data.
|
||||
|
||||
In the case of multi-class classification, the mean log-marginal
|
||||
likelihood of the one-versus-rest classifiers are returned.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
theta : array-like of shape (n_kernel_params,), default=None
|
||||
Kernel hyperparameters for which the log-marginal likelihood is
|
||||
evaluated. In the case of multi-class classification, theta may
|
||||
be the hyperparameters of the compound kernel or of an individual
|
||||
kernel. In the latter case, all individual kernel get assigned the
|
||||
same theta values. If None, the precomputed log_marginal_likelihood
|
||||
of ``self.kernel_.theta`` is returned.
|
||||
|
||||
eval_gradient : bool, default=False
|
||||
If True, the gradient of the log-marginal likelihood with respect
|
||||
to the kernel hyperparameters at position theta is returned
|
||||
additionally. Note that gradient computation is not supported
|
||||
for non-binary classification. If True, theta must not be None.
|
||||
|
||||
clone_kernel : bool, default=True
|
||||
If True, the kernel attribute is copied. If False, the kernel
|
||||
attribute is modified, but may result in a performance improvement.
|
||||
|
||||
Returns
|
||||
-------
|
||||
log_likelihood : float
|
||||
Log-marginal likelihood of theta for training data.
|
||||
|
||||
log_likelihood_gradient : ndarray of shape (n_kernel_params,), optional
|
||||
Gradient of the log-marginal likelihood with respect to the kernel
|
||||
hyperparameters at position theta.
|
||||
Only returned when `eval_gradient` is True.
|
||||
"""
|
||||
check_is_fitted(self)
|
||||
|
||||
if theta is None:
|
||||
if eval_gradient:
|
||||
raise ValueError("Gradient can only be evaluated for theta!=None")
|
||||
return self.log_marginal_likelihood_value_
|
||||
|
||||
theta = np.asarray(theta)
|
||||
if self.n_classes_ == 2:
|
||||
return self.base_estimator_.log_marginal_likelihood(
|
||||
theta, eval_gradient, clone_kernel=clone_kernel
|
||||
)
|
||||
else:
|
||||
if eval_gradient:
|
||||
raise NotImplementedError(
|
||||
"Gradient of log-marginal-likelihood not implemented for "
|
||||
"multi-class GPC."
|
||||
)
|
||||
estimators = self.base_estimator_.estimators_
|
||||
n_dims = estimators[0].kernel_.n_dims
|
||||
if theta.shape[0] == n_dims: # use same theta for all sub-kernels
|
||||
return np.mean(
|
||||
[
|
||||
estimator.log_marginal_likelihood(
|
||||
theta, clone_kernel=clone_kernel
|
||||
)
|
||||
for i, estimator in enumerate(estimators)
|
||||
]
|
||||
)
|
||||
elif theta.shape[0] == n_dims * self.classes_.shape[0]:
|
||||
# theta for compound kernel
|
||||
return np.mean(
|
||||
[
|
||||
estimator.log_marginal_likelihood(
|
||||
theta[n_dims * i : n_dims * (i + 1)],
|
||||
clone_kernel=clone_kernel,
|
||||
)
|
||||
for i, estimator in enumerate(estimators)
|
||||
]
|
||||
)
|
||||
else:
|
||||
raise ValueError(
|
||||
"Shape of theta must be either %d or %d. "
|
||||
"Obtained theta with shape %d."
|
||||
% (n_dims, n_dims * self.classes_.shape[0], theta.shape[0])
|
||||
)
|
||||
@ -0,0 +1,669 @@
|
||||
"""Gaussian processes regression."""
|
||||
|
||||
# Authors: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
|
||||
# Modified by: Pete Green <p.l.green@liverpool.ac.uk>
|
||||
# License: BSD 3 clause
|
||||
|
||||
import warnings
|
||||
from numbers import Integral, Real
|
||||
from operator import itemgetter
|
||||
|
||||
import numpy as np
|
||||
import scipy.optimize
|
||||
from scipy.linalg import cho_solve, cholesky, solve_triangular
|
||||
|
||||
from ..base import BaseEstimator, MultiOutputMixin, RegressorMixin, _fit_context, clone
|
||||
from ..preprocessing._data import _handle_zeros_in_scale
|
||||
from ..utils import check_random_state
|
||||
from ..utils._param_validation import Interval, StrOptions
|
||||
from ..utils.optimize import _check_optimize_result
|
||||
from .kernels import RBF, Kernel
|
||||
from .kernels import ConstantKernel as C
|
||||
|
||||
GPR_CHOLESKY_LOWER = True
|
||||
|
||||
|
||||
class GaussianProcessRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
||||
"""Gaussian process regression (GPR).
|
||||
|
||||
The implementation is based on Algorithm 2.1 of [RW2006]_.
|
||||
|
||||
In addition to standard scikit-learn estimator API,
|
||||
:class:`GaussianProcessRegressor`:
|
||||
|
||||
* allows prediction without prior fitting (based on the GP prior)
|
||||
* provides an additional method `sample_y(X)`, which evaluates samples
|
||||
drawn from the GPR (prior or posterior) at given inputs
|
||||
* exposes a method `log_marginal_likelihood(theta)`, which can be used
|
||||
externally for other ways of selecting hyperparameters, e.g., via
|
||||
Markov chain Monte Carlo.
|
||||
|
||||
To learn the difference between a point-estimate approach vs. a more
|
||||
Bayesian modelling approach, refer to the example entitled
|
||||
:ref:`sphx_glr_auto_examples_gaussian_process_plot_compare_gpr_krr.py`.
|
||||
|
||||
Read more in the :ref:`User Guide <gaussian_process>`.
|
||||
|
||||
.. versionadded:: 0.18
|
||||
|
||||
Parameters
|
||||
----------
|
||||
kernel : kernel instance, default=None
|
||||
The kernel specifying the covariance function of the GP. If None is
|
||||
passed, the kernel ``ConstantKernel(1.0, constant_value_bounds="fixed")
|
||||
* RBF(1.0, length_scale_bounds="fixed")`` is used as default. Note that
|
||||
the kernel hyperparameters are optimized during fitting unless the
|
||||
bounds are marked as "fixed".
|
||||
|
||||
alpha : float or ndarray of shape (n_samples,), default=1e-10
|
||||
Value added to the diagonal of the kernel matrix during fitting.
|
||||
This can prevent a potential numerical issue during fitting, by
|
||||
ensuring that the calculated values form a positive definite matrix.
|
||||
It can also be interpreted as the variance of additional Gaussian
|
||||
measurement noise on the training observations. Note that this is
|
||||
different from using a `WhiteKernel`. If an array is passed, it must
|
||||
have the same number of entries as the data used for fitting and is
|
||||
used as datapoint-dependent noise level. Allowing to specify the
|
||||
noise level directly as a parameter is mainly for convenience and
|
||||
for consistency with :class:`~sklearn.linear_model.Ridge`.
|
||||
|
||||
optimizer : "fmin_l_bfgs_b", callable or None, default="fmin_l_bfgs_b"
|
||||
Can either be one of the internally supported optimizers for optimizing
|
||||
the kernel's parameters, specified by a string, or an externally
|
||||
defined optimizer passed as a callable. If a callable is passed, it
|
||||
must have the signature::
|
||||
|
||||
def optimizer(obj_func, initial_theta, bounds):
|
||||
# * 'obj_func': the objective function to be minimized, which
|
||||
# takes the hyperparameters theta as a parameter and an
|
||||
# optional flag eval_gradient, which determines if the
|
||||
# gradient is returned additionally to the function value
|
||||
# * 'initial_theta': the initial value for theta, which can be
|
||||
# used by local optimizers
|
||||
# * 'bounds': the bounds on the values of theta
|
||||
....
|
||||
# Returned are the best found hyperparameters theta and
|
||||
# the corresponding value of the target function.
|
||||
return theta_opt, func_min
|
||||
|
||||
Per default, the L-BFGS-B algorithm from `scipy.optimize.minimize`
|
||||
is used. If None is passed, the kernel's parameters are kept fixed.
|
||||
Available internal optimizers are: `{'fmin_l_bfgs_b'}`.
|
||||
|
||||
n_restarts_optimizer : int, default=0
|
||||
The number of restarts of the optimizer for finding the kernel's
|
||||
parameters which maximize the log-marginal likelihood. The first run
|
||||
of the optimizer is performed from the kernel's initial parameters,
|
||||
the remaining ones (if any) from thetas sampled log-uniform randomly
|
||||
from the space of allowed theta-values. If greater than 0, all bounds
|
||||
must be finite. Note that `n_restarts_optimizer == 0` implies that one
|
||||
run is performed.
|
||||
|
||||
normalize_y : bool, default=False
|
||||
Whether or not to normalize the target values `y` by removing the mean
|
||||
and scaling to unit-variance. This is recommended for cases where
|
||||
zero-mean, unit-variance priors are used. Note that, in this
|
||||
implementation, the normalisation is reversed before the GP predictions
|
||||
are reported.
|
||||
|
||||
.. versionchanged:: 0.23
|
||||
|
||||
copy_X_train : bool, default=True
|
||||
If True, a persistent copy of the training data is stored in the
|
||||
object. Otherwise, just a reference to the training data is stored,
|
||||
which might cause predictions to change if the data is modified
|
||||
externally.
|
||||
|
||||
n_targets : int, default=None
|
||||
The number of dimensions of the target values. Used to decide the number
|
||||
of outputs when sampling from the prior distributions (i.e. calling
|
||||
:meth:`sample_y` before :meth:`fit`). This parameter is ignored once
|
||||
:meth:`fit` has been called.
|
||||
|
||||
.. versionadded:: 1.3
|
||||
|
||||
random_state : int, RandomState instance or None, default=None
|
||||
Determines random number generation used to initialize the centers.
|
||||
Pass an int for reproducible results across multiple function calls.
|
||||
See :term:`Glossary <random_state>`.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
X_train_ : array-like of shape (n_samples, n_features) or list of object
|
||||
Feature vectors or other representations of training data (also
|
||||
required for prediction).
|
||||
|
||||
y_train_ : array-like of shape (n_samples,) or (n_samples, n_targets)
|
||||
Target values in training data (also required for prediction).
|
||||
|
||||
kernel_ : kernel instance
|
||||
The kernel used for prediction. The structure of the kernel is the
|
||||
same as the one passed as parameter but with optimized hyperparameters.
|
||||
|
||||
L_ : array-like of shape (n_samples, n_samples)
|
||||
Lower-triangular Cholesky decomposition of the kernel in ``X_train_``.
|
||||
|
||||
alpha_ : array-like of shape (n_samples,)
|
||||
Dual coefficients of training data points in kernel space.
|
||||
|
||||
log_marginal_likelihood_value_ : float
|
||||
The log-marginal-likelihood of ``self.kernel_.theta``.
|
||||
|
||||
n_features_in_ : int
|
||||
Number of features seen during :term:`fit`.
|
||||
|
||||
.. versionadded:: 0.24
|
||||
|
||||
feature_names_in_ : ndarray of shape (`n_features_in_`,)
|
||||
Names of features seen during :term:`fit`. Defined only when `X`
|
||||
has feature names that are all strings.
|
||||
|
||||
.. versionadded:: 1.0
|
||||
|
||||
See Also
|
||||
--------
|
||||
GaussianProcessClassifier : Gaussian process classification (GPC)
|
||||
based on Laplace approximation.
|
||||
|
||||
References
|
||||
----------
|
||||
.. [RW2006] `Carl E. Rasmussen and Christopher K.I. Williams,
|
||||
"Gaussian Processes for Machine Learning",
|
||||
MIT Press 2006 <https://www.gaussianprocess.org/gpml/chapters/RW.pdf>`_
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> from sklearn.datasets import make_friedman2
|
||||
>>> from sklearn.gaussian_process import GaussianProcessRegressor
|
||||
>>> from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel
|
||||
>>> X, y = make_friedman2(n_samples=500, noise=0, random_state=0)
|
||||
>>> kernel = DotProduct() + WhiteKernel()
|
||||
>>> gpr = GaussianProcessRegressor(kernel=kernel,
|
||||
... random_state=0).fit(X, y)
|
||||
>>> gpr.score(X, y)
|
||||
0.3680...
|
||||
>>> gpr.predict(X[:2,:], return_std=True)
|
||||
(array([653.0..., 592.1...]), array([316.6..., 316.6...]))
|
||||
"""
|
||||
|
||||
_parameter_constraints: dict = {
|
||||
"kernel": [None, Kernel],
|
||||
"alpha": [Interval(Real, 0, None, closed="left"), np.ndarray],
|
||||
"optimizer": [StrOptions({"fmin_l_bfgs_b"}), callable, None],
|
||||
"n_restarts_optimizer": [Interval(Integral, 0, None, closed="left")],
|
||||
"normalize_y": ["boolean"],
|
||||
"copy_X_train": ["boolean"],
|
||||
"n_targets": [Interval(Integral, 1, None, closed="left"), None],
|
||||
"random_state": ["random_state"],
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
kernel=None,
|
||||
*,
|
||||
alpha=1e-10,
|
||||
optimizer="fmin_l_bfgs_b",
|
||||
n_restarts_optimizer=0,
|
||||
normalize_y=False,
|
||||
copy_X_train=True,
|
||||
n_targets=None,
|
||||
random_state=None,
|
||||
):
|
||||
self.kernel = kernel
|
||||
self.alpha = alpha
|
||||
self.optimizer = optimizer
|
||||
self.n_restarts_optimizer = n_restarts_optimizer
|
||||
self.normalize_y = normalize_y
|
||||
self.copy_X_train = copy_X_train
|
||||
self.n_targets = n_targets
|
||||
self.random_state = random_state
|
||||
|
||||
@_fit_context(prefer_skip_nested_validation=True)
|
||||
def fit(self, X, y):
|
||||
"""Fit Gaussian process regression model.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like of shape (n_samples, n_features) or list of object
|
||||
Feature vectors or other representations of training data.
|
||||
|
||||
y : array-like of shape (n_samples,) or (n_samples, n_targets)
|
||||
Target values.
|
||||
|
||||
Returns
|
||||
-------
|
||||
self : object
|
||||
GaussianProcessRegressor class instance.
|
||||
"""
|
||||
if self.kernel is None: # Use an RBF kernel as default
|
||||
self.kernel_ = C(1.0, constant_value_bounds="fixed") * RBF(
|
||||
1.0, length_scale_bounds="fixed"
|
||||
)
|
||||
else:
|
||||
self.kernel_ = clone(self.kernel)
|
||||
|
||||
self._rng = check_random_state(self.random_state)
|
||||
|
||||
if self.kernel_.requires_vector_input:
|
||||
dtype, ensure_2d = "numeric", True
|
||||
else:
|
||||
dtype, ensure_2d = None, False
|
||||
X, y = self._validate_data(
|
||||
X,
|
||||
y,
|
||||
multi_output=True,
|
||||
y_numeric=True,
|
||||
ensure_2d=ensure_2d,
|
||||
dtype=dtype,
|
||||
)
|
||||
|
||||
n_targets_seen = y.shape[1] if y.ndim > 1 else 1
|
||||
if self.n_targets is not None and n_targets_seen != self.n_targets:
|
||||
raise ValueError(
|
||||
"The number of targets seen in `y` is different from the parameter "
|
||||
f"`n_targets`. Got {n_targets_seen} != {self.n_targets}."
|
||||
)
|
||||
|
||||
# Normalize target value
|
||||
if self.normalize_y:
|
||||
self._y_train_mean = np.mean(y, axis=0)
|
||||
self._y_train_std = _handle_zeros_in_scale(np.std(y, axis=0), copy=False)
|
||||
|
||||
# Remove mean and make unit variance
|
||||
y = (y - self._y_train_mean) / self._y_train_std
|
||||
|
||||
else:
|
||||
shape_y_stats = (y.shape[1],) if y.ndim == 2 else 1
|
||||
self._y_train_mean = np.zeros(shape=shape_y_stats)
|
||||
self._y_train_std = np.ones(shape=shape_y_stats)
|
||||
|
||||
if np.iterable(self.alpha) and self.alpha.shape[0] != y.shape[0]:
|
||||
if self.alpha.shape[0] == 1:
|
||||
self.alpha = self.alpha[0]
|
||||
else:
|
||||
raise ValueError(
|
||||
"alpha must be a scalar or an array with same number of "
|
||||
f"entries as y. ({self.alpha.shape[0]} != {y.shape[0]})"
|
||||
)
|
||||
|
||||
self.X_train_ = np.copy(X) if self.copy_X_train else X
|
||||
self.y_train_ = np.copy(y) if self.copy_X_train else y
|
||||
|
||||
if self.optimizer is not None and self.kernel_.n_dims > 0:
|
||||
# Choose hyperparameters based on maximizing the log-marginal
|
||||
# likelihood (potentially starting from several initial values)
|
||||
def obj_func(theta, eval_gradient=True):
|
||||
if eval_gradient:
|
||||
lml, grad = self.log_marginal_likelihood(
|
||||
theta, eval_gradient=True, clone_kernel=False
|
||||
)
|
||||
return -lml, -grad
|
||||
else:
|
||||
return -self.log_marginal_likelihood(theta, clone_kernel=False)
|
||||
|
||||
# First optimize starting from theta specified in kernel
|
||||
optima = [
|
||||
(
|
||||
self._constrained_optimization(
|
||||
obj_func, self.kernel_.theta, self.kernel_.bounds
|
||||
)
|
||||
)
|
||||
]
|
||||
|
||||
# Additional runs are performed from log-uniform chosen initial
|
||||
# theta
|
||||
if self.n_restarts_optimizer > 0:
|
||||
if not np.isfinite(self.kernel_.bounds).all():
|
||||
raise ValueError(
|
||||
"Multiple optimizer restarts (n_restarts_optimizer>0) "
|
||||
"requires that all bounds are finite."
|
||||
)
|
||||
bounds = self.kernel_.bounds
|
||||
for iteration in range(self.n_restarts_optimizer):
|
||||
theta_initial = self._rng.uniform(bounds[:, 0], bounds[:, 1])
|
||||
optima.append(
|
||||
self._constrained_optimization(obj_func, theta_initial, bounds)
|
||||
)
|
||||
# Select result from run with minimal (negative) log-marginal
|
||||
# likelihood
|
||||
lml_values = list(map(itemgetter(1), optima))
|
||||
self.kernel_.theta = optima[np.argmin(lml_values)][0]
|
||||
self.kernel_._check_bounds_params()
|
||||
|
||||
self.log_marginal_likelihood_value_ = -np.min(lml_values)
|
||||
else:
|
||||
self.log_marginal_likelihood_value_ = self.log_marginal_likelihood(
|
||||
self.kernel_.theta, clone_kernel=False
|
||||
)
|
||||
|
||||
# Precompute quantities required for predictions which are independent
|
||||
# of actual query points
|
||||
# Alg. 2.1, page 19, line 2 -> L = cholesky(K + sigma^2 I)
|
||||
K = self.kernel_(self.X_train_)
|
||||
K[np.diag_indices_from(K)] += self.alpha
|
||||
try:
|
||||
self.L_ = cholesky(K, lower=GPR_CHOLESKY_LOWER, check_finite=False)
|
||||
except np.linalg.LinAlgError as exc:
|
||||
exc.args = (
|
||||
(
|
||||
f"The kernel, {self.kernel_}, is not returning a positive "
|
||||
"definite matrix. Try gradually increasing the 'alpha' "
|
||||
"parameter of your GaussianProcessRegressor estimator."
|
||||
),
|
||||
) + exc.args
|
||||
raise
|
||||
# Alg 2.1, page 19, line 3 -> alpha = L^T \ (L \ y)
|
||||
self.alpha_ = cho_solve(
|
||||
(self.L_, GPR_CHOLESKY_LOWER),
|
||||
self.y_train_,
|
||||
check_finite=False,
|
||||
)
|
||||
return self
|
||||
|
||||
def predict(self, X, return_std=False, return_cov=False):
|
||||
"""Predict using the Gaussian process regression model.
|
||||
|
||||
We can also predict based on an unfitted model by using the GP prior.
|
||||
In addition to the mean of the predictive distribution, optionally also
|
||||
returns its standard deviation (`return_std=True`) or covariance
|
||||
(`return_cov=True`). Note that at most one of the two can be requested.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like of shape (n_samples, n_features) or list of object
|
||||
Query points where the GP is evaluated.
|
||||
|
||||
return_std : bool, default=False
|
||||
If True, the standard-deviation of the predictive distribution at
|
||||
the query points is returned along with the mean.
|
||||
|
||||
return_cov : bool, default=False
|
||||
If True, the covariance of the joint predictive distribution at
|
||||
the query points is returned along with the mean.
|
||||
|
||||
Returns
|
||||
-------
|
||||
y_mean : ndarray of shape (n_samples,) or (n_samples, n_targets)
|
||||
Mean of predictive distribution at query points.
|
||||
|
||||
y_std : ndarray of shape (n_samples,) or (n_samples, n_targets), optional
|
||||
Standard deviation of predictive distribution at query points.
|
||||
Only returned when `return_std` is True.
|
||||
|
||||
y_cov : ndarray of shape (n_samples, n_samples) or \
|
||||
(n_samples, n_samples, n_targets), optional
|
||||
Covariance of joint predictive distribution at query points.
|
||||
Only returned when `return_cov` is True.
|
||||
"""
|
||||
if return_std and return_cov:
|
||||
raise RuntimeError(
|
||||
"At most one of return_std or return_cov can be requested."
|
||||
)
|
||||
|
||||
if self.kernel is None or self.kernel.requires_vector_input:
|
||||
dtype, ensure_2d = "numeric", True
|
||||
else:
|
||||
dtype, ensure_2d = None, False
|
||||
|
||||
X = self._validate_data(X, ensure_2d=ensure_2d, dtype=dtype, reset=False)
|
||||
|
||||
if not hasattr(self, "X_train_"): # Unfitted;predict based on GP prior
|
||||
if self.kernel is None:
|
||||
kernel = C(1.0, constant_value_bounds="fixed") * RBF(
|
||||
1.0, length_scale_bounds="fixed"
|
||||
)
|
||||
else:
|
||||
kernel = self.kernel
|
||||
|
||||
n_targets = self.n_targets if self.n_targets is not None else 1
|
||||
y_mean = np.zeros(shape=(X.shape[0], n_targets)).squeeze()
|
||||
|
||||
if return_cov:
|
||||
y_cov = kernel(X)
|
||||
if n_targets > 1:
|
||||
y_cov = np.repeat(
|
||||
np.expand_dims(y_cov, -1), repeats=n_targets, axis=-1
|
||||
)
|
||||
return y_mean, y_cov
|
||||
elif return_std:
|
||||
y_var = kernel.diag(X)
|
||||
if n_targets > 1:
|
||||
y_var = np.repeat(
|
||||
np.expand_dims(y_var, -1), repeats=n_targets, axis=-1
|
||||
)
|
||||
return y_mean, np.sqrt(y_var)
|
||||
else:
|
||||
return y_mean
|
||||
else: # Predict based on GP posterior
|
||||
# Alg 2.1, page 19, line 4 -> f*_bar = K(X_test, X_train) . alpha
|
||||
K_trans = self.kernel_(X, self.X_train_)
|
||||
y_mean = K_trans @ self.alpha_
|
||||
|
||||
# undo normalisation
|
||||
y_mean = self._y_train_std * y_mean + self._y_train_mean
|
||||
|
||||
# if y_mean has shape (n_samples, 1), reshape to (n_samples,)
|
||||
if y_mean.ndim > 1 and y_mean.shape[1] == 1:
|
||||
y_mean = np.squeeze(y_mean, axis=1)
|
||||
|
||||
# Alg 2.1, page 19, line 5 -> v = L \ K(X_test, X_train)^T
|
||||
V = solve_triangular(
|
||||
self.L_, K_trans.T, lower=GPR_CHOLESKY_LOWER, check_finite=False
|
||||
)
|
||||
|
||||
if return_cov:
|
||||
# Alg 2.1, page 19, line 6 -> K(X_test, X_test) - v^T. v
|
||||
y_cov = self.kernel_(X) - V.T @ V
|
||||
|
||||
# undo normalisation
|
||||
y_cov = np.outer(y_cov, self._y_train_std**2).reshape(*y_cov.shape, -1)
|
||||
# if y_cov has shape (n_samples, n_samples, 1), reshape to
|
||||
# (n_samples, n_samples)
|
||||
if y_cov.shape[2] == 1:
|
||||
y_cov = np.squeeze(y_cov, axis=2)
|
||||
|
||||
return y_mean, y_cov
|
||||
elif return_std:
|
||||
# Compute variance of predictive distribution
|
||||
# Use einsum to avoid explicitly forming the large matrix
|
||||
# V^T @ V just to extract its diagonal afterward.
|
||||
y_var = self.kernel_.diag(X).copy()
|
||||
y_var -= np.einsum("ij,ji->i", V.T, V)
|
||||
|
||||
# Check if any of the variances is negative because of
|
||||
# numerical issues. If yes: set the variance to 0.
|
||||
y_var_negative = y_var < 0
|
||||
if np.any(y_var_negative):
|
||||
warnings.warn(
|
||||
"Predicted variances smaller than 0. "
|
||||
"Setting those variances to 0."
|
||||
)
|
||||
y_var[y_var_negative] = 0.0
|
||||
|
||||
# undo normalisation
|
||||
y_var = np.outer(y_var, self._y_train_std**2).reshape(*y_var.shape, -1)
|
||||
|
||||
# if y_var has shape (n_samples, 1), reshape to (n_samples,)
|
||||
if y_var.shape[1] == 1:
|
||||
y_var = np.squeeze(y_var, axis=1)
|
||||
|
||||
return y_mean, np.sqrt(y_var)
|
||||
else:
|
||||
return y_mean
|
||||
|
||||
def sample_y(self, X, n_samples=1, random_state=0):
|
||||
"""Draw samples from Gaussian process and evaluate at X.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like of shape (n_samples_X, n_features) or list of object
|
||||
Query points where the GP is evaluated.
|
||||
|
||||
n_samples : int, default=1
|
||||
Number of samples drawn from the Gaussian process per query point.
|
||||
|
||||
random_state : int, RandomState instance or None, default=0
|
||||
Determines random number generation to randomly draw samples.
|
||||
Pass an int for reproducible results across multiple function
|
||||
calls.
|
||||
See :term:`Glossary <random_state>`.
|
||||
|
||||
Returns
|
||||
-------
|
||||
y_samples : ndarray of shape (n_samples_X, n_samples), or \
|
||||
(n_samples_X, n_targets, n_samples)
|
||||
Values of n_samples samples drawn from Gaussian process and
|
||||
evaluated at query points.
|
||||
"""
|
||||
rng = check_random_state(random_state)
|
||||
|
||||
y_mean, y_cov = self.predict(X, return_cov=True)
|
||||
if y_mean.ndim == 1:
|
||||
y_samples = rng.multivariate_normal(y_mean, y_cov, n_samples).T
|
||||
else:
|
||||
y_samples = [
|
||||
rng.multivariate_normal(
|
||||
y_mean[:, target], y_cov[..., target], n_samples
|
||||
).T[:, np.newaxis]
|
||||
for target in range(y_mean.shape[1])
|
||||
]
|
||||
y_samples = np.hstack(y_samples)
|
||||
return y_samples
|
||||
|
||||
def log_marginal_likelihood(
|
||||
self, theta=None, eval_gradient=False, clone_kernel=True
|
||||
):
|
||||
"""Return log-marginal likelihood of theta for training data.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
theta : array-like of shape (n_kernel_params,), default=None
|
||||
Kernel hyperparameters for which the log-marginal likelihood is
|
||||
evaluated. If None, the precomputed log_marginal_likelihood
|
||||
of ``self.kernel_.theta`` is returned.
|
||||
|
||||
eval_gradient : bool, default=False
|
||||
If True, the gradient of the log-marginal likelihood with respect
|
||||
to the kernel hyperparameters at position theta is returned
|
||||
additionally. If True, theta must not be None.
|
||||
|
||||
clone_kernel : bool, default=True
|
||||
If True, the kernel attribute is copied. If False, the kernel
|
||||
attribute is modified, but may result in a performance improvement.
|
||||
|
||||
Returns
|
||||
-------
|
||||
log_likelihood : float
|
||||
Log-marginal likelihood of theta for training data.
|
||||
|
||||
log_likelihood_gradient : ndarray of shape (n_kernel_params,), optional
|
||||
Gradient of the log-marginal likelihood with respect to the kernel
|
||||
hyperparameters at position theta.
|
||||
Only returned when eval_gradient is True.
|
||||
"""
|
||||
if theta is None:
|
||||
if eval_gradient:
|
||||
raise ValueError("Gradient can only be evaluated for theta!=None")
|
||||
return self.log_marginal_likelihood_value_
|
||||
|
||||
if clone_kernel:
|
||||
kernel = self.kernel_.clone_with_theta(theta)
|
||||
else:
|
||||
kernel = self.kernel_
|
||||
kernel.theta = theta
|
||||
|
||||
if eval_gradient:
|
||||
K, K_gradient = kernel(self.X_train_, eval_gradient=True)
|
||||
else:
|
||||
K = kernel(self.X_train_)
|
||||
|
||||
# Alg. 2.1, page 19, line 2 -> L = cholesky(K + sigma^2 I)
|
||||
K[np.diag_indices_from(K)] += self.alpha
|
||||
try:
|
||||
L = cholesky(K, lower=GPR_CHOLESKY_LOWER, check_finite=False)
|
||||
except np.linalg.LinAlgError:
|
||||
return (-np.inf, np.zeros_like(theta)) if eval_gradient else -np.inf
|
||||
|
||||
# Support multi-dimensional output of self.y_train_
|
||||
y_train = self.y_train_
|
||||
if y_train.ndim == 1:
|
||||
y_train = y_train[:, np.newaxis]
|
||||
|
||||
# Alg 2.1, page 19, line 3 -> alpha = L^T \ (L \ y)
|
||||
alpha = cho_solve((L, GPR_CHOLESKY_LOWER), y_train, check_finite=False)
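# (Note: the local variable ``alpha`` is the weight vector K^{-1} y for the
# regularized K above; it is distinct from the ``self.alpha`` jitter that was
# added to the diagonal.)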
|
||||
|
||||
# Alg 2.1, page 19, line 7
|
||||
# -0.5 . y^T . alpha - sum(log(diag(L))) - n_samples / 2 log(2*pi)
|
||||
# y is originally thought to be a (1, n_samples) row vector. However,
|
||||
# in multi-output settings, y has shape (n_samples, n_outputs) and we need to compute
|
||||
# y^T . alpha for each output, independently using einsum. Thus, it
|
||||
# is equivalent to:
|
||||
# for output_idx in range(n_outputs):
|
||||
# log_likelihood_dims[output_idx] = (
|
||||
# y_train[:, [output_idx]] @ alpha[:, [output_idx]]
|
||||
# )
|
||||
log_likelihood_dims = -0.5 * np.einsum("ik,ik->k", y_train, alpha)
|
||||
log_likelihood_dims -= np.log(np.diag(L)).sum()
|
||||
log_likelihood_dims -= K.shape[0] / 2 * np.log(2 * np.pi)
|
||||
# the log likelihood is summed up across the outputs
|
||||
log_likelihood = log_likelihood_dims.sum(axis=-1)
|
||||
|
||||
if eval_gradient:
|
||||
# Eq. 5.9, p. 114, and footnote 5 in p. 114
|
||||
# 0.5 * trace((alpha . alpha^T - K^-1) . K_gradient)
|
||||
# alpha is supposed to be a vector of (n_samples,) elements. With
|
||||
# multioutputs, alpha is a matrix of size (n_samples, n_outputs).
|
||||
# Therefore, we want to construct a matrix of
|
||||
# (n_samples, n_samples, n_outputs) equivalent to
|
||||
# for output_idx in range(n_outputs):
|
||||
# output_alpha = alpha[:, [output_idx]]
|
||||
# inner_term[..., output_idx] = output_alpha @ output_alpha.T
|
||||
inner_term = np.einsum("ik,jk->ijk", alpha, alpha)
|
||||
# compute K^-1 of shape (n_samples, n_samples)
|
||||
K_inv = cho_solve(
|
||||
(L, GPR_CHOLESKY_LOWER), np.eye(K.shape[0]), check_finite=False
|
||||
)
|
||||
# create a new axis to use broadcasting between inner_term and
|
||||
# K_inv
|
||||
inner_term -= K_inv[..., np.newaxis]
|
||||
# Since we are only interested in the trace of
|
||||
# inner_term @ K_gradient, we don't explicitly compute the
|
||||
# matrix-by-matrix operation and instead use an einsum. Therefore
|
||||
# it is equivalent to:
|
||||
# for param_idx in range(n_kernel_params):
|
||||
# for output_idx in range(n_output):
|
||||
# log_likelihood_gradient_dims[param_idx, output_idx] = (
|
||||
# inner_term[..., output_idx] @
|
||||
# K_gradient[..., param_idx]
|
||||
# )
|
||||
log_likelihood_gradient_dims = 0.5 * np.einsum(
|
||||
"ijl,jik->kl", inner_term, K_gradient
|
||||
)
|
||||
# the log likelihood gradient is summed up across the outputs
|
||||
log_likelihood_gradient = log_likelihood_gradient_dims.sum(axis=-1)
|
||||
|
||||
if eval_gradient:
|
||||
return log_likelihood, log_likelihood_gradient
|
||||
else:
|
||||
return log_likelihood
|
||||
|
||||
def _constrained_optimization(self, obj_func, initial_theta, bounds):
|
||||
if self.optimizer == "fmin_l_bfgs_b":
|
||||
opt_res = scipy.optimize.minimize(
|
||||
obj_func,
|
||||
initial_theta,
|
||||
method="L-BFGS-B",
|
||||
jac=True,
|
||||
bounds=bounds,
|
||||
)
|
||||
_check_optimize_result("lbfgs", opt_res)
|
||||
theta_opt, func_min = opt_res.x, opt_res.fun
|
||||
elif callable(self.optimizer):
|
||||
theta_opt, func_min = self.optimizer(obj_func, initial_theta, bounds=bounds)
|
||||
else:
|
||||
raise ValueError(f"Unknown optimizer {self.optimizer}.")
|
||||
|
||||
return theta_opt, func_min
|
||||
|
||||
def _more_tags(self):
|
||||
return {"requires_fit": False}
|
||||
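As a quick orientation for the prediction code above, here is a minimal usage sketch (not part of the commit; the toy data and variable names are illustrative): ``predict`` returns either per-point standard deviations or a full covariance matrix, and ``sample_y`` draws joint posterior samples.

import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF

X_train = np.linspace(0, 5, 8)[:, np.newaxis]
y_train = np.sin(X_train).ravel()
gpr = GaussianProcessRegressor(kernel=RBF(length_scale=1.0)).fit(X_train, y_train)

X_query = np.linspace(0, 5, 20)[:, np.newaxis]
y_mean, y_std = gpr.predict(X_query, return_std=True)   # shapes (20,), (20,)
y_mean, y_cov = gpr.predict(X_query, return_cov=True)   # shapes (20,), (20, 20)
y_samples = gpr.sample_y(X_query, n_samples=3)           # shape (20, 3)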
File diff suppressed because it is too large
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,54 @@
|
||||
import numpy as np
|
||||
|
||||
from sklearn.base import clone
|
||||
from sklearn.gaussian_process.kernels import (
|
||||
GenericKernelMixin,
|
||||
Hyperparameter,
|
||||
Kernel,
|
||||
StationaryKernelMixin,
|
||||
)
|
||||
|
||||
|
||||
class MiniSeqKernel(GenericKernelMixin, StationaryKernelMixin, Kernel):
|
||||
"""
|
||||
A minimal (but valid) convolutional kernel for sequences of variable
|
||||
length.
|
||||
"""
|
||||
|
||||
def __init__(self, baseline_similarity=0.5, baseline_similarity_bounds=(1e-5, 1)):
|
||||
self.baseline_similarity = baseline_similarity
|
||||
self.baseline_similarity_bounds = baseline_similarity_bounds
|
||||
|
||||
@property
|
||||
def hyperparameter_baseline_similarity(self):
|
||||
return Hyperparameter(
|
||||
"baseline_similarity", "numeric", self.baseline_similarity_bounds
|
||||
)
|
||||
|
||||
def _f(self, s1, s2):
|
||||
return sum(
|
||||
[1.0 if c1 == c2 else self.baseline_similarity for c1 in s1 for c2 in s2]
|
||||
)
|
||||
|
||||
def _g(self, s1, s2):
|
||||
return sum([0.0 if c1 == c2 else 1.0 for c1 in s1 for c2 in s2])
|
||||
|
||||
def __call__(self, X, Y=None, eval_gradient=False):
|
||||
if Y is None:
|
||||
Y = X
|
||||
|
||||
if eval_gradient:
|
||||
return (
|
||||
np.array([[self._f(x, y) for y in Y] for x in X]),
|
||||
np.array([[[self._g(x, y)] for y in Y] for x in X]),
|
||||
)
|
||||
else:
|
||||
return np.array([[self._f(x, y) for y in Y] for x in X])
|
||||
|
||||
def diag(self, X):
|
||||
return np.array([self._f(x, x) for x in X])
|
||||
|
||||
def clone_with_theta(self, theta):
|
||||
cloned = clone(self)
|
||||
cloned.theta = theta
|
||||
return cloned
|
||||
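The following short sketch (illustrative only; the expected matrix is worked out by hand from ``_f``) shows what the MiniSeqKernel above computes on a few toy sequences: each entry sums 1.0 for every matching character pair and ``baseline_similarity`` otherwise.

import numpy as np
from sklearn.gaussian_process.tests._mini_sequence_kernel import MiniSeqKernel

kernel = MiniSeqKernel(baseline_similarity=0.5)
K = kernel(["A", "AB", "B"])
# Expected (pairwise sums over characters):
# [[1.0, 1.5, 0.5],
#  [1.5, 3.0, 1.5],
#  [0.5, 1.5, 1.0]]
print(np.round(K, 2))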
@ -0,0 +1,284 @@
|
||||
"""Testing for Gaussian process classification"""
|
||||
|
||||
# Author: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
|
||||
# License: BSD 3 clause
|
||||
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from scipy.optimize import approx_fprime
|
||||
|
||||
from sklearn.exceptions import ConvergenceWarning
|
||||
from sklearn.gaussian_process import GaussianProcessClassifier
|
||||
from sklearn.gaussian_process.kernels import (
|
||||
RBF,
|
||||
CompoundKernel,
|
||||
WhiteKernel,
|
||||
)
|
||||
from sklearn.gaussian_process.kernels import (
|
||||
ConstantKernel as C,
|
||||
)
|
||||
from sklearn.gaussian_process.tests._mini_sequence_kernel import MiniSeqKernel
|
||||
from sklearn.utils._testing import assert_almost_equal, assert_array_equal
|
||||
|
||||
|
||||
def f(x):
|
||||
return np.sin(x)
|
||||
|
||||
|
||||
X = np.atleast_2d(np.linspace(0, 10, 30)).T
|
||||
X2 = np.atleast_2d([2.0, 4.0, 5.5, 6.5, 7.5]).T
|
||||
y = np.array(f(X).ravel() > 0, dtype=int)
|
||||
fX = f(X).ravel()
|
||||
y_mc = np.empty(y.shape, dtype=int) # multi-class
|
||||
y_mc[fX < -0.35] = 0
|
||||
y_mc[(fX >= -0.35) & (fX < 0.35)] = 1
|
||||
y_mc[fX > 0.35] = 2
|
||||
|
||||
|
||||
fixed_kernel = RBF(length_scale=1.0, length_scale_bounds="fixed")
|
||||
kernels = [
|
||||
RBF(length_scale=0.1),
|
||||
fixed_kernel,
|
||||
RBF(length_scale=1.0, length_scale_bounds=(1e-3, 1e3)),
|
||||
C(1.0, (1e-2, 1e2)) * RBF(length_scale=1.0, length_scale_bounds=(1e-3, 1e3)),
|
||||
]
|
||||
non_fixed_kernels = [kernel for kernel in kernels if kernel != fixed_kernel]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("kernel", kernels)
|
||||
def test_predict_consistent(kernel):
|
||||
# Check that the binary predict decision matches a predicted probability above 0.5.
|
||||
gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y)
|
||||
assert_array_equal(gpc.predict(X), gpc.predict_proba(X)[:, 1] >= 0.5)
|
||||
|
||||
|
||||
def test_predict_consistent_structured():
|
||||
# Check that the binary predict decision matches a predicted probability above 0.5.
|
||||
X = ["A", "AB", "B"]
|
||||
y = np.array([True, False, True])
|
||||
kernel = MiniSeqKernel(baseline_similarity_bounds="fixed")
|
||||
gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y)
|
||||
assert_array_equal(gpc.predict(X), gpc.predict_proba(X)[:, 1] >= 0.5)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("kernel", non_fixed_kernels)
|
||||
def test_lml_improving(kernel):
|
||||
# Test that hyperparameter-tuning improves log-marginal likelihood.
|
||||
gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y)
|
||||
assert gpc.log_marginal_likelihood(gpc.kernel_.theta) > gpc.log_marginal_likelihood(
|
||||
kernel.theta
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("kernel", kernels)
|
||||
def test_lml_precomputed(kernel):
|
||||
# Test that lml of optimized kernel is stored correctly.
|
||||
gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y)
|
||||
assert_almost_equal(
|
||||
gpc.log_marginal_likelihood(gpc.kernel_.theta), gpc.log_marginal_likelihood(), 7
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("kernel", kernels)
|
||||
def test_lml_without_cloning_kernel(kernel):
|
||||
# Test that clone_kernel=False has side-effects on kernel.theta.
|
||||
gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y)
|
||||
input_theta = np.ones(gpc.kernel_.theta.shape, dtype=np.float64)
|
||||
|
||||
gpc.log_marginal_likelihood(input_theta, clone_kernel=False)
|
||||
assert_almost_equal(gpc.kernel_.theta, input_theta, 7)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("kernel", non_fixed_kernels)
|
||||
def test_converged_to_local_maximum(kernel):
|
||||
# Test that we are in local maximum after hyperparameter-optimization.
|
||||
gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y)
|
||||
|
||||
lml, lml_gradient = gpc.log_marginal_likelihood(gpc.kernel_.theta, True)
|
||||
|
||||
assert np.all(
|
||||
(np.abs(lml_gradient) < 1e-4)
|
||||
| (gpc.kernel_.theta == gpc.kernel_.bounds[:, 0])
|
||||
| (gpc.kernel_.theta == gpc.kernel_.bounds[:, 1])
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("kernel", kernels)
|
||||
def test_lml_gradient(kernel):
|
||||
# Compare analytic and numeric gradient of log marginal likelihood.
|
||||
gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y)
|
||||
|
||||
lml, lml_gradient = gpc.log_marginal_likelihood(kernel.theta, True)
|
||||
lml_gradient_approx = approx_fprime(
|
||||
kernel.theta, lambda theta: gpc.log_marginal_likelihood(theta, False), 1e-10
|
||||
)
|
||||
|
||||
assert_almost_equal(lml_gradient, lml_gradient_approx, 3)
|
||||
|
||||
|
||||
def test_random_starts(global_random_seed):
|
||||
# Test that an increasing number of random-starts of GP fitting only
|
||||
# increases the log marginal likelihood of the chosen theta.
|
||||
n_samples, n_features = 25, 2
|
||||
rng = np.random.RandomState(global_random_seed)
|
||||
X = rng.randn(n_samples, n_features) * 2 - 1
|
||||
y = (np.sin(X).sum(axis=1) + np.sin(3 * X).sum(axis=1)) > 0
|
||||
|
||||
kernel = C(1.0, (1e-2, 1e2)) * RBF(
|
||||
length_scale=[1e-3] * n_features, length_scale_bounds=[(1e-4, 1e2)] * n_features
|
||||
)
|
||||
last_lml = -np.inf
|
||||
for n_restarts_optimizer in range(5):
|
||||
gp = GaussianProcessClassifier(
|
||||
kernel=kernel,
|
||||
n_restarts_optimizer=n_restarts_optimizer,
|
||||
random_state=global_random_seed,
|
||||
).fit(X, y)
|
||||
lml = gp.log_marginal_likelihood(gp.kernel_.theta)
|
||||
assert lml > last_lml - np.finfo(np.float32).eps
|
||||
last_lml = lml
|
||||
|
||||
|
||||
@pytest.mark.parametrize("kernel", non_fixed_kernels)
|
||||
def test_custom_optimizer(kernel, global_random_seed):
|
||||
# Test that GPC can use externally defined optimizers.
|
||||
# Define a dummy optimizer that simply tests 10 random hyperparameters
|
||||
def optimizer(obj_func, initial_theta, bounds):
|
||||
rng = np.random.RandomState(global_random_seed)
|
||||
theta_opt, func_min = initial_theta, obj_func(
|
||||
initial_theta, eval_gradient=False
|
||||
)
|
||||
for _ in range(10):
|
||||
theta = np.atleast_1d(
|
||||
rng.uniform(np.maximum(-2, bounds[:, 0]), np.minimum(1, bounds[:, 1]))
|
||||
)
|
||||
f = obj_func(theta, eval_gradient=False)
|
||||
if f < func_min:
|
||||
theta_opt, func_min = theta, f
|
||||
return theta_opt, func_min
|
||||
|
||||
gpc = GaussianProcessClassifier(kernel=kernel, optimizer=optimizer)
|
||||
gpc.fit(X, y_mc)
|
||||
# Checks that optimizer improved marginal likelihood
|
||||
assert gpc.log_marginal_likelihood(
|
||||
gpc.kernel_.theta
|
||||
) >= gpc.log_marginal_likelihood(kernel.theta)
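# A sketch of an alternative user-supplied optimizer (an illustration, not one
# of the tests): any callable with this signature can be passed as
# ``optimizer=`` and may simply delegate to scipy, mirroring the built-in
# "fmin_l_bfgs_b" path.
def lbfgs_optimizer_sketch(obj_func, initial_theta, bounds):
    import scipy.optimize

    opt_res = scipy.optimize.minimize(
        obj_func, initial_theta, method="L-BFGS-B", jac=True, bounds=bounds
    )
    return opt_res.x, opt_res.fun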
|
||||
|
||||
|
||||
@pytest.mark.parametrize("kernel", kernels)
|
||||
def test_multi_class(kernel):
|
||||
# Test GPC for multi-class classification problems.
|
||||
gpc = GaussianProcessClassifier(kernel=kernel)
|
||||
gpc.fit(X, y_mc)
|
||||
|
||||
y_prob = gpc.predict_proba(X2)
|
||||
assert_almost_equal(y_prob.sum(1), 1)
|
||||
|
||||
y_pred = gpc.predict(X2)
|
||||
assert_array_equal(np.argmax(y_prob, 1), y_pred)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("kernel", kernels)
|
||||
def test_multi_class_n_jobs(kernel):
|
||||
# Test that multi-class GPC produces identical results with n_jobs>1.
|
||||
gpc = GaussianProcessClassifier(kernel=kernel)
|
||||
gpc.fit(X, y_mc)
|
||||
|
||||
gpc_2 = GaussianProcessClassifier(kernel=kernel, n_jobs=2)
|
||||
gpc_2.fit(X, y_mc)
|
||||
|
||||
y_prob = gpc.predict_proba(X2)
|
||||
y_prob_2 = gpc_2.predict_proba(X2)
|
||||
assert_almost_equal(y_prob, y_prob_2)
|
||||
|
||||
|
||||
def test_warning_bounds():
|
||||
kernel = RBF(length_scale_bounds=[1e-5, 1e-3])
|
||||
gpc = GaussianProcessClassifier(kernel=kernel)
|
||||
warning_message = (
|
||||
"The optimal value found for dimension 0 of parameter "
|
||||
"length_scale is close to the specified upper bound "
|
||||
"0.001. Increasing the bound and calling fit again may "
|
||||
"find a better value."
|
||||
)
|
||||
with pytest.warns(ConvergenceWarning, match=warning_message):
|
||||
gpc.fit(X, y)
|
||||
|
||||
kernel_sum = WhiteKernel(noise_level_bounds=[1e-5, 1e-3]) + RBF(
|
||||
length_scale_bounds=[1e3, 1e5]
|
||||
)
|
||||
gpc_sum = GaussianProcessClassifier(kernel=kernel_sum)
|
||||
with warnings.catch_warnings(record=True) as record:
|
||||
warnings.simplefilter("always")
|
||||
gpc_sum.fit(X, y)
|
||||
|
||||
assert len(record) == 2
|
||||
|
||||
assert issubclass(record[0].category, ConvergenceWarning)
|
||||
assert (
|
||||
record[0].message.args[0] == "The optimal value found for "
|
||||
"dimension 0 of parameter "
|
||||
"k1__noise_level is close to the "
|
||||
"specified upper bound 0.001. "
|
||||
"Increasing the bound and calling "
|
||||
"fit again may find a better value."
|
||||
)
|
||||
|
||||
assert issubclass(record[1].category, ConvergenceWarning)
|
||||
assert (
|
||||
record[1].message.args[0] == "The optimal value found for "
|
||||
"dimension 0 of parameter "
|
||||
"k2__length_scale is close to the "
|
||||
"specified lower bound 1000.0. "
|
||||
"Decreasing the bound and calling "
|
||||
"fit again may find a better value."
|
||||
)
|
||||
|
||||
X_tile = np.tile(X, 2)
|
||||
kernel_dims = RBF(length_scale=[1.0, 2.0], length_scale_bounds=[1e1, 1e2])
|
||||
gpc_dims = GaussianProcessClassifier(kernel=kernel_dims)
|
||||
|
||||
with warnings.catch_warnings(record=True) as record:
|
||||
warnings.simplefilter("always")
|
||||
gpc_dims.fit(X_tile, y)
|
||||
|
||||
assert len(record) == 2
|
||||
|
||||
assert issubclass(record[0].category, ConvergenceWarning)
|
||||
assert (
|
||||
record[0].message.args[0] == "The optimal value found for "
|
||||
"dimension 0 of parameter "
|
||||
"length_scale is close to the "
|
||||
"specified upper bound 100.0. "
|
||||
"Increasing the bound and calling "
|
||||
"fit again may find a better value."
|
||||
)
|
||||
|
||||
assert issubclass(record[1].category, ConvergenceWarning)
|
||||
assert (
|
||||
record[1].message.args[0] == "The optimal value found for "
|
||||
"dimension 1 of parameter "
|
||||
"length_scale is close to the "
|
||||
"specified upper bound 100.0. "
|
||||
"Increasing the bound and calling "
|
||||
"fit again may find a better value."
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"params, error_type, err_msg",
|
||||
[
|
||||
(
|
||||
{"kernel": CompoundKernel(0)},
|
||||
ValueError,
|
||||
"kernel cannot be a CompoundKernel",
|
||||
)
|
||||
],
|
||||
)
|
||||
def test_gpc_fit_error(params, error_type, err_msg):
|
||||
"""Check that expected error are raised during fit."""
|
||||
gpc = GaussianProcessClassifier(**params)
|
||||
with pytest.raises(error_type, match=err_msg):
|
||||
gpc.fit(X, y)
|
||||
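For context, a compact multi-class usage sketch in the spirit of the tests above (toy data only; the thresholds simply recreate the three classes used in this file):

import numpy as np
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

X_demo = np.linspace(0, 10, 30)[:, np.newaxis]
y_demo = np.digitize(np.sin(X_demo).ravel(), bins=[-0.35, 0.35])  # classes 0, 1, 2
gpc = GaussianProcessClassifier(kernel=1.0 * RBF(length_scale=1.0)).fit(X_demo, y_demo)
proba = gpc.predict_proba(X_demo[:5])
assert np.allclose(proba.sum(axis=1), 1.0)  # rows are proper distributions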
@ -0,0 +1,849 @@
|
||||
"""Testing for Gaussian process regression"""
|
||||
|
||||
# Author: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
|
||||
# Modified by: Pete Green <p.l.green@liverpool.ac.uk>
|
||||
# License: BSD 3 clause
|
||||
|
||||
import re
|
||||
import sys
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from scipy.optimize import approx_fprime
|
||||
|
||||
from sklearn.exceptions import ConvergenceWarning
|
||||
from sklearn.gaussian_process import GaussianProcessRegressor
|
||||
from sklearn.gaussian_process.kernels import (
|
||||
RBF,
|
||||
DotProduct,
|
||||
ExpSineSquared,
|
||||
WhiteKernel,
|
||||
)
|
||||
from sklearn.gaussian_process.kernels import (
|
||||
ConstantKernel as C,
|
||||
)
|
||||
from sklearn.gaussian_process.tests._mini_sequence_kernel import MiniSeqKernel
|
||||
from sklearn.utils._testing import (
|
||||
assert_allclose,
|
||||
assert_almost_equal,
|
||||
assert_array_almost_equal,
|
||||
assert_array_less,
|
||||
)
|
||||
|
||||
|
||||
def f(x):
|
||||
return x * np.sin(x)
|
||||
|
||||
|
||||
X = np.atleast_2d([1.0, 3.0, 5.0, 6.0, 7.0, 8.0]).T
|
||||
X2 = np.atleast_2d([2.0, 4.0, 5.5, 6.5, 7.5]).T
|
||||
y = f(X).ravel()
|
||||
|
||||
fixed_kernel = RBF(length_scale=1.0, length_scale_bounds="fixed")
|
||||
kernels = [
|
||||
RBF(length_scale=1.0),
|
||||
fixed_kernel,
|
||||
RBF(length_scale=1.0, length_scale_bounds=(1e-3, 1e3)),
|
||||
C(1.0, (1e-2, 1e2)) * RBF(length_scale=1.0, length_scale_bounds=(1e-3, 1e3)),
|
||||
C(1.0, (1e-2, 1e2)) * RBF(length_scale=1.0, length_scale_bounds=(1e-3, 1e3))
|
||||
+ C(1e-5, (1e-5, 1e2)),
|
||||
C(0.1, (1e-2, 1e2)) * RBF(length_scale=1.0, length_scale_bounds=(1e-3, 1e3))
|
||||
+ C(1e-5, (1e-5, 1e2)),
|
||||
]
|
||||
non_fixed_kernels = [kernel for kernel in kernels if kernel != fixed_kernel]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("kernel", kernels)
|
||||
def test_gpr_interpolation(kernel):
|
||||
if sys.maxsize <= 2**32:
|
||||
pytest.xfail("This test may fail on 32 bit Python")
|
||||
|
||||
# Test the interpolating property for different kernels.
|
||||
gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)
|
||||
y_pred, y_cov = gpr.predict(X, return_cov=True)
|
||||
|
||||
assert_almost_equal(y_pred, y)
|
||||
assert_almost_equal(np.diag(y_cov), 0.0)
|
||||
|
||||
|
||||
def test_gpr_interpolation_structured():
|
||||
# Test the interpolating property for different kernels.
|
||||
kernel = MiniSeqKernel(baseline_similarity_bounds="fixed")
|
||||
X = ["A", "B", "C"]
|
||||
y = np.array([1, 2, 3])
|
||||
gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)
|
||||
y_pred, y_cov = gpr.predict(X, return_cov=True)
|
||||
|
||||
assert_almost_equal(
|
||||
kernel(X, eval_gradient=True)[1].ravel(), (1 - np.eye(len(X))).ravel()
|
||||
)
|
||||
assert_almost_equal(y_pred, y)
|
||||
assert_almost_equal(np.diag(y_cov), 0.0)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("kernel", non_fixed_kernels)
|
||||
def test_lml_improving(kernel):
|
||||
if sys.maxsize <= 2**32:
|
||||
pytest.xfail("This test may fail on 32 bit Python")
|
||||
|
||||
# Test that hyperparameter-tuning improves log-marginal likelihood.
|
||||
gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)
|
||||
assert gpr.log_marginal_likelihood(gpr.kernel_.theta) > gpr.log_marginal_likelihood(
|
||||
kernel.theta
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("kernel", kernels)
|
||||
def test_lml_precomputed(kernel):
|
||||
# Test that lml of optimized kernel is stored correctly.
|
||||
gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)
|
||||
assert gpr.log_marginal_likelihood(gpr.kernel_.theta) == pytest.approx(
|
||||
gpr.log_marginal_likelihood()
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("kernel", kernels)
|
||||
def test_lml_without_cloning_kernel(kernel):
|
||||
# Test that clone_kernel=False has side-effects on kernel.theta.
|
||||
gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)
|
||||
input_theta = np.ones(gpr.kernel_.theta.shape, dtype=np.float64)
|
||||
|
||||
gpr.log_marginal_likelihood(input_theta, clone_kernel=False)
|
||||
assert_almost_equal(gpr.kernel_.theta, input_theta, 7)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("kernel", non_fixed_kernels)
|
||||
def test_converged_to_local_maximum(kernel):
|
||||
# Test that we are in local maximum after hyperparameter-optimization.
|
||||
gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)
|
||||
|
||||
lml, lml_gradient = gpr.log_marginal_likelihood(gpr.kernel_.theta, True)
|
||||
|
||||
assert np.all(
|
||||
(np.abs(lml_gradient) < 1e-4)
|
||||
| (gpr.kernel_.theta == gpr.kernel_.bounds[:, 0])
|
||||
| (gpr.kernel_.theta == gpr.kernel_.bounds[:, 1])
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("kernel", non_fixed_kernels)
|
||||
def test_solution_inside_bounds(kernel):
|
||||
# Test that hyperparameter-optimization remains within bounds.
|
||||
gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)
|
||||
|
||||
bounds = gpr.kernel_.bounds
|
||||
max_ = np.finfo(gpr.kernel_.theta.dtype).max
|
||||
tiny = 1e-10
|
||||
bounds[~np.isfinite(bounds[:, 1]), 1] = max_
|
||||
|
||||
assert_array_less(bounds[:, 0], gpr.kernel_.theta + tiny)
|
||||
assert_array_less(gpr.kernel_.theta, bounds[:, 1] + tiny)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("kernel", kernels)
|
||||
def test_lml_gradient(kernel):
|
||||
# Compare analytic and numeric gradient of log marginal likelihood.
|
||||
gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)
|
||||
|
||||
lml, lml_gradient = gpr.log_marginal_likelihood(kernel.theta, True)
|
||||
lml_gradient_approx = approx_fprime(
|
||||
kernel.theta, lambda theta: gpr.log_marginal_likelihood(theta, False), 1e-10
|
||||
)
|
||||
|
||||
assert_almost_equal(lml_gradient, lml_gradient_approx, 3)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("kernel", kernels)
|
||||
def test_prior(kernel):
|
||||
# Test that GP prior has mean 0 and identical variances.
|
||||
gpr = GaussianProcessRegressor(kernel=kernel)
|
||||
|
||||
y_mean, y_cov = gpr.predict(X, return_cov=True)
|
||||
|
||||
assert_almost_equal(y_mean, 0, 5)
|
||||
if len(gpr.kernel.theta) > 1:
|
||||
# XXX: quite hacky, works only for current kernels
|
||||
assert_almost_equal(np.diag(y_cov), np.exp(kernel.theta[0]), 5)
|
||||
else:
|
||||
assert_almost_equal(np.diag(y_cov), 1, 5)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("kernel", kernels)
|
||||
def test_sample_statistics(kernel):
|
||||
# Test that statistics of samples drawn from GP are correct.
|
||||
gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)
|
||||
|
||||
y_mean, y_cov = gpr.predict(X2, return_cov=True)
|
||||
|
||||
samples = gpr.sample_y(X2, 300000)
|
||||
|
||||
# More digits accuracy would require many more samples
|
||||
assert_almost_equal(y_mean, np.mean(samples, 1), 1)
|
||||
assert_almost_equal(
|
||||
np.diag(y_cov) / np.diag(y_cov).max(),
|
||||
np.var(samples, 1) / np.diag(y_cov).max(),
|
||||
1,
|
||||
)
|
||||
|
||||
|
||||
def test_no_optimizer():
|
||||
# Test that kernel parameters are unmodified when optimizer is None.
|
||||
kernel = RBF(1.0)
|
||||
gpr = GaussianProcessRegressor(kernel=kernel, optimizer=None).fit(X, y)
|
||||
assert np.exp(gpr.kernel_.theta) == 1.0
|
||||
|
||||
|
||||
@pytest.mark.parametrize("kernel", kernels)
|
||||
@pytest.mark.parametrize("target", [y, np.ones(X.shape[0], dtype=np.float64)])
|
||||
def test_predict_cov_vs_std(kernel, target):
|
||||
if sys.maxsize <= 2**32:
|
||||
pytest.xfail("This test may fail on 32 bit Python")
|
||||
|
||||
# Test that predicted std.-dev. is consistent with cov's diagonal.
|
||||
gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)
|
||||
y_mean, y_cov = gpr.predict(X2, return_cov=True)
|
||||
y_mean, y_std = gpr.predict(X2, return_std=True)
|
||||
assert_almost_equal(np.sqrt(np.diag(y_cov)), y_std)
|
||||
|
||||
|
||||
def test_anisotropic_kernel():
|
||||
# Test that GPR can identify meaningful anisotropic length-scales.
|
||||
# We learn a function which varies in one dimension ten-times slower
|
||||
# than in the other. The corresponding length-scales should differ by at
|
||||
# least a factor 5
|
||||
rng = np.random.RandomState(0)
|
||||
X = rng.uniform(-1, 1, (50, 2))
|
||||
y = X[:, 0] + 0.1 * X[:, 1]
|
||||
|
||||
kernel = RBF([1.0, 1.0])
|
||||
gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)
|
||||
assert np.exp(gpr.kernel_.theta[1]) > np.exp(gpr.kernel_.theta[0]) * 5
|
||||
|
||||
|
||||
def test_random_starts():
|
||||
# Test that an increasing number of random-starts of GP fitting only
|
||||
# increases the log marginal likelihood of the chosen theta.
|
||||
n_samples, n_features = 25, 2
|
||||
rng = np.random.RandomState(0)
|
||||
X = rng.randn(n_samples, n_features) * 2 - 1
|
||||
y = (
|
||||
np.sin(X).sum(axis=1)
|
||||
+ np.sin(3 * X).sum(axis=1)
|
||||
+ rng.normal(scale=0.1, size=n_samples)
|
||||
)
|
||||
|
||||
kernel = C(1.0, (1e-2, 1e2)) * RBF(
|
||||
length_scale=[1.0] * n_features, length_scale_bounds=[(1e-4, 1e2)] * n_features
|
||||
) + WhiteKernel(noise_level=1e-5, noise_level_bounds=(1e-5, 1e1))
|
||||
last_lml = -np.inf
|
||||
for n_restarts_optimizer in range(5):
|
||||
gp = GaussianProcessRegressor(
|
||||
kernel=kernel,
|
||||
n_restarts_optimizer=n_restarts_optimizer,
|
||||
random_state=0,
|
||||
).fit(X, y)
|
||||
lml = gp.log_marginal_likelihood(gp.kernel_.theta)
|
||||
assert lml > last_lml - np.finfo(np.float32).eps
|
||||
last_lml = lml
|
||||
|
||||
|
||||
@pytest.mark.parametrize("kernel", kernels)
|
||||
def test_y_normalization(kernel):
|
||||
"""
|
||||
Test normalization of the target values in GP
|
||||
|
||||
Fitting non-normalizing GP on normalized y and fitting normalizing GP
|
||||
on unnormalized y should yield identical results. Note that, here,
|
||||
'normalized y' refers to y that has been made zero mean and unit
|
||||
variance.
|
||||
|
||||
"""
|
||||
|
||||
y_mean = np.mean(y)
|
||||
y_std = np.std(y)
|
||||
y_norm = (y - y_mean) / y_std
|
||||
|
||||
# Fit non-normalizing GP on normalized y
|
||||
gpr = GaussianProcessRegressor(kernel=kernel)
|
||||
gpr.fit(X, y_norm)
|
||||
|
||||
# Fit normalizing GP on unnormalized y
|
||||
gpr_norm = GaussianProcessRegressor(kernel=kernel, normalize_y=True)
|
||||
gpr_norm.fit(X, y)
|
||||
|
||||
# Compare predicted mean, std-devs and covariances
|
||||
y_pred, y_pred_std = gpr.predict(X2, return_std=True)
|
||||
y_pred = y_pred * y_std + y_mean
|
||||
y_pred_std = y_pred_std * y_std
|
||||
y_pred_norm, y_pred_std_norm = gpr_norm.predict(X2, return_std=True)
|
||||
|
||||
assert_almost_equal(y_pred, y_pred_norm)
|
||||
assert_almost_equal(y_pred_std, y_pred_std_norm)
|
||||
|
||||
_, y_cov = gpr.predict(X2, return_cov=True)
|
||||
y_cov = y_cov * y_std**2
|
||||
_, y_cov_norm = gpr_norm.predict(X2, return_cov=True)
|
||||
|
||||
assert_almost_equal(y_cov, y_cov_norm)
|
||||
|
||||
|
||||
def test_large_variance_y():
|
||||
"""
|
||||
Here we test that, when normalize_y=True, our GP can produce a
|
||||
sensible fit to training data whose variance is significantly
|
||||
larger than unity. This test was made in response to issue #15612.
|
||||
|
||||
GP predictions are verified against predictions that were made
|
||||
using GPy which, here, is treated as the 'gold standard'. Note that we
|
||||
only investigate the RBF kernel here, as that is what was used in the
|
||||
GPy implementation.
|
||||
|
||||
The following code can be used to recreate the GPy data:
|
||||
|
||||
--------------------------------------------------------------------------
|
||||
import GPy
|
||||
|
||||
kernel_gpy = GPy.kern.RBF(input_dim=1, lengthscale=1.)
|
||||
gpy = GPy.models.GPRegression(X, np.vstack(y_large), kernel_gpy)
|
||||
gpy.optimize()
|
||||
y_pred_gpy, y_var_gpy = gpy.predict(X2)
|
||||
y_pred_std_gpy = np.sqrt(y_var_gpy)
|
||||
--------------------------------------------------------------------------
|
||||
"""
|
||||
|
||||
# Here we utilise a larger variance version of the training data
|
||||
y_large = 10 * y
|
||||
|
||||
# Standard GP with normalize_y=True
|
||||
RBF_params = {"length_scale": 1.0}
|
||||
kernel = RBF(**RBF_params)
|
||||
gpr = GaussianProcessRegressor(kernel=kernel, normalize_y=True)
|
||||
gpr.fit(X, y_large)
|
||||
y_pred, y_pred_std = gpr.predict(X2, return_std=True)
|
||||
|
||||
# 'Gold standard' mean predictions from GPy
|
||||
y_pred_gpy = np.array(
|
||||
[15.16918303, -27.98707845, -39.31636019, 14.52605515, 69.18503589]
|
||||
)
|
||||
|
||||
# 'Gold standard' std predictions from GPy
|
||||
y_pred_std_gpy = np.array(
|
||||
[7.78860962, 3.83179178, 0.63149951, 0.52745188, 0.86170042]
|
||||
)
|
||||
|
||||
# Based on numerical experiments, it's reasonable to expect our
|
||||
# GP's mean predictions to get within 7% of predictions of those
|
||||
# made by GPy.
|
||||
assert_allclose(y_pred, y_pred_gpy, rtol=0.07, atol=0)
|
||||
|
||||
# Based on numerical experiments, it's reasonable to expect our
|
||||
# GP's std predictions to get within 15% of predictions of those
|
||||
# made by GPy.
|
||||
assert_allclose(y_pred_std, y_pred_std_gpy, rtol=0.15, atol=0)
|
||||
|
||||
|
||||
def test_y_multioutput():
|
||||
# Test that GPR can deal with multi-dimensional target values
|
||||
y_2d = np.vstack((y, y * 2)).T
|
||||
|
||||
# Test for fixed kernel that first dimension of 2d GP equals the output
|
||||
# of 1d GP and that second dimension is twice as large
|
||||
kernel = RBF(length_scale=1.0)
|
||||
|
||||
gpr = GaussianProcessRegressor(kernel=kernel, optimizer=None, normalize_y=False)
|
||||
gpr.fit(X, y)
|
||||
|
||||
gpr_2d = GaussianProcessRegressor(kernel=kernel, optimizer=None, normalize_y=False)
|
||||
gpr_2d.fit(X, y_2d)
|
||||
|
||||
y_pred_1d, y_std_1d = gpr.predict(X2, return_std=True)
|
||||
y_pred_2d, y_std_2d = gpr_2d.predict(X2, return_std=True)
|
||||
_, y_cov_1d = gpr.predict(X2, return_cov=True)
|
||||
_, y_cov_2d = gpr_2d.predict(X2, return_cov=True)
|
||||
|
||||
assert_almost_equal(y_pred_1d, y_pred_2d[:, 0])
|
||||
assert_almost_equal(y_pred_1d, y_pred_2d[:, 1] / 2)
|
||||
|
||||
# Standard deviation and covariance do not depend on output
|
||||
for target in range(y_2d.shape[1]):
|
||||
assert_almost_equal(y_std_1d, y_std_2d[..., target])
|
||||
assert_almost_equal(y_cov_1d, y_cov_2d[..., target])
|
||||
|
||||
y_sample_1d = gpr.sample_y(X2, n_samples=10)
|
||||
y_sample_2d = gpr_2d.sample_y(X2, n_samples=10)
|
||||
|
||||
assert y_sample_1d.shape == (5, 10)
|
||||
assert y_sample_2d.shape == (5, 2, 10)
|
||||
# Only the first target will be equal
|
||||
assert_almost_equal(y_sample_1d, y_sample_2d[:, 0, :])
|
||||
|
||||
# Test hyperparameter optimization
|
||||
for kernel in kernels:
|
||||
gpr = GaussianProcessRegressor(kernel=kernel, normalize_y=True)
|
||||
gpr.fit(X, y)
|
||||
|
||||
gpr_2d = GaussianProcessRegressor(kernel=kernel, normalize_y=True)
|
||||
gpr_2d.fit(X, np.vstack((y, y)).T)
|
||||
|
||||
assert_almost_equal(gpr.kernel_.theta, gpr_2d.kernel_.theta, 4)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("kernel", non_fixed_kernels)
|
||||
def test_custom_optimizer(kernel):
|
||||
# Test that GPR can use externally defined optimizers.
|
||||
# Define a dummy optimizer that simply tests 50 random hyperparameters
|
||||
def optimizer(obj_func, initial_theta, bounds):
|
||||
rng = np.random.RandomState(0)
|
||||
theta_opt, func_min = initial_theta, obj_func(
|
||||
initial_theta, eval_gradient=False
|
||||
)
|
||||
for _ in range(50):
|
||||
theta = np.atleast_1d(
|
||||
rng.uniform(np.maximum(-2, bounds[:, 0]), np.minimum(1, bounds[:, 1]))
|
||||
)
|
||||
f = obj_func(theta, eval_gradient=False)
|
||||
if f < func_min:
|
||||
theta_opt, func_min = theta, f
|
||||
return theta_opt, func_min
|
||||
|
||||
gpr = GaussianProcessRegressor(kernel=kernel, optimizer=optimizer)
|
||||
gpr.fit(X, y)
|
||||
# Checks that optimizer improved marginal likelihood
|
||||
assert gpr.log_marginal_likelihood(gpr.kernel_.theta) > gpr.log_marginal_likelihood(
|
||||
gpr.kernel.theta
|
||||
)
|
||||
|
||||
|
||||
def test_gpr_correct_error_message():
|
||||
X = np.arange(12).reshape(6, -1)
|
||||
y = np.ones(6)
|
||||
kernel = DotProduct()
|
||||
gpr = GaussianProcessRegressor(kernel=kernel, alpha=0.0)
|
||||
message = (
|
||||
"The kernel, %s, is not returning a "
|
||||
"positive definite matrix. Try gradually increasing "
|
||||
"the 'alpha' parameter of your "
|
||||
"GaussianProcessRegressor estimator." % kernel
|
||||
)
|
||||
with pytest.raises(np.linalg.LinAlgError, match=re.escape(message)):
|
||||
gpr.fit(X, y)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("kernel", kernels)
|
||||
def test_duplicate_input(kernel):
|
||||
# Test GPR can handle two different output-values for the same input.
|
||||
gpr_equal_inputs = GaussianProcessRegressor(kernel=kernel, alpha=1e-2)
|
||||
gpr_similar_inputs = GaussianProcessRegressor(kernel=kernel, alpha=1e-2)
|
||||
|
||||
X_ = np.vstack((X, X[0]))
|
||||
y_ = np.hstack((y, y[0] + 1))
|
||||
gpr_equal_inputs.fit(X_, y_)
|
||||
|
||||
X_ = np.vstack((X, X[0] + 1e-15))
|
||||
y_ = np.hstack((y, y[0] + 1))
|
||||
gpr_similar_inputs.fit(X_, y_)
|
||||
|
||||
X_test = np.linspace(0, 10, 100)[:, None]
|
||||
y_pred_equal, y_std_equal = gpr_equal_inputs.predict(X_test, return_std=True)
|
||||
y_pred_similar, y_std_similar = gpr_similar_inputs.predict(X_test, return_std=True)
|
||||
|
||||
assert_almost_equal(y_pred_equal, y_pred_similar)
|
||||
assert_almost_equal(y_std_equal, y_std_similar)
|
||||
|
||||
|
||||
def test_no_fit_default_predict():
|
||||
# Test that GPR predictions without fit do not break by default.
|
||||
default_kernel = C(1.0, constant_value_bounds="fixed") * RBF(
|
||||
1.0, length_scale_bounds="fixed"
|
||||
)
|
||||
gpr1 = GaussianProcessRegressor()
|
||||
_, y_std1 = gpr1.predict(X, return_std=True)
|
||||
_, y_cov1 = gpr1.predict(X, return_cov=True)
|
||||
|
||||
gpr2 = GaussianProcessRegressor(kernel=default_kernel)
|
||||
_, y_std2 = gpr2.predict(X, return_std=True)
|
||||
_, y_cov2 = gpr2.predict(X, return_cov=True)
|
||||
|
||||
assert_array_almost_equal(y_std1, y_std2)
|
||||
assert_array_almost_equal(y_cov1, y_cov2)
|
||||
|
||||
|
||||
def test_warning_bounds():
|
||||
kernel = RBF(length_scale_bounds=[1e-5, 1e-3])
|
||||
gpr = GaussianProcessRegressor(kernel=kernel)
|
||||
warning_message = (
|
||||
"The optimal value found for dimension 0 of parameter "
|
||||
"length_scale is close to the specified upper bound "
|
||||
"0.001. Increasing the bound and calling fit again may "
|
||||
"find a better value."
|
||||
)
|
||||
with pytest.warns(ConvergenceWarning, match=warning_message):
|
||||
gpr.fit(X, y)
|
||||
|
||||
kernel_sum = WhiteKernel(noise_level_bounds=[1e-5, 1e-3]) + RBF(
|
||||
length_scale_bounds=[1e3, 1e5]
|
||||
)
|
||||
gpr_sum = GaussianProcessRegressor(kernel=kernel_sum)
|
||||
with warnings.catch_warnings(record=True) as record:
|
||||
warnings.simplefilter("always")
|
||||
gpr_sum.fit(X, y)
|
||||
|
||||
assert len(record) == 2
|
||||
|
||||
assert issubclass(record[0].category, ConvergenceWarning)
|
||||
assert (
|
||||
record[0].message.args[0] == "The optimal value found for "
|
||||
"dimension 0 of parameter "
|
||||
"k1__noise_level is close to the "
|
||||
"specified upper bound 0.001. "
|
||||
"Increasing the bound and calling "
|
||||
"fit again may find a better value."
|
||||
)
|
||||
|
||||
assert issubclass(record[1].category, ConvergenceWarning)
|
||||
assert (
|
||||
record[1].message.args[0] == "The optimal value found for "
|
||||
"dimension 0 of parameter "
|
||||
"k2__length_scale is close to the "
|
||||
"specified lower bound 1000.0. "
|
||||
"Decreasing the bound and calling "
|
||||
"fit again may find a better value."
|
||||
)
|
||||
|
||||
X_tile = np.tile(X, 2)
|
||||
kernel_dims = RBF(length_scale=[1.0, 2.0], length_scale_bounds=[1e1, 1e2])
|
||||
gpr_dims = GaussianProcessRegressor(kernel=kernel_dims)
|
||||
|
||||
with warnings.catch_warnings(record=True) as record:
|
||||
warnings.simplefilter("always")
|
||||
gpr_dims.fit(X_tile, y)
|
||||
|
||||
assert len(record) == 2
|
||||
|
||||
assert issubclass(record[0].category, ConvergenceWarning)
|
||||
assert (
|
||||
record[0].message.args[0] == "The optimal value found for "
|
||||
"dimension 0 of parameter "
|
||||
"length_scale is close to the "
|
||||
"specified lower bound 10.0. "
|
||||
"Decreasing the bound and calling "
|
||||
"fit again may find a better value."
|
||||
)
|
||||
|
||||
assert issubclass(record[1].category, ConvergenceWarning)
|
||||
assert (
|
||||
record[1].message.args[0] == "The optimal value found for "
|
||||
"dimension 1 of parameter "
|
||||
"length_scale is close to the "
|
||||
"specified lower bound 10.0. "
|
||||
"Decreasing the bound and calling "
|
||||
"fit again may find a better value."
|
||||
)
|
||||
|
||||
|
||||
def test_bound_check_fixed_hyperparameter():
|
||||
# Regression test for issue #17943
|
||||
# Check that having a hyperparameter with fixed bounds doesn't cause an
|
||||
# error
|
||||
k1 = 50.0**2 * RBF(length_scale=50.0) # long term smooth rising trend
|
||||
k2 = ExpSineSquared(
|
||||
length_scale=1.0, periodicity=1.0, periodicity_bounds="fixed"
|
||||
) # seasonal component
|
||||
kernel = k1 + k2
|
||||
GaussianProcessRegressor(kernel=kernel).fit(X, y)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("kernel", kernels)
|
||||
def test_constant_target(kernel):
|
||||
"""Check that the std. dev. is affected to 1 when normalizing a constant
|
||||
target.
|
||||
Non-regression test for:
|
||||
https://github.com/scikit-learn/scikit-learn/issues/18318
|
||||
NaN values were assigned to the target when scaling due to a null std. dev. with a
|
||||
constant target.
|
||||
"""
|
||||
y_constant = np.ones(X.shape[0], dtype=np.float64)
|
||||
|
||||
gpr = GaussianProcessRegressor(kernel=kernel, normalize_y=True)
|
||||
gpr.fit(X, y_constant)
|
||||
assert gpr._y_train_std == pytest.approx(1.0)
|
||||
|
||||
y_pred, y_cov = gpr.predict(X, return_cov=True)
|
||||
assert_allclose(y_pred, y_constant)
|
||||
# set atol because we compare to zero
|
||||
assert_allclose(np.diag(y_cov), 0.0, atol=1e-9)
|
||||
|
||||
# Test multi-target data
|
||||
n_samples, n_targets = X.shape[0], 2
|
||||
rng = np.random.RandomState(0)
|
||||
y = np.concatenate(
|
||||
[
|
||||
rng.normal(size=(n_samples, 1)), # non-constant target
|
||||
np.full(shape=(n_samples, 1), fill_value=2), # constant target
|
||||
],
|
||||
axis=1,
|
||||
)
|
||||
|
||||
gpr.fit(X, y)
|
||||
Y_pred, Y_cov = gpr.predict(X, return_cov=True)
|
||||
|
||||
assert_allclose(Y_pred[:, 1], 2)
|
||||
assert_allclose(np.diag(Y_cov[..., 1]), 0.0, atol=1e-9)
|
||||
|
||||
assert Y_pred.shape == (n_samples, n_targets)
|
||||
assert Y_cov.shape == (n_samples, n_samples, n_targets)
|
||||
|
||||
|
||||
def test_gpr_consistency_std_cov_non_invertible_kernel():
|
||||
"""Check the consistency between the returned std. dev. and the covariance.
|
||||
Non-regression test for:
|
||||
https://github.com/scikit-learn/scikit-learn/issues/19936
|
||||
Inconsistencies were observed when the kernel cannot be inverted (or
|
||||
is numerically unstable).
|
||||
"""
|
||||
kernel = C(8.98576054e05, (1e-12, 1e12)) * RBF(
|
||||
[5.91326520e02, 1.32584051e03], (1e-12, 1e12)
|
||||
) + WhiteKernel(noise_level=1e-5)
|
||||
gpr = GaussianProcessRegressor(kernel=kernel, alpha=0, optimizer=None)
|
||||
X_train = np.array(
|
||||
[
|
||||
[0.0, 0.0],
|
||||
[1.54919334, -0.77459667],
|
||||
[-1.54919334, 0.0],
|
||||
[0.0, -1.54919334],
|
||||
[0.77459667, 0.77459667],
|
||||
[-0.77459667, 1.54919334],
|
||||
]
|
||||
)
|
||||
y_train = np.array(
|
||||
[
|
||||
[-2.14882017e-10],
|
||||
[-4.66975823e00],
|
||||
[4.01823986e00],
|
||||
[-1.30303674e00],
|
||||
[-1.35760156e00],
|
||||
[3.31215668e00],
|
||||
]
|
||||
)
|
||||
gpr.fit(X_train, y_train)
|
||||
X_test = np.array(
|
||||
[
|
||||
[-1.93649167, -1.93649167],
|
||||
[1.93649167, -1.93649167],
|
||||
[-1.93649167, 1.93649167],
|
||||
[1.93649167, 1.93649167],
|
||||
]
|
||||
)
|
||||
pred1, std = gpr.predict(X_test, return_std=True)
|
||||
pred2, cov = gpr.predict(X_test, return_cov=True)
|
||||
assert_allclose(std, np.sqrt(np.diagonal(cov)), rtol=1e-5)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"params, TypeError, err_msg",
|
||||
[
|
||||
(
|
||||
{"alpha": np.zeros(100)},
|
||||
ValueError,
|
||||
"alpha must be a scalar or an array with same number of entries as y",
|
||||
),
|
||||
(
|
||||
{
|
||||
"kernel": WhiteKernel(noise_level_bounds=(-np.inf, np.inf)),
|
||||
"n_restarts_optimizer": 2,
|
||||
},
|
||||
ValueError,
|
||||
"requires that all bounds are finite",
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_gpr_fit_error(params, error_type, err_msg):
|
||||
"""Check that expected error are raised during fit."""
|
||||
gpr = GaussianProcessRegressor(**params)
|
||||
with pytest.raises(error_type, match=err_msg):
|
||||
gpr.fit(X, y)
|
||||
|
||||
|
||||
def test_gpr_lml_error():
|
||||
"""Check that we raise the proper error in the LML method."""
|
||||
gpr = GaussianProcessRegressor(kernel=RBF()).fit(X, y)
|
||||
|
||||
err_msg = "Gradient can only be evaluated for theta!=None"
|
||||
with pytest.raises(ValueError, match=err_msg):
|
||||
gpr.log_marginal_likelihood(eval_gradient=True)
|
||||
|
||||
|
||||
def test_gpr_predict_error():
|
||||
"""Check that we raise the proper error during predict."""
|
||||
gpr = GaussianProcessRegressor(kernel=RBF()).fit(X, y)
|
||||
|
||||
err_msg = "At most one of return_std or return_cov can be requested."
|
||||
with pytest.raises(RuntimeError, match=err_msg):
|
||||
gpr.predict(X, return_cov=True, return_std=True)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("normalize_y", [True, False])
|
||||
@pytest.mark.parametrize("n_targets", [None, 1, 10])
|
||||
def test_predict_shapes(normalize_y, n_targets):
|
||||
"""Check the shapes of y_mean, y_std, and y_cov in single-output
|
||||
(n_targets=None) and multi-output settings, including the edge case when
|
||||
n_targets=1, where the sklearn convention is to squeeze the predictions.
|
||||
|
||||
Non-regression test for:
|
||||
https://github.com/scikit-learn/scikit-learn/issues/17394
|
||||
https://github.com/scikit-learn/scikit-learn/issues/18065
|
||||
https://github.com/scikit-learn/scikit-learn/issues/22174
|
||||
"""
|
||||
rng = np.random.RandomState(1234)
|
||||
|
||||
n_features, n_samples_train, n_samples_test = 6, 9, 7
|
||||
|
||||
y_train_shape = (n_samples_train,)
|
||||
if n_targets is not None:
|
||||
y_train_shape = y_train_shape + (n_targets,)
|
||||
|
||||
# By convention single-output data is squeezed upon prediction
|
||||
y_test_shape = (n_samples_test,)
|
||||
if n_targets is not None and n_targets > 1:
|
||||
y_test_shape = y_test_shape + (n_targets,)
|
||||
|
||||
X_train = rng.randn(n_samples_train, n_features)
|
||||
X_test = rng.randn(n_samples_test, n_features)
|
||||
y_train = rng.randn(*y_train_shape)
|
||||
|
||||
model = GaussianProcessRegressor(normalize_y=normalize_y)
|
||||
model.fit(X_train, y_train)
|
||||
|
||||
y_pred, y_std = model.predict(X_test, return_std=True)
|
||||
_, y_cov = model.predict(X_test, return_cov=True)
|
||||
|
||||
assert y_pred.shape == y_test_shape
|
||||
assert y_std.shape == y_test_shape
|
||||
assert y_cov.shape == (n_samples_test,) + y_test_shape
|
||||
|
||||
|
||||
@pytest.mark.parametrize("normalize_y", [True, False])
|
||||
@pytest.mark.parametrize("n_targets", [None, 1, 10])
|
||||
def test_sample_y_shapes(normalize_y, n_targets):
|
||||
"""Check the shapes of y_samples in single-output (n_targets=0) and
|
||||
multi-output settings, including the edge case when n_targets=1, where the
|
||||
sklearn convention is to squeeze the predictions.
|
||||
|
||||
Non-regression test for:
|
||||
https://github.com/scikit-learn/scikit-learn/issues/22175
|
||||
"""
|
||||
rng = np.random.RandomState(1234)
|
||||
|
||||
n_features, n_samples_train = 6, 9
|
||||
# Number of spatial locations to predict at
|
||||
n_samples_X_test = 7
|
||||
# Number of sample predictions per test point
|
||||
n_samples_y_test = 5
|
||||
|
||||
y_train_shape = (n_samples_train,)
|
||||
if n_targets is not None:
|
||||
y_train_shape = y_train_shape + (n_targets,)
|
||||
|
||||
# By convention single-output data is squeezed upon prediction
|
||||
if n_targets is not None and n_targets > 1:
|
||||
y_test_shape = (n_samples_X_test, n_targets, n_samples_y_test)
|
||||
else:
|
||||
y_test_shape = (n_samples_X_test, n_samples_y_test)
|
||||
|
||||
X_train = rng.randn(n_samples_train, n_features)
|
||||
X_test = rng.randn(n_samples_X_test, n_features)
|
||||
y_train = rng.randn(*y_train_shape)
|
||||
|
||||
model = GaussianProcessRegressor(normalize_y=normalize_y)
|
||||
|
||||
# FIXME: before fitting, the estimator does not have information regarding
|
||||
# the number of targets and defaults to 1. This is inconsistent with the shape
|
||||
# provided after `fit`. This assert should be made once the following issue
|
||||
# is fixed:
|
||||
# https://github.com/scikit-learn/scikit-learn/issues/22430
|
||||
# y_samples = model.sample_y(X_test, n_samples=n_samples_y_test)
|
||||
# assert y_samples.shape == y_test_shape
|
||||
|
||||
model.fit(X_train, y_train)
|
||||
|
||||
y_samples = model.sample_y(X_test, n_samples=n_samples_y_test)
|
||||
assert y_samples.shape == y_test_shape
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n_targets", [None, 1, 2, 3])
|
||||
@pytest.mark.parametrize("n_samples", [1, 5])
|
||||
def test_sample_y_shape_with_prior(n_targets, n_samples):
|
||||
"""Check the output shape of `sample_y` is consistent before and after `fit`."""
|
||||
rng = np.random.RandomState(1024)
|
||||
|
||||
X = rng.randn(10, 3)
|
||||
y = rng.randn(10, n_targets if n_targets is not None else 1)
|
||||
|
||||
model = GaussianProcessRegressor(n_targets=n_targets)
|
||||
shape_before_fit = model.sample_y(X, n_samples=n_samples).shape
|
||||
model.fit(X, y)
|
||||
shape_after_fit = model.sample_y(X, n_samples=n_samples).shape
|
||||
assert shape_before_fit == shape_after_fit
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n_targets", [None, 1, 2, 3])
|
||||
def test_predict_shape_with_prior(n_targets):
|
||||
"""Check the output shape of `predict` with prior distribution."""
|
||||
rng = np.random.RandomState(1024)
|
||||
|
||||
n_sample = 10
|
||||
X = rng.randn(n_sample, 3)
|
||||
y = rng.randn(n_sample, n_targets if n_targets is not None else 1)
|
||||
|
||||
model = GaussianProcessRegressor(n_targets=n_targets)
|
||||
mean_prior, cov_prior = model.predict(X, return_cov=True)
|
||||
_, std_prior = model.predict(X, return_std=True)
|
||||
|
||||
model.fit(X, y)
|
||||
mean_post, cov_post = model.predict(X, return_cov=True)
|
||||
_, std_post = model.predict(X, return_std=True)
|
||||
|
||||
assert mean_prior.shape == mean_post.shape
|
||||
assert cov_prior.shape == cov_post.shape
|
||||
assert std_prior.shape == std_post.shape
|
||||
|
||||
|
||||
def test_n_targets_error():
|
||||
"""Check that an error is raised when the number of targets seen at fit is
|
||||
inconsistent with n_targets.
|
||||
"""
|
||||
rng = np.random.RandomState(0)
|
||||
X = rng.randn(10, 3)
|
||||
y = rng.randn(10, 2)
|
||||
|
||||
model = GaussianProcessRegressor(n_targets=1)
|
||||
with pytest.raises(ValueError, match="The number of targets seen in `y`"):
|
||||
model.fit(X, y)
|
||||
|
||||
|
||||
class CustomKernel(C):
|
||||
"""
|
||||
A custom kernel that has a diag method that returns the first column of the
|
||||
input matrix X. This is a helper for the test to check that the input
|
||||
matrix X is not mutated.
|
||||
"""
|
||||
|
||||
def diag(self, X):
|
||||
return X[:, 0]
|
||||
|
||||
|
||||
def test_gpr_predict_input_not_modified():
|
||||
"""
|
||||
Check that the input X is not modified by the predict method of the
|
||||
GaussianProcessRegressor when setting return_std=True.
|
||||
|
||||
Non-regression test for:
|
||||
https://github.com/scikit-learn/scikit-learn/issues/24340
|
||||
"""
|
||||
gpr = GaussianProcessRegressor(kernel=CustomKernel()).fit(X, y)
|
||||
|
||||
X2_copy = np.copy(X2)
|
||||
_, _ = gpr.predict(X2, return_std=True)
|
||||
|
||||
assert_allclose(X2, X2_copy)
|
||||
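A brief sketch of the prior-prediction behaviour exercised by the shape tests above (illustrative values): before ``fit``, ``predict`` returns the zero-mean prior, and ``n_targets`` fixes the output width so shapes agree before and after fitting.

import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor

rng = np.random.RandomState(0)
X_query = rng.randn(4, 3)
gpr = GaussianProcessRegressor(n_targets=2)
y_mean, y_std = gpr.predict(X_query, return_std=True)  # zero-mean prior
print(y_mean.shape, y_std.shape)  # (4, 2) (4, 2)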
@ -0,0 +1,388 @@
|
||||
"""Testing for kernels for Gaussian processes."""
|
||||
|
||||
# Author: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
|
||||
# License: BSD 3 clause
|
||||
|
||||
from inspect import signature
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from sklearn.base import clone
|
||||
from sklearn.gaussian_process.kernels import (
|
||||
RBF,
|
||||
CompoundKernel,
|
||||
ConstantKernel,
|
||||
DotProduct,
|
||||
Exponentiation,
|
||||
ExpSineSquared,
|
||||
KernelOperator,
|
||||
Matern,
|
||||
PairwiseKernel,
|
||||
RationalQuadratic,
|
||||
WhiteKernel,
|
||||
_approx_fprime,
|
||||
)
|
||||
from sklearn.metrics.pairwise import (
|
||||
PAIRWISE_KERNEL_FUNCTIONS,
|
||||
euclidean_distances,
|
||||
pairwise_kernels,
|
||||
)
|
||||
from sklearn.utils._testing import (
|
||||
assert_allclose,
|
||||
assert_almost_equal,
|
||||
assert_array_almost_equal,
|
||||
assert_array_equal,
|
||||
)
|
||||
|
||||
X = np.random.RandomState(0).normal(0, 1, (5, 2))
|
||||
Y = np.random.RandomState(0).normal(0, 1, (6, 2))
|
||||
|
||||
kernel_rbf_plus_white = RBF(length_scale=2.0) + WhiteKernel(noise_level=3.0)
|
||||
kernels = [
|
||||
RBF(length_scale=2.0),
|
||||
RBF(length_scale_bounds=(0.5, 2.0)),
|
||||
ConstantKernel(constant_value=10.0),
|
||||
2.0 * RBF(length_scale=0.33, length_scale_bounds="fixed"),
|
||||
2.0 * RBF(length_scale=0.5),
|
||||
kernel_rbf_plus_white,
|
||||
2.0 * RBF(length_scale=[0.5, 2.0]),
|
||||
2.0 * Matern(length_scale=0.33, length_scale_bounds="fixed"),
|
||||
2.0 * Matern(length_scale=0.5, nu=0.5),
|
||||
2.0 * Matern(length_scale=1.5, nu=1.5),
|
||||
2.0 * Matern(length_scale=2.5, nu=2.5),
|
||||
2.0 * Matern(length_scale=[0.5, 2.0], nu=0.5),
|
||||
3.0 * Matern(length_scale=[2.0, 0.5], nu=1.5),
|
||||
4.0 * Matern(length_scale=[0.5, 0.5], nu=2.5),
|
||||
RationalQuadratic(length_scale=0.5, alpha=1.5),
|
||||
ExpSineSquared(length_scale=0.5, periodicity=1.5),
|
||||
DotProduct(sigma_0=2.0),
|
||||
DotProduct(sigma_0=2.0) ** 2,
|
||||
RBF(length_scale=[2.0]),
|
||||
Matern(length_scale=[2.0]),
|
||||
]
|
||||
for metric in PAIRWISE_KERNEL_FUNCTIONS:
|
||||
if metric in ["additive_chi2", "chi2"]:
|
||||
continue
|
||||
kernels.append(PairwiseKernel(gamma=1.0, metric=metric))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("kernel", kernels)
|
||||
def test_kernel_gradient(kernel):
|
||||
# Compare analytic and numeric gradient of kernels.
|
||||
K, K_gradient = kernel(X, eval_gradient=True)
|
||||
|
||||
assert K_gradient.shape[0] == X.shape[0]
|
||||
assert K_gradient.shape[1] == X.shape[0]
|
||||
assert K_gradient.shape[2] == kernel.theta.shape[0]
|
||||
|
||||
def eval_kernel_for_theta(theta):
|
||||
kernel_clone = kernel.clone_with_theta(theta)
|
||||
K = kernel_clone(X, eval_gradient=False)
|
||||
return K
|
||||
|
||||
K_gradient_approx = _approx_fprime(kernel.theta, eval_kernel_for_theta, 1e-10)
|
||||
|
||||
assert_almost_equal(K_gradient, K_gradient_approx, 4)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"kernel",
|
||||
[
|
||||
kernel
|
||||
for kernel in kernels
|
||||
# skip non-basic kernels
|
||||
if not (isinstance(kernel, (KernelOperator, Exponentiation)))
|
||||
],
|
||||
)
|
||||
def test_kernel_theta(kernel):
|
||||
# Check that parameter vector theta of kernel is set correctly.
|
||||
theta = kernel.theta
|
||||
_, K_gradient = kernel(X, eval_gradient=True)
|
||||
|
||||
# Determine kernel parameters that contribute to theta
|
||||
init_sign = signature(kernel.__class__.__init__).parameters.values()
|
||||
args = [p.name for p in init_sign if p.name != "self"]
|
||||
theta_vars = map(
|
||||
lambda s: s[0 : -len("_bounds")], filter(lambda s: s.endswith("_bounds"), args)
|
||||
)
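# e.g. the constructor argument "length_scale_bounds" maps to the tunable
# hyperparameter "length_scale"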
|
||||
assert set(hyperparameter.name for hyperparameter in kernel.hyperparameters) == set(
|
||||
theta_vars
|
||||
)
|
||||
|
||||
# Check that values returned in theta are consistent with
|
||||
# hyperparameter values (being their logarithms)
|
||||
for i, hyperparameter in enumerate(kernel.hyperparameters):
|
||||
assert theta[i] == np.log(getattr(kernel, hyperparameter.name))
|
||||
|
||||
# Fixed kernel parameters must be excluded from theta and gradient.
|
||||
for i, hyperparameter in enumerate(kernel.hyperparameters):
|
||||
# create copy with certain hyperparameter fixed
|
||||
params = kernel.get_params()
|
||||
params[hyperparameter.name + "_bounds"] = "fixed"
|
||||
kernel_class = kernel.__class__
|
||||
new_kernel = kernel_class(**params)
|
||||
# Check that theta and K_gradient are identical with the fixed
|
||||
# dimension left out
|
||||
_, K_gradient_new = new_kernel(X, eval_gradient=True)
|
||||
assert theta.shape[0] == new_kernel.theta.shape[0] + 1
|
||||
assert K_gradient.shape[2] == K_gradient_new.shape[2] + 1
|
||||
if i > 0:
|
||||
assert theta[:i] == new_kernel.theta[:i]
|
||||
assert_array_equal(K_gradient[..., :i], K_gradient_new[..., :i])
|
||||
if i + 1 < len(kernel.hyperparameters):
|
||||
assert theta[i + 1 :] == new_kernel.theta[i:]
|
||||
assert_array_equal(K_gradient[..., i + 1 :], K_gradient_new[..., i:])
|
||||
|
||||
# Check that values of theta are modified correctly
|
||||
for i, hyperparameter in enumerate(kernel.hyperparameters):
|
||||
theta[i] = np.log(42)
|
||||
kernel.theta = theta
|
||||
assert_almost_equal(getattr(kernel, hyperparameter.name), 42)
|
||||
|
||||
setattr(kernel, hyperparameter.name, 43)
|
||||
assert_almost_equal(kernel.theta[i], np.log(43))
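# Aside (illustrative, not one of the checks above): theta stores the natural
# log of each non-fixed hyperparameter, so exponentiating recovers the value.
assert_allclose(np.exp(RBF(length_scale=2.0).theta), [2.0])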


@pytest.mark.parametrize(
    "kernel",
    [
        kernel
        for kernel in kernels
        # Identity is not satisfied on diagonal
        if kernel != kernel_rbf_plus_white
    ],
)
def test_auto_vs_cross(kernel):
    # Auto-correlation and cross-correlation should be consistent.
    K_auto = kernel(X)
    K_cross = kernel(X, X)
    assert_almost_equal(K_auto, K_cross, 5)


@pytest.mark.parametrize("kernel", kernels)
def test_kernel_diag(kernel):
    # Test that diag method of kernel returns consistent results.
    K_call_diag = np.diag(kernel(X))
    K_diag = kernel.diag(X)
    assert_almost_equal(K_call_diag, K_diag, 5)


def test_kernel_operator_commutative():
    # Adding kernels and multiplying kernels should be commutative.
    # Check addition
    assert_almost_equal((RBF(2.0) + 1.0)(X), (1.0 + RBF(2.0))(X))

    # Check multiplication
    assert_almost_equal((3.0 * RBF(2.0))(X), (RBF(2.0) * 3.0)(X))
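

# An illustrative sketch of what the scalar operands above construct, not one
# of the original tests: adding or multiplying a kernel by a plain float wraps
# the float in a ConstantKernel, so the commutativity checks compare genuine
# kernel operators rather than ad-hoc arithmetic.
def _sketch_scalar_operands_become_constant_kernels():
    k = RBF(2.0) + 1.0
    assert isinstance(k, KernelOperator)  # concretely, a Sum kernel
    assert isinstance(k.k1, RBF)
    assert_almost_equal(k.k2.constant_value, 1.0)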


def test_kernel_anisotropic():
    # Anisotropic kernel should be consistent with isotropic kernels.
    kernel = 3.0 * RBF([0.5, 2.0])

    K = kernel(X)
    X1 = np.array(X)
    X1[:, 0] *= 4
    K1 = 3.0 * RBF(2.0)(X1)
    assert_almost_equal(K, K1)

    X2 = np.array(X)
    X2[:, 1] /= 4
    K2 = 3.0 * RBF(0.5)(X2)
    assert_almost_equal(K, K2)

    # Check getting and setting via theta
    kernel.theta = kernel.theta + np.log(2)
    assert_array_equal(kernel.theta, np.log([6.0, 1.0, 4.0]))
    assert_array_equal(kernel.k2.length_scale, [1.0, 4.0])
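

# An illustrative worked example of the anisotropic RBF used above, not one
# of the original tests: with one length scale per feature,
#     k(x, z) = exp(-0.5 * sum_d ((x_d - z_d) / l_d) ** 2),
# which is why rescaling a column by the ratio of length scales reduces the
# kernel to an isotropic one. The toy points below are assumptions.
def _sketch_anisotropic_rbf_formula():
    length_scale = np.array([0.5, 2.0])
    x = np.array([[0.0, 0.0]])
    z = np.array([[1.0, 1.0]])
    expected = np.exp(-0.5 * np.sum(((x - z) / length_scale) ** 2))
    assert_almost_equal(RBF(length_scale=length_scale)(x, z)[0, 0], expected)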


@pytest.mark.parametrize(
    "kernel", [kernel for kernel in kernels if kernel.is_stationary()]
)
def test_kernel_stationary(kernel):
    # Test stationarity of kernels.
    K = kernel(X, X + 1)
    assert_almost_equal(K[0, 0], np.diag(K))
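

# An illustrative sketch of the stationarity property tested above, not one
# of the original tests: a stationary kernel depends only on x - z, so
# translating both inputs by the same offset leaves the kernel matrix
# unchanged. The offset value is an assumption of this sketch.
def _sketch_translation_invariance():
    offset = 3.7
    assert_array_almost_equal(RBF(1.0)(X, Y), RBF(1.0)(X + offset, Y + offset))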


@pytest.mark.parametrize("kernel", kernels)
def test_kernel_input_type(kernel):
    # Test whether the kernel is defined for vector inputs or structured data
    if isinstance(kernel, Exponentiation):
        assert kernel.requires_vector_input == kernel.kernel.requires_vector_input
    if isinstance(kernel, KernelOperator):
        assert kernel.requires_vector_input == (
            kernel.k1.requires_vector_input or kernel.k2.requires_vector_input
        )


def test_compound_kernel_input_type():
    kernel = CompoundKernel([WhiteKernel(noise_level=3.0)])
    assert not kernel.requires_vector_input

    kernel = CompoundKernel([WhiteKernel(noise_level=3.0), RBF(length_scale=2.0)])
    assert kernel.requires_vector_input


def check_hyperparameters_equal(kernel1, kernel2):
    # Check that hyperparameters of two kernels are equal
    for attr in set(dir(kernel1) + dir(kernel2)):
        if attr.startswith("hyperparameter_"):
            attr_value1 = getattr(kernel1, attr)
            attr_value2 = getattr(kernel2, attr)
            assert attr_value1 == attr_value2


@pytest.mark.parametrize("kernel", kernels)
def test_kernel_clone(kernel):
    # Test that sklearn's clone works correctly on kernels.
    kernel_cloned = clone(kernel)

    # XXX: Should this be fixed?
    # This differs from sklearn's estimator equality check.
    assert kernel == kernel_cloned
    assert id(kernel) != id(kernel_cloned)

    # Check that all constructor parameters are equal.
    assert kernel.get_params() == kernel_cloned.get_params()

    # Check that all hyperparameters are equal.
    check_hyperparameters_equal(kernel, kernel_cloned)


@pytest.mark.parametrize("kernel", kernels)
def test_kernel_clone_after_set_params(kernel):
    # This test verifies that using set_params does not break clone on
    # kernels. This used to break because, in kernels such as the RBF,
    # non-trivial logic that modified the length scale used to live in the
    # constructor. See
    # https://github.com/scikit-learn/scikit-learn/issues/6961 for more
    # details.
    bounds = (1e-5, 1e5)
    kernel_cloned = clone(kernel)
    params = kernel.get_params()
    # RationalQuadratic kernel is isotropic.
    isotropic_kernels = (ExpSineSquared, RationalQuadratic)
    if "length_scale" in params and not isinstance(kernel, isotropic_kernels):
        length_scale = params["length_scale"]
        if np.iterable(length_scale):
            # XXX unreached code as of v0.22
            params["length_scale"] = length_scale[0]
            params["length_scale_bounds"] = bounds
        else:
            params["length_scale"] = [length_scale] * 2
            params["length_scale_bounds"] = bounds * 2
        kernel_cloned.set_params(**params)
        kernel_cloned_clone = clone(kernel_cloned)
        assert kernel_cloned_clone.get_params() == kernel_cloned.get_params()
        assert id(kernel_cloned_clone) != id(kernel_cloned)
        check_hyperparameters_equal(kernel_cloned, kernel_cloned_clone)


def test_matern_kernel():
    # Test consistency of Matern kernel for special values of nu.
    K = Matern(nu=1.5, length_scale=1.0)(X)
    # the diagonal elements of a Matern kernel are 1
    assert_array_almost_equal(np.diag(K), np.ones(X.shape[0]))
    # Matern kernel for nu==0.5 is equal to the absolute exponential kernel
    K_absexp = np.exp(-euclidean_distances(X, X, squared=False))
    K = Matern(nu=0.5, length_scale=1.0)(X)
    assert_array_almost_equal(K, K_absexp)
    # Matern kernel with nu==inf is equal to the RBF kernel
    K_rbf = RBF(length_scale=1.0)(X)
    K = Matern(nu=np.inf, length_scale=1.0)(X)
    assert_array_almost_equal(K, K_rbf)
    assert_allclose(K, K_rbf)
    # test that special cases of the Matern kernel (nu in [0.5, 1.5, 2.5])
    # give nearly identical results to the general case for nu in
    # [0.5 + tiny, 1.5 + tiny, 2.5 + tiny]
    tiny = 1e-10
    for nu in [0.5, 1.5, 2.5]:
        K1 = Matern(nu=nu, length_scale=1.0)(X)
        K2 = Matern(nu=nu + tiny, length_scale=1.0)(X)
        assert_array_almost_equal(K1, K2)
    # test that a large nu is close to the RBF kernel
    large = 100
    K1 = Matern(nu=large, length_scale=1.0)(X)
    K2 = RBF(length_scale=1.0)(X)
    assert_array_almost_equal(K1, K2, decimal=2)
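

# An illustrative worked example for one of the special cases above, not one
# of the original tests: for nu == 1.5 the Matern kernel has the closed form
#     k(d) = (1 + sqrt(3) * d / l) * exp(-sqrt(3) * d / l),
# with d the Euclidean distance and l the length scale (taken as 1.0 here).
def _sketch_matern_nu_1_5_closed_form():
    d = euclidean_distances(X, X)
    expected = (1.0 + np.sqrt(3) * d) * np.exp(-np.sqrt(3) * d)
    assert_array_almost_equal(Matern(nu=1.5, length_scale=1.0)(X), expected)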


@pytest.mark.parametrize("kernel", kernels)
def test_kernel_versus_pairwise(kernel):
    # Check that GP kernels can also be used as pairwise kernels.

    # Test auto-kernel
    if kernel != kernel_rbf_plus_white:
        # For WhiteKernel: k(X) != k(X, X). This is assumed by
        # pairwise_kernels
        K1 = kernel(X)
        K2 = pairwise_kernels(X, metric=kernel)
        assert_array_almost_equal(K1, K2)

    # Test cross-kernel
    K1 = kernel(X, Y)
    K2 = pairwise_kernels(X, Y, metric=kernel)
    assert_array_almost_equal(K1, K2)


@pytest.mark.parametrize("kernel", kernels)
def test_set_get_params(kernel):
    # Check that set_params()/get_params() is consistent with kernel.theta.

    # Test get_params()
    index = 0
    params = kernel.get_params()
    for hyperparameter in kernel.hyperparameters:
        if isinstance("string", type(hyperparameter.bounds)):
            if hyperparameter.bounds == "fixed":
                continue
        size = hyperparameter.n_elements
        if size > 1:  # anisotropic kernels
            assert_almost_equal(
                np.exp(kernel.theta[index : index + size]), params[hyperparameter.name]
            )
            index += size
        else:
            assert_almost_equal(
                np.exp(kernel.theta[index]), params[hyperparameter.name]
            )
            index += 1
    # Test set_params()
    index = 0
    value = 10  # arbitrary value
    for hyperparameter in kernel.hyperparameters:
        if isinstance("string", type(hyperparameter.bounds)):
            if hyperparameter.bounds == "fixed":
                continue
        size = hyperparameter.n_elements
        if size > 1:  # anisotropic kernels
            kernel.set_params(**{hyperparameter.name: [value] * size})
            assert_almost_equal(
                np.exp(kernel.theta[index : index + size]), [value] * size
            )
            index += size
        else:
            kernel.set_params(**{hyperparameter.name: value})
            assert_almost_equal(np.exp(kernel.theta[index]), value)
            index += 1
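

# A short illustrative sketch of the scale convention exercised above, not
# one of the original tests: get_params()/set_params() operate on the
# original (linear) scale while theta stores log-transformed values, which is
# why the assertions compare np.exp(theta) against the parameter values.
def _sketch_params_linear_theta_log():
    k = RBF(length_scale=1.0)
    k.set_params(length_scale=10.0)
    assert_almost_equal(k.get_params()["length_scale"], 10.0)
    assert_almost_equal(k.theta, np.log([10.0]))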


@pytest.mark.parametrize("kernel", kernels)
def test_repr_kernels(kernel):
    # Smoke-test for repr in kernels.

    repr(kernel)


def test_rational_quadratic_kernel():
    kernel = RationalQuadratic(length_scale=[1.0, 1.0])
    message = (
        "RationalQuadratic kernel only supports isotropic "
        "version, please use a single "
        "scalar for length_scale"
    )
    with pytest.raises(AttributeError, match=message):
        kernel(X)