reconnect moved files to git repo
@@ -0,0 +1,364 @@
"""
Maximum likelihood covariance estimator.

"""

# Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>
#         Gael Varoquaux <gael.varoquaux@normalesup.org>
#         Virgile Fritsch <virgile.fritsch@inria.fr>
#
# License: BSD 3 clause

import warnings
import numpy as np
from scipy import linalg

from .. import config_context
from ..base import BaseEstimator, _fit_context
from ..metrics.pairwise import pairwise_distances
from ..utils import check_array
from ..utils._param_validation import validate_params
from ..utils.extmath import fast_logdet


@validate_params(
    {
        "emp_cov": [np.ndarray],
        "precision": [np.ndarray],
    },
    prefer_skip_nested_validation=True,
)
def log_likelihood(emp_cov, precision):
    """Compute the sample mean of the log-likelihood under a covariance model.

    Computes the empirical expected log-likelihood, allowing for universal
    comparison (beyond this software package), and accounts for normalization
    terms and scaling.

    Parameters
    ----------
    emp_cov : ndarray of shape (n_features, n_features)
        Maximum Likelihood Estimator of covariance.

    precision : ndarray of shape (n_features, n_features)
        The precision matrix of the covariance model to be tested.

    Returns
    -------
    log_likelihood_ : float
        Sample mean of the log-likelihood.
    """
    p = precision.shape[0]
    log_likelihood_ = -np.sum(emp_cov * precision) + fast_logdet(precision)
    log_likelihood_ -= p * np.log(2 * np.pi)
    log_likelihood_ /= 2.0
    return log_likelihood_
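
# A minimal usage sketch (all names below are local to this comment):
# `log_likelihood` equals the sample mean of the exact Gaussian log-density
# when the precision is the plug-in inverse of `emp_cov` and the data are
# treated as centered.
#
#   import numpy as np
#   from scipy import linalg
#   from scipy.stats import multivariate_normal
#
#   rng = np.random.RandomState(0)
#   X = rng.multivariate_normal([0.0, 0.0], [[0.8, 0.3], [0.3, 0.4]], size=300)
#   S = X.T @ X / len(X)  # empirical covariance, data treated as centered
#   ref = multivariate_normal(mean=[0, 0], cov=S).logpdf(X).mean()
#   np.allclose(log_likelihood(S, linalg.pinvh(S)), ref)  # -> True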


@validate_params(
    {
        "X": ["array-like"],
        "assume_centered": ["boolean"],
    },
    prefer_skip_nested_validation=True,
)
def empirical_covariance(X, *, assume_centered=False):
    """Compute the Maximum likelihood covariance estimator.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Data from which to compute the covariance estimate.

    assume_centered : bool, default=False
        If `True`, data will not be centered before computation.
        Useful when working with data whose mean is almost, but not exactly
        zero.
        If `False`, data will be centered before computation.

    Returns
    -------
    covariance : ndarray of shape (n_features, n_features)
        Empirical covariance (Maximum Likelihood Estimator).

    Examples
    --------
    >>> from sklearn.covariance import empirical_covariance
    >>> X = [[1,1,1],[1,1,1],[1,1,1],
    ...      [0,0,0],[0,0,0],[0,0,0]]
    >>> empirical_covariance(X)
    array([[0.25, 0.25, 0.25],
           [0.25, 0.25, 0.25],
           [0.25, 0.25, 0.25]])
    """
    X = check_array(X, ensure_2d=False, force_all_finite=False)

    if X.ndim == 1:
        X = np.reshape(X, (1, -1))

    if X.shape[0] == 1:
        warnings.warn(
            "Only one sample available. You may want to reshape your data array"
        )

    if assume_centered:
        covariance = np.dot(X.T, X) / X.shape[0]
    else:
        covariance = np.cov(X.T, bias=1)

    if covariance.ndim == 0:
        covariance = np.array([[covariance]])
    return covariance
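
# A minimal equivalence sketch (names local to this comment): without
# `assume_centered`, the result matches NumPy's biased covariance, i.e.
# normalized by n_samples rather than n_samples - 1.
#
#   import numpy as np
#   X = np.array([[1.0, 2.0], [3.0, 1.0], [2.0, 4.0]])
#   np.allclose(empirical_covariance(X), np.cov(X.T, bias=True))  # -> True
#   np.allclose(empirical_covariance(X, assume_centered=True),
#               X.T @ X / len(X))                                 # -> True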


class EmpiricalCovariance(BaseEstimator):
    """Maximum likelihood covariance estimator.

    Read more in the :ref:`User Guide <covariance>`.

    Parameters
    ----------
    store_precision : bool, default=True
        Specifies if the estimated precision is stored.

    assume_centered : bool, default=False
        If True, data are not centered before computation.
        Useful when working with data whose mean is almost, but not exactly
        zero.
        If False (default), data are centered before computation.

    Attributes
    ----------
    location_ : ndarray of shape (n_features,)
        Estimated location, i.e. the estimated mean.

    covariance_ : ndarray of shape (n_features, n_features)
        Estimated covariance matrix.

    precision_ : ndarray of shape (n_features, n_features)
        Estimated pseudo-inverse matrix.
        (stored only if store_precision is True)

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    EllipticEnvelope : An object for detecting outliers in
        a Gaussian distributed dataset.
    GraphicalLasso : Sparse inverse covariance estimation
        with an l1-penalized estimator.
    LedoitWolf : LedoitWolf Estimator.
    MinCovDet : Minimum Covariance Determinant
        (robust estimator of covariance).
    OAS : Oracle Approximating Shrinkage Estimator.
    ShrunkCovariance : Covariance estimator with shrinkage.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.covariance import EmpiricalCovariance
    >>> from sklearn.datasets import make_gaussian_quantiles
    >>> real_cov = np.array([[.8, .3],
    ...                      [.3, .4]])
    >>> rng = np.random.RandomState(0)
    >>> X = rng.multivariate_normal(mean=[0, 0],
    ...                             cov=real_cov,
    ...                             size=500)
    >>> cov = EmpiricalCovariance().fit(X)
    >>> cov.covariance_
    array([[0.7569..., 0.2818...],
           [0.2818..., 0.3928...]])
    >>> cov.location_
    array([0.0622..., 0.0193...])
    """

    _parameter_constraints: dict = {
        "store_precision": ["boolean"],
        "assume_centered": ["boolean"],
    }

    def __init__(self, *, store_precision=True, assume_centered=False):
        self.store_precision = store_precision
        self.assume_centered = assume_centered

    def _set_covariance(self, covariance):
        """Save the covariance and precision estimates.

        Storage is done according to `self.store_precision`.
        The precision is stored only if invertible.

        Parameters
        ----------
        covariance : array-like of shape (n_features, n_features)
            Estimated covariance matrix to be stored, and from which precision
            is computed.
        """
        covariance = check_array(covariance)
        # set covariance
        self.covariance_ = covariance
        # set precision
        if self.store_precision:
            self.precision_ = linalg.pinvh(covariance, check_finite=False)
        else:
            self.precision_ = None

    def get_precision(self):
        """Getter for the precision matrix.

        Returns
        -------
        precision_ : array-like of shape (n_features, n_features)
            The precision matrix associated to the current covariance object.
        """
        if self.store_precision:
            precision = self.precision_
        else:
            precision = linalg.pinvh(self.covariance_, check_finite=False)
        return precision
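
    # A minimal sketch (names local to this comment): with
    # `store_precision=False`, the precision is recomputed on demand via
    # `scipy.linalg.pinvh`, so both configurations agree.
    #
    #   import numpy as np
    #   rng = np.random.RandomState(0)
    #   X = rng.randn(100, 3)
    #   p1 = EmpiricalCovariance(store_precision=True).fit(X).get_precision()
    #   p2 = EmpiricalCovariance(store_precision=False).fit(X).get_precision()
    #   np.allclose(p1, p2)  # -> True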

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """Fit the maximum likelihood covariance estimator to X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where `n_samples` is the number of samples and
            `n_features` is the number of features.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        X = self._validate_data(X)
        if self.assume_centered:
            self.location_ = np.zeros(X.shape[1])
        else:
            self.location_ = X.mean(0)
        covariance = empirical_covariance(X, assume_centered=self.assume_centered)
        self._set_covariance(covariance)

        return self
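
    # A minimal sketch (names local to this comment): `fit` stores the sample
    # mean in `location_` and delegates to `empirical_covariance`, so:
    #
    #   import numpy as np
    #   rng = np.random.RandomState(0)
    #   X = rng.randn(50, 2)
    #   cov = EmpiricalCovariance().fit(X)
    #   np.allclose(cov.location_, X.mean(axis=0))             # -> True
    #   np.allclose(cov.covariance_, empirical_covariance(X))  # -> True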

    def score(self, X_test, y=None):
        """Compute the log-likelihood of `X_test` under the estimated Gaussian model.

        The Gaussian model is defined by its mean and covariance matrix which are
        represented respectively by `self.location_` and `self.covariance_`.

        Parameters
        ----------
        X_test : array-like of shape (n_samples, n_features)
            Test data of which we compute the likelihood, where `n_samples` is
            the number of samples and `n_features` is the number of features.
            `X_test` is assumed to be drawn from the same distribution as
            the data used in fit (including centering).

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        res : float
            The log-likelihood of `X_test` with `self.location_` and `self.covariance_`
            as estimators of the Gaussian model mean and covariance matrix respectively.
        """
        X_test = self._validate_data(X_test, reset=False)
        # compute empirical covariance of the test set
        test_cov = empirical_covariance(X_test - self.location_, assume_centered=True)
        # compute log likelihood
        res = log_likelihood(test_cov, self.get_precision())

        return res
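
    # A minimal sketch (names local to this comment): `score` equals the
    # sample mean of the fitted Gaussian's log-density over the test data.
    #
    #   import numpy as np
    #   from scipy.stats import multivariate_normal
    #   rng = np.random.RandomState(0)
    #   X, X_test = rng.randn(200, 2), rng.randn(50, 2)
    #   cov = EmpiricalCovariance().fit(X)
    #   ref = multivariate_normal(cov.location_, cov.covariance_).logpdf(X_test)
    #   np.allclose(cov.score(X_test), ref.mean())  # -> True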

    def error_norm(self, comp_cov, norm="frobenius", scaling=True, squared=True):
        """Compute the Mean Squared Error between two covariance estimators.

        Parameters
        ----------
        comp_cov : array-like of shape (n_features, n_features)
            The covariance to compare with.

        norm : {"frobenius", "spectral"}, default="frobenius"
            The type of norm used to compute the error. Available error types:
            - 'frobenius' (default): sqrt(tr(A^t.A))
            - 'spectral': sqrt(max(eigenvalues(A^t.A)))
            where A is the error ``(comp_cov - self.covariance_)``.

        scaling : bool, default=True
            If True (default), the squared error norm is divided by n_features.
            If False, the squared error norm is not rescaled.

        squared : bool, default=True
            Whether to compute the squared error norm or the error norm.
            If True (default), the squared error norm is returned.
            If False, the error norm is returned.

        Returns
        -------
        result : float
            The Mean Squared Error (in the sense of the Frobenius norm) between
            `self` and `comp_cov` covariance estimators.
        """
        # compute the error
        error = comp_cov - self.covariance_
        # compute the error norm
        if norm == "frobenius":
            squared_norm = np.sum(error**2)
        elif norm == "spectral":
            squared_norm = np.amax(linalg.svdvals(np.dot(error.T, error)))
        else:
            raise NotImplementedError(
                "Only spectral and frobenius norms are implemented"
            )
        # optionally scale the error norm
        if scaling:
            squared_norm = squared_norm / error.shape[0]
        # finally get either the squared norm or the norm
        if squared:
            result = squared_norm
        else:
            result = np.sqrt(squared_norm)

        return result
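
    # A minimal sketch (names local to this comment): with the defaults
    # (Frobenius norm, scaled, squared), the result is
    # sum((comp_cov - covariance_) ** 2) / n_features.
    #
    #   import numpy as np
    #   rng = np.random.RandomState(0)
    #   X = rng.randn(100, 3)
    #   cov = EmpiricalCovariance().fit(X)
    #   target = np.eye(3)
    #   manual = np.sum((target - cov.covariance_) ** 2) / 3
    #   np.allclose(cov.error_norm(target), manual)  # -> True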

    def mahalanobis(self, X):
        """Compute the squared Mahalanobis distances of given observations.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The observations, the Mahalanobis distances of which we compute.
            Observations are assumed to be drawn from the same distribution
            as the data used in fit.

        Returns
        -------
        dist : ndarray of shape (n_samples,)
            Squared Mahalanobis distances of the observations.
        """
        X = self._validate_data(X, reset=False)

        precision = self.get_precision()
        with config_context(assume_finite=True):
            # compute mahalanobis distances
            dist = pairwise_distances(
                X, self.location_[np.newaxis, :], metric="mahalanobis", VI=precision
            )

        return np.reshape(dist, (len(X),)) ** 2
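
    # A minimal sketch (names local to this comment): the returned values
    # match the quadratic form (x - mu)^T VI (x - mu) computed directly.
    #
    #   import numpy as np
    #   rng = np.random.RandomState(0)
    #   X = rng.randn(100, 3)
    #   cov = EmpiricalCovariance().fit(X)
    #   diff = X - cov.location_
    #   manual = np.einsum("ij,jk,ik->i", diff, cov.get_precision(), diff)
    #   np.allclose(cov.mahalanobis(X), manual)  # -> True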