reconnect moved files to git repo
This commit is contained in:
@ -0,0 +1,267 @@
|
||||
# Author: Virgile Fritsch <virgile.fritsch@inria.fr>
|
||||
#
|
||||
# License: BSD 3 clause
|
||||
|
||||
from numbers import Real
|
||||
|
||||
import numpy as np
|
||||
|
||||
from ..base import OutlierMixin, _fit_context
|
||||
from ..metrics import accuracy_score
|
||||
from ..utils._param_validation import Interval
|
||||
from ..utils.validation import check_is_fitted
|
||||
from ._robust_covariance import MinCovDet
|
||||
|
||||
|
||||
class EllipticEnvelope(OutlierMixin, MinCovDet):
|
||||
"""An object for detecting outliers in a Gaussian distributed dataset.
|
||||
|
||||
Read more in the :ref:`User Guide <outlier_detection>`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
store_precision : bool, default=True
|
||||
Specify if the estimated precision is stored.
|
||||
|
||||
assume_centered : bool, default=False
|
||||
If True, the support of robust location and covariance estimates
|
||||
is computed, and a covariance estimate is recomputed from it,
|
||||
without centering the data.
|
||||
Useful to work with data whose mean is significantly equal to
|
||||
zero but is not exactly zero.
|
||||
If False, the robust location and covariance are directly computed
|
||||
with the FastMCD algorithm without additional treatment.
|
||||
|
||||
support_fraction : float, default=None
|
||||
The proportion of points to be included in the support of the raw
|
||||
MCD estimate. If None, the minimum value of support_fraction will
|
||||
be used within the algorithm: `(n_samples + n_features + 1) / 2 * n_samples`.
|
||||
Range is (0, 1).
|
||||
|
||||
contamination : float, default=0.1
|
||||
The amount of contamination of the data set, i.e. the proportion
|
||||
of outliers in the data set. Range is (0, 0.5].
|
||||
|
||||
random_state : int, RandomState instance or None, default=None
|
||||
Determines the pseudo random number generator for shuffling
|
||||
the data. Pass an int for reproducible results across multiple function
|
||||
calls. See :term:`Glossary <random_state>`.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
location_ : ndarray of shape (n_features,)
|
||||
Estimated robust location.
|
||||
|
||||
covariance_ : ndarray of shape (n_features, n_features)
|
||||
Estimated robust covariance matrix.
|
||||
|
||||
precision_ : ndarray of shape (n_features, n_features)
|
||||
Estimated pseudo inverse matrix.
|
||||
(stored only if store_precision is True)
|
||||
|
||||
support_ : ndarray of shape (n_samples,)
|
||||
A mask of the observations that have been used to compute the
|
||||
robust estimates of location and shape.
|
||||
|
||||
offset_ : float
|
||||
Offset used to define the decision function from the raw scores.
|
||||
We have the relation: ``decision_function = score_samples - offset_``.
|
||||
The offset depends on the contamination parameter and is defined in
|
||||
such a way we obtain the expected number of outliers (samples with
|
||||
decision function < 0) in training.
|
||||
|
||||
.. versionadded:: 0.20
|
||||
|
||||
raw_location_ : ndarray of shape (n_features,)
|
||||
The raw robust estimated location before correction and re-weighting.
|
||||
|
||||
raw_covariance_ : ndarray of shape (n_features, n_features)
|
||||
The raw robust estimated covariance before correction and re-weighting.
|
||||
|
||||
raw_support_ : ndarray of shape (n_samples,)
|
||||
A mask of the observations that have been used to compute
|
||||
the raw robust estimates of location and shape, before correction
|
||||
and re-weighting.
|
||||
|
||||
dist_ : ndarray of shape (n_samples,)
|
||||
Mahalanobis distances of the training set (on which :meth:`fit` is
|
||||
called) observations.
|
||||
|
||||
n_features_in_ : int
|
||||
Number of features seen during :term:`fit`.
|
||||
|
||||
.. versionadded:: 0.24
|
||||
|
||||
feature_names_in_ : ndarray of shape (`n_features_in_`,)
|
||||
Names of features seen during :term:`fit`. Defined only when `X`
|
||||
has feature names that are all strings.
|
||||
|
||||
.. versionadded:: 1.0
|
||||
|
||||
See Also
|
||||
--------
|
||||
EmpiricalCovariance : Maximum likelihood covariance estimator.
|
||||
GraphicalLasso : Sparse inverse covariance estimation
|
||||
with an l1-penalized estimator.
|
||||
LedoitWolf : LedoitWolf Estimator.
|
||||
MinCovDet : Minimum Covariance Determinant
|
||||
(robust estimator of covariance).
|
||||
OAS : Oracle Approximating Shrinkage Estimator.
|
||||
ShrunkCovariance : Covariance estimator with shrinkage.
|
||||
|
||||
Notes
|
||||
-----
|
||||
Outlier detection from covariance estimation may break or not
|
||||
perform well in high-dimensional settings. In particular, one will
|
||||
always take care to work with ``n_samples > n_features ** 2``.
|
||||
|
||||
References
|
||||
----------
|
||||
.. [1] Rousseeuw, P.J., Van Driessen, K. "A fast algorithm for the
|
||||
minimum covariance determinant estimator" Technometrics 41(3), 212
|
||||
(1999)
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import numpy as np
|
||||
>>> from sklearn.covariance import EllipticEnvelope
|
||||
>>> true_cov = np.array([[.8, .3],
|
||||
... [.3, .4]])
|
||||
>>> X = np.random.RandomState(0).multivariate_normal(mean=[0, 0],
|
||||
... cov=true_cov,
|
||||
... size=500)
|
||||
>>> cov = EllipticEnvelope(random_state=0).fit(X)
|
||||
>>> # predict returns 1 for an inlier and -1 for an outlier
|
||||
>>> cov.predict([[0, 0],
|
||||
... [3, 3]])
|
||||
array([ 1, -1])
|
||||
>>> cov.covariance_
|
||||
array([[0.7411..., 0.2535...],
|
||||
[0.2535..., 0.3053...]])
|
||||
>>> cov.location_
|
||||
array([0.0813... , 0.0427...])
|
||||
"""
|
||||
|
||||
_parameter_constraints: dict = {
|
||||
**MinCovDet._parameter_constraints,
|
||||
"contamination": [Interval(Real, 0, 0.5, closed="right")],
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
store_precision=True,
|
||||
assume_centered=False,
|
||||
support_fraction=None,
|
||||
contamination=0.1,
|
||||
random_state=None,
|
||||
):
|
||||
super().__init__(
|
||||
store_precision=store_precision,
|
||||
assume_centered=assume_centered,
|
||||
support_fraction=support_fraction,
|
||||
random_state=random_state,
|
||||
)
|
||||
self.contamination = contamination
|
||||
|
||||
@_fit_context(prefer_skip_nested_validation=True)
|
||||
def fit(self, X, y=None):
|
||||
"""Fit the EllipticEnvelope model.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like of shape (n_samples, n_features)
|
||||
Training data.
|
||||
|
||||
y : Ignored
|
||||
Not used, present for API consistency by convention.
|
||||
|
||||
Returns
|
||||
-------
|
||||
self : object
|
||||
Returns the instance itself.
|
||||
"""
|
||||
super().fit(X)
|
||||
self.offset_ = np.percentile(-self.dist_, 100.0 * self.contamination)
|
||||
return self
|
||||
|
||||
def decision_function(self, X):
|
||||
"""Compute the decision function of the given observations.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like of shape (n_samples, n_features)
|
||||
The data matrix.
|
||||
|
||||
Returns
|
||||
-------
|
||||
decision : ndarray of shape (n_samples,)
|
||||
Decision function of the samples.
|
||||
It is equal to the shifted Mahalanobis distances.
|
||||
The threshold for being an outlier is 0, which ensures a
|
||||
compatibility with other outlier detection algorithms.
|
||||
"""
|
||||
check_is_fitted(self)
|
||||
negative_mahal_dist = self.score_samples(X)
|
||||
return negative_mahal_dist - self.offset_
|
||||
|
||||
def score_samples(self, X):
|
||||
"""Compute the negative Mahalanobis distances.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like of shape (n_samples, n_features)
|
||||
The data matrix.
|
||||
|
||||
Returns
|
||||
-------
|
||||
negative_mahal_distances : array-like of shape (n_samples,)
|
||||
Opposite of the Mahalanobis distances.
|
||||
"""
|
||||
check_is_fitted(self)
|
||||
return -self.mahalanobis(X)
|
||||
|
||||
def predict(self, X):
|
||||
"""
|
||||
Predict labels (1 inlier, -1 outlier) of X according to fitted model.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like of shape (n_samples, n_features)
|
||||
The data matrix.
|
||||
|
||||
Returns
|
||||
-------
|
||||
is_inlier : ndarray of shape (n_samples,)
|
||||
Returns -1 for anomalies/outliers and +1 for inliers.
|
||||
"""
|
||||
values = self.decision_function(X)
|
||||
is_inlier = np.full(values.shape[0], -1, dtype=int)
|
||||
is_inlier[values >= 0] = 1
|
||||
|
||||
return is_inlier
|
||||
|
||||
def score(self, X, y, sample_weight=None):
|
||||
"""Return the mean accuracy on the given test data and labels.
|
||||
|
||||
In multi-label classification, this is the subset accuracy
|
||||
which is a harsh metric since you require for each sample that
|
||||
each label set be correctly predicted.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like of shape (n_samples, n_features)
|
||||
Test samples.
|
||||
|
||||
y : array-like of shape (n_samples,) or (n_samples, n_outputs)
|
||||
True labels for X.
|
||||
|
||||
sample_weight : array-like of shape (n_samples,), default=None
|
||||
Sample weights.
|
||||
|
||||
Returns
|
||||
-------
|
||||
score : float
|
||||
Mean accuracy of self.predict(X) w.r.t. y.
|
||||
"""
|
||||
return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
|
||||
Reference in New Issue
Block a user