some new features
.venv/lib/python3.12/site-packages/sklearn/neighbors/_lof.py (new file, 516 lines)
@@ -0,0 +1,516 @@
# Authors: Nicolas Goix <nicolas.goix@telecom-paristech.fr>
#          Alexandre Gramfort <alexandre.gramfort@telecom-paristech.fr>
# License: BSD 3 clause

import warnings
from numbers import Real

import numpy as np

from ..base import OutlierMixin, _fit_context
from ..utils import check_array
from ..utils._param_validation import Interval, StrOptions
from ..utils.metaestimators import available_if
from ..utils.validation import check_is_fitted
from ._base import KNeighborsMixin, NeighborsBase

__all__ = ["LocalOutlierFactor"]
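
# Background sketch (informal, mirroring the docstrings below; `d` and
# `k_distance` are notation for this comment only, not module names):
# for a sample x with k-nearest neighbors N_k(x),
#     reach_dist_k(x, y) = max(d(x, y), k_distance(y))
#     lrd(x)  = 1 / mean(reach_dist_k(x, y) for y in N_k(x))
#     LOF(x)  = mean(lrd(y) for y in N_k(x)) / lrd(x)
# Inliers have a LOF close to 1; outliers have a substantially larger LOF.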


class LocalOutlierFactor(KNeighborsMixin, OutlierMixin, NeighborsBase):
    """Unsupervised Outlier Detection using the Local Outlier Factor (LOF).

    The anomaly score of each sample is called the Local Outlier Factor.
    It measures the local deviation of the density of a given sample with
    respect to its neighbors.
    It is local in that the anomaly score depends on how isolated the object
    is with respect to the surrounding neighborhood.
    More precisely, locality is given by k-nearest neighbors, whose distance
    is used to estimate the local density.
    By comparing the local density of a sample to the local densities of its
    neighbors, one can identify samples that have a substantially lower
    density than their neighbors. These are considered outliers.

    .. versionadded:: 0.19

    Parameters
    ----------
    n_neighbors : int, default=20
        Number of neighbors to use by default for :meth:`kneighbors` queries.
        If n_neighbors is larger than the number of samples provided,
        all samples will be used.

    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
        Algorithm used to compute the nearest neighbors:

        - 'ball_tree' will use :class:`BallTree`
        - 'kd_tree' will use :class:`KDTree`
        - 'brute' will use a brute-force search.
        - 'auto' will attempt to decide the most appropriate algorithm
          based on the values passed to :meth:`fit` method.

        Note: fitting on sparse input will override the setting of
        this parameter, using brute force.

    leaf_size : int, default=30
        Leaf size passed to :class:`BallTree` or :class:`KDTree`. This can
        affect the speed of the construction and query, as well as the memory
        required to store the tree. The optimal value depends on the
        nature of the problem.

    metric : str or callable, default='minkowski'
        Metric to use for distance computation. Default is "minkowski", which
        results in the standard Euclidean distance when p = 2. See the
        documentation of `scipy.spatial.distance
        <https://docs.scipy.org/doc/scipy/reference/spatial.distance.html>`_ and
        the metrics listed in
        :func:`~sklearn.metrics.pairwise.distance_metrics` for valid metric
        values.

        If metric is "precomputed", X is assumed to be a distance matrix and
        must be square during fit. X may be a :term:`sparse graph`, in which
        case only "nonzero" elements may be considered neighbors.

        If metric is a callable function, it takes two arrays representing 1D
        vectors as inputs and must return one value indicating the distance
        between those vectors. This works for Scipy's metrics, but is less
        efficient than passing the metric name as a string.

    p : float, default=2
        Parameter for the Minkowski metric from
        :func:`sklearn.metrics.pairwise_distances`. When p = 1, this
        is equivalent to using manhattan_distance (l1), and euclidean_distance
        (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.

    metric_params : dict, default=None
        Additional keyword arguments for the metric function.

    contamination : 'auto' or float, default='auto'
        The amount of contamination of the data set, i.e. the proportion
        of outliers in the data set. When fitting this is used to define the
        threshold on the scores of the samples.

        - if 'auto', the threshold is determined as in the
          original paper,
        - if a float, the contamination should be in the range (0, 0.5].

        .. versionchanged:: 0.22
           The default value of ``contamination`` changed from 0.1
           to ``'auto'``.

    novelty : bool, default=False
        By default, LocalOutlierFactor is only meant to be used for outlier
        detection (novelty=False). Set novelty to True if you want to use
        LocalOutlierFactor for novelty detection. In this case be aware that
        you should only use predict, decision_function and score_samples
        on new unseen data and not on the training set; and note that the
        results obtained this way may differ from the standard LOF results.

        .. versionadded:: 0.20

    n_jobs : int, default=None
        The number of parallel jobs to run for neighbors search.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    Attributes
    ----------
    negative_outlier_factor_ : ndarray of shape (n_samples,)
        The opposite LOF of the training samples. The higher, the more normal.
        Inliers tend to have a LOF score close to 1
        (``negative_outlier_factor_`` close to -1), while outliers tend to
        have a larger LOF score.

        The local outlier factor (LOF) of a sample captures its
        supposed 'degree of abnormality'.
        It is the average of the ratio of the local reachability density of
        a sample and those of its k-nearest neighbors.

    n_neighbors_ : int
        The actual number of neighbors used for :meth:`kneighbors` queries.

    offset_ : float
        Offset used to obtain binary labels from the raw scores.
        Observations having a negative_outlier_factor smaller than `offset_`
        are detected as abnormal.
        The offset is set to -1.5 (inliers score around -1), except when a
        contamination parameter different than "auto" is provided. In that
        case, the offset is defined in such a way that we obtain the expected
        number of outliers in training.

        .. versionadded:: 0.20

    effective_metric_ : str
        The effective metric used for the distance computation.

    effective_metric_params_ : dict
        The effective additional keyword arguments for the metric function.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    n_samples_fit_ : int
        The number of samples in the fitted data.

    See Also
    --------
    sklearn.svm.OneClassSVM : Unsupervised Outlier Detection using
        Support Vector Machine.

    References
    ----------
    .. [1] Breunig, M. M., Kriegel, H. P., Ng, R. T., & Sander, J. (2000, May).
       LOF: identifying density-based local outliers. In ACM sigmod record.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.neighbors import LocalOutlierFactor
    >>> X = [[-1.1], [0.2], [101.1], [0.3]]
    >>> clf = LocalOutlierFactor(n_neighbors=2)
    >>> clf.fit_predict(X)
    array([ 1,  1, -1,  1])
    >>> clf.negative_outlier_factor_
    array([ -0.9821...,  -1.0370..., -73.3697...,  -0.9821...])
    """

    _parameter_constraints: dict = {
        **NeighborsBase._parameter_constraints,
        "contamination": [
            StrOptions({"auto"}),
            Interval(Real, 0, 0.5, closed="right"),
        ],
        "novelty": ["boolean"],
    }
    _parameter_constraints.pop("radius")

    def __init__(
        self,
        n_neighbors=20,
        *,
        algorithm="auto",
        leaf_size=30,
        metric="minkowski",
        p=2,
        metric_params=None,
        contamination="auto",
        novelty=False,
        n_jobs=None,
    ):
        super().__init__(
            n_neighbors=n_neighbors,
            algorithm=algorithm,
            leaf_size=leaf_size,
            metric=metric,
            p=p,
            metric_params=metric_params,
            n_jobs=n_jobs,
        )
        self.contamination = contamination
        self.novelty = novelty

    def _check_novelty_fit_predict(self):
        if self.novelty:
            msg = (
                "fit_predict is not available when novelty=True. Use "
                "novelty=False if you want to predict on the training set."
            )
            raise AttributeError(msg)
        return True

    @available_if(_check_novelty_fit_predict)
    def fit_predict(self, X, y=None):
        """Fit the model to the training set X and return the labels.

        **Not available for novelty detection (when novelty is set to True).**
        Label is 1 for an inlier and -1 for an outlier according to the LOF
        score and the contamination parameter.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The query sample or samples to compute the Local Outlier Factor
            w.r.t. the training samples.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        is_inlier : ndarray of shape (n_samples,)
            Returns -1 for anomalies/outliers and 1 for inliers.
        """

        # As fit_predict would be different from fit.predict, fit_predict is
        # only available for outlier detection (novelty=False).

        return self.fit(X)._predict()

    @_fit_context(
        # LocalOutlierFactor.metric is not validated yet
        prefer_skip_nested_validation=False
    )
    def fit(self, X, y=None):
        """Fit the local outlier factor detector from the training dataset.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features) or \
                (n_samples, n_samples) if metric='precomputed'
            Training data.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : LocalOutlierFactor
            The fitted local outlier factor detector.
        """
        self._fit(X)

        n_samples = self.n_samples_fit_
        if self.n_neighbors > n_samples:
            warnings.warn(
                "n_neighbors (%s) is greater than the "
                "total number of samples (%s). n_neighbors "
                "will be set to (n_samples - 1) for estimation."
                % (self.n_neighbors, n_samples)
            )
        self.n_neighbors_ = max(1, min(self.n_neighbors, n_samples - 1))

        self._distances_fit_X_, _neighbors_indices_fit_X_ = self.kneighbors(
            n_neighbors=self.n_neighbors_
        )

        if self._fit_X.dtype == np.float32:
            self._distances_fit_X_ = self._distances_fit_X_.astype(
                self._fit_X.dtype,
                copy=False,
            )

        self._lrd = self._local_reachability_density(
            self._distances_fit_X_, _neighbors_indices_fit_X_
        )

        # Compute the LOF score over the training samples to define offset_:
        lrd_ratios_array = (
            self._lrd[_neighbors_indices_fit_X_] / self._lrd[:, np.newaxis]
        )

        self.negative_outlier_factor_ = -np.mean(lrd_ratios_array, axis=1)

        if self.contamination == "auto":
            # inliers score around -1 (the higher, the less abnormal).
            self.offset_ = -1.5
        else:
            self.offset_ = np.percentile(
                self.negative_outlier_factor_, 100.0 * self.contamination
            )

        return self
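
    # Informal illustration (not part of the class): with a float
    # contamination, offset_ is the empirical quantile of the training scores,
    # so the requested fraction of training points falls below it. For
    # example, under the assumption of contamination=0.1 and 200 training
    # samples, offset_ = np.percentile(negative_outlier_factor_, 10.0) and
    # roughly 20 training samples would be flagged as outliers by _predict().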

    def _check_novelty_predict(self):
        if not self.novelty:
            msg = (
                "predict is not available when novelty=False, use "
                "fit_predict if you want to predict on training data. Use "
                "novelty=True if you want to use LOF for novelty detection "
                "and predict on new unseen data."
            )
            raise AttributeError(msg)
        return True

    @available_if(_check_novelty_predict)
    def predict(self, X=None):
        """Predict the labels (1 inlier, -1 outlier) of X according to LOF.

        **Only available for novelty detection (when novelty is set to True).**
        This method makes it possible to generalize prediction to *new
        observations* (not in the training set). Note that the result of
        ``clf.fit(X)`` then ``clf.predict(X)`` with ``novelty=True`` may
        differ from the result obtained by ``clf.fit_predict(X)`` with
        ``novelty=False``.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The query sample or samples to compute the Local Outlier Factor
            w.r.t. the training samples.

        Returns
        -------
        is_inlier : ndarray of shape (n_samples,)
            Returns -1 for anomalies/outliers and +1 for inliers.
        """
        return self._predict(X)

    def _predict(self, X=None):
        """Predict the labels (1 inlier, -1 outlier) of X according to LOF.

        If X is None, returns the same as fit_predict(X_train).

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features), \
                default=None
            The query sample or samples to compute the Local Outlier Factor
            w.r.t. the training samples. If None, makes prediction on the
            training data without considering them as their own neighbors.

        Returns
        -------
        is_inlier : ndarray of shape (n_samples,)
            Returns -1 for anomalies/outliers and +1 for inliers.
        """
        check_is_fitted(self)

        if X is not None:
            shifted_opposite_lof_scores = self.decision_function(X)
            is_inlier = np.ones(shifted_opposite_lof_scores.shape[0], dtype=int)
            is_inlier[shifted_opposite_lof_scores < 0] = -1
        else:
            is_inlier = np.ones(self.n_samples_fit_, dtype=int)
            is_inlier[self.negative_outlier_factor_ < self.offset_] = -1

        return is_inlier

    def _check_novelty_decision_function(self):
        if not self.novelty:
            msg = (
                "decision_function is not available when novelty=False. "
                "Use novelty=True if you want to use LOF for novelty "
                "detection and compute decision_function for new unseen "
                "data. Note that the opposite LOF of the training samples "
                "is always available by considering the "
                "negative_outlier_factor_ attribute."
            )
            raise AttributeError(msg)
        return True

    @available_if(_check_novelty_decision_function)
    def decision_function(self, X):
        """Shifted opposite of the Local Outlier Factor of X.

        Bigger is better, i.e. large values correspond to inliers.

        **Only available for novelty detection (when novelty is set to True).**
        The shift offset allows a zero threshold for being an outlier.
        The argument X is supposed to contain *new data*: if X contains a
        point from training, it considers the latter in its own neighborhood.
        Also, the samples in X are not considered in the neighborhood of any
        point.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The query sample or samples to compute the Local Outlier Factor
            w.r.t. the training samples.

        Returns
        -------
        shifted_opposite_lof_scores : ndarray of shape (n_samples,)
            The shifted opposite of the Local Outlier Factor of each input
            sample. The lower, the more abnormal. Negative scores represent
            outliers, positive scores represent inliers.
        """
        return self.score_samples(X) - self.offset_
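
    # Note (informal sketch): decision_function(X) is score_samples(X) minus
    # offset_, so _predict reduces to a sign test at zero. For instance, a
    # hypothetical query with score_samples == -1.2 and offset_ == -1.5
    # yields a decision value of +0.3, hence an inlier.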

    def _check_novelty_score_samples(self):
        if not self.novelty:
            msg = (
                "score_samples is not available when novelty=False. The "
                "scores of the training samples are always available "
                "through the negative_outlier_factor_ attribute. Use "
                "novelty=True if you want to use LOF for novelty detection "
                "and compute score_samples for new unseen data."
            )
            raise AttributeError(msg)
        return True

    @available_if(_check_novelty_score_samples)
    def score_samples(self, X):
        """Opposite of the Local Outlier Factor of X.

        The opposite is taken because bigger is better, i.e. large values
        correspond to inliers.

        **Only available for novelty detection (when novelty is set to True).**
        The argument X is supposed to contain *new data*: if X contains a
        point from training, it considers the latter in its own neighborhood.
        Also, the samples in X are not considered in the neighborhood of any
        point. Because of this, the scores obtained via ``score_samples`` may
        differ from the standard LOF scores.
        The standard LOF scores for the training data are available via the
        ``negative_outlier_factor_`` attribute.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The query sample or samples to compute the Local Outlier Factor
            w.r.t. the training samples.

        Returns
        -------
        opposite_lof_scores : ndarray of shape (n_samples,)
            The opposite of the Local Outlier Factor of each input sample.
            The lower, the more abnormal.
        """
        check_is_fitted(self)
        X = check_array(X, accept_sparse="csr")

        distances_X, neighbors_indices_X = self.kneighbors(
            X, n_neighbors=self.n_neighbors_
        )

        if X.dtype == np.float32:
            distances_X = distances_X.astype(X.dtype, copy=False)

        X_lrd = self._local_reachability_density(
            distances_X,
            neighbors_indices_X,
        )

        lrd_ratios_array = self._lrd[neighbors_indices_X] / X_lrd[:, np.newaxis]

        # The opposite is returned, as bigger is better:
        return -np.mean(lrd_ratios_array, axis=1)

    def _local_reachability_density(self, distances_X, neighbors_indices):
        """The local reachability density (LRD).

        The LRD of a sample is the inverse of the average reachability
        distance of its k-nearest neighbors.

        Parameters
        ----------
        distances_X : ndarray of shape (n_queries, self.n_neighbors)
            Distances to the neighbors (in the training samples `self._fit_X`)
            of each query point, used to compute the LRD.

        neighbors_indices : ndarray of shape (n_queries, self.n_neighbors)
            Indices of the neighbors (of each query point) among the training
            samples `self._fit_X`.

        Returns
        -------
        local_reachability_density : ndarray of shape (n_queries,)
            The local reachability density of each sample.
        """
        dist_k = self._distances_fit_X_[neighbors_indices, self.n_neighbors_ - 1]
        reach_dist_array = np.maximum(distances_X, dist_k)

        # Add 1e-10 to avoid `nan` when the number of duplicates exceeds
        # n_neighbors_:
        return 1.0 / (np.mean(reach_dist_array, axis=1) + 1e-10)
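
    # Informal example of the reachability smoothing above, with assumed
    # numbers: if a query lies at distance 0.1 from a neighbor y whose
    # k-distance is 0.4, its reachability distance is max(0.1, 0.4) = 0.4,
    # which smooths out fluctuations of the raw distance for close neighbors.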

    def _more_tags(self):
        return {
            "preserves_dtype": [np.float64, np.float32],
        }
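

# Usage sketch (hedged, for illustration only; the names X_train and X_new
# are placeholders, not part of this module):
#
#     import numpy as np
#     from sklearn.neighbors import LocalOutlierFactor
#
#     rng = np.random.RandomState(42)
#     X_train = rng.normal(size=(100, 2))         # assumed inlier-only data
#     X_new = np.array([[0.1, 0.0], [8.0, 8.0]])  # one inlier, one outlier
#
#     # Novelty detection: fit on clean data, then score unseen samples.
#     clf = LocalOutlierFactor(n_neighbors=20, novelty=True).fit(X_train)
#     labels = clf.predict(X_new)              # +1 inlier, -1 outlier
#     scores = clf.decision_function(X_new)    # negative -> outlier
#
# With novelty=False (the default), only fit_predict(X) is available and it
# labels the training samples themselves, as in the class docstring example.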