reconnect moved files to git repo
This commit is contained in:
801
venv/lib/python3.11/site-packages/sklearn/cluster/_spectral.py
Normal file
801
venv/lib/python3.11/site-packages/sklearn/cluster/_spectral.py
Normal file
@ -0,0 +1,801 @@
|
||||
"""Algorithms for spectral clustering"""
|
||||
|
||||
# Author: Gael Varoquaux <gael.varoquaux@normalesup.org>
|
||||
# Brian Cheung
|
||||
# Wei LI <kuantkid@gmail.com>
|
||||
# Andrew Knyazev <Andrew.Knyazev@ucdenver.edu>
|
||||
# License: BSD 3 clause
|
||||
|
||||
import warnings
|
||||
from numbers import Integral, Real
|
||||
|
||||
import numpy as np
|
||||
from scipy.linalg import LinAlgError, qr, svd
|
||||
from scipy.sparse import csc_matrix
|
||||
|
||||
from ..base import BaseEstimator, ClusterMixin, _fit_context
|
||||
from ..manifold._spectral_embedding import _spectral_embedding
|
||||
from ..metrics.pairwise import KERNEL_PARAMS, pairwise_kernels
|
||||
from ..neighbors import NearestNeighbors, kneighbors_graph
|
||||
from ..utils import as_float_array, check_random_state
|
||||
from ..utils._param_validation import Interval, StrOptions, validate_params
|
||||
from ._kmeans import k_means
|
||||
|
||||
|
||||
def cluster_qr(vectors):
    """Find the discrete partition closest to the eigenvector embedding.

    This implementation was proposed in [1]_.

    .. versionadded:: 1.1

    Parameters
    ----------
    vectors : array-like, shape: (n_samples, n_clusters)
        The embedding space of the samples.

    Returns
    -------
    labels : array of integers, shape: n_samples
        The cluster labels of vectors.

    References
    ----------
    .. [1] :doi:`Simple, direct, and efficient multi-way spectral clustering, 2019
        Anil Damle, Victor Minden, Lexing Ying
        <10.1093/imaiai/iay008>`

    """
    # The documented contract is "array-like": coerce before reading
    # ``.shape`` so that nested lists/tuples work as well as ndarrays.
    vectors = np.asarray(vectors)

    k = vectors.shape[1]
    # Column-pivoted QR of the transposed embedding; only the pivot order is
    # needed, and its first k entries select the rows used below.
    _, _, piv = qr(vectors.T, pivoting=True)
    ut, _, v = svd(vectors[piv[:k], :].T)
    # Project every sample with ``ut @ conj(v)`` and label it by the
    # coordinate of largest magnitude.
    vectors = abs(np.dot(vectors, np.dot(ut, v.conj())))
    return vectors.argmax(axis=1)
|
||||
|
||||
|
||||
def discretize(
    vectors, *, copy=True, max_svd_restarts=30, n_iter_max=20, random_state=None
):
    """Search for a partition matrix which is closest to the eigenvector embedding.

    This implementation was proposed in [1]_.

    Parameters
    ----------
    vectors : array-like of shape (n_samples, n_clusters)
        The embedding space of the samples.

    copy : bool, default=True
        Whether to copy vectors, or perform in-place normalization.

    max_svd_restarts : int, default=30
        Maximum number of attempts to restart SVD if convergence fails.

    n_iter_max : int, default=20
        Maximum number of iterations to attempt in rotation and partition
        matrix search if machine precision convergence is not reached.

    random_state : int, RandomState instance, default=None
        Determines random number generation for rotation matrix initialization.
        Use an int to make the randomness deterministic.
        See :term:`Glossary <random_state>`.

    Returns
    -------
    labels : array of integers, shape: n_samples
        The labels of the clusters.

    Raises
    ------
    LinAlgError
        If the SVD failed on every one of the `max_svd_restarts` attempts
        (or no attempt was made), so that no partition was found.

    References
    ----------

    .. [1] `Multiclass spectral clustering, 2003
           Stella X. Yu, Jianbo Shi
           <https://people.eecs.berkeley.edu/~jordan/courses/281B-spring04/readings/yu-shi.pdf>`_

    Notes
    -----

    The eigenvector embedding is used to iteratively search for the
    closest discrete partition.  First, the eigenvector embedding is
    normalized to the space of partition matrices. An optimal discrete
    partition matrix closest to this normalized embedding multiplied by
    an initial rotation is calculated.  Fixing this discrete partition
    matrix, an optimal rotation matrix is calculated.  These two
    calculations are performed until convergence.  The discrete partition
    matrix is returned as the clustering solution.  Used in spectral
    clustering, this method tends to be faster and more robust to random
    initialization than k-means.

    """

    random_state = check_random_state(random_state)

    vectors = as_float_array(vectors, copy=copy)

    eps = np.finfo(float).eps
    n_samples, n_components = vectors.shape

    # Normalize the eigenvectors to an equal length of a vector of ones.
    # Reorient the eigenvectors to point in the negative direction with respect
    # to the first element.  This may have to do with constraining the
    # eigenvectors to lie in a specific quadrant to make the discretization
    # search easier.
    norm_ones = np.sqrt(n_samples)
    for i in range(vectors.shape[1]):
        vectors[:, i] = (vectors[:, i] / np.linalg.norm(vectors[:, i])) * norm_ones
        if vectors[0, i] != 0:
            vectors[:, i] = -1 * vectors[:, i] * np.sign(vectors[0, i])

    # Normalize the rows of the eigenvectors.  Samples should lie on the unit
    # hypersphere centered at the origin.  This transforms the samples in the
    # embedding space to the space of partition matrices.
    vectors = vectors / np.sqrt((vectors**2).sum(axis=1))[:, np.newaxis]

    svd_restarts = 0
    has_converged = False

    # If there is an exception we try to randomize and rerun SVD again
    # do this max_svd_restarts times.
    while (svd_restarts < max_svd_restarts) and not has_converged:
        # Initialize first column of rotation matrix with a row of the
        # eigenvectors
        rotation = np.zeros((n_components, n_components))
        rotation[:, 0] = vectors[random_state.randint(n_samples), :].T

        # To initialize the rest of the rotation matrix, find the rows
        # of the eigenvectors that are as orthogonal to each other as
        # possible
        c = np.zeros(n_samples)
        for j in range(1, n_components):
            # Accumulate c to ensure row is as orthogonal as possible to
            # previous picks as well as current one
            c += np.abs(np.dot(vectors, rotation[:, j - 1]))
            rotation[:, j] = vectors[c.argmin(), :].T

        last_objective_value = 0.0
        n_iter = 0

        while not has_converged:
            n_iter += 1

            t_discrete = np.dot(vectors, rotation)

            # One-hot (n_samples, n_components) indicator of the current
            # assignment, stored sparse.
            labels = t_discrete.argmax(axis=1)
            vectors_discrete = csc_matrix(
                (np.ones(len(labels)), (np.arange(0, n_samples), labels)),
                shape=(n_samples, n_components),
            )

            # ``@`` is an explicit matrix product; ``*`` only means that for
            # the legacy sparse *matrix* classes.
            t_svd = vectors_discrete.T @ vectors

            try:
                U, S, Vh = np.linalg.svd(t_svd)
            except LinAlgError:
                # Leave the inner loop so the outer loop re-draws a random
                # initial rotation and retries.
                svd_restarts += 1
                print("SVD did not converge, randomizing and trying again")
                break

            ncut_value = 2.0 * (n_samples - S.sum())
            if (abs(ncut_value - last_objective_value) < eps) or (n_iter > n_iter_max):
                has_converged = True
            else:
                # otherwise calculate rotation and continue
                last_objective_value = ncut_value
                rotation = np.dot(Vh.T, U.T)

    if not has_converged:
        raise LinAlgError("SVD did not converge")
    return labels
|
||||
|
||||
|
||||
@validate_params(
    {"affinity": ["array-like", "sparse matrix"]},
    prefer_skip_nested_validation=False,
)
def spectral_clustering(
    affinity,
    *,
    n_clusters=8,
    n_components=None,
    eigen_solver=None,
    random_state=None,
    n_init=10,
    eigen_tol="auto",
    assign_labels="kmeans",
    verbose=False,
):
    """Apply clustering to a projection of the normalized Laplacian.

    In practice Spectral Clustering is very useful when the structure of
    the individual clusters is highly non-convex or more generally when
    a measure of the center and spread of the cluster is not a suitable
    description of the complete cluster. For instance, when clusters are
    nested circles on the 2D plane.

    If affinity is the adjacency matrix of a graph, this method can be
    used to find normalized graph cuts [1]_, [2]_.

    Read more in the :ref:`User Guide <spectral_clustering>`.

    Parameters
    ----------
    affinity : {array-like, sparse matrix} of shape (n_samples, n_samples)
        The affinity matrix describing the relationship of the samples to
        embed. **Must be symmetric**.

        Possible examples:
          - adjacency matrix of a graph,
          - heat kernel of the pairwise distance matrix of the samples,
          - symmetric k-nearest neighbours connectivity matrix of the samples.

    n_clusters : int, default=8
        Number of clusters to extract.

    n_components : int, default=None
        Number of eigenvectors to use for the spectral embedding. If None,
        defaults to `n_clusters`.

    eigen_solver : {None, 'arpack', 'lobpcg', 'amg'}, default=None
        The eigenvalue decomposition method. If None then ``'arpack'`` is used.
        See [4]_ for more details regarding ``'lobpcg'``.
        Eigensolver ``'amg'`` runs ``'lobpcg'`` with optional
        Algebraic MultiGrid preconditioning and requires pyamg to be installed.
        It can be faster on very large sparse problems [6]_ and [7]_.

    random_state : int, RandomState instance, default=None
        A pseudo random number generator used for the initialization
        of the lobpcg eigenvectors decomposition when `eigen_solver ==
        'amg'`, and for the K-Means initialization. Use an int to make
        the results deterministic across calls (See
        :term:`Glossary <random_state>`).

        .. note::
            When using `eigen_solver == 'amg'`,
            it is necessary to also fix the global numpy seed with
            `np.random.seed(int)` to get deterministic results. See
            https://github.com/pyamg/pyamg/issues/139 for further
            information.

    n_init : int, default=10
        Number of times the k-means algorithm will be run with different
        centroid seeds. The final results will be the best output of n_init
        consecutive runs in terms of inertia. Only used if
        ``assign_labels='kmeans'``.

    eigen_tol : float, default="auto"
        Stopping criterion for eigendecomposition of the Laplacian matrix.
        If `eigen_tol="auto"` then the passed tolerance will depend on the
        `eigen_solver`:

        - If `eigen_solver="arpack"`, then `eigen_tol=0.0`;
        - If `eigen_solver="lobpcg"` or `eigen_solver="amg"`, then
          `eigen_tol=None` which configures the underlying `lobpcg` solver to
          automatically resolve the value according to their heuristics. See,
          :func:`scipy.sparse.linalg.lobpcg` for details.

        Note that when using `eigen_solver="lobpcg"` or `eigen_solver="amg"`
        values of `tol<1e-5` may lead to convergence issues and should be
        avoided.

        .. versionadded:: 1.2
           Added 'auto' option.

    assign_labels : {'kmeans', 'discretize', 'cluster_qr'}, default='kmeans'
        The strategy to use to assign labels in the embedding
        space.  There are three ways to assign labels after the Laplacian
        embedding.  k-means can be applied and is a popular choice. But it can
        also be sensitive to initialization. Discretization is another
        approach which is less sensitive to random initialization [3]_.
        The cluster_qr method [5]_ directly extracts clusters from eigenvectors
        in spectral clustering. In contrast to k-means and discretization, cluster_qr
        has no tuning parameters and is not an iterative method, yet may outperform
        k-means and discretization in terms of both quality and speed.

        .. versionchanged:: 1.1
           Added new labeling method 'cluster_qr'.

    verbose : bool, default=False
        Verbosity mode.

        .. versionadded:: 0.24

    Returns
    -------
    labels : array of integers, shape: n_samples
        The labels of the clusters.

    Notes
    -----
    The graph should contain only one connected component, otherwise
    the results make little sense.

    This algorithm solves the normalized cut for `k=2`: it is a
    normalized spectral clustering.

    References
    ----------

    .. [1] :doi:`Normalized cuts and image segmentation, 2000
           Jianbo Shi, Jitendra Malik
           <10.1109/34.868688>`

    .. [2] :doi:`A Tutorial on Spectral Clustering, 2007
           Ulrike von Luxburg
           <10.1007/s11222-007-9033-z>`

    .. [3] `Multiclass spectral clustering, 2003
           Stella X. Yu, Jianbo Shi
           <https://people.eecs.berkeley.edu/~jordan/courses/281B-spring04/readings/yu-shi.pdf>`_

    .. [4] :doi:`Toward the Optimal Preconditioned Eigensolver:
           Locally Optimal Block Preconditioned Conjugate Gradient Method, 2001
           A. V. Knyazev
           SIAM Journal on Scientific Computing 23, no. 2, pp. 517-541.
           <10.1137/S1064827500366124>`

    .. [5] :doi:`Simple, direct, and efficient multi-way spectral clustering, 2019
           Anil Damle, Victor Minden, Lexing Ying
           <10.1093/imaiai/iay008>`

    .. [6] :doi:`Multiscale Spectral Image Segmentation Multiscale preconditioning
           for computing eigenvalues of graph Laplacians in image segmentation, 2006
           Andrew Knyazev
           <10.13140/RG.2.2.35280.02565>`

    .. [7] :doi:`Preconditioned spectral clustering for stochastic block partition
           streaming graph challenge (Preliminary version at arXiv.)
           David Zhuzhunashvili, Andrew Knyazev
           <10.1109/HPEC.2017.8091045>`

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.metrics.pairwise import pairwise_kernels
    >>> from sklearn.cluster import spectral_clustering
    >>> X = np.array([[1, 1], [2, 1], [1, 0],
    ...               [4, 7], [3, 5], [3, 6]])
    >>> affinity = pairwise_kernels(X, metric='rbf')
    >>> spectral_clustering(
    ...     affinity=affinity, n_clusters=2, assign_labels="discretize", random_state=0
    ... )
    array([1, 1, 1, 0, 0, 0])
    """

    # Thin functional wrapper: the input is always handed to the estimator as
    # a precomputed affinity matrix, and only the fitted labels are returned.
    clusterer = SpectralClustering(
        n_clusters=n_clusters,
        n_components=n_components,
        eigen_solver=eigen_solver,
        random_state=random_state,
        n_init=n_init,
        affinity="precomputed",
        eigen_tol=eigen_tol,
        assign_labels=assign_labels,
        verbose=verbose,
    ).fit(affinity)

    return clusterer.labels_
|
||||
|
||||
|
||||
class SpectralClustering(ClusterMixin, BaseEstimator):
    """Apply clustering to a projection of the normalized Laplacian.

    In practice Spectral Clustering is very useful when the structure of
    the individual clusters is highly non-convex, or more generally when
    a measure of the center and spread of the cluster is not a suitable
    description of the complete cluster, such as when clusters are
    nested circles on the 2D plane.

    If the affinity matrix is the adjacency matrix of a graph, this method
    can be used to find normalized graph cuts [1]_, [2]_.

    When calling ``fit``, an affinity matrix is constructed using either
    a kernel function such as the Gaussian (aka RBF) kernel with Euclidean
    distance ``d(X, X)``::

            np.exp(-gamma * d(X,X) ** 2)

    or a k-nearest neighbors connectivity matrix.

    Alternatively, a user-provided affinity matrix can be specified by
    setting ``affinity='precomputed'``.

    Read more in the :ref:`User Guide <spectral_clustering>`.

    Parameters
    ----------
    n_clusters : int, default=8
        The dimension of the projection subspace.

    eigen_solver : {'arpack', 'lobpcg', 'amg'}, default=None
        The eigenvalue decomposition strategy to use. AMG requires pyamg
        to be installed. It can be faster on very large, sparse problems,
        but may also lead to instabilities. If None, then ``'arpack'`` is
        used. See [4]_ for more details regarding `'lobpcg'`.

    n_components : int, default=None
        Number of eigenvectors to use for the spectral embedding. If None,
        defaults to `n_clusters`.

    random_state : int, RandomState instance, default=None
        A pseudo random number generator used for the initialization
        of the lobpcg eigenvectors decomposition when `eigen_solver ==
        'amg'`, and for the K-Means initialization. Use an int to make
        the results deterministic across calls (See
        :term:`Glossary <random_state>`).

        .. note::
            When using `eigen_solver == 'amg'`,
            it is necessary to also fix the global numpy seed with
            `np.random.seed(int)` to get deterministic results. See
            https://github.com/pyamg/pyamg/issues/139 for further
            information.

    n_init : int, default=10
        Number of times the k-means algorithm will be run with different
        centroid seeds. The final results will be the best output of n_init
        consecutive runs in terms of inertia. Only used if
        ``assign_labels='kmeans'``.

    gamma : float, default=1.0
        Kernel coefficient for rbf, poly, sigmoid, laplacian and chi2 kernels.
        Ignored for ``affinity='nearest_neighbors'``, ``affinity='precomputed'``
        or ``affinity='precomputed_nearest_neighbors'``.

    affinity : str or callable, default='rbf'
        How to construct the affinity matrix.
         - 'nearest_neighbors': construct the affinity matrix by computing a
           graph of nearest neighbors.
         - 'rbf': construct the affinity matrix using a radial basis function
           (RBF) kernel.
         - 'precomputed': interpret ``X`` as a precomputed affinity matrix,
           where larger values indicate greater similarity between instances.
         - 'precomputed_nearest_neighbors': interpret ``X`` as a sparse graph
           of precomputed distances, and construct a binary affinity matrix
           from the ``n_neighbors`` nearest neighbors of each instance.
         - one of the kernels supported by
           :func:`~sklearn.metrics.pairwise.pairwise_kernels`.

        Only kernels that produce similarity scores (non-negative values that
        increase with similarity) should be used. This property is not checked
        by the clustering algorithm.

    n_neighbors : int, default=10
        Number of neighbors to use when constructing the affinity matrix using
        the nearest neighbors method. Ignored for ``affinity='rbf'``.

    eigen_tol : float, default="auto"
        Stopping criterion for eigen decomposition of the Laplacian matrix.
        If `eigen_tol="auto"` then the passed tolerance will depend on the
        `eigen_solver`:

        - If `eigen_solver="arpack"`, then `eigen_tol=0.0`;
        - If `eigen_solver="lobpcg"` or `eigen_solver="amg"`, then
          `eigen_tol=None` which configures the underlying `lobpcg` solver to
          automatically resolve the value according to their heuristics. See,
          :func:`scipy.sparse.linalg.lobpcg` for details.

        Note that when using `eigen_solver="lobpcg"` or `eigen_solver="amg"`
        values of `tol<1e-5` may lead to convergence issues and should be
        avoided.

        .. versionadded:: 1.2
           Added 'auto' option.

    assign_labels : {'kmeans', 'discretize', 'cluster_qr'}, default='kmeans'
        The strategy for assigning labels in the embedding space. There are
        three ways to assign labels after the Laplacian embedding. k-means is a
        popular choice, but it can be sensitive to initialization.
        Discretization is another approach which is less sensitive to random
        initialization [3]_.
        The cluster_qr method [5]_ directly extracts clusters from eigenvectors
        in spectral clustering. In contrast to k-means and discretization, cluster_qr
        has no tuning parameters and runs no iterations, yet may outperform
        k-means and discretization in terms of both quality and speed.

        .. versionchanged:: 1.1
           Added new labeling method 'cluster_qr'.

    degree : float, default=3
        Degree of the polynomial kernel. Ignored by other kernels.

    coef0 : float, default=1
        Zero coefficient for polynomial and sigmoid kernels.
        Ignored by other kernels.

    kernel_params : dict of str to any, default=None
        Parameters (keyword arguments) and values for kernel passed as
        callable object. Ignored by other kernels.

    n_jobs : int, default=None
        The number of parallel jobs to run when `affinity='nearest_neighbors'`
        or `affinity='precomputed_nearest_neighbors'`. The neighbors search
        will be done in parallel.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    verbose : bool, default=False
        Verbosity mode.

        .. versionadded:: 0.24

    Attributes
    ----------
    affinity_matrix_ : array-like of shape (n_samples, n_samples)
        Affinity matrix used for clustering. Available only after calling
        ``fit``.

    labels_ : ndarray of shape (n_samples,)
        Labels of each point

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    sklearn.cluster.KMeans : K-Means clustering.
    sklearn.cluster.DBSCAN : Density-Based Spatial Clustering of
        Applications with Noise.

    Notes
    -----
    A distance matrix for which 0 indicates identical elements and high values
    indicate very dissimilar elements can be transformed into an affinity /
    similarity matrix that is well-suited for the algorithm by
    applying the Gaussian (aka RBF, heat) kernel::

        np.exp(- dist_matrix ** 2 / (2. * delta ** 2))

    where ``delta`` is a free parameter representing the width of the Gaussian
    kernel.

    An alternative is to take a symmetric version of the k-nearest neighbors
    connectivity matrix of the points.

    If the pyamg package is installed, it is used: this greatly
    speeds up computation.

    References
    ----------
    .. [1] :doi:`Normalized cuts and image segmentation, 2000
           Jianbo Shi, Jitendra Malik
           <10.1109/34.868688>`

    .. [2] :doi:`A Tutorial on Spectral Clustering, 2007
           Ulrike von Luxburg
           <10.1007/s11222-007-9033-z>`

    .. [3] `Multiclass spectral clustering, 2003
           Stella X. Yu, Jianbo Shi
           <https://people.eecs.berkeley.edu/~jordan/courses/281B-spring04/readings/yu-shi.pdf>`_

    .. [4] :doi:`Toward the Optimal Preconditioned Eigensolver:
           Locally Optimal Block Preconditioned Conjugate Gradient Method, 2001
           A. V. Knyazev
           SIAM Journal on Scientific Computing 23, no. 2, pp. 517-541.
           <10.1137/S1064827500366124>`

    .. [5] :doi:`Simple, direct, and efficient multi-way spectral clustering, 2019
           Anil Damle, Victor Minden, Lexing Ying
           <10.1093/imaiai/iay008>`

    Examples
    --------
    >>> from sklearn.cluster import SpectralClustering
    >>> import numpy as np
    >>> X = np.array([[1, 1], [2, 1], [1, 0],
    ...               [4, 7], [3, 5], [3, 6]])
    >>> clustering = SpectralClustering(n_clusters=2,
    ...         assign_labels='discretize',
    ...         random_state=0).fit(X)
    >>> clustering.labels_
    array([1, 1, 1, 0, 0, 0])
    >>> clustering
    SpectralClustering(assign_labels='discretize', n_clusters=2,
        random_state=0)
    """

    # Declarative constraints for each constructor parameter.
    _parameter_constraints: dict = {
        "n_clusters": [Interval(Integral, 1, None, closed="left")],
        "eigen_solver": [StrOptions({"arpack", "lobpcg", "amg"}), None],
        "n_components": [Interval(Integral, 1, None, closed="left"), None],
        "random_state": ["random_state"],
        "n_init": [Interval(Integral, 1, None, closed="left")],
        "gamma": [Interval(Real, 0, None, closed="left")],
        "affinity": [
            callable,
            StrOptions(
                set(KERNEL_PARAMS)
                | {"nearest_neighbors", "precomputed", "precomputed_nearest_neighbors"}
            ),
        ],
        "n_neighbors": [Interval(Integral, 1, None, closed="left")],
        "eigen_tol": [
            Interval(Real, 0.0, None, closed="left"),
            StrOptions({"auto"}),
        ],
        "assign_labels": [StrOptions({"kmeans", "discretize", "cluster_qr"})],
        "degree": [Interval(Real, 0, None, closed="left")],
        "coef0": [Interval(Real, None, None, closed="neither")],
        "kernel_params": [dict, None],
        "n_jobs": [Integral, None],
        "verbose": ["verbose"],
    }

    def __init__(
        self,
        n_clusters=8,
        *,
        eigen_solver=None,
        n_components=None,
        random_state=None,
        n_init=10,
        gamma=1.0,
        affinity="rbf",
        n_neighbors=10,
        eigen_tol="auto",
        assign_labels="kmeans",
        degree=3,
        coef0=1,
        kernel_params=None,
        n_jobs=None,
        verbose=False,
    ):
        # Parameters are stored verbatim; nothing is computed here.
        self.n_clusters = n_clusters
        self.eigen_solver = eigen_solver
        self.n_components = n_components
        self.random_state = random_state
        self.n_init = n_init
        self.gamma = gamma
        self.affinity = affinity
        self.n_neighbors = n_neighbors
        self.eigen_tol = eigen_tol
        self.assign_labels = assign_labels
        self.degree = degree
        self.coef0 = coef0
        self.kernel_params = kernel_params
        self.n_jobs = n_jobs
        self.verbose = verbose

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """Perform spectral clustering from features, or affinity matrix.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features) or \
                (n_samples, n_samples)
            Training instances to cluster, similarities / affinities between
            instances if ``affinity='precomputed'``, or distances between
            instances if ``affinity='precomputed_nearest_neighbors'``. If a
            sparse matrix is provided in a format other than ``csr_matrix``,
            ``csc_matrix``, or ``coo_matrix``, it will be converted into a
            sparse ``csr_matrix``.

        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        self : object
            A fitted instance of the estimator.
        """
        X = self._validate_data(
            X,
            accept_sparse=["csr", "csc", "coo"],
            dtype=np.float64,
            ensure_min_samples=2,
        )
        # A square X combined with a non-precomputed affinity may be an
        # affinity matrix passed by mistake, so warn about it.
        allow_squared = self.affinity in [
            "precomputed",
            "precomputed_nearest_neighbors",
        ]
        if X.shape[0] == X.shape[1] and not allow_squared:
            warnings.warn(
                "The spectral clustering API has changed. ``fit`` "
                "now constructs an affinity matrix from data. To use"
                " a custom affinity matrix, "
                "set ``affinity=precomputed``."
            )

        if self.affinity == "nearest_neighbors":
            connectivity = kneighbors_graph(
                X, n_neighbors=self.n_neighbors, include_self=True, n_jobs=self.n_jobs
            )
            # Average with the transpose to obtain a symmetric matrix.
            self.affinity_matrix_ = 0.5 * (connectivity + connectivity.T)
        elif self.affinity == "precomputed_nearest_neighbors":
            estimator = NearestNeighbors(
                n_neighbors=self.n_neighbors, n_jobs=self.n_jobs, metric="precomputed"
            ).fit(X)
            connectivity = estimator.kneighbors_graph(X=X, mode="connectivity")
            self.affinity_matrix_ = 0.5 * (connectivity + connectivity.T)
        elif self.affinity == "precomputed":
            self.affinity_matrix_ = X
        else:
            # Work on a copy: writing gamma/degree/coef0 straight into
            # ``self.kernel_params`` would mutate the dict the user passed to
            # the constructor.
            params = {} if self.kernel_params is None else dict(self.kernel_params)
            if not callable(self.affinity):
                params["gamma"] = self.gamma
                params["degree"] = self.degree
                params["coef0"] = self.coef0
            self.affinity_matrix_ = pairwise_kernels(
                X, metric=self.affinity, filter_params=True, **params
            )

        random_state = check_random_state(self.random_state)
        n_components = (
            self.n_clusters if self.n_components is None else self.n_components
        )
        # We now obtain the real valued solution matrix to the
        # relaxed Ncut problem, solving the eigenvalue problem
        # L_sym x = lambda x  and recovering u = D^-1/2 x.
        # The first eigenvector is constant only for fully connected graphs
        # and should be kept for spectral clustering (drop_first = False)
        # See spectral_embedding documentation.
        maps = _spectral_embedding(
            self.affinity_matrix_,
            n_components=n_components,
            eigen_solver=self.eigen_solver,
            random_state=random_state,
            eigen_tol=self.eigen_tol,
            drop_first=False,
        )
        if self.verbose:
            print(f"Computing label assignment using {self.assign_labels}")

        if self.assign_labels == "kmeans":
            _, self.labels_, _ = k_means(
                maps,
                self.n_clusters,
                random_state=random_state,
                n_init=self.n_init,
                verbose=self.verbose,
            )
        elif self.assign_labels == "cluster_qr":
            self.labels_ = cluster_qr(maps)
        else:
            self.labels_ = discretize(maps, random_state=random_state)

        return self

    def fit_predict(self, X, y=None):
        """Perform spectral clustering on `X` and return cluster labels.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features) or \
                (n_samples, n_samples)
            Training instances to cluster, similarities / affinities between
            instances if ``affinity='precomputed'``, or distances between
            instances if ``affinity='precomputed_nearest_neighbors'``. If a
            sparse matrix is provided in a format other than ``csr_matrix``,
            ``csc_matrix``, or ``coo_matrix``, it will be converted into a
            sparse ``csr_matrix``.

        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        labels : ndarray of shape (n_samples,)
            Cluster labels.
        """
        return super().fit_predict(X, y)

    def _more_tags(self):
        # X is a pairwise (sample-by-sample) matrix only for the two
        # "precomputed" affinity modes.
        return {
            "pairwise": self.affinity
            in [
                "precomputed",
                "precomputed_nearest_neighbors",
            ]
        }
|
||||
Reference in New Issue
Block a user