"""Spectral Embedding."""

# Author: Gael Varoquaux <gael.varoquaux@normalesup.org>
#         Wei LI <kuantkid@gmail.com>
# License: BSD 3 clause


import warnings
from numbers import Integral, Real

import numpy as np
from scipy import sparse
from scipy.linalg import eigh
from scipy.sparse.csgraph import connected_components
from scipy.sparse.linalg import eigsh, lobpcg

from ..base import BaseEstimator, _fit_context
from ..metrics.pairwise import rbf_kernel
from ..neighbors import NearestNeighbors, kneighbors_graph
from ..utils import (
    check_array,
    check_random_state,
    check_symmetric,
)
from ..utils._arpack import _init_arpack_v0
from ..utils._param_validation import Interval, StrOptions, validate_params
from ..utils.extmath import _deterministic_vector_sign_flip
from ..utils.fixes import laplacian as csgraph_laplacian
from ..utils.fixes import parse_version, sp_version


def _graph_connected_component(graph, node_id):
    """Find the largest graph connected component that contains one
    given node.

    Parameters
    ----------
    graph : array-like of shape (n_samples, n_samples)
        Adjacency matrix of the graph, non-zero weight means an edge
        between the nodes.

    node_id : int
        The index of the query node of the graph.

    Returns
    -------
    connected_components_matrix : array-like of shape (n_samples,)
        An array of bool values indicating the indexes of the nodes
        belonging to the largest connected component containing the
        given query node.
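
    Examples
    --------
    A small illustrative example on a dense adjacency matrix with two
    components, nodes {0, 1} and node {2}:

    >>> import numpy as np
    >>> graph = np.array([[1, 1, 0],
    ...                   [1, 1, 0],
    ...                   [0, 0, 1]])
    >>> _graph_connected_component(graph, 0)
    array([ True,  True, False])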
    """
    n_node = graph.shape[0]
    if sparse.issparse(graph):
        # speed up row-wise access to boolean connection mask
        graph = graph.tocsr()
    connected_nodes = np.zeros(n_node, dtype=bool)
    nodes_to_explore = np.zeros(n_node, dtype=bool)
    nodes_to_explore[node_id] = True
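    # Breadth-first expansion: `nodes_to_explore` holds the current frontier
    # while `connected_nodes` accumulates every node reached so far.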
    for _ in range(n_node):
        last_num_component = connected_nodes.sum()
        np.logical_or(connected_nodes, nodes_to_explore, out=connected_nodes)
        if last_num_component >= connected_nodes.sum():
            break
        indices = np.where(nodes_to_explore)[0]
        nodes_to_explore.fill(False)
        for i in indices:
            if sparse.issparse(graph):
                # scipy not yet implemented 1D sparse slices; can be changed back to
                # `neighbors = graph[i].toarray().ravel()` once implemented
                neighbors = graph[[i], :].toarray().ravel()
            else:
                neighbors = graph[i]
            np.logical_or(nodes_to_explore, neighbors, out=nodes_to_explore)
    return connected_nodes


def _graph_is_connected(graph):
    """Return whether the graph is connected (True) or not (False).

    Parameters
    ----------
    graph : {array-like, sparse matrix} of shape (n_samples, n_samples)
        Adjacency matrix of the graph, non-zero weight means an edge
        between the nodes.

    Returns
    -------
    is_connected : bool
        True means the graph is fully connected and False means not.
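
    Examples
    --------
    An illustrative check: a connected pair of nodes versus two
    isolated nodes.

    >>> import numpy as np
    >>> bool(_graph_is_connected(np.array([[1, 1], [1, 1]])))
    True
    >>> bool(_graph_is_connected(np.array([[1, 0], [0, 1]])))
    False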
    """
    if sparse.issparse(graph):
        # Before Scipy 1.11.3, `connected_components` only supports 32-bit indices.
        # PR: https://github.com/scipy/scipy/pull/18913
        # First integration in 1.11.3: https://github.com/scipy/scipy/pull/19279
        # TODO(jjerphan): Once SciPy 1.11.3 is the minimum supported version, use
        # `accept_large_sparse=True`.
        accept_large_sparse = sp_version >= parse_version("1.11.3")
        graph = check_array(
            graph, accept_sparse=True, accept_large_sparse=accept_large_sparse
        )
        # sparse graph, find all the connected components
        n_connected_components, _ = connected_components(graph)
        return n_connected_components == 1
    else:
        # dense graph, find all connected components starting from node 0
        return _graph_connected_component(graph, 0).sum() == graph.shape[0]


def _set_diag(laplacian, value, norm_laplacian):
    """Set the diagonal of the laplacian matrix and convert it to a
    sparse format well suited for eigenvalue decomposition.

    Parameters
    ----------
    laplacian : {ndarray, sparse matrix}
        The graph laplacian.

    value : float
        The value of the diagonal.

    norm_laplacian : bool
        Whether the value of the diagonal should be changed or not.

    Returns
    -------
    laplacian : {array, sparse matrix}
        An array or matrix in a form that is well suited to fast
        eigenvalue decomposition, depending on the bandwidth of the
        matrix.
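
    Examples
    --------
    An illustrative call, assuming a small dense symmetric
    Laplacian-like matrix:

    >>> import numpy as np
    >>> lap = np.array([[2.0, -1.0], [-1.0, 2.0]])
    >>> _set_diag(lap, 1, norm_laplacian=True)
    array([[ 1., -1.],
           [-1.,  1.]])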
    """
    n_nodes = laplacian.shape[0]
    # We need all entries in the diagonal to be set to `value`
    if not sparse.issparse(laplacian):
        if norm_laplacian:
            laplacian.flat[:: n_nodes + 1] = value
    else:
        laplacian = laplacian.tocoo()
        if norm_laplacian:
            diag_idx = laplacian.row == laplacian.col
            laplacian.data[diag_idx] = value
        # If the matrix has a small number of diagonals (as in the
        # case of structured matrices coming from images), the
        # dia format might be best suited for matvec products:
        n_diags = np.unique(laplacian.row - laplacian.col).size
        if n_diags <= 7:
            # 3 or less outer diagonals on each side
            laplacian = laplacian.todia()
        else:
            # csr has the fastest matvec and is thus best suited to
            # arpack
            laplacian = laplacian.tocsr()
    return laplacian


@validate_params(
    {
        "adjacency": ["array-like", "sparse matrix"],
        "n_components": [Interval(Integral, 1, None, closed="left")],
        "eigen_solver": [StrOptions({"arpack", "lobpcg", "amg"}), None],
        "random_state": ["random_state"],
        "eigen_tol": [Interval(Real, 0, None, closed="left"), StrOptions({"auto"})],
        "norm_laplacian": ["boolean"],
        "drop_first": ["boolean"],
    },
    prefer_skip_nested_validation=True,
)
def spectral_embedding(
    adjacency,
    *,
    n_components=8,
    eigen_solver=None,
    random_state=None,
    eigen_tol="auto",
    norm_laplacian=True,
    drop_first=True,
):
    """Project the sample on the first eigenvectors of the graph Laplacian.

    The adjacency matrix is used to compute a normalized graph Laplacian
    whose spectrum (especially the eigenvectors associated to the
    smallest eigenvalues) has an interpretation in terms of minimal
    number of cuts necessary to split the graph into comparably sized
    components.

    This embedding can also 'work' even if the ``adjacency`` variable is
    not strictly the adjacency matrix of a graph but more generally
    an affinity or similarity matrix between samples (for instance the
    heat kernel of a euclidean distance matrix or a k-NN matrix).

    However, care must be taken to always make the affinity matrix
    symmetric so that the eigenvector decomposition works as expected.

    Note : Laplacian Eigenmaps is the actual algorithm implemented here.

    Read more in the :ref:`User Guide <spectral_embedding>`.

    Parameters
    ----------
    adjacency : {array-like, sparse graph} of shape (n_samples, n_samples)
        The adjacency matrix of the graph to embed.

    n_components : int, default=8
        The dimension of the projection subspace.

    eigen_solver : {'arpack', 'lobpcg', 'amg'}, default=None
        The eigenvalue decomposition strategy to use. AMG requires pyamg
        to be installed. It can be faster on very large, sparse problems,
        but may also lead to instabilities. If None, then ``'arpack'`` is
        used.

    random_state : int, RandomState instance or None, default=None
        A pseudo random number generator used for the initialization
        of the lobpcg eigenvector decomposition when `eigen_solver ==
        'amg'`, and for the K-Means initialization. Use an int to make
        the results deterministic across calls (See
        :term:`Glossary <random_state>`).

        .. note::
            When using `eigen_solver == 'amg'`,
            it is necessary to also fix the global numpy seed with
            `np.random.seed(int)` to get deterministic results. See
            https://github.com/pyamg/pyamg/issues/139 for further
            information.

    eigen_tol : float, default="auto"
        Stopping criterion for eigendecomposition of the Laplacian matrix.
        If `eigen_tol="auto"` then the passed tolerance will depend on the
        `eigen_solver`:

        - If `eigen_solver="arpack"`, then `eigen_tol=0.0`;
        - If `eigen_solver="lobpcg"` or `eigen_solver="amg"`, then
          `eigen_tol=None` which configures the underlying `lobpcg` solver to
          automatically resolve the value according to its heuristics. See
          :func:`scipy.sparse.linalg.lobpcg` for details.

        Note that when using `eigen_solver="amg"` values of `tol<1e-5` may lead
        to convergence issues and should be avoided.

        .. versionadded:: 1.2
           Added 'auto' option.

    norm_laplacian : bool, default=True
        If True, then compute symmetric normalized Laplacian.

    drop_first : bool, default=True
        Whether to drop the first eigenvector. For spectral embedding, this
        should be True as the first eigenvector should be a constant vector
        for a connected graph, but for spectral clustering, this should be
        kept as False to retain the first eigenvector.

    Returns
    -------
    embedding : ndarray of shape (n_samples, n_components)
        The reduced samples.

    Notes
    -----
    Spectral Embedding (Laplacian Eigenmaps) is most useful when the graph
    has one connected component. If the graph has many components, the first
    few eigenvectors will simply uncover the connected components of the
    graph.

    References
    ----------
    * https://en.wikipedia.org/wiki/LOBPCG

    * :doi:`"Toward the Optimal Preconditioned Eigensolver: Locally Optimal
      Block Preconditioned Conjugate Gradient Method",
      Andrew V. Knyazev
      <10.1137/S1064827500366124>`

    Examples
    --------
    >>> from sklearn.datasets import load_digits
    >>> from sklearn.neighbors import kneighbors_graph
    >>> from sklearn.manifold import spectral_embedding
    >>> X, _ = load_digits(return_X_y=True)
    >>> X = X[:100]
    >>> affinity_matrix = kneighbors_graph(
    ...     X, n_neighbors=int(X.shape[0] / 10), include_self=True
    ... )
    >>> # make the matrix symmetric
    >>> affinity_matrix = 0.5 * (affinity_matrix + affinity_matrix.T)
    >>> embedding = spectral_embedding(affinity_matrix, n_components=2, random_state=42)
    >>> embedding.shape
    (100, 2)
    """
    random_state = check_random_state(random_state)

    return _spectral_embedding(
        adjacency,
        n_components=n_components,
        eigen_solver=eigen_solver,
        random_state=random_state,
        eigen_tol=eigen_tol,
        norm_laplacian=norm_laplacian,
        drop_first=drop_first,
    )


def _spectral_embedding(
    adjacency,
    *,
    n_components=8,
    eigen_solver=None,
    random_state=None,
    eigen_tol="auto",
    norm_laplacian=True,
    drop_first=True,
):
    adjacency = check_symmetric(adjacency)

    if eigen_solver == "amg":
        try:
            from pyamg import smoothed_aggregation_solver
        except ImportError as e:
            raise ValueError(
                "The eigen_solver was set to 'amg', but pyamg is not available."
            ) from e

    if eigen_solver is None:
        eigen_solver = "arpack"

    n_nodes = adjacency.shape[0]
    # Whether to drop the first eigenvector
    if drop_first:
        n_components = n_components + 1

    if not _graph_is_connected(adjacency):
        warnings.warn(
            "Graph is not fully connected, spectral embedding may not work as expected."
        )

    laplacian, dd = csgraph_laplacian(
        adjacency, normed=norm_laplacian, return_diag=True
    )
    if (
        eigen_solver == "arpack"
        or eigen_solver != "lobpcg"
        and (not sparse.issparse(laplacian) or n_nodes < 5 * n_components)
    ):
        # lobpcg used with eigen_solver='amg' has bugs for low number of nodes
        # for details see the source code in scipy:
        # https://github.com/scipy/scipy/blob/v0.11.0/scipy/sparse/linalg/eigen
        # /lobpcg/lobpcg.py#L237
        # or matlab:
        # https://www.mathworks.com/matlabcentral/fileexchange/48-lobpcg-m
        laplacian = _set_diag(laplacian, 1, norm_laplacian)

        # Here we'll use shift-invert mode for fast eigenvalues
        # (see https://docs.scipy.org/doc/scipy/reference/tutorial/arpack.html
        # for a short explanation of what this means)
        # Because the normalized Laplacian has eigenvalues between 0 and 2,
        # I - L has eigenvalues between -1 and 1. ARPACK is most efficient
        # when finding eigenvalues of largest magnitude (keyword which='LM')
        # and when these eigenvalues are very large compared to the rest.
        # For very large, very sparse graphs, I - L can have many, many
        # eigenvalues very near 1.0. This leads to slow convergence. So
        # instead, we'll use ARPACK's shift-invert mode, asking for the
        # eigenvalues near 1.0. This effectively spreads-out the spectrum
        # near 1.0 and leads to much faster convergence: potentially an
        # orders-of-magnitude speedup over simply using keyword which='LA'
        # in standard mode.
        try:
            # We are computing the opposite of the laplacian inplace so as
            # to spare a memory allocation of a possibly very large array
            tol = 0 if eigen_tol == "auto" else eigen_tol
            laplacian *= -1
            v0 = _init_arpack_v0(laplacian.shape[0], random_state)
            laplacian = check_array(
                laplacian, accept_sparse="csr", accept_large_sparse=False
            )
            _, diffusion_map = eigsh(
                laplacian, k=n_components, sigma=1.0, which="LM", tol=tol, v0=v0
            )
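            # `eigsh` returns eigenpairs sorted by ascending eigenvalue, so
            # reversing below puts the eigenvectors associated with the
            # smallest Laplacian eigenvalues first.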
            embedding = diffusion_map.T[n_components::-1]
            if norm_laplacian:
                # recover u = D^-1/2 x from the eigenvector output x
                embedding = embedding / dd
        except RuntimeError:
            # When submatrices are exactly singular, an LU decomposition
            # in arpack fails. We fallback to lobpcg
            eigen_solver = "lobpcg"
            # Revert the laplacian to its opposite to have lobpcg work
            laplacian *= -1

    elif eigen_solver == "amg":
        # Use AMG to get a preconditioner and speed up the eigenvalue
        # problem.
        if not sparse.issparse(laplacian):
            warnings.warn("AMG works better for sparse matrices")
        laplacian = check_array(
            laplacian, dtype=[np.float64, np.float32], accept_sparse=True
        )
        laplacian = _set_diag(laplacian, 1, norm_laplacian)

        # The Laplacian matrix is always singular, having at least one zero
        # eigenvalue, corresponding to the trivial eigenvector, which is a
        # constant. Using a singular matrix for preconditioning may result in
        # random failures in LOBPCG and is not supported by the existing
        # theory:
        #     see https://doi.org/10.1007/s10208-015-9297-1
        # Shift the Laplacian so its diagonal is not all ones. The shift
        # does change the eigenpairs however, so we'll feed the shifted
        # matrix to the solver and afterward set it back to the original.
        diag_shift = 1e-5 * sparse.eye(laplacian.shape[0])
        laplacian += diag_shift
        if hasattr(sparse, "csr_array") and isinstance(laplacian, sparse.csr_array):
            # `pyamg` does not work with `csr_array` and we need to convert it to a
            # `csr_matrix` object.
            laplacian = sparse.csr_matrix(laplacian)
        ml = smoothed_aggregation_solver(check_array(laplacian, accept_sparse="csr"))
        laplacian -= diag_shift

        M = ml.aspreconditioner()
        # Create initial approximation X to eigenvectors
        X = random_state.standard_normal(size=(laplacian.shape[0], n_components + 1))
        X[:, 0] = dd.ravel()
        X = X.astype(laplacian.dtype)

        tol = None if eigen_tol == "auto" else eigen_tol
        _, diffusion_map = lobpcg(laplacian, X, M=M, tol=tol, largest=False)
        embedding = diffusion_map.T
        if norm_laplacian:
            # recover u = D^-1/2 x from the eigenvector output x
            embedding = embedding / dd
        if embedding.shape[0] == 1:
            raise ValueError

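    # This branch handles both a user-requested 'lobpcg' solver and the
    # fallback after an ARPACK RuntimeError above.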
    if eigen_solver == "lobpcg":
        laplacian = check_array(
            laplacian, dtype=[np.float64, np.float32], accept_sparse=True
        )
        if n_nodes < 5 * n_components + 1:
            # see note above under arpack why lobpcg has problems with small
            # number of nodes
            # lobpcg will fallback to eigh, so we short circuit it
            if sparse.issparse(laplacian):
                laplacian = laplacian.toarray()
            _, diffusion_map = eigh(laplacian, check_finite=False)
            embedding = diffusion_map.T[:n_components]
            if norm_laplacian:
                # recover u = D^-1/2 x from the eigenvector output x
                embedding = embedding / dd
        else:
            laplacian = _set_diag(laplacian, 1, norm_laplacian)
            # We increase the number of eigenvectors requested, as lobpcg
            # doesn't behave well in low dimension and create initial
            # approximation X to eigenvectors
            X = random_state.standard_normal(
                size=(laplacian.shape[0], n_components + 1)
            )
            X[:, 0] = dd.ravel()
            X = X.astype(laplacian.dtype)
            tol = None if eigen_tol == "auto" else eigen_tol
            _, diffusion_map = lobpcg(
                laplacian, X, tol=tol, largest=False, maxiter=2000
            )
            embedding = diffusion_map.T[:n_components]
            if norm_laplacian:
                # recover u = D^-1/2 x from the eigenvector output x
                embedding = embedding / dd
            if embedding.shape[0] == 1:
                raise ValueError

    embedding = _deterministic_vector_sign_flip(embedding)
    if drop_first:
        return embedding[1:n_components].T
    else:
        return embedding[:n_components].T


class SpectralEmbedding(BaseEstimator):
    """Spectral embedding for non-linear dimensionality reduction.

    Forms an affinity matrix given by the specified function and
    applies spectral decomposition to the corresponding graph laplacian.
    The resulting transformation is given by the value of the
    eigenvectors for each data point.

    Note : Laplacian Eigenmaps is the actual algorithm implemented here.

    Read more in the :ref:`User Guide <spectral_embedding>`.

    Parameters
    ----------
    n_components : int, default=2
        The dimension of the projected subspace.

    affinity : {'nearest_neighbors', 'rbf', 'precomputed', \
                'precomputed_nearest_neighbors'} or callable, \
                default='nearest_neighbors'
        How to construct the affinity matrix.
         - 'nearest_neighbors' : construct the affinity matrix by computing a
           graph of nearest neighbors.
         - 'rbf' : construct the affinity matrix by computing a radial basis
           function (RBF) kernel.
         - 'precomputed' : interpret ``X`` as a precomputed affinity matrix.
         - 'precomputed_nearest_neighbors' : interpret ``X`` as a sparse graph
           of precomputed nearest neighbors, and construct the affinity matrix
           by selecting the ``n_neighbors`` nearest neighbors.
         - callable : use the passed-in function as the affinity; it takes a
           data matrix (n_samples, n_features) and returns an affinity matrix
           (n_samples, n_samples).

    gamma : float, default=None
        Kernel coefficient for rbf kernel. If None, gamma will be set to
        1/n_features.

    random_state : int, RandomState instance or None, default=None
        A pseudo random number generator used for the initialization
        of the lobpcg eigenvector decomposition when `eigen_solver ==
        'amg'`, and for the K-Means initialization. Use an int to make
        the results deterministic across calls (See
        :term:`Glossary <random_state>`).

        .. note::
            When using `eigen_solver == 'amg'`,
            it is necessary to also fix the global numpy seed with
            `np.random.seed(int)` to get deterministic results. See
            https://github.com/pyamg/pyamg/issues/139 for further
            information.

    eigen_solver : {'arpack', 'lobpcg', 'amg'}, default=None
        The eigenvalue decomposition strategy to use. AMG requires pyamg
        to be installed. It can be faster on very large, sparse problems.
        If None, then ``'arpack'`` is used.

    eigen_tol : float, default="auto"
        Stopping criterion for eigendecomposition of the Laplacian matrix.
        If `eigen_tol="auto"` then the passed tolerance will depend on the
        `eigen_solver`:

        - If `eigen_solver="arpack"`, then `eigen_tol=0.0`;
        - If `eigen_solver="lobpcg"` or `eigen_solver="amg"`, then
          `eigen_tol=None` which configures the underlying `lobpcg` solver to
          automatically resolve the value according to its heuristics. See
          :func:`scipy.sparse.linalg.lobpcg` for details.

        Note that when using `eigen_solver="lobpcg"` or `eigen_solver="amg"`
        values of `tol<1e-5` may lead to convergence issues and should be
        avoided.

        .. versionadded:: 1.2

    n_neighbors : int, default=None
        Number of nearest neighbors for nearest_neighbors graph building.
        If None, n_neighbors will be set to max(n_samples/10, 1).

    n_jobs : int, default=None
        The number of parallel jobs to run.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    Attributes
    ----------
    embedding_ : ndarray of shape (n_samples, n_components)
        Spectral embedding of the training matrix.

    affinity_matrix_ : ndarray of shape (n_samples, n_samples)
        Affinity matrix constructed from samples or precomputed.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    n_neighbors_ : int
        Number of nearest neighbors effectively used.

    See Also
    --------
    Isomap : Non-linear dimensionality reduction through Isometric Mapping.

    References
    ----------

    - :doi:`A Tutorial on Spectral Clustering, 2007
      Ulrike von Luxburg
      <10.1007/s11222-007-9033-z>`

    - `On Spectral Clustering: Analysis and an algorithm, 2001
      Andrew Y. Ng, Michael I. Jordan, Yair Weiss
      <https://citeseerx.ist.psu.edu/doc_view/pid/796c5d6336fc52aa84db575fb821c78918b65f58>`_

    - :doi:`Normalized cuts and image segmentation, 2000
      Jianbo Shi, Jitendra Malik
      <10.1109/34.868688>`

    Examples
    --------
    >>> from sklearn.datasets import load_digits
    >>> from sklearn.manifold import SpectralEmbedding
    >>> X, _ = load_digits(return_X_y=True)
    >>> X.shape
    (1797, 64)
    >>> embedding = SpectralEmbedding(n_components=2)
    >>> X_transformed = embedding.fit_transform(X[:100])
    >>> X_transformed.shape
    (100, 2)
    """

    _parameter_constraints: dict = {
        "n_components": [Interval(Integral, 1, None, closed="left")],
        "affinity": [
            StrOptions(
                {
                    "nearest_neighbors",
                    "rbf",
                    "precomputed",
                    "precomputed_nearest_neighbors",
                },
            ),
            callable,
        ],
        "gamma": [Interval(Real, 0, None, closed="left"), None],
        "random_state": ["random_state"],
        "eigen_solver": [StrOptions({"arpack", "lobpcg", "amg"}), None],
        "eigen_tol": [Interval(Real, 0, None, closed="left"), StrOptions({"auto"})],
        "n_neighbors": [Interval(Integral, 1, None, closed="left"), None],
        "n_jobs": [None, Integral],
    }

    def __init__(
        self,
        n_components=2,
        *,
        affinity="nearest_neighbors",
        gamma=None,
        random_state=None,
        eigen_solver=None,
        eigen_tol="auto",
        n_neighbors=None,
        n_jobs=None,
    ):
        self.n_components = n_components
        self.affinity = affinity
        self.gamma = gamma
        self.random_state = random_state
        self.eigen_solver = eigen_solver
        self.eigen_tol = eigen_tol
        self.n_neighbors = n_neighbors
        self.n_jobs = n_jobs

    def _more_tags(self):
        return {
            "pairwise": self.affinity
            in [
                "precomputed",
                "precomputed_nearest_neighbors",
            ]
        }

    def _get_affinity_matrix(self, X, Y=None):
        """Calculate the affinity matrix from data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training vector, where `n_samples` is the number of samples
            and `n_features` is the number of features.

            If affinity is "precomputed"
            X : array-like of shape (n_samples, n_samples),
            Interpret X as precomputed adjacency graph computed from
            samples.

        Y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        affinity_matrix of shape (n_samples, n_samples)
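
        Examples
        --------
        An illustrative call with the 'rbf' affinity on a tiny input:

        >>> import numpy as np
        >>> se = SpectralEmbedding(affinity="rbf", gamma=0.5)
        >>> se._get_affinity_matrix(np.array([[0.0, 0.0], [1.0, 1.0]])).shape
        (2, 2)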
        """
        if self.affinity == "precomputed":
            self.affinity_matrix_ = X
            return self.affinity_matrix_
        if self.affinity == "precomputed_nearest_neighbors":
            estimator = NearestNeighbors(
                n_neighbors=self.n_neighbors, n_jobs=self.n_jobs, metric="precomputed"
            ).fit(X)
            connectivity = estimator.kneighbors_graph(X=X, mode="connectivity")
            self.affinity_matrix_ = 0.5 * (connectivity + connectivity.T)
            return self.affinity_matrix_
        if self.affinity == "nearest_neighbors":
            if sparse.issparse(X):
                warnings.warn(
                    "Nearest neighbors affinity currently does "
                    "not support sparse input, falling back to "
                    "rbf affinity"
                )
                self.affinity = "rbf"
            else:
                self.n_neighbors_ = (
                    self.n_neighbors
                    if self.n_neighbors is not None
                    else max(int(X.shape[0] / 10), 1)
                )
                self.affinity_matrix_ = kneighbors_graph(
                    X, self.n_neighbors_, include_self=True, n_jobs=self.n_jobs
                )
                # currently only symmetric affinity_matrix supported
                self.affinity_matrix_ = 0.5 * (
                    self.affinity_matrix_ + self.affinity_matrix_.T
                )
                return self.affinity_matrix_
        if self.affinity == "rbf":
            self.gamma_ = self.gamma if self.gamma is not None else 1.0 / X.shape[1]
            self.affinity_matrix_ = rbf_kernel(X, gamma=self.gamma_)
            return self.affinity_matrix_
        self.affinity_matrix_ = self.affinity(X)
        return self.affinity_matrix_

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """Fit the model from data in X.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training vector, where `n_samples` is the number of samples
            and `n_features` is the number of features.

            If affinity is "precomputed"
            X : {array-like, sparse matrix}, shape (n_samples, n_samples),
            Interpret X as precomputed adjacency graph computed from
            samples.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        X = self._validate_data(X, accept_sparse="csr", ensure_min_samples=2)

        random_state = check_random_state(self.random_state)

        affinity_matrix = self._get_affinity_matrix(X)
        self.embedding_ = _spectral_embedding(
            affinity_matrix,
            n_components=self.n_components,
            eigen_solver=self.eigen_solver,
            eigen_tol=self.eigen_tol,
            random_state=random_state,
        )
        return self

    def fit_transform(self, X, y=None):
        """Fit the model from data in X and transform X.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training vector, where `n_samples` is the number of samples
            and `n_features` is the number of features.

            If affinity is "precomputed"
            X : {array-like, sparse matrix} of shape (n_samples, n_samples),
            Interpret X as precomputed adjacency graph computed from
            samples.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        X_new : array-like of shape (n_samples, n_components)
            Spectral embedding of the training matrix.
        """
        self.fit(X)
        return self.embedding_