reconnect moved files to git repo
@@ -0,0 +1,52 @@
"""Evaluation metrics for cluster analysis results.

- Supervised evaluation uses ground truth class values for each sample.
- Unsupervised evaluation does not use ground truths and measures the "quality" of the
  model itself.
"""

from ._bicluster import consensus_score
from ._supervised import (
    adjusted_mutual_info_score,
    adjusted_rand_score,
    completeness_score,
    contingency_matrix,
    entropy,
    expected_mutual_information,
    fowlkes_mallows_score,
    homogeneity_completeness_v_measure,
    homogeneity_score,
    mutual_info_score,
    normalized_mutual_info_score,
    pair_confusion_matrix,
    rand_score,
    v_measure_score,
)
from ._unsupervised import (
    calinski_harabasz_score,
    davies_bouldin_score,
    silhouette_samples,
    silhouette_score,
)

__all__ = [
    "adjusted_mutual_info_score",
    "normalized_mutual_info_score",
    "adjusted_rand_score",
    "rand_score",
    "completeness_score",
    "pair_confusion_matrix",
    "contingency_matrix",
    "expected_mutual_information",
    "homogeneity_completeness_v_measure",
    "homogeneity_score",
    "mutual_info_score",
    "v_measure_score",
    "fowlkes_mallows_score",
    "entropy",
    "silhouette_samples",
    "silhouette_score",
    "calinski_harabasz_score",
    "davies_bouldin_score",
    "consensus_score",
]
@@ -0,0 +1,111 @@
import numpy as np
from scipy.optimize import linear_sum_assignment

from ...utils._param_validation import StrOptions, validate_params
from ...utils.validation import check_array, check_consistent_length

__all__ = ["consensus_score"]


def _check_rows_and_columns(a, b):
    """Unpacks the row and column arrays and checks their shape."""
    check_consistent_length(*a)
    check_consistent_length(*b)
    checks = lambda x: check_array(x, ensure_2d=False)
    a_rows, a_cols = map(checks, a)
    b_rows, b_cols = map(checks, b)
    return a_rows, a_cols, b_rows, b_cols


def _jaccard(a_rows, a_cols, b_rows, b_cols):
    """Jaccard coefficient on the elements of the two biclusters."""
    intersection = (a_rows * b_rows).sum() * (a_cols * b_cols).sum()

    a_size = a_rows.sum() * a_cols.sum()
    b_size = b_rows.sum() * b_cols.sum()

    return intersection / (a_size + b_size - intersection)


def _pairwise_similarity(a, b, similarity):
    """Computes the pairwise similarity matrix.

    result[i, j] is the similarity of a's bicluster i and b's bicluster j.
    """
    a_rows, a_cols, b_rows, b_cols = _check_rows_and_columns(a, b)
    n_a = a_rows.shape[0]
    n_b = b_rows.shape[0]
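    # evaluate the similarity for every (i, j) pair of biclusters: O(n_a * n_b) calls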
    result = np.array(
        [
            [similarity(a_rows[i], a_cols[i], b_rows[j], b_cols[j]) for j in range(n_b)]
            for i in range(n_a)
        ]
    )
    return result


@validate_params(
    {
        "a": [tuple],
        "b": [tuple],
        "similarity": [callable, StrOptions({"jaccard"})],
    },
    prefer_skip_nested_validation=True,
)
def consensus_score(a, b, *, similarity="jaccard"):
    """The similarity of two sets of biclusters.

    Similarity between individual biclusters is computed. Then the best
    matching between sets is found by solving a linear sum assignment problem,
    using a modified Jonker-Volgenant algorithm.
    The final score is the sum of similarities divided by the size of
    the larger set.

    Read more in the :ref:`User Guide <biclustering>`.

    Parameters
    ----------
    a : tuple (rows, columns)
        Tuple of row and column indicators for a set of biclusters.

    b : tuple (rows, columns)
        Another set of biclusters like ``a``.

    similarity : 'jaccard' or callable, default='jaccard'
        May be the string "jaccard" to use the Jaccard coefficient, or
        any function that takes four arguments, each of which is a 1d
        indicator vector: (a_rows, a_columns, b_rows, b_columns).

    Returns
    -------
    consensus_score : float
        Consensus score, a non-negative value, sum of similarities
        divided by size of larger set.

    See Also
    --------
    scipy.optimize.linear_sum_assignment : Solve the linear sum assignment problem.

    References
    ----------
    * Hochreiter, Bodenhofer, et al., 2010. `FABIA: factor analysis
      for bicluster acquisition
      <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2881408/>`__.

    Examples
    --------
    >>> from sklearn.metrics import consensus_score
    >>> a = ([[True, False], [False, True]], [[False, True], [True, False]])
    >>> b = ([[False, True], [True, False]], [[True, False], [False, True]])
    >>> consensus_score(a, b, similarity='jaccard')
    np.float64(1.0)
    """
    if similarity == "jaccard":
        similarity = _jaccard
    matrix = _pairwise_similarity(a, b, similarity)
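    # linear_sum_assignment minimizes total cost, so passing (1 - similarity)
    # selects the one-to-one matching of biclusters with maximal total similarity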
    row_indices, col_indices = linear_sum_assignment(1.0 - matrix)
    n_a = len(a[0])
    n_b = len(b[0])
    return matrix[row_indices, col_indices].sum() / max(n_a, n_b)
@@ -0,0 +1,70 @@
# Authors: Robert Layton <robertlayton@gmail.com>
#          Corey Lynch <coreylynch9@gmail.com>
# License: BSD 3 clause

from libc.math cimport exp, lgamma

from ...utils._typedefs cimport float64_t, int64_t

import numpy as np
from scipy.special import gammaln


def expected_mutual_information(contingency, int64_t n_samples):
    """Calculate the expected mutual information for two labelings."""
    cdef:
        float64_t emi = 0
        int64_t n_rows, n_cols
        float64_t term2, term3, gln
        int64_t[::1] a_view, b_view
        float64_t[::1] term1
        float64_t[::1] gln_a, gln_b, gln_Na, gln_Nb, gln_Nnij, log_Nnij
        float64_t[::1] log_a, log_b
        Py_ssize_t i, j, nij
        int64_t start, end

    n_rows, n_cols = contingency.shape
    a = np.ravel(contingency.sum(axis=1).astype(np.int64, copy=False))
    b = np.ravel(contingency.sum(axis=0).astype(np.int64, copy=False))
    a_view = a
    b_view = b

    # any labelling with zero entropy implies EMI = 0
    if a.size == 1 or b.size == 1:
        return 0.0

    # There are three major terms to the EMI equation, which are multiplied
    # together and then summed over varying nij values.
    # While nijs[0] will never be used, having it simplifies the indexing.
    nijs = np.arange(0, max(np.max(a), np.max(b)) + 1, dtype='float')
    nijs[0] = 1  # Stops divide by zero warnings. As it's not used, no issue.
    # term1 is nij / N
    term1 = nijs / n_samples
    # term2 is log((N*nij) / (a * b)) == log(N * nij) - log(a * b)
    log_a = np.log(a)
    log_b = np.log(b)
    # term2 uses log(N * nij) = log(N) + log(nij)
    log_Nnij = np.log(n_samples) + np.log(nijs)
    # term3 is large, and involves many factorials. Calculate these in log
    # space to avoid overflows.
    gln_a = gammaln(a + 1)
    gln_b = gammaln(b + 1)
    gln_Na = gammaln(n_samples - a + 1)
    gln_Nb = gammaln(n_samples - b + 1)
    gln_Nnij = gammaln(nijs + 1) + gammaln(n_samples + 1)

    # emi itself is a summation over the various values.
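    # EMI = sum over i, j, nij of term1 * term2 * term3, i.e.
    # (nij / N) * log(N * nij / (a_i * b_j)) * P(nij | a_i, b_j, N),
    # where P is the hypergeometric probability of the cell count nij.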
    for i in range(n_rows):
        for j in range(n_cols):
            start = max(1, a_view[i] - n_samples + b_view[j])
            end = min(a_view[i], b_view[j]) + 1
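            # nij ranges over the support of the hypergeometric distribution:
            # the cell count must satisfy nij >= a_i + b_j - N and nij <= min(a_i, b_j)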
            for nij in range(start, end):
                term2 = log_Nnij[nij] - log_a[i] - log_b[j]
                # Numerators are positive, denominators are negative.
                gln = (gln_a[i] + gln_b[j] + gln_Na[i] + gln_Nb[j]
                       - gln_Nnij[nij] - lgamma(a_view[i] - nij + 1)
                       - lgamma(b_view[j] - nij + 1)
                       - lgamma(n_samples - a_view[i] - b_view[j] + nij + 1))
                term3 = exp(gln)
                emi += (term1[nij] * term2 * term3)
    return emi
File diff suppressed because it is too large
@@ -0,0 +1,466 @@
"""Unsupervised evaluation metrics."""

# Authors: Robert Layton <robertlayton@gmail.com>
#          Arnaud Fouchet <foucheta@gmail.com>
#          Thierry Guillemot <thierry.guillemot.work@gmail.com>
# License: BSD 3 clause


import functools
from numbers import Integral

import numpy as np
from scipy.sparse import issparse

from ...preprocessing import LabelEncoder
from ...utils import _safe_indexing, check_random_state, check_X_y
from ...utils._array_api import _atol_for_type
from ...utils._param_validation import (
    Interval,
    StrOptions,
    validate_params,
)
from ..pairwise import _VALID_METRICS, pairwise_distances, pairwise_distances_chunked


def check_number_of_labels(n_labels, n_samples):
    """Check that the number of labels is valid.

    Parameters
    ----------
    n_labels : int
        Number of labels.

    n_samples : int
        Number of samples.
    """
    if not 1 < n_labels < n_samples:
        raise ValueError(
            "Number of labels is %d. Valid values are 2 to n_samples - 1 (inclusive)"
            % n_labels
        )


@validate_params(
    {
        "X": ["array-like", "sparse matrix"],
        "labels": ["array-like"],
        "metric": [StrOptions(set(_VALID_METRICS) | {"precomputed"}), callable],
        "sample_size": [Interval(Integral, 1, None, closed="left"), None],
        "random_state": ["random_state"],
    },
    prefer_skip_nested_validation=True,
)
def silhouette_score(
    X, labels, *, metric="euclidean", sample_size=None, random_state=None, **kwds
):
    """Compute the mean Silhouette Coefficient of all samples.

    The Silhouette Coefficient is calculated using the mean intra-cluster
    distance (``a``) and the mean nearest-cluster distance (``b``) for each
    sample. The Silhouette Coefficient for a sample is ``(b - a) / max(a,
    b)``. To clarify, ``b`` is the distance between a sample and the nearest
    cluster that the sample is not a part of.
    Note that the Silhouette Coefficient is only defined if the number of
    labels is ``2 <= n_labels <= n_samples - 1``.

    This function returns the mean Silhouette Coefficient over all samples.
    To obtain the values for each sample, use :func:`silhouette_samples`.

    The best value is 1 and the worst value is -1. Values near 0 indicate
    overlapping clusters. Negative values generally indicate that a sample has
    been assigned to the wrong cluster, as a different cluster is more similar.

    Read more in the :ref:`User Guide <silhouette_coefficient>`.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples_a, n_samples_a) if metric == \
            "precomputed" or (n_samples_a, n_features) otherwise
        An array of pairwise distances between samples, or a feature array.

    labels : array-like of shape (n_samples,)
        Predicted labels for each sample.

    metric : str or callable, default='euclidean'
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string, it must be one of the options
        allowed by :func:`~sklearn.metrics.pairwise_distances`. If ``X`` is
        the distance array itself, use ``metric="precomputed"``.

    sample_size : int, default=None
        The size of the sample to use when computing the Silhouette Coefficient
        on a random subset of the data.
        If ``sample_size is None``, no sampling is used.

    random_state : int, RandomState instance or None, default=None
        Determines random number generation for selecting a subset of samples.
        Used when ``sample_size is not None``.
        Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.

    **kwds : optional keyword parameters
        Any further parameters are passed directly to the distance function.
        If using a scipy.spatial.distance metric, the parameters are still
        metric dependent. See the scipy docs for usage examples.

    Returns
    -------
    silhouette : float
        Mean Silhouette Coefficient for all samples.

    References
    ----------

    .. [1] `Peter J. Rousseeuw (1987). "Silhouettes: a Graphical Aid to the
       Interpretation and Validation of Cluster Analysis". Computational
       and Applied Mathematics 20: 53-65.
       <https://www.sciencedirect.com/science/article/pii/0377042787901257>`_

    .. [2] `Wikipedia entry on the Silhouette Coefficient
       <https://en.wikipedia.org/wiki/Silhouette_(clustering)>`_

    Examples
    --------
    >>> from sklearn.datasets import make_blobs
    >>> from sklearn.cluster import KMeans
    >>> from sklearn.metrics import silhouette_score
    >>> X, y = make_blobs(random_state=42)
    >>> kmeans = KMeans(n_clusters=2, random_state=42)
    >>> silhouette_score(X, kmeans.fit_predict(X))
    np.float64(0.49...)
    """
    if sample_size is not None:
        X, labels = check_X_y(X, labels, accept_sparse=["csc", "csr"])
        random_state = check_random_state(random_state)
        indices = random_state.permutation(X.shape[0])[:sample_size]
        if metric == "precomputed":
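            # subsample rows and columns of the square distance matrix together
            # so the precomputed distances stay aligned with the sampled labels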
            X, labels = X[indices].T[indices].T, labels[indices]
        else:
            X, labels = X[indices], labels[indices]
    return np.mean(silhouette_samples(X, labels, metric=metric, **kwds))


def _silhouette_reduce(D_chunk, start, labels, label_freqs):
    """Accumulate silhouette statistics for a vertical chunk of X.

    Parameters
    ----------
    D_chunk : {array-like, sparse matrix} of shape (n_chunk_samples, n_samples)
        Precomputed distances for a chunk. If a sparse matrix is provided,
        only CSR format is accepted.
    start : int
        First index in the chunk.
    labels : array-like of shape (n_samples,)
        Corresponding cluster labels, encoded as {0, ..., n_clusters-1}.
    label_freqs : array-like
        Distribution of cluster labels in ``labels``.
    """
    n_chunk_samples = D_chunk.shape[0]
    # accumulate distances from each sample to each cluster
    cluster_distances = np.zeros(
        (n_chunk_samples, len(label_freqs)), dtype=D_chunk.dtype
    )

    if issparse(D_chunk):
        if D_chunk.format != "csr":
            raise TypeError(
                "Expected CSR matrix. Please pass sparse matrix in CSR format."
            )
        for i in range(n_chunk_samples):
            indptr = D_chunk.indptr
            indices = D_chunk.indices[indptr[i] : indptr[i + 1]]
            sample_weights = D_chunk.data[indptr[i] : indptr[i + 1]]
            sample_labels = np.take(labels, indices)
            cluster_distances[i] += np.bincount(
                sample_labels, weights=sample_weights, minlength=len(label_freqs)
            )
    else:
        for i in range(n_chunk_samples):
            sample_weights = D_chunk[i]
            sample_labels = labels
            cluster_distances[i] += np.bincount(
                sample_labels, weights=sample_weights, minlength=len(label_freqs)
            )

    # intra_index selects intra-cluster distances within cluster_distances
    end = start + n_chunk_samples
    intra_index = (np.arange(n_chunk_samples), labels[start:end])
    # intra_cluster_distances are averaged over cluster size outside this function
    intra_cluster_distances = cluster_distances[intra_index]
    # of the remaining distances we normalise and extract the minimum
    cluster_distances[intra_index] = np.inf
    cluster_distances /= label_freqs
    inter_cluster_distances = cluster_distances.min(axis=1)
    return intra_cluster_distances, inter_cluster_distances


@validate_params(
    {
        "X": ["array-like", "sparse matrix"],
        "labels": ["array-like"],
        "metric": [StrOptions(set(_VALID_METRICS) | {"precomputed"}), callable],
    },
    prefer_skip_nested_validation=True,
)
def silhouette_samples(X, labels, *, metric="euclidean", **kwds):
    """Compute the Silhouette Coefficient for each sample.

    The Silhouette Coefficient is a measure of how well samples are clustered
    with samples that are similar to themselves. Clustering models with a high
    Silhouette Coefficient are said to be dense, where samples in the same
    cluster are similar to each other, and well separated, where samples in
    different clusters are not very similar to each other.

    The Silhouette Coefficient is calculated using the mean intra-cluster
    distance (``a``) and the mean nearest-cluster distance (``b``) for each
    sample. The Silhouette Coefficient for a sample is ``(b - a) / max(a,
    b)``.
    Note that the Silhouette Coefficient is only defined if the number of
    labels is ``2 <= n_labels <= n_samples - 1``.

    This function returns the Silhouette Coefficient for each sample.

    The best value is 1 and the worst value is -1. Values near 0 indicate
    overlapping clusters.

    Read more in the :ref:`User Guide <silhouette_coefficient>`.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples_a, n_samples_a) if metric == \
            "precomputed" or (n_samples_a, n_features) otherwise
        An array of pairwise distances between samples, or a feature array. If
        a sparse matrix is provided, CSR format should be favoured, as it
        avoids an additional copy.

    labels : array-like of shape (n_samples,)
        Label values for each sample.

    metric : str or callable, default='euclidean'
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string, it must be one of the options
        allowed by :func:`~sklearn.metrics.pairwise_distances`.
        If ``X`` is the distance array itself, use "precomputed" as the metric.
        Precomputed distance matrices must have 0 along the diagonal.

    **kwds : optional keyword parameters
        Any further parameters are passed directly to the distance function.
        If using a ``scipy.spatial.distance`` metric, the parameters are still
        metric dependent. See the scipy docs for usage examples.

    Returns
    -------
    silhouette : array-like of shape (n_samples,)
        Silhouette Coefficients for each sample.

    References
    ----------

    .. [1] `Peter J. Rousseeuw (1987). "Silhouettes: a Graphical Aid to the
       Interpretation and Validation of Cluster Analysis". Computational
       and Applied Mathematics 20: 53-65.
       <https://www.sciencedirect.com/science/article/pii/0377042787901257>`_

    .. [2] `Wikipedia entry on the Silhouette Coefficient
       <https://en.wikipedia.org/wiki/Silhouette_(clustering)>`_

    Examples
    --------
    >>> from sklearn.metrics import silhouette_samples
    >>> from sklearn.datasets import make_blobs
    >>> from sklearn.cluster import KMeans
    >>> X, y = make_blobs(n_samples=50, random_state=42)
    >>> kmeans = KMeans(n_clusters=3, random_state=42)
    >>> labels = kmeans.fit_predict(X)
    >>> silhouette_samples(X, labels)
    array([...])
    """
    X, labels = check_X_y(X, labels, accept_sparse=["csr"])

    # Check for non-zero diagonal entries in precomputed distance matrix
    if metric == "precomputed":
        error_msg = ValueError(
            "The precomputed distance matrix contains non-zero "
            "elements on the diagonal. Use np.fill_diagonal(X, 0)."
        )
        if X.dtype.kind == "f":
            atol = _atol_for_type(X.dtype)
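            # floating point distance matrices can carry rounding noise on the
            # diagonal, so compare against a dtype-dependent tolerance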
            if np.any(np.abs(X.diagonal()) > atol):
                raise error_msg
        elif np.any(X.diagonal() != 0):  # integral dtype
            raise error_msg

    le = LabelEncoder()
    labels = le.fit_transform(labels)
    n_samples = len(labels)
    label_freqs = np.bincount(labels)
    check_number_of_labels(len(le.classes_), n_samples)

    kwds["metric"] = metric
    reduce_func = functools.partial(
        _silhouette_reduce, labels=labels, label_freqs=label_freqs
    )
    results = zip(*pairwise_distances_chunked(X, reduce_func=reduce_func, **kwds))
    intra_clust_dists, inter_clust_dists = results
    intra_clust_dists = np.concatenate(intra_clust_dists)
    inter_clust_dists = np.concatenate(inter_clust_dists)
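    # each sample's mean intra-cluster distance divides by (cluster size - 1),
    # since the zero distance of the sample to itself is excluded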
    denom = (label_freqs - 1).take(labels, mode="clip")
    with np.errstate(divide="ignore", invalid="ignore"):
        intra_clust_dists /= denom

    sil_samples = inter_clust_dists - intra_clust_dists
    with np.errstate(divide="ignore", invalid="ignore"):
        sil_samples /= np.maximum(intra_clust_dists, inter_clust_dists)
    # nan values are for clusters of size 1, and should be 0
    return np.nan_to_num(sil_samples)


@validate_params(
    {
        "X": ["array-like"],
        "labels": ["array-like"],
    },
    prefer_skip_nested_validation=True,
)
def calinski_harabasz_score(X, labels):
    """Compute the Calinski and Harabasz score.

    It is also known as the Variance Ratio Criterion.

    The score is defined as the ratio of the between-cluster dispersion to
    the within-cluster dispersion.

    Read more in the :ref:`User Guide <calinski_harabasz_index>`.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        A list of ``n_features``-dimensional data points. Each row corresponds
        to a single data point.

    labels : array-like of shape (n_samples,)
        Predicted labels for each sample.

    Returns
    -------
    score : float
        The resulting Calinski-Harabasz score.

    References
    ----------
    .. [1] `T. Calinski and J. Harabasz, 1974. "A dendrite method for cluster
       analysis". Communications in Statistics
       <https://www.tandfonline.com/doi/abs/10.1080/03610927408827101>`_

    Examples
    --------
    >>> from sklearn.datasets import make_blobs
    >>> from sklearn.cluster import KMeans
    >>> from sklearn.metrics import calinski_harabasz_score
    >>> X, _ = make_blobs(random_state=0)
    >>> kmeans = KMeans(n_clusters=3, random_state=0).fit(X)
    >>> calinski_harabasz_score(X, kmeans.labels_)
    np.float64(114.8...)
    """
    X, labels = check_X_y(X, labels)
    le = LabelEncoder()
    labels = le.fit_transform(labels)

    n_samples, _ = X.shape
    n_labels = len(le.classes_)

    check_number_of_labels(n_labels, n_samples)

    extra_disp, intra_disp = 0.0, 0.0
    mean = np.mean(X, axis=0)
    for k in range(n_labels):
        cluster_k = X[labels == k]
        mean_k = np.mean(cluster_k, axis=0)
        extra_disp += len(cluster_k) * np.sum((mean_k - mean) ** 2)
        intra_disp += np.sum((cluster_k - mean_k) ** 2)
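    # CH = (between-group dispersion / (k - 1)) / (within-group dispersion / (n - k)),
    # which the return expression below rearranges into a single division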
    return (
        1.0
        if intra_disp == 0.0
        else extra_disp * (n_samples - n_labels) / (intra_disp * (n_labels - 1.0))
    )


@validate_params(
    {
        "X": ["array-like"],
        "labels": ["array-like"],
    },
    prefer_skip_nested_validation=True,
)
def davies_bouldin_score(X, labels):
    """Compute the Davies-Bouldin score.

    The score is defined as the average similarity measure of each cluster with
    its most similar cluster, where similarity is the ratio of within-cluster
    distances to between-cluster distances. Thus, clusters which are farther
    apart and less dispersed will result in a better score.

    The minimum score is zero, with lower values indicating better clustering.

    Read more in the :ref:`User Guide <davies-bouldin_index>`.

    .. versionadded:: 0.20

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        A list of ``n_features``-dimensional data points. Each row corresponds
        to a single data point.

    labels : array-like of shape (n_samples,)
        Predicted labels for each sample.

    Returns
    -------
    score : float
        The resulting Davies-Bouldin score.

    References
    ----------
    .. [1] Davies, David L.; Bouldin, Donald W. (1979).
       `"A Cluster Separation Measure"
       <https://ieeexplore.ieee.org/document/4766909>`__.
       IEEE Transactions on Pattern Analysis and Machine Intelligence.
       PAMI-1 (2): 224-227

    Examples
    --------
    >>> from sklearn.metrics import davies_bouldin_score
    >>> X = [[0, 1], [1, 1], [3, 4]]
    >>> labels = [0, 0, 1]
    >>> davies_bouldin_score(X, labels)
    np.float64(0.12...)
    """
    X, labels = check_X_y(X, labels)
    le = LabelEncoder()
    labels = le.fit_transform(labels)
    n_samples, _ = X.shape
    n_labels = len(le.classes_)
    check_number_of_labels(n_labels, n_samples)

    intra_dists = np.zeros(n_labels)
    centroids = np.zeros((n_labels, len(X[0])), dtype=float)
    for k in range(n_labels):
        cluster_k = _safe_indexing(X, labels == k)
        centroid = cluster_k.mean(axis=0)
        centroids[k] = centroid
        intra_dists[k] = np.average(pairwise_distances(cluster_k, [centroid]))

    centroid_distances = pairwise_distances(centroids)

    if np.allclose(intra_dists, 0) or np.allclose(centroid_distances, 0):
        return 0.0

    centroid_distances[centroid_distances == 0] = np.inf
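    # R_ij = (s_i + s_j) / d_ij: each cluster is scored by its largest ratio of
    # combined intra-cluster spread to centroid separation, then averaged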
    combined_intra_dists = intra_dists[:, None] + intra_dists
    scores = np.max(combined_intra_dists / centroid_distances, axis=1)
    return np.mean(scores)
@@ -0,0 +1,7 @@
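# compiles the Cython expected_mutual_information helper as an extension module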
py.extension_module(
    '_expected_mutual_info_fast',
    '_expected_mutual_info_fast.pyx',
    cython_args: cython_args,
    subdir: 'sklearn/metrics/cluster',
    install: true
)
@@ -0,0 +1,56 @@
"""Testing for bicluster metrics module"""

import numpy as np

from sklearn.metrics import consensus_score
from sklearn.metrics.cluster._bicluster import _jaccard
from sklearn.utils._testing import assert_almost_equal


def test_jaccard():
    a1 = np.array([True, True, False, False])
    a2 = np.array([True, True, True, True])
    a3 = np.array([False, True, True, False])
    a4 = np.array([False, False, True, True])
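
    # e.g. for (a1, a1) vs (a3, a3): intersection = 1 * 1 = 1 and each bicluster
    # has 2 * 2 = 4 cells, so the Jaccard coefficient is 1 / (4 + 4 - 1) = 1 / 7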
    assert _jaccard(a1, a1, a1, a1) == 1
    assert _jaccard(a1, a1, a2, a2) == 0.25
    assert _jaccard(a1, a1, a3, a3) == 1.0 / 7
    assert _jaccard(a1, a1, a4, a4) == 0


def test_consensus_score():
    a = [[True, True, False, False], [False, False, True, True]]
    b = a[::-1]

    assert consensus_score((a, a), (a, a)) == 1
    assert consensus_score((a, a), (b, b)) == 1
    assert consensus_score((a, b), (a, b)) == 1
    assert consensus_score((a, b), (b, a)) == 1

    assert consensus_score((a, a), (b, a)) == 0
    assert consensus_score((a, a), (a, b)) == 0
    assert consensus_score((b, b), (a, b)) == 0
    assert consensus_score((b, b), (b, a)) == 0


def test_consensus_score_issue2445():
    """Different number of biclusters in A and B"""
    a_rows = np.array(
        [
            [True, True, False, False],
            [False, False, True, True],
            [False, False, False, True],
        ]
    )
    a_cols = np.array(
        [
            [True, True, False, False],
            [False, False, True, True],
            [False, False, False, True],
        ]
    )
    idx = [0, 2]
    s = consensus_score((a_rows, a_cols), (a_rows[idx], a_cols[idx]))
    # B contains 2 of the 3 biclusters in A, so score should be 2/3
    assert_almost_equal(s, 2.0 / 3.0)
@@ -0,0 +1,219 @@
from functools import partial
from itertools import chain

import numpy as np
import pytest

from sklearn.metrics.cluster import (
    adjusted_mutual_info_score,
    adjusted_rand_score,
    calinski_harabasz_score,
    completeness_score,
    davies_bouldin_score,
    fowlkes_mallows_score,
    homogeneity_score,
    mutual_info_score,
    normalized_mutual_info_score,
    rand_score,
    silhouette_score,
    v_measure_score,
)
from sklearn.utils._testing import assert_allclose

# Dictionaries of metrics
# ------------------------
# The goal of having those dictionaries is to have an easy way to call a
# particular metric and associate a name to each function:
#   - SUPERVISED_METRICS: all supervised cluster metrics (scored against a
#     ground truth labelling)
#   - UNSUPERVISED_METRICS: all unsupervised cluster metrics
#
# Those dictionaries will be used to test systematically some invariance
# properties, e.g. invariance toward several input layouts.
#

SUPERVISED_METRICS = {
    "adjusted_mutual_info_score": adjusted_mutual_info_score,
    "adjusted_rand_score": adjusted_rand_score,
    "rand_score": rand_score,
    "completeness_score": completeness_score,
    "homogeneity_score": homogeneity_score,
    "mutual_info_score": mutual_info_score,
    "normalized_mutual_info_score": normalized_mutual_info_score,
    "v_measure_score": v_measure_score,
    "fowlkes_mallows_score": fowlkes_mallows_score,
}

UNSUPERVISED_METRICS = {
    "silhouette_score": silhouette_score,
    "silhouette_manhattan": partial(silhouette_score, metric="manhattan"),
    "calinski_harabasz_score": calinski_harabasz_score,
    "davies_bouldin_score": davies_bouldin_score,
}

# Lists of metrics with common properties
# ---------------------------------------
# Lists of metrics with common properties are used to test systematically some
# functionalities and invariance, e.g. SYMMETRIC_METRICS lists all metrics
# that are symmetric with respect to their input argument y_true and y_pred.
#
# --------------------------------------------------------------------
# Symmetric with respect to their input arguments y_true and y_pred.
# Symmetric metrics only apply to supervised clusters.
SYMMETRIC_METRICS = [
    "adjusted_rand_score",
    "rand_score",
    "v_measure_score",
    "mutual_info_score",
    "adjusted_mutual_info_score",
    "normalized_mutual_info_score",
    "fowlkes_mallows_score",
]

NON_SYMMETRIC_METRICS = ["homogeneity_score", "completeness_score"]

# Metrics whose upper bound is 1
NORMALIZED_METRICS = [
    "adjusted_rand_score",
    "rand_score",
    "homogeneity_score",
    "completeness_score",
    "v_measure_score",
    "adjusted_mutual_info_score",
    "fowlkes_mallows_score",
    "normalized_mutual_info_score",
]


rng = np.random.RandomState(0)
y1 = rng.randint(3, size=30)
y2 = rng.randint(3, size=30)


def test_symmetric_non_symmetric_union():
    assert sorted(SYMMETRIC_METRICS + NON_SYMMETRIC_METRICS) == sorted(
        SUPERVISED_METRICS
    )


# 0.22 AMI and NMI changes
@pytest.mark.filterwarnings("ignore::FutureWarning")
@pytest.mark.parametrize(
    "metric_name, y1, y2", [(name, y1, y2) for name in SYMMETRIC_METRICS]
)
def test_symmetry(metric_name, y1, y2):
    metric = SUPERVISED_METRICS[metric_name]
    assert metric(y1, y2) == pytest.approx(metric(y2, y1))


@pytest.mark.parametrize(
    "metric_name, y1, y2", [(name, y1, y2) for name in NON_SYMMETRIC_METRICS]
)
def test_non_symmetry(metric_name, y1, y2):
    metric = SUPERVISED_METRICS[metric_name]
    assert metric(y1, y2) != pytest.approx(metric(y2, y1))


# 0.22 AMI and NMI changes
@pytest.mark.filterwarnings("ignore::FutureWarning")
@pytest.mark.parametrize("metric_name", NORMALIZED_METRICS)
def test_normalized_output(metric_name):
    upper_bound_1 = [0, 0, 0, 1, 1, 1]
    upper_bound_2 = [0, 0, 0, 1, 1, 1]
    metric = SUPERVISED_METRICS[metric_name]
    assert metric([0, 0, 0, 1, 1], [0, 0, 0, 1, 2]) > 0.0
    assert metric([0, 0, 1, 1, 2], [0, 0, 1, 1, 1]) > 0.0
    assert metric([0, 0, 0, 1, 2], [0, 1, 1, 1, 1]) < 1.0
    assert metric([0, 0, 0, 1, 2], [0, 1, 1, 1, 1]) < 1.0
    assert metric(upper_bound_1, upper_bound_2) == pytest.approx(1.0)

    lower_bound_1 = [0, 0, 0, 0, 0, 0]
    lower_bound_2 = [0, 1, 2, 3, 4, 5]
    score = np.array(
        [metric(lower_bound_1, lower_bound_2), metric(lower_bound_2, lower_bound_1)]
    )
    assert not (score < 0).any()


# 0.22 AMI and NMI changes
@pytest.mark.filterwarnings("ignore::FutureWarning")
@pytest.mark.parametrize("metric_name", chain(SUPERVISED_METRICS, UNSUPERVISED_METRICS))
def test_permute_labels(metric_name):
    # Clustering metrics are invariant under permutations of the label values,
    # e.g. when the labels 0 and 1 are exchanged.
    y_label = np.array([0, 0, 0, 1, 1, 0, 1])
    y_pred = np.array([1, 0, 1, 0, 1, 1, 0])
    if metric_name in SUPERVISED_METRICS:
        metric = SUPERVISED_METRICS[metric_name]
        score_1 = metric(y_pred, y_label)
        assert_allclose(score_1, metric(1 - y_pred, y_label))
        assert_allclose(score_1, metric(1 - y_pred, 1 - y_label))
        assert_allclose(score_1, metric(y_pred, 1 - y_label))
    else:
        metric = UNSUPERVISED_METRICS[metric_name]
        X = np.random.randint(10, size=(7, 10))
        score_1 = metric(X, y_pred)
        assert_allclose(score_1, metric(X, 1 - y_pred))


# 0.22 AMI and NMI changes
@pytest.mark.filterwarnings("ignore::FutureWarning")
@pytest.mark.parametrize("metric_name", chain(SUPERVISED_METRICS, UNSUPERVISED_METRICS))
# For all clustering metrics, the input parameters can be arrays or lists and
# may contain negative, strictly positive, or string labels
def test_format_invariance(metric_name):
    y_true = [0, 0, 0, 0, 1, 1, 1, 1]
    y_pred = [0, 1, 2, 3, 4, 5, 6, 7]

    def generate_formats(y):
        y = np.array(y)
        yield y, "array of ints"
        yield y.tolist(), "list of ints"
        yield [str(x) + "-a" for x in y.tolist()], "list of strs"
        yield (
            np.array([str(x) + "-a" for x in y.tolist()], dtype=object),
            "array of strs",
        )
        yield y - 1, "including negative ints"
        yield y + 1, "strictly positive ints"

    if metric_name in SUPERVISED_METRICS:
        metric = SUPERVISED_METRICS[metric_name]
        score_1 = metric(y_true, y_pred)
        y_true_gen = generate_formats(y_true)
        y_pred_gen = generate_formats(y_pred)
        for (y_true_fmt, fmt_name), (y_pred_fmt, _) in zip(y_true_gen, y_pred_gen):
            assert score_1 == metric(y_true_fmt, y_pred_fmt)
    else:
        metric = UNSUPERVISED_METRICS[metric_name]
        X = np.random.randint(10, size=(8, 10))
        score_1 = metric(X, y_true)
        assert score_1 == metric(X.astype(float), y_true)
        y_true_gen = generate_formats(y_true)
        for y_true_fmt, fmt_name in y_true_gen:
            assert score_1 == metric(X, y_true_fmt)


@pytest.mark.parametrize("metric", SUPERVISED_METRICS.values())
def test_single_sample(metric):
    # only the supervised metrics support single sample
    for i, j in [(0, 0), (0, 1), (1, 0), (1, 1)]:
        metric([i], [j])


@pytest.mark.parametrize(
    "metric_name, metric_func", dict(SUPERVISED_METRICS, **UNSUPERVISED_METRICS).items()
)
def test_inf_nan_input(metric_name, metric_func):
    if metric_name in SUPERVISED_METRICS:
        invalids = [
            ([0, 1], [np.inf, np.inf]),
            ([0, 1], [np.nan, np.nan]),
            ([0, 1], [np.nan, np.inf]),
        ]
    else:
        X = np.random.randint(10, size=(2, 10))
        invalids = [(X, [np.inf, np.inf]), (X, [np.nan, np.nan]), (X, [np.nan, np.inf])]
    # check each invalid input separately; a single `raises` block would stop
    # at the first exception and leave the remaining cases unexercised
    for args in invalids:
        with pytest.raises(ValueError, match=r"contains (NaN|infinity)"):
            metric_func(*args)
@@ -0,0 +1,482 @@
import warnings

import numpy as np
import pytest
from numpy.testing import assert_allclose, assert_array_almost_equal, assert_array_equal

from sklearn.metrics.cluster import (
    adjusted_mutual_info_score,
    adjusted_rand_score,
    completeness_score,
    contingency_matrix,
    entropy,
    expected_mutual_information,
    fowlkes_mallows_score,
    homogeneity_completeness_v_measure,
    homogeneity_score,
    mutual_info_score,
    normalized_mutual_info_score,
    pair_confusion_matrix,
    rand_score,
    v_measure_score,
)
from sklearn.metrics.cluster._supervised import _generalized_average, check_clusterings
from sklearn.utils import assert_all_finite
from sklearn.utils._testing import assert_almost_equal

score_funcs = [
    adjusted_rand_score,
    rand_score,
    homogeneity_score,
    completeness_score,
    v_measure_score,
    adjusted_mutual_info_score,
    normalized_mutual_info_score,
]


def test_error_messages_on_wrong_input():
    for score_func in score_funcs:
        expected = (
            r"Found input variables with inconsistent numbers of samples: \[2, 3\]"
        )
        with pytest.raises(ValueError, match=expected):
            score_func([0, 1], [1, 1, 1])

        expected = r"labels_true must be 1D: shape is \(2"
        with pytest.raises(ValueError, match=expected):
            score_func([[0, 1], [1, 0]], [1, 1, 1])

        expected = r"labels_pred must be 1D: shape is \(2"
        with pytest.raises(ValueError, match=expected):
            score_func([0, 1, 0], [[1, 1], [0, 0]])


def test_generalized_average():
    a, b = 1, 2
    methods = ["min", "geometric", "arithmetic", "max"]
    means = [_generalized_average(a, b, method) for method in methods]
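    # for distinct positive values, min <= geometric mean <= arithmetic mean <= max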
    assert means[0] <= means[1] <= means[2] <= means[3]
    c, d = 12, 12
    means = [_generalized_average(c, d, method) for method in methods]
    assert means[0] == means[1] == means[2] == means[3]


def test_perfect_matches():
    for score_func in score_funcs:
        assert score_func([], []) == pytest.approx(1.0)
        assert score_func([0], [1]) == pytest.approx(1.0)
        assert score_func([0, 0, 0], [0, 0, 0]) == pytest.approx(1.0)
        assert score_func([0, 1, 0], [42, 7, 42]) == pytest.approx(1.0)
        assert score_func([0.0, 1.0, 0.0], [42.0, 7.0, 42.0]) == pytest.approx(1.0)
        assert score_func([0.0, 1.0, 2.0], [42.0, 7.0, 2.0]) == pytest.approx(1.0)
        assert score_func([0, 1, 2], [42, 7, 2]) == pytest.approx(1.0)
    score_funcs_with_changing_means = [
        normalized_mutual_info_score,
        adjusted_mutual_info_score,
    ]
    means = {"min", "geometric", "arithmetic", "max"}
    for score_func in score_funcs_with_changing_means:
        for mean in means:
            assert score_func([], [], average_method=mean) == pytest.approx(1.0)
            assert score_func([0], [1], average_method=mean) == pytest.approx(1.0)
            assert score_func(
                [0, 0, 0], [0, 0, 0], average_method=mean
            ) == pytest.approx(1.0)
            assert score_func(
                [0, 1, 0], [42, 7, 42], average_method=mean
            ) == pytest.approx(1.0)
            assert score_func(
                [0.0, 1.0, 0.0], [42.0, 7.0, 42.0], average_method=mean
            ) == pytest.approx(1.0)
            assert score_func(
                [0.0, 1.0, 2.0], [42.0, 7.0, 2.0], average_method=mean
            ) == pytest.approx(1.0)
            assert score_func(
                [0, 1, 2], [42, 7, 2], average_method=mean
            ) == pytest.approx(1.0)


def test_homogeneous_but_not_complete_labeling():
    # homogeneous but not complete clustering
    h, c, v = homogeneity_completeness_v_measure([0, 0, 0, 1, 1, 1], [0, 0, 0, 1, 2, 2])
    assert_almost_equal(h, 1.00, 2)
    assert_almost_equal(c, 0.69, 2)
    assert_almost_equal(v, 0.81, 2)


def test_complete_but_not_homogeneous_labeling():
    # complete but not homogeneous clustering
    h, c, v = homogeneity_completeness_v_measure([0, 0, 1, 1, 2, 2], [0, 0, 1, 1, 1, 1])
    assert_almost_equal(h, 0.58, 2)
    assert_almost_equal(c, 1.00, 2)
    assert_almost_equal(v, 0.73, 2)


def test_not_complete_and_not_homogeneous_labeling():
    # neither complete nor homogeneous but not so bad either
    h, c, v = homogeneity_completeness_v_measure([0, 0, 0, 1, 1, 1], [0, 1, 0, 1, 2, 2])
    assert_almost_equal(h, 0.67, 2)
    assert_almost_equal(c, 0.42, 2)
    assert_almost_equal(v, 0.52, 2)


def test_beta_parameter():
    # test for when beta is passed to
    # homogeneity_completeness_v_measure
    # and v_measure_score
    beta_test = 0.2
    h_test = 0.67
    c_test = 0.42
    v_test = (1 + beta_test) * h_test * c_test / (beta_test * h_test + c_test)

    h, c, v = homogeneity_completeness_v_measure(
        [0, 0, 0, 1, 1, 1], [0, 1, 0, 1, 2, 2], beta=beta_test
    )
    assert_almost_equal(h, h_test, 2)
    assert_almost_equal(c, c_test, 2)
    assert_almost_equal(v, v_test, 2)

    v = v_measure_score([0, 0, 0, 1, 1, 1], [0, 1, 0, 1, 2, 2], beta=beta_test)
    assert_almost_equal(v, v_test, 2)


def test_non_consecutive_labels():
    # regression tests for labels with gaps
    h, c, v = homogeneity_completeness_v_measure([0, 0, 0, 2, 2, 2], [0, 1, 0, 1, 2, 2])
    assert_almost_equal(h, 0.67, 2)
    assert_almost_equal(c, 0.42, 2)
    assert_almost_equal(v, 0.52, 2)

    h, c, v = homogeneity_completeness_v_measure([0, 0, 0, 1, 1, 1], [0, 4, 0, 4, 2, 2])
    assert_almost_equal(h, 0.67, 2)
    assert_almost_equal(c, 0.42, 2)
    assert_almost_equal(v, 0.52, 2)

    ari_1 = adjusted_rand_score([0, 0, 0, 1, 1, 1], [0, 1, 0, 1, 2, 2])
    ari_2 = adjusted_rand_score([0, 0, 0, 1, 1, 1], [0, 4, 0, 4, 2, 2])
    assert_almost_equal(ari_1, 0.24, 2)
    assert_almost_equal(ari_2, 0.24, 2)

    ri_1 = rand_score([0, 0, 0, 1, 1, 1], [0, 1, 0, 1, 2, 2])
    ri_2 = rand_score([0, 0, 0, 1, 1, 1], [0, 4, 0, 4, 2, 2])
    assert_almost_equal(ri_1, 0.66, 2)
    assert_almost_equal(ri_2, 0.66, 2)


def uniform_labelings_scores(score_func, n_samples, k_range, n_runs=10, seed=42):
    # Compute score for random uniform cluster labelings
    random_labels = np.random.RandomState(seed).randint
    scores = np.zeros((len(k_range), n_runs))
    for i, k in enumerate(k_range):
        for j in range(n_runs):
            labels_a = random_labels(low=0, high=k, size=n_samples)
            labels_b = random_labels(low=0, high=k, size=n_samples)
            scores[i, j] = score_func(labels_a, labels_b)
    return scores


def test_adjustment_for_chance():
    # Check that adjusted scores are almost zero on random labels
    n_clusters_range = [2, 10, 50, 90]
    n_samples = 100
    n_runs = 10

    scores = uniform_labelings_scores(
        adjusted_rand_score, n_samples, n_clusters_range, n_runs
    )

    max_abs_scores = np.abs(scores).max(axis=1)
    assert_array_almost_equal(max_abs_scores, [0.02, 0.03, 0.03, 0.02], 2)


def test_adjusted_mutual_info_score():
    # Compute the Adjusted Mutual Information and test against known values
    labels_a = np.array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3])
    labels_b = np.array([1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 3, 1, 3, 3, 3, 2, 2])
    # Mutual information
    mi = mutual_info_score(labels_a, labels_b)
    assert_almost_equal(mi, 0.41022, 5)
    # with provided sparse contingency
    C = contingency_matrix(labels_a, labels_b, sparse=True)
    mi = mutual_info_score(labels_a, labels_b, contingency=C)
    assert_almost_equal(mi, 0.41022, 5)
    # with provided dense contingency
    C = contingency_matrix(labels_a, labels_b)
    mi = mutual_info_score(labels_a, labels_b, contingency=C)
    assert_almost_equal(mi, 0.41022, 5)
    # Expected mutual information
    n_samples = C.sum()
    emi = expected_mutual_information(C, n_samples)
    assert_almost_equal(emi, 0.15042, 5)
    # Adjusted mutual information
    ami = adjusted_mutual_info_score(labels_a, labels_b)
    assert_almost_equal(ami, 0.27821, 5)
    ami = adjusted_mutual_info_score([1, 1, 2, 2], [2, 2, 3, 3])
    assert ami == pytest.approx(1.0)
    # Test with a very large array
    a110 = np.array([list(labels_a) * 110]).flatten()
    b110 = np.array([list(labels_b) * 110]).flatten()
    ami = adjusted_mutual_info_score(a110, b110)
    assert_almost_equal(ami, 0.38, 2)


def test_expected_mutual_info_overflow():
    # Test for regression where a contingency cell exceeds 2**16,
    # leading to overflow in np.outer, resulting in EMI > 1
    assert expected_mutual_information(np.array([[70000]]), 70000) <= 1


def test_int_overflow_mutual_info_fowlkes_mallows_score():
    # Test overflow in mutual_info_score and fowlkes_mallows_score
    x = np.array(
        [1] * (52632 + 2529)
        + [2] * (14660 + 793)
        + [3] * (3271 + 204)
        + [4] * (814 + 39)
        + [5] * (316 + 20)
    )
    y = np.array(
        [0] * 52632
        + [1] * 2529
        + [0] * 14660
        + [1] * 793
        + [0] * 3271
        + [1] * 204
        + [0] * 814
        + [1] * 39
        + [0] * 316
        + [1] * 20
    )

    assert_all_finite(mutual_info_score(x, y))
    assert_all_finite(fowlkes_mallows_score(x, y))


def test_entropy():
    ent = entropy([0, 0, 42.0])
    assert_almost_equal(ent, 0.6365141, 5)
    assert_almost_equal(entropy([]), 1)
    assert entropy([1, 1, 1, 1]) == 0


def test_contingency_matrix():
    labels_a = np.array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3])
    labels_b = np.array([1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 3, 1, 3, 3, 3, 2, 2])
    C = contingency_matrix(labels_a, labels_b)
    C2 = np.histogram2d(labels_a, labels_b, bins=(np.arange(1, 5), np.arange(1, 5)))[0]
    assert_array_almost_equal(C, C2)
    C = contingency_matrix(labels_a, labels_b, eps=0.1)
    assert_array_almost_equal(C, C2 + 0.1)


def test_contingency_matrix_sparse():
    labels_a = np.array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3])
    labels_b = np.array([1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 3, 1, 3, 3, 3, 2, 2])
    C = contingency_matrix(labels_a, labels_b)
    C_sparse = contingency_matrix(labels_a, labels_b, sparse=True).toarray()
    assert_array_almost_equal(C, C_sparse)
    with pytest.raises(ValueError, match="Cannot set 'eps' when sparse=True"):
        contingency_matrix(labels_a, labels_b, eps=1e-10, sparse=True)


def test_exactly_zero_info_score():
    # Check numerical stability when information is exactly zero
    for i in np.logspace(1, 4, 4).astype(int):
        labels_a, labels_b = (np.ones(i, dtype=int), np.arange(i, dtype=int))
        assert normalized_mutual_info_score(labels_a, labels_b) == pytest.approx(0.0)
        assert v_measure_score(labels_a, labels_b) == pytest.approx(0.0)
        assert adjusted_mutual_info_score(labels_a, labels_b) == pytest.approx(0.0)
        assert normalized_mutual_info_score(labels_a, labels_b) == pytest.approx(0.0)
        for method in ["min", "geometric", "arithmetic", "max"]:
            assert adjusted_mutual_info_score(
                labels_a, labels_b, average_method=method
            ) == pytest.approx(0.0)
            assert normalized_mutual_info_score(
                labels_a, labels_b, average_method=method
            ) == pytest.approx(0.0)


def test_v_measure_and_mutual_information(seed=36):
    # Check relation between v_measure, entropy and mutual information
    for i in np.logspace(1, 4, 4).astype(int):
        random_state = np.random.RandomState(seed)
        labels_a, labels_b = (
            random_state.randint(0, 10, i),
            random_state.randint(0, 10, i),
        )
        assert_almost_equal(
            v_measure_score(labels_a, labels_b),
            2.0
            * mutual_info_score(labels_a, labels_b)
            / (entropy(labels_a) + entropy(labels_b)),
            0,
        )
        avg = "arithmetic"
        assert_almost_equal(
            v_measure_score(labels_a, labels_b),
            normalized_mutual_info_score(labels_a, labels_b, average_method=avg),
        )


def test_fowlkes_mallows_score():
    # General case
    score = fowlkes_mallows_score([0, 0, 0, 1, 1, 1], [0, 0, 1, 1, 2, 2])
    assert_almost_equal(score, 4.0 / np.sqrt(12.0 * 6.0))

    # Perfect match but where the label names changed
    perfect_score = fowlkes_mallows_score([0, 0, 0, 1, 1, 1], [1, 1, 1, 0, 0, 0])
    assert_almost_equal(perfect_score, 1.0)

    # Worst case
    worst_score = fowlkes_mallows_score([0, 0, 0, 0, 0, 0], [0, 1, 2, 3, 4, 5])
    assert_almost_equal(worst_score, 0.0)


def test_fowlkes_mallows_score_properties():
    # handcrafted example
    labels_a = np.array([0, 0, 0, 1, 1, 2])
    labels_b = np.array([1, 1, 2, 2, 0, 0])
    expected = 1.0 / np.sqrt((1.0 + 3.0) * (1.0 + 2.0))
    # FMI = TP / sqrt((TP + FP) * (TP + FN))

    score_original = fowlkes_mallows_score(labels_a, labels_b)
    assert_almost_equal(score_original, expected)

    # symmetric property
    score_symmetric = fowlkes_mallows_score(labels_b, labels_a)
    assert_almost_equal(score_symmetric, expected)

    # permutation property
    score_permuted = fowlkes_mallows_score((labels_a + 1) % 3, labels_b)
    assert_almost_equal(score_permuted, expected)

    # symmetric and permutation (both together)
    score_both = fowlkes_mallows_score(labels_b, (labels_a + 2) % 3)
    assert_almost_equal(score_both, expected)


@pytest.mark.parametrize(
    "labels_true, labels_pred",
    [
        (["a"] * 6, [1, 1, 0, 0, 1, 1]),
        ([1] * 6, [1, 1, 0, 0, 1, 1]),
        ([1, 1, 0, 0, 1, 1], ["a"] * 6),
        ([1, 1, 0, 0, 1, 1], [1] * 6),
        (["a"] * 6, ["a"] * 6),
    ],
)
def test_mutual_info_score_positive_constant_label(labels_true, labels_pred):
    # Check that MI = 0 when one or both labellings are constant
    # non-regression test for #16355
    assert mutual_info_score(labels_true, labels_pred) == 0


def test_check_clustering_error():
    # Test warning message for continuous values
    rng = np.random.RandomState(42)
    noise = rng.rand(500)
    wavelength = np.linspace(0.01, 1, 500) * 1e-6
    msg = (
        "Clustering metrics expects discrete values but received "
        "continuous values for label, and continuous values for "
        "target"
    )

    with pytest.warns(UserWarning, match=msg):
        check_clusterings(wavelength, noise)


def test_pair_confusion_matrix_fully_dispersed():
    # edge case: every element is its own cluster
    N = 100
    clustering1 = list(range(N))
    clustering2 = clustering1
    expected = np.array([[N * (N - 1), 0], [0, 0]])
    assert_array_equal(pair_confusion_matrix(clustering1, clustering2), expected)


def test_pair_confusion_matrix_single_cluster():
    # edge case: only one cluster
    N = 100
    clustering1 = np.zeros((N,))
    clustering2 = clustering1
    expected = np.array([[0, 0], [0, N * (N - 1)]])
    assert_array_equal(pair_confusion_matrix(clustering1, clustering2), expected)


def test_pair_confusion_matrix():
    # regular case: different non-trivial clusterings
    n = 10
    N = n**2
    clustering1 = np.hstack([[i + 1] * n for i in range(n)])
    clustering2 = np.hstack([[i + 1] * (n + 1) for i in range(n)])[:N]
    # basic quadratic implementation
    expected = np.zeros(shape=(2, 2), dtype=np.int64)
    for i in range(len(clustering1)):
        for j in range(len(clustering2)):
            if i != j:
                same_cluster_1 = int(clustering1[i] == clustering1[j])
                same_cluster_2 = int(clustering2[i] == clustering2[j])
                expected[same_cluster_1, same_cluster_2] += 1
    assert_array_equal(pair_confusion_matrix(clustering1, clustering2), expected)


@pytest.mark.parametrize(
    "clustering1, clustering2",
    [(list(range(100)), list(range(100))), (np.zeros((100,)), np.zeros((100,)))],
)
def test_rand_score_edge_cases(clustering1, clustering2):
    # edge case 1: every element is its own cluster
    # edge case 2: only one cluster
    assert_allclose(rand_score(clustering1, clustering2), 1.0)


def test_rand_score():
    # regular case: different non-trivial clusterings
    clustering1 = [0, 0, 0, 1, 1, 1]
    clustering2 = [0, 1, 0, 1, 2, 2]
    # pair confusion matrix
    D11 = 2 * 2  # ordered pairs (1, 3), (5, 6)
    D10 = 2 * 4  # ordered pairs (1, 2), (2, 3), (4, 5), (4, 6)
    D01 = 2 * 1  # ordered pair (2, 4)
    D00 = 5 * 6 - D11 - D01 - D10  # the remaining pairs
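    # with 6 samples there are 6 * 5 = 30 ordered pairs in total, so D00 is the remainder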
    # rand score
    expected_numerator = D00 + D11
    expected_denominator = D00 + D01 + D10 + D11
    expected = expected_numerator / expected_denominator
    assert_allclose(rand_score(clustering1, clustering2), expected)


def test_adjusted_rand_score_overflow():
    """Check that a large amount of data will not lead to overflow in
    `adjusted_rand_score`.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/20305
    """
    rng = np.random.RandomState(0)
    y_true = rng.randint(0, 2, 100_000, dtype=np.int8)
    y_pred = rng.randint(0, 2, 100_000, dtype=np.int8)
    with warnings.catch_warnings():
        warnings.simplefilter("error", RuntimeWarning)
        adjusted_rand_score(y_true, y_pred)


@pytest.mark.parametrize("average_method", ["min", "arithmetic", "geometric", "max"])
def test_normalized_mutual_info_score_bounded(average_method):
    """Check that NMI returns a score between 0 (included) and 1 (excluded
    for a non-perfect match).

    Non-regression test for issue #13836
    """
    labels1 = [0] * 469
    labels2 = [1] + labels1[1:]
    labels3 = [0, 1] + labels1[2:]

    # labels1 is constant. The mutual info between labels1 and any other labelling is 0.
    nmi = normalized_mutual_info_score(labels1, labels2, average_method=average_method)
    assert nmi == 0

    # non constant, non perfect matching labels
    nmi = normalized_mutual_info_score(labels2, labels3, average_method=average_method)
    assert 0 <= nmi < 1
@ -0,0 +1,413 @@
import warnings

import numpy as np
import pytest
from numpy.testing import assert_allclose
from scipy.sparse import issparse

from sklearn import datasets
from sklearn.metrics import pairwise_distances
from sklearn.metrics.cluster import (
    calinski_harabasz_score,
    davies_bouldin_score,
    silhouette_samples,
    silhouette_score,
)
from sklearn.metrics.cluster._unsupervised import _silhouette_reduce
from sklearn.utils._testing import assert_array_equal
from sklearn.utils.fixes import (
    CSC_CONTAINERS,
    CSR_CONTAINERS,
    DOK_CONTAINERS,
    LIL_CONTAINERS,
)


@pytest.mark.parametrize(
    "sparse_container",
    [None] + CSR_CONTAINERS + CSC_CONTAINERS + DOK_CONTAINERS + LIL_CONTAINERS,
)
@pytest.mark.parametrize("sample_size", [None, "half"])
def test_silhouette(sparse_container, sample_size):
    # Tests the Silhouette Coefficient.
    dataset = datasets.load_iris()
    X, y = dataset.data, dataset.target
    if sparse_container is not None:
        X = sparse_container(X)
    sample_size = int(X.shape[0] / 2) if sample_size == "half" else sample_size

    D = pairwise_distances(X, metric="euclidean")
    # Given that the actual labels are used, we can assume that S would be positive.
    score_precomputed = silhouette_score(
        D, y, metric="precomputed", sample_size=sample_size, random_state=0
    )
    score_euclidean = silhouette_score(
        X, y, metric="euclidean", sample_size=sample_size, random_state=0
    )
    assert score_precomputed > 0
    assert score_euclidean > 0
    assert score_precomputed == pytest.approx(score_euclidean)
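

# Illustrative sketch (not part of the original suite) of the definition
# the assertions above rely on: per sample, s = (b - a) / max(a, b), where
# a is the mean intra-cluster distance and b the mean distance to the
# nearest other cluster; uses only this module's imports.
def _sketch_silhouette_by_hand():
    X = np.array([[0.0], [1.0], [9.0], [10.0]])
    labels = np.array([0, 0, 1, 1])
    D = pairwise_distances(X)
    expected = []
    for i in range(len(X)):
        same = labels == labels[i]
        # D[i, i] == 0, so summing over the own cluster and dividing by
        # (cluster size - 1) gives the mean distance to the other members
        a = D[i, same].sum() / (same.sum() - 1)
        b = D[i, ~same].mean()
        expected.append((b - a) / max(a, b))
    assert_allclose(silhouette_samples(X, labels), expected)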


def test_cluster_size_1():
    # Assert Silhouette Coefficient == 0 when there is 1 sample in a cluster
    # (cluster 0). We also test the case where there are identical samples
    # as the only members of a cluster (cluster 2). To our knowledge, this case
    # is not discussed in reference material, and we choose for it a sample
    # score of 1.
    X = [[0.0], [1.0], [1.0], [2.0], [3.0], [3.0]]
    labels = np.array([0, 1, 1, 1, 2, 2])

    # Cluster 0: 1 sample -> score of 0 by Rousseeuw's convention
    # Cluster 1: intra-cluster = [.5, .5, 1]
    #            inter-cluster = [1, 1, 1]
    #            silhouette    = [.5, .5, 0]
    # Cluster 2: intra-cluster = [0, 0]
    #            inter-cluster = [arbitrary, arbitrary]
    #            silhouette    = [1., 1.]

    silhouette = silhouette_score(X, labels)
    assert not np.isnan(silhouette)
    ss = silhouette_samples(X, labels)
    assert_array_equal(ss, [0, 0.5, 0.5, 0, 1, 1])


def test_silhouette_paper_example():
    # Explicitly check per-sample results against Rousseeuw (1987)
    # Data from Table 1: lower triangle of the distance matrix, one matrix
    # row per line.
    # fmt: off
    lower = [
        5.58,
        7.00, 6.50,
        7.08, 7.00, 3.83,
        4.83, 5.08, 8.17, 5.83,
        2.17, 5.75, 6.67, 6.92, 4.92,
        6.42, 5.00, 5.58, 6.00, 4.67, 6.42,
        3.42, 5.50, 6.42, 6.42, 5.00, 3.92, 6.17,
        2.50, 4.92, 6.25, 7.33, 4.50, 2.25, 6.33, 2.75,
        6.08, 6.67, 4.25, 2.67, 6.00, 6.17, 6.17, 6.92, 6.17,
        5.25, 6.83, 4.50, 3.75, 5.75, 5.42, 6.08, 5.83, 6.67, 3.67,
        4.75, 3.00, 6.08, 6.67, 5.00, 5.58, 4.83, 6.17, 5.67, 6.50, 6.92,
    ]
    names = [
        "BEL", "BRA", "CHI", "CUB", "EGY", "FRA",
        "IND", "ISR", "USA", "USS", "YUG", "ZAI",
    ]
    # fmt: on
    D = np.zeros((12, 12))
    D[np.tril_indices(12, -1)] = lower
    D += D.T

    # Data from Figure 2
    labels1 = [1, 1, 2, 2, 1, 1, 2, 1, 1, 2, 2, 1]
    expected1 = {
        "USA": 0.43,
        "BEL": 0.39,
        "FRA": 0.35,
        "ISR": 0.30,
        "BRA": 0.22,
        "EGY": 0.20,
        "ZAI": 0.19,
        "CUB": 0.40,
        "USS": 0.34,
        "CHI": 0.33,
        "YUG": 0.26,
        "IND": -0.04,
    }
    score1 = 0.28

    # Data from Figure 3
    labels2 = [1, 2, 3, 3, 1, 1, 2, 1, 1, 3, 3, 2]
    expected2 = {
        "USA": 0.47,
        "FRA": 0.44,
        "BEL": 0.42,
        "ISR": 0.37,
        "EGY": 0.02,
        "ZAI": 0.28,
        "BRA": 0.25,
        "IND": 0.17,
        "CUB": 0.48,
        "USS": 0.44,
        "YUG": 0.31,
        "CHI": 0.31,
    }
    score2 = 0.33

    for labels, expected, score in [
        (labels1, expected1, score1),
        (labels2, expected2, score2),
    ]:
        expected = [expected[name] for name in names]
        # we check to 2dp because that's what's in the paper; note that
        # `pytest.approx` must sit on one side of a comparison to actually
        # assert anything
        assert silhouette_samples(
            D, np.array(labels), metric="precomputed"
        ) == pytest.approx(expected, abs=1e-2)
        assert silhouette_score(D, np.array(labels), metric="precomputed") == (
            pytest.approx(score, abs=1e-2)
        )


def test_correct_labelsize():
    # Assert 1 < n_labels < n_samples
    dataset = datasets.load_iris()
    X = dataset.data

    # n_labels = n_samples
    y = np.arange(X.shape[0])
    err_msg = (
        r"Number of labels is %d\. Valid values are 2 "
        r"to n_samples - 1 \(inclusive\)" % len(np.unique(y))
    )
    with pytest.raises(ValueError, match=err_msg):
        silhouette_score(X, y)

    # n_labels = 1
    y = np.zeros(X.shape[0])
    err_msg = (
        r"Number of labels is %d\. Valid values are 2 "
        r"to n_samples - 1 \(inclusive\)" % len(np.unique(y))
    )
    with pytest.raises(ValueError, match=err_msg):
        silhouette_score(X, y)


def test_non_encoded_labels():
    dataset = datasets.load_iris()
    X = dataset.data
    labels = dataset.target
    assert silhouette_score(X, labels * 2 + 10) == silhouette_score(X, labels)
    assert_array_equal(
        silhouette_samples(X, labels * 2 + 10), silhouette_samples(X, labels)
    )


def test_non_numpy_labels():
    dataset = datasets.load_iris()
    X = dataset.data
    y = dataset.target
    assert silhouette_score(list(X), list(y)) == silhouette_score(X, y)


@pytest.mark.parametrize("dtype", (np.float32, np.float64))
def test_silhouette_nonzero_diag(dtype):
    # Make sure silhouette_samples requires the diagonal to be zero.
    # Non-regression test for #12178

    # Construct a zero-diagonal distance matrix
    dists = pairwise_distances(
        np.array([[0.2, 0.1, 0.12, 1.34, 1.11, 1.6]], dtype=dtype).T
    )
    labels = [0, 0, 0, 1, 1, 1]

    # small values on the diagonal are OK
    dists[2][2] = np.finfo(dists.dtype).eps * 10
    silhouette_samples(dists, labels, metric="precomputed")

    # values bigger than eps * 100 are not
    dists[2][2] = np.finfo(dists.dtype).eps * 1000
    with pytest.raises(ValueError, match="contains non-zero"):
        silhouette_samples(dists, labels, metric="precomputed")


@pytest.mark.parametrize(
    "sparse_container",
    CSC_CONTAINERS + CSR_CONTAINERS + DOK_CONTAINERS + LIL_CONTAINERS,
)
def test_silhouette_samples_precomputed_sparse(sparse_container):
    """Check that silhouette_samples works correctly for sparse precomputed
    distance matrices."""
    X = np.array([[0.2, 0.1, 0.1, 0.2, 0.1, 1.6, 0.2, 0.1]], dtype=np.float32).T
    y = [0, 0, 0, 0, 1, 1, 1, 1]
    pdist_dense = pairwise_distances(X)
    pdist_sparse = sparse_container(pdist_dense)
    assert issparse(pdist_sparse)
    output_with_sparse_input = silhouette_samples(pdist_sparse, y, metric="precomputed")
    output_with_dense_input = silhouette_samples(pdist_dense, y, metric="precomputed")
    assert_allclose(output_with_sparse_input, output_with_dense_input)


@pytest.mark.parametrize(
    "sparse_container",
    CSC_CONTAINERS + CSR_CONTAINERS + DOK_CONTAINERS + LIL_CONTAINERS,
)
def test_silhouette_samples_euclidean_sparse(sparse_container):
    """Check that silhouette_samples works correctly for sparse inputs with
    the default euclidean metric."""
    X = np.array([[0.2, 0.1, 0.1, 0.2, 0.1, 1.6, 0.2, 0.1]], dtype=np.float32).T
    y = [0, 0, 0, 0, 1, 1, 1, 1]
    pdist_dense = pairwise_distances(X)
    pdist_sparse = sparse_container(pdist_dense)
    assert issparse(pdist_sparse)
    output_with_sparse_input = silhouette_samples(pdist_sparse, y)
    output_with_dense_input = silhouette_samples(pdist_dense, y)
    assert_allclose(output_with_sparse_input, output_with_dense_input)


@pytest.mark.parametrize(
    "sparse_container", CSC_CONTAINERS + DOK_CONTAINERS + LIL_CONTAINERS
)
def test_silhouette_reduce(sparse_container):
    """Check the error raised for non-CSR input to the private helper
    `_silhouette_reduce`."""
    X = np.array([[0.2, 0.1, 0.1, 0.2, 0.1, 1.6, 0.2, 0.1]], dtype=np.float32).T
    pdist_dense = pairwise_distances(X)
    pdist_sparse = sparse_container(pdist_dense)
    y = [0, 0, 0, 0, 1, 1, 1, 1]
    label_freqs = np.bincount(y)
    with pytest.raises(
        TypeError,
        match="Expected CSR matrix. Please pass sparse matrix in CSR format.",
    ):
        _silhouette_reduce(pdist_sparse, start=0, labels=y, label_freqs=label_freqs)


def assert_raises_on_only_one_label(func):
    """Assert the error message when there is only one label."""
    rng = np.random.RandomState(seed=0)
    with pytest.raises(ValueError, match="Number of labels is"):
        func(rng.rand(10, 2), np.zeros(10))


def assert_raises_on_all_points_same_cluster(func):
    """Assert the error message when every point is its own cluster."""
    rng = np.random.RandomState(seed=0)
    with pytest.raises(ValueError, match="Number of labels is"):
        func(rng.rand(10, 2), np.arange(10))


def test_calinski_harabasz_score():
    assert_raises_on_only_one_label(calinski_harabasz_score)

    assert_raises_on_all_points_same_cluster(calinski_harabasz_score)

    # Assert the value is 1. when all samples are equal
    assert 1.0 == calinski_harabasz_score(np.ones((10, 2)), [0] * 5 + [1] * 5)

    # Assert the value is 0. when all the cluster means are equal
    assert 0.0 == calinski_harabasz_score([[-1, -1], [1, 1]] * 10, [0] * 10 + [1] * 10)

    # General case (with non-numpy arrays); `pytest.approx` must be used in
    # a comparison to actually assert anything
    X = (
        [[0, 0], [1, 1]] * 5
        + [[3, 3], [4, 4]] * 5
        + [[0, 4], [1, 3]] * 5
        + [[3, 1], [4, 0]] * 5
    )
    labels = [0] * 10 + [1] * 10 + [2] * 10 + [3] * 10
    assert calinski_harabasz_score(X, labels) == pytest.approx(
        45 * (40 - 4) / (5 * (4 - 1))
    )
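

# Illustrative sketch (not part of the original suite): Calinski-Harabasz
# is a ratio of between- to within-cluster dispersion,
# CH = (B / (k - 1)) / (W / (n - k)), which is where the expected value
# 45 * (40 - 4) / (5 * (4 - 1)) above comes from (B = 180, W = 20, n = 40,
# k = 4 for that data).
def _sketch_calinski_harabasz(X, labels):
    X, labels = np.asarray(X, dtype=float), np.asarray(labels)
    grand_mean = X.mean(axis=0)
    B = W = 0.0
    for k in np.unique(labels):
        Xk = X[labels == k]
        B += len(Xk) * ((Xk.mean(axis=0) - grand_mean) ** 2).sum()
        W += ((Xk - Xk.mean(axis=0)) ** 2).sum()
    n_samples, n_labels = len(X), len(np.unique(labels))
    return (B / (n_labels - 1)) / (W / (n_samples - n_labels))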


def test_davies_bouldin_score():
    assert_raises_on_only_one_label(davies_bouldin_score)
    assert_raises_on_all_points_same_cluster(davies_bouldin_score)

    # Assert the value is 0. when all samples are equal
    assert davies_bouldin_score(np.ones((10, 2)), [0] * 5 + [1] * 5) == pytest.approx(
        0.0
    )

    # Assert the value is 0. when all the cluster means are equal
    assert davies_bouldin_score(
        [[-1, -1], [1, 1]] * 10, [0] * 10 + [1] * 10
    ) == pytest.approx(0.0)

    # General case (with non-numpy arrays)
    X = (
        [[0, 0], [1, 1]] * 5
        + [[3, 3], [4, 4]] * 5
        + [[0, 4], [1, 3]] * 5
        + [[3, 1], [4, 0]] * 5
    )
    labels = [0] * 10 + [1] * 10 + [2] * 10 + [3] * 10
    assert davies_bouldin_score(X, labels) == pytest.approx(2 * np.sqrt(0.5) / 3)

    # Ensure divide by zero warning is not raised in general case
    with warnings.catch_warnings():
        warnings.simplefilter("error", RuntimeWarning)
        davies_bouldin_score(X, labels)

    # General case - a cluster has a single sample
    X = [[0, 0], [2, 2], [3, 3], [5, 5]]
    labels = [0, 0, 1, 2]
    assert davies_bouldin_score(X, labels) == pytest.approx((5.0 / 4) / 3)
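

# Illustrative sketch (not part of the original suite): Davies-Bouldin
# averages, over clusters, the worst ratio (s_i + s_j) / d_ij of
# within-cluster scatter to centroid separation; the library additionally
# special-cases degenerate geometries (e.g. coincident centroids), which
# this sketch ignores.
def _sketch_davies_bouldin(X, labels):
    X, labels = np.asarray(X, dtype=float), np.asarray(labels)
    uniq = np.unique(labels)
    centroids = np.array([X[labels == k].mean(axis=0) for k in uniq])
    # mean euclidean distance of each cluster's samples to its centroid
    s = np.array(
        [
            np.linalg.norm(X[labels == k] - c, axis=1).mean()
            for k, c in zip(uniq, centroids)
        ]
    )
    d = pairwise_distances(centroids)
    np.fill_diagonal(d, np.inf)  # exclude the i == j pairing from the max
    return np.mean(np.max((s[:, None] + s[None, :]) / d, axis=1))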


def test_silhouette_score_integer_precomputed():
    """Check that silhouette_score works for precomputed metrics that are integers.

    Non-regression test for #22107.
    """
    result = silhouette_score(
        [[0, 1, 2], [1, 0, 1], [2, 1, 0]], [0, 0, 1], metric="precomputed"
    )
    assert result == pytest.approx(1 / 6)

    # non-zero on diagonal for ints raises an error
    with pytest.raises(ValueError, match="contains non-zero"):
        silhouette_score(
            [[1, 1, 2], [1, 0, 1], [2, 1, 0]], [0, 0, 1], metric="precomputed"
        )