reconnect moved files to git repo

This commit is contained in:
root
2025-08-01 04:33:03 -04:00
commit 5d3c35492d
23190 changed files with 4750716 additions and 0 deletions

View File

@ -0,0 +1,52 @@
"""Evaluation metrics for cluster analysis results.
- Supervised evaluation uses ground truth class values for each sample.
- Unsupervised evaluation does not use ground truth and measures the "quality" of the
model itself.
"""
from ._bicluster import consensus_score
from ._supervised import (
adjusted_mutual_info_score,
adjusted_rand_score,
completeness_score,
contingency_matrix,
entropy,
expected_mutual_information,
fowlkes_mallows_score,
homogeneity_completeness_v_measure,
homogeneity_score,
mutual_info_score,
normalized_mutual_info_score,
pair_confusion_matrix,
rand_score,
v_measure_score,
)
from ._unsupervised import (
calinski_harabasz_score,
davies_bouldin_score,
silhouette_samples,
silhouette_score,
)
__all__ = [
"adjusted_mutual_info_score",
"normalized_mutual_info_score",
"adjusted_rand_score",
"rand_score",
"completeness_score",
"pair_confusion_matrix",
"contingency_matrix",
"expected_mutual_information",
"homogeneity_completeness_v_measure",
"homogeneity_score",
"mutual_info_score",
"v_measure_score",
"fowlkes_mallows_score",
"entropy",
"silhouette_samples",
"silhouette_score",
"calinski_harabasz_score",
"davies_bouldin_score",
"consensus_score",
]

View File

@ -0,0 +1,111 @@
import numpy as np
from scipy.optimize import linear_sum_assignment
from ...utils._param_validation import StrOptions, validate_params
from ...utils.validation import check_array, check_consistent_length
__all__ = ["consensus_score"]
def _check_rows_and_columns(a, b):
"""Unpacks the row and column arrays and checks their shape."""
check_consistent_length(*a)
check_consistent_length(*b)
checks = lambda x: check_array(x, ensure_2d=False)
a_rows, a_cols = map(checks, a)
b_rows, b_cols = map(checks, b)
return a_rows, a_cols, b_rows, b_cols
def _jaccard(a_rows, a_cols, b_rows, b_cols):
"""Jaccard coefficient on the elements of the two biclusters."""
intersection = (a_rows * b_rows).sum() * (a_cols * b_cols).sum()
a_size = a_rows.sum() * a_cols.sum()
b_size = b_rows.sum() * b_cols.sum()
return intersection / (a_size + b_size - intersection)
def _pairwise_similarity(a, b, similarity):
"""Computes pairwise similarity matrix.
result[i, j] is the Jaccard coefficient of a's bicluster i and b's
bicluster j.
"""
a_rows, a_cols, b_rows, b_cols = _check_rows_and_columns(a, b)
n_a = a_rows.shape[0]
n_b = b_rows.shape[0]
result = np.array(
[
[similarity(a_rows[i], a_cols[i], b_rows[j], b_cols[j]) for j in range(n_b)]
for i in range(n_a)
]
)
return result
@validate_params(
{
"a": [tuple],
"b": [tuple],
"similarity": [callable, StrOptions({"jaccard"})],
},
prefer_skip_nested_validation=True,
)
def consensus_score(a, b, *, similarity="jaccard"):
"""The similarity of two sets of biclusters.
Similarity between individual biclusters is computed. Then the best
matching between sets is found by solving a linear sum assignment problem,
using a modified Jonker-Volgenant algorithm.
The final score is the sum of similarities divided by the size of
the larger set.
Read more in the :ref:`User Guide <biclustering>`.
Parameters
----------
a : tuple (rows, columns)
Tuple of row and column indicators for a set of biclusters.
b : tuple (rows, columns)
Another set of biclusters like ``a``.
similarity : 'jaccard' or callable, default='jaccard'
May be the string "jaccard" to use the Jaccard coefficient, or
any function that takes four arguments, each of which is a 1d
indicator vector: (a_rows, a_columns, b_rows, b_columns).
Returns
-------
consensus_score : float
Consensus score, a non-negative value, sum of similarities
divided by size of larger set.
See Also
--------
scipy.optimize.linear_sum_assignment : Solve the linear sum assignment problem.
References
----------
* Hochreiter, Bodenhofer, et al., 2010. `FABIA: factor analysis
for bicluster acquisition
<https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2881408/>`__.
Examples
--------
>>> from sklearn.metrics import consensus_score
>>> a = ([[True, False], [False, True]], [[False, True], [True, False]])
>>> b = ([[False, True], [True, False]], [[True, False], [False, True]])
>>> consensus_score(a, b, similarity='jaccard')
np.float64(1.0)
"""
if similarity == "jaccard":
similarity = _jaccard
matrix = _pairwise_similarity(a, b, similarity)
row_indices, col_indices = linear_sum_assignment(1.0 - matrix)
n_a = len(a[0])
n_b = len(b[0])
return matrix[row_indices, col_indices].sum() / max(n_a, n_b)
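# Purely illustrative sketch (the helper name below is made up for this dump and
# is not part of the library's API): it replays the matching step above on the
# docstring's toy biclusters, using the module-level helpers, to show how the
# score is assembled from pairwise Jaccard similarities and a linear assignment.
def _consensus_score_matching_example():
    """Recompute ``consensus_score`` by hand for two toy sets of biclusters."""
    a = (
        np.array([[True, False], [False, True]]),
        np.array([[False, True], [True, False]]),
    )
    b = (
        np.array([[False, True], [True, False]]),
        np.array([[True, False], [False, True]]),
    )
    # Pairwise Jaccard similarities between biclusters of ``a`` and ``b``.
    matrix = _pairwise_similarity(a, b, _jaccard)
    # Best one-to-one matching, then normalise by the size of the larger set.
    rows, cols = linear_sum_assignment(1.0 - matrix)
    score = matrix[rows, cols].sum() / max(len(a[0]), len(b[0]))
    assert score == consensus_score(a, b)  # both give 1.0 on this example
    return score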

View File

@ -0,0 +1,70 @@
# Authors: Robert Layton <robertlayton@gmail.com>
# Corey Lynch <coreylynch9@gmail.com>
# License: BSD 3 clause
from libc.math cimport exp, lgamma
from ...utils._typedefs cimport float64_t, int64_t
import numpy as np
from scipy.special import gammaln
def expected_mutual_information(contingency, int64_t n_samples):
"""Calculate the expected mutual information for two labelings."""
cdef:
float64_t emi = 0
int64_t n_rows, n_cols
float64_t term2, term3, gln
int64_t[::1] a_view, b_view
float64_t[::1] term1
float64_t[::1] gln_a, gln_b, gln_Na, gln_Nb, gln_Nnij, log_Nnij
float64_t[::1] log_a, log_b
Py_ssize_t i, j, nij
int64_t start, end
n_rows, n_cols = contingency.shape
a = np.ravel(contingency.sum(axis=1).astype(np.int64, copy=False))
b = np.ravel(contingency.sum(axis=0).astype(np.int64, copy=False))
a_view = a
b_view = b
# any labelling with zero entropy implies EMI = 0
if a.size == 1 or b.size == 1:
return 0.0
# There are three major terms to the EMI equation, which are multiplied together
# and then summed over varying nij values.
# While nijs[0] will never be used, having it simplifies the indexing.
nijs = np.arange(0, max(np.max(a), np.max(b)) + 1, dtype='float')
nijs[0] = 1 # Stops divide by zero warnings. As it's not used, no issue.
# term1 is nij / N
term1 = nijs / n_samples
# term2 is log((N*nij) / (a * b)) == log(N * nij) - log(a * b)
log_a = np.log(a)
log_b = np.log(b)
# term2 uses log(N * nij) = log(N) + log(nij)
log_Nnij = np.log(n_samples) + np.log(nijs)
# term3 is large, and involves many factorials. Calculate these in log
# space to avoid overflow.
gln_a = gammaln(a + 1)
gln_b = gammaln(b + 1)
gln_Na = gammaln(n_samples - a + 1)
gln_Nb = gammaln(n_samples - b + 1)
gln_Nnij = gammaln(nijs + 1) + gammaln(n_samples + 1)
# emi itself is a summation over the various values.
for i in range(n_rows):
for j in range(n_cols):
start = max(1, a_view[i] - n_samples + b_view[j])
end = min(a_view[i], b_view[j]) + 1
for nij in range(start, end):
term2 = log_Nnij[nij] - log_a[i] - log_b[j]
# Numerators are positive, denominators are negative.
gln = (gln_a[i] + gln_b[j] + gln_Na[i] + gln_Nb[j]
- gln_Nnij[nij] - lgamma(a_view[i] - nij + 1)
- lgamma(b_view[j] - nij + 1)
- lgamma(n_samples - a_view[i] - b_view[j] + nij + 1))
term3 = exp(gln)
emi += (term1[nij] * term2 * term3)
return emi
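# Purely illustrative, plain-Python reference for the typed loop above (the
# name is made up here and is not part of the library): it spells out the same
# triple summation over nij using ``gammaln``, assuming a dense integer
# contingency matrix. It is far too slow for real use and exists only to
# document the formula being computed.
def _expected_mutual_information_reference(contingency, n_samples):
    a = np.ravel(np.asarray(contingency).sum(axis=1)).astype(np.int64)
    b = np.ravel(np.asarray(contingency).sum(axis=0)).astype(np.int64)
    if a.size == 1 or b.size == 1:
        # any labelling with zero entropy implies EMI = 0
        return 0.0
    emi = 0.0
    for ai in a:
        for bj in b:
            for nij in range(max(1, ai - n_samples + bj), min(ai, bj) + 1):
                term1 = nij / n_samples
                term2 = np.log(n_samples * nij) - np.log(ai * bj)
                # log of the hypergeometric factor, kept in log space to
                # avoid overflow, exactly as in the loop above
                gln = (
                    gammaln(ai + 1) + gammaln(bj + 1)
                    + gammaln(n_samples - ai + 1) + gammaln(n_samples - bj + 1)
                    - gammaln(n_samples + 1) - gammaln(nij + 1)
                    - gammaln(ai - nij + 1) - gammaln(bj - nij + 1)
                    - gammaln(n_samples - ai - bj + nij + 1)
                )
                emi += term1 * term2 * np.exp(gln)
    return emi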

File diff suppressed because it is too large

View File

@ -0,0 +1,466 @@
"""Unsupervised evaluation metrics."""
# Authors: Robert Layton <robertlayton@gmail.com>
# Arnaud Fouchet <foucheta@gmail.com>
# Thierry Guillemot <thierry.guillemot.work@gmail.com>
# License: BSD 3 clause
import functools
from numbers import Integral
import numpy as np
from scipy.sparse import issparse
from ...preprocessing import LabelEncoder
from ...utils import _safe_indexing, check_random_state, check_X_y
from ...utils._array_api import _atol_for_type
from ...utils._param_validation import (
Interval,
StrOptions,
validate_params,
)
from ..pairwise import _VALID_METRICS, pairwise_distances, pairwise_distances_chunked
def check_number_of_labels(n_labels, n_samples):
"""Check that number of labels are valid.
Parameters
----------
n_labels : int
Number of labels.
n_samples : int
Number of samples.
"""
if not 1 < n_labels < n_samples:
raise ValueError(
"Number of labels is %d. Valid values are 2 to n_samples - 1 (inclusive)"
% n_labels
)
@validate_params(
{
"X": ["array-like", "sparse matrix"],
"labels": ["array-like"],
"metric": [StrOptions(set(_VALID_METRICS) | {"precomputed"}), callable],
"sample_size": [Interval(Integral, 1, None, closed="left"), None],
"random_state": ["random_state"],
},
prefer_skip_nested_validation=True,
)
def silhouette_score(
X, labels, *, metric="euclidean", sample_size=None, random_state=None, **kwds
):
"""Compute the mean Silhouette Coefficient of all samples.
The Silhouette Coefficient is calculated using the mean intra-cluster
distance (``a``) and the mean nearest-cluster distance (``b``) for each
sample. The Silhouette Coefficient for a sample is ``(b - a) / max(a,
b)``. To clarify, ``b`` is the distance between a sample and the nearest
cluster that the sample is not a part of.
Note that Silhouette Coefficient is only defined if number of labels
is ``2 <= n_labels <= n_samples - 1``.
This function returns the mean Silhouette Coefficient over all samples.
To obtain the values for each sample, use :func:`silhouette_samples`.
The best value is 1 and the worst value is -1. Values near 0 indicate
overlapping clusters. Negative values generally indicate that a sample has
been assigned to the wrong cluster, as a different cluster is more similar.
Read more in the :ref:`User Guide <silhouette_coefficient>`.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples_a, n_samples_a) if metric == \
"precomputed" or (n_samples_a, n_features) otherwise
An array of pairwise distances between samples, or a feature array.
labels : array-like of shape (n_samples,)
Predicted labels for each sample.
metric : str or callable, default='euclidean'
The metric to use when calculating distance between instances in a
feature array. If metric is a string, it must be one of the options
allowed by :func:`~sklearn.metrics.pairwise_distances`. If ``X`` is
the distance array itself, use ``metric="precomputed"``.
sample_size : int, default=None
The size of the sample to use when computing the Silhouette Coefficient
on a random subset of the data.
If ``sample_size is None``, no sampling is used.
random_state : int, RandomState instance or None, default=None
Determines random number generation for selecting a subset of samples.
Used when ``sample_size is not None``.
Pass an int for reproducible results across multiple function calls.
See :term:`Glossary <random_state>`.
**kwds : optional keyword parameters
Any further parameters are passed directly to the distance function.
If using a scipy.spatial.distance metric, the parameters are still
metric dependent. See the scipy docs for usage examples.
Returns
-------
silhouette : float
Mean Silhouette Coefficient for all samples.
References
----------
.. [1] `Peter J. Rousseeuw (1987). "Silhouettes: a Graphical Aid to the
Interpretation and Validation of Cluster Analysis". Computational
and Applied Mathematics 20: 53-65.
<https://www.sciencedirect.com/science/article/pii/0377042787901257>`_
.. [2] `Wikipedia entry on the Silhouette Coefficient
<https://en.wikipedia.org/wiki/Silhouette_(clustering)>`_
Examples
--------
>>> from sklearn.datasets import make_blobs
>>> from sklearn.cluster import KMeans
>>> from sklearn.metrics import silhouette_score
>>> X, y = make_blobs(random_state=42)
>>> kmeans = KMeans(n_clusters=2, random_state=42)
>>> silhouette_score(X, kmeans.fit_predict(X))
np.float64(0.49...)
"""
if sample_size is not None:
X, labels = check_X_y(X, labels, accept_sparse=["csc", "csr"])
random_state = check_random_state(random_state)
indices = random_state.permutation(X.shape[0])[:sample_size]
if metric == "precomputed":
X, labels = X[indices].T[indices].T, labels[indices]
else:
X, labels = X[indices], labels[indices]
return np.mean(silhouette_samples(X, labels, metric=metric, **kwds))
def _silhouette_reduce(D_chunk, start, labels, label_freqs):
"""Accumulate silhouette statistics for vertical chunk of X.
Parameters
----------
D_chunk : {array-like, sparse matrix} of shape (n_chunk_samples, n_samples)
Precomputed distances for a chunk. If a sparse matrix is provided,
only CSR format is accepted.
start : int
First index in the chunk.
labels : array-like of shape (n_samples,)
Corresponding cluster labels, encoded as {0, ..., n_clusters-1}.
label_freqs : array-like
Distribution of cluster labels in ``labels``.
"""
n_chunk_samples = D_chunk.shape[0]
# accumulate distances from each sample to each cluster
cluster_distances = np.zeros(
(n_chunk_samples, len(label_freqs)), dtype=D_chunk.dtype
)
if issparse(D_chunk):
if D_chunk.format != "csr":
raise TypeError(
"Expected CSR matrix. Please pass sparse matrix in CSR format."
)
for i in range(n_chunk_samples):
indptr = D_chunk.indptr
indices = D_chunk.indices[indptr[i] : indptr[i + 1]]
sample_weights = D_chunk.data[indptr[i] : indptr[i + 1]]
sample_labels = np.take(labels, indices)
cluster_distances[i] += np.bincount(
sample_labels, weights=sample_weights, minlength=len(label_freqs)
)
else:
for i in range(n_chunk_samples):
sample_weights = D_chunk[i]
sample_labels = labels
cluster_distances[i] += np.bincount(
sample_labels, weights=sample_weights, minlength=len(label_freqs)
)
# intra_index selects intra-cluster distances within cluster_distances
end = start + n_chunk_samples
intra_index = (np.arange(n_chunk_samples), labels[start:end])
# intra_cluster_distances are averaged over cluster size outside this function
intra_cluster_distances = cluster_distances[intra_index]
# of the remaining distances we normalise and extract the minimum
cluster_distances[intra_index] = np.inf
cluster_distances /= label_freqs
inter_cluster_distances = cluster_distances.min(axis=1)
return intra_cluster_distances, inter_cluster_distances
@validate_params(
{
"X": ["array-like", "sparse matrix"],
"labels": ["array-like"],
"metric": [StrOptions(set(_VALID_METRICS) | {"precomputed"}), callable],
},
prefer_skip_nested_validation=True,
)
def silhouette_samples(X, labels, *, metric="euclidean", **kwds):
"""Compute the Silhouette Coefficient for each sample.
The Silhouette Coefficient is a measure of how well samples are clustered
with samples that are similar to themselves. Clustering models with a high
Silhouette Coefficient are said to be dense, where samples in the same
cluster are similar to each other, and well separated, where samples in
different clusters are not very similar to each other.
The Silhouette Coefficient is calculated using the mean intra-cluster
distance (``a``) and the mean nearest-cluster distance (``b``) for each
sample. The Silhouette Coefficient for a sample is ``(b - a) / max(a,
b)``.
Note that Silhouette Coefficient is only defined if number of labels
is ``2 <= n_labels <= n_samples - 1``.
This function returns the Silhouette Coefficient for each sample.
The best value is 1 and the worst value is -1. Values near 0 indicate
overlapping clusters.
Read more in the :ref:`User Guide <silhouette_coefficient>`.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples_a, n_samples_a) if metric == \
"precomputed" or (n_samples_a, n_features) otherwise
An array of pairwise distances between samples, or a feature array. If
a sparse matrix is provided, CSR format should be favoured to avoid
an additional copy.
labels : array-like of shape (n_samples,)
Label values for each sample.
metric : str or callable, default='euclidean'
The metric to use when calculating distance between instances in a
feature array. If metric is a string, it must be one of the options
allowed by :func:`~sklearn.metrics.pairwise_distances`.
If ``X`` is the distance array itself, use "precomputed" as the metric.
Precomputed distance matrices must have 0 along the diagonal.
**kwds : optional keyword parameters
Any further parameters are passed directly to the distance function.
If using a ``scipy.spatial.distance`` metric, the parameters are still
metric dependent. See the scipy docs for usage examples.
Returns
-------
silhouette : array-like of shape (n_samples,)
Silhouette Coefficients for each sample.
References
----------
.. [1] `Peter J. Rousseeuw (1987). "Silhouettes: a Graphical Aid to the
Interpretation and Validation of Cluster Analysis". Computational
and Applied Mathematics 20: 53-65.
<https://www.sciencedirect.com/science/article/pii/0377042787901257>`_
.. [2] `Wikipedia entry on the Silhouette Coefficient
<https://en.wikipedia.org/wiki/Silhouette_(clustering)>`_
Examples
--------
>>> from sklearn.metrics import silhouette_samples
>>> from sklearn.datasets import make_blobs
>>> from sklearn.cluster import KMeans
>>> X, y = make_blobs(n_samples=50, random_state=42)
>>> kmeans = KMeans(n_clusters=3, random_state=42)
>>> labels = kmeans.fit_predict(X)
>>> silhouette_samples(X, labels)
array([...])
"""
X, labels = check_X_y(X, labels, accept_sparse=["csr"])
# Check for non-zero diagonal entries in precomputed distance matrix
if metric == "precomputed":
error_msg = ValueError(
"The precomputed distance matrix contains non-zero "
"elements on the diagonal. Use np.fill_diagonal(X, 0)."
)
if X.dtype.kind == "f":
atol = _atol_for_type(X.dtype)
if np.any(np.abs(X.diagonal()) > atol):
raise error_msg
elif np.any(X.diagonal() != 0): # integral dtype
raise error_msg
le = LabelEncoder()
labels = le.fit_transform(labels)
n_samples = len(labels)
label_freqs = np.bincount(labels)
check_number_of_labels(len(le.classes_), n_samples)
kwds["metric"] = metric
reduce_func = functools.partial(
_silhouette_reduce, labels=labels, label_freqs=label_freqs
)
results = zip(*pairwise_distances_chunked(X, reduce_func=reduce_func, **kwds))
intra_clust_dists, inter_clust_dists = results
intra_clust_dists = np.concatenate(intra_clust_dists)
inter_clust_dists = np.concatenate(inter_clust_dists)
denom = (label_freqs - 1).take(labels, mode="clip")
with np.errstate(divide="ignore", invalid="ignore"):
intra_clust_dists /= denom
sil_samples = inter_clust_dists - intra_clust_dists
with np.errstate(divide="ignore", invalid="ignore"):
sil_samples /= np.maximum(intra_clust_dists, inter_clust_dists)
# nan values are for clusters of size 1, and should be 0
return np.nan_to_num(sil_samples)
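# Purely illustrative brute-force sketch of the per-sample definition in the
# docstring above (the helper name is made up and not part of the module's
# API); it assumes a dense precomputed distance matrix and integer labels, and
# the chunked implementation should agree with it on small inputs.
def _silhouette_samples_brute_force(D, labels):
    D = np.asarray(D, dtype=float)
    labels = np.asarray(labels)
    scores = np.zeros(len(labels))
    for i in range(len(labels)):
        same = labels == labels[i]
        same[i] = False
        if not same.any():
            # singleton cluster: the score stays 0 by convention
            continue
        a = D[i, same].mean()  # mean intra-cluster distance
        b = min(  # mean distance to the nearest other cluster
            D[i, labels == k].mean() for k in np.unique(labels) if k != labels[i]
        )
        scores[i] = (b - a) / max(a, b)
    return scores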
@validate_params(
{
"X": ["array-like"],
"labels": ["array-like"],
},
prefer_skip_nested_validation=True,
)
def calinski_harabasz_score(X, labels):
"""Compute the Calinski and Harabasz score.
It is also known as the Variance Ratio Criterion.
The score is defined as the ratio of between-cluster dispersion to
within-cluster dispersion, each normalized by its degrees of freedom.
Read more in the :ref:`User Guide <calinski_harabasz_index>`.
Parameters
----------
X : array-like of shape (n_samples, n_features)
A list of ``n_features``-dimensional data points. Each row corresponds
to a single data point.
labels : array-like of shape (n_samples,)
Predicted labels for each sample.
Returns
-------
score : float
The resulting Calinski-Harabasz score.
References
----------
.. [1] `T. Calinski and J. Harabasz, 1974. "A dendrite method for cluster
analysis". Communications in Statistics
<https://www.tandfonline.com/doi/abs/10.1080/03610927408827101>`_
Examples
--------
>>> from sklearn.datasets import make_blobs
>>> from sklearn.cluster import KMeans
>>> from sklearn.metrics import calinski_harabasz_score
>>> X, _ = make_blobs(random_state=0)
>>> kmeans = KMeans(n_clusters=3, random_state=0).fit(X)
>>> calinski_harabasz_score(X, kmeans.labels_)
np.float64(114.8...)
"""
X, labels = check_X_y(X, labels)
le = LabelEncoder()
labels = le.fit_transform(labels)
n_samples, _ = X.shape
n_labels = len(le.classes_)
check_number_of_labels(n_labels, n_samples)
extra_disp, intra_disp = 0.0, 0.0
mean = np.mean(X, axis=0)
for k in range(n_labels):
cluster_k = X[labels == k]
mean_k = np.mean(cluster_k, axis=0)
extra_disp += len(cluster_k) * np.sum((mean_k - mean) ** 2)
intra_disp += np.sum((cluster_k - mean_k) ** 2)
return (
1.0
if intra_disp == 0.0
else extra_disp * (n_samples - n_labels) / (intra_disp * (n_labels - 1.0))
)
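# A small hand-checked illustration of the ratio above on a made-up dataset
# (two clusters of two points each; the helper and dataset are only a sketch,
# not taken from the library's own examples).
def _calinski_harabasz_worked_example():
    X = np.array([[0.0, 0.0], [0.0, 1.0], [4.0, 0.0], [4.0, 1.0]])
    labels = np.array([0, 0, 1, 1])
    # Cluster means are (0, 0.5) and (4, 0.5); the overall mean is (2, 0.5).
    # Between-cluster dispersion: 2 * 2**2 + 2 * 2**2 = 16.
    # Within-cluster dispersion: 0.5 + 0.5 = 1.
    # Score: 16 * (n_samples - n_labels) / (1 * (n_labels - 1)) = 16 * 2 / 1 = 32.
    assert np.isclose(calinski_harabasz_score(X, labels), 32.0)
    return calinski_harabasz_score(X, labels)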
@validate_params(
{
"X": ["array-like"],
"labels": ["array-like"],
},
prefer_skip_nested_validation=True,
)
def davies_bouldin_score(X, labels):
"""Compute the Davies-Bouldin score.
The score is defined as the average similarity measure of each cluster with
its most similar cluster, where similarity is the ratio of within-cluster
distances to between-cluster distances. Thus, clusters which are farther
apart and less dispersed will result in a better score.
The minimum score is zero, with lower values indicating better clustering.
Read more in the :ref:`User Guide <davies-bouldin_index>`.
.. versionadded:: 0.20
Parameters
----------
X : array-like of shape (n_samples, n_features)
A list of ``n_features``-dimensional data points. Each row corresponds
to a single data point.
labels : array-like of shape (n_samples,)
Predicted labels for each sample.
Returns
-------
score : float
The resulting Davies-Bouldin score.
References
----------
.. [1] Davies, David L.; Bouldin, Donald W. (1979).
`"A Cluster Separation Measure"
<https://ieeexplore.ieee.org/document/4766909>`__.
IEEE Transactions on Pattern Analysis and Machine Intelligence.
PAMI-1 (2): 224-227
Examples
--------
>>> from sklearn.metrics import davies_bouldin_score
>>> X = [[0, 1], [1, 1], [3, 4]]
>>> labels = [0, 0, 1]
>>> davies_bouldin_score(X, labels)
np.float64(0.12...)
"""
X, labels = check_X_y(X, labels)
le = LabelEncoder()
labels = le.fit_transform(labels)
n_samples, _ = X.shape
n_labels = len(le.classes_)
check_number_of_labels(n_labels, n_samples)
intra_dists = np.zeros(n_labels)
centroids = np.zeros((n_labels, len(X[0])), dtype=float)
for k in range(n_labels):
cluster_k = _safe_indexing(X, labels == k)
centroid = cluster_k.mean(axis=0)
centroids[k] = centroid
intra_dists[k] = np.average(pairwise_distances(cluster_k, [centroid]))
centroid_distances = pairwise_distances(centroids)
if np.allclose(intra_dists, 0) or np.allclose(centroid_distances, 0):
return 0.0
centroid_distances[centroid_distances == 0] = np.inf
combined_intra_dists = intra_dists[:, None] + intra_dists
scores = np.max(combined_intra_dists / centroid_distances, axis=1)
return np.mean(scores)
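# A hand-worked check of the docstring example above (X = [[0, 1], [1, 1],
# [3, 4]] with labels [0, 0, 1]), added only to spell out where the 0.12...
# value comes from; the helper name is made up for this sketch.
def _davies_bouldin_worked_example():
    X = np.array([[0.0, 1.0], [1.0, 1.0], [3.0, 4.0]])
    labels = np.array([0, 0, 1])
    # Cluster 0: centroid (0.5, 1), mean distance of its two points to it = 0.5.
    # Cluster 1: a single point, so its intra-cluster distance is 0.
    # Centroid distance: sqrt(2.5**2 + 3**2) = sqrt(15.25) ~= 3.905.
    # Pair similarity: (0.5 + 0) / 3.905 ~= 0.128; the score is the mean of each
    # cluster's worst-case similarity, which is the same value here.
    expected = 0.5 / np.sqrt(15.25)
    assert np.isclose(davies_bouldin_score(X, labels), expected)
    return expected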

View File

@ -0,0 +1,7 @@
py.extension_module(
'_expected_mutual_info_fast',
'_expected_mutual_info_fast.pyx',
cython_args: cython_args,
subdir: 'sklearn/metrics/cluster',
install: true
)

View File

@ -0,0 +1,56 @@
"""Testing for bicluster metrics module"""
import numpy as np
from sklearn.metrics import consensus_score
from sklearn.metrics.cluster._bicluster import _jaccard
from sklearn.utils._testing import assert_almost_equal
def test_jaccard():
a1 = np.array([True, True, False, False])
a2 = np.array([True, True, True, True])
a3 = np.array([False, True, True, False])
a4 = np.array([False, False, True, True])
assert _jaccard(a1, a1, a1, a1) == 1
assert _jaccard(a1, a1, a2, a2) == 0.25
assert _jaccard(a1, a1, a3, a3) == 1.0 / 7
assert _jaccard(a1, a1, a4, a4) == 0
def test_consensus_score():
a = [[True, True, False, False], [False, False, True, True]]
b = a[::-1]
assert consensus_score((a, a), (a, a)) == 1
assert consensus_score((a, a), (b, b)) == 1
assert consensus_score((a, b), (a, b)) == 1
assert consensus_score((a, b), (b, a)) == 1
assert consensus_score((a, a), (b, a)) == 0
assert consensus_score((a, a), (a, b)) == 0
assert consensus_score((b, b), (a, b)) == 0
assert consensus_score((b, b), (b, a)) == 0
def test_consensus_score_issue2445():
"""Different number of biclusters in A and B"""
a_rows = np.array(
[
[True, True, False, False],
[False, False, True, True],
[False, False, False, True],
]
)
a_cols = np.array(
[
[True, True, False, False],
[False, False, True, True],
[False, False, False, True],
]
)
idx = [0, 2]
s = consensus_score((a_rows, a_cols), (a_rows[idx], a_cols[idx]))
# B contains 2 of the 3 biclusters in A, so score should be 2/3
assert_almost_equal(s, 2.0 / 3.0)

View File

@ -0,0 +1,219 @@
from functools import partial
from itertools import chain
import numpy as np
import pytest
from sklearn.metrics.cluster import (
adjusted_mutual_info_score,
adjusted_rand_score,
calinski_harabasz_score,
completeness_score,
davies_bouldin_score,
fowlkes_mallows_score,
homogeneity_score,
mutual_info_score,
normalized_mutual_info_score,
rand_score,
silhouette_score,
v_measure_score,
)
from sklearn.utils._testing import assert_allclose
# Dictionaries of metrics
# ------------------------
# The goal of having those dictionaries is to have an easy way to call a
# particular metric and associate a name to each function:
# - SUPERVISED_METRICS: all supervised cluster metrics - (when given a
# ground truth value)
# - UNSUPERVISED_METRICS: all unsupervised cluster metrics
#
# Those dictionaries will be used to test systematically some invariance
# properties, e.g. invariance with respect to several input layouts.
#
SUPERVISED_METRICS = {
"adjusted_mutual_info_score": adjusted_mutual_info_score,
"adjusted_rand_score": adjusted_rand_score,
"rand_score": rand_score,
"completeness_score": completeness_score,
"homogeneity_score": homogeneity_score,
"mutual_info_score": mutual_info_score,
"normalized_mutual_info_score": normalized_mutual_info_score,
"v_measure_score": v_measure_score,
"fowlkes_mallows_score": fowlkes_mallows_score,
}
UNSUPERVISED_METRICS = {
"silhouette_score": silhouette_score,
"silhouette_manhattan": partial(silhouette_score, metric="manhattan"),
"calinski_harabasz_score": calinski_harabasz_score,
"davies_bouldin_score": davies_bouldin_score,
}
# Lists of metrics with common properties
# ---------------------------------------
# Lists of metrics with common properties are used to test systematically some
# functionalities and invariance, e.g. SYMMETRIC_METRICS lists all metrics
# that are symmetric with respect to their input argument y_true and y_pred.
#
# --------------------------------------------------------------------
# Symmetric with respect to their input arguments y_true and y_pred.
# Symmetry in this sense only applies to the supervised metrics.
SYMMETRIC_METRICS = [
"adjusted_rand_score",
"rand_score",
"v_measure_score",
"mutual_info_score",
"adjusted_mutual_info_score",
"normalized_mutual_info_score",
"fowlkes_mallows_score",
]
NON_SYMMETRIC_METRICS = ["homogeneity_score", "completeness_score"]
# Metrics whose upper bound is 1
NORMALIZED_METRICS = [
"adjusted_rand_score",
"rand_score",
"homogeneity_score",
"completeness_score",
"v_measure_score",
"adjusted_mutual_info_score",
"fowlkes_mallows_score",
"normalized_mutual_info_score",
]
rng = np.random.RandomState(0)
y1 = rng.randint(3, size=30)
y2 = rng.randint(3, size=30)
def test_symmetric_non_symmetric_union():
assert sorted(SYMMETRIC_METRICS + NON_SYMMETRIC_METRICS) == sorted(
SUPERVISED_METRICS
)
# 0.22 AMI and NMI changes
@pytest.mark.filterwarnings("ignore::FutureWarning")
@pytest.mark.parametrize(
"metric_name, y1, y2", [(name, y1, y2) for name in SYMMETRIC_METRICS]
)
def test_symmetry(metric_name, y1, y2):
metric = SUPERVISED_METRICS[metric_name]
assert metric(y1, y2) == pytest.approx(metric(y2, y1))
@pytest.mark.parametrize(
"metric_name, y1, y2", [(name, y1, y2) for name in NON_SYMMETRIC_METRICS]
)
def test_non_symmetry(metric_name, y1, y2):
metric = SUPERVISED_METRICS[metric_name]
assert metric(y1, y2) != pytest.approx(metric(y2, y1))
# 0.22 AMI and NMI changes
@pytest.mark.filterwarnings("ignore::FutureWarning")
@pytest.mark.parametrize("metric_name", NORMALIZED_METRICS)
def test_normalized_output(metric_name):
upper_bound_1 = [0, 0, 0, 1, 1, 1]
upper_bound_2 = [0, 0, 0, 1, 1, 1]
metric = SUPERVISED_METRICS[metric_name]
assert metric([0, 0, 0, 1, 1], [0, 0, 0, 1, 2]) > 0.0
assert metric([0, 0, 1, 1, 2], [0, 0, 1, 1, 1]) > 0.0
assert metric([0, 0, 0, 1, 2], [0, 1, 1, 1, 1]) < 1.0
assert metric([0, 0, 0, 1, 2], [0, 1, 1, 1, 1]) < 1.0
assert metric(upper_bound_1, upper_bound_2) == pytest.approx(1.0)
lower_bound_1 = [0, 0, 0, 0, 0, 0]
lower_bound_2 = [0, 1, 2, 3, 4, 5]
score = np.array(
[metric(lower_bound_1, lower_bound_2), metric(lower_bound_2, lower_bound_1)]
)
assert not (score < 0).any()
# 0.22 AMI and NMI changes
@pytest.mark.filterwarnings("ignore::FutureWarning")
@pytest.mark.parametrize("metric_name", chain(SUPERVISED_METRICS, UNSUPERVISED_METRICS))
def test_permute_labels(metric_name):
# No clustering metric changes its score under a permutation of the labels,
# e.g. when 0 and 1 are exchanged.
y_label = np.array([0, 0, 0, 1, 1, 0, 1])
y_pred = np.array([1, 0, 1, 0, 1, 1, 0])
if metric_name in SUPERVISED_METRICS:
metric = SUPERVISED_METRICS[metric_name]
score_1 = metric(y_pred, y_label)
assert_allclose(score_1, metric(1 - y_pred, y_label))
assert_allclose(score_1, metric(1 - y_pred, 1 - y_label))
assert_allclose(score_1, metric(y_pred, 1 - y_label))
else:
metric = UNSUPERVISED_METRICS[metric_name]
X = np.random.randint(10, size=(7, 10))
score_1 = metric(X, y_pred)
assert_allclose(score_1, metric(X, 1 - y_pred))
# 0.22 AMI and NMI changes
@pytest.mark.filterwarnings("ignore::FutureWarning")
@pytest.mark.parametrize("metric_name", chain(SUPERVISED_METRICS, UNSUPERVISED_METRICS))
# For all clustering metrics, input parameters can be given
# as arrays or lists, with positive, negative or string labels
def test_format_invariance(metric_name):
y_true = [0, 0, 0, 0, 1, 1, 1, 1]
y_pred = [0, 1, 2, 3, 4, 5, 6, 7]
def generate_formats(y):
y = np.array(y)
yield y, "array of ints"
yield y.tolist(), "list of ints"
yield [str(x) + "-a" for x in y.tolist()], "list of strs"
yield (
np.array([str(x) + "-a" for x in y.tolist()], dtype=object),
"array of strs",
)
yield y - 1, "including negative ints"
yield y + 1, "strictly positive ints"
if metric_name in SUPERVISED_METRICS:
metric = SUPERVISED_METRICS[metric_name]
score_1 = metric(y_true, y_pred)
y_true_gen = generate_formats(y_true)
y_pred_gen = generate_formats(y_pred)
for (y_true_fmt, fmt_name), (y_pred_fmt, _) in zip(y_true_gen, y_pred_gen):
assert score_1 == metric(y_true_fmt, y_pred_fmt)
else:
metric = UNSUPERVISED_METRICS[metric_name]
X = np.random.randint(10, size=(8, 10))
score_1 = metric(X, y_true)
assert score_1 == metric(X.astype(float), y_true)
y_true_gen = generate_formats(y_true)
for y_true_fmt, fmt_name in y_true_gen:
assert score_1 == metric(X, y_true_fmt)
@pytest.mark.parametrize("metric", SUPERVISED_METRICS.values())
def test_single_sample(metric):
# only the supervised metrics support a single sample
for i, j in [(0, 0), (0, 1), (1, 0), (1, 1)]:
metric([i], [j])
@pytest.mark.parametrize(
"metric_name, metric_func", dict(SUPERVISED_METRICS, **UNSUPERVISED_METRICS).items()
)
def test_inf_nan_input(metric_name, metric_func):
if metric_name in SUPERVISED_METRICS:
invalids = [
([0, 1], [np.inf, np.inf]),
([0, 1], [np.nan, np.nan]),
([0, 1], [np.nan, np.inf]),
]
else:
X = np.random.randint(10, size=(2, 10))
invalids = [(X, [np.inf, np.inf]), (X, [np.nan, np.nan]), (X, [np.nan, np.inf])]
with pytest.raises(ValueError, match=r"contains (NaN|infinity)"):
for args in invalids:
metric_func(*args)

View File

@ -0,0 +1,482 @@
import warnings
import numpy as np
import pytest
from numpy.testing import assert_allclose, assert_array_almost_equal, assert_array_equal
from sklearn.metrics.cluster import (
adjusted_mutual_info_score,
adjusted_rand_score,
completeness_score,
contingency_matrix,
entropy,
expected_mutual_information,
fowlkes_mallows_score,
homogeneity_completeness_v_measure,
homogeneity_score,
mutual_info_score,
normalized_mutual_info_score,
pair_confusion_matrix,
rand_score,
v_measure_score,
)
from sklearn.metrics.cluster._supervised import _generalized_average, check_clusterings
from sklearn.utils import assert_all_finite
from sklearn.utils._testing import assert_almost_equal
score_funcs = [
adjusted_rand_score,
rand_score,
homogeneity_score,
completeness_score,
v_measure_score,
adjusted_mutual_info_score,
normalized_mutual_info_score,
]
def test_error_messages_on_wrong_input():
for score_func in score_funcs:
expected = (
r"Found input variables with inconsistent numbers " r"of samples: \[2, 3\]"
)
with pytest.raises(ValueError, match=expected):
score_func([0, 1], [1, 1, 1])
expected = r"labels_true must be 1D: shape is \(2"
with pytest.raises(ValueError, match=expected):
score_func([[0, 1], [1, 0]], [1, 1, 1])
expected = r"labels_pred must be 1D: shape is \(2"
with pytest.raises(ValueError, match=expected):
score_func([0, 1, 0], [[1, 1], [0, 0]])
def test_generalized_average():
a, b = 1, 2
methods = ["min", "geometric", "arithmetic", "max"]
means = [_generalized_average(a, b, method) for method in methods]
assert means[0] <= means[1] <= means[2] <= means[3]
c, d = 12, 12
means = [_generalized_average(c, d, method) for method in methods]
assert means[0] == means[1] == means[2] == means[3]
def test_perfect_matches():
for score_func in score_funcs:
assert score_func([], []) == pytest.approx(1.0)
assert score_func([0], [1]) == pytest.approx(1.0)
assert score_func([0, 0, 0], [0, 0, 0]) == pytest.approx(1.0)
assert score_func([0, 1, 0], [42, 7, 42]) == pytest.approx(1.0)
assert score_func([0.0, 1.0, 0.0], [42.0, 7.0, 42.0]) == pytest.approx(1.0)
assert score_func([0.0, 1.0, 2.0], [42.0, 7.0, 2.0]) == pytest.approx(1.0)
assert score_func([0, 1, 2], [42, 7, 2]) == pytest.approx(1.0)
score_funcs_with_changing_means = [
normalized_mutual_info_score,
adjusted_mutual_info_score,
]
means = {"min", "geometric", "arithmetic", "max"}
for score_func in score_funcs_with_changing_means:
for mean in means:
assert score_func([], [], average_method=mean) == pytest.approx(1.0)
assert score_func([0], [1], average_method=mean) == pytest.approx(1.0)
assert score_func(
[0, 0, 0], [0, 0, 0], average_method=mean
) == pytest.approx(1.0)
assert score_func(
[0, 1, 0], [42, 7, 42], average_method=mean
) == pytest.approx(1.0)
assert score_func(
[0.0, 1.0, 0.0], [42.0, 7.0, 42.0], average_method=mean
) == pytest.approx(1.0)
assert score_func(
[0.0, 1.0, 2.0], [42.0, 7.0, 2.0], average_method=mean
) == pytest.approx(1.0)
assert score_func(
[0, 1, 2], [42, 7, 2], average_method=mean
) == pytest.approx(1.0)
def test_homogeneous_but_not_complete_labeling():
# homogeneous but not complete clustering
h, c, v = homogeneity_completeness_v_measure([0, 0, 0, 1, 1, 1], [0, 0, 0, 1, 2, 2])
assert_almost_equal(h, 1.00, 2)
assert_almost_equal(c, 0.69, 2)
assert_almost_equal(v, 0.81, 2)
def test_complete_but_not_homogeneous_labeling():
# complete but not homogeneous clustering
h, c, v = homogeneity_completeness_v_measure([0, 0, 1, 1, 2, 2], [0, 0, 1, 1, 1, 1])
assert_almost_equal(h, 0.58, 2)
assert_almost_equal(c, 1.00, 2)
assert_almost_equal(v, 0.73, 2)
def test_not_complete_and_not_homogeneous_labeling():
# neither complete nor homogeneous but not so bad either
h, c, v = homogeneity_completeness_v_measure([0, 0, 0, 1, 1, 1], [0, 1, 0, 1, 2, 2])
assert_almost_equal(h, 0.67, 2)
assert_almost_equal(c, 0.42, 2)
assert_almost_equal(v, 0.52, 2)
def test_beta_parameter():
# test for when beta passed to
# homogeneity_completeness_v_measure
# and v_measure_score
beta_test = 0.2
h_test = 0.67
c_test = 0.42
v_test = (1 + beta_test) * h_test * c_test / (beta_test * h_test + c_test)
h, c, v = homogeneity_completeness_v_measure(
[0, 0, 0, 1, 1, 1], [0, 1, 0, 1, 2, 2], beta=beta_test
)
assert_almost_equal(h, h_test, 2)
assert_almost_equal(c, c_test, 2)
assert_almost_equal(v, v_test, 2)
v = v_measure_score([0, 0, 0, 1, 1, 1], [0, 1, 0, 1, 2, 2], beta=beta_test)
assert_almost_equal(v, v_test, 2)
def test_non_consecutive_labels():
# regression tests for labels with gaps
h, c, v = homogeneity_completeness_v_measure([0, 0, 0, 2, 2, 2], [0, 1, 0, 1, 2, 2])
assert_almost_equal(h, 0.67, 2)
assert_almost_equal(c, 0.42, 2)
assert_almost_equal(v, 0.52, 2)
h, c, v = homogeneity_completeness_v_measure([0, 0, 0, 1, 1, 1], [0, 4, 0, 4, 2, 2])
assert_almost_equal(h, 0.67, 2)
assert_almost_equal(c, 0.42, 2)
assert_almost_equal(v, 0.52, 2)
ari_1 = adjusted_rand_score([0, 0, 0, 1, 1, 1], [0, 1, 0, 1, 2, 2])
ari_2 = adjusted_rand_score([0, 0, 0, 1, 1, 1], [0, 4, 0, 4, 2, 2])
assert_almost_equal(ari_1, 0.24, 2)
assert_almost_equal(ari_2, 0.24, 2)
ri_1 = rand_score([0, 0, 0, 1, 1, 1], [0, 1, 0, 1, 2, 2])
ri_2 = rand_score([0, 0, 0, 1, 1, 1], [0, 4, 0, 4, 2, 2])
assert_almost_equal(ri_1, 0.66, 2)
assert_almost_equal(ri_2, 0.66, 2)
def uniform_labelings_scores(score_func, n_samples, k_range, n_runs=10, seed=42):
# Compute score for random uniform cluster labelings
random_labels = np.random.RandomState(seed).randint
scores = np.zeros((len(k_range), n_runs))
for i, k in enumerate(k_range):
for j in range(n_runs):
labels_a = random_labels(low=0, high=k, size=n_samples)
labels_b = random_labels(low=0, high=k, size=n_samples)
scores[i, j] = score_func(labels_a, labels_b)
return scores
def test_adjustment_for_chance():
# Check that adjusted scores are almost zero on random labels
n_clusters_range = [2, 10, 50, 90]
n_samples = 100
n_runs = 10
scores = uniform_labelings_scores(
adjusted_rand_score, n_samples, n_clusters_range, n_runs
)
max_abs_scores = np.abs(scores).max(axis=1)
assert_array_almost_equal(max_abs_scores, [0.02, 0.03, 0.03, 0.02], 2)
def test_adjusted_mutual_info_score():
# Compute the Adjusted Mutual Information and test against known values
labels_a = np.array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3])
labels_b = np.array([1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 3, 1, 3, 3, 3, 2, 2])
# Mutual information
mi = mutual_info_score(labels_a, labels_b)
assert_almost_equal(mi, 0.41022, 5)
# with provided sparse contingency
C = contingency_matrix(labels_a, labels_b, sparse=True)
mi = mutual_info_score(labels_a, labels_b, contingency=C)
assert_almost_equal(mi, 0.41022, 5)
# with provided dense contingency
C = contingency_matrix(labels_a, labels_b)
mi = mutual_info_score(labels_a, labels_b, contingency=C)
assert_almost_equal(mi, 0.41022, 5)
# Expected mutual information
n_samples = C.sum()
emi = expected_mutual_information(C, n_samples)
assert_almost_equal(emi, 0.15042, 5)
# Adjusted mutual information
ami = adjusted_mutual_info_score(labels_a, labels_b)
assert_almost_equal(ami, 0.27821, 5)
ami = adjusted_mutual_info_score([1, 1, 2, 2], [2, 2, 3, 3])
assert ami == pytest.approx(1.0)
# Test with a very large array
a110 = np.array([list(labels_a) * 110]).flatten()
b110 = np.array([list(labels_b) * 110]).flatten()
ami = adjusted_mutual_info_score(a110, b110)
assert_almost_equal(ami, 0.38, 2)
def test_expected_mutual_info_overflow():
# Test for regression where contingency cell exceeds 2**16
# leading to overflow in np.outer, resulting in EMI > 1
assert expected_mutual_information(np.array([[70000]]), 70000) <= 1
def test_int_overflow_mutual_info_fowlkes_mallows_score():
# Test overflow in mutual_info_score and fowlkes_mallows_score
x = np.array(
[1] * (52632 + 2529)
+ [2] * (14660 + 793)
+ [3] * (3271 + 204)
+ [4] * (814 + 39)
+ [5] * (316 + 20)
)
y = np.array(
[0] * 52632
+ [1] * 2529
+ [0] * 14660
+ [1] * 793
+ [0] * 3271
+ [1] * 204
+ [0] * 814
+ [1] * 39
+ [0] * 316
+ [1] * 20
)
assert_all_finite(mutual_info_score(x, y))
assert_all_finite(fowlkes_mallows_score(x, y))
def test_entropy():
ent = entropy([0, 0, 42.0])
assert_almost_equal(ent, 0.6365141, 5)
assert_almost_equal(entropy([]), 1)
assert entropy([1, 1, 1, 1]) == 0
def test_contingency_matrix():
labels_a = np.array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3])
labels_b = np.array([1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 3, 1, 3, 3, 3, 2, 2])
C = contingency_matrix(labels_a, labels_b)
C2 = np.histogram2d(labels_a, labels_b, bins=(np.arange(1, 5), np.arange(1, 5)))[0]
assert_array_almost_equal(C, C2)
C = contingency_matrix(labels_a, labels_b, eps=0.1)
assert_array_almost_equal(C, C2 + 0.1)
def test_contingency_matrix_sparse():
labels_a = np.array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3])
labels_b = np.array([1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 3, 1, 3, 3, 3, 2, 2])
C = contingency_matrix(labels_a, labels_b)
C_sparse = contingency_matrix(labels_a, labels_b, sparse=True).toarray()
assert_array_almost_equal(C, C_sparse)
with pytest.raises(ValueError, match="Cannot set 'eps' when sparse=True"):
contingency_matrix(labels_a, labels_b, eps=1e-10, sparse=True)
def test_exactly_zero_info_score():
# Check numerical stability when information is exactly zero
for i in np.logspace(1, 4, 4).astype(int):
labels_a, labels_b = (np.ones(i, dtype=int), np.arange(i, dtype=int))
assert normalized_mutual_info_score(labels_a, labels_b) == pytest.approx(0.0)
assert v_measure_score(labels_a, labels_b) == pytest.approx(0.0)
assert adjusted_mutual_info_score(labels_a, labels_b) == pytest.approx(0.0)
assert normalized_mutual_info_score(labels_a, labels_b) == pytest.approx(0.0)
for method in ["min", "geometric", "arithmetic", "max"]:
assert adjusted_mutual_info_score(
labels_a, labels_b, average_method=method
) == pytest.approx(0.0)
assert normalized_mutual_info_score(
labels_a, labels_b, average_method=method
) == pytest.approx(0.0)
def test_v_measure_and_mutual_information(seed=36):
# Check relation between v_measure, entropy and mutual information
for i in np.logspace(1, 4, 4).astype(int):
random_state = np.random.RandomState(seed)
labels_a, labels_b = (
random_state.randint(0, 10, i),
random_state.randint(0, 10, i),
)
assert_almost_equal(
v_measure_score(labels_a, labels_b),
2.0
* mutual_info_score(labels_a, labels_b)
/ (entropy(labels_a) + entropy(labels_b)),
0,
)
avg = "arithmetic"
assert_almost_equal(
v_measure_score(labels_a, labels_b),
normalized_mutual_info_score(labels_a, labels_b, average_method=avg),
)
def test_fowlkes_mallows_score():
# General case
score = fowlkes_mallows_score([0, 0, 0, 1, 1, 1], [0, 0, 1, 1, 2, 2])
assert_almost_equal(score, 4.0 / np.sqrt(12.0 * 6.0))
# Perfect match but where the label names changed
perfect_score = fowlkes_mallows_score([0, 0, 0, 1, 1, 1], [1, 1, 1, 0, 0, 0])
assert_almost_equal(perfect_score, 1.0)
# Worst case
worst_score = fowlkes_mallows_score([0, 0, 0, 0, 0, 0], [0, 1, 2, 3, 4, 5])
assert_almost_equal(worst_score, 0.0)
def test_fowlkes_mallows_score_properties():
# handcrafted example
labels_a = np.array([0, 0, 0, 1, 1, 2])
labels_b = np.array([1, 1, 2, 2, 0, 0])
expected = 1.0 / np.sqrt((1.0 + 3.0) * (1.0 + 2.0))
# FMI = TP / sqrt((TP + FP) * (TP + FN))
score_original = fowlkes_mallows_score(labels_a, labels_b)
assert_almost_equal(score_original, expected)
# symmetric property
score_symmetric = fowlkes_mallows_score(labels_b, labels_a)
assert_almost_equal(score_symmetric, expected)
# permutation property
score_permuted = fowlkes_mallows_score((labels_a + 1) % 3, labels_b)
assert_almost_equal(score_permuted, expected)
# symmetric and permutation(both together)
score_both = fowlkes_mallows_score(labels_b, (labels_a + 2) % 3)
assert_almost_equal(score_both, expected)
@pytest.mark.parametrize(
"labels_true, labels_pred",
[
(["a"] * 6, [1, 1, 0, 0, 1, 1]),
([1] * 6, [1, 1, 0, 0, 1, 1]),
([1, 1, 0, 0, 1, 1], ["a"] * 6),
([1, 1, 0, 0, 1, 1], [1] * 6),
(["a"] * 6, ["a"] * 6),
],
)
def test_mutual_info_score_positive_constant_label(labels_true, labels_pred):
# Check that MI = 0 when one or both labellings are constant
# non-regression test for #16355
assert mutual_info_score(labels_true, labels_pred) == 0
def test_check_clustering_error():
# Test warning message for continuous values
rng = np.random.RandomState(42)
noise = rng.rand(500)
wavelength = np.linspace(0.01, 1, 500) * 1e-6
msg = (
"Clustering metrics expects discrete values but received "
"continuous values for label, and continuous values for "
"target"
)
with pytest.warns(UserWarning, match=msg):
check_clusterings(wavelength, noise)
def test_pair_confusion_matrix_fully_dispersed():
# edge case: every element is its own cluster
N = 100
clustering1 = list(range(N))
clustering2 = clustering1
expected = np.array([[N * (N - 1), 0], [0, 0]])
assert_array_equal(pair_confusion_matrix(clustering1, clustering2), expected)
def test_pair_confusion_matrix_single_cluster():
# edge case: only one cluster
N = 100
clustering1 = np.zeros((N,))
clustering2 = clustering1
expected = np.array([[0, 0], [0, N * (N - 1)]])
assert_array_equal(pair_confusion_matrix(clustering1, clustering2), expected)
def test_pair_confusion_matrix():
# regular case: different non-trivial clusterings
n = 10
N = n**2
clustering1 = np.hstack([[i + 1] * n for i in range(n)])
clustering2 = np.hstack([[i + 1] * (n + 1) for i in range(n)])[:N]
# basic quadratic implementation
expected = np.zeros(shape=(2, 2), dtype=np.int64)
for i in range(len(clustering1)):
for j in range(len(clustering2)):
if i != j:
same_cluster_1 = int(clustering1[i] == clustering1[j])
same_cluster_2 = int(clustering2[i] == clustering2[j])
expected[same_cluster_1, same_cluster_2] += 1
assert_array_equal(pair_confusion_matrix(clustering1, clustering2), expected)
@pytest.mark.parametrize(
"clustering1, clustering2",
[(list(range(100)), list(range(100))), (np.zeros((100,)), np.zeros((100,)))],
)
def test_rand_score_edge_cases(clustering1, clustering2):
# edge case 1: every element is its own cluster
# edge case 2: only one cluster
assert_allclose(rand_score(clustering1, clustering2), 1.0)
def test_rand_score():
# regular case: different non-trivial clusterings
clustering1 = [0, 0, 0, 1, 1, 1]
clustering2 = [0, 1, 0, 1, 2, 2]
# pair confusion matrix
D11 = 2 * 2 # ordered pairs (1, 3), (5, 6)
D10 = 2 * 4 # ordered pairs (1, 2), (2, 3), (4, 5), (4, 6)
D01 = 2 * 1 # ordered pair (2, 4)
D00 = 5 * 6 - D11 - D01 - D10 # the remaining pairs
# rand score
expected_numerator = D00 + D11
expected_denominator = D00 + D01 + D10 + D11
expected = expected_numerator / expected_denominator
assert_allclose(rand_score(clustering1, clustering2), expected)
def test_adjusted_rand_score_overflow():
"""Check that large amount of data will not lead to overflow in
`adjusted_rand_score`.
Non-regression test for:
https://github.com/scikit-learn/scikit-learn/issues/20305
"""
rng = np.random.RandomState(0)
y_true = rng.randint(0, 2, 100_000, dtype=np.int8)
y_pred = rng.randint(0, 2, 100_000, dtype=np.int8)
with warnings.catch_warnings():
warnings.simplefilter("error", RuntimeWarning)
adjusted_rand_score(y_true, y_pred)
@pytest.mark.parametrize("average_method", ["min", "arithmetic", "geometric", "max"])
def test_normalized_mutual_info_score_bounded(average_method):
"""Check that nmi returns a score between 0 (included) and 1 (excluded
for non-perfect match)
Non-regression test for issue #13836
"""
labels1 = [0] * 469
labels2 = [1] + labels1[1:]
labels3 = [0, 1] + labels1[2:]
# labels1 is constant. The mutual info between labels1 and any other labelling is 0.
nmi = normalized_mutual_info_score(labels1, labels2, average_method=average_method)
assert nmi == 0
# non constant, non perfect matching labels
nmi = normalized_mutual_info_score(labels2, labels3, average_method=average_method)
assert 0 <= nmi < 1

View File

@ -0,0 +1,413 @@
import warnings
import numpy as np
import pytest
from numpy.testing import assert_allclose
from scipy.sparse import issparse
from sklearn import datasets
from sklearn.metrics import pairwise_distances
from sklearn.metrics.cluster import (
calinski_harabasz_score,
davies_bouldin_score,
silhouette_samples,
silhouette_score,
)
from sklearn.metrics.cluster._unsupervised import _silhouette_reduce
from sklearn.utils._testing import assert_array_equal
from sklearn.utils.fixes import (
CSC_CONTAINERS,
CSR_CONTAINERS,
DOK_CONTAINERS,
LIL_CONTAINERS,
)
@pytest.mark.parametrize(
"sparse_container",
[None] + CSR_CONTAINERS + CSC_CONTAINERS + DOK_CONTAINERS + LIL_CONTAINERS,
)
@pytest.mark.parametrize("sample_size", [None, "half"])
def test_silhouette(sparse_container, sample_size):
# Tests the Silhouette Coefficient.
dataset = datasets.load_iris()
X, y = dataset.data, dataset.target
if sparse_container is not None:
X = sparse_container(X)
sample_size = int(X.shape[0] / 2) if sample_size == "half" else sample_size
D = pairwise_distances(X, metric="euclidean")
# Given that the actual labels are used, we can assume that S would be positive.
score_precomputed = silhouette_score(
D, y, metric="precomputed", sample_size=sample_size, random_state=0
)
score_euclidean = silhouette_score(
X, y, metric="euclidean", sample_size=sample_size, random_state=0
)
assert score_precomputed > 0
assert score_euclidean > 0
assert score_precomputed == pytest.approx(score_euclidean)
def test_cluster_size_1():
# Assert Silhouette Coefficient == 0 when there is 1 sample in a cluster
# (cluster 0). We also test the case where there are identical samples
# as the only members of a cluster (cluster 2). To our knowledge, this case
# is not discussed in reference material, and we choose for it a sample
# score of 1.
X = [[0.0], [1.0], [1.0], [2.0], [3.0], [3.0]]
labels = np.array([0, 1, 1, 1, 2, 2])
# Cluster 0: 1 sample -> score of 0 by Rousseeuw's convention
# Cluster 1: intra-cluster = [.5, .5, 1]
# inter-cluster = [1, 1, 1]
# silhouette = [.5, .5, 0]
# Cluster 2: intra-cluster = [0, 0]
# inter-cluster = [arbitrary, arbitrary]
# silhouette = [1., 1.]
silhouette = silhouette_score(X, labels)
assert not np.isnan(silhouette)
ss = silhouette_samples(X, labels)
assert_array_equal(ss, [0, 0.5, 0.5, 0, 1, 1])
def test_silhouette_paper_example():
# Explicitly check per-sample results against Rousseeuw (1987)
# Data from Table 1
lower = [
5.58,
7.00,
6.50,
7.08,
7.00,
3.83,
4.83,
5.08,
8.17,
5.83,
2.17,
5.75,
6.67,
6.92,
4.92,
6.42,
5.00,
5.58,
6.00,
4.67,
6.42,
3.42,
5.50,
6.42,
6.42,
5.00,
3.92,
6.17,
2.50,
4.92,
6.25,
7.33,
4.50,
2.25,
6.33,
2.75,
6.08,
6.67,
4.25,
2.67,
6.00,
6.17,
6.17,
6.92,
6.17,
5.25,
6.83,
4.50,
3.75,
5.75,
5.42,
6.08,
5.83,
6.67,
3.67,
4.75,
3.00,
6.08,
6.67,
5.00,
5.58,
4.83,
6.17,
5.67,
6.50,
6.92,
]
D = np.zeros((12, 12))
D[np.tril_indices(12, -1)] = lower
D += D.T
names = [
"BEL",
"BRA",
"CHI",
"CUB",
"EGY",
"FRA",
"IND",
"ISR",
"USA",
"USS",
"YUG",
"ZAI",
]
# Data from Figure 2
labels1 = [1, 1, 2, 2, 1, 1, 2, 1, 1, 2, 2, 1]
expected1 = {
"USA": 0.43,
"BEL": 0.39,
"FRA": 0.35,
"ISR": 0.30,
"BRA": 0.22,
"EGY": 0.20,
"ZAI": 0.19,
"CUB": 0.40,
"USS": 0.34,
"CHI": 0.33,
"YUG": 0.26,
"IND": -0.04,
}
score1 = 0.28
# Data from Figure 3
labels2 = [1, 2, 3, 3, 1, 1, 2, 1, 1, 3, 3, 2]
expected2 = {
"USA": 0.47,
"FRA": 0.44,
"BEL": 0.42,
"ISR": 0.37,
"EGY": 0.02,
"ZAI": 0.28,
"BRA": 0.25,
"IND": 0.17,
"CUB": 0.48,
"USS": 0.44,
"YUG": 0.31,
"CHI": 0.31,
}
score2 = 0.33
for labels, expected, score in [
(labels1, expected1, score1),
(labels2, expected2, score2),
]:
expected = [expected[name] for name in names]
# we check to 2dp because that's what's in the paper
        assert silhouette_samples(
            D, np.array(labels), metric="precomputed"
        ) == pytest.approx(expected, abs=1e-2)
        assert silhouette_score(
            D, np.array(labels), metric="precomputed"
        ) == pytest.approx(score, abs=1e-2)
def test_correct_labelsize():
# Assert 1 < n_labels < n_samples
dataset = datasets.load_iris()
X = dataset.data
# n_labels = n_samples
y = np.arange(X.shape[0])
err_msg = (
r"Number of labels is %d\. Valid values are 2 "
r"to n_samples - 1 \(inclusive\)" % len(np.unique(y))
)
with pytest.raises(ValueError, match=err_msg):
silhouette_score(X, y)
# n_labels = 1
y = np.zeros(X.shape[0])
err_msg = (
r"Number of labels is %d\. Valid values are 2 "
r"to n_samples - 1 \(inclusive\)" % len(np.unique(y))
)
with pytest.raises(ValueError, match=err_msg):
silhouette_score(X, y)
def test_non_encoded_labels():
dataset = datasets.load_iris()
X = dataset.data
labels = dataset.target
assert silhouette_score(X, labels * 2 + 10) == silhouette_score(X, labels)
assert_array_equal(
silhouette_samples(X, labels * 2 + 10), silhouette_samples(X, labels)
)
def test_non_numpy_labels():
dataset = datasets.load_iris()
X = dataset.data
y = dataset.target
assert silhouette_score(list(X), list(y)) == silhouette_score(X, y)
@pytest.mark.parametrize("dtype", (np.float32, np.float64))
def test_silhouette_nonzero_diag(dtype):
# Make sure silhouette_samples requires diagonal to be zero.
# Non-regression test for #12178
# Construct a zero-diagonal matrix
dists = pairwise_distances(
np.array([[0.2, 0.1, 0.12, 1.34, 1.11, 1.6]], dtype=dtype).T
)
labels = [0, 0, 0, 1, 1, 1]
# small values on the diagonal are OK
dists[2][2] = np.finfo(dists.dtype).eps * 10
silhouette_samples(dists, labels, metric="precomputed")
# values bigger than eps * 100 are not
dists[2][2] = np.finfo(dists.dtype).eps * 1000
with pytest.raises(ValueError, match="contains non-zero"):
silhouette_samples(dists, labels, metric="precomputed")
@pytest.mark.parametrize(
"sparse_container",
CSC_CONTAINERS + CSR_CONTAINERS + DOK_CONTAINERS + LIL_CONTAINERS,
)
def test_silhouette_samples_precomputed_sparse(sparse_container):
"""Check that silhouette_samples works for sparse matrices correctly."""
X = np.array([[0.2, 0.1, 0.1, 0.2, 0.1, 1.6, 0.2, 0.1]], dtype=np.float32).T
y = [0, 0, 0, 0, 1, 1, 1, 1]
pdist_dense = pairwise_distances(X)
pdist_sparse = sparse_container(pdist_dense)
assert issparse(pdist_sparse)
output_with_sparse_input = silhouette_samples(pdist_sparse, y, metric="precomputed")
output_with_dense_input = silhouette_samples(pdist_dense, y, metric="precomputed")
assert_allclose(output_with_sparse_input, output_with_dense_input)
@pytest.mark.parametrize(
"sparse_container",
CSC_CONTAINERS + CSR_CONTAINERS + DOK_CONTAINERS + LIL_CONTAINERS,
)
def test_silhouette_samples_euclidean_sparse(sparse_container):
"""Check that silhouette_samples works for sparse matrices correctly."""
X = np.array([[0.2, 0.1, 0.1, 0.2, 0.1, 1.6, 0.2, 0.1]], dtype=np.float32).T
y = [0, 0, 0, 0, 1, 1, 1, 1]
pdist_dense = pairwise_distances(X)
pdist_sparse = sparse_container(pdist_dense)
assert issparse(pdist_sparse)
output_with_sparse_input = silhouette_samples(pdist_sparse, y)
output_with_dense_input = silhouette_samples(pdist_dense, y)
assert_allclose(output_with_sparse_input, output_with_dense_input)
@pytest.mark.parametrize(
"sparse_container", CSC_CONTAINERS + DOK_CONTAINERS + LIL_CONTAINERS
)
def test_silhouette_reduce(sparse_container):
"""Check for non-CSR input to private method `_silhouette_reduce`."""
X = np.array([[0.2, 0.1, 0.1, 0.2, 0.1, 1.6, 0.2, 0.1]], dtype=np.float32).T
pdist_dense = pairwise_distances(X)
pdist_sparse = sparse_container(pdist_dense)
y = [0, 0, 0, 0, 1, 1, 1, 1]
label_freqs = np.bincount(y)
with pytest.raises(
TypeError,
match="Expected CSR matrix. Please pass sparse matrix in CSR format.",
):
_silhouette_reduce(pdist_sparse, start=0, labels=y, label_freqs=label_freqs)
def assert_raises_on_only_one_label(func):
"""Assert message when there is only one label"""
rng = np.random.RandomState(seed=0)
with pytest.raises(ValueError, match="Number of labels is"):
func(rng.rand(10, 2), np.zeros(10))
def assert_raises_on_all_points_same_cluster(func):
"""Assert message when all point are in different clusters"""
rng = np.random.RandomState(seed=0)
with pytest.raises(ValueError, match="Number of labels is"):
func(rng.rand(10, 2), np.arange(10))
def test_calinski_harabasz_score():
assert_raises_on_only_one_label(calinski_harabasz_score)
assert_raises_on_all_points_same_cluster(calinski_harabasz_score)
# Assert the value is 1. when all samples are equal
assert 1.0 == calinski_harabasz_score(np.ones((10, 2)), [0] * 5 + [1] * 5)
# Assert the value is 0. when all the cluster means are equal
assert 0.0 == calinski_harabasz_score([[-1, -1], [1, 1]] * 10, [0] * 10 + [1] * 10)
# General case (with non numpy arrays)
X = (
[[0, 0], [1, 1]] * 5
+ [[3, 3], [4, 4]] * 5
+ [[0, 4], [1, 3]] * 5
+ [[3, 1], [4, 0]] * 5
)
labels = [0] * 10 + [1] * 10 + [2] * 10 + [3] * 10
    assert calinski_harabasz_score(X, labels) == pytest.approx(
        45 * (40 - 4) / (5 * (4 - 1))
    )
def test_davies_bouldin_score():
assert_raises_on_only_one_label(davies_bouldin_score)
assert_raises_on_all_points_same_cluster(davies_bouldin_score)
# Assert the value is 0. when all samples are equal
assert davies_bouldin_score(np.ones((10, 2)), [0] * 5 + [1] * 5) == pytest.approx(
0.0
)
# Assert the value is 0. when all the cluster means are equal
assert davies_bouldin_score(
[[-1, -1], [1, 1]] * 10, [0] * 10 + [1] * 10
) == pytest.approx(0.0)
# General case (with non numpy arrays)
X = (
[[0, 0], [1, 1]] * 5
+ [[3, 3], [4, 4]] * 5
+ [[0, 4], [1, 3]] * 5
+ [[3, 1], [4, 0]] * 5
)
labels = [0] * 10 + [1] * 10 + [2] * 10 + [3] * 10
    assert davies_bouldin_score(X, labels) == pytest.approx(2 * np.sqrt(0.5) / 3)
# Ensure divide by zero warning is not raised in general case
with warnings.catch_warnings():
warnings.simplefilter("error", RuntimeWarning)
davies_bouldin_score(X, labels)
# General case - some clusters have a single sample
X = [[0, 0], [2, 2], [3, 3], [5, 5]]
labels = [0, 0, 1, 2]
    assert davies_bouldin_score(X, labels) == pytest.approx((5.0 / 4) / 3)
def test_silhouette_score_integer_precomputed():
"""Check that silhouette_score works for precomputed metrics that are integers.
Non-regression test for #22107.
"""
result = silhouette_score(
[[0, 1, 2], [1, 0, 1], [2, 1, 0]], [0, 0, 1], metric="precomputed"
)
assert result == pytest.approx(1 / 6)
# non-zero on diagonal for ints raises an error
with pytest.raises(ValueError, match="contains non-zero"):
silhouette_score(
[[1, 1, 2], [1, 0, 1], [2, 1, 0]], [0, 0, 1], metric="precomputed"
)
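# For reference, the 1/6 above can be checked by hand from the definition in
# _unsupervised.py: sample 0 has a = 1 (distance to sample 1) and b = 2
# (distance to the single sample of the other cluster), giving (2 - 1) / 2 = 0.5;
# sample 1 has a = 1 and b = 1, giving 0; sample 2 is alone in its cluster, so
# its score is 0 by convention. The mean is 0.5 / 3 = 1 / 6.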