Reconnect moved files to the git repository
This commit is contained in:
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,272 @@
|
||||
# Minimum spanning tree single linkage implementation for hdbscan
|
||||
# Authors: Leland McInnes <leland.mcinnes@gmail.com>
|
||||
# Steve Astels <sastels@gmail.com>
|
||||
# Meekail Zain <zainmeekail@gmail.com>
|
||||
# Copyright (c) 2015, Leland McInnes
|
||||
# All rights reserved.
|
||||
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
|
||||
# 1. Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
|
||||
# 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
# may be used to endorse or promote products derived from this software without
|
||||
# specific prior written permission.
|
||||
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
cimport numpy as cnp
|
||||
from libc.float cimport DBL_MAX
|
||||
|
||||
import numpy as np
|
||||
from ...metrics._dist_metrics cimport DistanceMetric64
|
||||
from ...cluster._hierarchical_fast cimport UnionFind
|
||||
from ...cluster._hdbscan._tree cimport HIERARCHY_t
|
||||
from ...cluster._hdbscan._tree import HIERARCHY_dtype
|
||||
from ...utils._typedefs cimport intp_t, float64_t, int64_t, uint8_t
|
||||
|
||||
cnp.import_array()
|
||||
|
||||
cdef extern from "numpy/arrayobject.h":
|
||||
intp_t * PyArray_SHAPE(cnp.PyArrayObject *)
|
||||
|
||||
# Numpy structured dtype representing a single ordered edge in Prim's algorithm
MST_edge_dtype = np.dtype([
    ("current_node", np.int64),  # endpoint already inside the growing tree
    ("next_node", np.int64),     # endpoint newly attached by this edge
    ("distance", np.float64),    # mutual-reachability weight of the edge
])

# C-level mirror of `MST_edge_dtype`: field names, order, and widths must
# stay in sync so typed buffers can view arrays of that dtype directly.
# Packed shouldn't make a difference since they're all 8-byte quantities,
# but it's included just to be safe.
ctypedef packed struct MST_edge_t:
    int64_t current_node
    int64_t next_node
    float64_t distance
|
||||
|
||||
cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_mutual_reachability(
    cnp.ndarray[float64_t, ndim=2] mutual_reachability
):
    """Build the Minimum Spanning Tree (MST) of the mutual-reachability
    graph using Prim's algorithm.

    Parameters
    ----------
    mutual_reachability : ndarray of shape (n_samples, n_samples)
        Array of mutual-reachabilities between samples.

    Returns
    -------
    mst : ndarray of shape (n_samples - 1,), dtype=MST_edge_dtype
        The MST representation of the mutual-reachability graph,
        represented as a collection of edges.
    """
    cdef:
        # Note: ndarrays (rather than memoryviews) are used here so that
        # numpy boolean masking and fancy indexing are available below.
        cnp.ndarray[int64_t, ndim=1, mode='c'] out_of_tree
        cnp.ndarray[float64_t, ndim=1, mode='c'] reach_bound
        cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst

        cnp.ndarray[uint8_t, mode='c'] keep_mask

        int64_t n_samples = PyArray_SHAPE(<cnp.PyArrayObject*> mutual_reachability)[0]
        int64_t current_node, best_idx, new_node, i

    mst = np.empty(n_samples - 1, dtype=MST_edge_dtype)
    out_of_tree = np.arange(n_samples, dtype=np.int64)
    reach_bound = np.full(n_samples, fill_value=np.inf, dtype=np.float64)
    current_node = 0

    for i in range(0, n_samples - 1):
        # Remove the node just added to the tree from the candidate set.
        keep_mask = out_of_tree != current_node
        out_of_tree = out_of_tree[keep_mask]

        # Tighten each remaining node's best-known attachment cost using
        # its mutual reachability to the newly added node.
        reach_bound = np.minimum(
            reach_bound[keep_mask],
            mutual_reachability[current_node][out_of_tree],
        )

        # Greedily attach the cheapest remaining node to the tree.
        best_idx = np.argmin(reach_bound)
        new_node = out_of_tree[best_idx]
        mst[i].current_node = current_node
        mst[i].next_node = new_node
        mst[i].distance = reach_bound[best_idx]
        current_node = new_node

    return mst
|
||||
|
||||
|
||||
cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_data_matrix(
    const float64_t[:, ::1] raw_data,
    const float64_t[::1] core_distances,
    DistanceMetric64 dist_metric,
    float64_t alpha=1.0
):
    """Compute the Minimum Spanning Tree (MST) representation of the mutual-
    reachability graph generated from the provided `raw_data` and
    `core_distances` using Prim's algorithm.

    Parameters
    ----------
    raw_data : ndarray of shape (n_samples, n_features)
        Input array of data samples.

    core_distances : ndarray of shape (n_samples,)
        An array containing the core-distance calculated for each corresponding
        sample.

    dist_metric : DistanceMetric
        The distance metric to use when calculating pairwise distances for
        determining mutual-reachability.

    alpha : float, default=1.0
        Scaling factor: each pairwise distance is divided by `alpha` before
        being combined with the core distances.

    Returns
    -------
    mst : ndarray of shape (n_samples - 1,), dtype=MST_edge_dtype
        The MST representation of the mutual-reachability graph. The MST is
        represented as a collection of edges.
    """

    cdef:
        uint8_t[::1] in_tree
        # min_reachability[j]: best mutual-reachability found so far from any
        # in-tree node to the out-of-tree node j; current_sources[j] records
        # which in-tree node achieved that bound.
        float64_t[::1] min_reachability
        int64_t[::1] current_sources
        cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst

        int64_t current_node, source_node, new_node, next_node_source
        int64_t i, j, n_samples, num_features

        float64_t current_node_core_dist, new_reachability, mutual_reachability_distance
        float64_t next_node_min_reach, pair_distance, next_node_core_dist

    n_samples = raw_data.shape[0]
    num_features = raw_data.shape[1]

    mst = np.empty(n_samples - 1, dtype=MST_edge_dtype)

    in_tree = np.zeros(n_samples, dtype=np.uint8)
    min_reachability = np.full(n_samples, fill_value=np.inf, dtype=np.float64)
    current_sources = np.ones(n_samples, dtype=np.int64)

    # Prim's algorithm: grow the tree one node per iteration, starting at 0.
    current_node = 0

    for i in range(0, n_samples - 1):

        in_tree[current_node] = 1

        current_node_core_dist = core_distances[current_node]

        # Track the cheapest attachment found during this scan.
        new_reachability = DBL_MAX
        source_node = 0
        new_node = 0

        for j in range(n_samples):
            if in_tree[j]:
                continue

            next_node_min_reach = min_reachability[j]
            next_node_source = current_sources[j]

            pair_distance = dist_metric.dist(
                &raw_data[current_node, 0],
                &raw_data[j, 0],
                num_features
            )

            pair_distance /= alpha

            next_node_core_dist = core_distances[j]
            mutual_reachability_distance = max(
                current_node_core_dist,
                next_node_core_dist,
                pair_distance
            )
            # The distance via `current_node` is worse than j's cached bound:
            # the cached bound (and its recorded source) remains valid, but j
            # may still be the cheapest attachment this round.
            if mutual_reachability_distance > next_node_min_reach:
                if next_node_min_reach < new_reachability:
                    new_reachability = next_node_min_reach
                    source_node = next_node_source
                    new_node = j
                continue

            # Strictly better via `current_node`: update j's cached bound and
            # source, then see whether j is now the cheapest attachment.
            if mutual_reachability_distance < next_node_min_reach:
                min_reachability[j] = mutual_reachability_distance
                current_sources[j] = current_node
                if mutual_reachability_distance < new_reachability:
                    new_reachability = mutual_reachability_distance
                    source_node = current_node
                    new_node = j
            # Tie: keep the previously recorded source for j.
            else:
                if next_node_min_reach < new_reachability:
                    new_reachability = next_node_min_reach
                    source_node = next_node_source
                    new_node = j

        # Emit the cheapest edge found and continue growing from its endpoint.
        mst[i].current_node = source_node
        mst[i].next_node = new_node
        mst[i].distance = new_reachability
        current_node = new_node

    return mst
|
||||
|
||||
cpdef cnp.ndarray[HIERARCHY_t, ndim=1, mode="c"] make_single_linkage(const MST_edge_t[::1] mst):
    """Construct a single-linkage tree from an MST.

    Parameters
    ----------
    mst : ndarray of shape (n_samples - 1,), dtype=MST_edge_dtype
        The MST representation of the mutual-reachability graph,
        represented as a collection of edges.

    Returns
    -------
    single_linkage : ndarray of shape (n_samples - 1,), dtype=HIERARCHY_dtype
        The single-linkage tree (dendrogram) built from the MST. Each
        element of the array represents the following:

        - left node/cluster
        - right node/cluster
        - distance
        - new cluster size
    """
    cdef:
        cnp.ndarray[HIERARCHY_t, ndim=1, mode="c"] single_linkage

        # The MST holds one fewer edge than the number of samples.
        int64_t n_samples = mst.shape[0] + 1
        intp_t left_cluster, right_cluster
        int64_t endpoint_a, endpoint_b, i
        float64_t edge_distance
        UnionFind U = UnionFind(n_samples)

    single_linkage = np.zeros(n_samples - 1, dtype=HIERARCHY_dtype)

    for i in range(n_samples - 1):

        endpoint_a = mst[i].current_node
        endpoint_b = mst[i].next_node
        edge_distance = mst[i].distance

        # Resolve each endpoint to the cluster currently containing it.
        left_cluster = U.fast_find(endpoint_a)
        right_cluster = U.fast_find(endpoint_b)

        single_linkage[i].left_node = left_cluster
        single_linkage[i].right_node = right_cluster
        single_linkage[i].value = edge_distance
        single_linkage[i].cluster_size = U.size[left_cluster] + U.size[right_cluster]

        # Merge the two clusters joined by this edge.
        U.union(left_cluster, right_cluster)

    return single_linkage
|
||||
Binary file not shown.
@ -0,0 +1,212 @@
|
||||
# mutual reachability distance computations
|
||||
# Authors: Leland McInnes <leland.mcinnes@gmail.com>
|
||||
# Meekail Zain <zainmeekail@gmail.com>
|
||||
# Guillaume Lemaitre <g.lemaitre58@gmail.com>
|
||||
# Copyright (c) 2015, Leland McInnes
|
||||
# All rights reserved.
|
||||
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
|
||||
# 1. Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
|
||||
# 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
# may be used to endorse or promote products derived from this software without
|
||||
# specific prior written permission.
|
||||
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
cimport numpy as cnp
|
||||
|
||||
import numpy as np
|
||||
from scipy.sparse import issparse
|
||||
from cython cimport floating, integral
|
||||
from libc.math cimport isfinite, INFINITY
|
||||
from ...utils._typedefs cimport intp_t
|
||||
cnp.import_array()
|
||||
|
||||
|
||||
def mutual_reachability_graph(
    distance_matrix, min_samples=5, max_distance=0.0
):
    """Compute the weighted adjacency matrix of the mutual reachability graph.

    The mutual reachability distance used to build the graph is defined as::

        max(d_core(x_p), d_core(x_q), d(x_p, x_q))

    and the core distance `d_core` is defined as the distance between a point
    `x_p` and its k-th nearest neighbor.

    Note that all computations are done in-place.

    Parameters
    ----------
    distance_matrix : {ndarray, sparse matrix} of shape (n_samples, n_samples)
        Array of distances between samples. If sparse, the array must be in
        `CSR` format.

    min_samples : int, default=5
        The parameter `k` used to calculate the distance between a point
        `x_p` and its k-th nearest neighbor.

    max_distance : float, default=0.0
        The distance which `np.inf` is replaced with. When the true mutual-
        reachability distance is measured to be infinite, it is instead
        truncated to `max_distance`. Only used when `distance_matrix` is a
        sparse matrix.

    Returns
    -------
    mutual_reachability_graph : {ndarray, sparse matrix} of shape \
            (n_samples, n_samples)
        Weighted adjacency matrix of the mutual reachability graph.

    References
    ----------
    .. [1] Campello, R. J., Moulavi, D., & Sander, J. (2013, April).
       Density-based clustering based on hierarchical density estimates.
       In Pacific-Asia Conference on Knowledge Discovery and Data Mining
       (pp. 160-172). Springer Berlin Heidelberg.
    """
    # Core distance of a sample is the k-th smallest entry of its row,
    # i.e. the entry at this (0-based) rank after partitioning.
    neighbor_rank = min_samples - 1

    if not issparse(distance_matrix):
        _dense_mutual_reachability_graph(
            distance_matrix, further_neighbor_idx=neighbor_rank
        )
        return distance_matrix

    if distance_matrix.format != "csr":
        raise ValueError(
            "Only sparse CSR matrices are supported for `distance_matrix`."
        )
    _sparse_mutual_reachability_graph(
        distance_matrix.data,
        distance_matrix.indices,
        distance_matrix.indptr,
        distance_matrix.shape[0],
        further_neighbor_idx=neighbor_rank,
        max_distance=max_distance,
    )
    return distance_matrix
|
||||
|
||||
|
||||
def _dense_mutual_reachability_graph(
    floating[:, :] distance_matrix,
    intp_t further_neighbor_idx,
):
    """Dense, in-place implementation of the mutual reachability graph.

    The computation is done in-place, i.e. the distance matrix is modified
    directly.

    Parameters
    ----------
    distance_matrix : ndarray of shape (n_samples, n_samples)
        Array of distances between samples.

    further_neighbor_idx : int
        The index of the furthest neighbor to use to define the core distances.
    """
    cdef:
        intp_t row, col, n_samples = distance_matrix.shape[0]
        floating[::1] core_distances

    # Core distance of each sample is its distance to the k-th nearest
    # neighbor. We assume the distance matrix is symmetric and partition
    # every row, mirroring the sparse (CSR) implementation which also works
    # row by row.
    core_distances = np.ascontiguousarray(
        np.partition(
            distance_matrix, further_neighbor_idx, axis=1
        )[:, further_neighbor_idx]
    )

    with nogil:
        # TODO: Update w/ prange with thread count based on
        # _openmp_effective_n_threads
        for row in range(n_samples):
            for col in range(n_samples):
                distance_matrix[row, col] = max(
                    core_distances[row],
                    core_distances[col],
                    distance_matrix[row, col],
                )
|
||||
|
||||
|
||||
def _sparse_mutual_reachability_graph(
    cnp.ndarray[floating, ndim=1, mode="c"] data,
    cnp.ndarray[integral, ndim=1, mode="c"] indices,
    cnp.ndarray[integral, ndim=1, mode="c"] indptr,
    intp_t n_samples,
    intp_t further_neighbor_idx,
    floating max_distance,
):
    """Sparse implementation of mutual reachability graph.

    The computation is done in-place on the `data` array of a CSR matrix of
    shape (n_samples, n_samples); `data`, `indices`, and `indptr` are the
    standard CSR buffers of that matrix.

    Parameters
    ----------
    data : ndarray of shape (n_nonzeros,)
        CSR `data` array holding the distances between samples; overwritten
        in-place with mutual-reachability distances.

    indices : ndarray of shape (n_nonzeros,)
        CSR column-index array.

    indptr : ndarray of shape (n_samples + 1,)
        CSR row-pointer array.

    n_samples : int
        Number of rows (= columns) of the distance matrix.

    further_neighbor_idx : int
        The index of the furthest neighbor to use to define the core distances.

    max_distance : float
        The distance which `np.inf` is replaced with. When the true mutual-
        reachability distance is measured to be infinite, it is instead
        truncated to `max_distance`; if `max_distance <= 0` the entry is left
        unchanged.
    """
    cdef:
        integral i, col_ind, row_ind
        floating mutual_reachibility_distance
        floating[:] core_distances
        floating[:] row_data

    # Resolve the fused floating type to the matching numpy dtype.
    if floating is float:
        dtype = np.float32
    else:
        dtype = np.float64

    core_distances = np.empty(n_samples, dtype=dtype)

    for i in range(n_samples):
        row_data = data[indptr[i]:indptr[i + 1]]
        # A row with too few stored entries cannot reach its k-th neighbor,
        # so its core distance is infinite.
        if further_neighbor_idx < row_data.size:
            core_distances[i] = np.partition(
                row_data, further_neighbor_idx
            )[further_neighbor_idx]
        else:
            core_distances[i] = INFINITY

    with nogil:
        for row_ind in range(n_samples):
            for i in range(indptr[row_ind], indptr[row_ind + 1]):
                col_ind = indices[i]
                mutual_reachibility_distance = max(
                    core_distances[row_ind], core_distances[col_ind], data[i]
                )
                if isfinite(mutual_reachibility_distance):
                    data[i] = mutual_reachibility_distance
                elif max_distance > 0:
                    data[i] = max_distance
|
||||
Binary file not shown.
@ -0,0 +1,49 @@
|
||||
# Copyright (c) 2015, Leland McInnes
|
||||
# All rights reserved.
|
||||
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
|
||||
# 1. Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
|
||||
# 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
# may be used to endorse or promote products derived from this software without
|
||||
# specific prior written permission.
|
||||
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
from ...utils._typedefs cimport intp_t, float64_t, uint8_t
|
||||
cimport numpy as cnp
|
||||
|
||||
# This corresponds to the scipy.cluster.hierarchy format: one merge step per
# row, recording the two merged nodes/clusters, the merge distance, and the
# size of the newly formed cluster.
ctypedef packed struct HIERARCHY_t:
    intp_t left_node
    intp_t right_node
    float64_t value        # merge distance
    intp_t cluster_size    # size of the cluster created by this merge

# Effectively an edgelist encoding a parent/child pair, along with a value and
# the corresponding cluster_size in each row providing a tree structure.
ctypedef packed struct CONDENSED_t:
    intp_t parent
    intp_t child
    float64_t value
    intp_t cluster_size

# Direct access to an ndarray's shape without the Python-level attribute.
cdef extern from "numpy/arrayobject.h":
    intp_t * PyArray_SHAPE(cnp.PyArrayObject *)
|
||||
@ -0,0 +1,799 @@
|
||||
# Tree handling (condensing, finding stable clusters) for hdbscan
|
||||
# Authors: Leland McInnes
|
||||
# Copyright (c) 2015, Leland McInnes
|
||||
# All rights reserved.
|
||||
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
|
||||
# 1. Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
|
||||
# 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
# may be used to endorse or promote products derived from this software without
|
||||
# specific prior written permission.
|
||||
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
cimport numpy as cnp
|
||||
from libc.math cimport isinf
|
||||
import cython
|
||||
|
||||
import numpy as np
|
||||
|
||||
cnp.import_array()
|
||||
|
||||
cdef extern from "numpy/arrayobject.h":
|
||||
intp_t * PyArray_SHAPE(cnp.PyArrayObject *)
|
||||
|
||||
# Sentinel values: INFTY for an unbounded lambda, NOISE for unclustered points.
cdef cnp.float64_t INFTY = np.inf
cdef cnp.intp_t NOISE = -1

# Structured dtype mirroring the HIERARCHY_t struct (scipy.cluster.hierarchy
# style row: merged nodes, merge distance, new cluster size).
HIERARCHY_dtype = np.dtype([
    ("left_node", np.intp),
    ("right_node", np.intp),
    ("value", np.float64),
    ("cluster_size", np.intp),
])

# Structured dtype mirroring the CONDENSED_t struct: one parent/child edge of
# the condensed tree per row.
CONDENSED_dtype = np.dtype([
    ("parent", np.intp),
    ("child", np.intp),
    ("value", np.float64),
    ("cluster_size", np.intp),
])
|
||||
|
||||
cpdef tuple tree_to_labels(
    const HIERARCHY_t[::1] single_linkage_tree,
    cnp.intp_t min_cluster_size=10,
    cluster_selection_method="eom",
    bint allow_single_cluster=False,
    cnp.float64_t cluster_selection_epsilon=0.0,
    max_cluster_size=None,
):
    """Extract flat clustering results from a single-linkage tree.

    Condenses the tree according to ``min_cluster_size``, scores the
    condensed clusters by stability, and selects clusters with
    ``cluster_selection_method``, returning ``(labels, probabilities)``.
    """
    cdef:
        cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree
        cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] labels
        cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] probabilities
        dict stability

    condensed_tree = _condense_tree(single_linkage_tree, min_cluster_size)
    stability = _compute_stability(condensed_tree)
    labels, probabilities = _get_clusters(
        condensed_tree,
        stability,
        cluster_selection_method,
        allow_single_cluster,
        cluster_selection_epsilon,
        max_cluster_size,
    )

    return (labels, probabilities)
|
||||
|
||||
cdef list bfs_from_hierarchy(
    const HIERARCHY_t[::1] hierarchy,
    cnp.intp_t bfs_root
):
    """
    Perform a breadth first search on a tree in scipy hclust format,
    returning the visited node ids level by level starting at ``bfs_root``.
    """

    cdef list frontier, internal, visited
    cdef cnp.intp_t n_samples = hierarchy.shape[0] + 1
    cdef cnp.intp_t node

    frontier = [bfs_root]
    visited = []

    while frontier:
        visited.extend(frontier)
        # Leaves are numbered 0..n_samples-1; internal (merge) node i is
        # formed by the union of hierarchy[i - n_samples].left_node and
        # hierarchy[i - n_samples].right_node.
        internal = [
            x - n_samples
            for x in frontier
            if x >= n_samples
        ]
        frontier = []
        for node in internal:
            frontier.append(hierarchy[node].left_node)
            frontier.append(hierarchy[node].right_node)

    return visited
|
||||
|
||||
|
||||
cpdef cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] _condense_tree(
    const HIERARCHY_t[::1] hierarchy,
    cnp.intp_t min_cluster_size=10
):
    """Condense a tree according to a minimum cluster size. This is akin
    to the runt pruning procedure of Stuetzle. The result is a much simpler
    tree that is easier to visualize. We include extra information on the
    lambda value at which individual points depart clusters for later
    analysis and computation.

    Parameters
    ----------
    hierarchy : ndarray of shape (n_samples,), dtype=HIERARCHY_dtype
        A single linkage hierarchy in scipy.cluster.hierarchy format.

    min_cluster_size : int, optional (default 10)
        The minimum size of clusters to consider. Clusters smaller than this
        are pruned from the tree.

    Returns
    -------
    condensed_tree : ndarray of shape (n_samples,), dtype=CONDENSED_dtype
        Effectively an edgelist encoding a parent/child pair, along with a
        value and the corresponding cluster_size in each row providing a tree
        structure.
    """

    cdef:
        # Nodes 0..n_samples-1 are leaves and n_samples..2*n_samples-2 are
        # merges, so the root (last merge) has id 2 * (n_samples - 1).
        cnp.intp_t root = 2 * hierarchy.shape[0]
        cnp.intp_t n_samples = hierarchy.shape[0] + 1
        # Condensed-cluster labels start at n_samples (the root's label).
        cnp.intp_t next_label = n_samples + 1
        list result_list, node_list = bfs_from_hierarchy(hierarchy, root)

        # relabel[node]: condensed-tree label assigned to original node id.
        cnp.intp_t[::1] relabel
        # ignore[node]: node already emitted as part of a pruned subtree.
        cnp.uint8_t[::1] ignore

        cnp.intp_t node, sub_node, left, right
        cnp.float64_t lambda_value, distance
        cnp.intp_t left_count, right_count
        HIERARCHY_t children

    relabel = np.empty(root + 1, dtype=np.intp)
    relabel[root] = n_samples
    result_list = []
    ignore = np.zeros(len(node_list), dtype=bool)

    # Process merges top-down (BFS order) so a parent's label exists before
    # its children are visited.
    for node in node_list:
        if ignore[node] or node < n_samples:
            continue

        children = hierarchy[node - n_samples]
        left = children.left_node
        right = children.right_node
        distance = children.value
        # lambda = 1 / distance; a zero-distance merge gets infinite lambda.
        if distance > 0.0:
            lambda_value = 1.0 / distance
        else:
            lambda_value = INFTY

        # A child that is itself a merge carries its cluster size; a leaf
        # counts as a single point.
        if left >= n_samples:
            left_count = hierarchy[left - n_samples].cluster_size
        else:
            left_count = 1

        if right >= n_samples:
            right_count = <cnp.intp_t> hierarchy[right - n_samples].cluster_size
        else:
            right_count = 1

        # Both children are large enough: a true split, each child becomes a
        # new condensed cluster.
        if left_count >= min_cluster_size and right_count >= min_cluster_size:
            relabel[left] = next_label
            next_label += 1
            result_list.append(
                (relabel[node], relabel[left], lambda_value, left_count)
            )

            relabel[right] = next_label
            next_label += 1
            result_list.append(
                (relabel[node], relabel[right], lambda_value, right_count)
            )

        # Both children too small: the cluster dissolves here; emit every
        # leaf under both children as a point leaving at this lambda.
        elif left_count < min_cluster_size and right_count < min_cluster_size:
            for sub_node in bfs_from_hierarchy(hierarchy, left):
                if sub_node < n_samples:
                    result_list.append(
                        (relabel[node], sub_node, lambda_value, 1)
                    )
                ignore[sub_node] = True

            for sub_node in bfs_from_hierarchy(hierarchy, right):
                if sub_node < n_samples:
                    result_list.append(
                        (relabel[node], sub_node, lambda_value, 1)
                    )
                ignore[sub_node] = True

        # Only the left child is a runt: the right child continues the
        # parent cluster; the left child's leaves fall out at this lambda.
        elif left_count < min_cluster_size:
            relabel[right] = relabel[node]
            for sub_node in bfs_from_hierarchy(hierarchy, left):
                if sub_node < n_samples:
                    result_list.append(
                        (relabel[node], sub_node, lambda_value, 1)
                    )
                ignore[sub_node] = True

        # Only the right child is a runt: symmetric to the case above.
        else:
            relabel[left] = relabel[node]
            for sub_node in bfs_from_hierarchy(hierarchy, right):
                if sub_node < n_samples:
                    result_list.append(
                        (relabel[node], sub_node, lambda_value, 1)
                    )
                ignore[sub_node] = True

    return np.array(result_list, dtype=CONDENSED_dtype)
|
||||
|
||||
|
||||
cdef dict _compute_stability(
    cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree
):
    """Return {cluster_label: stability} for every cluster in the condensed
    tree, where stability is the sum over the cluster's children of
    (lambda_child - lambda_birth) * cluster_size.
    """

    cdef:
        cnp.float64_t[::1] result, births
        cnp.intp_t[:] parents = condensed_tree['parent']

        cnp.intp_t parent, cluster_size, result_index, idx
        cnp.float64_t lambda_val
        CONDENSED_t condensed_node
        cnp.intp_t largest_child = condensed_tree['child'].max()
        # Smallest parent label — presumably the root cluster (labels are
        # assigned from n_samples upward in _condense_tree).
        cnp.intp_t smallest_cluster = np.min(parents)
        cnp.intp_t num_clusters = np.max(parents) - smallest_cluster + 1
        dict stability_dict = {}

    # Ensure `births` is large enough to be indexed by any cluster label.
    largest_child = max(largest_child, smallest_cluster)
    births = np.full(largest_child + 1, np.nan, dtype=np.float64)

    # births[c]: the lambda at which node/cluster c appears as a child,
    # i.e. the lambda at which it is created.
    for idx in range(PyArray_SHAPE(<cnp.PyArrayObject*> condensed_tree)[0]):
        condensed_node = condensed_tree[idx]
        births[condensed_node.child] = condensed_node.value

    # The root cluster is never anyone's child; it is born at lambda = 0.
    births[smallest_cluster] = 0.0

    # Accumulate each edge's contribution to its parent's stability.
    result = np.zeros(num_clusters, dtype=np.float64)
    for idx in range(PyArray_SHAPE(<cnp.PyArrayObject*> condensed_tree)[0]):
        condensed_node = condensed_tree[idx]
        parent = condensed_node.parent
        lambda_val = condensed_node.value
        cluster_size = condensed_node.cluster_size

        result_index = parent - smallest_cluster
        result[result_index] += (lambda_val - births[parent]) * cluster_size

    # Re-key the dense result array by the original cluster labels.
    for idx in range(num_clusters):
        stability_dict[idx + smallest_cluster] = result[idx]

    return stability_dict
|
||||
|
||||
|
||||
cdef list bfs_from_cluster_tree(
    cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree,
    cnp.intp_t bfs_root
):
    """Breadth-first traversal of the condensed (cluster) tree starting at
    ``bfs_root``; returns the visited node ids as a list."""

    cdef:
        list visited = []
        cnp.ndarray[cnp.intp_t, ndim=1] frontier = (
            np.array([bfs_root], dtype=np.intp)
        )
        cnp.ndarray[cnp.intp_t, ndim=1] children = condensed_tree['child']
        cnp.intp_t[:] parents = condensed_tree['parent']

    while frontier.shape[0] > 0:
        visited.extend(frontier.tolist())
        # Next level: every child whose parent is in the current frontier.
        frontier = children[np.isin(parents, frontier)]

    return visited
|
||||
|
||||
|
||||
cdef cnp.float64_t[::1] max_lambdas(cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree):
    """Return, for each parent cluster, the largest lambda among its edges
    (its "death" lambda). Rows sharing a parent are assumed contiguous."""

    cdef:
        cnp.intp_t row, this_parent, active_parent
        cnp.float64_t value, running_max
        cnp.float64_t[::1] deaths
        cnp.intp_t largest_parent = condensed_tree['parent'].max()

    deaths = np.zeros(largest_parent + 1, dtype=np.float64)
    # Seed the streaming group-max with the first row.
    active_parent = condensed_tree[0].parent
    running_max = condensed_tree[0].value

    for row in range(1, PyArray_SHAPE(<cnp.PyArrayObject*> condensed_tree)[0]):
        this_parent = condensed_tree[row].parent
        value = condensed_tree[row].value

        if this_parent == active_parent:
            if value > running_max:
                running_max = value
        else:
            # Parent changed: flush the finished group's max and restart.
            deaths[active_parent] = running_max
            active_parent = this_parent
            running_max = value

    deaths[active_parent] = running_max  # value for last parent
    return deaths
|
||||
|
||||
|
||||
@cython.final
cdef class TreeUnionFind:
    """Disjoint-set (union-find) structure over integer node ids.

    ``data[:, 0]`` holds each node's parent pointer and ``data[:, 1]`` its
    rank (for union by rank); ``find`` applies path compression.
    """

    # data[idx, 0]: parent pointer of idx; data[idx, 1]: rank of idx.
    cdef cnp.intp_t[:, ::1] data
    # is_component[idx] is cleared once idx is known not to be a root.
    cdef cnp.uint8_t[::1] is_component

    def __init__(self, size):
        """Create ``size`` singleton components, each its own parent, rank 0."""
        cdef cnp.intp_t idx
        self.data = np.zeros((size, 2), dtype=np.intp)
        for idx in range(size):
            self.data[idx, 0] = idx
        self.is_component = np.ones(size, dtype=np.uint8)

    cdef void union(self, cnp.intp_t x, cnp.intp_t y):
        """Merge the components containing ``x`` and ``y`` (union by rank)."""
        cdef cnp.intp_t x_root = self.find(x)
        cdef cnp.intp_t y_root = self.find(y)

        # Attach the lower-rank root under the higher-rank one; on a tie,
        # y_root goes under x_root and x_root's rank increases.
        if self.data[x_root, 1] < self.data[y_root, 1]:
            self.data[x_root, 0] = y_root
        elif self.data[x_root, 1] > self.data[y_root, 1]:
            self.data[y_root, 0] = x_root
        else:
            self.data[y_root, 0] = x_root
            self.data[x_root, 1] += 1
        return

    cdef cnp.intp_t find(self, cnp.intp_t x):
        """Return the root of ``x``'s component, compressing the path."""
        if self.data[x, 0] != x:
            # Point x directly at its root and mark it as a non-root node.
            self.data[x, 0] = self.find(self.data[x, 0])
            self.is_component[x] = False
        return self.data[x, 0]
|
||||
|
||||
|
||||
cpdef cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] labelling_at_cut(
    const HIERARCHY_t[::1] linkage,
    cnp.float64_t cut,
    cnp.intp_t min_cluster_size
):
    """Given a single linkage tree and a cut value, return the
    vector of cluster labels at that cut value. This is useful
    for Robust Single Linkage, and extracting DBSCAN results
    from a single HDBSCAN run.

    Parameters
    ----------
    linkage : ndarray of shape (n_samples,), dtype=HIERARCHY_dtype
        The single linkage tree in scipy.cluster.hierarchy format.

    cut : double
        The cut value at which to find clusters.

    min_cluster_size : int
        The minimum cluster size; clusters below this size at
        the cut will be considered noise.

    Returns
    -------
    labels : ndarray of shape (n_samples,)
        The cluster labels for each point in the data set;
        a label of -1 denotes a noise assignment.
    """

    cdef:
        cnp.intp_t n, cluster, root, n_samples, cluster_label
        cnp.intp_t[::1] unique_labels, cluster_size
        cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] result
        TreeUnionFind union_find
        dict cluster_label_map
        HIERARCHY_t node

    # A linkage over n_samples points has n_samples - 1 merge rows, so node
    # ids run from 0 (first sample) to 2 * (n_samples - 1) (final merge).
    root = 2 * linkage.shape[0]
    n_samples = root // 2 + 1
    result = np.empty(n_samples, dtype=np.intp)
    union_find = TreeUnionFind(root + 1)

    # Replay only the merges that happen below the cut; merge row i creates
    # the internal node with id n_samples + i.
    cluster = n_samples
    for node in linkage:
        if node.value < cut:
            union_find.union(node.left_node, cluster)
            union_find.union(node.right_node, cluster)
        cluster += 1

    # Provisionally label each sample with the root of its component, while
    # counting component sizes for the min_cluster_size filter below.
    cluster_size = np.zeros(cluster, dtype=np.intp)
    for n in range(n_samples):
        cluster = union_find.find(n)
        cluster_size[cluster] += 1
        result[n] = cluster

    # Re-map component roots onto consecutive labels starting at 0, sending
    # undersized components to NOISE.
    cluster_label_map = {-1: NOISE}
    cluster_label = 0
    unique_labels = np.unique(result)

    for cluster in unique_labels:
        if cluster_size[cluster] < min_cluster_size:
            cluster_label_map[cluster] = NOISE
        else:
            cluster_label_map[cluster] = cluster_label
            cluster_label += 1

    for n in range(n_samples):
        result[n] = cluster_label_map[result[n]]

    return result
|
||||
|
||||
|
||||
cpdef cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] _do_labelling(
    cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree,
    set clusters,
    dict cluster_label_map,
    cnp.intp_t allow_single_cluster,
    cnp.float64_t cluster_selection_epsilon
):
    """Given a condensed tree, clusters and a labeling map for the clusters,
    return an array containing the labels of each point based on cluster
    membership. Note that this is where points may be marked as noisy
    outliers. The determination of some points as noise in large,
    single-cluster datasets is controlled by the `allow_single_cluster` and
    `cluster_selection_epsilon` parameters.

    Parameters
    ----------
    condensed_tree : ndarray of shape (n_samples,), dtype=CONDENSED_dtype
        Effectively an edgelist encoding a parent/child pair, along with a
        value and the corresponding cluster_size in each row providing a tree
        structure.

    clusters : set
        The set of nodes corresponding to identified clusters. These node
        values should be the same as those present in `condensed_tree`.

    cluster_label_map : dict
        A mapping from the node values present in `clusters` to the labels
        which will be returned.

    allow_single_cluster : int
        Whether the root itself may be reported as a cluster (used when the
        whole dataset forms one cluster).

    cluster_selection_epsilon : double
        Distance threshold; nonzero values set a fixed lambda cutoff
        (1 / epsilon) for keeping points in the single root cluster.

    Returns
    -------
    labels : ndarray of shape (n_samples,)
        The cluster labels for each point in the data set;
        a label of -1 denotes a noise assignment.
    """

    cdef:
        cnp.intp_t root_cluster
        cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] result
        cnp.ndarray[cnp.intp_t, ndim=1] parent_array, child_array
        cnp.ndarray[cnp.float64_t, ndim=1] lambda_array
        TreeUnionFind union_find
        cnp.intp_t n, parent, child, cluster
        cnp.float64_t threshold

    child_array = condensed_tree['child']
    parent_array = condensed_tree['parent']
    lambda_array = condensed_tree['value']

    # The smallest parent id is the root; sample ids occupy [0, root_cluster).
    root_cluster = np.min(parent_array)
    result = np.empty(root_cluster, dtype=np.intp)
    union_find = TreeUnionFind(np.max(parent_array) + 1)

    # Merge every child into its parent's component, except selected cluster
    # nodes, which stay separate so their members remain identifiable.
    for n in range(PyArray_SHAPE(<cnp.PyArrayObject*> condensed_tree)[0]):
        child = child_array[n]
        parent = parent_array[n]
        if child not in clusters:
            union_find.union(parent, child)

    for n in range(root_cluster):
        cluster = union_find.find(n)
        label = NOISE
        if cluster != root_cluster:
            # The point collapsed into a selected cluster node.
            label = cluster_label_map[cluster]
        elif len(clusters) == 1 and allow_single_cluster:
            # There can only be one edge with this particular child hence this
            # expression extracts a unique, scalar lambda value.
            parent_lambda = lambda_array[child_array == n]
            if cluster_selection_epsilon != 0.0:
                # Fixed cutoff: lambda = 1 / epsilon (lambda is 1/distance).
                threshold = 1 / cluster_selection_epsilon
            else:
                # The threshold should be calculated per-sample based on the
                # largest lambda of any sibling node.
                threshold = lambda_array[parent_array == cluster].max()
            if parent_lambda >= threshold:
                label = cluster_label_map[cluster]

        result[n] = label

    return result
|
||||
|
||||
|
||||
cdef cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] get_probabilities(
    cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree,
    dict cluster_map,
    cnp.intp_t[::1] labels
):
    """Compute per-sample cluster membership strengths in [0, 1].

    A sample's probability is its lambda value normalised by the "death"
    lambda of its assigned cluster (see `max_lambdas`); noise samples
    (label -1) keep the initial value 0.0.

    Parameters
    ----------
    condensed_tree : ndarray of shape (n_samples,), dtype=CONDENSED_dtype
        Parent/child edge list with lambda values and cluster sizes.

    cluster_map : dict
        Mapping from output label to the cluster's node id in the tree.

    labels : ndarray of shape (n_samples,)
        Per-sample labels as produced by `_do_labelling` (-1 denotes noise).
    """

    cdef:
        cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] result
        cnp.float64_t[:] lambda_array
        cnp.float64_t[::1] deaths
        cnp.intp_t[:] child_array, parent_array
        cnp.intp_t root_cluster, n, point, cluster_num, cluster
        cnp.float64_t max_lambda, lambda_val

    child_array = condensed_tree['child']
    parent_array = condensed_tree['parent']
    lambda_array = condensed_tree['value']

    result = np.zeros(labels.shape[0])
    deaths = max_lambdas(condensed_tree)
    root_cluster = np.min(parent_array)

    for n in range(PyArray_SHAPE(<cnp.PyArrayObject*> condensed_tree)[0]):
        point = child_array[n]
        if point >= root_cluster:
            # Child is an internal cluster node, not a sample row.
            continue

        cluster_num = labels[point]
        if cluster_num == -1:
            # Noise points keep probability 0.0.
            continue

        cluster = cluster_map[cluster_num]
        max_lambda = deaths[cluster]
        if max_lambda == 0.0 or isinf(lambda_array[n]):
            # Degenerate death lambda or infinite lambda: fully confident.
            result[point] = 1.0
        else:
            # Clamp to the death lambda so the ratio stays within [0, 1].
            lambda_val = min(lambda_array[n], max_lambda)
            result[point] = lambda_val / max_lambda

    return result
|
||||
|
||||
|
||||
cpdef list recurse_leaf_dfs(
    cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] cluster_tree,
    cnp.intp_t current_node
):
    """Collect the leaf nodes of the subtree rooted at ``current_node``."""
    cdef cnp.intp_t[:] children
    cdef cnp.intp_t child
    cdef list leaves

    children = cluster_tree[cluster_tree['parent'] == current_node]['child']
    if children.shape[0] == 0:
        # No children: the node is itself a leaf.
        return [current_node]
    leaves = []
    for child in children:
        leaves.extend(recurse_leaf_dfs(cluster_tree, child))
    return leaves
|
||||
|
||||
|
||||
cpdef list get_cluster_tree_leaves(cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] cluster_tree):
    """Return every leaf node of ``cluster_tree``; [] for an empty tree."""
    cdef cnp.intp_t root_node
    if PyArray_SHAPE(<cnp.PyArrayObject*> cluster_tree)[0] > 0:
        # The root is the smallest parent id; descend from there.
        root_node = cluster_tree['parent'].min()
        return recurse_leaf_dfs(cluster_tree, root_node)
    return []
|
||||
|
||||
cdef cnp.intp_t traverse_upwards(
    cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] cluster_tree,
    cnp.float64_t cluster_selection_epsilon,
    cnp.intp_t leaf,
    cnp.intp_t allow_single_cluster
):
    """Walk from ``leaf`` towards the root, returning the first ancestor
    whose epsilon (1 / lambda) exceeds ``cluster_selection_epsilon``.

    If the walk reaches the root, return the root when
    ``allow_single_cluster`` is set, otherwise the last node visited
    before the root.
    """
    cdef cnp.intp_t root, node, parent
    cdef cnp.float64_t parent_eps

    root = cluster_tree['parent'].min()
    node = leaf
    # Iterative form of the upward traversal.
    while True:
        parent = cluster_tree[cluster_tree['child'] == node]['parent']
        if parent == root:
            if allow_single_cluster:
                return parent
            return node  # return node closest to root

        # Lambda values are 1/distance, so epsilon of the parent is 1/lambda.
        parent_eps = 1 / cluster_tree[cluster_tree['child'] == parent]['value']
        if parent_eps > cluster_selection_epsilon:
            return parent
        node = parent
|
||||
|
||||
cdef set epsilon_search(
    set leaves,
    cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] cluster_tree,
    cnp.float64_t cluster_selection_epsilon,
    cnp.intp_t allow_single_cluster
):
    """Filter candidate leaves by the epsilon threshold.

    Leaves whose own epsilon (1 / lambda) already meets the threshold are
    kept as-is. A leaf below the threshold is replaced by the ancestor
    found via `traverse_upwards`; all descendants of that ancestor are
    then marked processed so they cannot be selected again.
    """
    cdef:
        list selected_clusters = list()
        list processed = list()
        cnp.intp_t leaf, epsilon_child, sub_node
        cnp.float64_t eps
        cnp.uint8_t[:] leaf_nodes
        cnp.ndarray[cnp.intp_t, ndim=1] children = cluster_tree['child']
        cnp.ndarray[cnp.float64_t, ndim=1] distances = cluster_tree['value']

    for leaf in leaves:
        # Epsilon of the leaf comes from the (unique) edge ending at it.
        leaf_nodes = children == leaf
        eps = 1 / distances[leaf_nodes][0]
        if eps < cluster_selection_epsilon:
            if leaf not in processed:
                # Too fine-grained: substitute the qualifying ancestor.
                epsilon_child = traverse_upwards(
                    cluster_tree,
                    cluster_selection_epsilon,
                    leaf,
                    allow_single_cluster
                )
                selected_clusters.append(epsilon_child)

                # Everything under the chosen ancestor is now covered.
                for sub_node in bfs_from_cluster_tree(cluster_tree, epsilon_child):
                    if sub_node != epsilon_child:
                        processed.append(sub_node)
        else:
            selected_clusters.append(leaf)

    return set(selected_clusters)
|
||||
|
||||
|
||||
@cython.wraparound(True)
cdef tuple _get_clusters(
    cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree,
    dict stability,
    cluster_selection_method='eom',
    cnp.uint8_t allow_single_cluster=False,
    cnp.float64_t cluster_selection_epsilon=0.0,
    max_cluster_size=None
):
    """Given a tree and stability dict, produce the cluster labels
    (and probabilities) for a flat clustering based on the chosen
    cluster selection method.

    Parameters
    ----------
    condensed_tree : ndarray of shape (n_samples,), dtype=CONDENSED_dtype
        Effectively an edgelist encoding a parent/child pair, along with a
        value and the corresponding cluster_size in each row providing a tree
        structure.

    stability : dict
        A dictionary mapping cluster_ids to stability values

    cluster_selection_method : string, optional (default 'eom')
        The method of selecting clusters. The default is the
        Excess of Mass algorithm specified by 'eom'. The alternate
        option is 'leaf'.

    allow_single_cluster : boolean, optional (default False)
        Whether to allow a single cluster to be selected by the
        Excess of Mass algorithm.

    cluster_selection_epsilon: double, optional (default 0.0)
        A distance threshold for cluster splits.

    max_cluster_size: int, default=None
        The maximum size for clusters located by the EOM clusterer. Can
        be overridden by the cluster_selection_epsilon parameter in
        rare cases.

    Returns
    -------
    labels : ndarray of shape (n_samples,)
        An integer array of cluster labels, with -1 denoting noise.

    probabilities : ndarray (n_samples,)
        The cluster membership strength of each sample.

    stabilities : ndarray (n_clusters,)
        The cluster coherence strengths of each cluster.
    """
    cdef:
        list node_list
        cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] cluster_tree
        cnp.uint8_t[::1] child_selection
        cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] labels
        dict is_cluster, cluster_sizes
        cnp.float64_t subtree_stability
        cnp.intp_t node, sub_node, cluster, n_samples
        cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] probs

    # Assume clusters are ordered by numeric id equivalent to
    # a topological sort of the tree; This is valid given the
    # current implementation above, so don't change that ... or
    # if you do, change this accordingly!
    if allow_single_cluster:
        node_list = sorted(stability.keys(), reverse=True)
    else:
        node_list = sorted(stability.keys(), reverse=True)[:-1]
        # (exclude root)

    # Rows with cluster_size > 1 link cluster nodes; rows with
    # cluster_size == 1 link individual samples to their cluster.
    cluster_tree = condensed_tree[condensed_tree['cluster_size'] > 1]
    is_cluster = {cluster: True for cluster in node_list}
    n_samples = np.max(condensed_tree[condensed_tree['cluster_size'] == 1]['child']) + 1

    if max_cluster_size is None:
        max_cluster_size = n_samples + 1  # Set to a value that will never be triggered
    cluster_sizes = {
        child: cluster_size for child, cluster_size
        in zip(cluster_tree['child'], cluster_tree['cluster_size'])
    }
    if allow_single_cluster:
        # Compute cluster size for the root node
        cluster_sizes[node_list[-1]] = np.sum(
            cluster_tree[cluster_tree['parent'] == node_list[-1]]['cluster_size'])

    if cluster_selection_method == 'eom':
        # Excess of Mass: visit nodes leaves-first (ids are reverse-sorted);
        # a node survives only if it is more stable than its children.
        for node in node_list:
            child_selection = (cluster_tree['parent'] == node)
            subtree_stability = np.sum([
                stability[child] for
                child in cluster_tree['child'][child_selection]])
            if subtree_stability > stability[node] or cluster_sizes[node] > max_cluster_size:
                # Children win: propagate their combined stability upward.
                is_cluster[node] = False
                stability[node] = subtree_stability
            else:
                # Node wins: deselect every descendant.
                for sub_node in bfs_from_cluster_tree(cluster_tree, node):
                    if sub_node != node:
                        is_cluster[sub_node] = False

        if cluster_selection_epsilon != 0.0 and PyArray_SHAPE(<cnp.PyArrayObject*> cluster_tree)[0] > 0:
            eom_clusters = [c for c in is_cluster if is_cluster[c]]
            selected_clusters = []
            # first check if eom_clusters only has root node, which skips epsilon check.
            if (len(eom_clusters) == 1 and eom_clusters[0] == cluster_tree['parent'].min()):
                if allow_single_cluster:
                    selected_clusters = eom_clusters
            else:
                selected_clusters = epsilon_search(
                    set(eom_clusters),
                    cluster_tree,
                    cluster_selection_epsilon,
                    allow_single_cluster
                )
            for c in is_cluster:
                if c in selected_clusters:
                    is_cluster[c] = True
                else:
                    is_cluster[c] = False

    elif cluster_selection_method == 'leaf':
        # Leaf selection: take the finest-grained clusters available.
        leaves = set(get_cluster_tree_leaves(cluster_tree))
        if len(leaves) == 0:
            # Degenerate tree: fall back to the root as the only cluster.
            for c in is_cluster:
                is_cluster[c] = False
            is_cluster[condensed_tree['parent'].min()] = True

        if cluster_selection_epsilon != 0.0:
            selected_clusters = epsilon_search(
                leaves,
                cluster_tree,
                cluster_selection_epsilon,
                allow_single_cluster
            )
        else:
            selected_clusters = leaves

        for c in is_cluster:
            if c in selected_clusters:
                is_cluster[c] = True
            else:
                is_cluster[c] = False

    # Map selected cluster node ids onto consecutive output labels.
    clusters = set([c for c in is_cluster if is_cluster[c]])
    cluster_map = {c: n for n, c in enumerate(sorted(list(clusters)))}
    reverse_cluster_map = {n: c for c, n in cluster_map.items()}

    labels = _do_labelling(
        condensed_tree,
        clusters,
        cluster_map,
        allow_single_cluster,
        cluster_selection_epsilon
    )
    probs = get_probabilities(condensed_tree, reverse_cluster_map, labels)

    return (labels, probs)
|
||||
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,16 @@
|
||||
# Cython extension modules for sklearn.cluster._hdbscan: each entry maps an
# extension name to its source list.
cluster_hdbscan_extension_metadata = {
  '_linkage': {'sources': ['_linkage.pyx', metrics_cython_tree]},
  '_reachability': {'sources': ['_reachability.pyx']},
  '_tree': {'sources': ['_tree.pyx']}
}

# Build each extension with the NumPy dependency and shared Cython flags,
# installing into the sklearn/cluster/_hdbscan package directory.
foreach ext_name, ext_dict : cluster_hdbscan_extension_metadata
  py.extension_module(
    ext_name,
    ext_dict.get('sources'),
    dependencies: [np_dep],
    cython_args: cython_args,
    subdir: 'sklearn/cluster/_hdbscan',
    install: true
  )
endforeach
|
||||
Binary file not shown.
Binary file not shown.
@ -0,0 +1,63 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from sklearn.cluster._hdbscan._reachability import mutual_reachability_graph
|
||||
from sklearn.utils._testing import (
|
||||
_convert_container,
|
||||
assert_allclose,
|
||||
)
|
||||
|
||||
|
||||
def test_mutual_reachability_graph_error_sparse_format():
    """A non-CSR sparse input must be rejected with a ValueError."""
    rng = np.random.RandomState(0)
    mat = rng.randn(10, 10)
    mat = mat.T @ mat
    np.fill_diagonal(mat, 0.0)
    mat = _convert_container(mat, "sparse_csc")

    with pytest.raises(ValueError, match="Only sparse CSR matrices are supported"):
        mutual_reachability_graph(mat)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("array_type", ["array", "sparse_csr"])
def test_mutual_reachability_graph_inplace(array_type):
    """The graph is computed in place: the output is the input object."""
    rng = np.random.RandomState(0)
    mat = rng.randn(10, 10)
    mat = mat.T @ mat
    np.fill_diagonal(mat, 0.0)
    mat = _convert_container(mat, array_type)

    assert mutual_reachability_graph(mat) is mat
|
||||
|
||||
|
||||
def test_mutual_reachability_graph_equivalence_dense_sparse():
    """The dense and sparse code paths must agree on the same input."""
    rng = np.random.RandomState(0)
    base = rng.randn(5, 5)
    dense = base.T @ base
    # Convert before the dense call: the computation mutates its input.
    sparse = _convert_container(dense, "sparse_csr")

    graph_dense = mutual_reachability_graph(dense, min_samples=3)
    graph_sparse = mutual_reachability_graph(sparse, min_samples=3)

    assert_allclose(graph_dense, graph_sparse.toarray())
|
||||
|
||||
|
||||
@pytest.mark.parametrize("array_type", ["array", "sparse_csr"])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_mutual_reachability_graph_preserve_dtype(array_type, dtype):
    """Fused types must preserve the input dtype in the output."""
    rng = np.random.RandomState(0)
    mat = rng.randn(10, 10)
    mat = (mat.T @ mat).astype(dtype)
    np.fill_diagonal(mat, 0.0)
    mat = _convert_container(mat, array_type)

    assert mat.dtype == dtype
    assert mutual_reachability_graph(mat).dtype == dtype
|
||||
Reference in New Issue
Block a user