reconnect moved files to git repo
This commit is contained in:
741
venv/lib/python3.11/site-packages/sklearn/cluster/_birch.py
Normal file
741
venv/lib/python3.11/site-packages/sklearn/cluster/_birch.py
Normal file
@ -0,0 +1,741 @@
|
||||
# Authors: Manoj Kumar <manojkumarsivaraj334@gmail.com>
|
||||
# Alexandre Gramfort <alexandre.gramfort@telecom-paristech.fr>
|
||||
# Joel Nothman <joel.nothman@gmail.com>
|
||||
# License: BSD 3 clause
|
||||
|
||||
import warnings
|
||||
from math import sqrt
|
||||
from numbers import Integral, Real
|
||||
|
||||
import numpy as np
|
||||
from scipy import sparse
|
||||
|
||||
from .._config import config_context
|
||||
from ..base import (
|
||||
BaseEstimator,
|
||||
ClassNamePrefixFeaturesOutMixin,
|
||||
ClusterMixin,
|
||||
TransformerMixin,
|
||||
_fit_context,
|
||||
)
|
||||
from ..exceptions import ConvergenceWarning
|
||||
from ..metrics import pairwise_distances_argmin
|
||||
from ..metrics.pairwise import euclidean_distances
|
||||
from ..utils._param_validation import Interval
|
||||
from ..utils.extmath import row_norms
|
||||
from ..utils.validation import check_is_fitted
|
||||
from . import AgglomerativeClustering
|
||||
|
||||
|
||||
def _iterate_sparse_X(X):
|
||||
"""This little hack returns a densified row when iterating over a sparse
|
||||
matrix, instead of constructing a sparse matrix for every row that is
|
||||
expensive.
|
||||
"""
|
||||
n_samples = X.shape[0]
|
||||
X_indices = X.indices
|
||||
X_data = X.data
|
||||
X_indptr = X.indptr
|
||||
|
||||
for i in range(n_samples):
|
||||
row = np.zeros(X.shape[1])
|
||||
startptr, endptr = X_indptr[i], X_indptr[i + 1]
|
||||
nonzero_indices = X_indices[startptr:endptr]
|
||||
row[nonzero_indices] = X_data[startptr:endptr]
|
||||
yield row
|
||||
|
||||
|
||||
def _split_node(node, threshold, branching_factor):
|
||||
"""The node has to be split if there is no place for a new subcluster
|
||||
in the node.
|
||||
1. Two empty nodes and two empty subclusters are initialized.
|
||||
2. The pair of distant subclusters are found.
|
||||
3. The properties of the empty subclusters and nodes are updated
|
||||
according to the nearest distance between the subclusters to the
|
||||
pair of distant subclusters.
|
||||
4. The two nodes are set as children to the two subclusters.
|
||||
"""
|
||||
new_subcluster1 = _CFSubcluster()
|
||||
new_subcluster2 = _CFSubcluster()
|
||||
new_node1 = _CFNode(
|
||||
threshold=threshold,
|
||||
branching_factor=branching_factor,
|
||||
is_leaf=node.is_leaf,
|
||||
n_features=node.n_features,
|
||||
dtype=node.init_centroids_.dtype,
|
||||
)
|
||||
new_node2 = _CFNode(
|
||||
threshold=threshold,
|
||||
branching_factor=branching_factor,
|
||||
is_leaf=node.is_leaf,
|
||||
n_features=node.n_features,
|
||||
dtype=node.init_centroids_.dtype,
|
||||
)
|
||||
new_subcluster1.child_ = new_node1
|
||||
new_subcluster2.child_ = new_node2
|
||||
|
||||
if node.is_leaf:
|
||||
if node.prev_leaf_ is not None:
|
||||
node.prev_leaf_.next_leaf_ = new_node1
|
||||
new_node1.prev_leaf_ = node.prev_leaf_
|
||||
new_node1.next_leaf_ = new_node2
|
||||
new_node2.prev_leaf_ = new_node1
|
||||
new_node2.next_leaf_ = node.next_leaf_
|
||||
if node.next_leaf_ is not None:
|
||||
node.next_leaf_.prev_leaf_ = new_node2
|
||||
|
||||
dist = euclidean_distances(
|
||||
node.centroids_, Y_norm_squared=node.squared_norm_, squared=True
|
||||
)
|
||||
n_clusters = dist.shape[0]
|
||||
|
||||
farthest_idx = np.unravel_index(dist.argmax(), (n_clusters, n_clusters))
|
||||
node1_dist, node2_dist = dist[(farthest_idx,)]
|
||||
|
||||
node1_closer = node1_dist < node2_dist
|
||||
# make sure node1 is closest to itself even if all distances are equal.
|
||||
# This can only happen when all node.centroids_ are duplicates leading to all
|
||||
# distances between centroids being zero.
|
||||
node1_closer[farthest_idx[0]] = True
|
||||
|
||||
for idx, subcluster in enumerate(node.subclusters_):
|
||||
if node1_closer[idx]:
|
||||
new_node1.append_subcluster(subcluster)
|
||||
new_subcluster1.update(subcluster)
|
||||
else:
|
||||
new_node2.append_subcluster(subcluster)
|
||||
new_subcluster2.update(subcluster)
|
||||
return new_subcluster1, new_subcluster2
|
||||
|
||||
|
||||
class _CFNode:
|
||||
"""Each node in a CFTree is called a CFNode.
|
||||
|
||||
The CFNode can have a maximum of branching_factor
|
||||
number of CFSubclusters.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
threshold : float
|
||||
Threshold needed for a new subcluster to enter a CFSubcluster.
|
||||
|
||||
branching_factor : int
|
||||
Maximum number of CF subclusters in each node.
|
||||
|
||||
is_leaf : bool
|
||||
We need to know if the CFNode is a leaf or not, in order to
|
||||
retrieve the final subclusters.
|
||||
|
||||
n_features : int
|
||||
The number of features.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
subclusters_ : list
|
||||
List of subclusters for a particular CFNode.
|
||||
|
||||
prev_leaf_ : _CFNode
|
||||
Useful only if is_leaf is True.
|
||||
|
||||
next_leaf_ : _CFNode
|
||||
next_leaf. Useful only if is_leaf is True.
|
||||
the final subclusters.
|
||||
|
||||
init_centroids_ : ndarray of shape (branching_factor + 1, n_features)
|
||||
Manipulate ``init_centroids_`` throughout rather than centroids_ since
|
||||
the centroids are just a view of the ``init_centroids_`` .
|
||||
|
||||
init_sq_norm_ : ndarray of shape (branching_factor + 1,)
|
||||
manipulate init_sq_norm_ throughout. similar to ``init_centroids_``.
|
||||
|
||||
centroids_ : ndarray of shape (branching_factor + 1, n_features)
|
||||
View of ``init_centroids_``.
|
||||
|
||||
squared_norm_ : ndarray of shape (branching_factor + 1,)
|
||||
View of ``init_sq_norm_``.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, *, threshold, branching_factor, is_leaf, n_features, dtype):
|
||||
self.threshold = threshold
|
||||
self.branching_factor = branching_factor
|
||||
self.is_leaf = is_leaf
|
||||
self.n_features = n_features
|
||||
|
||||
# The list of subclusters, centroids and squared norms
|
||||
# to manipulate throughout.
|
||||
self.subclusters_ = []
|
||||
self.init_centroids_ = np.zeros((branching_factor + 1, n_features), dtype=dtype)
|
||||
self.init_sq_norm_ = np.zeros((branching_factor + 1), dtype)
|
||||
self.squared_norm_ = []
|
||||
self.prev_leaf_ = None
|
||||
self.next_leaf_ = None
|
||||
|
||||
def append_subcluster(self, subcluster):
|
||||
n_samples = len(self.subclusters_)
|
||||
self.subclusters_.append(subcluster)
|
||||
self.init_centroids_[n_samples] = subcluster.centroid_
|
||||
self.init_sq_norm_[n_samples] = subcluster.sq_norm_
|
||||
|
||||
# Keep centroids and squared norm as views. In this way
|
||||
# if we change init_centroids and init_sq_norm_, it is
|
||||
# sufficient,
|
||||
self.centroids_ = self.init_centroids_[: n_samples + 1, :]
|
||||
self.squared_norm_ = self.init_sq_norm_[: n_samples + 1]
|
||||
|
||||
def update_split_subclusters(self, subcluster, new_subcluster1, new_subcluster2):
|
||||
"""Remove a subcluster from a node and update it with the
|
||||
split subclusters.
|
||||
"""
|
||||
ind = self.subclusters_.index(subcluster)
|
||||
self.subclusters_[ind] = new_subcluster1
|
||||
self.init_centroids_[ind] = new_subcluster1.centroid_
|
||||
self.init_sq_norm_[ind] = new_subcluster1.sq_norm_
|
||||
self.append_subcluster(new_subcluster2)
|
||||
|
||||
def insert_cf_subcluster(self, subcluster):
|
||||
"""Insert a new subcluster into the node."""
|
||||
if not self.subclusters_:
|
||||
self.append_subcluster(subcluster)
|
||||
return False
|
||||
|
||||
threshold = self.threshold
|
||||
branching_factor = self.branching_factor
|
||||
# We need to find the closest subcluster among all the
|
||||
# subclusters so that we can insert our new subcluster.
|
||||
dist_matrix = np.dot(self.centroids_, subcluster.centroid_)
|
||||
dist_matrix *= -2.0
|
||||
dist_matrix += self.squared_norm_
|
||||
closest_index = np.argmin(dist_matrix)
|
||||
closest_subcluster = self.subclusters_[closest_index]
|
||||
|
||||
# If the subcluster has a child, we need a recursive strategy.
|
||||
if closest_subcluster.child_ is not None:
|
||||
split_child = closest_subcluster.child_.insert_cf_subcluster(subcluster)
|
||||
|
||||
if not split_child:
|
||||
# If it is determined that the child need not be split, we
|
||||
# can just update the closest_subcluster
|
||||
closest_subcluster.update(subcluster)
|
||||
self.init_centroids_[closest_index] = self.subclusters_[
|
||||
closest_index
|
||||
].centroid_
|
||||
self.init_sq_norm_[closest_index] = self.subclusters_[
|
||||
closest_index
|
||||
].sq_norm_
|
||||
return False
|
||||
|
||||
# things not too good. we need to redistribute the subclusters in
|
||||
# our child node, and add a new subcluster in the parent
|
||||
# subcluster to accommodate the new child.
|
||||
else:
|
||||
new_subcluster1, new_subcluster2 = _split_node(
|
||||
closest_subcluster.child_,
|
||||
threshold,
|
||||
branching_factor,
|
||||
)
|
||||
self.update_split_subclusters(
|
||||
closest_subcluster, new_subcluster1, new_subcluster2
|
||||
)
|
||||
|
||||
if len(self.subclusters_) > self.branching_factor:
|
||||
return True
|
||||
return False
|
||||
|
||||
# good to go!
|
||||
else:
|
||||
merged = closest_subcluster.merge_subcluster(subcluster, self.threshold)
|
||||
if merged:
|
||||
self.init_centroids_[closest_index] = closest_subcluster.centroid_
|
||||
self.init_sq_norm_[closest_index] = closest_subcluster.sq_norm_
|
||||
return False
|
||||
|
||||
# not close to any other subclusters, and we still
|
||||
# have space, so add.
|
||||
elif len(self.subclusters_) < self.branching_factor:
|
||||
self.append_subcluster(subcluster)
|
||||
return False
|
||||
|
||||
# We do not have enough space nor is it closer to an
|
||||
# other subcluster. We need to split.
|
||||
else:
|
||||
self.append_subcluster(subcluster)
|
||||
return True
|
||||
|
||||
|
||||
class _CFSubcluster:
|
||||
"""Each subcluster in a CFNode is called a CFSubcluster.
|
||||
|
||||
A CFSubcluster can have a CFNode has its child.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
linear_sum : ndarray of shape (n_features,), default=None
|
||||
Sample. This is kept optional to allow initialization of empty
|
||||
subclusters.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
n_samples_ : int
|
||||
Number of samples that belong to each subcluster.
|
||||
|
||||
linear_sum_ : ndarray
|
||||
Linear sum of all the samples in a subcluster. Prevents holding
|
||||
all sample data in memory.
|
||||
|
||||
squared_sum_ : float
|
||||
Sum of the squared l2 norms of all samples belonging to a subcluster.
|
||||
|
||||
centroid_ : ndarray of shape (branching_factor + 1, n_features)
|
||||
Centroid of the subcluster. Prevent recomputing of centroids when
|
||||
``CFNode.centroids_`` is called.
|
||||
|
||||
child_ : _CFNode
|
||||
Child Node of the subcluster. Once a given _CFNode is set as the child
|
||||
of the _CFNode, it is set to ``self.child_``.
|
||||
|
||||
sq_norm_ : ndarray of shape (branching_factor + 1,)
|
||||
Squared norm of the subcluster. Used to prevent recomputing when
|
||||
pairwise minimum distances are computed.
|
||||
"""
|
||||
|
||||
def __init__(self, *, linear_sum=None):
|
||||
if linear_sum is None:
|
||||
self.n_samples_ = 0
|
||||
self.squared_sum_ = 0.0
|
||||
self.centroid_ = self.linear_sum_ = 0
|
||||
else:
|
||||
self.n_samples_ = 1
|
||||
self.centroid_ = self.linear_sum_ = linear_sum
|
||||
self.squared_sum_ = self.sq_norm_ = np.dot(
|
||||
self.linear_sum_, self.linear_sum_
|
||||
)
|
||||
self.child_ = None
|
||||
|
||||
def update(self, subcluster):
|
||||
self.n_samples_ += subcluster.n_samples_
|
||||
self.linear_sum_ += subcluster.linear_sum_
|
||||
self.squared_sum_ += subcluster.squared_sum_
|
||||
self.centroid_ = self.linear_sum_ / self.n_samples_
|
||||
self.sq_norm_ = np.dot(self.centroid_, self.centroid_)
|
||||
|
||||
def merge_subcluster(self, nominee_cluster, threshold):
|
||||
"""Check if a cluster is worthy enough to be merged. If
|
||||
yes then merge.
|
||||
"""
|
||||
new_ss = self.squared_sum_ + nominee_cluster.squared_sum_
|
||||
new_ls = self.linear_sum_ + nominee_cluster.linear_sum_
|
||||
new_n = self.n_samples_ + nominee_cluster.n_samples_
|
||||
new_centroid = (1 / new_n) * new_ls
|
||||
new_sq_norm = np.dot(new_centroid, new_centroid)
|
||||
|
||||
# The squared radius of the cluster is defined:
|
||||
# r^2 = sum_i ||x_i - c||^2 / n
|
||||
# with x_i the n points assigned to the cluster and c its centroid:
|
||||
# c = sum_i x_i / n
|
||||
# This can be expanded to:
|
||||
# r^2 = sum_i ||x_i||^2 / n - 2 < sum_i x_i / n, c> + n ||c||^2 / n
|
||||
# and therefore simplifies to:
|
||||
# r^2 = sum_i ||x_i||^2 / n - ||c||^2
|
||||
sq_radius = new_ss / new_n - new_sq_norm
|
||||
|
||||
if sq_radius <= threshold**2:
|
||||
(
|
||||
self.n_samples_,
|
||||
self.linear_sum_,
|
||||
self.squared_sum_,
|
||||
self.centroid_,
|
||||
self.sq_norm_,
|
||||
) = (new_n, new_ls, new_ss, new_centroid, new_sq_norm)
|
||||
return True
|
||||
return False
|
||||
|
||||
@property
|
||||
def radius(self):
|
||||
"""Return radius of the subcluster"""
|
||||
# Because of numerical issues, this could become negative
|
||||
sq_radius = self.squared_sum_ / self.n_samples_ - self.sq_norm_
|
||||
return sqrt(max(0, sq_radius))
|
||||
|
||||
|
||||
class Birch(
|
||||
ClassNamePrefixFeaturesOutMixin, ClusterMixin, TransformerMixin, BaseEstimator
|
||||
):
|
||||
"""Implements the BIRCH clustering algorithm.
|
||||
|
||||
It is a memory-efficient, online-learning algorithm provided as an
|
||||
alternative to :class:`MiniBatchKMeans`. It constructs a tree
|
||||
data structure with the cluster centroids being read off the leaf.
|
||||
These can be either the final cluster centroids or can be provided as input
|
||||
to another clustering algorithm such as :class:`AgglomerativeClustering`.
|
||||
|
||||
Read more in the :ref:`User Guide <birch>`.
|
||||
|
||||
.. versionadded:: 0.16
|
||||
|
||||
Parameters
|
||||
----------
|
||||
threshold : float, default=0.5
|
||||
The radius of the subcluster obtained by merging a new sample and the
|
||||
closest subcluster should be lesser than the threshold. Otherwise a new
|
||||
subcluster is started. Setting this value to be very low promotes
|
||||
splitting and vice-versa.
|
||||
|
||||
branching_factor : int, default=50
|
||||
Maximum number of CF subclusters in each node. If a new samples enters
|
||||
such that the number of subclusters exceed the branching_factor then
|
||||
that node is split into two nodes with the subclusters redistributed
|
||||
in each. The parent subcluster of that node is removed and two new
|
||||
subclusters are added as parents of the 2 split nodes.
|
||||
|
||||
n_clusters : int, instance of sklearn.cluster model or None, default=3
|
||||
Number of clusters after the final clustering step, which treats the
|
||||
subclusters from the leaves as new samples.
|
||||
|
||||
- `None` : the final clustering step is not performed and the
|
||||
subclusters are returned as they are.
|
||||
|
||||
- :mod:`sklearn.cluster` Estimator : If a model is provided, the model
|
||||
is fit treating the subclusters as new samples and the initial data
|
||||
is mapped to the label of the closest subcluster.
|
||||
|
||||
- `int` : the model fit is :class:`AgglomerativeClustering` with
|
||||
`n_clusters` set to be equal to the int.
|
||||
|
||||
compute_labels : bool, default=True
|
||||
Whether or not to compute labels for each fit.
|
||||
|
||||
copy : bool, default=True
|
||||
Whether or not to make a copy of the given data. If set to False,
|
||||
the initial data will be overwritten.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
root_ : _CFNode
|
||||
Root of the CFTree.
|
||||
|
||||
dummy_leaf_ : _CFNode
|
||||
Start pointer to all the leaves.
|
||||
|
||||
subcluster_centers_ : ndarray
|
||||
Centroids of all subclusters read directly from the leaves.
|
||||
|
||||
subcluster_labels_ : ndarray
|
||||
Labels assigned to the centroids of the subclusters after
|
||||
they are clustered globally.
|
||||
|
||||
labels_ : ndarray of shape (n_samples,)
|
||||
Array of labels assigned to the input data.
|
||||
if partial_fit is used instead of fit, they are assigned to the
|
||||
last batch of data.
|
||||
|
||||
n_features_in_ : int
|
||||
Number of features seen during :term:`fit`.
|
||||
|
||||
.. versionadded:: 0.24
|
||||
|
||||
feature_names_in_ : ndarray of shape (`n_features_in_`,)
|
||||
Names of features seen during :term:`fit`. Defined only when `X`
|
||||
has feature names that are all strings.
|
||||
|
||||
.. versionadded:: 1.0
|
||||
|
||||
See Also
|
||||
--------
|
||||
MiniBatchKMeans : Alternative implementation that does incremental updates
|
||||
of the centers' positions using mini-batches.
|
||||
|
||||
Notes
|
||||
-----
|
||||
The tree data structure consists of nodes with each node consisting of
|
||||
a number of subclusters. The maximum number of subclusters in a node
|
||||
is determined by the branching factor. Each subcluster maintains a
|
||||
linear sum, squared sum and the number of samples in that subcluster.
|
||||
In addition, each subcluster can also have a node as its child, if the
|
||||
subcluster is not a member of a leaf node.
|
||||
|
||||
For a new point entering the root, it is merged with the subcluster closest
|
||||
to it and the linear sum, squared sum and the number of samples of that
|
||||
subcluster are updated. This is done recursively till the properties of
|
||||
the leaf node are updated.
|
||||
|
||||
References
|
||||
----------
|
||||
* Tian Zhang, Raghu Ramakrishnan, Maron Livny
|
||||
BIRCH: An efficient data clustering method for large databases.
|
||||
https://www.cs.sfu.ca/CourseCentral/459/han/papers/zhang96.pdf
|
||||
|
||||
* Roberto Perdisci
|
||||
JBirch - Java implementation of BIRCH clustering algorithm
|
||||
https://code.google.com/archive/p/jbirch
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> from sklearn.cluster import Birch
|
||||
>>> X = [[0, 1], [0.3, 1], [-0.3, 1], [0, -1], [0.3, -1], [-0.3, -1]]
|
||||
>>> brc = Birch(n_clusters=None)
|
||||
>>> brc.fit(X)
|
||||
Birch(n_clusters=None)
|
||||
>>> brc.predict(X)
|
||||
array([0, 0, 0, 1, 1, 1])
|
||||
"""
|
||||
|
||||
_parameter_constraints: dict = {
|
||||
"threshold": [Interval(Real, 0.0, None, closed="neither")],
|
||||
"branching_factor": [Interval(Integral, 1, None, closed="neither")],
|
||||
"n_clusters": [None, ClusterMixin, Interval(Integral, 1, None, closed="left")],
|
||||
"compute_labels": ["boolean"],
|
||||
"copy": ["boolean"],
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
threshold=0.5,
|
||||
branching_factor=50,
|
||||
n_clusters=3,
|
||||
compute_labels=True,
|
||||
copy=True,
|
||||
):
|
||||
self.threshold = threshold
|
||||
self.branching_factor = branching_factor
|
||||
self.n_clusters = n_clusters
|
||||
self.compute_labels = compute_labels
|
||||
self.copy = copy
|
||||
|
||||
@_fit_context(prefer_skip_nested_validation=True)
|
||||
def fit(self, X, y=None):
|
||||
"""
|
||||
Build a CF Tree for the input data.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : {array-like, sparse matrix} of shape (n_samples, n_features)
|
||||
Input data.
|
||||
|
||||
y : Ignored
|
||||
Not used, present here for API consistency by convention.
|
||||
|
||||
Returns
|
||||
-------
|
||||
self
|
||||
Fitted estimator.
|
||||
"""
|
||||
return self._fit(X, partial=False)
|
||||
|
||||
def _fit(self, X, partial):
|
||||
has_root = getattr(self, "root_", None)
|
||||
first_call = not (partial and has_root)
|
||||
|
||||
X = self._validate_data(
|
||||
X,
|
||||
accept_sparse="csr",
|
||||
copy=self.copy,
|
||||
reset=first_call,
|
||||
dtype=[np.float64, np.float32],
|
||||
)
|
||||
threshold = self.threshold
|
||||
branching_factor = self.branching_factor
|
||||
|
||||
n_samples, n_features = X.shape
|
||||
|
||||
# If partial_fit is called for the first time or fit is called, we
|
||||
# start a new tree.
|
||||
if first_call:
|
||||
# The first root is the leaf. Manipulate this object throughout.
|
||||
self.root_ = _CFNode(
|
||||
threshold=threshold,
|
||||
branching_factor=branching_factor,
|
||||
is_leaf=True,
|
||||
n_features=n_features,
|
||||
dtype=X.dtype,
|
||||
)
|
||||
|
||||
# To enable getting back subclusters.
|
||||
self.dummy_leaf_ = _CFNode(
|
||||
threshold=threshold,
|
||||
branching_factor=branching_factor,
|
||||
is_leaf=True,
|
||||
n_features=n_features,
|
||||
dtype=X.dtype,
|
||||
)
|
||||
self.dummy_leaf_.next_leaf_ = self.root_
|
||||
self.root_.prev_leaf_ = self.dummy_leaf_
|
||||
|
||||
# Cannot vectorize. Enough to convince to use cython.
|
||||
if not sparse.issparse(X):
|
||||
iter_func = iter
|
||||
else:
|
||||
iter_func = _iterate_sparse_X
|
||||
|
||||
for sample in iter_func(X):
|
||||
subcluster = _CFSubcluster(linear_sum=sample)
|
||||
split = self.root_.insert_cf_subcluster(subcluster)
|
||||
|
||||
if split:
|
||||
new_subcluster1, new_subcluster2 = _split_node(
|
||||
self.root_, threshold, branching_factor
|
||||
)
|
||||
del self.root_
|
||||
self.root_ = _CFNode(
|
||||
threshold=threshold,
|
||||
branching_factor=branching_factor,
|
||||
is_leaf=False,
|
||||
n_features=n_features,
|
||||
dtype=X.dtype,
|
||||
)
|
||||
self.root_.append_subcluster(new_subcluster1)
|
||||
self.root_.append_subcluster(new_subcluster2)
|
||||
|
||||
centroids = np.concatenate([leaf.centroids_ for leaf in self._get_leaves()])
|
||||
self.subcluster_centers_ = centroids
|
||||
self._n_features_out = self.subcluster_centers_.shape[0]
|
||||
|
||||
self._global_clustering(X)
|
||||
return self
|
||||
|
||||
def _get_leaves(self):
|
||||
"""
|
||||
Retrieve the leaves of the CF Node.
|
||||
|
||||
Returns
|
||||
-------
|
||||
leaves : list of shape (n_leaves,)
|
||||
List of the leaf nodes.
|
||||
"""
|
||||
leaf_ptr = self.dummy_leaf_.next_leaf_
|
||||
leaves = []
|
||||
while leaf_ptr is not None:
|
||||
leaves.append(leaf_ptr)
|
||||
leaf_ptr = leaf_ptr.next_leaf_
|
||||
return leaves
|
||||
|
||||
@_fit_context(prefer_skip_nested_validation=True)
|
||||
def partial_fit(self, X=None, y=None):
|
||||
"""
|
||||
Online learning. Prevents rebuilding of CFTree from scratch.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : {array-like, sparse matrix} of shape (n_samples, n_features), \
|
||||
default=None
|
||||
Input data. If X is not provided, only the global clustering
|
||||
step is done.
|
||||
|
||||
y : Ignored
|
||||
Not used, present here for API consistency by convention.
|
||||
|
||||
Returns
|
||||
-------
|
||||
self
|
||||
Fitted estimator.
|
||||
"""
|
||||
if X is None:
|
||||
# Perform just the final global clustering step.
|
||||
self._global_clustering()
|
||||
return self
|
||||
else:
|
||||
return self._fit(X, partial=True)
|
||||
|
||||
def _check_fit(self, X):
|
||||
check_is_fitted(self)
|
||||
|
||||
if (
|
||||
hasattr(self, "subcluster_centers_")
|
||||
and X.shape[1] != self.subcluster_centers_.shape[1]
|
||||
):
|
||||
raise ValueError(
|
||||
"Training data and predicted data do not have same number of features."
|
||||
)
|
||||
|
||||
def predict(self, X):
|
||||
"""
|
||||
Predict data using the ``centroids_`` of subclusters.
|
||||
|
||||
Avoid computation of the row norms of X.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : {array-like, sparse matrix} of shape (n_samples, n_features)
|
||||
Input data.
|
||||
|
||||
Returns
|
||||
-------
|
||||
labels : ndarray of shape(n_samples,)
|
||||
Labelled data.
|
||||
"""
|
||||
check_is_fitted(self)
|
||||
X = self._validate_data(X, accept_sparse="csr", reset=False)
|
||||
return self._predict(X)
|
||||
|
||||
def _predict(self, X):
|
||||
"""Predict data using the ``centroids_`` of subclusters."""
|
||||
kwargs = {"Y_norm_squared": self._subcluster_norms}
|
||||
|
||||
with config_context(assume_finite=True):
|
||||
argmin = pairwise_distances_argmin(
|
||||
X, self.subcluster_centers_, metric_kwargs=kwargs
|
||||
)
|
||||
return self.subcluster_labels_[argmin]
|
||||
|
||||
def transform(self, X):
|
||||
"""
|
||||
Transform X into subcluster centroids dimension.
|
||||
|
||||
Each dimension represents the distance from the sample point to each
|
||||
cluster centroid.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : {array-like, sparse matrix} of shape (n_samples, n_features)
|
||||
Input data.
|
||||
|
||||
Returns
|
||||
-------
|
||||
X_trans : {array-like, sparse matrix} of shape (n_samples, n_clusters)
|
||||
Transformed data.
|
||||
"""
|
||||
check_is_fitted(self)
|
||||
X = self._validate_data(X, accept_sparse="csr", reset=False)
|
||||
with config_context(assume_finite=True):
|
||||
return euclidean_distances(X, self.subcluster_centers_)
|
||||
|
||||
def _global_clustering(self, X=None):
|
||||
"""
|
||||
Global clustering for the subclusters obtained after fitting
|
||||
"""
|
||||
clusterer = self.n_clusters
|
||||
centroids = self.subcluster_centers_
|
||||
compute_labels = (X is not None) and self.compute_labels
|
||||
|
||||
# Preprocessing for the global clustering.
|
||||
not_enough_centroids = False
|
||||
if isinstance(clusterer, Integral):
|
||||
clusterer = AgglomerativeClustering(n_clusters=self.n_clusters)
|
||||
# There is no need to perform the global clustering step.
|
||||
if len(centroids) < self.n_clusters:
|
||||
not_enough_centroids = True
|
||||
|
||||
# To use in predict to avoid recalculation.
|
||||
self._subcluster_norms = row_norms(self.subcluster_centers_, squared=True)
|
||||
|
||||
if clusterer is None or not_enough_centroids:
|
||||
self.subcluster_labels_ = np.arange(len(centroids))
|
||||
if not_enough_centroids:
|
||||
warnings.warn(
|
||||
"Number of subclusters found (%d) by BIRCH is less "
|
||||
"than (%d). Decrease the threshold."
|
||||
% (len(centroids), self.n_clusters),
|
||||
ConvergenceWarning,
|
||||
)
|
||||
else:
|
||||
# The global clustering step that clusters the subclusters of
|
||||
# the leaves. It assumes the centroids of the subclusters as
|
||||
# samples and finds the final centroids.
|
||||
self.subcluster_labels_ = clusterer.fit_predict(self.subcluster_centers_)
|
||||
|
||||
if compute_labels:
|
||||
self.labels_ = self._predict(X)
|
||||
|
||||
def _more_tags(self):
|
||||
return {"preserves_dtype": [np.float64, np.float32]}
|
||||
Reference in New Issue
Block a user