some new features
.venv/lib/python3.12/site-packages/sklearn/manifold/_utils.pyx (new file, 120 lines)
@@ -0,0 +1,120 @@
import numpy as np

from libc cimport math
from libc.math cimport INFINITY

from ..utils._typedefs cimport float32_t, float64_t


cdef float EPSILON_DBL = 1e-8
cdef float PERPLEXITY_TOLERANCE = 1e-5


# TODO: have this function support float32 and float64 and preserve inputs' dtypes.
def _binary_search_perplexity(
        const float32_t[:, :] sqdistances,
        float desired_perplexity,
        int verbose):
    """Binary search for sigmas of conditional Gaussians.

    This approximation reduces the computational complexity from O(N^2) to
    O(uN), where u is the number of neighbors considered per sample.

    Parameters
    ----------
    sqdistances : ndarray of shape (n_samples, n_neighbors), dtype=np.float32
        Distances between training samples and their k nearest neighbors.
        When using the exact method, this is a square (n_samples, n_samples)
        distance matrix. The TSNE default metric is "euclidean" which is
        interpreted as squared euclidean distance.

    desired_perplexity : float
        Desired perplexity (2^entropy) of the conditional Gaussians.

    verbose : int
        Verbosity level.

    Returns
    -------
    P : ndarray of shape (n_samples, n_neighbors), dtype=np.float64
        Probabilities of conditional Gaussian distributions p_i|j. This is
        a square (n_samples, n_samples) matrix when using the exact method.
    """
    # Maximum number of binary search steps
    cdef long n_steps = 100

    cdef long n_samples = sqdistances.shape[0]
    cdef long n_neighbors = sqdistances.shape[1]
    cdef int using_neighbors = n_neighbors < n_samples
    # Precisions of conditional Gaussian distributions
    cdef double beta
    cdef double beta_min
    cdef double beta_max
    cdef double beta_sum = 0.0

    # Use log scale
    cdef double desired_entropy = math.log(desired_perplexity)
    cdef double entropy_diff

    cdef double entropy
    cdef double sum_Pi
    cdef double sum_disti_Pi
    cdef long i, j, l

    # This array is later used as a 32bit array. It has multiple intermediate
    # floating point additions that benefit from the extra precision
    cdef float64_t[:, :] P = np.zeros(
        (n_samples, n_neighbors), dtype=np.float64)

    for i in range(n_samples):
        beta_min = -INFINITY
        beta_max = INFINITY
        beta = 1.0

        # Binary search of precision for i-th conditional distribution
        for l in range(n_steps):
            # Compute current entropy and corresponding probabilities
            # computed just over the nearest neighbors or over all data
            # if we're not using neighbors
            sum_Pi = 0.0
            for j in range(n_neighbors):
                if j != i or using_neighbors:
                    P[i, j] = math.exp(-sqdistances[i, j] * beta)
                    sum_Pi += P[i, j]

            if sum_Pi == 0.0:
                sum_Pi = EPSILON_DBL
            sum_disti_Pi = 0.0

            for j in range(n_neighbors):
                P[i, j] /= sum_Pi
                sum_disti_Pi += sqdistances[i, j] * P[i, j]

            entropy = math.log(sum_Pi) + beta * sum_disti_Pi
            entropy_diff = entropy - desired_entropy

            if math.fabs(entropy_diff) <= PERPLEXITY_TOLERANCE:
                break

            if entropy_diff > 0.0:
                beta_min = beta
                if beta_max == INFINITY:
                    beta *= 2.0
                else:
                    beta = (beta + beta_max) / 2.0
            else:
                beta_max = beta
                if beta_min == -INFINITY:
                    beta /= 2.0
                else:
                    beta = (beta + beta_min) / 2.0

        beta_sum += beta

        if verbose and ((i + 1) % 1000 == 0 or i + 1 == n_samples):
            print("[t-SNE] Computed conditional probabilities for sample "
                  "%d / %d" % (i + 1, n_samples))

    if verbose:
        print("[t-SNE] Mean sigma: %f"
              % np.mean(math.sqrt(n_samples / beta_sum)))
    return np.asarray(P)
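
A quick way to sanity-check the helper above: the search drives each row's entropy
(in nats) to log(desired_perplexity), so exp(H_i) of every returned row should match
the requested perplexity. A minimal sketch, assuming a compiled scikit-learn build
where this private module is importable (the toy data and tolerance are illustrative):

    import numpy as np
    from sklearn.manifold._utils import _binary_search_perplexity

    rng = np.random.default_rng(0)
    X = rng.standard_normal((50, 3)).astype(np.float32)
    # Exact case: a square matrix of squared euclidean distances, float32.
    sqd = ((X[:, None, :] - X[None, :, :]) ** 2).sum(-1)

    P = _binary_search_perplexity(sqd, 10.0, 0)  # desired_perplexity=10, verbose=0

    # Row entropy H_i = -sum_j P_ij * ln(P_ij); zero entries contribute nothing.
    H = -np.sum(P * np.log(np.where(P > 0, P, 1.0)), axis=1)
    print(np.allclose(np.exp(H), 10.0, rtol=1e-3))  # should print True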
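
For readers who want to follow the algorithm without compiling Cython, here is a
plain-NumPy re-implementation of the per-row search. It mirrors the loop above
(doubling or halving beta until the bracket closes, then bisecting) but skips the
exact-case self-exclusion bookkeeping; the function name and defaults are made up
for illustration only:

    import numpy as np

    def binary_search_perplexity_ref(sqd_row, desired_perplexity,
                                     n_steps=100, tol=1e-5, eps=1e-8):
        """Find beta so that P_j = exp(-beta * d_j) / Z has the desired perplexity."""
        desired_entropy = np.log(desired_perplexity)
        beta, beta_min, beta_max = 1.0, -np.inf, np.inf
        for _ in range(n_steps):
            P = np.exp(-sqd_row * beta)
            sum_P = max(P.sum(), eps)
            P /= sum_P
            # log Z + beta * E[d] equals the Shannon entropy -sum(P * ln P).
            entropy = np.log(sum_P) + beta * (sqd_row * P).sum()
            diff = entropy - desired_entropy
            if abs(diff) <= tol:
                break
            if diff > 0.0:  # entropy too high -> larger beta (narrower Gaussian)
                beta_min = beta
                beta = beta * 2.0 if beta_max == np.inf else (beta + beta_max) / 2.0
            else:
                beta_max = beta
                beta = beta / 2.0 if beta_min == -np.inf else (beta + beta_min) / 2.0
        return P, beta

The doubling/halving phase establishes a finite [beta_min, beta_max] bracket when one
bound is still infinite; once both bounds are finite, plain bisection converges, which
is why 100 steps comfortably exceed what the tolerance of 1e-5 needs in practice.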