some new features
.venv/lib/python3.12/site-packages/sklearn/manifold/_utils.pyx (new file, 120 lines)
@@ -0,0 +1,120 @@
import numpy as np

from libc cimport math
from libc.math cimport INFINITY

from ..utils._typedefs cimport float32_t, float64_t


cdef float EPSILON_DBL = 1e-8
cdef float PERPLEXITY_TOLERANCE = 1e-5


# TODO: have this function support float32 and float64 and preserve inputs' dtypes.
def _binary_search_perplexity(
        const float32_t[:, :] sqdistances,
        float desired_perplexity,
        int verbose):
    """Binary search for sigmas of conditional Gaussians.

    This approximation reduces the computational complexity from O(N^2) to
    O(uN), where u is the number of neighbors considered per sample.

    Parameters
    ----------
    sqdistances : ndarray of shape (n_samples, n_neighbors), dtype=np.float32
        Distances between training samples and their k nearest neighbors.
        When using the exact method, this is a square (n_samples, n_samples)
        distance matrix. The TSNE default metric is "euclidean" which is
        interpreted as squared euclidean distance.

    desired_perplexity : float
        Desired perplexity (2^entropy) of the conditional Gaussians.

    verbose : int
        Verbosity level.

    Returns
    -------
    P : ndarray of shape (n_samples, n_neighbors), dtype=np.float64
        Probabilities of conditional Gaussian distributions p_i|j. This is
        a square (n_samples, n_samples) matrix when using the exact method.
    """
    # Maximum number of binary search steps
    cdef long n_steps = 100

    cdef long n_samples = sqdistances.shape[0]
    cdef long n_neighbors = sqdistances.shape[1]
    cdef int using_neighbors = n_neighbors < n_samples
    # Precisions of conditional Gaussian distributions
    cdef double beta
    cdef double beta_min
    cdef double beta_max
    cdef double beta_sum = 0.0

    # Use log scale
    cdef double desired_entropy = math.log(desired_perplexity)
    cdef double entropy_diff

    cdef double entropy
    cdef double sum_Pi
    cdef double sum_disti_Pi
    cdef long i, j, l

    # This array is later used as a 32bit array. It has multiple intermediate
    # floating point additions that benefit from the extra precision
    cdef float64_t[:, :] P = np.zeros(
        (n_samples, n_neighbors), dtype=np.float64)

    for i in range(n_samples):
        beta_min = -INFINITY
        beta_max = INFINITY
        beta = 1.0

        # Binary search of precision for i-th conditional distribution
        for l in range(n_steps):
            # Compute current entropy and corresponding probabilities
            # computed just over the nearest neighbors or over all data
            # if we're not using neighbors
            sum_Pi = 0.0
            for j in range(n_neighbors):
                if j != i or using_neighbors:
                    P[i, j] = math.exp(-sqdistances[i, j] * beta)
                    sum_Pi += P[i, j]

            if sum_Pi == 0.0:
                sum_Pi = EPSILON_DBL
            sum_disti_Pi = 0.0

            for j in range(n_neighbors):
                P[i, j] /= sum_Pi
                sum_disti_Pi += sqdistances[i, j] * P[i, j]

            entropy = math.log(sum_Pi) + beta * sum_disti_Pi
            entropy_diff = entropy - desired_entropy

            if math.fabs(entropy_diff) <= PERPLEXITY_TOLERANCE:
                break

            if entropy_diff > 0.0:
                beta_min = beta
                if beta_max == INFINITY:
                    beta *= 2.0
                else:
                    beta = (beta + beta_max) / 2.0
            else:
                beta_max = beta
                if beta_min == -INFINITY:
                    beta /= 2.0
                else:
                    beta = (beta + beta_min) / 2.0

        beta_sum += beta

        if verbose and ((i + 1) % 1000 == 0 or i + 1 == n_samples):
            print("[t-SNE] Computed conditional probabilities for sample "
                  "%d / %d" % (i + 1, n_samples))

    if verbose:
        print("[t-SNE] Mean sigma: %f"
              % np.mean(math.sqrt(n_samples / beta_sum)))
    return np.asarray(P)
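
A quick way to sanity-check the helper above: the search drives each row's entropy
(in nats) to log(desired_perplexity), so exp(H_i) of every returned row should match
the requested perplexity. A minimal sketch, assuming a compiled scikit-learn build
where this private module is importable (the toy data and tolerance are illustrative):

    import numpy as np
    from sklearn.manifold._utils import _binary_search_perplexity

    rng = np.random.default_rng(0)
    X = rng.standard_normal((50, 3)).astype(np.float32)
    # Exact case: a square matrix of squared euclidean distances, float32.
    sqd = ((X[:, None, :] - X[None, :, :]) ** 2).sum(-1)

    P = _binary_search_perplexity(sqd, 10.0, 0)  # desired_perplexity=10, verbose=0

    # Row entropy H_i = -sum_j P_ij * ln(P_ij); zero entries contribute nothing.
    H = -np.sum(P * np.log(np.where(P > 0, P, 1.0)), axis=1)
    print(np.allclose(np.exp(H), 10.0, rtol=1e-3))  # should print True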
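
For readers who want to follow the algorithm without compiling Cython, here is a
plain-NumPy re-implementation of the per-row search. It mirrors the loop above
(doubling or halving beta until the bracket closes, then bisecting) but skips the
exact-case self-exclusion bookkeeping; the function name and defaults are made up
for illustration only:

    import numpy as np

    def binary_search_perplexity_ref(sqd_row, desired_perplexity,
                                     n_steps=100, tol=1e-5, eps=1e-8):
        """Find beta so that P_j = exp(-beta * d_j) / Z has the desired perplexity."""
        desired_entropy = np.log(desired_perplexity)
        beta, beta_min, beta_max = 1.0, -np.inf, np.inf
        for _ in range(n_steps):
            P = np.exp(-sqd_row * beta)
            sum_P = max(P.sum(), eps)
            P /= sum_P
            # log Z + beta * E[d] equals the Shannon entropy -sum(P * ln P).
            entropy = np.log(sum_P) + beta * (sqd_row * P).sum()
            diff = entropy - desired_entropy
            if abs(diff) <= tol:
                break
            if diff > 0.0:  # entropy too high -> larger beta (narrower Gaussian)
                beta_min = beta
                beta = beta * 2.0 if beta_max == np.inf else (beta + beta_max) / 2.0
            else:
                beta_max = beta
                beta = beta / 2.0 if beta_min == -np.inf else (beta + beta_min) / 2.0
        return P, beta

The doubling/halving phase establishes a finite [beta_min, beta_max] bracket when one
bound is still infinite; once both bounds are finite, plain bisection converges, which
is why 100 steps comfortably exceed what the tolerance of 1e-5 needs in practice.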