reconnect moved files to git repo
This commit is contained in:
656
venv/lib/python3.11/site-packages/sklearn/manifold/_mds.py
Normal file
656
venv/lib/python3.11/site-packages/sklearn/manifold/_mds.py
Normal file
@ -0,0 +1,656 @@
|
||||
"""
|
||||
Multi-dimensional Scaling (MDS).
|
||||
"""
|
||||
|
||||
# author: Nelle Varoquaux <nelle.varoquaux@gmail.com>
|
||||
# License: BSD
|
||||
|
||||
import warnings
|
||||
from numbers import Integral, Real
|
||||
|
||||
import numpy as np
|
||||
from joblib import effective_n_jobs
|
||||
|
||||
from ..base import BaseEstimator, _fit_context
|
||||
from ..isotonic import IsotonicRegression
|
||||
from ..metrics import euclidean_distances
|
||||
from ..utils import check_array, check_random_state, check_symmetric
|
||||
from ..utils._param_validation import Interval, StrOptions, validate_params
|
||||
from ..utils.parallel import Parallel, delayed
|
||||
|
||||
|
||||
def _smacof_single(
    dissimilarities,
    metric=True,
    n_components=2,
    init=None,
    max_iter=300,
    verbose=0,
    eps=1e-3,
    random_state=None,
    normalized_stress=False,
):
    """Computes multidimensional scaling using SMACOF algorithm.

    Parameters
    ----------
    dissimilarities : ndarray of shape (n_samples, n_samples)
        Pairwise dissimilarities between the points. Must be symmetric.

    metric : bool, default=True
        Compute metric or nonmetric SMACOF algorithm.
        When ``False`` (i.e. non-metric MDS), dissimilarities with 0 are considered as
        missing values.

    n_components : int, default=2
        Number of dimensions in which to immerse the dissimilarities. If an
        ``init`` array is provided, this option is overridden and the shape of
        ``init`` is used to determine the dimensionality of the embedding
        space.

    init : ndarray of shape (n_samples, n_components), default=None
        Starting configuration of the embedding to initialize the algorithm. By
        default, the algorithm is initialized with a randomly chosen array.

    max_iter : int, default=300
        Maximum number of iterations of the SMACOF algorithm for a single run.

    verbose : int, default=0
        Level of verbosity.

    eps : float, default=1e-3
        Relative tolerance with respect to stress at which to declare
        convergence. The value of `eps` should be tuned separately depending
        on whether or not `normalized_stress` is being used.

    random_state : int, RandomState instance or None, default=None
        Determines the random number generator used to initialize the centers.
        Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.

    normalized_stress : bool, default=False
        Whether use and return normed stress value (Stress-1) instead of raw
        stress calculated by default. Only supported in non-metric MDS. The
        caller must ensure that if `normalized_stress=True` then `metric=False`

        .. versionadded:: 1.2

    Returns
    -------
    X : ndarray of shape (n_samples, n_components)
        Coordinates of the points in a ``n_components``-space.

    stress : float
        The final value of the stress (sum of squared distance of the
        disparities and the distances for all constrained points).
        If `normalized_stress=True`, and `metric=False` returns Stress-1.
        A value of 0 indicates "perfect" fit, 0.025 excellent, 0.05 good,
        0.1 fair, and 0.2 poor [1]_.

    n_iter : int
        The number of iterations corresponding to the best stress.

    References
    ----------
    .. [1] "Nonmetric multidimensional scaling: a numerical method" Kruskal, J.
       Psychometrika, 29 (1964)

    .. [2] "Multidimensional scaling by optimizing goodness of fit to a nonmetric
       hypothesis" Kruskal, J. Psychometrika, 29, (1964)

    .. [3] "Modern Multidimensional Scaling - Theory and Applications" Borg, I.;
       Groenen P. Springer Series in Statistics (1997)
    """
    dissimilarities = check_symmetric(dissimilarities, raise_exception=True)

    n_samples = dissimilarities.shape[0]
    random_state = check_random_state(random_state)

    # Keep only the strict upper triangle of the dissimilarity matrix
    # (np.tri gives the lower triangle incl. diagonal; 1 - tri masks it out),
    # flattened. Zeros are treated as missing values in the non-metric case.
    sim_flat = ((1 - np.tri(n_samples)) * dissimilarities).ravel()
    sim_flat_w = sim_flat[sim_flat != 0]
    if init is None:
        # Randomly choose initial configuration
        X = random_state.uniform(size=n_samples * n_components)
        X = X.reshape((n_samples, n_components))
    else:
        # overrides the parameter p
        n_components = init.shape[1]
        if n_samples != init.shape[0]:
            raise ValueError(
                "init matrix should be of shape (%d, %d)" % (n_samples, n_components)
            )
        X = init

    old_stress = None
    ir = IsotonicRegression()
    for it in range(max_iter):
        # Compute distance and monotonic regression
        dis = euclidean_distances(X)

        if metric:
            # Metric MDS: disparities are the raw dissimilarities themselves.
            disparities = dissimilarities
        else:
            dis_flat = dis.ravel()
            # dissimilarities with 0 are considered as missing values
            dis_flat_w = dis_flat[sim_flat != 0]

            # Compute the disparities using a monotonic regression
            disparities_flat = ir.fit_transform(sim_flat_w, dis_flat_w)
            disparities = dis_flat.copy()
            disparities[sim_flat != 0] = disparities_flat
            disparities = disparities.reshape((n_samples, n_samples))
            # Rescale so that the sum of squared disparities equals the
            # number of unique point pairs, n_samples * (n_samples - 1) / 2.
            disparities *= np.sqrt(
                (n_samples * (n_samples - 1) / 2) / (disparities**2).sum()
            )

        # Compute stress
        stress = ((dis.ravel() - disparities.ravel()) ** 2).sum() / 2
        if normalized_stress:
            # Stress-1 (Kruskal): sqrt of raw stress over sum of squared
            # disparities (each over unique pairs, hence the /2 cancelling).
            stress = np.sqrt(stress / ((disparities.ravel() ** 2).sum() / 2))
        # Update X using the Guttman transform
        # Guard against division by zero for coincident points.
        dis[dis == 0] = 1e-5
        ratio = disparities / dis
        B = -ratio
        # Set the diagonal of B to the row sums of `ratio` (B has zero row
        # sums by construction of the majorizing function).
        B[np.arange(len(B)), np.arange(len(B))] += ratio.sum(axis=1)
        X = 1.0 / n_samples * np.dot(B, X)

        # `dis` is reused here as a scalar: the sum of row norms of X,
        # used to normalize the stress in the convergence test below.
        dis = np.sqrt((X**2).sum(axis=1)).sum()
        if verbose >= 2:
            print("it: %d, stress %s" % (it, stress))
        if old_stress is not None:
            # Converged when the decrease in normalized stress drops below eps.
            if (old_stress - stress / dis) < eps:
                if verbose:
                    print("breaking at iteration %d with stress %s" % (it, stress))
                break
        old_stress = stress / dis

    return X, stress, it + 1
@validate_params(
    {
        "dissimilarities": ["array-like"],
        "metric": ["boolean"],
        "n_components": [Interval(Integral, 1, None, closed="left")],
        "init": ["array-like", None],
        "n_init": [Interval(Integral, 1, None, closed="left")],
        "n_jobs": [Integral, None],
        "max_iter": [Interval(Integral, 1, None, closed="left")],
        "verbose": ["verbose"],
        "eps": [Interval(Real, 0, None, closed="left")],
        "random_state": ["random_state"],
        "return_n_iter": ["boolean"],
        "normalized_stress": ["boolean", StrOptions({"auto"})],
    },
    prefer_skip_nested_validation=True,
)
def smacof(
    dissimilarities,
    *,
    metric=True,
    n_components=2,
    init=None,
    n_init=8,
    n_jobs=None,
    max_iter=300,
    verbose=0,
    eps=1e-3,
    random_state=None,
    return_n_iter=False,
    normalized_stress="auto",
):
    """Compute multidimensional scaling using the SMACOF algorithm.

    SMACOF (Scaling by MAjorizing a COmplicated Function) is a
    multidimensional scaling algorithm that minimizes an objective function
    (the *stress*) via majorization. Stress majorization, also known as the
    Guttman Transform, guarantees monotone convergence of the stress and is
    more powerful than traditional techniques such as gradient descent.

    For metric MDS the SMACOF algorithm amounts to the following steps:

    1. Set an initial start configuration, randomly or not.
    2. Compute the stress
    3. Compute the Guttman Transform
    4. Iterate 2 and 3 until convergence.

    The nonmetric variant additionally performs a monotonic regression step
    before computing the stress.

    Parameters
    ----------
    dissimilarities : array-like of shape (n_samples, n_samples)
        Pairwise dissimilarities between the points. Must be symmetric.

    metric : bool, default=True
        Compute metric or nonmetric SMACOF algorithm.
        When ``False`` (i.e. non-metric MDS), dissimilarities with 0 are considered as
        missing values.

    n_components : int, default=2
        Number of dimensions in which to immerse the dissimilarities. If an
        ``init`` array is provided, this option is overridden and the shape of
        ``init`` is used to determine the dimensionality of the embedding
        space.

    init : array-like of shape (n_samples, n_components), default=None
        Starting configuration of the embedding to initialize the algorithm. By
        default, the algorithm is initialized with a randomly chosen array.

    n_init : int, default=8
        Number of times the SMACOF algorithm will be run with different
        initializations. The final results will be the best output of the runs,
        determined by the run with the smallest final stress. If ``init`` is
        provided, this option is overridden and a single run is performed.

    n_jobs : int, default=None
        The number of jobs to use for the computation. If multiple
        initializations are used (``n_init``), each run of the algorithm is
        computed in parallel.

        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    max_iter : int, default=300
        Maximum number of iterations of the SMACOF algorithm for a single run.

    verbose : int, default=0
        Level of verbosity.

    eps : float, default=1e-3
        Relative tolerance with respect to stress at which to declare
        convergence. The value of `eps` should be tuned separately depending
        on whether or not `normalized_stress` is being used.

    random_state : int, RandomState instance or None, default=None
        Determines the random number generator used to initialize the centers.
        Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.

    return_n_iter : bool, default=False
        Whether or not to return the number of iterations.

    normalized_stress : bool or "auto" default="auto"
        Whether use and return normed stress value (Stress-1) instead of raw
        stress calculated by default. Only supported in non-metric MDS.

        .. versionadded:: 1.2

        .. versionchanged:: 1.4
           The default value changed from `False` to `"auto"` in version 1.4.

    Returns
    -------
    X : ndarray of shape (n_samples, n_components)
        Coordinates of the points in a ``n_components``-space.

    stress : float
        The final value of the stress (sum of squared distance of the
        disparities and the distances for all constrained points).
        If `normalized_stress=True`, and `metric=False` returns Stress-1.
        A value of 0 indicates "perfect" fit, 0.025 excellent, 0.05 good,
        0.1 fair, and 0.2 poor [1]_.

    n_iter : int
        The number of iterations corresponding to the best stress. Returned
        only if ``return_n_iter`` is set to ``True``.

    References
    ----------
    .. [1] "Nonmetric multidimensional scaling: a numerical method" Kruskal, J.
       Psychometrika, 29 (1964)

    .. [2] "Multidimensional scaling by optimizing goodness of fit to a nonmetric
       hypothesis" Kruskal, J. Psychometrika, 29, (1964)

    .. [3] "Modern Multidimensional Scaling - Theory and Applications" Borg, I.;
       Groenen P. Springer Series in Statistics (1997)

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.manifold import smacof
    >>> from sklearn.metrics import euclidean_distances
    >>> X = np.array([[0, 1, 2], [1, 0, 3],[2, 3, 0]])
    >>> dissimilarities = euclidean_distances(X)
    >>> mds_result, stress = smacof(dissimilarities, n_components=2, random_state=42)
    >>> mds_result
    array([[ 0.05... -1.07... ],
           [ 1.74..., -0.75...],
           [-1.79..., 1.83...]])
    >>> stress
    np.float64(0.0012...)
    """
    dissimilarities = check_array(dissimilarities)
    random_state = check_random_state(random_state)

    # Resolve the "auto" setting: normalized stress (Stress-1) only makes
    # sense for non-metric MDS.
    if normalized_stress == "auto":
        normalized_stress = not metric

    if normalized_stress and metric:
        raise ValueError(
            "Normalized stress is not supported for metric MDS. Either set"
            " `normalized_stress=False` or use `metric=False`."
        )

    if hasattr(init, "__array__"):
        init = np.asarray(init).copy()
        # With a fixed starting configuration every restart would be
        # identical, so collapse to a single run.
        if n_init != 1:
            warnings.warn(
                "Explicit initial positions passed: "
                "performing only one init of the MDS instead of %d" % n_init
            )
            n_init = 1

    best_pos, best_stress = None, None

    if effective_n_jobs(n_jobs) == 1:
        # Sequential path: run restarts one after another, tracking the
        # embedding with the lowest stress seen so far.
        for _ in range(n_init):
            pos, stress, n_iter_ = _smacof_single(
                dissimilarities,
                metric=metric,
                n_components=n_components,
                init=init,
                max_iter=max_iter,
                verbose=verbose,
                eps=eps,
                random_state=random_state,
                normalized_stress=normalized_stress,
            )
            if best_stress is None or stress < best_stress:
                best_stress = stress
                best_pos = pos.copy()
                best_iter = n_iter_
    else:
        # Parallel path: draw one integer seed per restart so each worker
        # gets an independent, reproducible RNG.
        seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init)
        results = Parallel(n_jobs=n_jobs, verbose=max(verbose - 1, 0))(
            delayed(_smacof_single)(
                dissimilarities,
                metric=metric,
                n_components=n_components,
                init=init,
                max_iter=max_iter,
                verbose=verbose,
                eps=eps,
                random_state=seed,
                normalized_stress=normalized_stress,
            )
            for seed in seeds
        )
        positions, stresses, n_iters = zip(*results)
        best = np.argmin(stresses)
        best_pos, best_stress, best_iter = (
            positions[best],
            stresses[best],
            n_iters[best],
        )

    if return_n_iter:
        return best_pos, best_stress, best_iter
    return best_pos, best_stress
class MDS(BaseEstimator):
    """Multidimensional scaling.

    Read more in the :ref:`User Guide <multidimensional_scaling>`.

    Parameters
    ----------
    n_components : int, default=2
        Number of dimensions in which to immerse the dissimilarities.

    metric : bool, default=True
        If ``True``, perform metric MDS; otherwise, perform nonmetric MDS.
        When ``False`` (i.e. non-metric MDS), dissimilarities with 0 are considered as
        missing values.

    n_init : int, default=4
        Number of times the SMACOF algorithm will be run with different
        initializations. The final results will be the best output of the runs,
        determined by the run with the smallest final stress.

    max_iter : int, default=300
        Maximum number of iterations of the SMACOF algorithm for a single run.

    verbose : int, default=0
        Level of verbosity.

    eps : float, default=1e-3
        Relative tolerance with respect to stress at which to declare
        convergence. The value of `eps` should be tuned separately depending
        on whether or not `normalized_stress` is being used.

    n_jobs : int, default=None
        The number of jobs to use for the computation. If multiple
        initializations are used (``n_init``), each run of the algorithm is
        computed in parallel.

        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    random_state : int, RandomState instance or None, default=None
        Determines the random number generator used to initialize the centers.
        Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.

    dissimilarity : {'euclidean', 'precomputed'}, default='euclidean'
        Dissimilarity measure to use:

        - 'euclidean':
            Pairwise Euclidean distances between points in the dataset.

        - 'precomputed':
            Pre-computed dissimilarities are passed directly to ``fit`` and
            ``fit_transform``.

    normalized_stress : bool or "auto" default="auto"
        Whether use and return normed stress value (Stress-1) instead of raw
        stress calculated by default. Only supported in non-metric MDS.

        .. versionadded:: 1.2

        .. versionchanged:: 1.4
           The default value changed from `False` to `"auto"` in version 1.4.

    Attributes
    ----------
    embedding_ : ndarray of shape (n_samples, n_components)
        Stores the position of the dataset in the embedding space.

    stress_ : float
        The final value of the stress (sum of squared distance of the
        disparities and the distances for all constrained points).
        If `normalized_stress=True`, and `metric=False` returns Stress-1.
        A value of 0 indicates "perfect" fit, 0.025 excellent, 0.05 good,
        0.1 fair, and 0.2 poor [1]_.

    dissimilarity_matrix_ : ndarray of shape (n_samples, n_samples)
        Pairwise dissimilarities between the points. Symmetric matrix that:

        - either uses a custom dissimilarity matrix by setting `dissimilarity`
          to 'precomputed';
        - or constructs a dissimilarity matrix from data using
          Euclidean distances.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    n_iter_ : int
        The number of iterations corresponding to the best stress.

    See Also
    --------
    sklearn.decomposition.PCA : Principal component analysis that is a linear
        dimensionality reduction method.
    sklearn.decomposition.KernelPCA : Non-linear dimensionality reduction using
        kernels and PCA.
    TSNE : T-distributed Stochastic Neighbor Embedding.
    Isomap : Manifold learning based on Isometric Mapping.
    LocallyLinearEmbedding : Manifold learning using Locally Linear Embedding.
    SpectralEmbedding : Spectral embedding for non-linear dimensionality.

    References
    ----------
    .. [1] "Nonmetric multidimensional scaling: a numerical method" Kruskal, J.
       Psychometrika, 29 (1964)

    .. [2] "Multidimensional scaling by optimizing goodness of fit to a nonmetric
       hypothesis" Kruskal, J. Psychometrika, 29, (1964)

    .. [3] "Modern Multidimensional Scaling - Theory and Applications" Borg, I.;
       Groenen P. Springer Series in Statistics (1997)

    Examples
    --------
    >>> from sklearn.datasets import load_digits
    >>> from sklearn.manifold import MDS
    >>> X, _ = load_digits(return_X_y=True)
    >>> X.shape
    (1797, 64)
    >>> embedding = MDS(n_components=2, normalized_stress='auto')
    >>> X_transformed = embedding.fit_transform(X[:100])
    >>> X_transformed.shape
    (100, 2)

    For a more detailed example of usage, see
    :ref:`sphx_glr_auto_examples_manifold_plot_mds.py`.

    For a comparison of manifold learning techniques, see
    :ref:`sphx_glr_auto_examples_manifold_plot_compare_methods.py`.
    """

    _parameter_constraints: dict = {
        "n_components": [Interval(Integral, 1, None, closed="left")],
        "metric": ["boolean"],
        "n_init": [Interval(Integral, 1, None, closed="left")],
        "max_iter": [Interval(Integral, 1, None, closed="left")],
        "verbose": ["verbose"],
        "eps": [Interval(Real, 0.0, None, closed="left")],
        "n_jobs": [None, Integral],
        "random_state": ["random_state"],
        "dissimilarity": [StrOptions({"euclidean", "precomputed"})],
        "normalized_stress": ["boolean", StrOptions({"auto"})],
    }

    def __init__(
        self,
        n_components=2,
        *,
        metric=True,
        n_init=4,
        max_iter=300,
        verbose=0,
        eps=1e-3,
        n_jobs=None,
        random_state=None,
        dissimilarity="euclidean",
        normalized_stress="auto",
    ):
        # Store parameters untouched, as required by the scikit-learn
        # estimator contract; validation happens at fit time.
        self.n_components = n_components
        self.metric = metric
        self.n_init = n_init
        self.max_iter = max_iter
        self.verbose = verbose
        self.eps = eps
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.dissimilarity = dissimilarity
        self.normalized_stress = normalized_stress

    def _more_tags(self):
        # Mark the estimator as pairwise when it consumes a precomputed
        # dissimilarity matrix rather than a feature matrix.
        return {"pairwise": self.dissimilarity == "precomputed"}

    def fit(self, X, y=None, init=None):
        """
        Compute the position of the points in the embedding space.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features) or \
                (n_samples, n_samples)
            Input data. If ``dissimilarity=='precomputed'``, the input should
            be the dissimilarity matrix.

        y : Ignored
            Not used, present for API consistency by convention.

        init : ndarray of shape (n_samples, n_components), default=None
            Starting configuration of the embedding to initialize the SMACOF
            algorithm. By default, the algorithm is initialized with a randomly
            chosen array.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        # Delegate all work to fit_transform; the computed embedding is kept
        # in self.embedding_ and the transformed data is simply discarded.
        self.fit_transform(X, init=init)
        return self

    @_fit_context(prefer_skip_nested_validation=True)
    def fit_transform(self, X, y=None, init=None):
        """
        Fit the data from `X`, and returns the embedded coordinates.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features) or \
                (n_samples, n_samples)
            Input data. If ``dissimilarity=='precomputed'``, the input should
            be the dissimilarity matrix.

        y : Ignored
            Not used, present for API consistency by convention.

        init : ndarray of shape (n_samples, n_components), default=None
            Starting configuration of the embedding to initialize the SMACOF
            algorithm. By default, the algorithm is initialized with a randomly
            chosen array.

        Returns
        -------
        X_new : ndarray of shape (n_samples, n_components)
            X transformed in the new space.
        """
        X = self._validate_data(X)
        # A square feature matrix is suspicious: the user may have intended
        # to pass a precomputed dissimilarity matrix. Warn, but proceed.
        if X.shape[0] == X.shape[1] and self.dissimilarity != "precomputed":
            warnings.warn(
                "The MDS API has changed. ``fit`` now constructs an"
                " dissimilarity matrix from data. To use a custom "
                "dissimilarity matrix, set "
                "``dissimilarity='precomputed'``."
            )

        if self.dissimilarity == "euclidean":
            self.dissimilarity_matrix_ = euclidean_distances(X)
        elif self.dissimilarity == "precomputed":
            self.dissimilarity_matrix_ = X

        # Run SMACOF with possibly multiple restarts; keep the best run.
        self.embedding_, self.stress_, self.n_iter_ = smacof(
            self.dissimilarity_matrix_,
            metric=self.metric,
            n_components=self.n_components,
            init=init,
            n_init=self.n_init,
            n_jobs=self.n_jobs,
            max_iter=self.max_iter,
            verbose=self.verbose,
            eps=self.eps,
            random_state=self.random_state,
            return_n_iter=True,
            normalized_stress=self.normalized_stress,
        )

        return self.embedding_
Reference in New Issue
Block a user