reconnect moved files to git repo
@@ -0,0 +1,414 @@
"""Incremental Principal Components Analysis."""

# Author: Kyle Kastner <kastnerkyle@gmail.com>
#         Giorgio Patrini
# License: BSD 3 clause

from numbers import Integral

import numpy as np
from scipy import linalg, sparse

from ..base import _fit_context
from ..utils import gen_batches
from ..utils._param_validation import Interval
from ..utils.extmath import _incremental_mean_and_var, svd_flip
from ._base import _BasePCA


class IncrementalPCA(_BasePCA):
    """Incremental principal components analysis (IPCA).

    Linear dimensionality reduction using Singular Value Decomposition of
    the data, keeping only the most significant singular vectors to
    project the data to a lower dimensional space. The input data is centered
    but not scaled for each feature before applying the SVD.

    Depending on the size of the input data, this algorithm can be much more
    memory efficient than a PCA, and allows sparse input.

    This algorithm has constant memory complexity, on the order
    of ``batch_size * n_features``, enabling use of np.memmap files without
    loading the entire file into memory. For sparse matrices, the input
    is converted to dense in batches (in order to be able to subtract the
    mean) which avoids storing the entire dense matrix at any one time.

    The computational overhead of each SVD is
    ``O(batch_size * n_features ** 2)``, but only 2 * batch_size samples
    remain in memory at a time. There will be ``n_samples / batch_size`` SVD
    computations to get the principal components, versus 1 large SVD of
    complexity ``O(n_samples * n_features ** 2)`` for PCA.

    For a usage example, see
    :ref:`sphx_glr_auto_examples_decomposition_plot_incremental_pca.py`.

    Read more in the :ref:`User Guide <IncrementalPCA>`.

    .. versionadded:: 0.16

    Parameters
    ----------
    n_components : int, default=None
        Number of components to keep. If ``n_components`` is ``None``,
        then ``n_components`` is set to ``min(n_samples, n_features)``.

    whiten : bool, default=False
        When True (False by default) the ``components_`` vectors are
        multiplied by the square root of ``n_samples`` and then divided by
        the singular values to ensure uncorrelated outputs with unit
        component-wise variances.

        Whitening will remove some information from the transformed signal
        (the relative variance scales of the components) but can sometimes
        improve the predictive accuracy of the downstream estimators by
        making data respect some hard-wired assumptions.

    copy : bool, default=True
        If False, X will be overwritten. ``copy=False`` can be used to
        save memory but is unsafe for general use.

    batch_size : int, default=None
        The number of samples to use for each batch. Only used when calling
        ``fit``. If ``batch_size`` is ``None``, then ``batch_size``
        is inferred from the data and set to ``5 * n_features``, to provide a
        balance between approximation accuracy and memory consumption.

    Attributes
    ----------
    components_ : ndarray of shape (n_components, n_features)
        Principal axes in feature space, representing the directions of
        maximum variance in the data. Equivalently, the right singular
        vectors of the centered input data, parallel to its eigenvectors.
        The components are sorted by decreasing ``explained_variance_``.

    explained_variance_ : ndarray of shape (n_components,)
        Variance explained by each of the selected components.

    explained_variance_ratio_ : ndarray of shape (n_components,)
        Percentage of variance explained by each of the selected components.
        If all components are stored, the sum of explained variances is equal
        to 1.0.

    singular_values_ : ndarray of shape (n_components,)
        The singular values corresponding to each of the selected components.
        The singular values are equal to the 2-norms of the ``n_components``
        variables in the lower-dimensional space.

    mean_ : ndarray of shape (n_features,)
        Per-feature empirical mean, aggregate over calls to ``partial_fit``.

    var_ : ndarray of shape (n_features,)
        Per-feature empirical variance, aggregate over calls to
        ``partial_fit``.

    noise_variance_ : float
        The estimated noise covariance following the Probabilistic PCA model
        from Tipping and Bishop 1999. See "Pattern Recognition and
        Machine Learning" by C. Bishop, 12.2.1 p. 574 or
        http://www.miketipping.com/papers/met-mppca.pdf.

    n_components_ : int
        The estimated number of components. Relevant when
        ``n_components=None``.

    n_samples_seen_ : int
        The number of samples processed by the estimator. Will be reset on
        new calls to fit, but increments across ``partial_fit`` calls.

    batch_size_ : int
        Inferred batch size from ``batch_size``.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    PCA : Principal component analysis (PCA).
    KernelPCA : Kernel Principal component analysis (KPCA).
    SparsePCA : Sparse Principal Components Analysis (SparsePCA).
    TruncatedSVD : Dimensionality reduction using truncated SVD.

    Notes
    -----
    Implements the incremental PCA model from:
    *D. Ross, J. Lim, R. Lin, M. Yang, Incremental Learning for Robust Visual
    Tracking, International Journal of Computer Vision, Volume 77, Issue 1-3,
    pp. 125-141, May 2008.*
    See https://www.cs.toronto.edu/~dross/ivt/RossLimLinYang_ijcv.pdf

    This model is an extension of the Sequential Karhunen-Loeve Transform from:
    :doi:`A. Levy and M. Lindenbaum, Sequential Karhunen-Loeve Basis Extraction and
    its Application to Images, IEEE Transactions on Image Processing, Volume 9,
    Number 8, pp. 1371-1374, August 2000. <10.1109/83.855432>`

    We have specifically abstained from an optimization used by authors of both
    papers, a QR decomposition used in specific situations to reduce the
    algorithmic complexity of the SVD. The source for this technique is
    *Matrix Computations, Third Edition, G. Golub and C. Van Loan, Chapter 5,
    Section 5.4.4, pp. 252-253*. This technique has been omitted because it is
    advantageous only when decomposing a matrix with ``n_samples`` (rows)
    >= 5/3 * ``n_features`` (columns), and hurts the readability of the
    implemented algorithm. This would be a good opportunity for future
    optimization, if it is deemed necessary.

    References
    ----------
    D. Ross, J. Lim, R. Lin, M. Yang. Incremental Learning for Robust Visual
    Tracking, International Journal of Computer Vision, Volume 77,
    Issue 1-3, pp. 125-141, May 2008.

    G. Golub and C. Van Loan. Matrix Computations, Third Edition, Chapter 5,
    Section 5.4.4, pp. 252-253.

    Examples
    --------
    >>> from sklearn.datasets import load_digits
    >>> from sklearn.decomposition import IncrementalPCA
    >>> from scipy import sparse
    >>> X, _ = load_digits(return_X_y=True)
    >>> transformer = IncrementalPCA(n_components=7, batch_size=200)
    >>> # either partially fit on smaller batches of data
    >>> transformer.partial_fit(X[:100, :])
    IncrementalPCA(batch_size=200, n_components=7)
    >>> # or let the fit function itself divide the data into batches
    >>> X_sparse = sparse.csr_matrix(X)
    >>> X_transformed = transformer.fit_transform(X_sparse)
    >>> X_transformed.shape
    (1797, 7)
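
    As an illustrative sketch of the same idea, ``partial_fit`` can be called
    on successive slices of the data; the slices could equally be read from an
    ``np.memmap``, so the full data set never has to reside in memory:

    >>> import numpy as np
    >>> ipca = IncrementalPCA(n_components=7)
    >>> for batch in np.array_split(X, 10):
    ...     _ = ipca.partial_fit(batch)
    >>> ipca.transform(X[:2]).shape
    (2, 7)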
"""
|
||||
|
||||
    _parameter_constraints: dict = {
        "n_components": [Interval(Integral, 1, None, closed="left"), None],
        "whiten": ["boolean"],
        "copy": ["boolean"],
        "batch_size": [Interval(Integral, 1, None, closed="left"), None],
    }

    def __init__(self, n_components=None, *, whiten=False, copy=True, batch_size=None):
        self.n_components = n_components
        self.whiten = whiten
        self.copy = copy
        self.batch_size = batch_size

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """Fit the model with X, using minibatches of size batch_size.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data, where `n_samples` is the number of samples and
            `n_features` is the number of features.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        self.components_ = None
        self.n_samples_seen_ = 0
        self.mean_ = 0.0
        self.var_ = 0.0
        self.singular_values_ = None
        self.explained_variance_ = None
        self.explained_variance_ratio_ = None
        self.noise_variance_ = None

        X = self._validate_data(
            X,
            accept_sparse=["csr", "csc", "lil"],
            copy=self.copy,
            dtype=[np.float64, np.float32],
            force_writeable=True,
        )
        n_samples, n_features = X.shape

        if self.batch_size is None:
            self.batch_size_ = 5 * n_features
        else:
            self.batch_size_ = self.batch_size

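        # Each slice produced by gen_batches is densified if sparse and handed
        # to partial_fit; min_batch_size keeps every batch at least
        # n_components rows, so each incremental SVD update is well posed.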
        for batch in gen_batches(
            n_samples, self.batch_size_, min_batch_size=self.n_components or 0
        ):
            X_batch = X[batch]
            if sparse.issparse(X_batch):
                X_batch = X_batch.toarray()
            self.partial_fit(X_batch, check_input=False)

        return self

    @_fit_context(prefer_skip_nested_validation=True)
    def partial_fit(self, X, y=None, check_input=True):
        """Incremental fit with X. All of X is processed as a single batch.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where `n_samples` is the number of samples and
            `n_features` is the number of features.

        y : Ignored
            Not used, present for API consistency by convention.

        check_input : bool, default=True
            Run check_array on X.

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        first_pass = not hasattr(self, "components_")

        if check_input:
            if sparse.issparse(X):
                raise TypeError(
                    "IncrementalPCA.partial_fit does not support "
                    "sparse input. Either convert data to dense "
                    "or use IncrementalPCA.fit to do so in batches."
                )
            X = self._validate_data(
                X,
                copy=self.copy,
                dtype=[np.float64, np.float32],
                force_writeable=True,
                reset=first_pass,
            )
        n_samples, n_features = X.shape
        if first_pass:
            self.components_ = None

        if self.n_components is None:
            if self.components_ is None:
                self.n_components_ = min(n_samples, n_features)
            else:
                self.n_components_ = self.components_.shape[0]
        elif not self.n_components <= n_features:
            raise ValueError(
                "n_components=%r invalid for n_features=%d, need "
                "more rows than columns for IncrementalPCA "
                "processing" % (self.n_components, n_features)
            )
        elif not self.n_components <= n_samples:
            raise ValueError(
                "n_components=%r must be less or equal to "
                "the batch number of samples "
                "%d." % (self.n_components, n_samples)
            )
        else:
            self.n_components_ = self.n_components

        if (self.components_ is not None) and (
            self.components_.shape[0] != self.n_components_
        ):
            raise ValueError(
                "Number of input features has changed from %i "
                "to %i between calls to partial_fit! Try "
                "setting n_components to a fixed value."
                % (self.components_.shape[0], self.n_components_)
            )

        # This is the first partial_fit
        if not hasattr(self, "n_samples_seen_"):
            self.n_samples_seen_ = 0
            self.mean_ = 0.0
            self.var_ = 0.0

        # Update stats - they are 0 if this is the first step
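        # _incremental_mean_and_var merges the running per-feature mean and
        # variance with this batch's statistics without revisiting past
        # samples, and returns totals for all data seen so far.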
        col_mean, col_var, n_total_samples = _incremental_mean_and_var(
            X,
            last_mean=self.mean_,
            last_variance=self.var_,
            last_sample_count=np.repeat(self.n_samples_seen_, X.shape[1]),
        )
        n_total_samples = n_total_samples[0]

        # Center the data (whitening, when requested, is applied at transform
        # time by the base class).
        if self.n_samples_seen_ == 0:
            # If it is the first step, simply center X
            X -= col_mean
        else:
            col_batch_mean = np.mean(X, axis=0)
            X -= col_batch_mean
            # Build matrix of combined previous basis and new data
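            # The stack below has three blocks: (1) the previous components
            # scaled by their singular values (a low-rank surrogate for all
            # past data), (2) the newly centered batch, and (3) a single
            # mean-correction row accounting for the shift between the old
            # running mean and this batch's mean (Ross et al., 2008).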
            mean_correction = np.sqrt(
                (self.n_samples_seen_ / n_total_samples) * n_samples
            ) * (self.mean_ - col_batch_mean)
            X = np.vstack(
                (
                    self.singular_values_.reshape((-1, 1)) * self.components_,
                    X,
                    mean_correction,
                )
            )

        U, S, Vt = linalg.svd(X, full_matrices=False, check_finite=False)
        U, Vt = svd_flip(U, Vt, u_based_decision=False)
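        # S**2 / (n_total_samples - 1) is the variance explained by each
        # direction; the ratio is taken against the total variance of all
        # samples seen so far (col_var * n_total_samples).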
        explained_variance = S**2 / (n_total_samples - 1)
        explained_variance_ratio = S**2 / np.sum(col_var * n_total_samples)

        self.n_samples_seen_ = n_total_samples
        self.components_ = Vt[: self.n_components_]
        self.singular_values_ = S[: self.n_components_]
        self.mean_ = col_mean
        self.var_ = col_var
        self.explained_variance_ = explained_variance[: self.n_components_]
        self.explained_variance_ratio_ = explained_variance_ratio[: self.n_components_]
        # we already checked `self.n_components <= n_samples` above
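        # noise_variance_ is the mean variance of the discarded components
        # (the Tipping & Bishop probabilistic PCA noise estimate); it is 0
        # when no component is discarded.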
        if self.n_components_ not in (n_samples, n_features):
            self.noise_variance_ = explained_variance[self.n_components_ :].mean()
        else:
            self.noise_variance_ = 0.0
        return self

    def transform(self, X):
        """Apply dimensionality reduction to X.

        X is projected on the first principal components previously extracted
        from a training set, using minibatches of size batch_size if X is
        sparse.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            New data, where `n_samples` is the number of samples
            and `n_features` is the number of features.

        Returns
        -------
        X_new : ndarray of shape (n_samples, n_components)
            Projection of X in the first principal components.

        Examples
        --------

        >>> import numpy as np
        >>> from sklearn.decomposition import IncrementalPCA
        >>> X = np.array([[-1, -1], [-2, -1], [-3, -2],
        ...               [1, 1], [2, 1], [3, 2]])
        >>> ipca = IncrementalPCA(n_components=2, batch_size=3)
        >>> ipca.fit(X)
        IncrementalPCA(batch_size=3, n_components=2)
        >>> ipca.transform(X) # doctest: +SKIP
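        >>> # illustrative check: the projection keeps n_samples rows and
        >>> # n_components columns
        >>> ipca.transform(X).shape
        (6, 2)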
"""
|
||||
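        # Sparse input is projected batch by batch (each batch densified
        # first), so the dense intermediate stays on the order of
        # batch_size_ * n_features in size.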
        if sparse.issparse(X):
            n_samples = X.shape[0]
            output = []
            for batch in gen_batches(
                n_samples, self.batch_size_, min_batch_size=self.n_components or 0
            ):
                output.append(super().transform(X[batch].toarray()))
            return np.vstack(output)
        else:
            return super().transform(X)