reconnect moved files to git repo

2025-08-01 04:33:03 -04:00
commit 5d3c35492d
23190 changed files with 4750716 additions and 0 deletions
--- a/venv/lib/python3.11/site-packages/sklearn/impute/_knn.py
+++ b/venv/lib/python3.11/site-packages/sklearn/impute/_knn.py
@ -0,0 +1,402 @@
+# Authors: Ashim Bhattarai <ashimb9@gmail.com>
+#          Thomas J Fan <thomasjpfan@gmail.com>
+# License: BSD 3 clause
+
+from numbers import Integral
+
+import numpy as np
+
+from ..base import _fit_context
+from ..metrics import pairwise_distances_chunked
+from ..metrics.pairwise import _NAN_METRICS
+from ..neighbors._base import _get_weights
+from ..utils._mask import _get_mask
+from ..utils._missing import is_scalar_nan
+from ..utils._param_validation import Hidden, Interval, StrOptions
+from ..utils.validation import FLOAT_DTYPES, _check_feature_names_in, check_is_fitted
+from ._base import _BaseImputer
+
+
+class KNNImputer(_BaseImputer):
+    """Imputation for completing missing values using k-Nearest Neighbors.
+
+    Each sample's missing values are imputed using the mean value from
+    `n_neighbors` nearest neighbors found in the training set. Two samples are
+    close if the features that neither is missing are close.
+
+    Read more in the :ref:`User Guide <knnimpute>`.
+
+    .. versionadded:: 0.22
+
+    Parameters
+    ----------
+    missing_values : int, float, str, np.nan or None, default=np.nan
+        The placeholder for the missing values. All occurrences of
+        `missing_values` will be imputed. For pandas' dataframes with
+        nullable integer dtypes with missing values, `missing_values`
+        should be set to np.nan, since `pd.NA` will be converted to np.nan.
+
+    n_neighbors : int, default=5
+        Number of neighboring samples to use for imputation.
+
+    weights : {'uniform', 'distance'} or callable, default='uniform'
+        Weight function used in prediction.  Possible values:
+
+        - 'uniform' : uniform weights. All points in each neighborhood are
+          weighted equally.
+        - 'distance' : weight points by the inverse of their distance.
+          in this case, closer neighbors of a query point will have a
+          greater influence than neighbors which are further away.
+        - callable : a user-defined function which accepts an
+          array of distances, and returns an array of the same shape
+          containing the weights.
+
+    metric : {'nan_euclidean'} or callable, default='nan_euclidean'
+        Distance metric for searching neighbors. Possible values:
+
+        - 'nan_euclidean'
+        - callable : a user-defined function which conforms to the definition
+          of ``func_metric(x, y, *, missing_values=np.nan)``. `x` and `y`
+          corresponds to a row (i.e. 1-D arrays) of `X` and `Y`, respectively.
+          The callable should returns a scalar distance value.
+
+    copy : bool, default=True
+        If True, a copy of X will be created. If False, imputation will
+        be done in-place whenever possible.
+
+    add_indicator : bool, default=False
+        If True, a :class:`MissingIndicator` transform will stack onto the
+        output of the imputer's transform. This allows a predictive estimator
+        to account for missingness despite imputation. If a feature has no
+        missing values at fit/train time, the feature won't appear on the
+        missing indicator even if there are missing values at transform/test
+        time.
+
+    keep_empty_features : bool, default=False
+        If True, features that consist exclusively of missing values when
+        `fit` is called are returned in results when `transform` is called.
+        The imputed value is always `0`.
+
+        .. versionadded:: 1.2
+
+    Attributes
+    ----------
+    indicator_ : :class:`~sklearn.impute.MissingIndicator`
+        Indicator used to add binary indicators for missing values.
+        ``None`` if add_indicator is False.
+
+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during :term:`fit`. Defined only when `X`
+        has feature names that are all strings.
+
+        .. versionadded:: 1.0
+
+    See Also
+    --------
+    SimpleImputer : Univariate imputer for completing missing values
+        with simple strategies.
+    IterativeImputer : Multivariate imputer that estimates values to impute for
+        each feature with missing values from all the others.
+
+    References
+    ----------
+    * `Olga Troyanskaya, Michael Cantor, Gavin Sherlock, Pat Brown, Trevor
+      Hastie, Robert Tibshirani, David Botstein and Russ B. Altman, Missing
+      value estimation methods for DNA microarrays, BIOINFORMATICS Vol. 17
+      no. 6, 2001 Pages 520-525.
+      <https://academic.oup.com/bioinformatics/article/17/6/520/272365>`_
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from sklearn.impute import KNNImputer
+    >>> X = [[1, 2, np.nan], [3, 4, 3], [np.nan, 6, 5], [8, 8, 7]]
+    >>> imputer = KNNImputer(n_neighbors=2)
+    >>> imputer.fit_transform(X)
+    array([[1. , 2. , 4. ],
+           [3. , 4. , 3. ],
+           [5.5, 6. , 5. ],
+           [8. , 8. , 7. ]])
+
+    For a more detailed example see
+    :ref:`sphx_glr_auto_examples_impute_plot_missing_values.py`.
+    """
+
+    _parameter_constraints: dict = {
+        **_BaseImputer._parameter_constraints,
+        "n_neighbors": [Interval(Integral, 1, None, closed="left")],
+        "weights": [StrOptions({"uniform", "distance"}), callable, Hidden(None)],
+        "metric": [StrOptions(set(_NAN_METRICS)), callable],
+        "copy": ["boolean"],
+    }
+
+    def __init__(
+        self,
+        *,
+        missing_values=np.nan,
+        n_neighbors=5,
+        weights="uniform",
+        metric="nan_euclidean",
+        copy=True,
+        add_indicator=False,
+        keep_empty_features=False,
+    ):
+        super().__init__(
+            missing_values=missing_values,
+            add_indicator=add_indicator,
+            keep_empty_features=keep_empty_features,
+        )
+        self.n_neighbors = n_neighbors
+        self.weights = weights
+        self.metric = metric
+        self.copy = copy
+
+    def _calc_impute(self, dist_pot_donors, n_neighbors, fit_X_col, mask_fit_X_col):
+        """Helper function to impute a single column.
+
+        Parameters
+        ----------
+        dist_pot_donors : ndarray of shape (n_receivers, n_potential_donors)
+            Distance matrix between the receivers and potential donors from
+            training set. There must be at least one non-nan distance between
+            a receiver and a potential donor.
+
+        n_neighbors : int
+            Number of neighbors to consider.
+
+        fit_X_col : ndarray of shape (n_potential_donors,)
+            Column of potential donors from training set.
+
+        mask_fit_X_col : ndarray of shape (n_potential_donors,)
+            Missing mask for fit_X_col.
+
+        Returns
+        -------
+        imputed_values: ndarray of shape (n_receivers,)
+            Imputed values for receiver.
+        """
+        # Get donors
+        donors_idx = np.argpartition(dist_pot_donors, n_neighbors - 1, axis=1)[
+            :, :n_neighbors
+        ]
+
+        # Get weight matrix from distance matrix
+        donors_dist = dist_pot_donors[
+            np.arange(donors_idx.shape[0])[:, None], donors_idx
+        ]
+
+        weight_matrix = _get_weights(donors_dist, self.weights)
+
+        # fill nans with zeros
+        if weight_matrix is not None:
+            weight_matrix[np.isnan(weight_matrix)] = 0.0
+
+        # Retrieve donor values and calculate kNN average
+        donors = fit_X_col.take(donors_idx)
+        donors_mask = mask_fit_X_col.take(donors_idx)
+        donors = np.ma.array(donors, mask=donors_mask)
+
+        return np.ma.average(donors, axis=1, weights=weight_matrix).data
+
+    @_fit_context(prefer_skip_nested_validation=True)
+    def fit(self, X, y=None):
+        """Fit the imputer on X.
+
+        Parameters
+        ----------
+        X : array-like shape of (n_samples, n_features)
+            Input data, where `n_samples` is the number of samples and
+            `n_features` is the number of features.
+
+        y : Ignored
+            Not used, present here for API consistency by convention.
+
+        Returns
+        -------
+        self : object
+            The fitted `KNNImputer` class instance.
+        """
+        # Check data integrity and calling arguments
+        if not is_scalar_nan(self.missing_values):
+            force_all_finite = True
+        else:
+            force_all_finite = "allow-nan"
+
+        X = self._validate_data(
+            X,
+            accept_sparse=False,
+            dtype=FLOAT_DTYPES,
+            force_all_finite=force_all_finite,
+            copy=self.copy,
+        )
+
+        self._fit_X = X
+        self._mask_fit_X = _get_mask(self._fit_X, self.missing_values)
+        self._valid_mask = ~np.all(self._mask_fit_X, axis=0)
+
+        super()._fit_indicator(self._mask_fit_X)
+
+        return self
+
+    def transform(self, X):
+        """Impute all missing values in X.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            The input data to complete.
+
+        Returns
+        -------
+        X : array-like of shape (n_samples, n_output_features)
+            The imputed dataset. `n_output_features` is the number of features
+            that is not always missing during `fit`.
+        """
+
+        check_is_fitted(self)
+        if not is_scalar_nan(self.missing_values):
+            force_all_finite = True
+        else:
+            force_all_finite = "allow-nan"
+        X = self._validate_data(
+            X,
+            accept_sparse=False,
+            dtype=FLOAT_DTYPES,
+            force_writeable=True,
+            force_all_finite=force_all_finite,
+            copy=self.copy,
+            reset=False,
+        )
+
+        mask = _get_mask(X, self.missing_values)
+        mask_fit_X = self._mask_fit_X
+        valid_mask = self._valid_mask
+
+        X_indicator = super()._transform_indicator(mask)
+
+        # Removes columns where the training data is all nan
+        if not np.any(mask):
+            # No missing values in X
+            if self.keep_empty_features:
+                Xc = X
+                Xc[:, ~valid_mask] = 0
+            else:
+                Xc = X[:, valid_mask]
+
+            # Even if there are no missing values in X, we still concatenate Xc
+            # with the missing value indicator matrix, X_indicator.
+            # This is to ensure that the output maintains consistency in terms
+            # of columns, regardless of whether missing values exist in X or not.
+            return super()._concatenate_indicator(Xc, X_indicator)
+
+        row_missing_idx = np.flatnonzero(mask.any(axis=1))
+
+        non_missing_fix_X = np.logical_not(mask_fit_X)
+
+        # Maps from indices from X to indices in dist matrix
+        dist_idx_map = np.zeros(X.shape[0], dtype=int)
+        dist_idx_map[row_missing_idx] = np.arange(row_missing_idx.shape[0])
+
+        def process_chunk(dist_chunk, start):
+            row_missing_chunk = row_missing_idx[start : start + len(dist_chunk)]
+
+            # Find and impute missing by column
+            for col in range(X.shape[1]):
+                if not valid_mask[col]:
+                    # column was all missing during training
+                    continue
+
+                col_mask = mask[row_missing_chunk, col]
+                if not np.any(col_mask):
+                    # column has no missing values
+                    continue
+
+                (potential_donors_idx,) = np.nonzero(non_missing_fix_X[:, col])
+
+                # receivers_idx are indices in X
+                receivers_idx = row_missing_chunk[np.flatnonzero(col_mask)]
+
+                # distances for samples that needed imputation for column
+                dist_subset = dist_chunk[dist_idx_map[receivers_idx] - start][
+                    :, potential_donors_idx
+                ]
+
+                # receivers with all nan distances impute with mean
+                all_nan_dist_mask = np.isnan(dist_subset).all(axis=1)
+                all_nan_receivers_idx = receivers_idx[all_nan_dist_mask]
+
+                if all_nan_receivers_idx.size:
+                    col_mean = np.ma.array(
+                        self._fit_X[:, col], mask=mask_fit_X[:, col]
+                    ).mean()
+                    X[all_nan_receivers_idx, col] = col_mean
+
+                    if len(all_nan_receivers_idx) == len(receivers_idx):
+                        # all receivers imputed with mean
+                        continue
+
+                    # receivers with at least one defined distance
+                    receivers_idx = receivers_idx[~all_nan_dist_mask]
+                    dist_subset = dist_chunk[dist_idx_map[receivers_idx] - start][
+                        :, potential_donors_idx
+                    ]
+
+                n_neighbors = min(self.n_neighbors, len(potential_donors_idx))
+                value = self._calc_impute(
+                    dist_subset,
+                    n_neighbors,
+                    self._fit_X[potential_donors_idx, col],
+                    mask_fit_X[potential_donors_idx, col],
+                )
+                X[receivers_idx, col] = value
+
+        # process in fixed-memory chunks
+        gen = pairwise_distances_chunked(
+            X[row_missing_idx, :],
+            self._fit_X,
+            metric=self.metric,
+            missing_values=self.missing_values,
+            force_all_finite=force_all_finite,
+            reduce_func=process_chunk,
+        )
+        for chunk in gen:
+            # process_chunk modifies X in place. No return value.
+            pass
+
+        if self.keep_empty_features:
+            Xc = X
+            Xc[:, ~valid_mask] = 0
+        else:
+            Xc = X[:, valid_mask]
+
+        return super()._concatenate_indicator(Xc, X_indicator)
+
+    def get_feature_names_out(self, input_features=None):
+        """Get output feature names for transformation.
+
+        Parameters
+        ----------
+        input_features : array-like of str or None, default=None
+            Input features.
+
+            - If `input_features` is `None`, then `feature_names_in_` is
+              used as feature names in. If `feature_names_in_` is not defined,
+              then the following input feature names are generated:
+              `["x0", "x1", ..., "x(n_features_in_ - 1)"]`.
+            - If `input_features` is an array-like, then `input_features` must
+              match `feature_names_in_` if `feature_names_in_` is defined.
+
+        Returns
+        -------
+        feature_names_out : ndarray of str objects
+            Transformed feature names.
+        """
+        check_is_fitted(self, "n_features_in_")
+        input_features = _check_feature_names_in(self, input_features)
+        names = input_features[self._valid_mask]
+        return self._concatenate_indicator_feature_names_out(names, input_features)