reconnect moved files to git repo

2025-08-01 04:33:03 -04:00
commit 5d3c35492d
23190 changed files with 4750716 additions and 0 deletions
--- a/venv/lib/python3.11/site-packages/sklearn/feature_selection/_sequential.py
+++ b/venv/lib/python3.11/site-packages/sklearn/feature_selection/_sequential.py
@ -0,0 +1,304 @@
+"""
+Sequential feature selection
+"""
+
+from numbers import Integral, Real
+
+import numpy as np
+
+from ..base import BaseEstimator, MetaEstimatorMixin, _fit_context, clone, is_classifier
+from ..metrics import get_scorer_names
+from ..model_selection import check_cv, cross_val_score
+from ..utils._param_validation import HasMethods, Interval, RealNotInt, StrOptions
+from ..utils._tags import _safe_tags
+from ..utils.metadata_routing import _RoutingNotSupportedMixin
+from ..utils.validation import check_is_fitted
+from ._base import SelectorMixin
+
+
+class SequentialFeatureSelector(
+    _RoutingNotSupportedMixin, SelectorMixin, MetaEstimatorMixin, BaseEstimator
+):
+    """Transformer that performs Sequential Feature Selection.
+
+    This Sequential Feature Selector adds (forward selection) or
+    removes (backward selection) features to form a feature subset in a
+    greedy fashion. At each stage, this estimator chooses the best feature to
+    add or remove based on the cross-validation score of an estimator. In
+    the case of unsupervised learning, this Sequential Feature Selector
+    looks only at the features (X), not the desired outputs (y).
+
+    Read more in the :ref:`User Guide <sequential_feature_selection>`.
+
+    .. versionadded:: 0.24
+
+    Parameters
+    ----------
+    estimator : estimator instance
+        An unfitted estimator.
+
+    n_features_to_select : "auto", int or float, default="auto"
+        If `"auto"`, the behaviour depends on the `tol` parameter:
+
+        - if `tol` is not `None`, then features are selected while the score
+          change does not exceed `tol`.
+        - otherwise, half of the features are selected.
+
+        If integer, the parameter is the absolute number of features to select.
+        If float between 0 and 1, it is the fraction of features to select.
+
+        .. versionadded:: 1.1
+           The option `"auto"` was added in version 1.1.
+
+        .. versionchanged:: 1.3
+           The default changed from `"warn"` to `"auto"` in 1.3.
+
+    tol : float, default=None
+        If the score is not incremented by at least `tol` between two
+        consecutive feature additions or removals, stop adding or removing.
+
+        `tol` can be negative when removing features using `direction="backward"`.
+        `tol` is required to be strictly positive when doing forward selection.
+        It can be useful to reduce the number of features at the cost of a small
+        decrease in the score.
+
+        `tol` is enabled only when `n_features_to_select` is `"auto"`.
+
+        .. versionadded:: 1.1
+
+    direction : {'forward', 'backward'}, default='forward'
+        Whether to perform forward selection or backward selection.
+
+    scoring : str or callable, default=None
+        A single str (see :ref:`scoring_parameter`) or a callable
+        (see :ref:`scoring`) to evaluate the predictions on the test set.
+
+        NOTE that when using a custom scorer, it should return a single
+        value.
+
+        If None, the estimator's score method is used.
+
+    cv : int, cross-validation generator or an iterable, default=None
+        Determines the cross-validation splitting strategy.
+        Possible inputs for cv are:
+
+        - None, to use the default 5-fold cross validation,
+        - integer, to specify the number of folds in a `(Stratified)KFold`,
+        - :term:`CV splitter`,
+        - An iterable yielding (train, test) splits as arrays of indices.
+
+        For integer/None inputs, if the estimator is a classifier and ``y`` is
+        either binary or multiclass,
+        :class:`~sklearn.model_selection.StratifiedKFold` is used. In all other
+        cases, :class:`~sklearn.model_selection.KFold` is used. These splitters
+        are instantiated with `shuffle=False` so the splits will be the same
+        across calls.
+
+        Refer :ref:`User Guide <cross_validation>` for the various
+        cross-validation strategies that can be used here.
+
+    n_jobs : int, default=None
+        Number of jobs to run in parallel. When evaluating a new feature to
+        add or remove, the cross-validation procedure is parallel over the
+        folds.
+        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
+        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
+        for more details.
+
+    Attributes
+    ----------
+    n_features_in_ : int
+        Number of features seen during :term:`fit`. Only defined if the
+        underlying estimator exposes such an attribute when fit.
+
+        .. versionadded:: 0.24
+
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during :term:`fit`. Defined only when `X`
+        has feature names that are all strings.
+
+        .. versionadded:: 1.0
+
+    n_features_to_select_ : int
+        The number of features that were selected.
+
+    support_ : ndarray of shape (n_features,), dtype=bool
+        The mask of selected features.
+
+    See Also
+    --------
+    GenericUnivariateSelect : Univariate feature selector with configurable
+        strategy.
+    RFE : Recursive feature elimination based on importance weights.
+    RFECV : Recursive feature elimination based on importance weights, with
+        automatic selection of the number of features.
+    SelectFromModel : Feature selection based on thresholds of importance
+        weights.
+
+    Examples
+    --------
+    >>> from sklearn.feature_selection import SequentialFeatureSelector
+    >>> from sklearn.neighbors import KNeighborsClassifier
+    >>> from sklearn.datasets import load_iris
+    >>> X, y = load_iris(return_X_y=True)
+    >>> knn = KNeighborsClassifier(n_neighbors=3)
+    >>> sfs = SequentialFeatureSelector(knn, n_features_to_select=3)
+    >>> sfs.fit(X, y)
+    SequentialFeatureSelector(estimator=KNeighborsClassifier(n_neighbors=3),
+                              n_features_to_select=3)
+    >>> sfs.get_support()
+    array([ True, False,  True,  True])
+    >>> sfs.transform(X).shape
+    (150, 3)
+    """
+
+    _parameter_constraints: dict = {
+        "estimator": [HasMethods(["fit"])],
+        "n_features_to_select": [
+            StrOptions({"auto"}),
+            Interval(RealNotInt, 0, 1, closed="right"),
+            Interval(Integral, 0, None, closed="neither"),
+        ],
+        "tol": [None, Interval(Real, None, None, closed="neither")],
+        "direction": [StrOptions({"forward", "backward"})],
+        "scoring": [None, StrOptions(set(get_scorer_names())), callable],
+        "cv": ["cv_object"],
+        "n_jobs": [None, Integral],
+    }
+
+    def __init__(
+        self,
+        estimator,
+        *,
+        n_features_to_select="auto",
+        tol=None,
+        direction="forward",
+        scoring=None,
+        cv=5,
+        n_jobs=None,
+    ):
+        self.estimator = estimator
+        self.n_features_to_select = n_features_to_select
+        self.tol = tol
+        self.direction = direction
+        self.scoring = scoring
+        self.cv = cv
+        self.n_jobs = n_jobs
+
+    @_fit_context(
+        # SequentialFeatureSelector.estimator is not validated yet
+        prefer_skip_nested_validation=False
+    )
+    def fit(self, X, y=None):
+        """Learn the features to select from X.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Training vectors, where `n_samples` is the number of samples and
+            `n_features` is the number of predictors.
+
+        y : array-like of shape (n_samples,), default=None
+            Target values. This parameter may be ignored for
+            unsupervised learning.
+
+        Returns
+        -------
+        self : object
+            Returns the instance itself.
+        """
+        tags = self._get_tags()
+        X = self._validate_data(
+            X,
+            accept_sparse="csc",
+            ensure_min_features=2,
+            force_all_finite=not tags.get("allow_nan", True),
+        )
+        n_features = X.shape[1]
+
+        if self.n_features_to_select == "auto":
+            if self.tol is not None:
+                # With auto feature selection, `n_features_to_select_` will be updated
+                # to `support_.sum()` after features are selected.
+                self.n_features_to_select_ = n_features - 1
+            else:
+                self.n_features_to_select_ = n_features // 2
+        elif isinstance(self.n_features_to_select, Integral):
+            if self.n_features_to_select >= n_features:
+                raise ValueError("n_features_to_select must be < n_features.")
+            self.n_features_to_select_ = self.n_features_to_select
+        elif isinstance(self.n_features_to_select, Real):
+            self.n_features_to_select_ = int(n_features * self.n_features_to_select)
+
+        if self.tol is not None and self.tol < 0 and self.direction == "forward":
+            raise ValueError(
+                "tol must be strictly positive when doing forward selection"
+            )
+
+        cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator))
+
+        cloned_estimator = clone(self.estimator)
+
+        # the current mask corresponds to the set of features:
+        # - that we have already *selected* if we do forward selection
+        # - that we have already *excluded* if we do backward selection
+        current_mask = np.zeros(shape=n_features, dtype=bool)
+        n_iterations = (
+            self.n_features_to_select_
+            if self.n_features_to_select == "auto" or self.direction == "forward"
+            else n_features - self.n_features_to_select_
+        )
+
+        old_score = -np.inf
+        is_auto_select = self.tol is not None and self.n_features_to_select == "auto"
+        for _ in range(n_iterations):
+            new_feature_idx, new_score = self._get_best_new_feature_score(
+                cloned_estimator, X, y, cv, current_mask
+            )
+            if is_auto_select and ((new_score - old_score) < self.tol):
+                break
+
+            old_score = new_score
+            current_mask[new_feature_idx] = True
+
+        if self.direction == "backward":
+            current_mask = ~current_mask
+
+        self.support_ = current_mask
+        self.n_features_to_select_ = self.support_.sum()
+
+        return self
+
+    def _get_best_new_feature_score(self, estimator, X, y, cv, current_mask):
+        # Return the best new feature and its score to add to the current_mask,
+        # i.e. return the best new feature and its score to add (resp. remove)
+        # when doing forward selection (resp. backward selection).
+        # Feature will be added if the current score and past score are greater
+        # than tol when n_feature is auto,
+        candidate_feature_indices = np.flatnonzero(~current_mask)
+        scores = {}
+        for feature_idx in candidate_feature_indices:
+            candidate_mask = current_mask.copy()
+            candidate_mask[feature_idx] = True
+            if self.direction == "backward":
+                candidate_mask = ~candidate_mask
+            X_new = X[:, candidate_mask]
+            scores[feature_idx] = cross_val_score(
+                estimator,
+                X_new,
+                y,
+                cv=cv,
+                scoring=self.scoring,
+                n_jobs=self.n_jobs,
+            ).mean()
+        new_feature_idx = max(scores, key=lambda feature_idx: scores[feature_idx])
+        return new_feature_idx, scores[new_feature_idx]
+
+    def _get_support_mask(self):
+        check_is_fitted(self)
+        return self.support_
+
+    def _more_tags(self):
+        return {
+            "allow_nan": _safe_tags(self.estimator, key="allow_nan"),
+        }