""" Sequential feature selection """ from numbers import Integral, Real import numpy as np from ..base import BaseEstimator, MetaEstimatorMixin, _fit_context, clone, is_classifier from ..metrics import get_scorer_names from ..model_selection import check_cv, cross_val_score from ..utils._param_validation import HasMethods, Interval, RealNotInt, StrOptions from ..utils._tags import _safe_tags from ..utils.metadata_routing import _RoutingNotSupportedMixin from ..utils.validation import check_is_fitted from ._base import SelectorMixin class SequentialFeatureSelector( _RoutingNotSupportedMixin, SelectorMixin, MetaEstimatorMixin, BaseEstimator ): """Transformer that performs Sequential Feature Selection. This Sequential Feature Selector adds (forward selection) or removes (backward selection) features to form a feature subset in a greedy fashion. At each stage, this estimator chooses the best feature to add or remove based on the cross-validation score of an estimator. In the case of unsupervised learning, this Sequential Feature Selector looks only at the features (X), not the desired outputs (y). Read more in the :ref:`User Guide `. .. versionadded:: 0.24 Parameters ---------- estimator : estimator instance An unfitted estimator. n_features_to_select : "auto", int or float, default="auto" If `"auto"`, the behaviour depends on the `tol` parameter: - if `tol` is not `None`, then features are selected while the score change does not exceed `tol`. - otherwise, half of the features are selected. If integer, the parameter is the absolute number of features to select. If float between 0 and 1, it is the fraction of features to select. .. versionadded:: 1.1 The option `"auto"` was added in version 1.1. .. versionchanged:: 1.3 The default changed from `"warn"` to `"auto"` in 1.3. tol : float, default=None If the score is not incremented by at least `tol` between two consecutive feature additions or removals, stop adding or removing. `tol` can be negative when removing features using `direction="backward"`. It can be useful to reduce the number of features at the cost of a small decrease in the score. `tol` is enabled only when `n_features_to_select` is `"auto"`. .. versionadded:: 1.1 direction : {'forward', 'backward'}, default='forward' Whether to perform forward selection or backward selection. scoring : str or callable, default=None A single str (see :ref:`scoring_parameter`) or a callable (see :ref:`scoring`) to evaluate the predictions on the test set. NOTE that when using a custom scorer, it should return a single value. If None, the estimator's score method is used. cv : int, cross-validation generator or an iterable, default=None Determines the cross-validation splitting strategy. Possible inputs for cv are: - None, to use the default 5-fold cross validation, - integer, to specify the number of folds in a `(Stratified)KFold`, - :term:`CV splitter`, - An iterable yielding (train, test) splits as arrays of indices. For integer/None inputs, if the estimator is a classifier and ``y`` is either binary or multiclass, :class:`~sklearn.model_selection.StratifiedKFold` is used. In all other cases, :class:`~sklearn.model_selection.KFold` is used. These splitters are instantiated with `shuffle=False` so the splits will be the same across calls. Refer :ref:`User Guide ` for the various cross-validation strategies that can be used here. n_jobs : int, default=None Number of jobs to run in parallel. When evaluating a new feature to add or remove, the cross-validation procedure is parallel over the folds. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details. Attributes ---------- n_features_in_ : int Number of features seen during :term:`fit`. Only defined if the underlying estimator exposes such an attribute when fit. .. versionadded:: 0.24 feature_names_in_ : ndarray of shape (`n_features_in_`,) Names of features seen during :term:`fit`. Defined only when `X` has feature names that are all strings. .. versionadded:: 1.0 n_features_to_select_ : int The number of features that were selected. support_ : ndarray of shape (n_features,), dtype=bool The mask of selected features. See Also -------- GenericUnivariateSelect : Univariate feature selector with configurable strategy. RFE : Recursive feature elimination based on importance weights. RFECV : Recursive feature elimination based on importance weights, with automatic selection of the number of features. SelectFromModel : Feature selection based on thresholds of importance weights. Examples -------- >>> from sklearn.feature_selection import SequentialFeatureSelector >>> from sklearn.neighbors import KNeighborsClassifier >>> from sklearn.datasets import load_iris >>> X, y = load_iris(return_X_y=True) >>> knn = KNeighborsClassifier(n_neighbors=3) >>> sfs = SequentialFeatureSelector(knn, n_features_to_select=3) >>> sfs.fit(X, y) SequentialFeatureSelector(estimator=KNeighborsClassifier(n_neighbors=3), n_features_to_select=3) >>> sfs.get_support() array([ True, False, True, True]) >>> sfs.transform(X).shape (150, 3) """ _parameter_constraints: dict = { "estimator": [HasMethods(["fit"])], "n_features_to_select": [ StrOptions({"auto"}), Interval(RealNotInt, 0, 1, closed="right"), Interval(Integral, 0, None, closed="neither"), ], "tol": [None, Interval(Real, None, None, closed="neither")], "direction": [StrOptions({"forward", "backward"})], "scoring": [None, StrOptions(set(get_scorer_names())), callable], "cv": ["cv_object"], "n_jobs": [None, Integral], } def __init__( self, estimator, *, n_features_to_select="auto", tol=None, direction="forward", scoring=None, cv=5, n_jobs=None, ): self.estimator = estimator self.n_features_to_select = n_features_to_select self.tol = tol self.direction = direction self.scoring = scoring self.cv = cv self.n_jobs = n_jobs @_fit_context( # SequentialFeatureSelector.estimator is not validated yet prefer_skip_nested_validation=False ) def fit(self, X, y=None): """Learn the features to select from X. Parameters ---------- X : array-like of shape (n_samples, n_features) Training vectors, where `n_samples` is the number of samples and `n_features` is the number of predictors. y : array-like of shape (n_samples,), default=None Target values. This parameter may be ignored for unsupervised learning. Returns ------- self : object Returns the instance itself. """ tags = self._get_tags() X = self._validate_data( X, accept_sparse="csc", ensure_min_features=2, force_all_finite=not tags.get("allow_nan", True), ) n_features = X.shape[1] if self.n_features_to_select == "auto": if self.tol is not None: # With auto feature selection, `n_features_to_select_` will be updated # to `support_.sum()` after features are selected. self.n_features_to_select_ = n_features - 1 else: self.n_features_to_select_ = n_features // 2 elif isinstance(self.n_features_to_select, Integral): if self.n_features_to_select >= n_features: raise ValueError("n_features_to_select must be < n_features.") self.n_features_to_select_ = self.n_features_to_select elif isinstance(self.n_features_to_select, Real): self.n_features_to_select_ = int(n_features * self.n_features_to_select) if self.tol is not None and self.tol < 0 and self.direction == "forward": raise ValueError("tol must be positive when doing forward selection") cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator)) cloned_estimator = clone(self.estimator) # the current mask corresponds to the set of features: # - that we have already *selected* if we do forward selection # - that we have already *excluded* if we do backward selection current_mask = np.zeros(shape=n_features, dtype=bool) n_iterations = ( self.n_features_to_select_ if self.n_features_to_select == "auto" or self.direction == "forward" else n_features - self.n_features_to_select_ ) old_score = -np.inf is_auto_select = self.tol is not None and self.n_features_to_select == "auto" for _ in range(n_iterations): new_feature_idx, new_score = self._get_best_new_feature_score( cloned_estimator, X, y, cv, current_mask ) if is_auto_select and ((new_score - old_score) < self.tol): break old_score = new_score current_mask[new_feature_idx] = True if self.direction == "backward": current_mask = ~current_mask self.support_ = current_mask self.n_features_to_select_ = self.support_.sum() return self def _get_best_new_feature_score(self, estimator, X, y, cv, current_mask): # Return the best new feature and its score to add to the current_mask, # i.e. return the best new feature and its score to add (resp. remove) # when doing forward selection (resp. backward selection). # Feature will be added if the current score and past score are greater # than tol when n_feature is auto, candidate_feature_indices = np.flatnonzero(~current_mask) scores = {} for feature_idx in candidate_feature_indices: candidate_mask = current_mask.copy() candidate_mask[feature_idx] = True if self.direction == "backward": candidate_mask = ~candidate_mask X_new = X[:, candidate_mask] scores[feature_idx] = cross_val_score( estimator, X_new, y, cv=cv, scoring=self.scoring, n_jobs=self.n_jobs, ).mean() new_feature_idx = max(scores, key=lambda feature_idx: scores[feature_idx]) return new_feature_idx, scores[new_feature_idx] def _get_support_mask(self): check_is_fitted(self) return self.support_ def _more_tags(self): return { "allow_nan": _safe_tags(self.estimator, key="allow_nan"), }