"""Generic feature selection mixin""" # Authors: G. Varoquaux, A. Gramfort, L. Buitinck, J. Nothman # License: BSD 3 clause import warnings from abc import ABCMeta, abstractmethod from operator import attrgetter import numpy as np from scipy.sparse import csc_matrix, issparse from ..base import TransformerMixin from ..utils import ( _is_pandas_df, _safe_indexing, check_array, safe_sqr, ) from ..utils._set_output import _get_output_config from ..utils._tags import _safe_tags from ..utils.validation import _check_feature_names_in, check_is_fitted class SelectorMixin(TransformerMixin, metaclass=ABCMeta): """ Transformer mixin that performs feature selection given a support mask This mixin provides a feature selector implementation with `transform` and `inverse_transform` functionality given an implementation of `_get_support_mask`. Examples -------- >>> import numpy as np >>> from sklearn.datasets import load_iris >>> from sklearn.base import BaseEstimator >>> from sklearn.feature_selection import SelectorMixin >>> class FeatureSelector(SelectorMixin, BaseEstimator): ... def fit(self, X, y=None): ... self.n_features_in_ = X.shape[1] ... return self ... def _get_support_mask(self): ... mask = np.zeros(self.n_features_in_, dtype=bool) ... mask[:2] = True # select the first two features ... return mask >>> X, y = load_iris(return_X_y=True) >>> FeatureSelector().fit_transform(X, y).shape (150, 2) """ def get_support(self, indices=False): """ Get a mask, or integer index, of the features selected. Parameters ---------- indices : bool, default=False If True, the return value will be an array of integers, rather than a boolean mask. Returns ------- support : array An index that selects the retained features from a feature vector. If `indices` is False, this is a boolean array of shape [# input features], in which an element is True iff its corresponding feature is selected for retention. If `indices` is True, this is an integer array of shape [# output features] whose values are indices into the input feature vector. """ mask = self._get_support_mask() return mask if not indices else np.where(mask)[0] @abstractmethod def _get_support_mask(self): """ Get the boolean mask indicating which features are selected Returns ------- support : boolean array of shape [# input features] An element is True iff its corresponding feature is selected for retention. """ def transform(self, X): """Reduce X to the selected features. Parameters ---------- X : array of shape [n_samples, n_features] The input samples. Returns ------- X_r : array of shape [n_samples, n_selected_features] The input samples with only the selected features. """ # Preserve X when X is a dataframe and the output is configured to # be pandas. output_config_dense = _get_output_config("transform", estimator=self)["dense"] preserve_X = output_config_dense != "default" and _is_pandas_df(X) # note: we use _safe_tags instead of _get_tags because this is a # public Mixin. X = self._validate_data( X, dtype=None, accept_sparse="csr", force_all_finite=not _safe_tags(self, key="allow_nan"), cast_to_ndarray=not preserve_X, reset=False, ) return self._transform(X) def _transform(self, X): """Reduce X to the selected features.""" mask = self.get_support() if not mask.any(): warnings.warn( ( "No features were selected: either the data is" " too noisy or the selection test too strict." ), UserWarning, ) if hasattr(X, "iloc"): return X.iloc[:, :0] return np.empty(0, dtype=X.dtype).reshape((X.shape[0], 0)) return _safe_indexing(X, mask, axis=1) def inverse_transform(self, X): """Reverse the transformation operation. Parameters ---------- X : array of shape [n_samples, n_selected_features] The input samples. Returns ------- X_r : array of shape [n_samples, n_original_features] `X` with columns of zeros inserted where features would have been removed by :meth:`transform`. """ if issparse(X): X = X.tocsc() # insert additional entries in indptr: # e.g. if transform changed indptr from [0 2 6 7] to [0 2 3] # col_nonzeros here will be [2 0 1] so indptr becomes [0 2 2 3] it = self.inverse_transform(np.diff(X.indptr).reshape(1, -1)) col_nonzeros = it.ravel() indptr = np.concatenate([[0], np.cumsum(col_nonzeros)]) Xt = csc_matrix( (X.data, X.indices, indptr), shape=(X.shape[0], len(indptr) - 1), dtype=X.dtype, ) return Xt support = self.get_support() X = check_array(X, dtype=None) if support.sum() != X.shape[1]: raise ValueError("X has a different shape than during fitting.") if X.ndim == 1: X = X[None, :] Xt = np.zeros((X.shape[0], support.size), dtype=X.dtype) Xt[:, support] = X return Xt def get_feature_names_out(self, input_features=None): """Mask feature names according to selected features. Parameters ---------- input_features : array-like of str or None, default=None Input features. - If `input_features` is `None`, then `feature_names_in_` is used as feature names in. If `feature_names_in_` is not defined, then the following input feature names are generated: `["x0", "x1", ..., "x(n_features_in_ - 1)"]`. - If `input_features` is an array-like, then `input_features` must match `feature_names_in_` if `feature_names_in_` is defined. Returns ------- feature_names_out : ndarray of str objects Transformed feature names. """ check_is_fitted(self) input_features = _check_feature_names_in(self, input_features) return input_features[self.get_support()] def _get_feature_importances(estimator, getter, transform_func=None, norm_order=1): """ Retrieve and aggregate (ndim > 1) the feature importances from an estimator. Also optionally applies transformation. Parameters ---------- estimator : estimator A scikit-learn estimator from which we want to get the feature importances. getter : "auto", str or callable An attribute or a callable to get the feature importance. If `"auto"`, `estimator` is expected to expose `coef_` or `feature_importances`. transform_func : {"norm", "square"}, default=None The transform to apply to the feature importances. By default (`None`) no transformation is applied. norm_order : int, default=1 The norm order to apply when `transform_func="norm"`. Only applied when `importances.ndim > 1`. Returns ------- importances : ndarray of shape (n_features,) The features importances, optionally transformed. """ if isinstance(getter, str): if getter == "auto": if hasattr(estimator, "coef_"): getter = attrgetter("coef_") elif hasattr(estimator, "feature_importances_"): getter = attrgetter("feature_importances_") else: raise ValueError( "when `importance_getter=='auto'`, the underlying " f"estimator {estimator.__class__.__name__} should have " "`coef_` or `feature_importances_` attribute. Either " "pass a fitted estimator to feature selector or call fit " "before calling transform." ) else: getter = attrgetter(getter) elif not callable(getter): raise ValueError("`importance_getter` has to be a string or `callable`") importances = getter(estimator) if transform_func is None: return importances elif transform_func == "norm": if importances.ndim == 1: importances = np.abs(importances) else: importances = np.linalg.norm(importances, axis=0, ord=norm_order) elif transform_func == "square": if importances.ndim == 1: importances = safe_sqr(importances) else: importances = safe_sqr(importances).sum(axis=0) else: raise ValueError( "Valid values for `transform_func` are " + "None, 'norm' and 'square'. Those two " + "transformation are only supported now" ) return importances