""" Todo: cross-check the F-value with stats model """ import itertools import warnings import numpy as np import pytest from numpy.testing import assert_allclose from scipy import sparse, stats from sklearn.datasets import load_iris, make_classification, make_regression from sklearn.feature_selection import ( GenericUnivariateSelect, SelectFdr, SelectFpr, SelectFwe, SelectKBest, SelectPercentile, chi2, f_classif, f_oneway, f_regression, mutual_info_classif, mutual_info_regression, r_regression, ) from sklearn.utils import safe_mask from sklearn.utils._testing import ( _convert_container, assert_almost_equal, assert_array_almost_equal, assert_array_equal, ignore_warnings, ) from sklearn.utils.fixes import CSR_CONTAINERS ############################################################################## # Test the score functions def test_f_oneway_vs_scipy_stats(): # Test that our f_oneway gives the same result as scipy.stats rng = np.random.RandomState(0) X1 = rng.randn(10, 3) X2 = 1 + rng.randn(10, 3) f, pv = stats.f_oneway(X1, X2) f2, pv2 = f_oneway(X1, X2) assert np.allclose(f, f2) assert np.allclose(pv, pv2) def test_f_oneway_ints(): # Smoke test f_oneway on integers: that it does raise casting errors # with recent numpys rng = np.random.RandomState(0) X = rng.randint(10, size=(10, 10)) y = np.arange(10) fint, pint = f_oneway(X, y) # test that is gives the same result as with float f, p = f_oneway(X.astype(float), y) assert_array_almost_equal(f, fint, decimal=4) assert_array_almost_equal(p, pint, decimal=4) @pytest.mark.parametrize("csr_container", CSR_CONTAINERS) def test_f_classif(csr_container): # Test whether the F test yields meaningful results # on a simple simulated classification problem X, y = make_classification( n_samples=200, n_features=20, n_informative=3, n_redundant=2, n_repeated=0, n_classes=8, n_clusters_per_class=1, flip_y=0.0, class_sep=10, shuffle=False, random_state=0, ) F, pv = f_classif(X, y) F_sparse, pv_sparse = f_classif(csr_container(X), y) assert (F > 0).all() assert (pv > 0).all() assert (pv < 1).all() assert (pv[:5] < 0.05).all() assert (pv[5:] > 1.0e-4).all() assert_array_almost_equal(F_sparse, F) assert_array_almost_equal(pv_sparse, pv) @pytest.mark.parametrize("center", [True, False]) def test_r_regression(center): X, y = make_regression( n_samples=2000, n_features=20, n_informative=5, shuffle=False, random_state=0 ) corr_coeffs = r_regression(X, y, center=center) assert (-1 < corr_coeffs).all() assert (corr_coeffs < 1).all() sparse_X = _convert_container(X, "sparse") sparse_corr_coeffs = r_regression(sparse_X, y, center=center) assert_allclose(sparse_corr_coeffs, corr_coeffs) # Testing against numpy for reference Z = np.hstack((X, y[:, np.newaxis])) correlation_matrix = np.corrcoef(Z, rowvar=False) np_corr_coeffs = correlation_matrix[:-1, -1] assert_array_almost_equal(np_corr_coeffs, corr_coeffs, decimal=3) @pytest.mark.parametrize("csr_container", CSR_CONTAINERS) def test_f_regression(csr_container): # Test whether the F test yields meaningful results # on a simple simulated regression problem X, y = make_regression( n_samples=200, n_features=20, n_informative=5, shuffle=False, random_state=0 ) F, pv = f_regression(X, y) assert (F > 0).all() assert (pv > 0).all() assert (pv < 1).all() assert (pv[:5] < 0.05).all() assert (pv[5:] > 1.0e-4).all() # with centering, compare with sparse F, pv = f_regression(X, y, center=True) F_sparse, pv_sparse = f_regression(csr_container(X), y, center=True) assert_allclose(F_sparse, F) assert_allclose(pv_sparse, pv) # again 
def test_f_regression_input_dtype():
    # Test whether f_regression returns the same value
    # for any numeric dtype
    rng = np.random.RandomState(0)
    X = rng.rand(10, 20)
    y = np.arange(10).astype(int)

    F1, pv1 = f_regression(X, y)
    F2, pv2 = f_regression(X, y.astype(float))
    assert_allclose(F1, F2, 5)
    assert_allclose(pv1, pv2, 5)


def test_f_regression_center():
    # Test whether f_regression preserves dof according to 'center' argument
    # We use two centered variates so we have a simple relationship between
    # F-score with variates centering and F-score without variates centering.
    # Create toy example
    X = np.arange(-5, 6).reshape(-1, 1)  # X has zero mean
    n_samples = X.size
    Y = np.ones(n_samples)
    Y[::2] *= -1.0
    Y[0] = 0.0  # make the mean of Y zero

    F1, _ = f_regression(X, Y, center=True)
    F2, _ = f_regression(X, Y, center=False)
    assert_allclose(F1 * (n_samples - 1.0) / (n_samples - 2.0), F2)
    assert_almost_equal(F2[0], 0.232558139)  # value from statsmodels OLS
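
# The module-level todo asks for a cross-check against statsmodels. The test
# below is our minimal sketch of such a check: for a single regressor, the
# overall F-test of a statsmodels OLS fit with an intercept equals
# f_regression's F-statistic with the default center=True (both use (1, n - 2)
# degrees of freedom). statsmodels is not otherwise required by this module,
# hence the importorskip guard.
def test_f_regression_against_statsmodels_ols():
    sm = pytest.importorskip("statsmodels.api")
    rng = np.random.RandomState(0)
    X = rng.randn(30, 1)
    y = rng.randn(30)

    F, pv = f_regression(X, y)  # center=True by default
    ols_result = sm.OLS(y, sm.add_constant(X)).fit()
    assert_allclose(F[0], ols_result.fvalue)
    assert_allclose(pv[0], ols_result.f_pvalue)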
@pytest.mark.parametrize(
    "X, y, expected_corr_coef, force_finite",
    [
        (
            # A feature in X is constant - forcing finite
            np.array([[2, 1], [2, 0], [2, 10], [2, 4]]),
            np.array([0, 1, 1, 0]),
            np.array([0.0, 0.32075]),
            True,
        ),
        (
            # The target y is constant - forcing finite
            np.array([[5, 1], [3, 0], [2, 10], [8, 4]]),
            np.array([0, 0, 0, 0]),
            np.array([0.0, 0.0]),
            True,
        ),
        (
            # A feature in X is constant - not forcing finite
            np.array([[2, 1], [2, 0], [2, 10], [2, 4]]),
            np.array([0, 1, 1, 0]),
            np.array([np.nan, 0.32075]),
            False,
        ),
        (
            # The target y is constant - not forcing finite
            np.array([[5, 1], [3, 0], [2, 10], [8, 4]]),
            np.array([0, 0, 0, 0]),
            np.array([np.nan, np.nan]),
            False,
        ),
    ],
)
def test_r_regression_force_finite(X, y, expected_corr_coef, force_finite):
    """Check the behaviour of `force_finite` for some corner cases with
    `r_regression`.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/15672
    """
    with warnings.catch_warnings():
        warnings.simplefilter("error", RuntimeWarning)
        corr_coef = r_regression(X, y, force_finite=force_finite)
    np.testing.assert_array_almost_equal(corr_coef, expected_corr_coef)


@pytest.mark.parametrize(
    "X, y, expected_f_statistic, expected_p_values, force_finite",
    [
        (
            # A feature in X is constant - forcing finite
            np.array([[2, 1], [2, 0], [2, 10], [2, 4]]),
            np.array([0, 1, 1, 0]),
            np.array([0.0, 0.2293578]),
            np.array([1.0, 0.67924985]),
            True,
        ),
        (
            # The target y is constant - forcing finite
            np.array([[5, 1], [3, 0], [2, 10], [8, 4]]),
            np.array([0, 0, 0, 0]),
            np.array([0.0, 0.0]),
            np.array([1.0, 1.0]),
            True,
        ),
        (
            # Feature in X correlated with y - forcing finite
            np.array([[0, 1], [1, 0], [2, 10], [3, 4]]),
            np.array([0, 1, 2, 3]),
            np.array([np.finfo(np.float64).max, 0.845433]),
            np.array([0.0, 0.454913]),
            True,
        ),
        (
            # Feature in X anti-correlated with y - forcing finite
            np.array([[3, 1], [2, 0], [1, 10], [0, 4]]),
            np.array([0, 1, 2, 3]),
            np.array([np.finfo(np.float64).max, 0.845433]),
            np.array([0.0, 0.454913]),
            True,
        ),
        (
            # A feature in X is constant - not forcing finite
            np.array([[2, 1], [2, 0], [2, 10], [2, 4]]),
            np.array([0, 1, 1, 0]),
            np.array([np.nan, 0.2293578]),
            np.array([np.nan, 0.67924985]),
            False,
        ),
        (
            # The target y is constant - not forcing finite
            np.array([[5, 1], [3, 0], [2, 10], [8, 4]]),
            np.array([0, 0, 0, 0]),
            np.array([np.nan, np.nan]),
            np.array([np.nan, np.nan]),
            False,
        ),
        (
            # Feature in X correlated with y - not forcing finite
            np.array([[0, 1], [1, 0], [2, 10], [3, 4]]),
            np.array([0, 1, 2, 3]),
            np.array([np.inf, 0.845433]),
            np.array([0.0, 0.454913]),
            False,
        ),
        (
            # Feature in X anti-correlated with y - not forcing finite
            np.array([[3, 1], [2, 0], [1, 10], [0, 4]]),
            np.array([0, 1, 2, 3]),
            np.array([np.inf, 0.845433]),
            np.array([0.0, 0.454913]),
            False,
        ),
    ],
)
def test_f_regression_corner_case(
    X, y, expected_f_statistic, expected_p_values, force_finite
):
    """Check the behaviour of `force_finite` for some corner cases with
    `f_regression`.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/15672
    """
    with warnings.catch_warnings():
        warnings.simplefilter("error", RuntimeWarning)
        f_statistic, p_values = f_regression(X, y, force_finite=force_finite)
    np.testing.assert_array_almost_equal(f_statistic, expected_f_statistic)
    np.testing.assert_array_almost_equal(p_values, expected_p_values)


def test_f_classif_multi_class():
    # Test whether the F test yields meaningful results
    # on a simple simulated classification problem
    X, y = make_classification(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )

    F, pv = f_classif(X, y)
    assert (F > 0).all()
    assert (pv > 0).all()
    assert (pv < 1).all()
    assert (pv[:5] < 0.05).all()
    assert (pv[5:] > 1.0e-4).all()


def test_select_percentile_classif():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple classification problem
    # with the percentile heuristic
    X, y = make_classification(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )

    univariate_filter = SelectPercentile(f_classif, percentile=25)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = (
        GenericUnivariateSelect(f_classif, mode="percentile", param=25)
        .fit(X, y)
        .transform(X)
    )
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)


@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_select_percentile_classif_sparse(csr_container):
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple classification problem
    # with the percentile heuristic
    X, y = make_classification(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )
    X = csr_container(X)
    univariate_filter = SelectPercentile(f_classif, percentile=25)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = (
        GenericUnivariateSelect(f_classif, mode="percentile", param=25)
        .fit(X, y)
        .transform(X)
    )
    assert_array_equal(X_r.toarray(), X_r2.toarray())
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)

    X_r2inv = univariate_filter.inverse_transform(X_r2)
    assert sparse.issparse(X_r2inv)
    support_mask = safe_mask(X_r2inv, support)
    assert X_r2inv.shape == X.shape
    assert_array_equal(X_r2inv[:, support_mask].toarray(), X_r.toarray())
    # Check other columns are empty
    assert X_r2inv.nnz == X_r.nnz


##############################################################################
# Test univariate selection in classification settings


def test_select_kbest_classif():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple classification problem
    # with the k best heuristic
    X, y = make_classification(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )

    univariate_filter = SelectKBest(f_classif, k=5)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = (
        GenericUnivariateSelect(f_classif, mode="k_best", param=5)
        .fit(X, y)
        .transform(X)
    )
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)


def test_select_kbest_all():
    # Test whether k="all" correctly returns all features.
    X, y = make_classification(
        n_samples=20, n_features=10, shuffle=False, random_state=0
    )

    univariate_filter = SelectKBest(f_classif, k="all")
    X_r = univariate_filter.fit(X, y).transform(X)
    assert_array_equal(X, X_r)

    # Non-regression test for:
    # https://github.com/scikit-learn/scikit-learn/issues/24949
    X_r2 = (
        GenericUnivariateSelect(f_classif, mode="k_best", param="all")
        .fit(X, y)
        .transform(X)
    )
    assert_array_equal(X_r, X_r2)


@pytest.mark.parametrize("dtype_in", [np.float32, np.float64])
def test_select_kbest_zero(dtype_in):
    # Test whether k=0 correctly returns no features.
    X, y = make_classification(
        n_samples=20, n_features=10, shuffle=False, random_state=0
    )
    X = X.astype(dtype_in)
    univariate_filter = SelectKBest(f_classif, k=0)
    univariate_filter.fit(X, y)
    support = univariate_filter.get_support()
    gtruth = np.zeros(10, dtype=bool)
    assert_array_equal(support, gtruth)
    with pytest.warns(UserWarning, match="No features were selected"):
        X_selected = univariate_filter.transform(X)
    assert X_selected.shape == (20, 0)
    assert X_selected.dtype == dtype_in


def test_select_heuristics_classif():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple classification problem
    # with the fdr, fwe and fpr heuristics
    X, y = make_classification(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )

    univariate_filter = SelectFwe(f_classif, alpha=0.01)
    X_r = univariate_filter.fit(X, y).transform(X)
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    for mode in ["fdr", "fpr", "fwe"]:
        X_r2 = (
            GenericUnivariateSelect(f_classif, mode=mode, param=0.01)
            .fit(X, y)
            .transform(X)
        )
        assert_array_equal(X_r, X_r2)
        support = univariate_filter.get_support()
        assert_allclose(support, gtruth)


##############################################################################
# Test univariate selection in regression settings


def assert_best_scores_kept(score_filter):
    scores = score_filter.scores_
    support = score_filter.get_support()
    assert_allclose(np.sort(scores[support]), np.sort(scores)[-support.sum() :])


def test_select_percentile_regression():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple regression problem
    # with the percentile heuristic
    X, y = make_regression(
        n_samples=200, n_features=20, n_informative=5, shuffle=False, random_state=0
    )

    univariate_filter = SelectPercentile(f_regression, percentile=25)
    X_r = univariate_filter.fit(X, y).transform(X)
    assert_best_scores_kept(univariate_filter)
    X_r2 = (
        GenericUnivariateSelect(f_regression, mode="percentile", param=25)
        .fit(X, y)
        .transform(X)
    )
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)
    X_2 = X.copy()
    X_2[:, np.logical_not(support)] = 0
    assert_array_equal(X_2, univariate_filter.inverse_transform(X_r))
    # Check inverse_transform respects dtype
    assert_array_equal(
        X_2.astype(bool), univariate_filter.inverse_transform(X_r.astype(bool))
    )


def test_select_percentile_regression_full():
    # Test whether the relative univariate feature selection
    # selects all features when percentile=100 is requested.
    X, y = make_regression(
        n_samples=200, n_features=20, n_informative=5, shuffle=False, random_state=0
    )

    univariate_filter = SelectPercentile(f_regression, percentile=100)
    X_r = univariate_filter.fit(X, y).transform(X)
    assert_best_scores_kept(univariate_filter)
    X_r2 = (
        GenericUnivariateSelect(f_regression, mode="percentile", param=100)
        .fit(X, y)
        .transform(X)
    )
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.ones(20)
    assert_array_equal(support, gtruth)


def test_select_kbest_regression():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple regression problem
    # with the k best heuristic
    X, y = make_regression(
        n_samples=200,
        n_features=20,
        n_informative=5,
        shuffle=False,
        random_state=0,
        noise=10,
    )

    univariate_filter = SelectKBest(f_regression, k=5)
    X_r = univariate_filter.fit(X, y).transform(X)
    assert_best_scores_kept(univariate_filter)
    X_r2 = (
        GenericUnivariateSelect(f_regression, mode="k_best", param=5)
        .fit(X, y)
        .transform(X)
    )
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)


def test_select_heuristics_regression():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple regression problem
    # with the fpr, fdr or fwe heuristics
    X, y = make_regression(
        n_samples=200,
        n_features=20,
        n_informative=5,
        shuffle=False,
        random_state=0,
        noise=10,
    )

    univariate_filter = SelectFpr(f_regression, alpha=0.01)
    X_r = univariate_filter.fit(X, y).transform(X)
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    for mode in ["fdr", "fpr", "fwe"]:
        X_r2 = (
            GenericUnivariateSelect(f_regression, mode=mode, param=0.01)
            .fit(X, y)
            .transform(X)
        )
        assert_array_equal(X_r, X_r2)
        support = univariate_filter.get_support()
        assert_array_equal(support[:5], np.ones((5,), dtype=bool))
        assert np.sum(support[5:] == 1) < 3


def test_boundary_case_chi2():
    # Test boundary case, and always aim to select 1 feature.
    X = np.array([[10, 20], [20, 20], [20, 30]])
    y = np.array([[1], [0], [0]])
    scores, pvalues = chi2(X, y)
    assert_array_almost_equal(scores, np.array([4.0, 0.71428571]))
    assert_array_almost_equal(pvalues, np.array([0.04550026, 0.39802472]))

    filter_fdr = SelectFdr(chi2, alpha=0.1)
    filter_fdr.fit(X, y)
    support_fdr = filter_fdr.get_support()
    assert_array_equal(support_fdr, np.array([True, False]))

    filter_kbest = SelectKBest(chi2, k=1)
    filter_kbest.fit(X, y)
    support_kbest = filter_kbest.get_support()
    assert_array_equal(support_kbest, np.array([True, False]))

    filter_percentile = SelectPercentile(chi2, percentile=50)
    filter_percentile.fit(X, y)
    support_percentile = filter_percentile.get_support()
    assert_array_equal(support_percentile, np.array([True, False]))

    filter_fpr = SelectFpr(chi2, alpha=0.1)
    filter_fpr.fit(X, y)
    support_fpr = filter_fpr.get_support()
    assert_array_equal(support_fpr, np.array([True, False]))

    filter_fwe = SelectFwe(chi2, alpha=0.1)
    filter_fwe.fit(X, y)
    support_fwe = filter_fwe.get_support()
    assert_array_equal(support_fwe, np.array([True, False]))
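
# A hedged sketch (our addition) of the three p-value thresholding rules
# exercised in the heuristics tests above and the FDR test below, as
# implemented by SelectFpr, SelectFwe and SelectFdr; the "fdr" rule is the
# Benjamini-Hochberg step-up procedure. The helper is illustrative, not
# library code, and is cross-checked against the estimators themselves.
def _univariate_threshold_mask(p_values, alpha, mode):
    n = len(p_values)
    if mode == "fpr":
        # Bound the per-feature false positive rate: keep p < alpha.
        return p_values < alpha
    if mode == "fwe":
        # Bonferroni bound on the family-wise error rate.
        return p_values < alpha / n
    # "fdr": keep everything up to the largest sorted p-value p_(i)
    # satisfying p_(i) <= alpha * i / n.
    sorted_p = np.sort(p_values)
    below = sorted_p[sorted_p <= alpha * np.arange(1, n + 1) / n]
    if below.size == 0:
        return np.zeros(n, dtype=bool)
    return p_values <= below.max()


def test_thresholding_rules_sketch():
    X, y = make_classification(
        n_samples=100, n_features=15, n_informative=3, shuffle=False, random_state=0
    )
    _, p_values = f_classif(X, y)
    for Selector, mode in [(SelectFpr, "fpr"), (SelectFwe, "fwe"), (SelectFdr, "fdr")]:
        selector = Selector(f_classif, alpha=0.05).fit(X, y)
        assert_array_equal(
            selector.get_support(), _univariate_threshold_mask(p_values, 0.05, mode)
        )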
@pytest.mark.parametrize("alpha", [0.001, 0.01, 0.1])
@pytest.mark.parametrize("n_informative", [1, 5, 10])
def test_select_fdr_regression(alpha, n_informative):
    # Test that fdr heuristic actually has low FDR.
    def single_fdr(alpha, n_informative, random_state):
        X, y = make_regression(
            n_samples=150,
            n_features=20,
            n_informative=n_informative,
            shuffle=False,
            random_state=random_state,
            noise=10,
        )

        with warnings.catch_warnings(record=True):
            # Warnings can be raised when no features are selected
            # (low alpha or very noisy data)
            univariate_filter = SelectFdr(f_regression, alpha=alpha)
            X_r = univariate_filter.fit(X, y).transform(X)
            X_r2 = (
                GenericUnivariateSelect(f_regression, mode="fdr", param=alpha)
                .fit(X, y)
                .transform(X)
            )

        assert_array_equal(X_r, X_r2)
        support = univariate_filter.get_support()
        num_false_positives = np.sum(support[n_informative:] == 1)
        num_true_positives = np.sum(support[:n_informative] == 1)

        if num_false_positives == 0:
            return 0.0
        false_discovery_rate = num_false_positives / (
            num_true_positives + num_false_positives
        )
        return false_discovery_rate

    # As per Benjamini-Hochberg, the expected false discovery rate
    # should be lower than alpha:
    # FDR = E(FP / (TP + FP)) <= alpha
    false_discovery_rate = np.mean(
        [single_fdr(alpha, n_informative, random_state) for random_state in range(100)]
    )
    assert alpha >= false_discovery_rate

    # Make sure that the empirical false discovery rate increases
    # with alpha:
    if false_discovery_rate != 0:
        assert false_discovery_rate > alpha / 10


def test_select_fwe_regression():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple regression problem
    # with the fwe heuristic
    X, y = make_regression(
        n_samples=200, n_features=20, n_informative=5, shuffle=False, random_state=0
    )

    univariate_filter = SelectFwe(f_regression, alpha=0.01)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = (
        GenericUnivariateSelect(f_regression, mode="fwe", param=0.01)
        .fit(X, y)
        .transform(X)
    )
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support[:5], np.ones((5,), dtype=bool))
    assert np.sum(support[5:] == 1) < 2


def test_selectkbest_tiebreaking():
    # Test whether SelectKBest actually selects k features in case of ties.
    # Prior to 0.11, SelectKBest would return more features than requested.
    Xs = [[0, 1, 1], [0, 0, 1], [1, 0, 0], [1, 1, 0]]
    y = [1]
    dummy_score = lambda X, y: (X[0], X[0])
    for X in Xs:
        sel = SelectKBest(dummy_score, k=1)
        X1 = ignore_warnings(sel.fit_transform)([X], y)
        assert X1.shape[1] == 1
        assert_best_scores_kept(sel)

        sel = SelectKBest(dummy_score, k=2)
        X2 = ignore_warnings(sel.fit_transform)([X], y)
        assert X2.shape[1] == 2
        assert_best_scores_kept(sel)


def test_selectpercentile_tiebreaking():
    # Test if SelectPercentile selects the right n_features in case of ties.
    Xs = [[0, 1, 1], [0, 0, 1], [1, 0, 0], [1, 1, 0]]
    y = [1]
    dummy_score = lambda X, y: (X[0], X[0])
    for X in Xs:
        sel = SelectPercentile(dummy_score, percentile=34)
        X1 = ignore_warnings(sel.fit_transform)([X], y)
        assert X1.shape[1] == 1
        assert_best_scores_kept(sel)

        sel = SelectPercentile(dummy_score, percentile=67)
        X2 = ignore_warnings(sel.fit_transform)([X], y)
        assert X2.shape[1] == 2
        assert_best_scores_kept(sel)


def test_tied_pvalues():
    # Test whether k-best and percentiles work with tied pvalues from chi2.
    # chi2 will return the same p-values for the following features, but it
    # will return different scores.
    X0 = np.array([[10000, 9999, 9998], [1, 1, 1]])
    y = [0, 1]

    for perm in itertools.permutations((0, 1, 2)):
        X = X0[:, perm]
        Xt = SelectKBest(chi2, k=2).fit_transform(X, y)
        assert Xt.shape == (2, 2)
        assert 9998 not in Xt

        Xt = SelectPercentile(chi2, percentile=67).fit_transform(X, y)
        assert Xt.shape == (2, 2)
        assert 9998 not in Xt


def test_scorefunc_multilabel():
    # Test whether k-best and percentiles work with multilabel targets
    # and chi2.
    X = np.array([[10000, 9999, 0], [100, 9999, 0], [1000, 99, 0]])
    y = [[1, 1], [0, 1], [1, 0]]

    Xt = SelectKBest(chi2, k=2).fit_transform(X, y)
    assert Xt.shape == (3, 2)
    assert 0 not in Xt

    Xt = SelectPercentile(chi2, percentile=67).fit_transform(X, y)
    assert Xt.shape == (3, 2)
    assert 0 not in Xt


def test_tied_scores():
    # Test for stable sorting in k-best with tied scores.
    X_train = np.array([[0, 0, 0], [1, 1, 1]])
    y_train = [0, 1]

    for n_features in [1, 2, 3]:
        sel = SelectKBest(chi2, k=n_features).fit(X_train, y_train)
        X_test = sel.transform([[0, 1, 2]])
        assert_array_equal(X_test[0], np.arange(3)[-n_features:])


def test_nans():
    # Assert that SelectKBest and SelectPercentile can handle NaNs.
    # First feature has zero variance to confuse f_classif (ANOVA) and
    # make it return a NaN.
    X = [[0, 1, 0], [0, -1, -1], [0, 0.5, 0.5]]
    y = [1, 0, 1]

    for select in (
        SelectKBest(f_classif, k=2),
        SelectPercentile(f_classif, percentile=67),
    ):
        ignore_warnings(select.fit)(X, y)
        assert_array_equal(select.get_support(indices=True), np.array([1, 2]))


def test_invalid_k():
    X = [[0, 1, 0], [0, -1, -1], [0, 0.5, 0.5]]
    y = [1, 0, 1]

    msg = "k=4 is greater than n_features=3. All the features will be returned."
    with pytest.warns(UserWarning, match=msg):
        SelectKBest(k=4).fit(X, y)
    with pytest.warns(UserWarning, match=msg):
        GenericUnivariateSelect(mode="k_best", param=4).fit(X, y)


def test_f_classif_constant_feature():
    # Test that f_classif warns if a feature is constant throughout.
    X, y = make_classification(n_samples=10, n_features=5)
    X[:, 0] = 2.0
    with pytest.warns(UserWarning):
        f_classif(X, y)


def test_no_feature_selected():
    rng = np.random.RandomState(0)

    # Generate random uncorrelated data: a strict univariate test should
    # reject all the features
    X = rng.rand(40, 10)
    y = rng.randint(0, 4, size=40)

    strict_selectors = [
        SelectFwe(alpha=0.01).fit(X, y),
        SelectFdr(alpha=0.01).fit(X, y),
        SelectFpr(alpha=0.01).fit(X, y),
        SelectPercentile(percentile=0).fit(X, y),
        SelectKBest(k=0).fit(X, y),
    ]
    for selector in strict_selectors:
        assert_array_equal(selector.get_support(), np.zeros(10))
        with pytest.warns(UserWarning, match="No features were selected"):
            X_selected = selector.transform(X)
        assert X_selected.shape == (40, 0)


def test_mutual_info_classif():
    X, y = make_classification(
        n_samples=100,
        n_features=5,
        n_informative=1,
        n_redundant=1,
        n_repeated=0,
        n_classes=2,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )

    # Test in KBest mode.
    univariate_filter = SelectKBest(mutual_info_classif, k=2)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = (
        GenericUnivariateSelect(mutual_info_classif, mode="k_best", param=2)
        .fit(X, y)
        .transform(X)
    )
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(5)
    gtruth[:2] = 1
    assert_array_equal(support, gtruth)

    # Test in Percentile mode.
    univariate_filter = SelectPercentile(mutual_info_classif, percentile=40)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = (
        GenericUnivariateSelect(mutual_info_classif, mode="percentile", param=40)
        .fit(X, y)
        .transform(X)
    )
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(5)
    gtruth[:2] = 1
    assert_array_equal(support, gtruth)


def test_mutual_info_regression():
    X, y = make_regression(
        n_samples=100,
        n_features=10,
        n_informative=2,
        shuffle=False,
        random_state=0,
        noise=10,
    )

    # Test in KBest mode.
    univariate_filter = SelectKBest(mutual_info_regression, k=2)
    X_r = univariate_filter.fit(X, y).transform(X)
    assert_best_scores_kept(univariate_filter)
    X_r2 = (
        GenericUnivariateSelect(mutual_info_regression, mode="k_best", param=2)
        .fit(X, y)
        .transform(X)
    )
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(10)
    gtruth[:2] = 1
    assert_array_equal(support, gtruth)

    # Test in Percentile mode.
    univariate_filter = SelectPercentile(mutual_info_regression, percentile=20)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = (
        GenericUnivariateSelect(mutual_info_regression, mode="percentile", param=20)
        .fit(X, y)
        .transform(X)
    )
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(10)
    gtruth[:2] = 1
    assert_array_equal(support, gtruth)


def test_dataframe_output_dtypes():
    """Check that the output dataframe dtypes are the same as the input.

    Non-regression test for gh-24860.
    """
    pd = pytest.importorskip("pandas")

    X, y = load_iris(return_X_y=True, as_frame=True)
    X = X.astype(
        {
            "petal length (cm)": np.float32,
            "petal width (cm)": np.float64,
        }
    )
    X["petal_width_binned"] = pd.cut(X["petal width (cm)"], bins=10)

    column_order = X.columns

    def selector(X, y):
        ranking = {
            "sepal length (cm)": 1,
            "sepal width (cm)": 2,
            "petal length (cm)": 3,
            "petal width (cm)": 4,
            "petal_width_binned": 5,
        }
        return np.asarray([ranking[name] for name in column_order])

    univariate_filter = SelectKBest(selector, k=3).set_output(transform="pandas")
    output = univariate_filter.fit_transform(X, y)
    assert_array_equal(
        output.columns, ["petal length (cm)", "petal width (cm)", "petal_width_binned"]
    )
    for name, dtype in output.dtypes.items():
        assert dtype == X.dtypes[name]


@pytest.mark.parametrize(
    "selector",
    [
        SelectKBest(k=4),
        SelectPercentile(percentile=80),
        GenericUnivariateSelect(mode="k_best", param=4),
        GenericUnivariateSelect(mode="percentile", param=80),
    ],
)
def test_unsupervised_filter(selector):
    """Check support for unsupervised feature selection for the filter that
    could require only `X`.
    """
    rng = np.random.RandomState(0)
    X = rng.randn(10, 5)

    def score_func(X, y=None):
        return np.array([1, 1, 1, 1, 0])

    selector.set_params(score_func=score_func)
    selector.fit(X)
    X_trans = selector.transform(X)
    assert_allclose(X_trans, X[:, :4])
    X_trans = selector.fit_transform(X)
    assert_allclose(X_trans, X[:, :4])
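
# A hedged sketch (our addition) of the score_func contract used throughout
# this module: a score function may return either a (scores, p_values) tuple
# (f_classif, chi2, ...) or a bare array of scores (mutual_info_*, the custom
# selectors above), in which case `pvalues_` is set to None. The
# variance-based scorer below is our own illustrative helper, not library
# code.
def test_scores_only_score_func():
    rng = np.random.RandomState(0)
    # Per-column scale grows with the column index, so the two right-most
    # columns have the largest variance by a wide margin.
    X = rng.randn(20, 5) * np.arange(1, 6)
    y = rng.randint(0, 2, size=20)

    def variance_score(X, y):
        return np.var(X, axis=0)

    selector = SelectKBest(variance_score, k=2).fit(X, y)
    assert selector.pvalues_ is None
    assert_array_equal(selector.get_support(indices=True), np.array([3, 4]))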