""" Tests for chi2, currently the only feature selection function designed specifically to work with sparse matrices. """ import warnings import numpy as np import pytest import scipy.stats from sklearn.feature_selection import SelectKBest, chi2 from sklearn.feature_selection._univariate_selection import _chisquare from sklearn.utils._testing import assert_array_almost_equal, assert_array_equal from sklearn.utils.fixes import COO_CONTAINERS, CSR_CONTAINERS # Feature 0 is highly informative for class 1; # feature 1 is the same everywhere; # feature 2 is a bit informative for class 2. X = [[2, 1, 2], [9, 1, 1], [6, 1, 2], [0, 1, 2]] y = [0, 1, 2, 2] def mkchi2(k): """Make k-best chi2 selector""" return SelectKBest(chi2, k=k) @pytest.mark.parametrize("csr_container", CSR_CONTAINERS) def test_chi2(csr_container): # Test Chi2 feature extraction chi2 = mkchi2(k=1).fit(X, y) chi2 = mkchi2(k=1).fit(X, y) assert_array_equal(chi2.get_support(indices=True), [0]) assert_array_equal(chi2.transform(X), np.array(X)[:, [0]]) chi2 = mkchi2(k=2).fit(X, y) assert_array_equal(sorted(chi2.get_support(indices=True)), [0, 2]) Xsp = csr_container(X, dtype=np.float64) chi2 = mkchi2(k=2).fit(Xsp, y) assert_array_equal(sorted(chi2.get_support(indices=True)), [0, 2]) Xtrans = chi2.transform(Xsp) assert_array_equal(Xtrans.shape, [Xsp.shape[0], 2]) # == doesn't work on scipy.sparse matrices Xtrans = Xtrans.toarray() Xtrans2 = mkchi2(k=2).fit_transform(Xsp, y).toarray() assert_array_almost_equal(Xtrans, Xtrans2) @pytest.mark.parametrize("coo_container", COO_CONTAINERS) def test_chi2_coo(coo_container): # Check that chi2 works with a COO matrix # (as returned by CountVectorizer, DictVectorizer) Xcoo = coo_container(X) mkchi2(k=2).fit_transform(Xcoo, y) # if we got here without an exception, we're safe @pytest.mark.parametrize("csr_container", CSR_CONTAINERS) def test_chi2_negative(csr_container): # Check for proper error on negative numbers in the input X. X, y = [[0, 1], [-1e-20, 1]], [0, 1] for X in (X, np.array(X), csr_container(X)): with pytest.raises(ValueError): chi2(X, y) def test_chi2_unused_feature(): # Unused feature should evaluate to NaN # and should issue no runtime warning with warnings.catch_warnings(record=True) as warned: warnings.simplefilter("always") chi, p = chi2([[1, 0], [0, 0]], [1, 0]) for w in warned: if "divide by zero" in repr(w): raise AssertionError("Found unexpected warning %s" % w) assert_array_equal(chi, [1, np.nan]) assert_array_equal(p[1], np.nan) def test_chisquare(): # Test replacement for scipy.stats.chisquare against the original. obs = np.array([[2.0, 2.0], [1.0, 1.0]]) exp = np.array([[1.5, 1.5], [1.5, 1.5]]) # call SciPy first because our version overwrites obs chi_scp, p_scp = scipy.stats.chisquare(obs, exp) chi_our, p_our = _chisquare(obs, exp) assert_array_almost_equal(chi_scp, chi_our) assert_array_almost_equal(p_scp, p_our)