import numpy as np from numpy.testing import assert_array_equal import pytest from sklearn.feature_extraction import FeatureHasher from sklearn.utils._testing import ignore_warnings, fails_if_pypy pytestmark = fails_if_pypy def test_feature_hasher_dicts(): h = FeatureHasher(n_features=16) assert "dict" == h.input_type raw_X = [{"foo": "bar", "dada": 42, "tzara": 37}, {"foo": "baz", "gaga": "string1"}] X1 = FeatureHasher(n_features=16).transform(raw_X) gen = (iter(d.items()) for d in raw_X) X2 = FeatureHasher(n_features=16, input_type="pair").transform(gen) assert_array_equal(X1.toarray(), X2.toarray()) def test_feature_hasher_strings(): # mix byte and Unicode strings; note that "foo" is a duplicate in row 0 raw_X = [ ["foo", "bar", "baz", "foo".encode("ascii")], ["bar".encode("ascii"), "baz", "quux"], ] for lg_n_features in (7, 9, 11, 16, 22): n_features = 2 ** lg_n_features it = (x for x in raw_X) # iterable h = FeatureHasher( n_features=n_features, input_type="string", alternate_sign=False ) X = h.transform(it) assert X.shape[0] == len(raw_X) assert X.shape[1] == n_features assert X[0].sum() == 4 assert X[1].sum() == 3 assert X.nnz == 6 def test_hashing_transform_seed(): # check the influence of the seed when computing the hashes # import is here to avoid importing on pypy from sklearn.feature_extraction._hashing_fast import transform as _hashing_transform raw_X = [ ["foo", "bar", "baz", "foo".encode("ascii")], ["bar".encode("ascii"), "baz", "quux"], ] raw_X_ = (((f, 1) for f in x) for x in raw_X) indices, indptr, _ = _hashing_transform(raw_X_, 2 ** 7, str, False) raw_X_ = (((f, 1) for f in x) for x in raw_X) indices_0, indptr_0, _ = _hashing_transform(raw_X_, 2 ** 7, str, False, seed=0) assert_array_equal(indices, indices_0) assert_array_equal(indptr, indptr_0) raw_X_ = (((f, 1) for f in x) for x in raw_X) indices_1, _, _ = _hashing_transform(raw_X_, 2 ** 7, str, False, seed=1) with pytest.raises(AssertionError): assert_array_equal(indices, indices_1) def test_feature_hasher_pairs(): raw_X = ( iter(d.items()) for d in [{"foo": 1, "bar": 2}, {"baz": 3, "quux": 4, "foo": -1}] ) h = FeatureHasher(n_features=16, input_type="pair") x1, x2 = h.transform(raw_X).toarray() x1_nz = sorted(np.abs(x1[x1 != 0])) x2_nz = sorted(np.abs(x2[x2 != 0])) assert [1, 2] == x1_nz assert [1, 3, 4] == x2_nz def test_feature_hasher_pairs_with_string_values(): raw_X = ( iter(d.items()) for d in [{"foo": 1, "bar": "a"}, {"baz": "abc", "quux": 4, "foo": -1}] ) h = FeatureHasher(n_features=16, input_type="pair") x1, x2 = h.transform(raw_X).toarray() x1_nz = sorted(np.abs(x1[x1 != 0])) x2_nz = sorted(np.abs(x2[x2 != 0])) assert [1, 1] == x1_nz assert [1, 1, 4] == x2_nz raw_X = (iter(d.items()) for d in [{"bax": "abc"}, {"bax": "abc"}]) x1, x2 = h.transform(raw_X).toarray() x1_nz = np.abs(x1[x1 != 0]) x2_nz = np.abs(x2[x2 != 0]) assert [1] == x1_nz assert [1] == x2_nz assert_array_equal(x1, x2) def test_hash_empty_input(): n_features = 16 raw_X = [[], (), iter(range(0))] h = FeatureHasher(n_features=n_features, input_type="string") X = h.transform(raw_X) assert_array_equal(X.A, np.zeros((len(raw_X), n_features))) def test_hasher_invalid_input(): with pytest.raises(ValueError): FeatureHasher(input_type="gobbledygook") with pytest.raises(ValueError): FeatureHasher(n_features=-1) with pytest.raises(ValueError): FeatureHasher(n_features=0) with pytest.raises(TypeError): FeatureHasher(n_features="ham") h = FeatureHasher(n_features=np.uint16(2 ** 6)) with pytest.raises(ValueError): h.transform([]) with pytest.raises(Exception): h.transform([[5.5]]) with pytest.raises(Exception): h.transform([[None]]) def test_hasher_set_params(): # Test delayed input validation in fit (useful for grid search). hasher = FeatureHasher() hasher.set_params(n_features=np.inf) with pytest.raises(TypeError): hasher.fit() def test_hasher_zeros(): # Assert that no zeros are materialized in the output. X = FeatureHasher().transform([{"foo": 0}]) assert X.data.shape == (0,) @ignore_warnings(category=FutureWarning) def test_hasher_alternate_sign(): X = [list("Thequickbrownfoxjumped")] Xt = FeatureHasher(alternate_sign=True, input_type="string").fit_transform(X) assert Xt.data.min() < 0 and Xt.data.max() > 0 Xt = FeatureHasher(alternate_sign=False, input_type="string").fit_transform(X) assert Xt.data.min() > 0 def test_hash_collisions(): X = [list("Thequickbrownfoxjumped")] Xt = FeatureHasher( alternate_sign=True, n_features=1, input_type="string" ).fit_transform(X) # check that some of the hashed tokens are added # with an opposite sign and cancel out assert abs(Xt.data[0]) < len(X[0]) Xt = FeatureHasher( alternate_sign=False, n_features=1, input_type="string" ).fit_transform(X) assert Xt.data[0] == len(X[0])