import functools import warnings from typing import Any, List import numpy as np import pytest import scipy.sparse as sp from sklearn.exceptions import DataDimensionalityWarning, NotFittedError from sklearn.metrics import euclidean_distances from sklearn.random_projection import ( GaussianRandomProjection, SparseRandomProjection, _gaussian_random_matrix, _sparse_random_matrix, johnson_lindenstrauss_min_dim, ) from sklearn.utils._testing import ( assert_allclose, assert_allclose_dense_sparse, assert_almost_equal, assert_array_almost_equal, assert_array_equal, ) from sklearn.utils.fixes import COO_CONTAINERS all_sparse_random_matrix: List[Any] = [_sparse_random_matrix] all_dense_random_matrix: List[Any] = [_gaussian_random_matrix] all_random_matrix = all_sparse_random_matrix + all_dense_random_matrix all_SparseRandomProjection: List[Any] = [SparseRandomProjection] all_DenseRandomProjection: List[Any] = [GaussianRandomProjection] all_RandomProjection = all_SparseRandomProjection + all_DenseRandomProjection def make_sparse_random_data( coo_container, n_samples, n_features, n_nonzeros, random_state=None, sparse_format="csr", ): """Make some random data with uniformly located non zero entries with Gaussian distributed values; `sparse_format` can be `"csr"` (default) or `None` (in which case a dense array is returned). """ rng = np.random.RandomState(random_state) data_coo = coo_container( ( rng.randn(n_nonzeros), ( rng.randint(n_samples, size=n_nonzeros), rng.randint(n_features, size=n_nonzeros), ), ), shape=(n_samples, n_features), ) if sparse_format is not None: return data_coo.asformat(sparse_format) else: return data_coo.toarray() def densify(matrix): if not sp.issparse(matrix): return matrix else: return matrix.toarray() n_samples, n_features = (10, 1000) n_nonzeros = int(n_samples * n_features / 100.0) ############################################################################### # test on JL lemma ############################################################################### @pytest.mark.parametrize( "n_samples, eps", [ ([100, 110], [0.9, 1.1]), ([90, 100], [0.1, 0.0]), ([50, -40], [0.1, 0.2]), ], ) def test_invalid_jl_domain(n_samples, eps): with pytest.raises(ValueError): johnson_lindenstrauss_min_dim(n_samples, eps=eps) def test_input_size_jl_min_dim(): with pytest.raises(ValueError): johnson_lindenstrauss_min_dim(3 * [100], eps=2 * [0.9]) johnson_lindenstrauss_min_dim( np.random.randint(1, 10, size=(10, 10)), eps=np.full((10, 10), 0.5) ) ############################################################################### # tests random matrix generation ############################################################################### def check_input_size_random_matrix(random_matrix): inputs = [(0, 0), (-1, 1), (1, -1), (1, 0), (-1, 0)] for n_components, n_features in inputs: with pytest.raises(ValueError): random_matrix(n_components, n_features) def check_size_generated(random_matrix): inputs = [(1, 5), (5, 1), (5, 5), (1, 1)] for n_components, n_features in inputs: assert random_matrix(n_components, n_features).shape == ( n_components, n_features, ) def check_zero_mean_and_unit_norm(random_matrix): # All random matrix should produce a transformation matrix # with zero mean and unit norm for each columns A = densify(random_matrix(10000, 1, random_state=0)) assert_array_almost_equal(0, np.mean(A), 3) assert_array_almost_equal(1.0, np.linalg.norm(A), 1) def check_input_with_sparse_random_matrix(random_matrix): n_components, n_features = 5, 10 for density in [-1.0, 0.0, 1.1]: with pytest.raises(ValueError): random_matrix(n_components, n_features, density=density) @pytest.mark.parametrize("random_matrix", all_random_matrix) def test_basic_property_of_random_matrix(random_matrix): # Check basic properties of random matrix generation check_input_size_random_matrix(random_matrix) check_size_generated(random_matrix) check_zero_mean_and_unit_norm(random_matrix) @pytest.mark.parametrize("random_matrix", all_sparse_random_matrix) def test_basic_property_of_sparse_random_matrix(random_matrix): check_input_with_sparse_random_matrix(random_matrix) random_matrix_dense = functools.partial(random_matrix, density=1.0) check_zero_mean_and_unit_norm(random_matrix_dense) def test_gaussian_random_matrix(): # Check some statical properties of Gaussian random matrix # Check that the random matrix follow the proper distribution. # Let's say that each element of a_{ij} of A is taken from # a_ij ~ N(0.0, 1 / n_components). # n_components = 100 n_features = 1000 A = _gaussian_random_matrix(n_components, n_features, random_state=0) assert_array_almost_equal(0.0, np.mean(A), 2) assert_array_almost_equal(np.var(A, ddof=1), 1 / n_components, 1) def test_sparse_random_matrix(): # Check some statical properties of sparse random matrix n_components = 100 n_features = 500 for density in [0.3, 1.0]: s = 1 / density A = _sparse_random_matrix( n_components, n_features, density=density, random_state=0 ) A = densify(A) # Check possible values values = np.unique(A) assert np.sqrt(s) / np.sqrt(n_components) in values assert -np.sqrt(s) / np.sqrt(n_components) in values if density == 1.0: assert np.size(values) == 2 else: assert 0.0 in values assert np.size(values) == 3 # Check that the random matrix follow the proper distribution. # Let's say that each element of a_{ij} of A is taken from # # - -sqrt(s) / sqrt(n_components) with probability 1 / 2s # - 0 with probability 1 - 1 / s # - +sqrt(s) / sqrt(n_components) with probability 1 / 2s # assert_almost_equal(np.mean(A == 0.0), 1 - 1 / s, decimal=2) assert_almost_equal( np.mean(A == np.sqrt(s) / np.sqrt(n_components)), 1 / (2 * s), decimal=2 ) assert_almost_equal( np.mean(A == -np.sqrt(s) / np.sqrt(n_components)), 1 / (2 * s), decimal=2 ) assert_almost_equal(np.var(A == 0.0, ddof=1), (1 - 1 / s) * 1 / s, decimal=2) assert_almost_equal( np.var(A == np.sqrt(s) / np.sqrt(n_components), ddof=1), (1 - 1 / (2 * s)) * 1 / (2 * s), decimal=2, ) assert_almost_equal( np.var(A == -np.sqrt(s) / np.sqrt(n_components), ddof=1), (1 - 1 / (2 * s)) * 1 / (2 * s), decimal=2, ) ############################################################################### # tests on random projection transformer ############################################################################### def test_random_projection_transformer_invalid_input(): n_components = "auto" fit_data = [[0, 1, 2]] for RandomProjection in all_RandomProjection: with pytest.raises(ValueError): RandomProjection(n_components=n_components).fit(fit_data) @pytest.mark.parametrize("coo_container", COO_CONTAINERS) def test_try_to_transform_before_fit(coo_container, global_random_seed): data = make_sparse_random_data( coo_container, n_samples, n_features, n_nonzeros, random_state=global_random_seed, sparse_format=None, ) for RandomProjection in all_RandomProjection: with pytest.raises(NotFittedError): RandomProjection(n_components="auto").transform(data) @pytest.mark.parametrize("coo_container", COO_CONTAINERS) def test_too_many_samples_to_find_a_safe_embedding(coo_container, global_random_seed): data = make_sparse_random_data( coo_container, n_samples=1000, n_features=100, n_nonzeros=1000, random_state=global_random_seed, sparse_format=None, ) for RandomProjection in all_RandomProjection: rp = RandomProjection(n_components="auto", eps=0.1) expected_msg = ( "eps=0.100000 and n_samples=1000 lead to a target dimension" " of 5920 which is larger than the original space with" " n_features=100" ) with pytest.raises(ValueError, match=expected_msg): rp.fit(data) @pytest.mark.parametrize("coo_container", COO_CONTAINERS) def test_random_projection_embedding_quality(coo_container): data = make_sparse_random_data( coo_container, n_samples=8, n_features=5000, n_nonzeros=15000, random_state=0, sparse_format=None, ) eps = 0.2 original_distances = euclidean_distances(data, squared=True) original_distances = original_distances.ravel() non_identical = original_distances != 0.0 # remove 0 distances to avoid division by 0 original_distances = original_distances[non_identical] for RandomProjection in all_RandomProjection: rp = RandomProjection(n_components="auto", eps=eps, random_state=0) projected = rp.fit_transform(data) projected_distances = euclidean_distances(projected, squared=True) projected_distances = projected_distances.ravel() # remove 0 distances to avoid division by 0 projected_distances = projected_distances[non_identical] distances_ratio = projected_distances / original_distances # check that the automatically tuned values for the density respect the # contract for eps: pairwise distances are preserved according to the # Johnson-Lindenstrauss lemma assert distances_ratio.max() < 1 + eps assert 1 - eps < distances_ratio.min() @pytest.mark.parametrize("coo_container", COO_CONTAINERS) def test_SparseRandomProj_output_representation(coo_container): dense_data = make_sparse_random_data( coo_container, n_samples, n_features, n_nonzeros, random_state=0, sparse_format=None, ) sparse_data = make_sparse_random_data( coo_container, n_samples, n_features, n_nonzeros, random_state=0, sparse_format="csr", ) for SparseRandomProj in all_SparseRandomProjection: # when using sparse input, the projected data can be forced to be a # dense numpy array rp = SparseRandomProj(n_components=10, dense_output=True, random_state=0) rp.fit(dense_data) assert isinstance(rp.transform(dense_data), np.ndarray) assert isinstance(rp.transform(sparse_data), np.ndarray) # the output can be left to a sparse matrix instead rp = SparseRandomProj(n_components=10, dense_output=False, random_state=0) rp = rp.fit(dense_data) # output for dense input will stay dense: assert isinstance(rp.transform(dense_data), np.ndarray) # output for sparse output will be sparse: assert sp.issparse(rp.transform(sparse_data)) @pytest.mark.parametrize("coo_container", COO_CONTAINERS) def test_correct_RandomProjection_dimensions_embedding( coo_container, global_random_seed ): data = make_sparse_random_data( coo_container, n_samples, n_features, n_nonzeros, random_state=global_random_seed, sparse_format=None, ) for RandomProjection in all_RandomProjection: rp = RandomProjection(n_components="auto", random_state=0, eps=0.5).fit(data) # the number of components is adjusted from the shape of the training # set assert rp.n_components == "auto" assert rp.n_components_ == 110 if RandomProjection in all_SparseRandomProjection: assert rp.density == "auto" assert_almost_equal(rp.density_, 0.03, 2) assert rp.components_.shape == (110, n_features) projected_1 = rp.transform(data) assert projected_1.shape == (n_samples, 110) # once the RP is 'fitted' the projection is always the same projected_2 = rp.transform(data) assert_array_equal(projected_1, projected_2) # fit transform with same random seed will lead to the same results rp2 = RandomProjection(random_state=0, eps=0.5) projected_3 = rp2.fit_transform(data) assert_array_equal(projected_1, projected_3) # Try to transform with an input X of size different from fitted. with pytest.raises(ValueError): rp.transform(data[:, 1:5]) # it is also possible to fix the number of components and the density # level if RandomProjection in all_SparseRandomProjection: rp = RandomProjection(n_components=100, density=0.001, random_state=0) projected = rp.fit_transform(data) assert projected.shape == (n_samples, 100) assert rp.components_.shape == (100, n_features) assert rp.components_.nnz < 115 # close to 1% density assert 85 < rp.components_.nnz # close to 1% density @pytest.mark.parametrize("coo_container", COO_CONTAINERS) def test_warning_n_components_greater_than_n_features( coo_container, global_random_seed ): n_features = 20 n_samples = 5 n_nonzeros = int(n_features / 4) data = make_sparse_random_data( coo_container, n_samples, n_features, n_nonzeros, random_state=global_random_seed, sparse_format=None, ) for RandomProjection in all_RandomProjection: with pytest.warns(DataDimensionalityWarning): RandomProjection(n_components=n_features + 1).fit(data) @pytest.mark.parametrize("coo_container", COO_CONTAINERS) def test_works_with_sparse_data(coo_container, global_random_seed): n_features = 20 n_samples = 5 n_nonzeros = int(n_features / 4) dense_data = make_sparse_random_data( coo_container, n_samples, n_features, n_nonzeros, random_state=global_random_seed, sparse_format=None, ) sparse_data = make_sparse_random_data( coo_container, n_samples, n_features, n_nonzeros, random_state=global_random_seed, sparse_format="csr", ) for RandomProjection in all_RandomProjection: rp_dense = RandomProjection(n_components=3, random_state=1).fit(dense_data) rp_sparse = RandomProjection(n_components=3, random_state=1).fit(sparse_data) assert_array_almost_equal( densify(rp_dense.components_), densify(rp_sparse.components_) ) def test_johnson_lindenstrauss_min_dim(): """Test Johnson-Lindenstrauss for small eps. Regression test for #17111: before #19374, 32-bit systems would fail. """ assert johnson_lindenstrauss_min_dim(100, eps=1e-5) == 368416070986 @pytest.mark.parametrize("coo_container", COO_CONTAINERS) @pytest.mark.parametrize("random_projection_cls", all_RandomProjection) def test_random_projection_feature_names_out( coo_container, random_projection_cls, global_random_seed ): data = make_sparse_random_data( coo_container, n_samples, n_features, n_nonzeros, random_state=global_random_seed, sparse_format=None, ) random_projection = random_projection_cls(n_components=2) random_projection.fit(data) names_out = random_projection.get_feature_names_out() class_name_lower = random_projection_cls.__name__.lower() expected_names_out = np.array( [f"{class_name_lower}{i}" for i in range(random_projection.n_components_)], dtype=object, ) assert_array_equal(names_out, expected_names_out) @pytest.mark.parametrize("coo_container", COO_CONTAINERS) @pytest.mark.parametrize("n_samples", (2, 9, 10, 11, 1000)) @pytest.mark.parametrize("n_features", (2, 9, 10, 11, 1000)) @pytest.mark.parametrize("random_projection_cls", all_RandomProjection) @pytest.mark.parametrize("compute_inverse_components", [True, False]) def test_inverse_transform( coo_container, n_samples, n_features, random_projection_cls, compute_inverse_components, global_random_seed, ): n_components = 10 random_projection = random_projection_cls( n_components=n_components, compute_inverse_components=compute_inverse_components, random_state=global_random_seed, ) X_dense = make_sparse_random_data( coo_container, n_samples, n_features, n_nonzeros=n_samples * n_features // 100 + 1, random_state=global_random_seed, sparse_format=None, ) X_csr = make_sparse_random_data( coo_container, n_samples, n_features, n_nonzeros=n_samples * n_features // 100 + 1, random_state=global_random_seed, sparse_format="csr", ) for X in [X_dense, X_csr]: with warnings.catch_warnings(): warnings.filterwarnings( "ignore", message=( "The number of components is higher than the number of features" ), category=DataDimensionalityWarning, ) projected = random_projection.fit_transform(X) if compute_inverse_components: assert hasattr(random_projection, "inverse_components_") inv_components = random_projection.inverse_components_ assert inv_components.shape == (n_features, n_components) projected_back = random_projection.inverse_transform(projected) assert projected_back.shape == X.shape projected_again = random_projection.transform(projected_back) if hasattr(projected, "toarray"): projected = projected.toarray() assert_allclose(projected, projected_again, rtol=1e-7, atol=1e-10) @pytest.mark.parametrize("random_projection_cls", all_RandomProjection) @pytest.mark.parametrize( "input_dtype, expected_dtype", ( (np.float32, np.float32), (np.float64, np.float64), (np.int32, np.float64), (np.int64, np.float64), ), ) def test_random_projection_dtype_match( random_projection_cls, input_dtype, expected_dtype ): # Verify output matrix dtype rng = np.random.RandomState(42) X = rng.rand(25, 3000) rp = random_projection_cls(random_state=0) transformed = rp.fit_transform(X.astype(input_dtype)) assert rp.components_.dtype == expected_dtype assert transformed.dtype == expected_dtype @pytest.mark.parametrize("random_projection_cls", all_RandomProjection) def test_random_projection_numerical_consistency(random_projection_cls): # Verify numerical consistency among np.float32 and np.float64 atol = 1e-5 rng = np.random.RandomState(42) X = rng.rand(25, 3000) rp_32 = random_projection_cls(random_state=0) rp_64 = random_projection_cls(random_state=0) projection_32 = rp_32.fit_transform(X.astype(np.float32)) projection_64 = rp_64.fit_transform(X.astype(np.float64)) assert_allclose(projection_64, projection_32, atol=atol) assert_allclose_dense_sparse(rp_32.components_, rp_64.components_)