"""Testing for Spectral Biclustering methods""" import numpy as np import pytest from scipy.sparse import issparse from sklearn.base import BaseEstimator, BiclusterMixin from sklearn.cluster import SpectralBiclustering, SpectralCoclustering from sklearn.cluster._bicluster import ( _bistochastic_normalize, _log_normalize, _scale_normalize, ) from sklearn.datasets import make_biclusters, make_checkerboard from sklearn.metrics import consensus_score, v_measure_score from sklearn.model_selection import ParameterGrid from sklearn.utils._testing import ( assert_almost_equal, assert_array_almost_equal, assert_array_equal, ) from sklearn.utils.fixes import CSR_CONTAINERS class MockBiclustering(BiclusterMixin, BaseEstimator): # Mock object for testing get_submatrix. def __init__(self): pass def get_indices(self, i): # Overridden to reproduce old get_submatrix test. return ( np.where([True, True, False, False, True])[0], np.where([False, False, True, True])[0], ) @pytest.mark.parametrize("csr_container", CSR_CONTAINERS) def test_get_submatrix(csr_container): data = np.arange(20).reshape(5, 4) model = MockBiclustering() for X in (data, csr_container(data), data.tolist()): submatrix = model.get_submatrix(0, X) if issparse(submatrix): submatrix = submatrix.toarray() assert_array_equal(submatrix, [[2, 3], [6, 7], [18, 19]]) submatrix[:] = -1 if issparse(X): X = X.toarray() assert np.all(X != -1) def _test_shape_indices(model): # Test get_shape and get_indices on fitted model. for i in range(model.n_clusters): m, n = model.get_shape(i) i_ind, j_ind = model.get_indices(i) assert len(i_ind) == m assert len(j_ind) == n @pytest.mark.parametrize("csr_container", CSR_CONTAINERS) def test_spectral_coclustering(global_random_seed, csr_container): # Test Dhillon's Spectral CoClustering on a simple problem. param_grid = { "svd_method": ["randomized", "arpack"], "n_svd_vecs": [None, 20], "mini_batch": [False, True], "init": ["k-means++"], "n_init": [10], } S, rows, cols = make_biclusters( (30, 30), 3, noise=0.1, random_state=global_random_seed ) S -= S.min() # needs to be nonnegative before making it sparse S = np.where(S < 1, 0, S) # threshold some values for mat in (S, csr_container(S)): for kwargs in ParameterGrid(param_grid): model = SpectralCoclustering( n_clusters=3, random_state=global_random_seed, **kwargs ) model.fit(mat) assert model.rows_.shape == (3, 30) assert_array_equal(model.rows_.sum(axis=0), np.ones(30)) assert_array_equal(model.columns_.sum(axis=0), np.ones(30)) assert consensus_score(model.biclusters_, (rows, cols)) == 1 _test_shape_indices(model) @pytest.mark.parametrize("csr_container", CSR_CONTAINERS) def test_spectral_biclustering(global_random_seed, csr_container): # Test Kluger methods on a checkerboard dataset. S, rows, cols = make_checkerboard( (30, 30), 3, noise=0.5, random_state=global_random_seed ) non_default_params = { "method": ["scale", "log"], "svd_method": ["arpack"], "n_svd_vecs": [20], "mini_batch": [True], } for mat in (S, csr_container(S)): for param_name, param_values in non_default_params.items(): for param_value in param_values: model = SpectralBiclustering( n_clusters=3, n_init=3, init="k-means++", random_state=global_random_seed, ) model.set_params(**dict([(param_name, param_value)])) if issparse(mat) and model.get_params().get("method") == "log": # cannot take log of sparse matrix with pytest.raises(ValueError): model.fit(mat) continue else: model.fit(mat) assert model.rows_.shape == (9, 30) assert model.columns_.shape == (9, 30) assert_array_equal(model.rows_.sum(axis=0), np.repeat(3, 30)) assert_array_equal(model.columns_.sum(axis=0), np.repeat(3, 30)) assert consensus_score(model.biclusters_, (rows, cols)) == 1 _test_shape_indices(model) def _do_scale_test(scaled): """Check that rows sum to one constant, and columns to another.""" row_sum = scaled.sum(axis=1) col_sum = scaled.sum(axis=0) if issparse(scaled): row_sum = np.asarray(row_sum).squeeze() col_sum = np.asarray(col_sum).squeeze() assert_array_almost_equal(row_sum, np.tile(row_sum.mean(), 100), decimal=1) assert_array_almost_equal(col_sum, np.tile(col_sum.mean(), 100), decimal=1) def _do_bistochastic_test(scaled): """Check that rows and columns sum to the same constant.""" _do_scale_test(scaled) assert_almost_equal(scaled.sum(axis=0).mean(), scaled.sum(axis=1).mean(), decimal=1) @pytest.mark.parametrize("csr_container", CSR_CONTAINERS) def test_scale_normalize(global_random_seed, csr_container): generator = np.random.RandomState(global_random_seed) X = generator.rand(100, 100) for mat in (X, csr_container(X)): scaled, _, _ = _scale_normalize(mat) _do_scale_test(scaled) if issparse(mat): assert issparse(scaled) @pytest.mark.parametrize("csr_container", CSR_CONTAINERS) def test_bistochastic_normalize(global_random_seed, csr_container): generator = np.random.RandomState(global_random_seed) X = generator.rand(100, 100) for mat in (X, csr_container(X)): scaled = _bistochastic_normalize(mat) _do_bistochastic_test(scaled) if issparse(mat): assert issparse(scaled) def test_log_normalize(global_random_seed): # adding any constant to a log-scaled matrix should make it # bistochastic generator = np.random.RandomState(global_random_seed) mat = generator.rand(100, 100) scaled = _log_normalize(mat) + 1 _do_bistochastic_test(scaled) def test_fit_best_piecewise(global_random_seed): model = SpectralBiclustering(random_state=global_random_seed) vectors = np.array([[0, 0, 0, 1, 1, 1], [2, 2, 2, 3, 3, 3], [0, 1, 2, 3, 4, 5]]) best = model._fit_best_piecewise(vectors, n_best=2, n_clusters=2) assert_array_equal(best, vectors[:2]) @pytest.mark.parametrize("csr_container", CSR_CONTAINERS) def test_project_and_cluster(global_random_seed, csr_container): model = SpectralBiclustering(random_state=global_random_seed) data = np.array([[1, 1, 1], [1, 1, 1], [3, 6, 3], [3, 6, 3]]) vectors = np.array([[1, 0], [0, 1], [0, 0]]) for mat in (data, csr_container(data)): labels = model._project_and_cluster(mat, vectors, n_clusters=2) assert_almost_equal(v_measure_score(labels, [0, 0, 1, 1]), 1.0) def test_perfect_checkerboard(global_random_seed): # XXX Previously failed on build bot (not reproducible) model = SpectralBiclustering( 3, svd_method="arpack", random_state=global_random_seed ) S, rows, cols = make_checkerboard( (30, 30), 3, noise=0, random_state=global_random_seed ) model.fit(S) assert consensus_score(model.biclusters_, (rows, cols)) == 1 S, rows, cols = make_checkerboard( (40, 30), 3, noise=0, random_state=global_random_seed ) model.fit(S) assert consensus_score(model.biclusters_, (rows, cols)) == 1 S, rows, cols = make_checkerboard( (30, 40), 3, noise=0, random_state=global_random_seed ) model.fit(S) assert consensus_score(model.biclusters_, (rows, cols)) == 1 @pytest.mark.parametrize( "params, type_err, err_msg", [ ( {"n_clusters": 6}, ValueError, "n_clusters should be <= n_samples=5", ), ( {"n_clusters": (3, 3, 3)}, ValueError, "Incorrect parameter n_clusters", ), ( {"n_clusters": (3, 6)}, ValueError, "Incorrect parameter n_clusters", ), ( {"n_components": 3, "n_best": 4}, ValueError, "n_best=4 must be <= n_components=3", ), ], ) def test_spectralbiclustering_parameter_validation(params, type_err, err_msg): """Check parameters validation in `SpectralBiClustering`""" data = np.arange(25).reshape((5, 5)) model = SpectralBiclustering(**params) with pytest.raises(type_err, match=err_msg): model.fit(data) @pytest.mark.parametrize("est", (SpectralBiclustering(), SpectralCoclustering())) def test_n_features_in_(est): X, _, _ = make_biclusters((3, 3), 3, random_state=0) assert not hasattr(est, "n_features_in_") est.fit(X) assert est.n_features_in_ == 3