import warnings

import numpy as np
import pytest

import sklearn
from sklearn.datasets import load_iris, make_blobs, make_circles
from sklearn.decomposition import PCA, KernelPCA
from sklearn.exceptions import NotFittedError
from sklearn.linear_model import Perceptron
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.utils._testing import (
    assert_allclose,
    assert_array_almost_equal,
    assert_array_equal,
)
from sklearn.utils.fixes import CSR_CONTAINERS
from sklearn.utils.validation import _check_psd_eigenvalues


def test_kernel_pca():
    """Nominal test for all solvers and all known kernels + a custom one

    It tests
     - that fit_transform is equivalent to fit+transform
     - that the shapes of transforms and inverse transforms are correct
    """
    rng = np.random.RandomState(0)
    X_fit = rng.random_sample((5, 4))
    X_pred = rng.random_sample((2, 4))

    def histogram(x, y, **kwargs):
        # Histogram kernel implemented as a callable.
        assert kwargs == {}  # no kernel_params that we didn't ask for
        return np.minimum(x, y).sum()

    for eigen_solver in ("auto", "dense", "arpack", "randomized"):
        for kernel in ("linear", "rbf", "poly", histogram):
            # histogram kernel produces singular matrix inside linalg.solve
            # XXX use a least-squares approximation?
            inv = not callable(kernel)

            # transform fit data
            kpca = KernelPCA(
                4, kernel=kernel, eigen_solver=eigen_solver, fit_inverse_transform=inv
            )
            X_fit_transformed = kpca.fit_transform(X_fit)
            X_fit_transformed2 = kpca.fit(X_fit).transform(X_fit)
            assert_array_almost_equal(
                np.abs(X_fit_transformed), np.abs(X_fit_transformed2)
            )

            # non-regression test: previously, gamma would be 0 by default,
            # forcing all eigenvalues to 0 under the poly kernel
            assert X_fit_transformed.size != 0

            # transform new data
            X_pred_transformed = kpca.transform(X_pred)
            assert X_pred_transformed.shape[1] == X_fit_transformed.shape[1]

            # inverse transform
            if inv:
                X_pred2 = kpca.inverse_transform(X_pred_transformed)
                assert X_pred2.shape == X_pred.shape


def test_kernel_pca_invalid_parameters():
    """Check that kPCA raises an error if the parameters are invalid

    Tests fitting inverse transform with a precomputed kernel raises a
    ValueError.
    """
    estimator = KernelPCA(
        n_components=10, fit_inverse_transform=True, kernel="precomputed"
    )
    err_ms = "Cannot fit_inverse_transform with a precomputed kernel"
    with pytest.raises(ValueError, match=err_ms):
        estimator.fit(np.random.randn(10, 10))


def test_kernel_pca_consistent_transform():
    """Check robustness to mutations in the original training array

    Test that after fitting a kPCA model, it stays independent of any
    mutation of the values of the original data object by relying on an
    internal copy.
    """
    # X_fit_ needs to retain the old, unmodified copy of X
    state = np.random.RandomState(0)
    X = state.rand(10, 10)
    kpca = KernelPCA(random_state=state).fit(X)
    transformed1 = kpca.transform(X)

    X_copy = X.copy()
    X[:, 0] = 666
    transformed2 = kpca.transform(X_copy)
    assert_array_almost_equal(transformed1, transformed2)
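

# Illustrative sketch (an addition, not part of the original suite): it shows
# that a callable kernel should give the same embedding as passing its
# explicitly precomputed Gram matrix with ``kernel="precomputed"``, up to the
# sign of each component. It reuses the histogram kernel of ``test_kernel_pca``
# above; the test name below is ours, chosen for illustration only.
def test_kernel_pca_callable_vs_precomputed_sketch():
    rng = np.random.RandomState(0)
    X = rng.random_sample((5, 4))

    def histogram(x, y):
        # same histogram (min) kernel as the callable used in test_kernel_pca
        return np.minimum(x, y).sum()

    # Gram matrix computed explicitly from the same kernel
    K = np.array([[histogram(a, b) for b in X] for a in X])

    X_callable = KernelPCA(
        n_components=2, kernel=histogram, eigen_solver="dense", random_state=0
    ).fit_transform(X)
    X_precomputed = KernelPCA(
        n_components=2, kernel="precomputed", eigen_solver="dense", random_state=0
    ).fit_transform(K)

    # same embedding modulo the sign (direction) of each component
    assert_array_almost_equal(np.abs(X_callable), np.abs(X_precomputed))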
""" rng = np.random.RandomState(0) X = rng.rand(10, 10) eigen_solver = ("arpack", "dense") for solver in eigen_solver: transformed_X = np.zeros((20, 2)) for i in range(20): kpca = KernelPCA(n_components=2, eigen_solver=solver, random_state=rng) transformed_X[i, :] = kpca.fit_transform(X)[0] assert_allclose(transformed_X, np.tile(transformed_X[0, :], 20).reshape(20, 2)) @pytest.mark.parametrize("csr_container", CSR_CONTAINERS) def test_kernel_pca_sparse(csr_container): """Test that kPCA works on a sparse data input. Same test as ``test_kernel_pca except inverse_transform`` since it's not implemented for sparse matrices. """ rng = np.random.RandomState(0) X_fit = csr_container(rng.random_sample((5, 4))) X_pred = csr_container(rng.random_sample((2, 4))) for eigen_solver in ("auto", "arpack", "randomized"): for kernel in ("linear", "rbf", "poly"): # transform fit data kpca = KernelPCA( 4, kernel=kernel, eigen_solver=eigen_solver, fit_inverse_transform=False, random_state=0, ) X_fit_transformed = kpca.fit_transform(X_fit) X_fit_transformed2 = kpca.fit(X_fit).transform(X_fit) assert_array_almost_equal( np.abs(X_fit_transformed), np.abs(X_fit_transformed2) ) # transform new data X_pred_transformed = kpca.transform(X_pred) assert X_pred_transformed.shape[1] == X_fit_transformed.shape[1] # inverse transform: not available for sparse matrices # XXX: should we raise another exception type here? For instance: # NotImplementedError. with pytest.raises(NotFittedError): kpca.inverse_transform(X_pred_transformed) @pytest.mark.parametrize("solver", ["auto", "dense", "arpack", "randomized"]) @pytest.mark.parametrize("n_features", [4, 10]) def test_kernel_pca_linear_kernel(solver, n_features): """Test that kPCA with linear kernel is equivalent to PCA for all solvers. KernelPCA with linear kernel should produce the same output as PCA. """ rng = np.random.RandomState(0) X_fit = rng.random_sample((5, n_features)) X_pred = rng.random_sample((2, n_features)) # for a linear kernel, kernel PCA should find the same projection as PCA # modulo the sign (direction) # fit only the first four components: fifth is near zero eigenvalue, so # can be trimmed due to roundoff error n_comps = 3 if solver == "arpack" else 4 assert_array_almost_equal( np.abs(KernelPCA(n_comps, eigen_solver=solver).fit(X_fit).transform(X_pred)), np.abs( PCA(n_comps, svd_solver=solver if solver != "dense" else "full") .fit(X_fit) .transform(X_pred) ), ) def test_kernel_pca_n_components(): """Test that `n_components` is correctly taken into account for projections For all solvers this tests that the output has the correct shape depending on the selected number of components. """ rng = np.random.RandomState(0) X_fit = rng.random_sample((5, 4)) X_pred = rng.random_sample((2, 4)) for eigen_solver in ("dense", "arpack", "randomized"): for c in [1, 2, 4]: kpca = KernelPCA(n_components=c, eigen_solver=eigen_solver) shape = kpca.fit(X_fit).transform(X_pred).shape assert shape == (2, c) def test_remove_zero_eig(): """Check that the ``remove_zero_eig`` parameter works correctly. Tests that the null-space (Zero) eigenvalues are removed when remove_zero_eig=True, whereas they are not by default. 
""" X = np.array([[1 - 1e-30, 1], [1, 1], [1, 1 - 1e-20]]) # n_components=None (default) => remove_zero_eig is True kpca = KernelPCA() Xt = kpca.fit_transform(X) assert Xt.shape == (3, 0) kpca = KernelPCA(n_components=2) Xt = kpca.fit_transform(X) assert Xt.shape == (3, 2) kpca = KernelPCA(n_components=2, remove_zero_eig=True) Xt = kpca.fit_transform(X) assert Xt.shape == (3, 0) def test_leave_zero_eig(): """Non-regression test for issue #12141 (PR #12143) This test checks that fit().transform() returns the same result as fit_transform() in case of non-removed zero eigenvalue. """ X_fit = np.array([[1, 1], [0, 0]]) # Assert that even with all np warnings on, there is no div by zero warning with warnings.catch_warnings(): # There might be warnings about the kernel being badly conditioned, # but there should not be warnings about division by zero. # (Numpy division by zero warning can have many message variants, but # at least we know that it is a RuntimeWarning so lets check only this) warnings.simplefilter("error", RuntimeWarning) with np.errstate(all="warn"): k = KernelPCA(n_components=2, remove_zero_eig=False, eigen_solver="dense") # Fit, then transform A = k.fit(X_fit).transform(X_fit) # Do both at once B = k.fit_transform(X_fit) # Compare assert_array_almost_equal(np.abs(A), np.abs(B)) def test_kernel_pca_precomputed(): """Test that kPCA works with a precomputed kernel, for all solvers""" rng = np.random.RandomState(0) X_fit = rng.random_sample((5, 4)) X_pred = rng.random_sample((2, 4)) for eigen_solver in ("dense", "arpack", "randomized"): X_kpca = ( KernelPCA(4, eigen_solver=eigen_solver, random_state=0) .fit(X_fit) .transform(X_pred) ) X_kpca2 = ( KernelPCA( 4, eigen_solver=eigen_solver, kernel="precomputed", random_state=0 ) .fit(np.dot(X_fit, X_fit.T)) .transform(np.dot(X_pred, X_fit.T)) ) X_kpca_train = KernelPCA( 4, eigen_solver=eigen_solver, kernel="precomputed", random_state=0 ).fit_transform(np.dot(X_fit, X_fit.T)) X_kpca_train2 = ( KernelPCA( 4, eigen_solver=eigen_solver, kernel="precomputed", random_state=0 ) .fit(np.dot(X_fit, X_fit.T)) .transform(np.dot(X_fit, X_fit.T)) ) assert_array_almost_equal(np.abs(X_kpca), np.abs(X_kpca2)) assert_array_almost_equal(np.abs(X_kpca_train), np.abs(X_kpca_train2)) @pytest.mark.parametrize("solver", ["auto", "dense", "arpack", "randomized"]) def test_kernel_pca_precomputed_non_symmetric(solver): """Check that the kernel centerer works. Tests that a non symmetric precomputed kernel is actually accepted because the kernel centerer does its job correctly. """ # a non symmetric gram matrix K = [[1, 2], [3, 40]] kpca = KernelPCA( kernel="precomputed", eigen_solver=solver, n_components=1, random_state=0 ) kpca.fit(K) # no error # same test with centered kernel Kc = [[9, -9], [-9, 9]] kpca_c = KernelPCA( kernel="precomputed", eigen_solver=solver, n_components=1, random_state=0 ) kpca_c.fit(Kc) # comparison between the non-centered and centered versions assert_array_equal(kpca.eigenvectors_, kpca_c.eigenvectors_) assert_array_equal(kpca.eigenvalues_, kpca_c.eigenvalues_) def test_gridsearch_pipeline(): """Check that kPCA works as expected in a grid search pipeline Test if we can do a grid-search to find parameters to separate circles with a perceptron model. 
""" X, y = make_circles(n_samples=400, factor=0.3, noise=0.05, random_state=0) kpca = KernelPCA(kernel="rbf", n_components=2) pipeline = Pipeline([("kernel_pca", kpca), ("Perceptron", Perceptron(max_iter=5))]) param_grid = dict(kernel_pca__gamma=2.0 ** np.arange(-2, 2)) grid_search = GridSearchCV(pipeline, cv=3, param_grid=param_grid) grid_search.fit(X, y) assert grid_search.best_score_ == 1 def test_gridsearch_pipeline_precomputed(): """Check that kPCA works as expected in a grid search pipeline (2) Test if we can do a grid-search to find parameters to separate circles with a perceptron model. This test uses a precomputed kernel. """ X, y = make_circles(n_samples=400, factor=0.3, noise=0.05, random_state=0) kpca = KernelPCA(kernel="precomputed", n_components=2) pipeline = Pipeline([("kernel_pca", kpca), ("Perceptron", Perceptron(max_iter=5))]) param_grid = dict(Perceptron__max_iter=np.arange(1, 5)) grid_search = GridSearchCV(pipeline, cv=3, param_grid=param_grid) X_kernel = rbf_kernel(X, gamma=2.0) grid_search.fit(X_kernel, y) assert grid_search.best_score_ == 1 def test_nested_circles(): """Check that kPCA projects in a space where nested circles are separable Tests that 2D nested circles become separable with a perceptron when projected in the first 2 kPCA using an RBF kernel, while raw samples are not directly separable in the original space. """ X, y = make_circles(n_samples=400, factor=0.3, noise=0.05, random_state=0) # 2D nested circles are not linearly separable train_score = Perceptron(max_iter=5).fit(X, y).score(X, y) assert train_score < 0.8 # Project the circles data into the first 2 components of a RBF Kernel # PCA model. # Note that the gamma value is data dependent. If this test breaks # and the gamma value has to be updated, the Kernel PCA example will # have to be updated too. kpca = KernelPCA( kernel="rbf", n_components=2, fit_inverse_transform=True, gamma=2.0 ) X_kpca = kpca.fit_transform(X) # The data is perfectly linearly separable in that space train_score = Perceptron(max_iter=5).fit(X_kpca, y).score(X_kpca, y) assert train_score == 1.0 def test_kernel_conditioning(): """Check that ``_check_psd_eigenvalues`` is correctly called in kPCA Non-regression test for issue #12140 (PR #12145). """ # create a pathological X leading to small non-zero eigenvalue X = [[5, 1], [5 + 1e-8, 1e-8], [5 + 1e-8, 0]] kpca = KernelPCA(kernel="linear", n_components=2, fit_inverse_transform=True) kpca.fit(X) # check that the small non-zero eigenvalue was correctly set to zero assert kpca.eigenvalues_.min() == 0 assert np.all(kpca.eigenvalues_ == _check_psd_eigenvalues(kpca.eigenvalues_)) @pytest.mark.parametrize("solver", ["auto", "dense", "arpack", "randomized"]) def test_precomputed_kernel_not_psd(solver): """Check how KernelPCA works with non-PSD kernels depending on n_components Tests for all methods what happens with a non PSD gram matrix (this can happen in an isomap scenario, or with custom kernel functions, or maybe with ill-posed datasets). When ``n_component`` is large enough to capture a negative eigenvalue, an error should be raised. Otherwise, KernelPCA should run without error since the negative eigenvalues are not selected. 
""" # a non PSD kernel with large eigenvalues, already centered # it was captured from an isomap call and multiplied by 100 for compacity K = [ [4.48, -1.0, 8.07, 2.33, 2.33, 2.33, -5.76, -12.78], [-1.0, -6.48, 4.5, -1.24, -1.24, -1.24, -0.81, 7.49], [8.07, 4.5, 15.48, 2.09, 2.09, 2.09, -11.1, -23.23], [2.33, -1.24, 2.09, 4.0, -3.65, -3.65, 1.02, -0.9], [2.33, -1.24, 2.09, -3.65, 4.0, -3.65, 1.02, -0.9], [2.33, -1.24, 2.09, -3.65, -3.65, 4.0, 1.02, -0.9], [-5.76, -0.81, -11.1, 1.02, 1.02, 1.02, 4.86, 9.75], [-12.78, 7.49, -23.23, -0.9, -0.9, -0.9, 9.75, 21.46], ] # this gram matrix has 5 positive eigenvalues and 3 negative ones # [ 52.72, 7.65, 7.65, 5.02, 0. , -0. , -6.13, -15.11] # 1. ask for enough components to get a significant negative one kpca = KernelPCA(kernel="precomputed", eigen_solver=solver, n_components=7) # make sure that the appropriate error is raised with pytest.raises(ValueError, match="There are significant negative eigenvalues"): kpca.fit(K) # 2. ask for a small enough n_components to get only positive ones kpca = KernelPCA(kernel="precomputed", eigen_solver=solver, n_components=2) if solver == "randomized": # the randomized method is still inconsistent with the others on this # since it selects the eigenvalues based on the largest 2 modules, not # on the largest 2 values. # # At least we can ensure that we return an error instead of returning # the wrong eigenvalues with pytest.raises( ValueError, match="There are significant negative eigenvalues" ): kpca.fit(K) else: # general case: make sure that it works kpca.fit(K) @pytest.mark.parametrize("n_components", [4, 10, 20]) def test_kernel_pca_solvers_equivalence(n_components): """Check that 'dense' 'arpack' & 'randomized' solvers give similar results""" # Generate random data n_train, n_test = 1_000, 100 X, _ = make_circles( n_samples=(n_train + n_test), factor=0.3, noise=0.05, random_state=0 ) X_fit, X_pred = X[:n_train, :], X[n_train:, :] # reference (full) ref_pred = ( KernelPCA(n_components, eigen_solver="dense", random_state=0) .fit(X_fit) .transform(X_pred) ) # arpack a_pred = ( KernelPCA(n_components, eigen_solver="arpack", random_state=0) .fit(X_fit) .transform(X_pred) ) # check that the result is still correct despite the approx assert_array_almost_equal(np.abs(a_pred), np.abs(ref_pred)) # randomized r_pred = ( KernelPCA(n_components, eigen_solver="randomized", random_state=0) .fit(X_fit) .transform(X_pred) ) # check that the result is still correct despite the approximation assert_array_almost_equal(np.abs(r_pred), np.abs(ref_pred)) def test_kernel_pca_inverse_transform_reconstruction(): """Test if the reconstruction is a good approximation. Note that in general it is not possible to get an arbitrarily good reconstruction because of kernel centering that does not preserve all the information of the original data. 
""" X, *_ = make_blobs(n_samples=100, n_features=4, random_state=0) kpca = KernelPCA( n_components=20, kernel="rbf", fit_inverse_transform=True, alpha=1e-3 ) X_trans = kpca.fit_transform(X) X_reconst = kpca.inverse_transform(X_trans) assert np.linalg.norm(X - X_reconst) / np.linalg.norm(X) < 1e-1 def test_kernel_pca_raise_not_fitted_error(): X = np.random.randn(15).reshape(5, 3) kpca = KernelPCA() kpca.fit(X) with pytest.raises(NotFittedError): kpca.inverse_transform(X) def test_32_64_decomposition_shape(): """Test that the decomposition is similar for 32 and 64 bits data Non regression test for https://github.com/scikit-learn/scikit-learn/issues/18146 """ X, y = make_blobs( n_samples=30, centers=[[0, 0, 0], [1, 1, 1]], random_state=0, cluster_std=0.1 ) X = StandardScaler().fit_transform(X) X -= X.min() # Compare the shapes (corresponds to the number of non-zero eigenvalues) kpca = KernelPCA() assert kpca.fit_transform(X).shape == kpca.fit_transform(X.astype(np.float32)).shape def test_kernel_pca_feature_names_out(): """Check feature names out for KernelPCA.""" X, *_ = make_blobs(n_samples=100, n_features=4, random_state=0) kpca = KernelPCA(n_components=2).fit(X) names = kpca.get_feature_names_out() assert_array_equal([f"kernelpca{i}" for i in range(2)], names) def test_kernel_pca_inverse_correct_gamma(): """Check that gamma is set correctly when not provided. Non-regression test for #26280 """ rng = np.random.RandomState(0) X = rng.random_sample((5, 4)) kwargs = { "n_components": 2, "random_state": rng, "fit_inverse_transform": True, "kernel": "rbf", } expected_gamma = 1 / X.shape[1] kpca1 = KernelPCA(gamma=None, **kwargs).fit(X) kpca2 = KernelPCA(gamma=expected_gamma, **kwargs).fit(X) assert kpca1.gamma_ == expected_gamma assert kpca2.gamma_ == expected_gamma X1_recon = kpca1.inverse_transform(kpca1.transform(X)) X2_recon = kpca2.inverse_transform(kpca1.transform(X)) assert_allclose(X1_recon, X2_recon) def test_kernel_pca_pandas_output(): """Check that KernelPCA works with pandas output when the solver is arpack. Non-regression test for: https://github.com/scikit-learn/scikit-learn/issues/27579 """ pytest.importorskip("pandas") X, _ = load_iris(as_frame=True, return_X_y=True) with sklearn.config_context(transform_output="pandas"): KernelPCA(n_components=2, eigen_solver="arpack").fit_transform(X)