from itertools import product import pytest import re import numpy as np from scipy.sparse import ( bsr_matrix, coo_matrix, csc_matrix, csr_matrix, dok_matrix, lil_matrix, issparse, ) from sklearn import metrics from sklearn import neighbors, datasets from sklearn.base import clone from sklearn.exceptions import DataConversionWarning from sklearn.exceptions import EfficiencyWarning from sklearn.exceptions import NotFittedError from sklearn.metrics.pairwise import pairwise_distances from sklearn.model_selection import cross_val_score from sklearn.model_selection import train_test_split from sklearn.neighbors import VALID_METRICS_SPARSE, VALID_METRICS from sklearn.neighbors._base import _is_sorted_by_data, _check_precomputed from sklearn.pipeline import make_pipeline from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import ignore_warnings from sklearn.utils.validation import check_random_state from sklearn.utils.fixes import sp_version, parse_version import joblib rng = np.random.RandomState(0) # load and shuffle iris dataset iris = datasets.load_iris() perm = rng.permutation(iris.target.size) iris.data = iris.data[perm] iris.target = iris.target[perm] # load and shuffle digits digits = datasets.load_digits() perm = rng.permutation(digits.target.size) digits.data = digits.data[perm] digits.target = digits.target[perm] SPARSE_TYPES = (bsr_matrix, coo_matrix, csc_matrix, csr_matrix, dok_matrix, lil_matrix) SPARSE_OR_DENSE = SPARSE_TYPES + (np.asarray,) ALGORITHMS = ("ball_tree", "brute", "kd_tree", "auto") P = (1, 2, 3, 4, np.inf) JOBLIB_BACKENDS = list(joblib.parallel.BACKENDS.keys()) # Filter deprecation warnings. neighbors.kneighbors_graph = ignore_warnings(neighbors.kneighbors_graph) neighbors.radius_neighbors_graph = ignore_warnings(neighbors.radius_neighbors_graph) def _weight_func(dist): """Weight function to replace lambda d: d ** -2. The lambda function is not valid because: if d==0 then 0^-2 is not valid.""" # Dist could be multidimensional, flatten it so all values # can be looped with np.errstate(divide="ignore"): retval = 1.0 / dist return retval ** 2 def test_unsupervised_kneighbors( n_samples=20, n_features=5, n_query_pts=2, n_neighbors=5 ): # Test unsupervised neighbors methods X = rng.rand(n_samples, n_features) test = rng.rand(n_query_pts, n_features) for p in P: results_nodist = [] results = [] for algorithm in ALGORITHMS: neigh = neighbors.NearestNeighbors( n_neighbors=n_neighbors, algorithm=algorithm, p=p ) neigh.fit(X) results_nodist.append(neigh.kneighbors(test, return_distance=False)) results.append(neigh.kneighbors(test, return_distance=True)) for i in range(len(results) - 1): assert_array_almost_equal(results_nodist[i], results[i][1]) assert_array_almost_equal(results[i][0], results[i + 1][0]) assert_array_almost_equal(results[i][1], results[i + 1][1]) @pytest.mark.parametrize( "NearestNeighbors", [ neighbors.KNeighborsClassifier, neighbors.KNeighborsRegressor, neighbors.NearestNeighbors, ], ) def test_unsupervised_inputs(NearestNeighbors): # Test unsupervised inputs for neighbors estimators X = rng.random_sample((10, 3)) y = rng.randint(3, size=10) nbrs_fid = neighbors.NearestNeighbors(n_neighbors=1) nbrs_fid.fit(X) dist1, ind1 = nbrs_fid.kneighbors(X) nbrs = NearestNeighbors(n_neighbors=1) for data in (nbrs_fid, neighbors.BallTree(X), neighbors.KDTree(X)): nbrs.fit(data, y) dist2, ind2 = nbrs.kneighbors(X) assert_array_almost_equal(dist1, dist2) assert_array_almost_equal(ind1, ind2) def test_n_neighbors_datatype(): # Test to check whether n_neighbors is integer X = [[1, 1], [1, 1], [1, 1]] expected_msg = "n_neighbors does not take .*float.* value, enter integer value" msg = "Expected n_neighbors > 0. Got -3" neighbors_ = neighbors.NearestNeighbors(n_neighbors=3.0) with pytest.raises(TypeError, match=expected_msg): neighbors_.fit(X) with pytest.raises(ValueError, match=msg): neighbors_.kneighbors(X=X, n_neighbors=-3) with pytest.raises(TypeError, match=expected_msg): neighbors_.kneighbors(X=X, n_neighbors=3.0) def test_not_fitted_error_gets_raised(): X = [[1]] neighbors_ = neighbors.NearestNeighbors() with pytest.raises(NotFittedError): neighbors_.kneighbors_graph(X) with pytest.raises(NotFittedError): neighbors_.radius_neighbors_graph(X) @ignore_warnings(category=EfficiencyWarning) def check_precomputed(make_train_test, estimators): """Tests unsupervised NearestNeighbors with a distance matrix.""" # Note: smaller samples may result in spurious test success rng = np.random.RandomState(42) X = rng.random_sample((10, 4)) Y = rng.random_sample((3, 4)) DXX, DYX = make_train_test(X, Y) for method in [ "kneighbors", ]: # TODO: also test radius_neighbors, but requires different assertion # As a feature matrix (n_samples by n_features) nbrs_X = neighbors.NearestNeighbors(n_neighbors=3) nbrs_X.fit(X) dist_X, ind_X = getattr(nbrs_X, method)(Y) # As a dense distance matrix (n_samples by n_samples) nbrs_D = neighbors.NearestNeighbors( n_neighbors=3, algorithm="brute", metric="precomputed" ) nbrs_D.fit(DXX) dist_D, ind_D = getattr(nbrs_D, method)(DYX) assert_array_almost_equal(dist_X, dist_D) assert_array_almost_equal(ind_X, ind_D) # Check auto works too nbrs_D = neighbors.NearestNeighbors( n_neighbors=3, algorithm="auto", metric="precomputed" ) nbrs_D.fit(DXX) dist_D, ind_D = getattr(nbrs_D, method)(DYX) assert_array_almost_equal(dist_X, dist_D) assert_array_almost_equal(ind_X, ind_D) # Check X=None in prediction dist_X, ind_X = getattr(nbrs_X, method)(None) dist_D, ind_D = getattr(nbrs_D, method)(None) assert_array_almost_equal(dist_X, dist_D) assert_array_almost_equal(ind_X, ind_D) # Must raise a ValueError if the matrix is not of correct shape with pytest.raises(ValueError): getattr(nbrs_D, method)(X) target = np.arange(X.shape[0]) for Est in estimators: est = Est(metric="euclidean") est.radius = est.n_neighbors = 1 pred_X = est.fit(X, target).predict(Y) est.metric = "precomputed" pred_D = est.fit(DXX, target).predict(DYX) assert_array_almost_equal(pred_X, pred_D) def test_precomputed_dense(): def make_train_test(X_train, X_test): return ( metrics.pairwise_distances(X_train), metrics.pairwise_distances(X_test, X_train), ) estimators = [ neighbors.KNeighborsClassifier, neighbors.KNeighborsRegressor, neighbors.RadiusNeighborsClassifier, neighbors.RadiusNeighborsRegressor, ] check_precomputed(make_train_test, estimators) @pytest.mark.parametrize("fmt", ["csr", "lil"]) def test_precomputed_sparse_knn(fmt): def make_train_test(X_train, X_test): nn = neighbors.NearestNeighbors(n_neighbors=3 + 1).fit(X_train) return ( nn.kneighbors_graph(X_train, mode="distance").asformat(fmt), nn.kneighbors_graph(X_test, mode="distance").asformat(fmt), ) # We do not test RadiusNeighborsClassifier and RadiusNeighborsRegressor # since the precomputed neighbors graph is built with k neighbors only. estimators = [ neighbors.KNeighborsClassifier, neighbors.KNeighborsRegressor, ] check_precomputed(make_train_test, estimators) @pytest.mark.parametrize("fmt", ["csr", "lil"]) def test_precomputed_sparse_radius(fmt): def make_train_test(X_train, X_test): nn = neighbors.NearestNeighbors(radius=1).fit(X_train) return ( nn.radius_neighbors_graph(X_train, mode="distance").asformat(fmt), nn.radius_neighbors_graph(X_test, mode="distance").asformat(fmt), ) # We do not test KNeighborsClassifier and KNeighborsRegressor # since the precomputed neighbors graph is built with a radius. estimators = [ neighbors.RadiusNeighborsClassifier, neighbors.RadiusNeighborsRegressor, ] check_precomputed(make_train_test, estimators) def test_is_sorted_by_data(): # Test that _is_sorted_by_data works as expected. In CSR sparse matrix, # entries in each row can be sorted by indices, by data, or unsorted. # _is_sorted_by_data should return True when entries are sorted by data, # and False in all other cases. # Test with sorted 1D array X = csr_matrix(np.arange(10)) assert _is_sorted_by_data(X) # Test with unsorted 1D array X[0, 2] = 5 assert not _is_sorted_by_data(X) # Test when the data is sorted in each sample, but not necessarily # between samples X = csr_matrix([[0, 1, 2], [3, 0, 0], [3, 4, 0], [1, 0, 2]]) assert _is_sorted_by_data(X) # Test with duplicates entries in X.indptr data, indices, indptr = [0, 4, 2, 2], [0, 1, 1, 1], [0, 2, 2, 4] X = csr_matrix((data, indices, indptr), shape=(3, 3)) assert _is_sorted_by_data(X) @ignore_warnings(category=EfficiencyWarning) def test_check_precomputed(): # Test that _check_precomputed returns a graph sorted by data X = csr_matrix(np.abs(np.random.RandomState(42).randn(10, 10))) assert not _is_sorted_by_data(X) Xt = _check_precomputed(X) assert _is_sorted_by_data(Xt) # est with a different number of nonzero entries for each sample mask = np.random.RandomState(42).randint(2, size=(10, 10)) X = X.toarray() X[mask == 1] = 0 X = csr_matrix(X) assert not _is_sorted_by_data(X) Xt = _check_precomputed(X) assert _is_sorted_by_data(Xt) @ignore_warnings(category=EfficiencyWarning) def test_precomputed_sparse_invalid(): dist = np.array([[0.0, 2.0, 1.0], [2.0, 0.0, 3.0], [1.0, 3.0, 0.0]]) dist_csr = csr_matrix(dist) neigh = neighbors.NearestNeighbors(n_neighbors=1, metric="precomputed") neigh.fit(dist_csr) neigh.kneighbors(None, n_neighbors=1) neigh.kneighbors(np.array([[0.0, 0.0, 0.0]]), n_neighbors=2) # Ensures enough number of nearest neighbors dist = np.array([[0.0, 2.0, 0.0], [2.0, 0.0, 3.0], [0.0, 3.0, 0.0]]) dist_csr = csr_matrix(dist) neigh.fit(dist_csr) msg = "2 neighbors per samples are required, but some samples have only 1" with pytest.raises(ValueError, match=msg): neigh.kneighbors(None, n_neighbors=1) # Checks error with inconsistent distance matrix dist = np.array([[5.0, 2.0, 1.0], [-2.0, 0.0, 3.0], [1.0, 3.0, 0.0]]) dist_csr = csr_matrix(dist) msg = "Negative values in data passed to precomputed distance matrix." with pytest.raises(ValueError, match=msg): neigh.kneighbors(dist_csr, n_neighbors=1) def test_precomputed_cross_validation(): # Ensure array is split correctly rng = np.random.RandomState(0) X = rng.rand(20, 2) D = pairwise_distances(X, metric="euclidean") y = rng.randint(3, size=20) for Est in ( neighbors.KNeighborsClassifier, neighbors.RadiusNeighborsClassifier, neighbors.KNeighborsRegressor, neighbors.RadiusNeighborsRegressor, ): metric_score = cross_val_score(Est(), X, y) precomp_score = cross_val_score(Est(metric="precomputed"), D, y) assert_array_equal(metric_score, precomp_score) def test_unsupervised_radius_neighbors( n_samples=20, n_features=5, n_query_pts=2, radius=0.5, random_state=0 ): # Test unsupervised radius-based query rng = np.random.RandomState(random_state) X = rng.rand(n_samples, n_features) test = rng.rand(n_query_pts, n_features) for p in P: results = [] for algorithm in ALGORITHMS: neigh = neighbors.NearestNeighbors(radius=radius, algorithm=algorithm, p=p) neigh.fit(X) ind1 = neigh.radius_neighbors(test, return_distance=False) # sort the results: this is not done automatically for # radius searches dist, ind = neigh.radius_neighbors(test, return_distance=True) for (d, i, i1) in zip(dist, ind, ind1): j = d.argsort() d[:] = d[j] i[:] = i[j] i1[:] = i1[j] results.append((dist, ind)) assert_array_almost_equal( np.concatenate(list(ind)), np.concatenate(list(ind1)) ) for i in range(len(results) - 1): assert_array_almost_equal( np.concatenate(list(results[i][0])), np.concatenate(list(results[i + 1][0])), ), assert_array_almost_equal( np.concatenate(list(results[i][1])), np.concatenate(list(results[i + 1][1])), ) def test_kneighbors_classifier( n_samples=40, n_features=5, n_test_pts=10, n_neighbors=5, random_state=0 ): # Test k-neighbors classification rng = np.random.RandomState(random_state) X = 2 * rng.rand(n_samples, n_features) - 1 y = ((X ** 2).sum(axis=1) < 0.5).astype(int) y_str = y.astype(str) weight_func = _weight_func for algorithm in ALGORITHMS: for weights in ["uniform", "distance", weight_func]: knn = neighbors.KNeighborsClassifier( n_neighbors=n_neighbors, weights=weights, algorithm=algorithm ) knn.fit(X, y) epsilon = 1e-5 * (2 * rng.rand(1, n_features) - 1) y_pred = knn.predict(X[:n_test_pts] + epsilon) assert_array_equal(y_pred, y[:n_test_pts]) # Test prediction with y_str knn.fit(X, y_str) y_pred = knn.predict(X[:n_test_pts] + epsilon) assert_array_equal(y_pred, y_str[:n_test_pts]) def test_kneighbors_classifier_float_labels( n_samples=40, n_features=5, n_test_pts=10, n_neighbors=5, random_state=0 ): # Test k-neighbors classification rng = np.random.RandomState(random_state) X = 2 * rng.rand(n_samples, n_features) - 1 y = ((X ** 2).sum(axis=1) < 0.5).astype(int) knn = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors) knn.fit(X, y.astype(float)) epsilon = 1e-5 * (2 * rng.rand(1, n_features) - 1) y_pred = knn.predict(X[:n_test_pts] + epsilon) assert_array_equal(y_pred, y[:n_test_pts]) def test_kneighbors_classifier_predict_proba(): # Test KNeighborsClassifier.predict_proba() method X = np.array([[0, 2, 0], [0, 2, 1], [2, 0, 0], [2, 2, 0], [0, 0, 2], [0, 0, 1]]) y = np.array([4, 4, 5, 5, 1, 1]) cls = neighbors.KNeighborsClassifier(n_neighbors=3, p=1) # cityblock dist cls.fit(X, y) y_prob = cls.predict_proba(X) real_prob = np.array( [ [0, 2.0 / 3, 1.0 / 3], [1.0 / 3, 2.0 / 3, 0], [1.0 / 3, 0, 2.0 / 3], [0, 1.0 / 3, 2.0 / 3], [2.0 / 3, 1.0 / 3, 0], [2.0 / 3, 1.0 / 3, 0], ] ) assert_array_equal(real_prob, y_prob) # Check that it also works with non integer labels cls.fit(X, y.astype(str)) y_prob = cls.predict_proba(X) assert_array_equal(real_prob, y_prob) # Check that it works with weights='distance' cls = neighbors.KNeighborsClassifier(n_neighbors=2, p=1, weights="distance") cls.fit(X, y) y_prob = cls.predict_proba(np.array([[0, 2, 0], [2, 2, 2]])) real_prob = np.array([[0, 1, 0], [0, 0.4, 0.6]]) assert_array_almost_equal(real_prob, y_prob) def test_radius_neighbors_classifier( n_samples=40, n_features=5, n_test_pts=10, radius=0.5, random_state=0 ): # Test radius-based classification rng = np.random.RandomState(random_state) X = 2 * rng.rand(n_samples, n_features) - 1 y = ((X ** 2).sum(axis=1) < 0.5).astype(int) y_str = y.astype(str) weight_func = _weight_func for algorithm in ALGORITHMS: for weights in ["uniform", "distance", weight_func]: neigh = neighbors.RadiusNeighborsClassifier( radius=radius, weights=weights, algorithm=algorithm ) neigh.fit(X, y) epsilon = 1e-5 * (2 * rng.rand(1, n_features) - 1) y_pred = neigh.predict(X[:n_test_pts] + epsilon) assert_array_equal(y_pred, y[:n_test_pts]) neigh.fit(X, y_str) y_pred = neigh.predict(X[:n_test_pts] + epsilon) assert_array_equal(y_pred, y_str[:n_test_pts]) # TODO: Remove in v1.2 def test_radius_neighbors_classifier_kwargs_is_deprecated(): extra_kwargs = { "unused_param": "", "extra_param": None, } msg = ( "Passing additional keyword parameters has no effect and is deprecated " "in 1.0. An error will be raised from 1.2 and beyond. The ignored " f"keyword parameter(s) are: {extra_kwargs.keys()}." ) with pytest.warns(FutureWarning, match=re.escape(msg)): neighbors.RadiusNeighborsClassifier(**extra_kwargs) def test_radius_neighbors_classifier_when_no_neighbors(): # Test radius-based classifier when no neighbors found. # In this case it should rise an informative exception X = np.array([[1.0, 1.0], [2.0, 2.0]]) y = np.array([1, 2]) radius = 0.1 z1 = np.array([[1.01, 1.01], [2.01, 2.01]]) # no outliers z2 = np.array([[1.01, 1.01], [1.4, 1.4]]) # one outlier weight_func = _weight_func for outlier_label in [0, -1, None]: for algorithm in ALGORITHMS: for weights in ["uniform", "distance", weight_func]: rnc = neighbors.RadiusNeighborsClassifier clf = rnc( radius=radius, weights=weights, algorithm=algorithm, outlier_label=outlier_label, ) clf.fit(X, y) assert_array_equal(np.array([1, 2]), clf.predict(z1)) if outlier_label is None: with pytest.raises(ValueError): clf.predict(z2) def test_radius_neighbors_classifier_outlier_labeling(): # Test radius-based classifier when no neighbors found and outliers # are labeled. X = np.array([[1.0, 1.0], [2.0, 2.0], [0.99, 0.99], [0.98, 0.98], [2.01, 2.01]]) y = np.array([1, 2, 1, 1, 2]) radius = 0.1 z1 = np.array([[1.01, 1.01], [2.01, 2.01]]) # no outliers z2 = np.array([[1.4, 1.4], [1.01, 1.01], [2.01, 2.01]]) # one outlier correct_labels1 = np.array([1, 2]) correct_labels2 = np.array([-1, 1, 2]) outlier_proba = np.array([0, 0]) weight_func = _weight_func for algorithm in ALGORITHMS: for weights in ["uniform", "distance", weight_func]: clf = neighbors.RadiusNeighborsClassifier( radius=radius, weights=weights, algorithm=algorithm, outlier_label=-1 ) clf.fit(X, y) assert_array_equal(correct_labels1, clf.predict(z1)) assert_array_equal(correct_labels2, clf.predict(z2)) assert_array_equal(outlier_proba, clf.predict_proba(z2)[0]) # test outlier_labeling of using predict_proba() RNC = neighbors.RadiusNeighborsClassifier X = np.array([[0], [1], [2], [3], [4], [5], [6], [7], [8], [9]]) y = np.array([0, 2, 2, 1, 1, 1, 3, 3, 3, 3]) # test outlier_label scalar verification def check_array_exception(): clf = RNC(radius=1, outlier_label=[[5]]) clf.fit(X, y) with pytest.raises(TypeError): check_array_exception() # test invalid outlier_label dtype def check_dtype_exception(): clf = RNC(radius=1, outlier_label="a") clf.fit(X, y) with pytest.raises(TypeError): check_dtype_exception() # test most frequent clf = RNC(radius=1, outlier_label="most_frequent") clf.fit(X, y) proba = clf.predict_proba([[1], [15]]) assert_array_equal(proba[1, :], [0, 0, 0, 1]) # test manual label in y clf = RNC(radius=1, outlier_label=1) clf.fit(X, y) proba = clf.predict_proba([[1], [15]]) assert_array_equal(proba[1, :], [0, 1, 0, 0]) pred = clf.predict([[1], [15]]) assert_array_equal(pred, [2, 1]) # test manual label out of y warning def check_warning(): clf = RNC(radius=1, outlier_label=4) clf.fit(X, y) clf.predict_proba([[1], [15]]) with pytest.warns(UserWarning): check_warning() # test multi output same outlier label y_multi = [ [0, 1], [2, 1], [2, 2], [1, 2], [1, 2], [1, 3], [3, 3], [3, 3], [3, 0], [3, 0], ] clf = RNC(radius=1, outlier_label=1) clf.fit(X, y_multi) proba = clf.predict_proba([[7], [15]]) assert_array_equal(proba[1][1, :], [0, 1, 0, 0]) pred = clf.predict([[7], [15]]) assert_array_equal(pred[1, :], [1, 1]) # test multi output different outlier label y_multi = [ [0, 0], [2, 2], [2, 2], [1, 1], [1, 1], [1, 1], [3, 3], [3, 3], [3, 3], [3, 3], ] clf = RNC(radius=1, outlier_label=[0, 1]) clf.fit(X, y_multi) proba = clf.predict_proba([[7], [15]]) assert_array_equal(proba[0][1, :], [1, 0, 0, 0]) assert_array_equal(proba[1][1, :], [0, 1, 0, 0]) pred = clf.predict([[7], [15]]) assert_array_equal(pred[1, :], [0, 1]) # test inconsistent outlier label list length def check_exception(): clf = RNC(radius=1, outlier_label=[0, 1, 2]) clf.fit(X, y_multi) with pytest.raises(ValueError): check_exception() def test_radius_neighbors_classifier_zero_distance(): # Test radius-based classifier, when distance to a sample is zero. X = np.array([[1.0, 1.0], [2.0, 2.0]]) y = np.array([1, 2]) radius = 0.1 z1 = np.array([[1.01, 1.01], [2.0, 2.0]]) correct_labels1 = np.array([1, 2]) weight_func = _weight_func for algorithm in ALGORITHMS: for weights in ["uniform", "distance", weight_func]: clf = neighbors.RadiusNeighborsClassifier( radius=radius, weights=weights, algorithm=algorithm ) clf.fit(X, y) with np.errstate(invalid="ignore"): # Ignore the warning raised in _weight_func when making # predictions with null distances resulting in np.inf values. assert_array_equal(correct_labels1, clf.predict(z1)) def test_neighbors_regressors_zero_distance(): # Test radius-based regressor, when distance to a sample is zero. X = np.array([[1.0, 1.0], [1.0, 1.0], [2.0, 2.0], [2.5, 2.5]]) y = np.array([1.0, 1.5, 2.0, 0.0]) radius = 0.2 z = np.array([[1.1, 1.1], [2.0, 2.0]]) rnn_correct_labels = np.array([1.25, 2.0]) knn_correct_unif = np.array([1.25, 1.0]) knn_correct_dist = np.array([1.25, 2.0]) for algorithm in ALGORITHMS: # we don't test for weights=_weight_func since user will be expected # to handle zero distances themselves in the function. for weights in ["uniform", "distance"]: rnn = neighbors.RadiusNeighborsRegressor( radius=radius, weights=weights, algorithm=algorithm ) rnn.fit(X, y) assert_array_almost_equal(rnn_correct_labels, rnn.predict(z)) for weights, corr_labels in zip( ["uniform", "distance"], [knn_correct_unif, knn_correct_dist] ): knn = neighbors.KNeighborsRegressor( n_neighbors=2, weights=weights, algorithm=algorithm ) knn.fit(X, y) assert_array_almost_equal(corr_labels, knn.predict(z)) def test_radius_neighbors_boundary_handling(): """Test whether points lying on boundary are handled consistently Also ensures that even with only one query point, an object array is returned rather than a 2d array. """ X = np.array([[1.5], [3.0], [3.01]]) radius = 3.0 for algorithm in ALGORITHMS: nbrs = neighbors.NearestNeighbors(radius=radius, algorithm=algorithm).fit(X) results = nbrs.radius_neighbors([[0.0]], return_distance=False) assert results.shape == (1,) assert results.dtype == object assert_array_equal(results[0], [0, 1]) def test_radius_neighbors_returns_array_of_objects(): # check that we can pass precomputed distances to # NearestNeighbors.radius_neighbors() # non-regression test for # https://github.com/scikit-learn/scikit-learn/issues/16036 X = csr_matrix(np.ones((4, 4))) X.setdiag([0, 0, 0, 0]) nbrs = neighbors.NearestNeighbors( radius=0.5, algorithm="auto", leaf_size=30, metric="precomputed" ).fit(X) neigh_dist, neigh_ind = nbrs.radius_neighbors(X, return_distance=True) expected_dist = np.empty(X.shape[0], dtype=object) expected_dist[:] = [np.array([0]), np.array([0]), np.array([0]), np.array([0])] expected_ind = np.empty(X.shape[0], dtype=object) expected_ind[:] = [np.array([0]), np.array([1]), np.array([2]), np.array([3])] assert_array_equal(neigh_dist, expected_dist) assert_array_equal(neigh_ind, expected_ind) @pytest.mark.parametrize("algorithm", ["ball_tree", "kd_tree", "brute"]) def test_query_equidistant_kth_nn(algorithm): # For several candidates for the k-th nearest neighbor position, # the first candidate should be chosen query_point = np.array([[0, 0]]) equidistant_points = np.array([[1, 0], [0, 1], [-1, 0], [0, -1]]) # The 3rd and 4th points should not replace the 2nd point # for the 2th nearest neighbor position k = 2 knn_indices = np.array([[0, 1]]) nn = neighbors.NearestNeighbors(algorithm=algorithm).fit(equidistant_points) indices = np.sort(nn.kneighbors(query_point, n_neighbors=k, return_distance=False)) assert_array_equal(indices, knn_indices) @pytest.mark.parametrize( ["algorithm", "metric"], [ ("ball_tree", "euclidean"), ("kd_tree", "euclidean"), ("brute", "euclidean"), ("brute", "precomputed"), ], ) def test_radius_neighbors_sort_results(algorithm, metric): # Test radius_neighbors[_graph] output when sort_result is True n_samples = 10 rng = np.random.RandomState(42) X = rng.random_sample((n_samples, 4)) if metric == "precomputed": X = neighbors.radius_neighbors_graph(X, radius=np.inf, mode="distance") model = neighbors.NearestNeighbors(algorithm=algorithm, metric=metric) model.fit(X) # self.radius_neighbors distances, indices = model.radius_neighbors(X=X, radius=np.inf, sort_results=True) for ii in range(n_samples): assert_array_equal(distances[ii], np.sort(distances[ii])) # sort_results=True and return_distance=False if metric != "precomputed": # no need to raise with precomputed graph with pytest.raises(ValueError, match="return_distance must be True"): model.radius_neighbors( X=X, radius=np.inf, sort_results=True, return_distance=False ) # self.radius_neighbors_graph graph = model.radius_neighbors_graph( X=X, radius=np.inf, mode="distance", sort_results=True ) assert _is_sorted_by_data(graph) def test_RadiusNeighborsClassifier_multioutput(): # Test k-NN classifier on multioutput data rng = check_random_state(0) n_features = 2 n_samples = 40 n_output = 3 X = rng.rand(n_samples, n_features) y = rng.randint(0, 3, (n_samples, n_output)) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) weights = [None, "uniform", "distance", _weight_func] for algorithm, weights in product(ALGORITHMS, weights): # Stack single output prediction y_pred_so = [] for o in range(n_output): rnn = neighbors.RadiusNeighborsClassifier( weights=weights, algorithm=algorithm ) rnn.fit(X_train, y_train[:, o]) y_pred_so.append(rnn.predict(X_test)) y_pred_so = np.vstack(y_pred_so).T assert y_pred_so.shape == y_test.shape # Multioutput prediction rnn_mo = neighbors.RadiusNeighborsClassifier( weights=weights, algorithm=algorithm ) rnn_mo.fit(X_train, y_train) y_pred_mo = rnn_mo.predict(X_test) assert y_pred_mo.shape == y_test.shape assert_array_almost_equal(y_pred_mo, y_pred_so) def test_kneighbors_classifier_sparse( n_samples=40, n_features=5, n_test_pts=10, n_neighbors=5, random_state=0 ): # Test k-NN classifier on sparse matrices # Like the above, but with various types of sparse matrices rng = np.random.RandomState(random_state) X = 2 * rng.rand(n_samples, n_features) - 1 X *= X > 0.2 y = ((X ** 2).sum(axis=1) < 0.5).astype(int) for sparsemat in SPARSE_TYPES: knn = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors, algorithm="auto") knn.fit(sparsemat(X), y) epsilon = 1e-5 * (2 * rng.rand(1, n_features) - 1) for sparsev in SPARSE_TYPES + (np.asarray,): X_eps = sparsev(X[:n_test_pts] + epsilon) y_pred = knn.predict(X_eps) assert_array_equal(y_pred, y[:n_test_pts]) def test_KNeighborsClassifier_multioutput(): # Test k-NN classifier on multioutput data rng = check_random_state(0) n_features = 5 n_samples = 50 n_output = 3 X = rng.rand(n_samples, n_features) y = rng.randint(0, 3, (n_samples, n_output)) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) weights = [None, "uniform", "distance", _weight_func] for algorithm, weights in product(ALGORITHMS, weights): # Stack single output prediction y_pred_so = [] y_pred_proba_so = [] for o in range(n_output): knn = neighbors.KNeighborsClassifier(weights=weights, algorithm=algorithm) knn.fit(X_train, y_train[:, o]) y_pred_so.append(knn.predict(X_test)) y_pred_proba_so.append(knn.predict_proba(X_test)) y_pred_so = np.vstack(y_pred_so).T assert y_pred_so.shape == y_test.shape assert len(y_pred_proba_so) == n_output # Multioutput prediction knn_mo = neighbors.KNeighborsClassifier(weights=weights, algorithm=algorithm) knn_mo.fit(X_train, y_train) y_pred_mo = knn_mo.predict(X_test) assert y_pred_mo.shape == y_test.shape assert_array_almost_equal(y_pred_mo, y_pred_so) # Check proba y_pred_proba_mo = knn_mo.predict_proba(X_test) assert len(y_pred_proba_mo) == n_output for proba_mo, proba_so in zip(y_pred_proba_mo, y_pred_proba_so): assert_array_almost_equal(proba_mo, proba_so) def test_kneighbors_regressor( n_samples=40, n_features=5, n_test_pts=10, n_neighbors=3, random_state=0 ): # Test k-neighbors regression rng = np.random.RandomState(random_state) X = 2 * rng.rand(n_samples, n_features) - 1 y = np.sqrt((X ** 2).sum(1)) y /= y.max() y_target = y[:n_test_pts] weight_func = _weight_func for algorithm in ALGORITHMS: for weights in ["uniform", "distance", weight_func]: knn = neighbors.KNeighborsRegressor( n_neighbors=n_neighbors, weights=weights, algorithm=algorithm ) knn.fit(X, y) epsilon = 1e-5 * (2 * rng.rand(1, n_features) - 1) y_pred = knn.predict(X[:n_test_pts] + epsilon) assert np.all(abs(y_pred - y_target) < 0.3) def test_KNeighborsRegressor_multioutput_uniform_weight(): # Test k-neighbors in multi-output regression with uniform weight rng = check_random_state(0) n_features = 5 n_samples = 40 n_output = 4 X = rng.rand(n_samples, n_features) y = rng.rand(n_samples, n_output) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) for algorithm, weights in product(ALGORITHMS, [None, "uniform"]): knn = neighbors.KNeighborsRegressor(weights=weights, algorithm=algorithm) knn.fit(X_train, y_train) neigh_idx = knn.kneighbors(X_test, return_distance=False) y_pred_idx = np.array([np.mean(y_train[idx], axis=0) for idx in neigh_idx]) y_pred = knn.predict(X_test) assert y_pred.shape == y_test.shape assert y_pred_idx.shape == y_test.shape assert_array_almost_equal(y_pred, y_pred_idx) def test_kneighbors_regressor_multioutput( n_samples=40, n_features=5, n_test_pts=10, n_neighbors=3, random_state=0 ): # Test k-neighbors in multi-output regression rng = np.random.RandomState(random_state) X = 2 * rng.rand(n_samples, n_features) - 1 y = np.sqrt((X ** 2).sum(1)) y /= y.max() y = np.vstack([y, y]).T y_target = y[:n_test_pts] weights = ["uniform", "distance", _weight_func] for algorithm, weights in product(ALGORITHMS, weights): knn = neighbors.KNeighborsRegressor( n_neighbors=n_neighbors, weights=weights, algorithm=algorithm ) knn.fit(X, y) epsilon = 1e-5 * (2 * rng.rand(1, n_features) - 1) y_pred = knn.predict(X[:n_test_pts] + epsilon) assert y_pred.shape == y_target.shape assert np.all(np.abs(y_pred - y_target) < 0.3) def test_radius_neighbors_regressor( n_samples=40, n_features=3, n_test_pts=10, radius=0.5, random_state=0 ): # Test radius-based neighbors regression rng = np.random.RandomState(random_state) X = 2 * rng.rand(n_samples, n_features) - 1 y = np.sqrt((X ** 2).sum(1)) y /= y.max() y_target = y[:n_test_pts] weight_func = _weight_func for algorithm in ALGORITHMS: for weights in ["uniform", "distance", weight_func]: neigh = neighbors.RadiusNeighborsRegressor( radius=radius, weights=weights, algorithm=algorithm ) neigh.fit(X, y) epsilon = 1e-5 * (2 * rng.rand(1, n_features) - 1) y_pred = neigh.predict(X[:n_test_pts] + epsilon) assert np.all(abs(y_pred - y_target) < radius / 2) # test that nan is returned when no nearby observations for weights in ["uniform", "distance"]: neigh = neighbors.RadiusNeighborsRegressor( radius=radius, weights=weights, algorithm="auto" ) neigh.fit(X, y) X_test_nan = np.full((1, n_features), -1.0) empty_warning_msg = ( "One or more samples have no neighbors " "within specified radius; predicting NaN." ) with pytest.warns(UserWarning, match=re.escape(empty_warning_msg)): pred = neigh.predict(X_test_nan) assert np.all(np.isnan(pred)) def test_RadiusNeighborsRegressor_multioutput_with_uniform_weight(): # Test radius neighbors in multi-output regression (uniform weight) rng = check_random_state(0) n_features = 5 n_samples = 40 n_output = 4 X = rng.rand(n_samples, n_features) y = rng.rand(n_samples, n_output) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) for algorithm, weights in product(ALGORITHMS, [None, "uniform"]): rnn = neighbors.RadiusNeighborsRegressor(weights=weights, algorithm=algorithm) rnn.fit(X_train, y_train) neigh_idx = rnn.radius_neighbors(X_test, return_distance=False) y_pred_idx = np.array([np.mean(y_train[idx], axis=0) for idx in neigh_idx]) y_pred_idx = np.array(y_pred_idx) y_pred = rnn.predict(X_test) assert y_pred_idx.shape == y_test.shape assert y_pred.shape == y_test.shape assert_array_almost_equal(y_pred, y_pred_idx) def test_RadiusNeighborsRegressor_multioutput( n_samples=40, n_features=5, n_test_pts=10, random_state=0 ): # Test k-neighbors in multi-output regression with various weight rng = np.random.RandomState(random_state) X = 2 * rng.rand(n_samples, n_features) - 1 y = np.sqrt((X ** 2).sum(1)) y /= y.max() y = np.vstack([y, y]).T y_target = y[:n_test_pts] weights = ["uniform", "distance", _weight_func] for algorithm, weights in product(ALGORITHMS, weights): rnn = neighbors.RadiusNeighborsRegressor(weights=weights, algorithm=algorithm) rnn.fit(X, y) epsilon = 1e-5 * (2 * rng.rand(1, n_features) - 1) y_pred = rnn.predict(X[:n_test_pts] + epsilon) assert y_pred.shape == y_target.shape assert np.all(np.abs(y_pred - y_target) < 0.3) @ignore_warnings(category=EfficiencyWarning) def test_kneighbors_regressor_sparse( n_samples=40, n_features=5, n_test_pts=10, n_neighbors=5, random_state=0 ): # Test radius-based regression on sparse matrices # Like the above, but with various types of sparse matrices rng = np.random.RandomState(random_state) X = 2 * rng.rand(n_samples, n_features) - 1 y = ((X ** 2).sum(axis=1) < 0.25).astype(int) for sparsemat in SPARSE_TYPES: knn = neighbors.KNeighborsRegressor(n_neighbors=n_neighbors, algorithm="auto") knn.fit(sparsemat(X), y) knn_pre = neighbors.KNeighborsRegressor( n_neighbors=n_neighbors, metric="precomputed" ) knn_pre.fit(pairwise_distances(X, metric="euclidean"), y) for sparsev in SPARSE_OR_DENSE: X2 = sparsev(X) assert np.mean(knn.predict(X2).round() == y) > 0.95 X2_pre = sparsev(pairwise_distances(X, metric="euclidean")) if sparsev in {dok_matrix, bsr_matrix}: msg = "not supported due to its handling of explicit zeros" with pytest.raises(TypeError, match=msg): knn_pre.predict(X2_pre) else: assert np.mean(knn_pre.predict(X2_pre).round() == y) > 0.95 def test_neighbors_iris(): # Sanity checks on the iris dataset # Puts three points of each label in the plane and performs a # nearest neighbor query on points near the decision boundary. for algorithm in ALGORITHMS: clf = neighbors.KNeighborsClassifier(n_neighbors=1, algorithm=algorithm) clf.fit(iris.data, iris.target) assert_array_equal(clf.predict(iris.data), iris.target) clf.set_params(n_neighbors=9, algorithm=algorithm) clf.fit(iris.data, iris.target) assert np.mean(clf.predict(iris.data) == iris.target) > 0.95 rgs = neighbors.KNeighborsRegressor(n_neighbors=5, algorithm=algorithm) rgs.fit(iris.data, iris.target) assert np.mean(rgs.predict(iris.data).round() == iris.target) > 0.95 def test_neighbors_digits(): # Sanity check on the digits dataset # the 'brute' algorithm has been observed to fail if the input # dtype is uint8 due to overflow in distance calculations. X = digits.data.astype("uint8") Y = digits.target (n_samples, n_features) = X.shape train_test_boundary = int(n_samples * 0.8) train = np.arange(0, train_test_boundary) test = np.arange(train_test_boundary, n_samples) (X_train, Y_train, X_test, Y_test) = X[train], Y[train], X[test], Y[test] clf = neighbors.KNeighborsClassifier(n_neighbors=1, algorithm="brute") score_uint8 = clf.fit(X_train, Y_train).score(X_test, Y_test) score_float = clf.fit(X_train.astype(float, copy=False), Y_train).score( X_test.astype(float, copy=False), Y_test ) assert score_uint8 == score_float def test_kneighbors_graph(): # Test kneighbors_graph to build the k-Nearest Neighbor graph. X = np.array([[0, 1], [1.01, 1.0], [2, 0]]) # n_neighbors = 1 A = neighbors.kneighbors_graph(X, 1, mode="connectivity", include_self=True) assert_array_equal(A.toarray(), np.eye(A.shape[0])) A = neighbors.kneighbors_graph(X, 1, mode="distance") assert_array_almost_equal( A.toarray(), [[0.00, 1.01, 0.0], [1.01, 0.0, 0.0], [0.00, 1.40716026, 0.0]] ) # n_neighbors = 2 A = neighbors.kneighbors_graph(X, 2, mode="connectivity", include_self=True) assert_array_equal(A.toarray(), [[1.0, 1.0, 0.0], [1.0, 1.0, 0.0], [0.0, 1.0, 1.0]]) A = neighbors.kneighbors_graph(X, 2, mode="distance") assert_array_almost_equal( A.toarray(), [ [0.0, 1.01, 2.23606798], [1.01, 0.0, 1.40716026], [2.23606798, 1.40716026, 0.0], ], ) # n_neighbors = 3 A = neighbors.kneighbors_graph(X, 3, mode="connectivity", include_self=True) assert_array_almost_equal(A.toarray(), [[1, 1, 1], [1, 1, 1], [1, 1, 1]]) def test_kneighbors_graph_sparse(seed=36): # Test kneighbors_graph to build the k-Nearest Neighbor graph # for sparse input. rng = np.random.RandomState(seed) X = rng.randn(10, 10) Xcsr = csr_matrix(X) for n_neighbors in [1, 2, 3]: for mode in ["connectivity", "distance"]: assert_array_almost_equal( neighbors.kneighbors_graph(X, n_neighbors, mode=mode).toarray(), neighbors.kneighbors_graph(Xcsr, n_neighbors, mode=mode).toarray(), ) def test_radius_neighbors_graph(): # Test radius_neighbors_graph to build the Nearest Neighbor graph. X = np.array([[0, 1], [1.01, 1.0], [2, 0]]) A = neighbors.radius_neighbors_graph(X, 1.5, mode="connectivity", include_self=True) assert_array_equal(A.toarray(), [[1.0, 1.0, 0.0], [1.0, 1.0, 1.0], [0.0, 1.0, 1.0]]) A = neighbors.radius_neighbors_graph(X, 1.5, mode="distance") assert_array_almost_equal( A.toarray(), [[0.0, 1.01, 0.0], [1.01, 0.0, 1.40716026], [0.0, 1.40716026, 0.0]] ) def test_radius_neighbors_graph_sparse(seed=36): # Test radius_neighbors_graph to build the Nearest Neighbor graph # for sparse input. rng = np.random.RandomState(seed) X = rng.randn(10, 10) Xcsr = csr_matrix(X) for n_neighbors in [1, 2, 3]: for mode in ["connectivity", "distance"]: assert_array_almost_equal( neighbors.radius_neighbors_graph(X, n_neighbors, mode=mode).toarray(), neighbors.radius_neighbors_graph( Xcsr, n_neighbors, mode=mode ).toarray(), ) def test_neighbors_badargs(): # Test bad argument values: these should all raise ValueErrors X = rng.random_sample((10, 2)) Xsparse = csr_matrix(X) X3 = rng.random_sample((10, 3)) y = np.ones(10) est = neighbors.NearestNeighbors(algorithm="blah") with pytest.raises(ValueError): est.fit(X) for cls in ( neighbors.KNeighborsClassifier, neighbors.RadiusNeighborsClassifier, neighbors.KNeighborsRegressor, neighbors.RadiusNeighborsRegressor, ): est = cls(weights="blah") with pytest.raises(ValueError): est.fit(X, y) est = cls(p=-1) with pytest.raises(ValueError): est.fit(X, y) est = cls(algorithm="blah") with pytest.raises(ValueError): est.fit(X, y) nbrs = cls(algorithm="ball_tree", metric="haversine") with pytest.raises(ValueError): nbrs.predict(X) with pytest.raises(ValueError): ignore_warnings(nbrs.fit(Xsparse, y)) nbrs = cls(metric="haversine", algorithm="brute") nbrs.fit(X3, y) msg = "Haversine distance only valid in 2 dimensions" with pytest.raises(ValueError, match=msg): nbrs.predict(X3) nbrs = cls() with pytest.raises(ValueError): nbrs.fit(np.ones((0, 2)), np.ones(0)) with pytest.raises(ValueError): nbrs.fit(X[:, :, None], y) nbrs.fit(X, y) with pytest.raises(ValueError): nbrs.predict([[]]) if issubclass(cls, neighbors.KNeighborsClassifier) or issubclass( cls, neighbors.KNeighborsRegressor ): nbrs = cls(n_neighbors=-1) with pytest.raises(ValueError): nbrs.fit(X, y) nbrs = neighbors.NearestNeighbors().fit(X) with pytest.raises(ValueError): nbrs.kneighbors_graph(X, mode="blah") with pytest.raises(ValueError): nbrs.radius_neighbors_graph(X, mode="blah") def test_neighbors_metrics(n_samples=20, n_features=3, n_query_pts=2, n_neighbors=5): # Test computing the neighbors for various metrics # create a symmetric matrix V = rng.rand(n_features, n_features) VI = np.dot(V, V.T) metrics = [ ("euclidean", {}), ("manhattan", {}), ("minkowski", dict(p=1)), ("minkowski", dict(p=2)), ("minkowski", dict(p=3)), ("minkowski", dict(p=np.inf)), ("chebyshev", {}), ("seuclidean", dict(V=rng.rand(n_features))), ("mahalanobis", dict(VI=VI)), ("haversine", {}), ] if sp_version < parse_version("1.8.0.dev0"): # TODO: remove once we no longer support scipy < 1.8.0. # wminkowski was removed in scipy 1.8.0 but should work for previous # versions. metrics.append( ("wminkowski", dict(p=3, w=rng.rand(n_features))), ) else: # Recent scipy versions accept weights in the Minkowski metric directly: metrics.append( ("minkowski", dict(p=3, w=rng.rand(n_features))), ) algorithms = ["brute", "ball_tree", "kd_tree"] X = rng.rand(n_samples, n_features) test = rng.rand(n_query_pts, n_features) for metric, metric_params in metrics: results = {} p = metric_params.pop("p", 2) w = metric_params.get("w", None) for algorithm in algorithms: # KD tree doesn't support all metrics if algorithm == "kd_tree" and ( metric not in neighbors.KDTree.valid_metrics or w is not None ): est = neighbors.NearestNeighbors( algorithm=algorithm, metric=metric, metric_params=metric_params ) with pytest.raises(ValueError): est.fit(X) continue neigh = neighbors.NearestNeighbors( n_neighbors=n_neighbors, algorithm=algorithm, metric=metric, p=p, metric_params=metric_params, ) # Haversine distance only accepts 2D data feature_sl = slice(None, 2) if metric == "haversine" else slice(None) neigh.fit(X[:, feature_sl]) # wminkoski is deprecated in SciPy 1.6.0 and removed in 1.8.0 ExceptionToAssert = None if ( metric == "wminkowski" and algorithm == "brute" and sp_version >= parse_version("1.6.0") ): ExceptionToAssert = DeprecationWarning with pytest.warns(ExceptionToAssert): results[algorithm] = neigh.kneighbors( test[:, feature_sl], return_distance=True ) assert_array_almost_equal(results["brute"][0], results["ball_tree"][0]) assert_array_almost_equal(results["brute"][1], results["ball_tree"][1]) if "kd_tree" in results: assert_array_almost_equal(results["brute"][0], results["kd_tree"][0]) assert_array_almost_equal(results["brute"][1], results["kd_tree"][1]) def test_callable_metric(): def custom_metric(x1, x2): return np.sqrt(np.sum(x1 ** 2 + x2 ** 2)) X = np.random.RandomState(42).rand(20, 2) nbrs1 = neighbors.NearestNeighbors( n_neighbors=3, algorithm="auto", metric=custom_metric ) nbrs2 = neighbors.NearestNeighbors( n_neighbors=3, algorithm="brute", metric=custom_metric ) nbrs1.fit(X) nbrs2.fit(X) dist1, ind1 = nbrs1.kneighbors(X) dist2, ind2 = nbrs2.kneighbors(X) assert_array_almost_equal(dist1, dist2) def test_valid_brute_metric_for_auto_algorithm(): X = rng.rand(12, 12) Xcsr = csr_matrix(X) # check that there is a metric that is valid for brute # but not ball_tree (so we actually test something) assert "cosine" in VALID_METRICS["brute"] assert "cosine" not in VALID_METRICS["ball_tree"] # Metric which don't required any additional parameter require_params = ["mahalanobis", "wminkowski", "seuclidean"] for metric in VALID_METRICS["brute"]: if metric != "precomputed" and metric not in require_params: nn = neighbors.NearestNeighbors( n_neighbors=3, algorithm="auto", metric=metric ) if metric != "haversine": nn.fit(X) nn.kneighbors(X) else: nn.fit(X[:, :2]) nn.kneighbors(X[:, :2]) elif metric == "precomputed": X_precomputed = rng.random_sample((10, 4)) Y_precomputed = rng.random_sample((3, 4)) DXX = metrics.pairwise_distances(X_precomputed, metric="euclidean") DYX = metrics.pairwise_distances( Y_precomputed, X_precomputed, metric="euclidean" ) nb_p = neighbors.NearestNeighbors(n_neighbors=3) nb_p.fit(DXX) nb_p.kneighbors(DYX) for metric in VALID_METRICS_SPARSE["brute"]: if metric != "precomputed" and metric not in require_params: nn = neighbors.NearestNeighbors( n_neighbors=3, algorithm="auto", metric=metric ).fit(Xcsr) nn.kneighbors(Xcsr) # Metric with parameter VI = np.dot(X, X.T) list_metrics = [ ("seuclidean", dict(V=rng.rand(12))), ("wminkowski", dict(w=rng.rand(12))), ("mahalanobis", dict(VI=VI)), ] for metric, params in list_metrics: nn = neighbors.NearestNeighbors( n_neighbors=3, algorithm="auto", metric=metric, metric_params=params ).fit(X) nn.kneighbors(X) def test_metric_params_interface(): X = rng.rand(5, 5) y = rng.randint(0, 2, 5) est = neighbors.KNeighborsClassifier(metric_params={"p": 3}) with pytest.warns(SyntaxWarning): est.fit(X, y) def test_predict_sparse_ball_kd_tree(): rng = np.random.RandomState(0) X = rng.rand(5, 5) y = rng.randint(0, 2, 5) nbrs1 = neighbors.KNeighborsClassifier(1, algorithm="kd_tree") nbrs2 = neighbors.KNeighborsRegressor(1, algorithm="ball_tree") for model in [nbrs1, nbrs2]: model.fit(X, y) with pytest.raises(ValueError): model.predict(csr_matrix(X)) def test_non_euclidean_kneighbors(): rng = np.random.RandomState(0) X = rng.rand(5, 5) # Find a reasonable radius. dist_array = pairwise_distances(X).flatten() np.sort(dist_array) radius = dist_array[15] # Test kneighbors_graph for metric in ["manhattan", "chebyshev"]: nbrs_graph = neighbors.kneighbors_graph( X, 3, metric=metric, mode="connectivity", include_self=True ).toarray() nbrs1 = neighbors.NearestNeighbors(n_neighbors=3, metric=metric).fit(X) assert_array_equal(nbrs_graph, nbrs1.kneighbors_graph(X).toarray()) # Test radiusneighbors_graph for metric in ["manhattan", "chebyshev"]: nbrs_graph = neighbors.radius_neighbors_graph( X, radius, metric=metric, mode="connectivity", include_self=True ).toarray() nbrs1 = neighbors.NearestNeighbors(metric=metric, radius=radius).fit(X) assert_array_equal(nbrs_graph, nbrs1.radius_neighbors_graph(X).A) # Raise error when wrong parameters are supplied, X_nbrs = neighbors.NearestNeighbors(n_neighbors=3, metric="manhattan") X_nbrs.fit(X) with pytest.raises(ValueError): neighbors.kneighbors_graph(X_nbrs, 3, metric="euclidean") X_nbrs = neighbors.NearestNeighbors(radius=radius, metric="manhattan") X_nbrs.fit(X) with pytest.raises(ValueError): neighbors.radius_neighbors_graph(X_nbrs, radius, metric="euclidean") def check_object_arrays(nparray, list_check): for ind, ele in enumerate(nparray): assert_array_equal(ele, list_check[ind]) def test_k_and_radius_neighbors_train_is_not_query(): # Test kneighbors et.al when query is not training data for algorithm in ALGORITHMS: nn = neighbors.NearestNeighbors(n_neighbors=1, algorithm=algorithm) X = [[0], [1]] nn.fit(X) test_data = [[2], [1]] # Test neighbors. dist, ind = nn.kneighbors(test_data) assert_array_equal(dist, [[1], [0]]) assert_array_equal(ind, [[1], [1]]) dist, ind = nn.radius_neighbors([[2], [1]], radius=1.5) check_object_arrays(dist, [[1], [1, 0]]) check_object_arrays(ind, [[1], [0, 1]]) # Test the graph variants. assert_array_equal(nn.kneighbors_graph(test_data).A, [[0.0, 1.0], [0.0, 1.0]]) assert_array_equal( nn.kneighbors_graph([[2], [1]], mode="distance").A, np.array([[0.0, 1.0], [0.0, 0.0]]), ) rng = nn.radius_neighbors_graph([[2], [1]], radius=1.5) assert_array_equal(rng.A, [[0, 1], [1, 1]]) def test_k_and_radius_neighbors_X_None(): # Test kneighbors et.al when query is None for algorithm in ALGORITHMS: nn = neighbors.NearestNeighbors(n_neighbors=1, algorithm=algorithm) X = [[0], [1]] nn.fit(X) dist, ind = nn.kneighbors() assert_array_equal(dist, [[1], [1]]) assert_array_equal(ind, [[1], [0]]) dist, ind = nn.radius_neighbors(None, radius=1.5) check_object_arrays(dist, [[1], [1]]) check_object_arrays(ind, [[1], [0]]) # Test the graph variants. rng = nn.radius_neighbors_graph(None, radius=1.5) kng = nn.kneighbors_graph(None) for graph in [rng, kng]: assert_array_equal(graph.A, [[0, 1], [1, 0]]) assert_array_equal(graph.data, [1, 1]) assert_array_equal(graph.indices, [1, 0]) X = [[0, 1], [0, 1], [1, 1]] nn = neighbors.NearestNeighbors(n_neighbors=2, algorithm=algorithm) nn.fit(X) assert_array_equal( nn.kneighbors_graph().A, np.array([[0.0, 1.0, 1.0], [1.0, 0.0, 1.0], [1.0, 1.0, 0]]), ) def test_k_and_radius_neighbors_duplicates(): # Test behavior of kneighbors when duplicates are present in query for algorithm in ALGORITHMS: nn = neighbors.NearestNeighbors(n_neighbors=1, algorithm=algorithm) nn.fit([[0], [1]]) # Do not do anything special to duplicates. kng = nn.kneighbors_graph([[0], [1]], mode="distance") assert_array_equal(kng.A, np.array([[0.0, 0.0], [0.0, 0.0]])) assert_array_equal(kng.data, [0.0, 0.0]) assert_array_equal(kng.indices, [0, 1]) dist, ind = nn.radius_neighbors([[0], [1]], radius=1.5) check_object_arrays(dist, [[0, 1], [1, 0]]) check_object_arrays(ind, [[0, 1], [0, 1]]) rng = nn.radius_neighbors_graph([[0], [1]], radius=1.5) assert_array_equal(rng.A, np.ones((2, 2))) rng = nn.radius_neighbors_graph([[0], [1]], radius=1.5, mode="distance") rng.sort_indices() assert_array_equal(rng.A, [[0, 1], [1, 0]]) assert_array_equal(rng.indices, [0, 1, 0, 1]) assert_array_equal(rng.data, [0, 1, 1, 0]) # Mask the first duplicates when n_duplicates > n_neighbors. X = np.ones((3, 1)) nn = neighbors.NearestNeighbors(n_neighbors=1, algorithm="brute") nn.fit(X) dist, ind = nn.kneighbors() assert_array_equal(dist, np.zeros((3, 1))) assert_array_equal(ind, [[1], [0], [1]]) # Test that zeros are explicitly marked in kneighbors_graph. kng = nn.kneighbors_graph(mode="distance") assert_array_equal(kng.A, np.zeros((3, 3))) assert_array_equal(kng.data, np.zeros(3)) assert_array_equal(kng.indices, [1.0, 0.0, 1.0]) assert_array_equal( nn.kneighbors_graph().A, np.array([[0.0, 1.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0]]), ) def test_include_self_neighbors_graph(): # Test include_self parameter in neighbors_graph X = [[2, 3], [4, 5]] kng = neighbors.kneighbors_graph(X, 1, include_self=True).A kng_not_self = neighbors.kneighbors_graph(X, 1, include_self=False).A assert_array_equal(kng, [[1.0, 0.0], [0.0, 1.0]]) assert_array_equal(kng_not_self, [[0.0, 1.0], [1.0, 0.0]]) rng = neighbors.radius_neighbors_graph(X, 5.0, include_self=True).A rng_not_self = neighbors.radius_neighbors_graph(X, 5.0, include_self=False).A assert_array_equal(rng, [[1.0, 1.0], [1.0, 1.0]]) assert_array_equal(rng_not_self, [[0.0, 1.0], [1.0, 0.0]]) @pytest.mark.parametrize("algorithm", ALGORITHMS) def test_same_knn_parallel(algorithm): X, y = datasets.make_classification( n_samples=30, n_features=5, n_redundant=0, random_state=0 ) X_train, X_test, y_train, y_test = train_test_split(X, y) clf = neighbors.KNeighborsClassifier(n_neighbors=3, algorithm=algorithm) clf.fit(X_train, y_train) y = clf.predict(X_test) dist, ind = clf.kneighbors(X_test) graph = clf.kneighbors_graph(X_test, mode="distance").toarray() clf.set_params(n_jobs=3) clf.fit(X_train, y_train) y_parallel = clf.predict(X_test) dist_parallel, ind_parallel = clf.kneighbors(X_test) graph_parallel = clf.kneighbors_graph(X_test, mode="distance").toarray() assert_array_equal(y, y_parallel) assert_array_almost_equal(dist, dist_parallel) assert_array_equal(ind, ind_parallel) assert_array_almost_equal(graph, graph_parallel) @pytest.mark.parametrize("algorithm", ALGORITHMS) def test_same_radius_neighbors_parallel(algorithm): X, y = datasets.make_classification( n_samples=30, n_features=5, n_redundant=0, random_state=0 ) X_train, X_test, y_train, y_test = train_test_split(X, y) clf = neighbors.RadiusNeighborsClassifier(radius=10, algorithm=algorithm) clf.fit(X_train, y_train) y = clf.predict(X_test) dist, ind = clf.radius_neighbors(X_test) graph = clf.radius_neighbors_graph(X_test, mode="distance").toarray() clf.set_params(n_jobs=3) clf.fit(X_train, y_train) y_parallel = clf.predict(X_test) dist_parallel, ind_parallel = clf.radius_neighbors(X_test) graph_parallel = clf.radius_neighbors_graph(X_test, mode="distance").toarray() assert_array_equal(y, y_parallel) for i in range(len(dist)): assert_array_almost_equal(dist[i], dist_parallel[i]) assert_array_equal(ind[i], ind_parallel[i]) assert_array_almost_equal(graph, graph_parallel) @pytest.mark.parametrize("backend", JOBLIB_BACKENDS) @pytest.mark.parametrize("algorithm", ALGORITHMS) def test_knn_forcing_backend(backend, algorithm): # Non-regression test which ensure the knn methods are properly working # even when forcing the global joblib backend. with joblib.parallel_backend(backend): X, y = datasets.make_classification( n_samples=30, n_features=5, n_redundant=0, random_state=0 ) X_train, X_test, y_train, y_test = train_test_split(X, y) clf = neighbors.KNeighborsClassifier( n_neighbors=3, algorithm=algorithm, n_jobs=3 ) clf.fit(X_train, y_train) clf.predict(X_test) clf.kneighbors(X_test) clf.kneighbors_graph(X_test, mode="distance").toarray() def test_dtype_convert(): classifier = neighbors.KNeighborsClassifier(n_neighbors=1) CLASSES = 15 X = np.eye(CLASSES) y = [ch for ch in "ABCDEFGHIJKLMNOPQRSTU"[:CLASSES]] result = classifier.fit(X, y).predict(X) assert_array_equal(result, y) def test_sparse_metric_callable(): def sparse_metric(x, y): # Metric accepting sparse matrix input (only) assert issparse(x) and issparse(y) return x.dot(y.T).A.item() X = csr_matrix( [[1, 1, 1, 1, 1], [1, 0, 1, 0, 1], [0, 0, 1, 0, 0]] # Population matrix ) Y = csr_matrix([[1, 1, 0, 1, 1], [1, 0, 0, 0, 1]]) # Query matrix nn = neighbors.NearestNeighbors( algorithm="brute", n_neighbors=2, metric=sparse_metric ).fit(X) N = nn.kneighbors(Y, return_distance=False) # GS indices of nearest neighbours in `X` for `sparse_metric` gold_standard_nn = np.array([[2, 1], [2, 1]]) assert_array_equal(N, gold_standard_nn) # ignore conversion to boolean in pairwise_distances @ignore_warnings(category=DataConversionWarning) def test_pairwise_boolean_distance(): # Non-regression test for #4523 # 'brute': uses scipy.spatial.distance through pairwise_distances # 'ball_tree': uses sklearn.neighbors._dist_metrics rng = np.random.RandomState(0) X = rng.uniform(size=(6, 5)) NN = neighbors.NearestNeighbors nn1 = NN(metric="jaccard", algorithm="brute").fit(X) nn2 = NN(metric="jaccard", algorithm="ball_tree").fit(X) assert_array_equal(nn1.kneighbors(X)[0], nn2.kneighbors(X)[0]) def test_radius_neighbors_predict_proba(): for seed in range(5): X, y = datasets.make_classification( n_samples=50, n_features=5, n_informative=3, n_redundant=0, n_classes=3, random_state=seed, ) X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0) outlier_label = int(2 - seed) clf = neighbors.RadiusNeighborsClassifier(radius=2, outlier_label=outlier_label) clf.fit(X_tr, y_tr) pred = clf.predict(X_te) proba = clf.predict_proba(X_te) proba_label = proba.argmax(axis=1) proba_label = np.where(proba.sum(axis=1) == 0, outlier_label, proba_label) assert_array_equal(pred, proba_label) def test_pipeline_with_nearest_neighbors_transformer(): # Test chaining KNeighborsTransformer and classifiers/regressors rng = np.random.RandomState(0) X = 2 * rng.rand(40, 5) - 1 X2 = 2 * rng.rand(40, 5) - 1 y = rng.rand(40, 1) n_neighbors = 12 radius = 1.5 # We precompute more neighbors than necessary, to have equivalence between # k-neighbors estimator after radius-neighbors transformer, and vice-versa. factor = 2 k_trans = neighbors.KNeighborsTransformer(n_neighbors=n_neighbors, mode="distance") k_trans_factor = neighbors.KNeighborsTransformer( n_neighbors=int(n_neighbors * factor), mode="distance" ) r_trans = neighbors.RadiusNeighborsTransformer(radius=radius, mode="distance") r_trans_factor = neighbors.RadiusNeighborsTransformer( radius=int(radius * factor), mode="distance" ) k_reg = neighbors.KNeighborsRegressor(n_neighbors=n_neighbors) r_reg = neighbors.RadiusNeighborsRegressor(radius=radius) test_list = [ (k_trans, k_reg), (k_trans_factor, r_reg), (r_trans, r_reg), (r_trans_factor, k_reg), ] for trans, reg in test_list: # compare the chained version and the compact version reg_compact = clone(reg) reg_precomp = clone(reg) reg_precomp.set_params(metric="precomputed") reg_chain = make_pipeline(clone(trans), reg_precomp) y_pred_chain = reg_chain.fit(X, y).predict(X2) y_pred_compact = reg_compact.fit(X, y).predict(X2) assert_array_almost_equal(y_pred_chain, y_pred_compact) @pytest.mark.parametrize( "X, metric, metric_params, expected_algo", [ (np.random.randint(10, size=(10, 10)), "precomputed", None, "brute"), (np.random.randn(10, 20), "euclidean", None, "brute"), (np.random.randn(8, 5), "euclidean", None, "brute"), (np.random.randn(10, 5), "euclidean", None, "kd_tree"), (np.random.randn(10, 5), "seuclidean", {"V": [2] * 5}, "ball_tree"), (np.random.randn(10, 5), "correlation", None, "brute"), ], ) def test_auto_algorithm(X, metric, metric_params, expected_algo): model = neighbors.NearestNeighbors( n_neighbors=4, algorithm="auto", metric=metric, metric_params=metric_params ) model.fit(X) assert model._fit_method == expected_algo # TODO: Remove in 1.1 @pytest.mark.parametrize( "NearestNeighbors", [ neighbors.KNeighborsClassifier, neighbors.KNeighborsRegressor, neighbors.NearestNeighbors, ], # type: ignore ) def test_pairwise_deprecated(NearestNeighbors): nn = NearestNeighbors(metric="precomputed") msg = r"Attribute `_pairwise` was deprecated in version 0\.24" with pytest.warns(FutureWarning, match=msg): nn._pairwise # TODO: Remove in 1.3 def test_neighbors_distance_metric_deprecation(): from sklearn.neighbors import DistanceMetric from sklearn.metrics import DistanceMetric as ActualDistanceMetric msg = r"This import path will be removed in 1\.3" with pytest.warns(FutureWarning, match=msg): dist_metric = DistanceMetric.get_metric("euclidean") assert isinstance(dist_metric, ActualDistanceMetric)