import numpy as np
import pytest
from scipy import stats

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.svm._bounds import l1_min_c
from sklearn.svm._newrand import bounded_rand_int_wrap, set_seed_wrap
from sklearn.utils.fixes import CSR_CONTAINERS

# Tiny fixture: four 2-d points with two alternative target labelings.
dense_X = [[-1, 0], [0, 1], [1, 1], [1, 1]]
Y1 = [0, 1, 1, 1]  # binary targets
Y2 = [2, 1, 0, 0]  # three-class targets

# Largest value representable as an unsigned 32-bit integer (2**32 - 1).
_MAX_UNSIGNED_INT = 4294967295


@pytest.mark.parametrize("X_container", CSR_CONTAINERS + [np.array])
@pytest.mark.parametrize("loss", ["squared_hinge", "log"])
@pytest.mark.parametrize("Y_label", ["two-classes", "multi-class"])
@pytest.mark.parametrize("intercept_label", ["no-intercept", "fit-intercept"])
def test_l1_min_c(X_container, loss, Y_label, intercept_label):
    """Exercise check_l1_min_c over every container/loss/target/intercept combo."""
    target_by_label = {"two-classes": Y1, "multi-class": Y2}
    kwargs_by_label = {
        "no-intercept": {"fit_intercept": False},
        "fit-intercept": {"fit_intercept": True, "intercept_scaling": 10},
    }
    check_l1_min_c(
        X_container(dense_X),
        target_by_label[Y_label],
        loss,
        **kwargs_by_label[intercept_label],
    )


def check_l1_min_c(X, y, loss, fit_intercept=True, intercept_scaling=1.0):
    """Assert that ``l1_min_c`` is a tight lower bound on C for L1 models.

    At ``C == l1_min_c(...)`` the fitted model must be entirely zero
    (coefficients and intercept alike); raising C by just 1% must make at
    least one parameter non-zero.
    """
    min_c = l1_min_c(
        X,
        y,
        loss=loss,
        fit_intercept=fit_intercept,
        intercept_scaling=intercept_scaling,
    )

    # Pick the estimator matching the requested loss.
    if loss == "log":
        clf = LogisticRegression(penalty="l1", solver="liblinear")
    else:  # "squared_hinge"
        clf = LinearSVC(loss="squared_hinge", penalty="l1", dual=False)
    clf.fit_intercept = fit_intercept
    clf.intercept_scaling = intercept_scaling

    # Exactly at the bound: everything must stay at zero.
    clf.C = min_c
    clf.fit(X, y)
    assert (np.asarray(clf.coef_) == 0).all()
    assert (np.asarray(clf.intercept_) == 0).all()

    # Just above the bound: something must become non-zero.
    clf.C = min_c * 1.01
    clf.fit(X, y)
    assert (np.asarray(clf.coef_) != 0).any() or (np.asarray(clf.intercept_) != 0).any()


def test_ill_posed_min_c():
    """An all-zero design matrix admits no finite bound: expect ValueError."""
    with pytest.raises(ValueError):
        l1_min_c([[0, 0], [0, 0]], [0, 1])


def test_newrand_default():
    """Unseeded ``bounded_rand_int_wrap`` stays in range and is not constant.

    Only invariants of the RNG are checked (never specific values), so this
    passes whether it runs alone or after other tests that call set_seed in
    any order.
    """
    draws = [bounded_rand_int_wrap(100) for _ in range(10)]
    assert min(draws) >= 0 and max(draws) < 100
    assert len(set(draws)) > 1


@pytest.mark.parametrize("seed, expected", [(0, 54), (_MAX_UNSIGNED_INT, 9)])
def test_newrand_set_seed(seed, expected):
    """After ``set_seed_wrap``, the first draw is fully deterministic."""
    set_seed_wrap(seed)
    assert bounded_rand_int_wrap(100) == expected


@pytest.mark.parametrize("seed", [-1, _MAX_UNSIGNED_INT + 1])
def test_newrand_set_seed_overflow(seed):
    """Seeds outside the unsigned 32-bit range must raise OverflowError."""
    with pytest.raises(OverflowError):
        set_seed_wrap(seed)


@pytest.mark.parametrize("range_, n_pts", [(_MAX_UNSIGNED_INT, 10000), (100, 25)])
def test_newrand_bounded_rand_int(range_, n_pts):
    """Check that ``bounded_rand_int`` draws look uniform on [0, range_)."""
    # XXX: this test is very seed sensitive: either it is wrong (too strict?)
    # or the wrapped RNG is not uniform enough, at least on some platforms.
    set_seed_wrap(42)
    n_iter = 100
    uniform_dist = stats.uniform(loc=0, scale=range_)

    # Run the KS test many times so one unlucky sample cannot dominate.
    ks_pvals = [
        stats.kstest(
            [bounded_rand_int_wrap(range_) for _ in range(n_pts)],
            uniform_dist.cdf,
        ).pvalue
        for _ in range(n_iter)
    ]

    # Null hypothesis = samples come from an uniform distribution.
    # Under the null hypothesis, p-values should be uniformly distributed
    # and not concentrated on low values
    # (this may seem counter-intuitive but is backed by multiple refs)
    # So we can do two checks:

    # (1) check uniformity of p-values
    res_pvals = stats.kstest(ks_pvals, stats.uniform(loc=0, scale=1).cdf)
    assert res_pvals.pvalue > 0.05, (
        "Null hypothesis rejected: generated random numbers are not uniform."
        " Details: the (meta) p-value of the test of uniform distribution"
        f" of p-values is {res_pvals.pvalue} which is not > 0.05"
    )

    # (2) (safety belt) check that 90% of p-values are above 0.05
    min_10pct_pval = np.percentile(ks_pvals, q=10)
    # lower 10th quantile pvalue <= 0.05 means that the test rejects the
    # null hypothesis that the sample came from the uniform distribution
    assert min_10pct_pval > 0.05, (
        "Null hypothesis rejected: generated random numbers are not uniform. "
        f"Details: lower 10th quantile p-value of {min_10pct_pval} not > 0.05."
    )


@pytest.mark.parametrize("range_", [-1, _MAX_UNSIGNED_INT + 1])
def test_newrand_bounded_rand_int_limits(range_):
    """Ranges outside the unsigned 32-bit domain must raise OverflowError."""
    with pytest.raises(OverflowError):
        bounded_rand_int_wrap(range_)