import pytest
import scipy
import numpy as np
from numpy.testing import assert_array_equal

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.datasets import make_regression, make_blobs
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.cluster import KMeans


@pytest.mark.parametrize("n_features_to_select", (0, 5, 0.0, -1, 1.1))
def test_bad_n_features_to_select(n_features_to_select):
    X, y = make_regression(n_features=5)
    sfs = SequentialFeatureSelector(
        LinearRegression(), n_features_to_select=n_features_to_select
    )
    with pytest.raises(ValueError, match="must be either None"):
        sfs.fit(X, y)


def test_bad_direction():
    X, y = make_regression(n_features=5)
    sfs = SequentialFeatureSelector(LinearRegression(), direction="bad")
    with pytest.raises(ValueError, match="must be either 'forward' or"):
        sfs.fit(X, y)


@pytest.mark.parametrize("direction", ("forward", "backward"))
@pytest.mark.parametrize("n_features_to_select", (1, 5, 9, None))
def test_n_features_to_select(direction, n_features_to_select):
    # Make sure n_features_to_select is respected
    X, y = make_regression(n_features=10)
    sfs = SequentialFeatureSelector(
        LinearRegression(),
        n_features_to_select=n_features_to_select,
        direction=direction,
        cv=2,
    )
    sfs.fit(X, y)

    if n_features_to_select is None:
        n_features_to_select = 5  # n_features // 2
    assert sfs.get_support(indices=True).shape[0] == n_features_to_select
    assert sfs.n_features_to_select_ == n_features_to_select
    assert sfs.transform(X).shape[1] == n_features_to_select


@pytest.mark.parametrize("direction", ("forward", "backward"))
@pytest.mark.parametrize(
    "n_features_to_select, expected",
    (
        (0.1, 1),
        (1.0, 10),
        (0.5, 5),
        (None, 5),  # just to make sure .5 is equivalent to passing None
    ),
)
def test_n_features_to_select_float(direction, n_features_to_select, expected):
    # Test passing a float as n_features_to_select
    X, y = make_regression(n_features=10)
    sfs = SequentialFeatureSelector(
        LinearRegression(),
        n_features_to_select=n_features_to_select,
        direction=direction,
        cv=2,
    )
    sfs.fit(X, y)
    assert sfs.n_features_to_select_ == expected


@pytest.mark.parametrize("seed", range(10))
@pytest.mark.parametrize("direction", ("forward", "backward"))
@pytest.mark.parametrize(
    "n_features_to_select, expected_selected_features",
    [
        (2, [0, 2]),  # f1 is dropped since it has no predictive power
        (1, [2]),  # f2 is more predictive than f0 so it's kept
    ],
)
def test_sanity(seed, direction, n_features_to_select, expected_selected_features):
    # Basic sanity check: 3 features, only f0 and f2 are correlated with the
    # target, f2 having a stronger correlation than f0. We expect f1 to be
    # dropped, and f2 to always be selected.
    rng = np.random.RandomState(seed)
    n_samples = 100
    X = rng.randn(n_samples, 3)
    y = 3 * X[:, 0] - 10 * X[:, 2]

    sfs = SequentialFeatureSelector(
        LinearRegression(),
        n_features_to_select=n_features_to_select,
        direction=direction,
        cv=2,
    )
    sfs.fit(X, y)
    assert_array_equal(sfs.get_support(indices=True), expected_selected_features)


def test_sparse_support():
    # Make sure sparse data is supported
    X, y = make_regression(n_features=10)
    X = scipy.sparse.csr_matrix(X)
    sfs = SequentialFeatureSelector(LinearRegression(), cv=2)
    sfs.fit(X, y)
    sfs.transform(X)


def test_nan_support():
    # Make sure nans are OK if the underlying estimator supports nans
    rng = np.random.RandomState(0)
    n_samples, n_features = 40, 4
    X, y = make_regression(n_samples, n_features, random_state=0)
    nan_mask = rng.randint(0, 2, size=(n_samples, n_features), dtype=bool)
    X[nan_mask] = np.nan
    sfs = SequentialFeatureSelector(HistGradientBoostingRegressor(), cv=2)
    sfs.fit(X, y)
    sfs.transform(X)

    with pytest.raises(ValueError, match="Input contains NaN"):
        # LinearRegression does not support nans
        SequentialFeatureSelector(LinearRegression(), cv=2).fit(X, y)


def test_pipeline_support():
    # Make sure that pipelines can be passed into SFS and that SFS can be
    # passed into a pipeline
    n_samples, n_features = 50, 3
    X, y = make_regression(n_samples, n_features, random_state=0)

    # pipeline in SFS
    pipe = make_pipeline(StandardScaler(), LinearRegression())
    sfs = SequentialFeatureSelector(pipe, cv=2)
    sfs.fit(X, y)
    sfs.transform(X)

    # SFS in pipeline
    sfs = SequentialFeatureSelector(LinearRegression(), cv=2)
    pipe = make_pipeline(StandardScaler(), sfs)
    pipe.fit(X, y)
    pipe.transform(X)


@pytest.mark.parametrize("n_features_to_select", (2, 3))
def test_unsupervised_model_fit(n_features_to_select):
    # Make sure that models without classification labels are not being
    # validated
    X, y = make_blobs(n_features=4)
    sfs = SequentialFeatureSelector(
        KMeans(n_init=1),
        n_features_to_select=n_features_to_select,
    )
    sfs.fit(X)
    assert sfs.transform(X).shape[1] == n_features_to_select


@pytest.mark.parametrize("y", ("no_validation", 1j, 99.9, np.nan, 3))
def test_no_y_validation_model_fit(y):
    # Make sure that other non-conventional y labels are not accepted
    X, clusters = make_blobs(n_features=6)
    sfs = SequentialFeatureSelector(
        KMeans(),
        n_features_to_select=3,
    )

    with pytest.raises((TypeError, ValueError)):
        sfs.fit(X, y)
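

# A minimal extra sketch, not part of the original suite: it only relies on
# the public SelectorMixin API already exercised above (get_support and
# transform). The assumption checked here -- that for dense inputs transform
# is equivalent to masking the columns flagged by get_support -- follows from
# how the selector is used in the tests above, not from any new API.
def test_transform_matches_support_mask():
    X, y = make_regression(n_features=10, random_state=0)
    sfs = SequentialFeatureSelector(
        LinearRegression(), n_features_to_select=3, cv=2
    )
    sfs.fit(X, y)
    mask = sfs.get_support()
    # get_support() with the default indices=False returns a boolean mask
    assert mask.dtype == bool
    assert mask.sum() == sfs.n_features_to_select_
    # transform should keep exactly the columns flagged by the mask
    assert_array_equal(sfs.transform(X), X[:, mask])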