# -*- coding: utf-8 -*-

import re

import numpy as np
from scipy import sparse
import pytest

from sklearn.exceptions import NotFittedError
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_allclose
from sklearn.utils._testing import _convert_container
from sklearn.utils import is_scalar_nan

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder


def test_one_hot_encoder_sparse_dense():
    # check that sparse and dense will give the same results

    X = np.array([[3, 2, 1], [0, 1, 1]])
    enc_sparse = OneHotEncoder()
    enc_dense = OneHotEncoder(sparse=False)

    X_trans_sparse = enc_sparse.fit_transform(X)
    X_trans_dense = enc_dense.fit_transform(X)

    assert X_trans_sparse.shape == (2, 5)
    assert X_trans_dense.shape == (2, 5)

    assert sparse.issparse(X_trans_sparse)
    assert not sparse.issparse(X_trans_dense)

    # check outcome
    assert_array_equal(
        X_trans_sparse.toarray(), [[0.0, 1.0, 0.0, 1.0, 1.0], [1.0, 0.0, 1.0, 0.0, 1.0]]
    )
    assert_array_equal(X_trans_sparse.toarray(), X_trans_dense)


def test_one_hot_encoder_handle_unknown():
    X = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2]])
    X2 = np.array([[4, 1, 1]])

    # Test that one hot encoder raises error for unknown features
    # present during transform.
    oh = OneHotEncoder(handle_unknown="error")
    oh.fit(X)
    with pytest.raises(ValueError, match="Found unknown categories"):
        oh.transform(X2)

    # Test the ignore option, ignores unknown features (giving all 0's)
    oh = OneHotEncoder(handle_unknown="ignore")
    oh.fit(X)
    X2_passed = X2.copy()
    assert_array_equal(
        oh.transform(X2_passed).toarray(),
        np.array([[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]]),
    )
    # ensure transformed data was not modified in place
    assert_allclose(X2, X2_passed)

    # Raise error if handle_unknown is neither ignore or error.
    oh = OneHotEncoder(handle_unknown="42")
    with pytest.raises(ValueError, match="handle_unknown should be either"):
        oh.fit(X)


def test_one_hot_encoder_not_fitted():
    X = np.array([["a"], ["b"]])
    enc = OneHotEncoder(categories=["a", "b"])
    msg = (
        "This OneHotEncoder instance is not fitted yet. "
        "Call 'fit' with appropriate arguments before using this "
        "estimator."
    )
    with pytest.raises(NotFittedError, match=msg):
        enc.transform(X)


def test_one_hot_encoder_handle_unknown_strings():
    X = np.array(["11111111", "22", "333", "4444"]).reshape((-1, 1))
    X2 = np.array(["55555", "22"]).reshape((-1, 1))
    # Non Regression test for the issue #12470
    # Test the ignore option, when categories are numpy string dtype
    # particularly when the known category strings are larger
    # than the unknown category strings
    oh = OneHotEncoder(handle_unknown="ignore")
    oh.fit(X)
    X2_passed = X2.copy()
    assert_array_equal(
        oh.transform(X2_passed).toarray(),
        np.array([[0.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0]]),
    )
    # ensure transformed data was not modified in place
    assert_array_equal(X2, X2_passed)


@pytest.mark.parametrize("output_dtype", [np.int32, np.float32, np.float64])
@pytest.mark.parametrize("input_dtype", [np.int32, np.float32, np.float64])
def test_one_hot_encoder_dtype(input_dtype, output_dtype):
    X = np.asarray([[0, 1]], dtype=input_dtype).T
    X_expected = np.asarray([[1, 0], [0, 1]], dtype=output_dtype)

    oh = OneHotEncoder(categories="auto", dtype=output_dtype)
    assert_array_equal(oh.fit_transform(X).toarray(), X_expected)
    assert_array_equal(oh.fit(X).transform(X).toarray(), X_expected)

    oh = OneHotEncoder(categories="auto", dtype=output_dtype, sparse=False)
    assert_array_equal(oh.fit_transform(X), X_expected)
    assert_array_equal(oh.fit(X).transform(X), X_expected)


@pytest.mark.parametrize("output_dtype", [np.int32, np.float32, np.float64])
def test_one_hot_encoder_dtype_pandas(output_dtype):
    pd = pytest.importorskip("pandas")

    X_df = pd.DataFrame({"A": ["a", "b"], "B": [1, 2]})
    X_expected = np.array([[1, 0, 1, 0], [0, 1, 0, 1]], dtype=output_dtype)

    oh = OneHotEncoder(dtype=output_dtype)
    assert_array_equal(oh.fit_transform(X_df).toarray(), X_expected)
    assert_array_equal(oh.fit(X_df).transform(X_df).toarray(), X_expected)

    oh = OneHotEncoder(dtype=output_dtype, sparse=False)
    assert_array_equal(oh.fit_transform(X_df), X_expected)
    assert_array_equal(oh.fit(X_df).transform(X_df), X_expected)


# TODO: Remove in 1.2 when get_feature_names is removed.
@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn")
@pytest.mark.parametrize("get_names", ["get_feature_names", "get_feature_names_out"])
def test_one_hot_encoder_feature_names(get_names):
    enc = OneHotEncoder()
    X = [
        ["Male", 1, "girl", 2, 3],
        ["Female", 41, "girl", 1, 10],
        ["Male", 51, "boy", 12, 3],
        ["Male", 91, "girl", 21, 30],
    ]

    enc.fit(X)
    feature_names = getattr(enc, get_names)()

    if get_names == "get_feature_names":
        assert isinstance(feature_names, np.ndarray)

    assert_array_equal(
        [
            "x0_Female",
            "x0_Male",
            "x1_1",
            "x1_41",
            "x1_51",
            "x1_91",
            "x2_boy",
            "x2_girl",
            "x3_1",
            "x3_2",
            "x3_12",
            "x3_21",
            "x4_3",
            "x4_10",
            "x4_30",
        ],
        feature_names,
    )

    feature_names2 = enc.get_feature_names(["one", "two", "three", "four", "five"])
    feature_names2 = getattr(enc, get_names)(["one", "two", "three", "four", "five"])

    assert_array_equal(
        [
            "one_Female",
            "one_Male",
            "two_1",
            "two_41",
            "two_51",
            "two_91",
            "three_boy",
            "three_girl",
            "four_1",
            "four_2",
            "four_12",
            "four_21",
            "five_3",
            "five_10",
            "five_30",
        ],
        feature_names2,
    )

    with pytest.raises(ValueError, match="input_features should have length"):
        getattr(enc, get_names)(["one", "two"])


# TODO: Remove in 1.2 when get_feature_names is removed.
@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn")
@pytest.mark.parametrize("get_names", ["get_feature_names", "get_feature_names_out"])
def test_one_hot_encoder_feature_names_unicode(get_names):
    enc = OneHotEncoder()
    X = np.array([["c❤t1", "dat2"]], dtype=object).T
    enc.fit(X)
    feature_names = getattr(enc, get_names)()
    assert_array_equal(["x0_c❤t1", "x0_dat2"], feature_names)
    feature_names = getattr(enc, get_names)(input_features=["n👍me"])
    assert_array_equal(["n👍me_c❤t1", "n👍me_dat2"], feature_names)


def test_one_hot_encoder_set_params():
    X = np.array([[1, 2]]).T
    oh = OneHotEncoder()
    # set params on not yet fitted object
    oh.set_params(categories=[[0, 1, 2, 3]])
    assert oh.get_params()["categories"] == [[0, 1, 2, 3]]
    assert oh.fit_transform(X).toarray().shape == (2, 4)
    # set params on already fitted object
    oh.set_params(categories=[[0, 1, 2, 3, 4]])
    assert oh.fit_transform(X).toarray().shape == (2, 5)


def check_categorical_onehot(X):
    enc = OneHotEncoder(categories="auto")
    Xtr1 = enc.fit_transform(X)

    enc = OneHotEncoder(categories="auto", sparse=False)
    Xtr2 = enc.fit_transform(X)

    assert_allclose(Xtr1.toarray(), Xtr2)

    assert sparse.isspmatrix_csr(Xtr1)
    return Xtr1.toarray()


@pytest.mark.parametrize(
    "X",
    [
        [["def", 1, 55], ["abc", 2, 55]],
        np.array([[10, 1, 55], [5, 2, 55]]),
        np.array([["b", "A", "cat"], ["a", "B", "cat"]], dtype=object),
        np.array([["b", 1, "cat"], ["a", np.nan, "cat"]], dtype=object),
        np.array([["b", 1, "cat"], ["a", float("nan"), "cat"]], dtype=object),
        np.array([[None, 1, "cat"], ["a", 2, "cat"]], dtype=object),
        np.array([[None, 1, None], ["a", np.nan, None]], dtype=object),
        np.array([[None, 1, None], ["a", float("nan"), None]], dtype=object),
    ],
    ids=[
        "mixed",
        "numeric",
        "object",
        "mixed-nan",
        "mixed-float-nan",
        "mixed-None",
        "mixed-None-nan",
        "mixed-None-float-nan",
    ],
)
def test_one_hot_encoder(X):
    Xtr = check_categorical_onehot(np.array(X)[:, [0]])
    assert_allclose(Xtr, [[0, 1], [1, 0]])

    Xtr = check_categorical_onehot(np.array(X)[:, [0, 1]])
    assert_allclose(Xtr, [[0, 1, 1, 0], [1, 0, 0, 1]])

    Xtr = OneHotEncoder(categories="auto").fit_transform(X)
    assert_allclose(Xtr.toarray(), [[0, 1, 1, 0, 1], [1, 0, 0, 1, 1]])


@pytest.mark.parametrize("sparse_", [False, True])
@pytest.mark.parametrize("drop", [None, "first"])
def test_one_hot_encoder_inverse(sparse_, drop):
    X = [["abc", 2, 55], ["def", 1, 55], ["abc", 3, 55]]
    enc = OneHotEncoder(sparse=sparse_, drop=drop)
    X_tr = enc.fit_transform(X)
    exp = np.array(X, dtype=object)
    assert_array_equal(enc.inverse_transform(X_tr), exp)

    X = [[2, 55], [1, 55], [3, 55]]
    enc = OneHotEncoder(sparse=sparse_, categories="auto", drop=drop)
    X_tr = enc.fit_transform(X)
    exp = np.array(X)
    assert_array_equal(enc.inverse_transform(X_tr), exp)

    if drop is None:
        # with unknown categories
        # drop is incompatible with handle_unknown=ignore
        X = [["abc", 2, 55], ["def", 1, 55], ["abc", 3, 55]]
        enc = OneHotEncoder(
            sparse=sparse_,
            handle_unknown="ignore",
            categories=[["abc", "def"], [1, 2], [54, 55, 56]],
        )
        X_tr = enc.fit_transform(X)
        exp = np.array(X, dtype=object)
        exp[2, 1] = None
        assert_array_equal(enc.inverse_transform(X_tr), exp)

        # with an otherwise numerical output, still object if unknown
        X = [[2, 55], [1, 55], [3, 55]]
        enc = OneHotEncoder(
            sparse=sparse_, categories=[[1, 2], [54, 56]], handle_unknown="ignore"
        )
        X_tr = enc.fit_transform(X)
        exp = np.array(X, dtype=object)
        exp[2, 0] = None
        exp[:, 1] = None
        assert_array_equal(enc.inverse_transform(X_tr), exp)

    # incorrect shape raises
    X_tr = np.array([[0, 1, 1], [1, 0, 1]])
    msg = re.escape("Shape of the passed X data is not correct")
    with pytest.raises(ValueError, match=msg):
        enc.inverse_transform(X_tr)


@pytest.mark.parametrize("sparse_", [False, True])
@pytest.mark.parametrize(
    "X, X_trans",
    [
        ([[2, 55], [1, 55], [2, 55]], [[0, 1, 1], [0, 0, 0], [0, 1, 1]]),
        (
            [["one", "a"], ["two", "a"], ["three", "b"], ["two", "a"]],
            [[0, 0, 0, 0, 0], [0, 0, 0, 0, 1], [0, 1, 0, 0, 0]],
        ),
    ],
)
def test_one_hot_encoder_inverse_transform_raise_error_with_unknown(
    X, X_trans, sparse_
):
    """Check that `inverse_transform` raise an error with unknown samples, no
    dropped feature, and `handle_unknow="error`.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/14934
    """
    enc = OneHotEncoder(sparse=sparse_).fit(X)
    msg = (
        r"Samples \[(\d )*\d\] can not be inverted when drop=None and "
        r"handle_unknown='error' because they contain all zeros"
    )

    if sparse_:
        # emulate sparse data transform by a one-hot encoder sparse.
        X_trans = _convert_container(X_trans, "sparse")
    with pytest.raises(ValueError, match=msg):
        enc.inverse_transform(X_trans)


def test_one_hot_encoder_inverse_if_binary():
    X = np.array([["Male", 1], ["Female", 3], ["Female", 2]], dtype=object)
    ohe = OneHotEncoder(drop="if_binary", sparse=False)
    X_tr = ohe.fit_transform(X)
    assert_array_equal(ohe.inverse_transform(X_tr), X)


# check that resetting drop option without refitting does not throw an error
# TODO: Remove in 1.2 when get_feature_names is removed.
@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn")
@pytest.mark.parametrize("get_names", ["get_feature_names", "get_feature_names_out"])
@pytest.mark.parametrize("drop", ["if_binary", "first", None])
@pytest.mark.parametrize("reset_drop", ["if_binary", "first", None])
def test_one_hot_encoder_drop_reset(get_names, drop, reset_drop):
    X = np.array([["Male", 1], ["Female", 3], ["Female", 2]], dtype=object)
    ohe = OneHotEncoder(drop=drop, sparse=False)
    ohe.fit(X)
    X_tr = ohe.transform(X)
    feature_names = getattr(ohe, get_names)()
    ohe.set_params(drop=reset_drop)
    assert_array_equal(ohe.inverse_transform(X_tr), X)
    assert_allclose(ohe.transform(X), X_tr)
    assert_array_equal(getattr(ohe, get_names)(), feature_names)


@pytest.mark.parametrize("method", ["fit", "fit_transform"])
@pytest.mark.parametrize("X", [[1, 2], np.array([3.0, 4.0])])
def test_X_is_not_1D(X, method):
    oh = OneHotEncoder()

    msg = "Expected 2D array, got 1D array instead"
    with pytest.raises(ValueError, match=msg):
        getattr(oh, method)(X)


@pytest.mark.parametrize("method", ["fit", "fit_transform"])
def test_X_is_not_1D_pandas(method):
    pd = pytest.importorskip("pandas")
    X = pd.Series([6, 3, 4, 6])
    oh = OneHotEncoder()

    msg = "Expected 2D array, got 1D array instead"
    with pytest.raises(ValueError, match=msg):
        getattr(oh, method)(X)


@pytest.mark.parametrize(
    "X, cat_exp, cat_dtype",
    [
        ([["abc", 55], ["def", 55]], [["abc", "def"], [55]], np.object_),
        (np.array([[1, 2], [3, 2]]), [[1, 3], [2]], np.integer),
        (
            np.array([["A", "cat"], ["B", "cat"]], dtype=object),
            [["A", "B"], ["cat"]],
            np.object_,
        ),
        (np.array([["A", "cat"], ["B", "cat"]]), [["A", "B"], ["cat"]], np.str_),
        (np.array([[1, 2], [np.nan, 2]]), [[1, np.nan], [2]], np.float_),
        (
            np.array([["A", np.nan], [None, np.nan]], dtype=object),
            [["A", None], [np.nan]],
            np.object_,
        ),
        (
            np.array([["A", float("nan")], [None, float("nan")]], dtype=object),
            [["A", None], [float("nan")]],
            np.object_,
        ),
    ],
    ids=[
        "mixed",
        "numeric",
        "object",
        "string",
        "missing-float",
        "missing-np.nan-object",
        "missing-float-nan-object",
    ],
)
def test_one_hot_encoder_categories(X, cat_exp, cat_dtype):
    # order of categories should not depend on order of samples
    for Xi in [X, X[::-1]]:
        enc = OneHotEncoder(categories="auto")
        enc.fit(Xi)
        # assert enc.categories == 'auto'
        assert isinstance(enc.categories_, list)
        for res, exp in zip(enc.categories_, cat_exp):
            res_list = res.tolist()
            if is_scalar_nan(exp[-1]):
                assert is_scalar_nan(res_list[-1])
                assert res_list[:-1] == exp[:-1]
            else:
                assert res.tolist() == exp
            assert np.issubdtype(res.dtype, cat_dtype)


@pytest.mark.parametrize(
    "X, X2, cats, cat_dtype",
    [
        (
            np.array([["a", "b"]], dtype=object).T,
            np.array([["a", "d"]], dtype=object).T,
            [["a", "b", "c"]],
            np.object_,
        ),
        (
            np.array([[1, 2]], dtype="int64").T,
            np.array([[1, 4]], dtype="int64").T,
            [[1, 2, 3]],
            np.int64,
        ),
        (
            np.array([["a", "b"]], dtype=object).T,
            np.array([["a", "d"]], dtype=object).T,
            [np.array(["a", "b", "c"])],
            np.object_,
        ),
        (
            np.array([[None, "a"]], dtype=object).T,
            np.array([[None, "b"]], dtype=object).T,
            [[None, "a", "z"]],
            object,
        ),
        (
            np.array([["a", "b"]], dtype=object).T,
            np.array([["a", np.nan]], dtype=object).T,
            [["a", "b", "z"]],
            object,
        ),
        (
            np.array([["a", None]], dtype=object).T,
            np.array([["a", np.nan]], dtype=object).T,
            [["a", None, "z"]],
            object,
        ),
        (
            np.array([["a", np.nan]], dtype=object).T,
            np.array([["a", None]], dtype=object).T,
            [["a", np.nan, "z"]],
            object,
        ),
    ],
    ids=[
        "object",
        "numeric",
        "object-string",
        "object-string-none",
        "object-string-nan",
        "object-None-and-nan",
        "object-nan-and-None",
    ],
)
def test_one_hot_encoder_specified_categories(X, X2, cats, cat_dtype):
    enc = OneHotEncoder(categories=cats)
    exp = np.array([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]])
    assert_array_equal(enc.fit_transform(X).toarray(), exp)
    assert list(enc.categories[0]) == list(cats[0])
    assert enc.categories_[0].tolist() == list(cats[0])
    # manually specified categories should have same dtype as
    # the data when coerced from lists
    assert enc.categories_[0].dtype == cat_dtype

    # when specifying categories manually, unknown categories should already
    # raise when fitting
    enc = OneHotEncoder(categories=cats)
    with pytest.raises(ValueError, match="Found unknown categories"):
        enc.fit(X2)
    enc = OneHotEncoder(categories=cats, handle_unknown="ignore")
    exp = np.array([[1.0, 0.0, 0.0], [0.0, 0.0, 0.0]])
    assert_array_equal(enc.fit(X2).transform(X2).toarray(), exp)


def test_one_hot_encoder_unsorted_categories():
    X = np.array([["a", "b"]], dtype=object).T

    enc = OneHotEncoder(categories=[["b", "a", "c"]])
    exp = np.array([[0.0, 1.0, 0.0], [1.0, 0.0, 0.0]])
    assert_array_equal(enc.fit(X).transform(X).toarray(), exp)
    assert_array_equal(enc.fit_transform(X).toarray(), exp)
    assert enc.categories_[0].tolist() == ["b", "a", "c"]
    assert np.issubdtype(enc.categories_[0].dtype, np.object_)

    # unsorted passed categories still raise for numerical values
    X = np.array([[1, 2]]).T
    enc = OneHotEncoder(categories=[[2, 1, 3]])
    msg = "Unsorted categories are not supported"
    with pytest.raises(ValueError, match=msg):
        enc.fit_transform(X)

    # np.nan must be the last category in categories[0] to be considered sorted
    X = np.array([[1, 2, np.nan]]).T
    enc = OneHotEncoder(categories=[[1, np.nan, 2]])
    with pytest.raises(ValueError, match=msg):
        enc.fit_transform(X)


def test_one_hot_encoder_specified_categories_mixed_columns():
    # multiple columns
    X = np.array([["a", "b"], [0, 2]], dtype=object).T
    enc = OneHotEncoder(categories=[["a", "b", "c"], [0, 1, 2]])
    exp = np.array([[1.0, 0.0, 0.0, 1.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0, 0.0, 1.0]])
    assert_array_equal(enc.fit_transform(X).toarray(), exp)
    assert enc.categories_[0].tolist() == ["a", "b", "c"]
    assert np.issubdtype(enc.categories_[0].dtype, np.object_)
    assert enc.categories_[1].tolist() == [0, 1, 2]
    # integer categories but from object dtype data
    assert np.issubdtype(enc.categories_[1].dtype, np.object_)


def test_one_hot_encoder_pandas():
    pd = pytest.importorskip("pandas")

    X_df = pd.DataFrame({"A": ["a", "b"], "B": [1, 2]})

    Xtr = check_categorical_onehot(X_df)
    assert_allclose(Xtr, [[1, 0, 1, 0], [0, 1, 0, 1]])


# TODO: Remove in 1.2 when get_feature_names is removed.
@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn")
@pytest.mark.parametrize("get_names", ["get_feature_names", "get_feature_names_out"])
@pytest.mark.parametrize(
    "drop, expected_names",
    [
        ("first", ["x0_c", "x2_b"]),
        ("if_binary", ["x0_c", "x1_2", "x2_b"]),
        (["c", 2, "b"], ["x0_b", "x2_a"]),
    ],
    ids=["first", "binary", "manual"],
)
def test_one_hot_encoder_feature_names_drop(get_names, drop, expected_names):
    X = [["c", 2, "a"], ["b", 2, "b"]]

    ohe = OneHotEncoder(drop=drop)
    ohe.fit(X)
    feature_names = getattr(ohe, get_names)()
    if get_names == "get_feature_names":
        assert isinstance(feature_names, np.ndarray)
    assert_array_equal(expected_names, feature_names)


def test_one_hot_encoder_drop_equals_if_binary():
    # Canonical case
    X = [[10, "yes"], [20, "no"], [30, "yes"]]
    expected = np.array(
        [[1.0, 0.0, 0.0, 1.0], [0.0, 1.0, 0.0, 0.0], [0.0, 0.0, 1.0, 1.0]]
    )
    expected_drop_idx = np.array([None, 0])

    ohe = OneHotEncoder(drop="if_binary", sparse=False)
    result = ohe.fit_transform(X)
    assert_array_equal(ohe.drop_idx_, expected_drop_idx)
    assert_allclose(result, expected)

    # with only one cat, the behaviour is equivalent to drop=None
    X = [["true", "a"], ["false", "a"], ["false", "a"]]
    expected = np.array([[1.0, 1.0], [0.0, 1.0], [0.0, 1.0]])
    expected_drop_idx = np.array([0, None])

    ohe = OneHotEncoder(drop="if_binary", sparse=False)
    result = ohe.fit_transform(X)
    assert_array_equal(ohe.drop_idx_, expected_drop_idx)
    assert_allclose(result, expected)


@pytest.mark.parametrize(
    "X",
    [
        [["abc", 2, 55], ["def", 1, 55]],
        np.array([[10, 2, 55], [20, 1, 55]]),
        np.array([["a", "B", "cat"], ["b", "A", "cat"]], dtype=object),
    ],
    ids=["mixed", "numeric", "object"],
)
def test_ordinal_encoder(X):
    enc = OrdinalEncoder()
    exp = np.array([[0, 1, 0], [1, 0, 0]], dtype="int64")
    assert_array_equal(enc.fit_transform(X), exp.astype("float64"))
    enc = OrdinalEncoder(dtype="int64")
    assert_array_equal(enc.fit_transform(X), exp)


@pytest.mark.parametrize(
    "X, X2, cats, cat_dtype",
    [
        (
            np.array([["a", "b"]], dtype=object).T,
            np.array([["a", "d"]], dtype=object).T,
            [["a", "b", "c"]],
            np.object_,
        ),
        (
            np.array([[1, 2]], dtype="int64").T,
            np.array([[1, 4]], dtype="int64").T,
            [[1, 2, 3]],
            np.int64,
        ),
        (
            np.array([["a", "b"]], dtype=object).T,
            np.array([["a", "d"]], dtype=object).T,
            [np.array(["a", "b", "c"])],
            np.object_,
        ),
    ],
    ids=["object", "numeric", "object-string-cat"],
)
def test_ordinal_encoder_specified_categories(X, X2, cats, cat_dtype):
    enc = OrdinalEncoder(categories=cats)
    exp = np.array([[0.0], [1.0]])
    assert_array_equal(enc.fit_transform(X), exp)
    assert list(enc.categories[0]) == list(cats[0])
    assert enc.categories_[0].tolist() == list(cats[0])
    # manually specified categories should have same dtype as
    # the data when coerced from lists
    assert enc.categories_[0].dtype == cat_dtype

    # when specifying categories manually, unknown categories should already
    # raise when fitting
    enc = OrdinalEncoder(categories=cats)
    with pytest.raises(ValueError, match="Found unknown categories"):
        enc.fit(X2)


def test_ordinal_encoder_inverse():
    X = [["abc", 2, 55], ["def", 1, 55]]
    enc = OrdinalEncoder()
    X_tr = enc.fit_transform(X)
    exp = np.array(X, dtype=object)
    assert_array_equal(enc.inverse_transform(X_tr), exp)

    # incorrect shape raises
    X_tr = np.array([[0, 1, 1, 2], [1, 0, 1, 0]])
    msg = re.escape("Shape of the passed X data is not correct")
    with pytest.raises(ValueError, match=msg):
        enc.inverse_transform(X_tr)


def test_ordinal_encoder_handle_unknowns_string():
    enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-2)
    X_fit = np.array([["a", "x"], ["b", "y"], ["c", "z"]], dtype=object)
    X_trans = np.array([["c", "xy"], ["bla", "y"], ["a", "x"]], dtype=object)
    enc.fit(X_fit)

    X_trans_enc = enc.transform(X_trans)
    exp = np.array([[2, -2], [-2, 1], [0, 0]], dtype="int64")
    assert_array_equal(X_trans_enc, exp)

    X_trans_inv = enc.inverse_transform(X_trans_enc)
    inv_exp = np.array([["c", None], [None, "y"], ["a", "x"]], dtype=object)
    assert_array_equal(X_trans_inv, inv_exp)


@pytest.mark.parametrize("dtype", [float, int])
def test_ordinal_encoder_handle_unknowns_numeric(dtype):
    enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-999)
    X_fit = np.array([[1, 7], [2, 8], [3, 9]], dtype=dtype)
    X_trans = np.array([[3, 12], [23, 8], [1, 7]], dtype=dtype)
    enc.fit(X_fit)

    X_trans_enc = enc.transform(X_trans)
    exp = np.array([[2, -999], [-999, 1], [0, 0]], dtype="int64")
    assert_array_equal(X_trans_enc, exp)

    X_trans_inv = enc.inverse_transform(X_trans_enc)
    inv_exp = np.array([[3, None], [None, 8], [1, 7]], dtype=object)
    assert_array_equal(X_trans_inv, inv_exp)


@pytest.mark.parametrize(
    "params, err_type, err_msg",
    [
        (
            {"handle_unknown": "use_encoded_value"},
            TypeError,
            "unknown_value should be an integer or np.nan when handle_unknown "
            "is 'use_encoded_value', got None.",
        ),
        (
            {"unknown_value": -2},
            TypeError,
            "unknown_value should only be set when handle_unknown is "
            "'use_encoded_value', got -2.",
        ),
        (
            {"handle_unknown": "use_encoded_value", "unknown_value": "bla"},
            TypeError,
            "unknown_value should be an integer or np.nan when handle_unknown "
            "is 'use_encoded_value', got bla.",
        ),
        (
            {"handle_unknown": "use_encoded_value", "unknown_value": 1},
            ValueError,
            "The used value for unknown_value (1) is one of the values "
            "already used for encoding the seen categories.",
        ),
        (
            {"handle_unknown": "ignore"},
            ValueError,
            "handle_unknown should be either 'error' or 'use_encoded_value', "
            "got ignore.",
        ),
    ],
)
def test_ordinal_encoder_handle_unknowns_raise(params, err_type, err_msg):
    # Check error message when validating input parameters
    X = np.array([["a", "x"], ["b", "y"]], dtype=object)

    encoder = OrdinalEncoder(**params)
    with pytest.raises(err_type, match=err_msg):
        encoder.fit(X)


def test_ordinal_encoder_handle_unknowns_nan():
    # Make sure unknown_value=np.nan properly works

    enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=np.nan)

    X_fit = np.array([[1], [2], [3]])
    enc.fit(X_fit)
    X_trans = enc.transform([[1], [2], [4]])
    assert_array_equal(X_trans, [[0], [1], [np.nan]])


def test_ordinal_encoder_handle_unknowns_nan_non_float_dtype():
    # Make sure an error is raised when unknown_value=np.nan and the dtype
    # isn't a float dtype
    enc = OrdinalEncoder(
        handle_unknown="use_encoded_value", unknown_value=np.nan, dtype=int
    )

    X_fit = np.array([[1], [2], [3]])
    with pytest.raises(ValueError, match="dtype parameter should be a float dtype"):
        enc.fit(X_fit)


def test_ordinal_encoder_raise_categories_shape():

    X = np.array([["Low", "Medium", "High", "Medium", "Low"]], dtype=object).T
    cats = ["Low", "Medium", "High"]
    enc = OrdinalEncoder(categories=cats)
    msg = "Shape mismatch: if categories is an array,"

    with pytest.raises(ValueError, match=msg):
        enc.fit(X)


def test_encoder_dtypes():
    # check that dtypes are preserved when determining categories
    enc = OneHotEncoder(categories="auto")
    exp = np.array([[1.0, 0.0, 1.0, 0.0], [0.0, 1.0, 0.0, 1.0]], dtype="float64")

    for X in [
        np.array([[1, 2], [3, 4]], dtype="int64"),
        np.array([[1, 2], [3, 4]], dtype="float64"),
        np.array([["a", "b"], ["c", "d"]]),  # str dtype
        np.array([[b"a", b"b"], [b"c", b"d"]]),  # bytes dtype
        np.array([[1, "a"], [3, "b"]], dtype="object"),
    ]:
        enc.fit(X)
        assert all([enc.categories_[i].dtype == X.dtype for i in range(2)])
        assert_array_equal(enc.transform(X).toarray(), exp)

    X = [[1, 2], [3, 4]]
    enc.fit(X)
    assert all([np.issubdtype(enc.categories_[i].dtype, np.integer) for i in range(2)])
    assert_array_equal(enc.transform(X).toarray(), exp)

    X = [[1, "a"], [3, "b"]]
    enc.fit(X)
    assert all([enc.categories_[i].dtype == "object" for i in range(2)])
    assert_array_equal(enc.transform(X).toarray(), exp)


def test_encoder_dtypes_pandas():
    # check dtype (similar to test_categorical_encoder_dtypes for dataframes)
    pd = pytest.importorskip("pandas")

    enc = OneHotEncoder(categories="auto")
    exp = np.array(
        [[1.0, 0.0, 1.0, 0.0, 1.0, 0.0], [0.0, 1.0, 0.0, 1.0, 0.0, 1.0]],
        dtype="float64",
    )

    X = pd.DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]}, dtype="int64")
    enc.fit(X)
    assert all([enc.categories_[i].dtype == "int64" for i in range(2)])
    assert_array_equal(enc.transform(X).toarray(), exp)

    X = pd.DataFrame({"A": [1, 2], "B": ["a", "b"], "C": [3.0, 4.0]})
    X_type = [X["A"].dtype, X["B"].dtype, X["C"].dtype]
    enc.fit(X)
    assert all([enc.categories_[i].dtype == X_type[i] for i in range(3)])
    assert_array_equal(enc.transform(X).toarray(), exp)


def test_one_hot_encoder_warning():
    enc = OneHotEncoder()
    X = [["Male", 1], ["Female", 3]]
    np.testing.assert_no_warnings(enc.fit_transform, X)


@pytest.mark.parametrize("missing_value", [np.nan, None, float("nan")])
def test_one_hot_encoder_drop_manual(missing_value):
    cats_to_drop = ["def", 12, 3, 56, missing_value]
    enc = OneHotEncoder(drop=cats_to_drop)
    X = [
        ["abc", 12, 2, 55, "a"],
        ["def", 12, 1, 55, "a"],
        ["def", 12, 3, 56, missing_value],
    ]
    trans = enc.fit_transform(X).toarray()
    exp = [[1, 0, 1, 1, 1], [0, 1, 0, 1, 1], [0, 0, 0, 0, 0]]
    assert_array_equal(trans, exp)
    assert enc.drop is cats_to_drop

    dropped_cats = [
        cat[feature] for cat, feature in zip(enc.categories_, enc.drop_idx_)
    ]
    X_inv_trans = enc.inverse_transform(trans)
    X_array = np.array(X, dtype=object)

    # last value is np.nan
    if is_scalar_nan(cats_to_drop[-1]):
        assert_array_equal(dropped_cats[:-1], cats_to_drop[:-1])
        assert is_scalar_nan(dropped_cats[-1])
        assert is_scalar_nan(cats_to_drop[-1])
        # do not include the last column which includes missing values
        assert_array_equal(X_array[:, :-1], X_inv_trans[:, :-1])

        # check last column is the missing value
        assert_array_equal(X_array[-1, :-1], X_inv_trans[-1, :-1])
        assert is_scalar_nan(X_array[-1, -1])
        assert is_scalar_nan(X_inv_trans[-1, -1])
    else:
        assert_array_equal(dropped_cats, cats_to_drop)
        assert_array_equal(X_array, X_inv_trans)


@pytest.mark.parametrize(
    "X_fit, params, err_msg",
    [
        (
            [["Male"], ["Female"]],
            {"drop": "second"},
            "Wrong input for parameter `drop`",
        ),
        (
            [["abc", 2, 55], ["def", 1, 55], ["def", 3, 59]],
            {"drop": np.asarray("b", dtype=object)},
            "Wrong input for parameter `drop`",
        ),
        (
            [["abc", 2, 55], ["def", 1, 55], ["def", 3, 59]],
            {"drop": ["ghi", 3, 59]},
            "The following categories were supposed",
        ),
    ],
)
def test_one_hot_encoder_invalid_params(X_fit, params, err_msg):
    enc = OneHotEncoder(**params)
    with pytest.raises(ValueError, match=err_msg):
        enc.fit(X_fit)


@pytest.mark.parametrize("drop", [["abc", 3], ["abc", 3, 41, "a"]])
def test_invalid_drop_length(drop):
    enc = OneHotEncoder(drop=drop)
    err_msg = "`drop` should have length equal to the number"
    with pytest.raises(ValueError, match=err_msg):
        enc.fit([["abc", 2, 55], ["def", 1, 55], ["def", 3, 59]])


@pytest.mark.parametrize("density", [True, False], ids=["sparse", "dense"])
@pytest.mark.parametrize("drop", ["first", ["a", 2, "b"]], ids=["first", "manual"])
def test_categories(density, drop):
    ohe_base = OneHotEncoder(sparse=density)
    ohe_test = OneHotEncoder(sparse=density, drop=drop)
    X = [["c", 1, "a"], ["a", 2, "b"]]
    ohe_base.fit(X)
    ohe_test.fit(X)
    assert_array_equal(ohe_base.categories_, ohe_test.categories_)
    if drop == "first":
        assert_array_equal(ohe_test.drop_idx_, 0)
    else:
        for drop_cat, drop_idx, cat_list in zip(
            drop, ohe_test.drop_idx_, ohe_test.categories_
        ):
            assert cat_list[int(drop_idx)] == drop_cat
    assert isinstance(ohe_test.drop_idx_, np.ndarray)
    assert ohe_test.drop_idx_.dtype == object


@pytest.mark.parametrize("Encoder", [OneHotEncoder, OrdinalEncoder])
def test_encoders_has_categorical_tags(Encoder):
    assert "categorical" in Encoder()._get_tags()["X_types"]


# TODO: Remove in 1.2 when get_feature_names is removed
def test_one_hot_encoder_get_feature_names_deprecated():
    X = np.array([["cat", "dog"]], dtype=object).T
    enc = OneHotEncoder().fit(X)

    msg = "get_feature_names is deprecated in 1.0"
    with pytest.warns(FutureWarning, match=msg):
        enc.get_feature_names()


# deliberately omit 'OS' as an invalid combo
@pytest.mark.parametrize(
    "input_dtype, category_dtype", ["OO", "OU", "UO", "UU", "US", "SO", "SU", "SS"]
)
@pytest.mark.parametrize("array_type", ["list", "array", "dataframe"])
def test_encoders_string_categories(input_dtype, category_dtype, array_type):
    """Check that encoding work with object, unicode, and byte string dtypes.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/15616
    https://github.com/scikit-learn/scikit-learn/issues/15726
    https://github.com/scikit-learn/scikit-learn/issues/19677
    """

    X = np.array([["b"], ["a"]], dtype=input_dtype)
    categories = [np.array(["b", "a"], dtype=category_dtype)]
    ohe = OneHotEncoder(categories=categories, sparse=False).fit(X)

    X_test = _convert_container(
        [["a"], ["a"], ["b"], ["a"]], array_type, dtype=input_dtype
    )
    X_trans = ohe.transform(X_test)

    expected = np.array([[0, 1], [0, 1], [1, 0], [0, 1]])
    assert_allclose(X_trans, expected)

    oe = OrdinalEncoder(categories=categories).fit(X)
    X_trans = oe.transform(X_test)

    expected = np.array([[1], [1], [0], [1]])
    assert_array_equal(X_trans, expected)


# TODO: Remove in 1.2 when get_feature_names is removed.
@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn")
@pytest.mark.parametrize("get_names", ["get_feature_names", "get_feature_names_out"])
@pytest.mark.parametrize("missing_value", [np.nan, None])
def test_ohe_missing_values_get_feature_names(get_names, missing_value):
    # encoder with missing values with object dtypes
    X = np.array([["a", "b", missing_value, "a", missing_value]], dtype=object).T
    ohe = OneHotEncoder(sparse=False, handle_unknown="ignore").fit(X)
    names = getattr(ohe, get_names)()
    assert_array_equal(names, ["x0_a", "x0_b", f"x0_{missing_value}"])


def test_ohe_missing_value_support_pandas():
    # check support for pandas with mixed dtypes and missing values
    pd = pytest.importorskip("pandas")
    df = pd.DataFrame(
        {
            "col1": ["dog", "cat", None, "cat"],
            "col2": np.array([3, 0, 4, np.nan], dtype=float),
        },
        columns=["col1", "col2"],
    )
    expected_df_trans = np.array(
        [
            [0, 1, 0, 0, 1, 0, 0],
            [1, 0, 0, 1, 0, 0, 0],
            [0, 0, 1, 0, 0, 1, 0],
            [1, 0, 0, 0, 0, 0, 1],
        ]
    )

    Xtr = check_categorical_onehot(df)
    assert_allclose(Xtr, expected_df_trans)


@pytest.mark.parametrize("pd_nan_type", ["pd.NA", "np.nan"])
def test_ohe_missing_value_support_pandas_categorical(pd_nan_type):
    # checks pandas dataframe with categorical features
    if pd_nan_type == "pd.NA":
        # pd.NA is in pandas 1.0
        pd = pytest.importorskip("pandas", minversion="1.0")
        pd_missing_value = pd.NA
    else:  # np.nan
        pd = pytest.importorskip("pandas")
        pd_missing_value = np.nan

    df = pd.DataFrame(
        {
            "col1": pd.Series(["c", "a", pd_missing_value, "b", "a"], dtype="category"),
        }
    )
    expected_df_trans = np.array(
        [
            [0, 0, 1, 0],
            [1, 0, 0, 0],
            [0, 0, 0, 1],
            [0, 1, 0, 0],
            [1, 0, 0, 0],
        ]
    )

    ohe = OneHotEncoder(sparse=False, handle_unknown="ignore")
    df_trans = ohe.fit_transform(df)
    assert_allclose(expected_df_trans, df_trans)

    assert len(ohe.categories_) == 1
    assert_array_equal(ohe.categories_[0][:-1], ["a", "b", "c"])
    assert np.isnan(ohe.categories_[0][-1])


def test_ohe_drop_first_handle_unknown_ignore_warns():
    """Check drop='first' and handle_unknown='ignore' during transform."""
    X = [["a", 0], ["b", 2], ["b", 1]]

    ohe = OneHotEncoder(drop="first", sparse=False, handle_unknown="ignore")
    X_trans = ohe.fit_transform(X)

    X_expected = np.array(
        [
            [0, 0, 0],
            [1, 0, 1],
            [1, 1, 0],
        ]
    )
    assert_allclose(X_trans, X_expected)

    # Both categories are unknown
    X_test = [["c", 3]]
    X_expected = np.array([[0, 0, 0]])

    warn_msg = (
        r"Found unknown categories in columns \[0, 1\] during "
        "transform. These unknown categories will be encoded as all "
        "zeros"
    )
    with pytest.warns(UserWarning, match=warn_msg):
        X_trans = ohe.transform(X_test)
    assert_allclose(X_trans, X_expected)

    # inverse_transform maps to None
    X_inv = ohe.inverse_transform(X_expected)
    assert_array_equal(X_inv, np.array([["a", 0]], dtype=object))


def test_ohe_drop_if_binary_handle_unknown_ignore_warns():
    """Check drop='if_binary' and handle_unknown='ignore' during transform."""
    X = [["a", 0], ["b", 2], ["b", 1]]

    ohe = OneHotEncoder(drop="if_binary", sparse=False, handle_unknown="ignore")
    X_trans = ohe.fit_transform(X)

    X_expected = np.array(
        [
            [0, 1, 0, 0],
            [1, 0, 0, 1],
            [1, 0, 1, 0],
        ]
    )
    assert_allclose(X_trans, X_expected)

    # Both categories are unknown
    X_test = [["c", 3]]
    X_expected = np.array([[0, 0, 0, 0]])

    warn_msg = (
        r"Found unknown categories in columns \[0, 1\] during "
        "transform. These unknown categories will be encoded as all "
        "zeros"
    )
    with pytest.warns(UserWarning, match=warn_msg):
        X_trans = ohe.transform(X_test)
    assert_allclose(X_trans, X_expected)

    # inverse_transform maps to None
    X_inv = ohe.inverse_transform(X_expected)
    assert_array_equal(X_inv, np.array([["a", None]], dtype=object))


def test_ohe_drop_first_explicit_categories():
    """Check drop='first' and handle_unknown='ignore' during fit with
    categories passed in."""

    X = [["a", 0], ["b", 2], ["b", 1]]

    ohe = OneHotEncoder(
        drop="first",
        sparse=False,
        handle_unknown="ignore",
        categories=[["b", "a"], [1, 2]],
    )
    ohe.fit(X)

    X_test = [["c", 1]]
    X_expected = np.array([[0, 0]])

    warn_msg = (
        r"Found unknown categories in columns \[0\] during transform. "
        r"These unknown categories will be encoded as all zeros"
    )
    with pytest.warns(UserWarning, match=warn_msg):
        X_trans = ohe.transform(X_test)
    assert_allclose(X_trans, X_expected)


def test_ordinal_encoder_passthrough_missing_values_float_errors_dtype():
    """Test ordinal encoder with nan passthrough fails when dtype=np.int32."""

    X = np.array([[np.nan, 3.0, 1.0, 3.0]]).T
    oe = OrdinalEncoder(dtype=np.int32)

    msg = (
        r"There are missing values in features \[0\]. For OrdinalEncoder "
        "to passthrough missing values, the dtype parameter must be a "
        "float"
    )
    with pytest.raises(ValueError, match=msg):
        oe.fit(X)


def test_ordinal_encoder_passthrough_missing_values_float():
    """Test ordinal encoder with nan on float dtypes."""

    X = np.array([[np.nan, 3.0, 1.0, 3.0]], dtype=np.float64).T
    oe = OrdinalEncoder().fit(X)

    assert len(oe.categories_) == 1
    assert_allclose(oe.categories_[0], [1.0, 3.0, np.nan])

    X_trans = oe.transform(X)
    assert_allclose(X_trans, [[np.nan], [1.0], [0.0], [1.0]])

    X_inverse = oe.inverse_transform(X_trans)
    assert_allclose(X_inverse, X)


@pytest.mark.parametrize("pd_nan_type", ["pd.NA", "np.nan"])
def test_ordinal_encoder_missing_value_support_pandas_categorical(pd_nan_type):
    """Check ordinal encoder is compatible with pandas."""
    # checks pandas dataframe with categorical features
    if pd_nan_type == "pd.NA":
        # pd.NA is in pandas 1.0
        pd = pytest.importorskip("pandas", minversion="1.0")
        pd_missing_value = pd.NA
    else:  # np.nan
        pd = pytest.importorskip("pandas")
        pd_missing_value = np.nan

    df = pd.DataFrame(
        {
            "col1": pd.Series(["c", "a", pd_missing_value, "b", "a"], dtype="category"),
        }
    )

    oe = OrdinalEncoder().fit(df)
    assert len(oe.categories_) == 1
    assert_array_equal(oe.categories_[0][:3], ["a", "b", "c"])
    assert np.isnan(oe.categories_[0][-1])

    df_trans = oe.transform(df)

    assert_allclose(df_trans, [[2.0], [0.0], [np.nan], [1.0], [0.0]])

    X_inverse = oe.inverse_transform(df_trans)
    assert X_inverse.shape == (5, 1)
    assert_array_equal(X_inverse[:2, 0], ["c", "a"])
    assert_array_equal(X_inverse[3:, 0], ["b", "a"])
    assert np.isnan(X_inverse[2, 0])


@pytest.mark.parametrize(
    "X, X2, cats, cat_dtype",
    [
        (
            (
                np.array([["a", np.nan]], dtype=object).T,
                np.array([["a", "b"]], dtype=object).T,
                [np.array(["a", np.nan, "d"], dtype=object)],
                np.object_,
            )
        ),
        (
            (
                np.array([["a", np.nan]], dtype=object).T,
                np.array([["a", "b"]], dtype=object).T,
                [np.array(["a", np.nan, "d"], dtype=object)],
                np.object_,
            )
        ),
        (
            (
                np.array([[2.0, np.nan]], dtype=np.float64).T,
                np.array([[3.0]], dtype=np.float64).T,
                [np.array([2.0, 4.0, np.nan])],
                np.float64,
            )
        ),
    ],
    ids=[
        "object-None-missing-value",
        "object-nan-missing_value",
        "numeric-missing-value",
    ],
)
def test_ordinal_encoder_specified_categories_missing_passthrough(
    X, X2, cats, cat_dtype
):
    """Test ordinal encoder for specified categories."""
    oe = OrdinalEncoder(categories=cats)
    exp = np.array([[0.0], [np.nan]])
    assert_array_equal(oe.fit_transform(X), exp)
    # manually specified categories should have same dtype as
    # the data when coerced from lists
    assert oe.categories_[0].dtype == cat_dtype

    # when specifying categories manually, unknown categories should already
    # raise when fitting
    oe = OrdinalEncoder(categories=cats)
    with pytest.raises(ValueError, match="Found unknown categories"):
        oe.fit(X2)


@pytest.mark.parametrize(
    "X, expected_X_trans, X_test",
    [
        (
            np.array([[1.0, np.nan, 3.0]]).T,
            np.array([[0.0, np.nan, 1.0]]).T,
            np.array([[4.0]]),
        ),
        (
            np.array([[1.0, 4.0, 3.0]]).T,
            np.array([[0.0, 2.0, 1.0]]).T,
            np.array([[np.nan]]),
        ),
        (
            np.array([["c", np.nan, "b"]], dtype=object).T,
            np.array([[1.0, np.nan, 0.0]]).T,
            np.array([["d"]], dtype=object),
        ),
        (
            np.array([["c", "a", "b"]], dtype=object).T,
            np.array([[2.0, 0.0, 1.0]]).T,
            np.array([[np.nan]], dtype=object),
        ),
    ],
)
def test_ordinal_encoder_handle_missing_and_unknown(X, expected_X_trans, X_test):
    """Test the interaction between missing values and handle_unknown"""

    oe = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)

    X_trans = oe.fit_transform(X)
    assert_allclose(X_trans, expected_X_trans)

    assert_allclose(oe.transform(X_test), [[-1.0]])


def test_ordinal_encoder_sparse():
    """Check that we raise proper error with sparse input in OrdinalEncoder.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/19878
    """
    X = np.array([[3, 2, 1], [0, 1, 1]])
    X_sparse = sparse.csr_matrix(X)

    encoder = OrdinalEncoder()

    err_msg = "A sparse matrix was passed, but dense data is required"
    with pytest.raises(TypeError, match=err_msg):
        encoder.fit(X_sparse)
    with pytest.raises(TypeError, match=err_msg):
        encoder.fit_transform(X_sparse)

    X_trans = encoder.fit_transform(X)
    X_trans_sparse = sparse.csr_matrix(X_trans)
    with pytest.raises(TypeError, match=err_msg):
        encoder.inverse_transform(X_trans_sparse)


def test_ordinal_encoder_fit_with_unseen_category():
    """Check OrdinalEncoder.fit works with unseen category when
    `handle_unknown="use_encoded_value"`.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/19872
    """
    X = np.array([0, 0, 1, 0, 2, 5])[:, np.newaxis]
    oe = OrdinalEncoder(
        categories=[[-1, 0, 1]], handle_unknown="use_encoded_value", unknown_value=-999
    )
    oe.fit(X)

    oe = OrdinalEncoder(categories=[[-1, 0, 1]], handle_unknown="error")
    with pytest.raises(ValueError, match="Found unknown categories"):
        oe.fit(X)


@pytest.mark.parametrize(
    "X_train",
    [
        [["AA", "B"]],
        np.array([["AA", "B"]], dtype="O"),
        np.array([["AA", "B"]], dtype="U"),
    ],
)
@pytest.mark.parametrize(
    "X_test",
    [
        [["A", "B"]],
        np.array([["A", "B"]], dtype="O"),
        np.array([["A", "B"]], dtype="U"),
    ],
)
def test_ordinal_encoder_handle_unknown_string_dtypes(X_train, X_test):
    """Checks that `OrdinalEncoder` transforms string dtypes.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/19872
    """
    enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-9)
    enc.fit(X_train)

    X_trans = enc.transform(X_test)
    assert_allclose(X_trans, [[-9, 0]])


def test_ordinal_encoder_python_integer():
    """Check that `OrdinalEncoder` accepts Python integers that are potentially
    larger than 64 bits.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/20721
    """
    X = np.array(
        [
            44253463435747313673,
            9867966753463435747313673,
            44253462342215747313673,
            442534634357764313673,
        ]
    ).reshape(-1, 1)
    encoder = OrdinalEncoder().fit(X)
    assert_array_equal(encoder.categories_, np.sort(X, axis=0).T)
    X_trans = encoder.transform(X)
    assert_array_equal(X_trans, [[0], [3], [2], [1]])