import numpy as np import pytest from pandas.errors import InvalidIndexError import pandas as pd from pandas import ( CategoricalIndex, Index, IntervalIndex, Timestamp, ) import pandas._testing as tm class TestTake: def test_take_fill_value(self): # GH 12631 # numeric category idx = CategoricalIndex([1, 2, 3], name="xxx") result = idx.take(np.array([1, 0, -1])) expected = CategoricalIndex([2, 1, 3], name="xxx") tm.assert_index_equal(result, expected) tm.assert_categorical_equal(result.values, expected.values) # fill_value result = idx.take(np.array([1, 0, -1]), fill_value=True) expected = CategoricalIndex([2, 1, np.nan], categories=[1, 2, 3], name="xxx") tm.assert_index_equal(result, expected) tm.assert_categorical_equal(result.values, expected.values) # allow_fill=False result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) expected = CategoricalIndex([2, 1, 3], name="xxx") tm.assert_index_equal(result, expected) tm.assert_categorical_equal(result.values, expected.values) # object category idx = CategoricalIndex( list("CBA"), categories=list("ABC"), ordered=True, name="xxx" ) result = idx.take(np.array([1, 0, -1])) expected = CategoricalIndex( list("BCA"), categories=list("ABC"), ordered=True, name="xxx" ) tm.assert_index_equal(result, expected) tm.assert_categorical_equal(result.values, expected.values) # fill_value result = idx.take(np.array([1, 0, -1]), fill_value=True) expected = CategoricalIndex( ["B", "C", np.nan], categories=list("ABC"), ordered=True, name="xxx" ) tm.assert_index_equal(result, expected) tm.assert_categorical_equal(result.values, expected.values) # allow_fill=False result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) expected = CategoricalIndex( list("BCA"), categories=list("ABC"), ordered=True, name="xxx" ) tm.assert_index_equal(result, expected) tm.assert_categorical_equal(result.values, expected.values) msg = ( "When allow_fill=True and fill_value is not None, " "all indices must be >= -1" ) with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -2]), fill_value=True) with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -5]), fill_value=True) msg = "index -5 is out of bounds for (axis 0 with )?size 3" with pytest.raises(IndexError, match=msg): idx.take(np.array([1, -5])) def test_take_fill_value_datetime(self): # datetime category idx = pd.DatetimeIndex(["2011-01-01", "2011-02-01", "2011-03-01"], name="xxx") idx = CategoricalIndex(idx) result = idx.take(np.array([1, 0, -1])) expected = pd.DatetimeIndex( ["2011-02-01", "2011-01-01", "2011-03-01"], name="xxx" ) expected = CategoricalIndex(expected) tm.assert_index_equal(result, expected) # fill_value result = idx.take(np.array([1, 0, -1]), fill_value=True) expected = pd.DatetimeIndex(["2011-02-01", "2011-01-01", "NaT"], name="xxx") exp_cats = pd.DatetimeIndex(["2011-01-01", "2011-02-01", "2011-03-01"]) expected = CategoricalIndex(expected, categories=exp_cats) tm.assert_index_equal(result, expected) # allow_fill=False result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) expected = pd.DatetimeIndex( ["2011-02-01", "2011-01-01", "2011-03-01"], name="xxx" ) expected = CategoricalIndex(expected) tm.assert_index_equal(result, expected) msg = ( "When allow_fill=True and fill_value is not None, " "all indices must be >= -1" ) with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -2]), fill_value=True) with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -5]), fill_value=True) msg = "index -5 is out of bounds for (axis 0 with )?size 3" with pytest.raises(IndexError, match=msg): idx.take(np.array([1, -5])) def test_take_invalid_kwargs(self): idx = CategoricalIndex([1, 2, 3], name="foo") indices = [1, 0, -1] msg = r"take\(\) got an unexpected keyword argument 'foo'" with pytest.raises(TypeError, match=msg): idx.take(indices, foo=2) msg = "the 'out' parameter is not supported" with pytest.raises(ValueError, match=msg): idx.take(indices, out=indices) msg = "the 'mode' parameter is not supported" with pytest.raises(ValueError, match=msg): idx.take(indices, mode="clip") class TestGetLoc: def test_get_loc(self): # GH 12531 cidx1 = CategoricalIndex(list("abcde"), categories=list("edabc")) idx1 = Index(list("abcde")) assert cidx1.get_loc("a") == idx1.get_loc("a") assert cidx1.get_loc("e") == idx1.get_loc("e") for i in [cidx1, idx1]: with pytest.raises(KeyError, match="'NOT-EXIST'"): i.get_loc("NOT-EXIST") # non-unique cidx2 = CategoricalIndex(list("aacded"), categories=list("edabc")) idx2 = Index(list("aacded")) # results in bool array res = cidx2.get_loc("d") tm.assert_numpy_array_equal(res, idx2.get_loc("d")) tm.assert_numpy_array_equal( res, np.array([False, False, False, True, False, True]) ) # unique element results in scalar res = cidx2.get_loc("e") assert res == idx2.get_loc("e") assert res == 4 for i in [cidx2, idx2]: with pytest.raises(KeyError, match="'NOT-EXIST'"): i.get_loc("NOT-EXIST") # non-unique, sliceable cidx3 = CategoricalIndex(list("aabbb"), categories=list("abc")) idx3 = Index(list("aabbb")) # results in slice res = cidx3.get_loc("a") assert res == idx3.get_loc("a") assert res == slice(0, 2, None) res = cidx3.get_loc("b") assert res == idx3.get_loc("b") assert res == slice(2, 5, None) for i in [cidx3, idx3]: with pytest.raises(KeyError, match="'c'"): i.get_loc("c") def test_get_loc_unique(self): cidx = CategoricalIndex(list("abc")) result = cidx.get_loc("b") assert result == 1 def test_get_loc_monotonic_nonunique(self): cidx = CategoricalIndex(list("abbc")) result = cidx.get_loc("b") expected = slice(1, 3, None) assert result == expected def test_get_loc_nonmonotonic_nonunique(self): cidx = CategoricalIndex(list("abcb")) result = cidx.get_loc("b") expected = np.array([False, True, False, True], dtype=bool) tm.assert_numpy_array_equal(result, expected) def test_get_loc_nan(self): # GH#41933 ci = CategoricalIndex(["A", "B", np.nan]) res = ci.get_loc(np.nan) assert res == 2 class TestGetIndexer: def test_get_indexer_base(self): # Determined by cat ordering. idx = CategoricalIndex(list("cab"), categories=list("cab")) expected = np.arange(len(idx), dtype=np.intp) actual = idx.get_indexer(idx) tm.assert_numpy_array_equal(expected, actual) with pytest.raises(ValueError, match="Invalid fill method"): idx.get_indexer(idx, method="invalid") def test_get_indexer_requires_unique(self): np.random.seed(123456789) ci = CategoricalIndex(list("aabbca"), categories=list("cab"), ordered=False) oidx = Index(np.array(ci)) msg = "Reindexing only valid with uniquely valued Index objects" for n in [1, 2, 5, len(ci)]: finder = oidx[np.random.randint(0, len(ci), size=n)] with pytest.raises(InvalidIndexError, match=msg): ci.get_indexer(finder) # see gh-17323 # # Even when indexer is equal to the # members in the index, we should # respect duplicates instead of taking # the fast-track path. for finder in [list("aabbca"), list("aababca")]: with pytest.raises(InvalidIndexError, match=msg): ci.get_indexer(finder) def test_get_indexer_non_unique(self): idx1 = CategoricalIndex(list("aabcde"), categories=list("edabc")) idx2 = CategoricalIndex(list("abf")) for indexer in [idx2, list("abf"), Index(list("abf"))]: msg = "Reindexing only valid with uniquely valued Index objects" with pytest.raises(InvalidIndexError, match=msg): idx1.get_indexer(indexer) r1, _ = idx1.get_indexer_non_unique(indexer) expected = np.array([0, 1, 2, -1], dtype=np.intp) tm.assert_almost_equal(r1, expected) def test_get_indexer_method(self): idx1 = CategoricalIndex(list("aabcde"), categories=list("edabc")) idx2 = CategoricalIndex(list("abf")) msg = "method pad not yet implemented for CategoricalIndex" with pytest.raises(NotImplementedError, match=msg): idx2.get_indexer(idx1, method="pad") msg = "method backfill not yet implemented for CategoricalIndex" with pytest.raises(NotImplementedError, match=msg): idx2.get_indexer(idx1, method="backfill") msg = "method nearest not yet implemented for CategoricalIndex" with pytest.raises(NotImplementedError, match=msg): idx2.get_indexer(idx1, method="nearest") def test_get_indexer_array(self): arr = np.array( [Timestamp("1999-12-31 00:00:00"), Timestamp("2000-12-31 00:00:00")], dtype=object, ) cats = [Timestamp("1999-12-31 00:00:00"), Timestamp("2000-12-31 00:00:00")] ci = CategoricalIndex(cats, categories=cats, ordered=False, dtype="category") result = ci.get_indexer(arr) expected = np.array([0, 1], dtype="intp") tm.assert_numpy_array_equal(result, expected) def test_get_indexer_same_categories_same_order(self): ci = CategoricalIndex(["a", "b"], categories=["a", "b"]) result = ci.get_indexer(CategoricalIndex(["b", "b"], categories=["a", "b"])) expected = np.array([1, 1], dtype="intp") tm.assert_numpy_array_equal(result, expected) def test_get_indexer_same_categories_different_order(self): # https://github.com/pandas-dev/pandas/issues/19551 ci = CategoricalIndex(["a", "b"], categories=["a", "b"]) result = ci.get_indexer(CategoricalIndex(["b", "b"], categories=["b", "a"])) expected = np.array([1, 1], dtype="intp") tm.assert_numpy_array_equal(result, expected) class TestWhere: def test_where(self, listlike_box): klass = listlike_box i = CategoricalIndex(list("aabbca"), categories=list("cab"), ordered=False) cond = [True] * len(i) expected = i result = i.where(klass(cond)) tm.assert_index_equal(result, expected) cond = [False] + [True] * (len(i) - 1) expected = CategoricalIndex([np.nan] + i[1:].tolist(), categories=i.categories) result = i.where(klass(cond)) tm.assert_index_equal(result, expected) def test_where_non_categories(self): ci = CategoricalIndex(["a", "b", "c", "d"]) mask = np.array([True, False, True, False]) result = ci.where(mask, 2) expected = Index(["a", 2, "c", 2], dtype=object) tm.assert_index_equal(result, expected) msg = "Cannot setitem on a Categorical with a new category" with pytest.raises(TypeError, match=msg): # Test the Categorical method directly ci._data._where(mask, 2) class TestContains: def test_contains(self): ci = CategoricalIndex(list("aabbca"), categories=list("cabdef"), ordered=False) assert "a" in ci assert "z" not in ci assert "e" not in ci assert np.nan not in ci # assert codes NOT in index assert 0 not in ci assert 1 not in ci def test_contains_nan(self): ci = CategoricalIndex(list("aabbca") + [np.nan], categories=list("cabdef")) assert np.nan in ci @pytest.mark.parametrize("unwrap", [True, False]) def test_contains_na_dtype(self, unwrap): dti = pd.date_range("2016-01-01", periods=100).insert(0, pd.NaT) pi = dti.to_period("D") tdi = dti - dti[-1] ci = CategoricalIndex(dti) obj = ci if unwrap: obj = ci._data assert np.nan in obj assert None in obj assert pd.NaT in obj assert np.datetime64("NaT") in obj assert np.timedelta64("NaT") not in obj obj2 = CategoricalIndex(tdi) if unwrap: obj2 = obj2._data assert np.nan in obj2 assert None in obj2 assert pd.NaT in obj2 assert np.datetime64("NaT") not in obj2 assert np.timedelta64("NaT") in obj2 obj3 = CategoricalIndex(pi) if unwrap: obj3 = obj3._data assert np.nan in obj3 assert None in obj3 assert pd.NaT in obj3 assert np.datetime64("NaT") not in obj3 assert np.timedelta64("NaT") not in obj3 @pytest.mark.parametrize( "item, expected", [ (pd.Interval(0, 1), True), (1.5, True), (pd.Interval(0.5, 1.5), False), ("a", False), (Timestamp(1), False), (pd.Timedelta(1), False), ], ids=str, ) def test_contains_interval(self, item, expected): # GH 23705 ci = CategoricalIndex(IntervalIndex.from_breaks(range(3))) result = item in ci assert result is expected def test_contains_list(self): # GH#21729 idx = CategoricalIndex([1, 2, 3]) assert "a" not in idx with pytest.raises(TypeError, match="unhashable type"): ["a"] in idx with pytest.raises(TypeError, match="unhashable type"): ["a", "b"] in idx