import numpy as np
import pytest

from pandas import (
    Categorical,
    CategoricalIndex,
    Index,
    Interval,
    IntervalIndex,
    PeriodIndex,
    Series,
    Timedelta,
    Timestamp,
)
import pandas._testing as tm
import pandas.core.common as com
from pandas.tests.arrays.categorical.common import TestCategorical


class TestCategoricalIndexingWithFactor(TestCategorical):
    """
    Indexing tests that use ``self.factor``.

    NOTE(review): ``self.factor`` is not defined in this module; it is
    presumably set up by the ``TestCategorical`` base class -- confirm there.
    """

    def test_getitem(self):
        # Scalar positional access returns the category value.
        assert self.factor[0] == "a"
        assert self.factor[-1] == "c"

        # List-like positional access returns a Categorical; compare codes.
        subf = self.factor[[0, 1, 2]]
        tm.assert_numpy_array_equal(subf._codes, np.array([0, 1, 1], dtype=np.int8))

        # Boolean-mask access.
        subf = self.factor[np.asarray(self.factor) == "c"]
        tm.assert_numpy_array_equal(subf._codes, np.array([2, 2, 2], dtype=np.int8))

    def test_setitem(self):
        # int/positional
        c = self.factor.copy()
        c[0] = "b"
        assert c[0] == "b"
        c[-1] = "a"
        assert c[-1] == "a"

        # boolean
        c = self.factor.copy()
        indexer = np.zeros(len(c), dtype="bool")
        indexer[0] = True
        indexer[-1] = True
        c[indexer] = "c"
        expected = Categorical(["c", "b", "b", "a", "a", "c", "c", "c"], ordered=True)

        tm.assert_categorical_equal(c, expected)

    @pytest.mark.parametrize(
        "other",
        [Categorical(["b", "a"]), Categorical(["b", "a"], categories=["b", "a"])],
    )
    def test_setitem_same_but_unordered(self, other):
        # GH-24142
        # Unordered categoricals with the same set of categories can be
        # assigned into each other regardless of category order.
        target = Categorical(["a", "b"], categories=["a", "b"])
        mask = np.array([True, False])
        target[mask] = other[mask]
        expected = Categorical(["b", "b"], categories=["a", "b"])
        tm.assert_categorical_equal(target, expected)

    @pytest.mark.parametrize(
        "other",
        [
            Categorical(["b", "a"], categories=["b", "a", "c"]),
            Categorical(["b", "a"], categories=["a", "b", "c"]),
            Categorical(["a", "a"], categories=["a"]),
            Categorical(["b", "b"], categories=["b"]),
        ],
    )
    def test_setitem_different_unordered_raises(self, other):
        # GH-24142
        # A superset or subset of the target's categories is rejected.
        target = Categorical(["a", "b"], categories=["a", "b"])
        mask = np.array([True, False])
        msg = "Cannot set a Categorical with another, without identical categories"
        with pytest.raises(TypeError, match=msg):
            target[mask] = other[mask]

    @pytest.mark.parametrize(
        "other",
        [
            Categorical(["b", "a"]),
            Categorical(["b", "a"], categories=["b", "a"], ordered=True),
            Categorical(["b", "a"], categories=["a", "b", "c"], ordered=True),
        ],
    )
    def test_setitem_same_ordered_raises(self, other):
        # GH-24142
        # For an ordered target, a differing `ordered` flag, category order
        # or category set is rejected.
        target = Categorical(["a", "b"], categories=["a", "b"], ordered=True)
        mask = np.array([True, False])
        msg = "Cannot set a Categorical with another, without identical categories"
        with pytest.raises(TypeError, match=msg):
            target[mask] = other[mask]

    def test_setitem_tuple(self):
        # GH#20439
        cat = Categorical([(0, 1), (0, 2), (0, 1)])

        # This should not raise
        cat[1] = cat[0]
        assert cat[1] == (0, 1)

    def test_setitem_listlike(self):
        # GH#9469
        # properly coerce the input indexers

        # A local RandomState draws the same stream as seeding the global
        # generator, without leaking RNG state into other tests.
        rng = np.random.RandomState(1)
        cat = Categorical(
            rng.randint(0, 5, size=150000).astype(np.int8)
        ).add_categories([-1000])
        indexer = np.array([100000]).astype(np.int64)
        cat[indexer] = -1000

        # we are asserting the code result here
        # which maps to the -1000 category
        result = cat.codes[np.array([100000]).astype(np.int64)]
        tm.assert_numpy_array_equal(result, np.array([5], dtype="int8"))


class TestCategoricalIndexing:
    """Indexing, category assignment and ``where`` tests on Categorical data."""

    def test_getitem_slice(self):
        cat = Categorical(["a", "b", "c", "d", "a", "b", "c"])

        # An integer key returns a scalar ...
        sliced = cat[3]
        assert sliced == "d"

        # ... while a slice returns a Categorical that keeps all categories.
        sliced = cat[3:5]
        expected = Categorical(["d", "a"], categories=["a", "b", "c", "d"])
        tm.assert_categorical_equal(sliced, expected)

    def test_getitem_listlike(self):
        # GH 9469
        # properly coerce the input indexers

        # Local RandomState: same stream as the seeded global generator, but
        # no global RNG state is mutated.
        rng = np.random.RandomState(1)
        c = Categorical(rng.randint(0, 5, size=150000).astype(np.int8))
        indexer = np.array([100000]).astype(np.int64)

        # Indexing the Categorical must agree with indexing its codes.
        result = c[indexer].codes
        expected = c.codes[indexer]
        tm.assert_numpy_array_equal(result, expected)

    def test_periodindex(self):
        idx1 = PeriodIndex(
            ["2014-01", "2014-01", "2014-02", "2014-02", "2014-03", "2014-03"], freq="M"
        )

        cat1 = Categorical(idx1)
        # Smoke check: building the string representation must not raise.
        str(cat1)
        exp_arr = np.array([0, 0, 1, 1, 2, 2], dtype=np.int8)
        exp_idx = PeriodIndex(["2014-01", "2014-02", "2014-03"], freq="M")
        tm.assert_numpy_array_equal(cat1._codes, exp_arr)
        tm.assert_index_equal(cat1.categories, exp_idx)

        idx2 = PeriodIndex(
            ["2014-03", "2014-03", "2014-02", "2014-01", "2014-03", "2014-01"], freq="M"
        )
        cat2 = Categorical(idx2, ordered=True)
        # Smoke check: building the string representation must not raise.
        str(cat2)
        exp_arr = np.array([2, 2, 1, 0, 2, 0], dtype=np.int8)
        exp_idx2 = PeriodIndex(["2014-01", "2014-02", "2014-03"], freq="M")
        tm.assert_numpy_array_equal(cat2._codes, exp_arr)
        tm.assert_index_equal(cat2.categories, exp_idx2)

        idx3 = PeriodIndex(
            [
                "2013-12",
                "2013-11",
                "2013-10",
                "2013-09",
                "2013-08",
                "2013-07",
                "2013-05",
            ],
            freq="M",
        )
        cat3 = Categorical(idx3, ordered=True)
        exp_arr = np.array([6, 5, 4, 3, 2, 1, 0], dtype=np.int8)
        exp_idx = PeriodIndex(
            [
                "2013-05",
                "2013-07",
                "2013-08",
                "2013-09",
                "2013-10",
                "2013-11",
                "2013-12",
            ],
            freq="M",
        )
        tm.assert_numpy_array_equal(cat3._codes, exp_arr)
        tm.assert_index_equal(cat3.categories, exp_idx)

    def test_categories_assignments(self):
        # Assigning new categories relabels the existing values in place.
        cat = Categorical(["a", "b", "c", "a"])
        exp = np.array([1, 2, 3, 1], dtype=np.int64)
        cat.categories = [1, 2, 3]
        tm.assert_numpy_array_equal(cat.__array__(), exp)
        tm.assert_index_equal(cat.categories, Index([1, 2, 3]))

    @pytest.mark.parametrize("new_categories", [[1, 2, 3, 4], [1, 2]])
    def test_categories_assignments_wrong_length_raises(self, new_categories):
        cat = Categorical(["a", "b", "c", "a"])
        msg = (
            "new categories need to have the same number of items "
            "as the old categories!"
        )
        with pytest.raises(ValueError, match=msg):
            cat.categories = new_categories

    # Combinations of sorted/unique:
    @pytest.mark.parametrize(
        "idx_values", [[1, 2, 3, 4], [1, 3, 2, 4], [1, 3, 3, 4], [1, 2, 2, 4]]
    )
    # Combinations of missing/unique
    @pytest.mark.parametrize("key_values", [[1, 2], [1, 5], [1, 1], [5, 5]])
    @pytest.mark.parametrize("key_class", [Categorical, CategoricalIndex])
    @pytest.mark.parametrize("dtype", [None, "category", "key"])
    def test_get_indexer_non_unique(self, idx_values, key_values, key_class, dtype):
        # GH 21448
        key = key_class(key_values, categories=range(1, 5))

        # "key" is a placeholder meaning "use the same dtype as the key".
        if dtype == "key":
            dtype = key.dtype

        # Test for flat index and CategoricalIndex with same/different cats:
        idx = Index(idx_values, dtype=dtype)
        expected, exp_miss = idx.get_indexer_non_unique(key_values)
        result, res_miss = idx.get_indexer_non_unique(key)

        tm.assert_numpy_array_equal(expected, result)
        tm.assert_numpy_array_equal(exp_miss, res_miss)

        exp_unique = idx.unique().get_indexer(key_values)
        res_unique = idx.unique().get_indexer(key)
        tm.assert_numpy_array_equal(res_unique, exp_unique)

    def test_where_unobserved_nan(self):
        # Masked-out positions become missing; the categories are kept.
        ser = Series(Categorical(["a", "b"]))
        result = ser.where([True, False])
        expected = Series(Categorical(["a", None], categories=["a", "b"]))
        tm.assert_series_equal(result, expected)

        # all NA
        ser = Series(Categorical(["a", "b"]))
        result = ser.where([False, False])
        expected = Series(Categorical([None, None], categories=["a", "b"]))
        tm.assert_series_equal(result, expected)

    def test_where_unobserved_categories(self):
        ser = Series(Categorical(["a", "b", "c"], categories=["d", "c", "b", "a"]))
        result = ser.where([True, True, False], other="b")
        expected = Series(Categorical(["a", "b", "b"], categories=ser.cat.categories))
        tm.assert_series_equal(result, expected)

    def test_where_other_categorical(self):
        ser = Series(Categorical(["a", "b", "c"], categories=["d", "c", "b", "a"]))
        # Same set of categories as `ser`, listed in a different order.
        other = Categorical(["b", "c", "a"], categories=["a", "c", "b", "d"])
        result = ser.where([True, False, True], other)
        expected = Series(Categorical(["a", "c", "c"], dtype=ser.dtype))
        tm.assert_series_equal(result, expected)

    def test_where_new_category_raises(self):
        ser = Series(Categorical(["a", "b", "c"]))
        msg = "Cannot setitem on a Categorical with a new category"
        with pytest.raises(TypeError, match=msg):
            ser.where([True, False, True], "d")

    def test_where_ordered_differs_rasies(self):
        # Both sides are ordered but disagree on the category order.
        ser = Series(
            Categorical(["a", "b", "c"], categories=["d", "c", "b", "a"], ordered=True)
        )
        other = Categorical(
            ["b", "c", "a"], categories=["a", "c", "b", "d"], ordered=True
        )
        with pytest.raises(TypeError, match="without identical categories"):
            ser.where([True, False, True], other)


class TestContains:
    """Tests for the ``in`` operator on Categorical."""

    def test_contains(self):
        # GH#21508
        cat = Categorical(list("aabbca"), categories=list("cab"))

        assert "b" in cat
        assert "z" not in cat
        assert np.nan not in cat
        with pytest.raises(TypeError, match="unhashable type: 'list'"):
            assert [1] in cat

        # assert codes NOT in index
        assert 0 not in cat
        assert 1 not in cat

        cat = Categorical(list("aabbca") + [np.nan], categories=list("cab"))
        assert np.nan in cat

    @pytest.mark.parametrize(
        "item, expected",
        [
            (Interval(0, 1), True),
            (1.5, True),
            (Interval(0.5, 1.5), False),
            ("a", False),
            (Timestamp(1), False),
            (Timedelta(1), False),
        ],
        ids=str,
    )
    def test_contains_interval(self, item, expected):
        # GH#23705
        cat = Categorical(IntervalIndex.from_breaks(range(3)))
        result = item in cat
        assert result is expected

    def test_contains_list(self):
        # GH#21729
        cat = Categorical([1, 2, 3])

        assert "a" not in cat

        # The bare `in` expressions below exist only to trigger the TypeError.
        with pytest.raises(TypeError, match="unhashable type"):
            ["a"] in cat

        with pytest.raises(TypeError, match="unhashable type"):
            ["a", "b"] in cat


@pytest.mark.parametrize("index", [True, False])
def test_mask_with_boolean(index):
    """A boolean Categorical/CategoricalIndex masks like its object form."""
    ser = Series(range(3))
    idx = Categorical([True, False, True])
    if index:
        idx = CategoricalIndex(idx)

    assert com.is_bool_indexer(idx)
    result = ser[idx]
    expected = ser[idx.astype("object")]
    tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("index", [True, False])
def test_mask_with_boolean_na_treated_as_false(index):
    """A missing entry in a boolean categorical mask behaves like False."""
    # https://github.com/pandas-dev/pandas/issues/31503
    ser = Series(range(3))
    idx = Categorical([True, False, None])
    if index:
        idx = CategoricalIndex(idx)

    result = ser[idx]
    expected = ser[idx.fillna(False)]

    tm.assert_series_equal(result, expected)


@pytest.fixture
def non_coercible_categorical(monkeypatch):
    """
    Monkeypatch Categorical.__array__ to ensure no implicit conversion.

    Raises
    ------
    ValueError
        When Categorical.__array__ is called.
    """
    # TODO(Categorical): identify other places where this may be
    # useful and move to a conftest.py
    def array(self, dtype=None, copy=None):
        # `copy` is accepted so that callers passing it (NumPy >= 2 does)
        # hit the intended ValueError rather than a signature mismatch.
        raise ValueError("I cannot be converted.")

    # The patch is undone when the context exits, i.e. after the test ran.
    with monkeypatch.context() as m:
        m.setattr(Categorical, "__array__", array)
        yield


def test_series_at(non_coercible_categorical):
    """``Series.at`` works while ``Categorical.__array__`` is patched to raise."""
    arr = Categorical(["a", "b", "c"])
    ser = Series(arr)
    result = ser.at[0]
    assert result == "a"