import re

import numpy as np
import pytest

# No longer called below (dtype checks use isinstance with CDT); the import is
# retained so the module's import list is unchanged.
from pandas.core.dtypes.common import is_categorical_dtype  # noqa: F401

import pandas as pd
from pandas import (
    Categorical,
    CategoricalIndex,
    DataFrame,
    Index,
    Interval,
    Series,
    Timedelta,
    Timestamp,
)
import pandas._testing as tm
from pandas.api.types import CategoricalDtype as CDT


class TestCategoricalIndex:
    """Indexing tests for frames/series with a CategoricalIndex or Categorical data."""

    def setup_method(self, method):
        """Build the two fixture frames used by the ``self.df``/``self.df2`` tests.

        Both frames hold ``A = 0..5`` under the index labels ``a a b b c a``.
        ``self.df`` uses categories ``c, a, b``; ``self.df2`` additionally has
        the category ``e``, which never appears among the index values.
        """
        self.df = DataFrame(
            {
                "A": np.arange(6, dtype="int64"),
            },
            index=CategoricalIndex(list("aabbca"), dtype=CDT(list("cab")), name="B"),
        )
        self.df2 = DataFrame(
            {
                "A": np.arange(6, dtype="int64"),
            },
            index=CategoricalIndex(list("aabbca"), dtype=CDT(list("cabe")), name="B"),
        )

    def test_loc_scalar(self):
        """Scalar ``.loc`` get/set on a CategoricalIndex, including a missing key."""
        dtype = CDT(list("cab"))
        result = self.df.loc["a"]
        bidx = Series(list("aaa"), name="B").astype(dtype)
        assert bidx.dtype == dtype

        expected = DataFrame({"A": [0, 1, 5]}, index=Index(bidx))
        tm.assert_frame_equal(result, expected)

        df = self.df.copy()
        df.loc["a"] = 20
        bidx2 = Series(list("aabbca"), name="B").astype(dtype)
        assert bidx2.dtype == dtype
        expected = DataFrame(
            {
                "A": [20, 20, 2, 3, 4, 20],
            },
            index=Index(bidx2),
        )
        tm.assert_frame_equal(df, expected)

        # value not in the categories
        with pytest.raises(KeyError, match=r"^'d'$"):
            df.loc["d"]

        # setting with a key outside the categories: the expected frame is
        # built by casting the index to object before adding the new row
        df2 = df.copy()
        expected = df2.copy()
        expected.index = expected.index.astype(object)
        expected.loc["d"] = 10
        df2.loc["d"] = 10
        tm.assert_frame_equal(df2, expected)

    def test_loc_setitem_with_expansion_non_category(self):
        """Setting-with-expansion using a key that is not among the categories."""
        # Setting-with-expansion with a new key "d" that is not among categories
        # NOTE: this writes to self.df directly rather than to a copy.
        df = self.df
        df.loc["a"] = 20

        # Setting a new row on an existing column
        df3 = df.copy()
        df3.loc["d", "A"] = 10
        bidx3 = Index(list("aabbcad"), name="B")
        expected3 = DataFrame(
            {
                "A": [20, 20, 2, 3, 4, 20, 10.0],
            },
            index=bidx3,
        )
        tm.assert_frame_equal(df3, expected3)

        # Setting a new row _and_ new column
        df4 = df.copy()
        df4.loc["d", "C"] = 10
        expected4 = DataFrame(
            {
                "A": [20, 20, 2, 3, 4, 20, np.nan],
                "C": [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, 10],
            },
            index=bidx3,
        )
        tm.assert_frame_equal(df4, expected4)

    def test_loc_getitem_scalar_non_category(self):
        """An integer key that is not a category raises ``KeyError``."""
        with pytest.raises(KeyError, match="^1$"):
            self.df.loc[1]

    def test_slicing(self):
        """Reverse-slice a categorical Series; row/slice access on a ``pd.cut`` column."""
        cat = Series(Categorical([1, 2, 3, 4]))
        reverse = cat[::-1]
        exp = np.array([4, 3, 2, 1], dtype=np.int64)
        tm.assert_numpy_array_equal(reverse.__array__(), exp)

        df = DataFrame({"value": (np.arange(100) + 1).astype("int64")})
        df["D"] = pd.cut(df.value, bins=[0, 25, 50, 75, 100])

        expected = Series([11, Interval(0, 25)], index=["value", "D"], name=10)
        result = df.iloc[10]
        tm.assert_series_equal(result, expected)

        expected = DataFrame(
            {"value": np.arange(11, 21).astype("int64")},
            index=np.arange(10, 20).astype("int64"),
        )
        expected["D"] = pd.cut(expected.value, bins=[0, 25, 50, 75, 100])
        result = df.iloc[10:20]
        tm.assert_frame_equal(result, expected)

        expected = Series([9, Interval(0, 25)], index=["value", "D"], name=8)
        result = df.loc[8]
        tm.assert_series_equal(result, expected)

    def test_slicing_and_getting_ops(self):
        """Check frame/row/column/scalar access through iloc, loc, iat, at and masks."""
        # systematically test the slicing operations:
        #  for all slicing ops:
        #   - returning a dataframe
        #   - returning a column
        #   - returning a row
        #   - returning a single value

        cats = Categorical(
            ["a", "c", "b", "c", "c", "c", "c"], categories=["a", "b", "c"]
        )
        idx = Index(["h", "i", "j", "k", "l", "m", "n"])
        values = [1, 2, 3, 4, 5, 6, 7]
        df = DataFrame({"cats": cats, "values": values}, index=idx)

        # the expected values
        cats2 = Categorical(["b", "c"], categories=["a", "b", "c"])
        idx2 = Index(["j", "k"])
        values2 = [3, 4]

        # 2:4,: | "j":"k",:
        exp_df = DataFrame({"cats": cats2, "values": values2}, index=idx2)

        # :,"cats" | :,0
        exp_col = Series(cats, index=idx, name="cats")

        # "j",: | 2,:
        exp_row = Series(["b", 3], index=["cats", "values"], dtype="object", name="j")

        # "j","cats | 2,0
        exp_val = "b"

        # iloc
        # frame
        res_df = df.iloc[2:4, :]
        tm.assert_frame_equal(res_df, exp_df)
        assert isinstance(res_df["cats"].dtype, CDT)

        # row
        res_row = df.iloc[2, :]
        tm.assert_series_equal(res_row, exp_row)
        assert isinstance(res_row["cats"], str)

        # col
        res_col = df.iloc[:, 0]
        tm.assert_series_equal(res_col, exp_col)
        assert isinstance(res_col.dtype, CDT)

        # single value
        res_val = df.iloc[2, 0]
        assert res_val == exp_val

        # loc
        # frame
        res_df = df.loc["j":"k", :]
        tm.assert_frame_equal(res_df, exp_df)
        assert isinstance(res_df["cats"].dtype, CDT)

        # row
        res_row = df.loc["j", :]
        tm.assert_series_equal(res_row, exp_row)
        assert isinstance(res_row["cats"], str)

        # col
        res_col = df.loc[:, "cats"]
        tm.assert_series_equal(res_col, exp_col)
        assert isinstance(res_col.dtype, CDT)

        # single value
        res_val = df.loc["j", "cats"]
        assert res_val == exp_val

        # single value, column label taken from df.columns
        res_val = df.loc["j", df.columns[0]]
        assert res_val == exp_val

        # iat
        res_val = df.iat[2, 0]
        assert res_val == exp_val

        # at
        res_val = df.at["j", "cats"]
        assert res_val == exp_val

        # fancy indexing
        exp_fancy = df.iloc[[2]]

        res_fancy = df[df["cats"] == "b"]
        tm.assert_frame_equal(res_fancy, exp_fancy)
        res_fancy = df[df["values"] == 3]
        tm.assert_frame_equal(res_fancy, exp_fancy)

        # scalar lookup via .at (this section was labelled "get_value")
        res_val = df.at["j", "cats"]
        assert res_val == exp_val

        # i : int, slice, or sequence of integers
        res_row = df.iloc[2]
        tm.assert_series_equal(res_row, exp_row)
        assert isinstance(res_row["cats"], str)

        res_df = df.iloc[slice(2, 4)]
        tm.assert_frame_equal(res_df, exp_df)
        assert isinstance(res_df["cats"].dtype, CDT)

        res_df = df.iloc[[2, 3]]
        tm.assert_frame_equal(res_df, exp_df)
        assert isinstance(res_df["cats"].dtype, CDT)

        res_col = df.iloc[:, 0]
        tm.assert_series_equal(res_col, exp_col)
        assert isinstance(res_col.dtype, CDT)

        res_df = df.iloc[:, slice(0, 2)]
        tm.assert_frame_equal(res_df, df)
        assert isinstance(res_df["cats"].dtype, CDT)

        res_df = df.iloc[:, [0, 1]]
        tm.assert_frame_equal(res_df, df)
        assert isinstance(res_df["cats"].dtype, CDT)

    def test_slicing_doc_examples(self):
        """Slices of a frame with a categorical column keep the full category set."""
        # GH 7918
        cats = Categorical(
            ["a", "b", "b", "b", "c", "c", "c"], categories=["a", "b", "c"]
        )
        idx = Index(["h", "i", "j", "k", "l", "m", "n"])
        values = [1, 2, 2, 2, 3, 4, 5]
        df = DataFrame({"cats": cats, "values": values}, index=idx)

        result = df.iloc[2:4, :]
        expected = DataFrame(
            {
                "cats": Categorical(["b", "b"], categories=["a", "b", "c"]),
                "values": [2, 2],
            },
            index=["j", "k"],
        )
        tm.assert_frame_equal(result, expected)

        result = df.iloc[2:4, :].dtypes
        expected = Series(["category", "int64"], ["cats", "values"])
        tm.assert_series_equal(result, expected)

        result = df.loc["h":"j", "cats"]
        expected = Series(
            Categorical(["a", "b", "b"], categories=["a", "b", "c"]),
            index=["h", "i", "j"],
            name="cats",
        )
        tm.assert_series_equal(result, expected)

        result = df.loc["h":"j", df.columns[0:1]]
        expected = DataFrame(
            {"cats": Categorical(["a", "b", "b"], categories=["a", "b", "c"])},
            index=["h", "i", "j"],
        )
        tm.assert_frame_equal(result, expected)

    def test_loc_getitem_listlike_labels(self):
        """A list of labels selects all matching rows, in the order of the list."""
        # list of labels
        result = self.df.loc[["c", "a"]]
        expected = self.df.iloc[[4, 0, 1, 5]]
        tm.assert_frame_equal(result, expected, check_index_type=True)

    def test_loc_getitem_listlike_unused_category(self):
        """A list containing an unused category raises ``KeyError``."""
        # GH#37901 a label that is in index.categories but not in index
        # listlike containing an element in the categories but not in the values
        with pytest.raises(KeyError, match=re.escape("['e'] not in index")):
            self.df2.loc[["a", "b", "e"]]

    def test_loc_getitem_label_unused_category(self):
        """A scalar key that is an unused category raises ``KeyError``."""
        # element in the categories but not in the values
        with pytest.raises(KeyError, match=r"^'e'$"):
            self.df2.loc["e"]

    def test_loc_getitem_non_category(self):
        """A list containing a label outside the categories raises ``KeyError``."""
        # not all labels in the categories
        with pytest.raises(KeyError, match=re.escape("['d'] not in index")):
            self.df2.loc[["a", "d"]]

    def test_loc_setitem_expansion_label_unused_category(self):
        """Assigning to an unused category adds a row and keeps the categories."""
        # assigning with a label that is in the categories but not in the index
        df = self.df2.copy()
        df.loc["e"] = 20
        result = df.loc[["a", "b", "e"]]
        exp_index = CategoricalIndex(list("aaabbe"), categories=list("cabe"), name="B")
        expected = DataFrame({"A": [0, 1, 5, 2, 3, 20]}, index=exp_index)
        tm.assert_frame_equal(result, expected)

    def test_loc_listlike_dtypes(self):
        """List-like ``.loc`` on a unique CategoricalIndex keeps the categories."""
        # GH 11586

        # unique categories and codes
        index = CategoricalIndex(["a", "b", "c"])
        df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=index)

        # unique slice
        res = df.loc[["a", "b"]]
        exp_index = CategoricalIndex(["a", "b"], categories=index.categories)
        exp = DataFrame({"A": [1, 2], "B": [4, 5]}, index=exp_index)
        tm.assert_frame_equal(res, exp, check_index_type=True)

        # duplicated slice
        res = df.loc[["a", "a", "b"]]

        exp_index = CategoricalIndex(["a", "a", "b"], categories=index.categories)
        exp = DataFrame({"A": [1, 1, 2], "B": [4, 4, 5]}, index=exp_index)
        tm.assert_frame_equal(res, exp, check_index_type=True)

        with pytest.raises(KeyError, match=re.escape("['x'] not in index")):
            df.loc[["a", "x"]]

    def test_loc_listlike_dtypes_duplicated_categories_and_codes(self):
        """List-like ``.loc`` when the index itself contains repeated labels."""
        # duplicated categories and codes
        index = CategoricalIndex(["a", "b", "a"])
        df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=index)

        # unique slice
        res = df.loc[["a", "b"]]
        exp = DataFrame(
            {"A": [1, 3, 2], "B": [4, 6, 5]}, index=CategoricalIndex(["a", "a", "b"])
        )
        tm.assert_frame_equal(res, exp, check_index_type=True)

        # duplicated slice
        res = df.loc[["a", "a", "b"]]
        exp = DataFrame(
            {"A": [1, 3, 1, 3, 2], "B": [4, 6, 4, 6, 5]},
            index=CategoricalIndex(["a", "a", "a", "a", "b"]),
        )
        tm.assert_frame_equal(res, exp, check_index_type=True)

        with pytest.raises(KeyError, match=re.escape("['x'] not in index")):
            df.loc[["a", "x"]]

    def test_loc_listlike_dtypes_unused_category(self):
        """List-like ``.loc`` results keep unused categories of the index."""
        # contains unused category
        index = CategoricalIndex(["a", "b", "a", "c"], categories=list("abcde"))
        df = DataFrame({"A": [1, 2, 3, 4], "B": [5, 6, 7, 8]}, index=index)

        res = df.loc[["a", "b"]]
        exp = DataFrame(
            {"A": [1, 3, 2], "B": [5, 7, 6]},
            index=CategoricalIndex(["a", "a", "b"], categories=list("abcde")),
        )
        tm.assert_frame_equal(res, exp, check_index_type=True)

        # duplicated slice
        res = df.loc[["a", "a", "b"]]
        exp = DataFrame(
            {"A": [1, 3, 1, 3, 2], "B": [5, 7, 5, 7, 6]},
            index=CategoricalIndex(["a", "a", "a", "a", "b"], categories=list("abcde")),
        )
        tm.assert_frame_equal(res, exp, check_index_type=True)

        with pytest.raises(KeyError, match=re.escape("['x'] not in index")):
            df.loc[["a", "x"]]

    def test_loc_getitem_listlike_unused_category_raises_keyerror(self):
        """Scalar and list-like lookups of an unused category both raise ``KeyError``."""
        # key that is an *unused* category raises
        index = CategoricalIndex(["a", "b", "a", "c"], categories=list("abcde"))
        df = DataFrame({"A": [1, 2, 3, 4], "B": [5, 6, 7, 8]}, index=index)

        # Anchored pattern: a bare "e" would match nearly any message
        # (pytest.raises applies ``match`` with re.search).
        with pytest.raises(KeyError, match=r"^'e'$"):
            # For comparison, check the scalar behavior
            df.loc["e"]

        with pytest.raises(KeyError, match=re.escape("['e'] not in index")):
            df.loc[["a", "e"]]

    def test_ix_categorical_index(self):
        """``.loc`` on categorical index/columns agrees with the plain-index frame."""
        # GH 12531
        df = DataFrame(np.random.randn(3, 3), index=list("ABC"), columns=list("XYZ"))
        cdf = df.copy()
        cdf.index = CategoricalIndex(df.index)
        cdf.columns = CategoricalIndex(df.columns)

        expect = Series(df.loc["A", :], index=cdf.columns, name="A")
        tm.assert_series_equal(cdf.loc["A", :], expect)

        expect = Series(df.loc[:, "X"], index=cdf.index, name="X")
        tm.assert_series_equal(cdf.loc[:, "X"], expect)

        exp_index = CategoricalIndex(list("AB"), categories=["A", "B", "C"])
        expect = DataFrame(df.loc[["A", "B"], :], columns=cdf.columns, index=exp_index)
        tm.assert_frame_equal(cdf.loc[["A", "B"], :], expect)

        exp_columns = CategoricalIndex(list("XY"), categories=["X", "Y", "Z"])
        expect = DataFrame(df.loc[:, ["X", "Y"]], index=cdf.index, columns=exp_columns)
        tm.assert_frame_equal(cdf.loc[:, ["X", "Y"]], expect)

    def test_ix_categorical_index_non_unique(self):
        """Same comparison as above, with repeated labels on both axes."""
        # non-unique
        df = DataFrame(np.random.randn(3, 3), index=list("ABA"), columns=list("XYX"))
        cdf = df.copy()
        cdf.index = CategoricalIndex(df.index)
        cdf.columns = CategoricalIndex(df.columns)

        exp_index = CategoricalIndex(list("AA"), categories=["A", "B"])
        expect = DataFrame(df.loc["A", :], columns=cdf.columns, index=exp_index)
        tm.assert_frame_equal(cdf.loc["A", :], expect)

        exp_columns = CategoricalIndex(list("XX"), categories=["X", "Y"])
        expect = DataFrame(df.loc[:, "X"], index=cdf.index, columns=exp_columns)
        tm.assert_frame_equal(cdf.loc[:, "X"], expect)

        expect = DataFrame(
            df.loc[["A", "B"], :],
            columns=cdf.columns,
            index=CategoricalIndex(list("AAB")),
        )
        tm.assert_frame_equal(cdf.loc[["A", "B"], :], expect)

        expect = DataFrame(
            df.loc[:, ["X", "Y"]],
            index=cdf.index,
            columns=CategoricalIndex(list("XXY")),
        )
        tm.assert_frame_equal(cdf.loc[:, ["X", "Y"]], expect)

    def test_loc_slice(self):
        """Integer slices raise ``TypeError``; label slices select the matching rows."""
        # GH9748
        msg = (
            "cannot do slice indexing on CategoricalIndex with these "
            r"indexers \[1\] of type int"
        )
        with pytest.raises(TypeError, match=msg):
            self.df.loc[1:5]

        result = self.df.loc["b":"c"]
        expected = self.df.iloc[[2, 3, 4]]
        tm.assert_frame_equal(result, expected)

    def test_loc_and_at_with_categorical_index(self):
        """``.loc`` and ``.at`` return the same scalars on a CategoricalIndex."""
        # GH 20629
        df = DataFrame(
            [[1, 2], [3, 4], [5, 6]], index=CategoricalIndex(["A", "B", "C"])
        )

        s = df[0]
        assert s.loc["A"] == 1
        assert s.at["A"] == 1

        assert df.loc["B", 1] == 4
        assert df.at["B", 1] == 4

    @pytest.mark.parametrize(
        "idx_values",
        [
            # python types
            [1, 2, 3],
            [-1, -2, -3],
            [1.5, 2.5, 3.5],
            [-1.5, -2.5, -3.5],
            # numpy int/uint
            *(np.array([1, 2, 3], dtype=dtype) for dtype in tm.ALL_INT_NUMPY_DTYPES),
            # numpy floats
            *(np.array([1.5, 2.5, 3.5], dtype=dtyp) for dtyp in tm.FLOAT_NUMPY_DTYPES),
            # numpy object
            np.array([1, "b", 3.5], dtype=object),
            # pandas scalars
            [Interval(1, 4), Interval(4, 6), Interval(6, 9)],
            [Timestamp(2019, 1, 1), Timestamp(2019, 2, 1), Timestamp(2019, 3, 1)],
            [Timedelta(1, "d"), Timedelta(2, "d"), Timedelta(3, "D")],
            # pandas Integer arrays
            *(pd.array([1, 2, 3], dtype=dtype) for dtype in tm.ALL_INT_EA_DTYPES),
            # other pandas arrays
            pd.IntervalIndex.from_breaks([1, 4, 6, 9]).array,
            pd.date_range("2019-01-01", periods=3).array,
            pd.timedelta_range(start="1d", periods=3).array,
        ],
    )
    def test_loc_getitem_with_non_string_categories(self, idx_values, ordered):
        """Scalar, list and slice ``.loc`` get/set with non-string categories.

        ``ordered`` is not defined in this file; presumably it is a pytest
        fixture supplied by a conftest (TODO confirm).
        """
        # GH-17569
        cat_idx = CategoricalIndex(idx_values, ordered=ordered)
        df = DataFrame({"A": ["foo", "bar", "baz"]}, index=cat_idx)
        sl = slice(idx_values[0], idx_values[1])

        # scalar selection
        result = df.loc[idx_values[0]]
        expected = Series(["foo"], index=["A"], name=idx_values[0])
        tm.assert_series_equal(result, expected)

        # list selection
        result = df.loc[idx_values[:2]]
        expected = DataFrame(["foo", "bar"], index=cat_idx[:2], columns=["A"])
        tm.assert_frame_equal(result, expected)

        # slice selection
        result = df.loc[sl]
        expected = DataFrame(["foo", "bar"], index=cat_idx[:2], columns=["A"])
        tm.assert_frame_equal(result, expected)

        # scalar assignment
        result = df.copy()
        result.loc[idx_values[0]] = "qux"
        expected = DataFrame({"A": ["qux", "bar", "baz"]}, index=cat_idx)
        tm.assert_frame_equal(result, expected)

        # list assignment
        result = df.copy()
        result.loc[idx_values[:2], "A"] = ["qux", "qux2"]
        expected = DataFrame({"A": ["qux", "qux2", "baz"]}, index=cat_idx)
        tm.assert_frame_equal(result, expected)

        # slice assignment
        result = df.copy()
        result.loc[sl, "A"] = ["qux", "qux2"]
        expected = DataFrame({"A": ["qux", "qux2", "baz"]}, index=cat_idx)
        tm.assert_frame_equal(result, expected)

    def test_getitem_categorical_with_nan(self):
        """NaN can be used as a key when the CategoricalIndex contains NaN."""
        # GH#41933
        ci = CategoricalIndex(["A", "B", np.nan])

        ser = Series(range(3), index=ci)

        assert ser[np.nan] == 2
        assert ser.loc[np.nan] == 2

        df = DataFrame(ser)
        assert df.loc[np.nan, 0] == 2
        assert df.loc[np.nan][0] == 2