import re

import numpy as np
import pytest

from pandas import (
    Categorical,
    CategoricalDtype,
    CategoricalIndex,
    DataFrame,
    DatetimeIndex,
    Index,
    MultiIndex,
    Series,
    Timestamp,
    concat,
    get_dummies,
    period_range,
)
import pandas._testing as tm
from pandas.core.arrays import SparseArray


class TestGetitem:
    """Basic column-selection tests for ``DataFrame.__getitem__``."""

    def test_getitem_unused_level_raises(self):
        """A level value that no column code references raises KeyError."""
        # GH#20410
        mi = MultiIndex(
            levels=[["a_lot", "onlyone", "notevenone"], [1970, ""]],
            codes=[[1, 0], [1, 0]],
        )
        df = DataFrame(-1, index=range(3), columns=mi)

        with pytest.raises(KeyError, match="notevenone"):
            df["notevenone"]

    def test_getitem_periodindex(self):
        """PeriodIndex columns can be selected by element or by string."""
        rng = period_range("1/1/2000", periods=5)
        df = DataFrame(np.random.randn(10, 5), columns=rng)

        ts = df[rng[0]]
        tm.assert_series_equal(ts, df.iloc[:, 0])

        # GH#1211; smoketest unrelated to the rest of this test
        repr(df)

        ts = df["1/1/2000"]
        tm.assert_series_equal(ts, df.iloc[:, 0])

    def test_getitem_list_of_labels_categoricalindex_cols(self):
        """Selecting all columns by list works when columns come from a Categorical."""
        # GH#16115
        cats = Categorical([Timestamp("12-31-1999"), Timestamp("12-31-2000")])

        expected = DataFrame(
            [[1, 0], [0, 1]], dtype="uint8", index=[0, 1], columns=cats
        )
        dummies = get_dummies(cats)
        result = dummies[list(dummies.columns)]
        tm.assert_frame_equal(result, expected)

    def test_getitem_sparse_column_return_type_and_dtype(self):
        """A SparseArray column selected via ``[]``, ``iloc`` or ``loc`` equals
        ``Series(data, name="A")``.
        """
        # https://github.com/pandas-dev/pandas/issues/23559
        data = SparseArray([0, 1])
        df = DataFrame({"A": data})
        expected = Series(data, name="A")
        result = df["A"]
        tm.assert_series_equal(result, expected)

        # Also check iloc and loc while we're here
        result = df.iloc[:, 0]
        tm.assert_series_equal(result, expected)

        result = df.loc[:, "A"]
        tm.assert_series_equal(result, expected)


class TestGetitemListLike:
    """Tests for ``DataFrame.__getitem__`` / ``iloc`` with list-like keys."""

    def test_getitem_list_missing_key(self):
        """The KeyError message names the missing label with non-unique columns."""
        # GH#13822, incorrect error string with non-unique columns when missing
        # column is accessed
        df = DataFrame({"x": [1.0], "y": [2.0], "z": [3.0]})
        df.columns = ["x", "x", "z"]

        # Check that we get the correct value in the KeyError
        with pytest.raises(KeyError, match=r"\['y'\] not in index"):
            df[["x", "y", "z"]]

    def test_getitem_list_duplicates(self):
        """Selecting unique labels from duplicate columns keeps ``columns.name``."""
        # GH#1943
        df = DataFrame(np.random.randn(4, 4), columns=list("AABC"))
        df.columns.name = "foo"

        result = df[["B", "C"]]
        assert result.columns.name == "foo"

        expected = df.iloc[:, 2:]
        tm.assert_frame_equal(result, expected)

    def test_getitem_dupe_cols(self):
        """A list holding only a missing label raises KeyError with duplicate columns."""
        df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "a", "b"])
        msg = "\"None of [Index(['baf'], dtype='object')] are in the [columns]\""
        with pytest.raises(KeyError, match=re.escape(msg)):
            df[["baf"]]

    @pytest.mark.parametrize(
        "idx_type",
        [
            list,
            iter,
            Index,
            set,
            lambda labels: dict(zip(labels, range(len(labels)))),
            lambda labels: dict(zip(labels, range(len(labels)))).keys(),
        ],
        ids=["list", "iter", "Index", "set", "dict", "dict_keys"],
    )
    @pytest.mark.parametrize("levels", [1, 2])
    def test_getitem_listlike(self, idx_type, levels, float_frame):
        """Several list-like key containers select columns; a missing key raises.

        ``idx_type`` builds the key container from a list of labels and
        ``levels`` picks between the ``float_frame`` fixture and a frame whose
        columns are two-element tuples.
        """
        # GH#21294

        if levels == 1:
            frame, missing = float_frame, "food"
        else:
            # MultiIndex columns
            frame = DataFrame(
                np.random.randn(8, 3),
                columns=Index(
                    [("foo", "bar"), ("baz", "qux"), ("peek", "aboo")],
                    name=("sth", "sth2"),
                ),
            )
            missing = ("good", "food")

        keys = [frame.columns[1], frame.columns[0]]
        idx = idx_type(keys)
        # Build a second container for the expected value: ``idx`` may be a
        # one-shot iterator that the lookup below consumes.
        idx_check = list(idx_type(keys))

        if isinstance(idx, (set, dict)):
            with tm.assert_produces_warning(FutureWarning):
                result = frame[idx]
        else:
            result = frame[idx]

        expected = frame.loc[:, idx_check]
        expected.columns.names = frame.columns.names

        tm.assert_frame_equal(result, expected)

        idx = idx_type(keys + [missing])
        with pytest.raises(KeyError, match="not in index"):
            with tm.assert_produces_warning(FutureWarning):
                frame[idx]

    def test_getitem_iloc_generator(self):
        """``iloc`` accepts a generator of positions as a row indexer."""
        # GH#39614
        df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
        indexer = (x for x in [1, 2])
        result = df.iloc[indexer]
        expected = DataFrame({"a": [2, 3], "b": [5, 6]}, index=[1, 2])
        tm.assert_frame_equal(result, expected)

    def test_getitem_iloc_two_dimensional_generator(self):
        """``iloc`` accepts a generator as the row part of a two-axis indexer."""
        df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
        indexer = (x for x in [1, 2])
        result = df.iloc[indexer, 1]
        expected = Series([5, 6], name="b", index=[1, 2])
        tm.assert_series_equal(result, expected)


class TestGetitemCallable:
    """Tests for ``DataFrame.__getitem__`` with callable keys."""

    def test_getitem_callable(self, float_frame):
        """A callable key is applied and its return value is used as the key."""
        # GH#12533
        result = float_frame[lambda x: "A"]
        expected = float_frame.loc[:, "A"]
        tm.assert_series_equal(result, expected)

        result = float_frame[lambda x: ["A", "B"]]
        expected = float_frame.loc[:, ["A", "B"]]
        # Compare against ``expected`` rather than recomputing the selection.
        tm.assert_frame_equal(result, expected)

        df = float_frame[:3]
        result = df[lambda x: [True, False, True]]
        expected = float_frame.iloc[[0, 2], :]
        tm.assert_frame_equal(result, expected)

    def test_loc_multiindex_columns_one_level(self):
        """``[]`` and ``loc`` return a frame for columns built from one nested list."""
        # GH#29749
        df = DataFrame([[1, 2]], columns=[["a", "b"]])
        expected = DataFrame([1], columns=[["a"]])

        result = df["a"]
        tm.assert_frame_equal(result, expected)

        result = df.loc[:, "a"]
        tm.assert_frame_equal(result, expected)


class TestGetitemBooleanMask:
    """Tests for ``DataFrame.__getitem__`` with boolean masks."""

    def test_getitem_bool_mask_categorical_index(self):
        """Masks built by comparing a CategoricalIndex select the matching rows.

        Equality works for both ordered and unordered categoricals; ``<`` and
        ``>`` work only for the ordered one and raise TypeError otherwise.
        """
        df3 = DataFrame(
            {
                "A": np.arange(6, dtype="int64"),
            },
            index=CategoricalIndex(
                [1, 1, 2, 1, 3, 2],
                dtype=CategoricalDtype([3, 2, 1], ordered=True),
                name="B",
            ),
        )
        df4 = DataFrame(
            {
                "A": np.arange(6, dtype="int64"),
            },
            index=CategoricalIndex(
                [1, 1, 2, 1, 3, 2],
                dtype=CategoricalDtype([3, 2, 1], ordered=False),
                name="B",
            ),
        )

        # "a" is not one of the categories, so nothing matches
        result = df3[df3.index == "a"]
        expected = df3.iloc[[]]
        tm.assert_frame_equal(result, expected)

        result = df4[df4.index == "a"]
        expected = df4.iloc[[]]
        tm.assert_frame_equal(result, expected)

        result = df3[df3.index == 1]
        expected = df3.iloc[[0, 1, 3]]
        tm.assert_frame_equal(result, expected)

        result = df4[df4.index == 1]
        expected = df4.iloc[[0, 1, 3]]
        tm.assert_frame_equal(result, expected)

        # since we have an ordered categorical
        # CategoricalIndex([1, 1, 2, 1, 3, 2],
        #                  categories=[3, 2, 1],
        #                  ordered=True,
        #                  name='B')
        result = df3[df3.index < 2]
        expected = df3.iloc[[4]]
        tm.assert_frame_equal(result, expected)

        result = df3[df3.index > 1]
        expected = df3.iloc[[]]
        tm.assert_frame_equal(result, expected)

        # unordered
        # cannot be compared
        # CategoricalIndex([1, 1, 2, 1, 3, 2],
        #                  categories=[3, 2, 1],
        #                  ordered=False,
        #                  name='B')
        msg = "Unordered Categoricals can only compare equality or not"
        with pytest.raises(TypeError, match=msg):
            df4[df4.index < 2]
        with pytest.raises(TypeError, match=msg):
            df4[df4.index > 1]

    @pytest.mark.parametrize(
        "data1,data2,expected_data",
        (
            (
                [[1, 2], [3, 4]],
                [[0.5, 6], [7, 8]],
                [[np.nan, 3.0], [np.nan, 4.0], [np.nan, 7.0], [6.0, 8.0]],
            ),
            (
                [[1, 2], [3, 4]],
                [[5, 6], [7, 8]],
                [[np.nan, 3.0], [np.nan, 4.0], [5, 7], [6, 8]],
            ),
        ),
    )
    def test_getitem_bool_mask_duplicate_columns_mixed_dtypes(
        self,
        data1,
        data2,
        expected_data,
    ):
        """A frame-shaped boolean mask works on a frame with duplicate column labels."""
        # GH#31954

        df1 = DataFrame(np.array(data1))
        df2 = DataFrame(np.array(data2))
        df = concat([df1, df2], axis=1)

        result = df[df > 2]

        # ``expected_data`` is given column by column; relabel columns 2 and 3
        # to 0 and 1 to reproduce the duplicate labels produced by ``concat``.
        exdict = {i: np.array(col) for i, col in enumerate(expected_data)}
        expected = DataFrame(exdict).rename(columns={2: 0, 3: 1})
        tm.assert_frame_equal(result, expected)

    @pytest.fixture
    def df_dup_cols(self):
        """Return a (3, 4) float64 frame of 0..11 with columns A, A, C, D."""
        dups = ["A", "A", "C", "D"]
        df = DataFrame(np.arange(12).reshape(3, 4), columns=dups, dtype="float64")
        return df

    def test_getitem_boolean_frame_unaligned_with_duplicate_columns(self, df_dup_cols):
        """An unaligned boolean frame key on duplicate columns raises ValueError."""
        # `df.A > 6` is a DataFrame with a different shape from df

        # boolean with the duplicate raises
        df = df_dup_cols
        msg = "cannot reindex on an axis with duplicate labels"
        with pytest.raises(ValueError, match=msg):
            with tm.assert_produces_warning(FutureWarning, match="non-unique"):
                df[df.A > 6]

    def test_getitem_boolean_series_with_duplicate_columns(self, df_dup_cols):
        """A boolean Series key filters rows the same way with duplicate columns."""
        # boolean indexing
        # GH#4879
        df = DataFrame(
            np.arange(12).reshape(3, 4), columns=["A", "B", "C", "D"], dtype="float64"
        )
        expected = df[df.C > 6]
        expected.columns = df_dup_cols.columns

        df = df_dup_cols
        result = df[df.C > 6]

        tm.assert_frame_equal(result, expected)
        # smoke checks: neither expression is asserted on
        result.dtypes
        str(result)

    def test_getitem_boolean_frame_with_duplicate_columns(self, df_dup_cols):
        """An aligned boolean frame key masks values the same way with duplicate columns."""
        # where
        df = DataFrame(
            np.arange(12).reshape(3, 4), columns=["A", "B", "C", "D"], dtype="float64"
        )
        # `df > 6` is a DataFrame with the same shape+alignment as df
        expected = df[df > 6]
        expected.columns = df_dup_cols.columns

        df = df_dup_cols
        result = df[df > 6]

        tm.assert_frame_equal(result, expected)
        # smoke checks: neither expression is asserted on
        result.dtypes
        str(result)

    def test_getitem_empty_frame_with_boolean(self):
        """Masking an empty frame with ``df > 0`` returns an equal empty frame."""
        # Test for issue GH#11859

        df = DataFrame()
        df2 = df[df > 0]
        tm.assert_frame_equal(df, df2)


class TestGetitemSlice:
    """Tests for ``__getitem__`` with slice keys."""

    def test_getitem_slice_float64(self, frame_or_series):
        """Slicing a float64 index by value includes the end label.

        ``start`` and ``end`` are the labels at positions 5 and 15, and the
        expected result is ``iloc[5:16]``, for both ``[]`` and ``loc``.
        """
        values = np.arange(10.0, 50.0, 2)
        index = Index(values)

        start, end = values[[5, 15]]

        data = np.random.randn(20, 3)
        if frame_or_series is not DataFrame:
            # one-dimensional data for the non-DataFrame case
            data = data[:, 0]

        obj = frame_or_series(data, index=index)

        result = obj[start:end]
        expected = obj.iloc[5:16]
        tm.assert_equal(result, expected)

        result = obj.loc[start:end]
        tm.assert_equal(result, expected)

    def test_getitem_datetime_slice(self):
        """String-bounded slicing on this DatetimeIndex emits a FutureWarning."""
        # GH#43223
        df = DataFrame(
            {"a": 0},
            index=DatetimeIndex(
                [
                    "11.01.2011 22:00",
                    "11.01.2011 23:00",
                    "12.01.2011 00:00",
                    "2011-01-13 00:00",
                ]
            ),
        )
        with tm.assert_produces_warning(FutureWarning):
            result = df["2011-01-01":"2011-11-01"]
        expected = DataFrame(
            {"a": 0},
            index=DatetimeIndex(
                ["11.01.2011 22:00", "11.01.2011 23:00", "2011-01-13 00:00"]
            ),
        )
        tm.assert_frame_equal(result, expected)


class TestGetitemDeprecatedIndexers:
    """Tests for key types whose use in ``__getitem__`` emits FutureWarning."""

    @pytest.mark.parametrize("key", [{"a", "b"}, {"a": "a"}])
    def test_getitem_dict_and_set_deprecated(self, key):
        """Set and dict keys emit FutureWarning."""
        # GH#42825
        df = DataFrame(
            [[1, 2], [3, 4]], columns=MultiIndex.from_tuples([("a", 1), ("b", 2)])
        )
        with tm.assert_produces_warning(FutureWarning):
            df[key]