""" test partial slicing on Series/Frame """ from datetime import datetime import numpy as np import pytest from pandas import ( DataFrame, DatetimeIndex, Index, Series, Timedelta, Timestamp, date_range, ) import pandas._testing as tm class TestSlicing: def test_string_index_series_name_converted(self): # GH#1644 df = DataFrame(np.random.randn(10, 4), index=date_range("1/1/2000", periods=10)) result = df.loc["1/3/2000"] assert result.name == df.index[2] result = df.T["1/3/2000"] assert result.name == df.index[2] def test_stringified_slice_with_tz(self): # GH#2658 start = "2013-01-07" idx = date_range(start=start, freq="1d", periods=10, tz="US/Eastern") df = DataFrame(np.arange(10), index=idx) df["2013-01-14 23:44:34.437768-05:00":] # no exception here def test_return_type_doesnt_depend_on_monotonicity(self): # GH#24892 we get Series back regardless of whether our DTI is monotonic dti = date_range(start="2015-5-13 23:59:00", freq="min", periods=3) ser = Series(range(3), index=dti) # non-monotonic index ser2 = Series(range(3), index=[dti[1], dti[0], dti[2]]) # key with resolution strictly lower than "min" key = "2015-5-14 00" # monotonic increasing index result = ser.loc[key] expected = ser.iloc[1:] tm.assert_series_equal(result, expected) # monotonic decreasing index result = ser.iloc[::-1].loc[key] expected = ser.iloc[::-1][:-1] tm.assert_series_equal(result, expected) # non-monotonic index result2 = ser2.loc[key] expected2 = ser2.iloc[::2] tm.assert_series_equal(result2, expected2) def test_return_type_doesnt_depend_on_monotonicity_higher_reso(self): # GH#24892 we get Series back regardless of whether our DTI is monotonic dti = date_range(start="2015-5-13 23:59:00", freq="min", periods=3) ser = Series(range(3), index=dti) # non-monotonic index ser2 = Series(range(3), index=[dti[1], dti[0], dti[2]]) # key with resolution strictly *higher) than "min" key = "2015-5-14 00:00:00" # monotonic increasing index result = ser.loc[key] assert result == 1 # monotonic decreasing index result = ser.iloc[::-1].loc[key] assert result == 1 # non-monotonic index result2 = ser2.loc[key] assert result2 == 0 def test_monotone_DTI_indexing_bug(self): # GH 19362 # Testing accessing the first element in a monotonic descending # partial string indexing. df = DataFrame(list(range(5))) date_list = [ "2018-01-02", "2017-02-10", "2016-03-10", "2015-03-15", "2014-03-16", ] date_index = DatetimeIndex(date_list) df["date"] = date_index expected = DataFrame({0: list(range(5)), "date": date_index}) tm.assert_frame_equal(df, expected) # We get a slice because df.index's resolution is hourly and we # are slicing with a daily-resolution string. If both were daily, # we would get a single item back dti = date_range("20170101 01:00:00", periods=3) df = DataFrame({"A": [1, 2, 3]}, index=dti[::-1]) expected = DataFrame({"A": 1}, index=dti[-1:][::-1]) result = df.loc["2017-01-03"] tm.assert_frame_equal(result, expected) result2 = df.iloc[::-1].loc["2017-01-03"] expected2 = expected.iloc[::-1] tm.assert_frame_equal(result2, expected2) def test_slice_year(self): dti = date_range(freq="B", start=datetime(2005, 1, 1), periods=500) s = Series(np.arange(len(dti)), index=dti) result = s["2005"] expected = s[s.index.year == 2005] tm.assert_series_equal(result, expected) df = DataFrame(np.random.rand(len(dti), 5), index=dti) result = df.loc["2005"] expected = df[df.index.year == 2005] tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "partial_dtime", [ "2019", "2019Q4", "Dec 2019", "2019-12-31", "2019-12-31 23", "2019-12-31 23:59", ], ) def test_slice_end_of_period_resolution(self, partial_dtime): # GH#31064 dti = date_range("2019-12-31 23:59:55.999999999", periods=10, freq="s") ser = Series(range(10), index=dti) result = ser[partial_dtime] expected = ser.iloc[:5] tm.assert_series_equal(result, expected) def test_slice_quarter(self): dti = date_range(freq="D", start=datetime(2000, 6, 1), periods=500) s = Series(np.arange(len(dti)), index=dti) assert len(s["2001Q1"]) == 90 df = DataFrame(np.random.rand(len(dti), 5), index=dti) assert len(df.loc["1Q01"]) == 90 def test_slice_month(self): dti = date_range(freq="D", start=datetime(2005, 1, 1), periods=500) s = Series(np.arange(len(dti)), index=dti) assert len(s["2005-11"]) == 30 df = DataFrame(np.random.rand(len(dti), 5), index=dti) assert len(df.loc["2005-11"]) == 30 tm.assert_series_equal(s["2005-11"], s["11-2005"]) def test_partial_slice(self): rng = date_range(freq="D", start=datetime(2005, 1, 1), periods=500) s = Series(np.arange(len(rng)), index=rng) result = s["2005-05":"2006-02"] expected = s["20050501":"20060228"] tm.assert_series_equal(result, expected) result = s["2005-05":] expected = s["20050501":] tm.assert_series_equal(result, expected) result = s[:"2006-02"] expected = s[:"20060228"] tm.assert_series_equal(result, expected) result = s["2005-1-1"] assert result == s.iloc[0] with pytest.raises(KeyError, match=r"^'2004-12-31'$"): s["2004-12-31"] def test_partial_slice_daily(self): rng = date_range(freq="H", start=datetime(2005, 1, 31), periods=500) s = Series(np.arange(len(rng)), index=rng) result = s["2005-1-31"] tm.assert_series_equal(result, s.iloc[:24]) with pytest.raises(KeyError, match=r"^'2004-12-31 00'$"): s["2004-12-31 00"] def test_partial_slice_hourly(self): rng = date_range(freq="T", start=datetime(2005, 1, 1, 20, 0, 0), periods=500) s = Series(np.arange(len(rng)), index=rng) result = s["2005-1-1"] tm.assert_series_equal(result, s.iloc[: 60 * 4]) result = s["2005-1-1 20"] tm.assert_series_equal(result, s.iloc[:60]) assert s["2005-1-1 20:00"] == s.iloc[0] with pytest.raises(KeyError, match=r"^'2004-12-31 00:15'$"): s["2004-12-31 00:15"] def test_partial_slice_minutely(self): rng = date_range(freq="S", start=datetime(2005, 1, 1, 23, 59, 0), periods=500) s = Series(np.arange(len(rng)), index=rng) result = s["2005-1-1 23:59"] tm.assert_series_equal(result, s.iloc[:60]) result = s["2005-1-1"] tm.assert_series_equal(result, s.iloc[:60]) assert s[Timestamp("2005-1-1 23:59:00")] == s.iloc[0] with pytest.raises(KeyError, match=r"^'2004-12-31 00:00:00'$"): s["2004-12-31 00:00:00"] def test_partial_slice_second_precision(self): rng = date_range( start=datetime(2005, 1, 1, 0, 0, 59, microsecond=999990), periods=20, freq="US", ) s = Series(np.arange(20), rng) tm.assert_series_equal(s["2005-1-1 00:00"], s.iloc[:10]) tm.assert_series_equal(s["2005-1-1 00:00:59"], s.iloc[:10]) tm.assert_series_equal(s["2005-1-1 00:01"], s.iloc[10:]) tm.assert_series_equal(s["2005-1-1 00:01:00"], s.iloc[10:]) assert s[Timestamp("2005-1-1 00:00:59.999990")] == s.iloc[0] with pytest.raises(KeyError, match="2005-1-1 00:00:00"): s["2005-1-1 00:00:00"] def test_partial_slicing_dataframe(self): # GH14856 # Test various combinations of string slicing resolution vs. # index resolution # - If string resolution is less precise than index resolution, # string is considered a slice # - If string resolution is equal to or more precise than index # resolution, string is considered an exact match formats = [ "%Y", "%Y-%m", "%Y-%m-%d", "%Y-%m-%d %H", "%Y-%m-%d %H:%M", "%Y-%m-%d %H:%M:%S", ] resolutions = ["year", "month", "day", "hour", "minute", "second"] for rnum, resolution in enumerate(resolutions[2:], 2): # we check only 'day', 'hour', 'minute' and 'second' unit = Timedelta("1 " + resolution) middate = datetime(2012, 1, 1, 0, 0, 0) index = DatetimeIndex([middate - unit, middate, middate + unit]) values = [1, 2, 3] df = DataFrame({"a": values}, index, dtype=np.int64) assert df.index.resolution == resolution # Timestamp with the same resolution as index # Should be exact match for Series (return scalar) # and raise KeyError for Frame for timestamp, expected in zip(index, values): ts_string = timestamp.strftime(formats[rnum]) # make ts_string as precise as index result = df["a"][ts_string] assert isinstance(result, np.int64) assert result == expected msg = rf"^'{ts_string}'$" with pytest.raises(KeyError, match=msg): df[ts_string] # Timestamp with resolution less precise than index for fmt in formats[:rnum]: for element, theslice in [[0, slice(None, 1)], [1, slice(1, None)]]: ts_string = index[element].strftime(fmt) # Series should return slice result = df["a"][ts_string] expected = df["a"][theslice] tm.assert_series_equal(result, expected) # Frame should return slice as well with tm.assert_produces_warning(FutureWarning): # GH#36179 deprecated this indexing result = df[ts_string] expected = df[theslice] tm.assert_frame_equal(result, expected) # Timestamp with resolution more precise than index # Compatible with existing key # Should return scalar for Series # and raise KeyError for Frame for fmt in formats[rnum + 1 :]: ts_string = index[1].strftime(fmt) result = df["a"][ts_string] assert isinstance(result, np.int64) assert result == 2 msg = rf"^'{ts_string}'$" with pytest.raises(KeyError, match=msg): df[ts_string] # Not compatible with existing key # Should raise KeyError for fmt, res in list(zip(formats, resolutions))[rnum + 1 :]: ts = index[1] + Timedelta("1 " + res) ts_string = ts.strftime(fmt) msg = rf"^'{ts_string}'$" with pytest.raises(KeyError, match=msg): df["a"][ts_string] with pytest.raises(KeyError, match=msg): df[ts_string] def test_partial_slicing_with_multiindex(self): # GH 4758 # partial string indexing with a multi-index buggy df = DataFrame( { "ACCOUNT": ["ACCT1", "ACCT1", "ACCT1", "ACCT2"], "TICKER": ["ABC", "MNP", "XYZ", "XYZ"], "val": [1, 2, 3, 4], }, index=date_range("2013-06-19 09:30:00", periods=4, freq="5T"), ) df_multi = df.set_index(["ACCOUNT", "TICKER"], append=True) expected = DataFrame( [[1]], index=Index(["ABC"], name="TICKER"), columns=["val"] ) result = df_multi.loc[("2013-06-19 09:30:00", "ACCT1")] tm.assert_frame_equal(result, expected) expected = df_multi.loc[ (Timestamp("2013-06-19 09:30:00", tz=None), "ACCT1", "ABC") ] result = df_multi.loc[("2013-06-19 09:30:00", "ACCT1", "ABC")] tm.assert_series_equal(result, expected) # partial string indexing on first level, scalar indexing on the other two result = df_multi.loc[("2013-06-19", "ACCT1", "ABC")] expected = df_multi.iloc[:1].droplevel([1, 2]) tm.assert_frame_equal(result, expected) def test_partial_slicing_with_multiindex_series(self): # GH 4294 # partial slice on a series mi ser = DataFrame( np.random.rand(1000, 1000), index=date_range("2000-1-1", periods=1000) ).stack() s2 = ser[:-1].copy() expected = s2["2000-1-4"] result = s2[Timestamp("2000-1-4")] tm.assert_series_equal(result, expected) result = ser[Timestamp("2000-1-4")] expected = ser["2000-1-4"] tm.assert_series_equal(result, expected) df2 = DataFrame(ser) expected = df2.xs("2000-1-4") result = df2.loc[Timestamp("2000-1-4")] tm.assert_frame_equal(result, expected) def test_partial_slice_doesnt_require_monotonicity(self): # For historical reasons. ser = Series(np.arange(10), date_range("2014-01-01", periods=10)) nonmonotonic = ser[[3, 5, 4]] expected = nonmonotonic.iloc[:0] timestamp = Timestamp("2014-01-10") with tm.assert_produces_warning(FutureWarning): result = nonmonotonic["2014-01-10":] tm.assert_series_equal(result, expected) with pytest.raises(KeyError, match=r"Timestamp\('2014-01-10 00:00:00'\)"): nonmonotonic[timestamp:] with tm.assert_produces_warning(FutureWarning): result = nonmonotonic.loc["2014-01-10":] tm.assert_series_equal(result, expected) with pytest.raises(KeyError, match=r"Timestamp\('2014-01-10 00:00:00'\)"): nonmonotonic.loc[timestamp:] def test_loc_datetime_length_one(self): # GH16071 df = DataFrame( columns=["1"], index=date_range("2016-10-01T00:00:00", "2016-10-01T23:59:59"), ) result = df.loc[datetime(2016, 10, 1) :] tm.assert_frame_equal(result, df) result = df.loc["2016-10-01T00:00:00":] tm.assert_frame_equal(result, df) @pytest.mark.parametrize( "start", [ "2018-12-02 21:50:00+00:00", Timestamp("2018-12-02 21:50:00+00:00"), Timestamp("2018-12-02 21:50:00+00:00").to_pydatetime(), ], ) @pytest.mark.parametrize( "end", [ "2018-12-02 21:52:00+00:00", Timestamp("2018-12-02 21:52:00+00:00"), Timestamp("2018-12-02 21:52:00+00:00").to_pydatetime(), ], ) def test_getitem_with_datestring_with_UTC_offset(self, start, end): # GH 24076 idx = date_range( start="2018-12-02 14:50:00-07:00", end="2018-12-02 14:50:00-07:00", freq="1min", ) df = DataFrame(1, index=idx, columns=["A"]) result = df[start:end] expected = df.iloc[0:3, :] tm.assert_frame_equal(result, expected) # GH 16785 start = str(start) end = str(end) with pytest.raises(ValueError, match="Both dates must"): df[start : end[:-4] + "1:00"] with pytest.raises(ValueError, match="The index must be timezone"): df = df.tz_localize(None) df[start:end] def test_slice_reduce_to_series(self): # GH 27516 df = DataFrame({"A": range(24)}, index=date_range("2000", periods=24, freq="M")) expected = Series( range(12), index=date_range("2000", periods=12, freq="M"), name="A" ) result = df.loc["2000", "A"] tm.assert_series_equal(result, expected)