from collections import ( OrderedDict, defaultdict, ) from datetime import datetime import numpy as np import pytest import pytz from pandas import ( DataFrame, Index, MultiIndex, Series, Timestamp, ) import pandas._testing as tm class TestDataFrameToDict: def test_to_dict_timestamp(self): # GH#11247 # split/records producing np.datetime64 rather than Timestamps # on datetime64[ns] dtypes only tsmp = Timestamp("20130101") test_data = DataFrame({"A": [tsmp, tsmp], "B": [tsmp, tsmp]}) test_data_mixed = DataFrame({"A": [tsmp, tsmp], "B": [1, 2]}) expected_records = [{"A": tsmp, "B": tsmp}, {"A": tsmp, "B": tsmp}] expected_records_mixed = [{"A": tsmp, "B": 1}, {"A": tsmp, "B": 2}] assert test_data.to_dict(orient="records") == expected_records assert test_data_mixed.to_dict(orient="records") == expected_records_mixed expected_series = { "A": Series([tsmp, tsmp], name="A"), "B": Series([tsmp, tsmp], name="B"), } expected_series_mixed = { "A": Series([tsmp, tsmp], name="A"), "B": Series([1, 2], name="B"), } tm.assert_dict_equal(test_data.to_dict(orient="series"), expected_series) tm.assert_dict_equal( test_data_mixed.to_dict(orient="series"), expected_series_mixed ) expected_split = { "index": [0, 1], "data": [[tsmp, tsmp], [tsmp, tsmp]], "columns": ["A", "B"], } expected_split_mixed = { "index": [0, 1], "data": [[tsmp, 1], [tsmp, 2]], "columns": ["A", "B"], } tm.assert_dict_equal(test_data.to_dict(orient="split"), expected_split) tm.assert_dict_equal( test_data_mixed.to_dict(orient="split"), expected_split_mixed ) def test_to_dict_index_not_unique_with_index_orient(self): # GH#22801 # Data loss when indexes are not unique. Raise ValueError. df = DataFrame({"a": [1, 2], "b": [0.5, 0.75]}, index=["A", "A"]) msg = "DataFrame index must be unique for orient='index'" with pytest.raises(ValueError, match=msg): df.to_dict(orient="index") def test_to_dict_invalid_orient(self): df = DataFrame({"A": [0, 1]}) msg = "orient 'xinvalid' not understood" with pytest.raises(ValueError, match=msg): df.to_dict(orient="xinvalid") @pytest.mark.parametrize("orient", ["d", "l", "r", "sp", "s", "i"]) def test_to_dict_short_orient_warns(self, orient): # GH#32515 df = DataFrame({"A": [0, 1]}) msg = "Using short name for 'orient' is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): df.to_dict(orient=orient) @pytest.mark.parametrize("mapping", [dict, defaultdict(list), OrderedDict]) def test_to_dict(self, mapping): # orient= should only take the listed options # see GH#32515 test_data = {"A": {"1": 1, "2": 2}, "B": {"1": "1", "2": "2", "3": "3"}} # GH#16122 recons_data = DataFrame(test_data).to_dict(into=mapping) for k, v in test_data.items(): for k2, v2 in v.items(): assert v2 == recons_data[k][k2] recons_data = DataFrame(test_data).to_dict("list", mapping) for k, v in test_data.items(): for k2, v2 in v.items(): assert v2 == recons_data[k][int(k2) - 1] recons_data = DataFrame(test_data).to_dict("series", mapping) for k, v in test_data.items(): for k2, v2 in v.items(): assert v2 == recons_data[k][k2] recons_data = DataFrame(test_data).to_dict("split", mapping) expected_split = { "columns": ["A", "B"], "index": ["1", "2", "3"], "data": [[1.0, "1"], [2.0, "2"], [np.nan, "3"]], } tm.assert_dict_equal(recons_data, expected_split) recons_data = DataFrame(test_data).to_dict("records", mapping) expected_records = [ {"A": 1.0, "B": "1"}, {"A": 2.0, "B": "2"}, {"A": np.nan, "B": "3"}, ] assert isinstance(recons_data, list) assert len(recons_data) == 3 for left, right in zip(recons_data, expected_records): tm.assert_dict_equal(left, right) # GH#10844 recons_data = DataFrame(test_data).to_dict("index") for k, v in test_data.items(): for k2, v2 in v.items(): assert v2 == recons_data[k2][k] df = DataFrame(test_data) df["duped"] = df[df.columns[0]] recons_data = df.to_dict("index") comp_data = test_data.copy() comp_data["duped"] = comp_data[df.columns[0]] for k, v in comp_data.items(): for k2, v2 in v.items(): assert v2 == recons_data[k2][k] @pytest.mark.parametrize("mapping", [list, defaultdict, []]) def test_to_dict_errors(self, mapping): # GH#16122 df = DataFrame(np.random.randn(3, 3)) msg = "|".join( [ "unsupported type: ", r"to_dict\(\) only accepts initialized defaultdicts", ] ) with pytest.raises(TypeError, match=msg): df.to_dict(into=mapping) def test_to_dict_not_unique_warning(self): # GH#16927: When converting to a dict, if a column has a non-unique name # it will be dropped, throwing a warning. df = DataFrame([[1, 2, 3]], columns=["a", "a", "b"]) with tm.assert_produces_warning(UserWarning): df.to_dict() # orient - orient argument to to_dict function # item_getter - function for extracting value from # the resulting dict using column name and index @pytest.mark.parametrize( "orient,item_getter", [ ("dict", lambda d, col, idx: d[col][idx]), ("records", lambda d, col, idx: d[idx][col]), ("list", lambda d, col, idx: d[col][idx]), ("split", lambda d, col, idx: d["data"][idx][d["columns"].index(col)]), ("index", lambda d, col, idx: d[idx][col]), ], ) def test_to_dict_box_scalars(self, orient, item_getter): # GH#14216, GH#23753 # make sure that we are boxing properly df = DataFrame({"a": [1, 2], "b": [0.1, 0.2]}) result = df.to_dict(orient=orient) assert isinstance(item_getter(result, "a", 0), int) assert isinstance(item_getter(result, "b", 0), float) def test_to_dict_tz(self): # GH#18372 When converting to dict with orient='records' columns of # datetime that are tz-aware were not converted to required arrays data = [ (datetime(2017, 11, 18, 21, 53, 0, 219225, tzinfo=pytz.utc),), (datetime(2017, 11, 18, 22, 6, 30, 61810, tzinfo=pytz.utc),), ] df = DataFrame(list(data), columns=["d"]) result = df.to_dict(orient="records") expected = [ {"d": Timestamp("2017-11-18 21:53:00.219225+0000", tz=pytz.utc)}, {"d": Timestamp("2017-11-18 22:06:30.061810+0000", tz=pytz.utc)}, ] tm.assert_dict_equal(result[0], expected[0]) tm.assert_dict_equal(result[1], expected[1]) @pytest.mark.parametrize( "into, expected", [ ( dict, { 0: {"int_col": 1, "float_col": 1.0}, 1: {"int_col": 2, "float_col": 2.0}, 2: {"int_col": 3, "float_col": 3.0}, }, ), ( OrderedDict, OrderedDict( [ (0, {"int_col": 1, "float_col": 1.0}), (1, {"int_col": 2, "float_col": 2.0}), (2, {"int_col": 3, "float_col": 3.0}), ] ), ), ( defaultdict(dict), defaultdict( dict, { 0: {"int_col": 1, "float_col": 1.0}, 1: {"int_col": 2, "float_col": 2.0}, 2: {"int_col": 3, "float_col": 3.0}, }, ), ), ], ) def test_to_dict_index_dtypes(self, into, expected): # GH#18580 # When using to_dict(orient='index') on a dataframe with int # and float columns only the int columns were cast to float df = DataFrame({"int_col": [1, 2, 3], "float_col": [1.0, 2.0, 3.0]}) result = df.to_dict(orient="index", into=into) cols = ["int_col", "float_col"] result = DataFrame.from_dict(result, orient="index")[cols] expected = DataFrame.from_dict(expected, orient="index")[cols] tm.assert_frame_equal(result, expected) def test_to_dict_numeric_names(self): # GH#24940 df = DataFrame({str(i): [i] for i in range(5)}) result = set(df.to_dict("records")[0].keys()) expected = set(df.columns) assert result == expected def test_to_dict_wide(self): # GH#24939 df = DataFrame({(f"A_{i:d}"): [i] for i in range(256)}) result = df.to_dict("records")[0] expected = {f"A_{i:d}": i for i in range(256)} assert result == expected @pytest.mark.parametrize( "data,dtype", ( ([True, True, False], bool), [ [ datetime(2018, 1, 1), datetime(2019, 2, 2), datetime(2020, 3, 3), ], Timestamp, ], [[1.0, 2.0, 3.0], float], [[1, 2, 3], int], [["X", "Y", "Z"], str], ), ) def test_to_dict_orient_dtype(self, data, dtype): # GH22620 & GH21256 df = DataFrame({"a": data}) d = df.to_dict(orient="records") assert all(type(record["a"]) is dtype for record in d) @pytest.mark.parametrize( "data,expected_dtype", ( [np.uint64(2), int], [np.int64(-9), int], [np.float64(1.1), float], [np.bool_(True), bool], [np.datetime64("2005-02-25"), Timestamp], ), ) def test_to_dict_scalar_constructor_orient_dtype(self, data, expected_dtype): # GH22620 & GH21256 df = DataFrame({"a": data}, index=[0]) d = df.to_dict(orient="records") result = type(d[0]["a"]) assert result is expected_dtype def test_to_dict_mixed_numeric_frame(self): # GH 12859 df = DataFrame({"a": [1.0], "b": [9.0]}) result = df.reset_index().to_dict("records") expected = [{"index": 0, "a": 1.0, "b": 9.0}] assert result == expected @pytest.mark.parametrize( "index", [ None, Index(["aa", "bb"]), Index(["aa", "bb"], name="cc"), MultiIndex.from_tuples([("a", "b"), ("a", "c")]), MultiIndex.from_tuples([("a", "b"), ("a", "c")], names=["n1", "n2"]), ], ) @pytest.mark.parametrize( "columns", [ ["x", "y"], Index(["x", "y"]), Index(["x", "y"], name="z"), MultiIndex.from_tuples([("x", 1), ("y", 2)]), MultiIndex.from_tuples([("x", 1), ("y", 2)], names=["z1", "z2"]), ], ) def test_to_dict_orient_tight(self, index, columns): df = DataFrame.from_records( [[1, 3], [2, 4]], columns=columns, index=index, ) roundtrip = DataFrame.from_dict(df.to_dict(orient="tight"), orient="tight") tm.assert_frame_equal(df, roundtrip)