from datetime import ( date, datetime, timedelta, ) from functools import partial from io import BytesIO import os import re import numpy as np import pytest import pandas.util._test_decorators as td import pandas as pd from pandas import ( DataFrame, Index, MultiIndex, get_option, set_option, ) import pandas._testing as tm from pandas.io.excel import ( ExcelFile, ExcelWriter, _OpenpyxlWriter, _XlsxWriter, _XlwtWriter, register_writer, ) @pytest.fixture def path(ext): """ Fixture to open file for use in each test case. """ with tm.ensure_clean(ext) as file_path: yield file_path @pytest.fixture def set_engine(engine, ext): """ Fixture to set engine for use in each test case. Rather than requiring `engine=...` to be provided explicitly as an argument in each test, this fixture sets a global option to dictate which engine should be used to write Excel files. After executing the test it rolls back said change to the global option. """ option_name = f"io.excel.{ext.strip('.')}.writer" prev_engine = get_option(option_name) set_option(option_name, engine) yield set_option(option_name, prev_engine) # Roll back option change @pytest.mark.parametrize( "ext", [ pytest.param(".xlsx", marks=[td.skip_if_no("openpyxl"), td.skip_if_no("xlrd")]), pytest.param(".xlsm", marks=[td.skip_if_no("openpyxl"), td.skip_if_no("xlrd")]), pytest.param(".xls", marks=[td.skip_if_no("xlwt"), td.skip_if_no("xlrd")]), pytest.param( ".xlsx", marks=[td.skip_if_no("xlsxwriter"), td.skip_if_no("xlrd")] ), pytest.param(".ods", marks=td.skip_if_no("odf")), ], ) class TestRoundTrip: @pytest.mark.parametrize( "header,expected", [(None, DataFrame([np.nan] * 4)), (0, DataFrame({"Unnamed: 0": [np.nan] * 3}))], ) def test_read_one_empty_col_no_header(self, ext, header, expected): # xref gh-12292 filename = "no_header" df = DataFrame([["", 1, 100], ["", 2, 200], ["", 3, 300], ["", 4, 400]]) with tm.ensure_clean(ext) as path: df.to_excel(path, filename, index=False, header=False) result = pd.read_excel( path, sheet_name=filename, usecols=[0], header=header ) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "header,expected", [(None, DataFrame([0] + [np.nan] * 4)), (0, DataFrame([np.nan] * 4))], ) def test_read_one_empty_col_with_header(self, ext, header, expected): filename = "with_header" df = DataFrame([["", 1, 100], ["", 2, 200], ["", 3, 300], ["", 4, 400]]) with tm.ensure_clean(ext) as path: df.to_excel(path, "with_header", index=False, header=True) result = pd.read_excel( path, sheet_name=filename, usecols=[0], header=header ) tm.assert_frame_equal(result, expected) def test_set_column_names_in_parameter(self, ext): # GH 12870 : pass down column names associated with # keyword argument names refdf = DataFrame([[1, "foo"], [2, "bar"], [3, "baz"]], columns=["a", "b"]) with tm.ensure_clean(ext) as pth: with ExcelWriter(pth) as writer: refdf.to_excel(writer, "Data_no_head", header=False, index=False) refdf.to_excel(writer, "Data_with_head", index=False) refdf.columns = ["A", "B"] with ExcelFile(pth) as reader: xlsdf_no_head = pd.read_excel( reader, sheet_name="Data_no_head", header=None, names=["A", "B"] ) xlsdf_with_head = pd.read_excel( reader, sheet_name="Data_with_head", index_col=None, names=["A", "B"], ) tm.assert_frame_equal(xlsdf_no_head, refdf) tm.assert_frame_equal(xlsdf_with_head, refdf) def test_creating_and_reading_multiple_sheets(self, ext): # see gh-9450 # # Test reading multiple sheets, from a runtime # created Excel file with multiple sheets. def tdf(col_sheet_name): d, i = [11, 22, 33], [1, 2, 3] return DataFrame(d, i, columns=[col_sheet_name]) sheets = ["AAA", "BBB", "CCC"] dfs = [tdf(s) for s in sheets] dfs = dict(zip(sheets, dfs)) with tm.ensure_clean(ext) as pth: with ExcelWriter(pth) as ew: for sheetname, df in dfs.items(): df.to_excel(ew, sheetname) dfs_returned = pd.read_excel(pth, sheet_name=sheets, index_col=0) for s in sheets: tm.assert_frame_equal(dfs[s], dfs_returned[s]) def test_read_excel_multiindex_empty_level(self, ext): # see gh-12453 with tm.ensure_clean(ext) as path: df = DataFrame( { ("One", "x"): {0: 1}, ("Two", "X"): {0: 3}, ("Two", "Y"): {0: 7}, ("Zero", ""): {0: 0}, } ) expected = DataFrame( { ("One", "x"): {0: 1}, ("Two", "X"): {0: 3}, ("Two", "Y"): {0: 7}, ("Zero", "Unnamed: 4_level_1"): {0: 0}, } ) df.to_excel(path) actual = pd.read_excel(path, header=[0, 1], index_col=0) tm.assert_frame_equal(actual, expected) df = DataFrame( { ("Beg", ""): {0: 0}, ("Middle", "x"): {0: 1}, ("Tail", "X"): {0: 3}, ("Tail", "Y"): {0: 7}, } ) expected = DataFrame( { ("Beg", "Unnamed: 1_level_1"): {0: 0}, ("Middle", "x"): {0: 1}, ("Tail", "X"): {0: 3}, ("Tail", "Y"): {0: 7}, } ) df.to_excel(path) actual = pd.read_excel(path, header=[0, 1], index_col=0) tm.assert_frame_equal(actual, expected) @pytest.mark.parametrize("c_idx_names", [True, False]) @pytest.mark.parametrize("r_idx_names", [True, False]) @pytest.mark.parametrize("c_idx_levels", [1, 3]) @pytest.mark.parametrize("r_idx_levels", [1, 3]) def test_excel_multindex_roundtrip( self, ext, c_idx_names, r_idx_names, c_idx_levels, r_idx_levels, request ): # see gh-4679 with tm.ensure_clean(ext) as pth: if (c_idx_levels == 1 and c_idx_names) and not ( r_idx_levels == 3 and not r_idx_names ): mark = pytest.mark.xfail( reason="Column index name cannot be serialized unless " "it's a MultiIndex" ) request.node.add_marker(mark) # Empty name case current read in as # unnamed levels, not Nones. check_names = r_idx_names or r_idx_levels <= 1 df = tm.makeCustomDataframe( 5, 5, c_idx_names, r_idx_names, c_idx_levels, r_idx_levels ) df.to_excel(pth) act = pd.read_excel( pth, index_col=list(range(r_idx_levels)), header=list(range(c_idx_levels)), ) tm.assert_frame_equal(df, act, check_names=check_names) df.iloc[0, :] = np.nan df.to_excel(pth) act = pd.read_excel( pth, index_col=list(range(r_idx_levels)), header=list(range(c_idx_levels)), ) tm.assert_frame_equal(df, act, check_names=check_names) df.iloc[-1, :] = np.nan df.to_excel(pth) act = pd.read_excel( pth, index_col=list(range(r_idx_levels)), header=list(range(c_idx_levels)), ) tm.assert_frame_equal(df, act, check_names=check_names) def test_read_excel_parse_dates(self, ext): # see gh-11544, gh-12051 df = DataFrame( {"col": [1, 2, 3], "date_strings": pd.date_range("2012-01-01", periods=3)} ) df2 = df.copy() df2["date_strings"] = df2["date_strings"].dt.strftime("%m/%d/%Y") with tm.ensure_clean(ext) as pth: df2.to_excel(pth) res = pd.read_excel(pth, index_col=0) tm.assert_frame_equal(df2, res) res = pd.read_excel(pth, parse_dates=["date_strings"], index_col=0) tm.assert_frame_equal(df, res) date_parser = lambda x: datetime.strptime(x, "%m/%d/%Y") res = pd.read_excel( pth, parse_dates=["date_strings"], date_parser=date_parser, index_col=0 ) tm.assert_frame_equal(df, res) def test_multiindex_interval_datetimes(self, ext): # GH 30986 midx = MultiIndex.from_arrays( [ range(4), pd.interval_range( start=pd.Timestamp("2020-01-01"), periods=4, freq="6M" ), ] ) df = DataFrame(range(4), index=midx) with tm.ensure_clean(ext) as pth: df.to_excel(pth) result = pd.read_excel(pth, index_col=[0, 1]) expected = DataFrame( range(4), MultiIndex.from_arrays( [ range(4), [ "(2020-01-31, 2020-07-31]", "(2020-07-31, 2021-01-31]", "(2021-01-31, 2021-07-31]", "(2021-07-31, 2022-01-31]", ], ] ), ) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "engine,ext", [ pytest.param( "openpyxl", ".xlsx", marks=[td.skip_if_no("openpyxl"), td.skip_if_no("xlrd")], ), pytest.param( "openpyxl", ".xlsm", marks=[td.skip_if_no("openpyxl"), td.skip_if_no("xlrd")], ), pytest.param( "xlwt", ".xls", marks=[td.skip_if_no("xlwt"), td.skip_if_no("xlrd")] ), pytest.param( "xlsxwriter", ".xlsx", marks=[td.skip_if_no("xlsxwriter"), td.skip_if_no("xlrd")], ), pytest.param("odf", ".ods", marks=td.skip_if_no("odf")), ], ) @pytest.mark.usefixtures("set_engine") class TestExcelWriter: def test_excel_sheet_size(self, path): # GH 26080 breaking_row_count = 2**20 + 1 breaking_col_count = 2**14 + 1 # purposely using two arrays to prevent memory issues while testing row_arr = np.zeros(shape=(breaking_row_count, 1)) col_arr = np.zeros(shape=(1, breaking_col_count)) row_df = DataFrame(row_arr) col_df = DataFrame(col_arr) msg = "sheet is too large" with pytest.raises(ValueError, match=msg): row_df.to_excel(path) with pytest.raises(ValueError, match=msg): col_df.to_excel(path) def test_excel_sheet_by_name_raise(self, path, engine): gt = DataFrame(np.random.randn(10, 2)) gt.to_excel(path) with ExcelFile(path) as xl: df = pd.read_excel(xl, sheet_name=0, index_col=0) tm.assert_frame_equal(gt, df) msg = "Worksheet named '0' not found" with pytest.raises(ValueError, match=msg): pd.read_excel(xl, "0") def test_excel_writer_context_manager(self, frame, path): with ExcelWriter(path) as writer: frame.to_excel(writer, "Data1") frame2 = frame.copy() frame2.columns = frame.columns[::-1] frame2.to_excel(writer, "Data2") with ExcelFile(path) as reader: found_df = pd.read_excel(reader, sheet_name="Data1", index_col=0) found_df2 = pd.read_excel(reader, sheet_name="Data2", index_col=0) tm.assert_frame_equal(found_df, frame) tm.assert_frame_equal(found_df2, frame2) def test_roundtrip(self, frame, path): frame = frame.copy() frame["A"][:5] = np.nan frame.to_excel(path, "test1") frame.to_excel(path, "test1", columns=["A", "B"]) frame.to_excel(path, "test1", header=False) frame.to_excel(path, "test1", index=False) # test roundtrip frame.to_excel(path, "test1") recons = pd.read_excel(path, sheet_name="test1", index_col=0) tm.assert_frame_equal(frame, recons) frame.to_excel(path, "test1", index=False) recons = pd.read_excel(path, sheet_name="test1", index_col=None) recons.index = frame.index tm.assert_frame_equal(frame, recons) frame.to_excel(path, "test1", na_rep="NA") recons = pd.read_excel(path, sheet_name="test1", index_col=0, na_values=["NA"]) tm.assert_frame_equal(frame, recons) # GH 3611 frame.to_excel(path, "test1", na_rep="88") recons = pd.read_excel(path, sheet_name="test1", index_col=0, na_values=["88"]) tm.assert_frame_equal(frame, recons) frame.to_excel(path, "test1", na_rep="88") recons = pd.read_excel( path, sheet_name="test1", index_col=0, na_values=[88, 88.0] ) tm.assert_frame_equal(frame, recons) # GH 6573 frame.to_excel(path, "Sheet1") recons = pd.read_excel(path, index_col=0) tm.assert_frame_equal(frame, recons) frame.to_excel(path, "0") recons = pd.read_excel(path, index_col=0) tm.assert_frame_equal(frame, recons) # GH 8825 Pandas Series should provide to_excel method s = frame["A"] s.to_excel(path) recons = pd.read_excel(path, index_col=0) tm.assert_frame_equal(s.to_frame(), recons) def test_mixed(self, frame, path): mixed_frame = frame.copy() mixed_frame["foo"] = "bar" mixed_frame.to_excel(path, "test1") with ExcelFile(path) as reader: recons = pd.read_excel(reader, sheet_name="test1", index_col=0) tm.assert_frame_equal(mixed_frame, recons) def test_ts_frame(self, tsframe, path): df = tsframe # freq doesn't round-trip index = pd.DatetimeIndex(np.asarray(df.index), freq=None) df.index = index df.to_excel(path, "test1") with ExcelFile(path) as reader: recons = pd.read_excel(reader, sheet_name="test1", index_col=0) tm.assert_frame_equal(df, recons) def test_basics_with_nan(self, frame, path): frame = frame.copy() frame["A"][:5] = np.nan frame.to_excel(path, "test1") frame.to_excel(path, "test1", columns=["A", "B"]) frame.to_excel(path, "test1", header=False) frame.to_excel(path, "test1", index=False) @pytest.mark.parametrize("np_type", [np.int8, np.int16, np.int32, np.int64]) def test_int_types(self, np_type, path): # Test np.int values read come back as int # (rather than float which is Excel's format). df = DataFrame(np.random.randint(-10, 10, size=(10, 2)), dtype=np_type) df.to_excel(path, "test1") with ExcelFile(path) as reader: recons = pd.read_excel(reader, sheet_name="test1", index_col=0) int_frame = df.astype(np.int64) tm.assert_frame_equal(int_frame, recons) recons2 = pd.read_excel(path, sheet_name="test1", index_col=0) tm.assert_frame_equal(int_frame, recons2) # Test with convert_float=False comes back as float. float_frame = df.astype(float) float_frame.columns = float_frame.columns.astype(float) float_frame.index = float_frame.index.astype(float) with tm.assert_produces_warning( FutureWarning, match="convert_float is deprecated" ): recons = pd.read_excel( path, sheet_name="test1", convert_float=False, index_col=0 ) tm.assert_frame_equal(recons, float_frame) @pytest.mark.parametrize("np_type", [np.float16, np.float32, np.float64]) def test_float_types(self, np_type, path): # Test np.float values read come back as float. df = DataFrame(np.random.random_sample(10), dtype=np_type) df.to_excel(path, "test1") with ExcelFile(path) as reader: recons = pd.read_excel(reader, sheet_name="test1", index_col=0).astype( np_type ) tm.assert_frame_equal(df, recons) @pytest.mark.parametrize("np_type", [np.bool8, np.bool_]) def test_bool_types(self, np_type, path): # Test np.bool8 and np.bool_ values read come back as float. df = DataFrame([1, 0, True, False], dtype=np_type) df.to_excel(path, "test1") with ExcelFile(path) as reader: recons = pd.read_excel(reader, sheet_name="test1", index_col=0).astype( np_type ) tm.assert_frame_equal(df, recons) def test_inf_roundtrip(self, path): df = DataFrame([(1, np.inf), (2, 3), (5, -np.inf)]) df.to_excel(path, "test1") with ExcelFile(path) as reader: recons = pd.read_excel(reader, sheet_name="test1", index_col=0) tm.assert_frame_equal(df, recons) def test_sheets(self, frame, tsframe, path): # freq doesn't round-trip index = pd.DatetimeIndex(np.asarray(tsframe.index), freq=None) tsframe.index = index frame = frame.copy() frame["A"][:5] = np.nan frame.to_excel(path, "test1") frame.to_excel(path, "test1", columns=["A", "B"]) frame.to_excel(path, "test1", header=False) frame.to_excel(path, "test1", index=False) # Test writing to separate sheets with ExcelWriter(path) as writer: frame.to_excel(writer, "test1") tsframe.to_excel(writer, "test2") with ExcelFile(path) as reader: recons = pd.read_excel(reader, sheet_name="test1", index_col=0) tm.assert_frame_equal(frame, recons) recons = pd.read_excel(reader, sheet_name="test2", index_col=0) tm.assert_frame_equal(tsframe, recons) assert 2 == len(reader.sheet_names) assert "test1" == reader.sheet_names[0] assert "test2" == reader.sheet_names[1] def test_colaliases(self, frame, path): frame = frame.copy() frame["A"][:5] = np.nan frame.to_excel(path, "test1") frame.to_excel(path, "test1", columns=["A", "B"]) frame.to_excel(path, "test1", header=False) frame.to_excel(path, "test1", index=False) # column aliases col_aliases = Index(["AA", "X", "Y", "Z"]) frame.to_excel(path, "test1", header=col_aliases) with ExcelFile(path) as reader: rs = pd.read_excel(reader, sheet_name="test1", index_col=0) xp = frame.copy() xp.columns = col_aliases tm.assert_frame_equal(xp, rs) def test_roundtrip_indexlabels(self, merge_cells, frame, path): frame = frame.copy() frame["A"][:5] = np.nan frame.to_excel(path, "test1") frame.to_excel(path, "test1", columns=["A", "B"]) frame.to_excel(path, "test1", header=False) frame.to_excel(path, "test1", index=False) # test index_label df = DataFrame(np.random.randn(10, 2)) >= 0 df.to_excel(path, "test1", index_label=["test"], merge_cells=merge_cells) with ExcelFile(path) as reader: recons = pd.read_excel(reader, sheet_name="test1", index_col=0).astype( np.int64 ) df.index.names = ["test"] assert df.index.names == recons.index.names df = DataFrame(np.random.randn(10, 2)) >= 0 df.to_excel( path, "test1", index_label=["test", "dummy", "dummy2"], merge_cells=merge_cells, ) with ExcelFile(path) as reader: recons = pd.read_excel(reader, sheet_name="test1", index_col=0).astype( np.int64 ) df.index.names = ["test"] assert df.index.names == recons.index.names df = DataFrame(np.random.randn(10, 2)) >= 0 df.to_excel(path, "test1", index_label="test", merge_cells=merge_cells) with ExcelFile(path) as reader: recons = pd.read_excel(reader, sheet_name="test1", index_col=0).astype( np.int64 ) df.index.names = ["test"] tm.assert_frame_equal(df, recons.astype(bool)) frame.to_excel( path, "test1", columns=["A", "B", "C", "D"], index=False, merge_cells=merge_cells, ) # take 'A' and 'B' as indexes (same row as cols 'C', 'D') df = frame.copy() df = df.set_index(["A", "B"]) with ExcelFile(path) as reader: recons = pd.read_excel(reader, sheet_name="test1", index_col=[0, 1]) tm.assert_frame_equal(df, recons) def test_excel_roundtrip_indexname(self, merge_cells, path): df = DataFrame(np.random.randn(10, 4)) df.index.name = "foo" df.to_excel(path, merge_cells=merge_cells) with ExcelFile(path) as xf: result = pd.read_excel(xf, sheet_name=xf.sheet_names[0], index_col=0) tm.assert_frame_equal(result, df) assert result.index.name == "foo" def test_excel_roundtrip_datetime(self, merge_cells, tsframe, path): # datetime.date, not sure what to test here exactly # freq does not round-trip index = pd.DatetimeIndex(np.asarray(tsframe.index), freq=None) tsframe.index = index tsf = tsframe.copy() tsf.index = [x.date() for x in tsframe.index] tsf.to_excel(path, "test1", merge_cells=merge_cells) with ExcelFile(path) as reader: recons = pd.read_excel(reader, sheet_name="test1", index_col=0) tm.assert_frame_equal(tsframe, recons) def test_excel_date_datetime_format(self, engine, ext, path): # see gh-4133 # # Excel output format strings df = DataFrame( [ [date(2014, 1, 31), date(1999, 9, 24)], [datetime(1998, 5, 26, 23, 33, 4), datetime(2014, 2, 28, 13, 5, 13)], ], index=["DATE", "DATETIME"], columns=["X", "Y"], ) df_expected = DataFrame( [ [datetime(2014, 1, 31), datetime(1999, 9, 24)], [datetime(1998, 5, 26, 23, 33, 4), datetime(2014, 2, 28, 13, 5, 13)], ], index=["DATE", "DATETIME"], columns=["X", "Y"], ) with tm.ensure_clean(ext) as filename2: with ExcelWriter(path) as writer1: df.to_excel(writer1, "test1") with ExcelWriter( filename2, date_format="DD.MM.YYYY", datetime_format="DD.MM.YYYY HH-MM-SS", ) as writer2: df.to_excel(writer2, "test1") with ExcelFile(path) as reader1: rs1 = pd.read_excel(reader1, sheet_name="test1", index_col=0) with ExcelFile(filename2) as reader2: rs2 = pd.read_excel(reader2, sheet_name="test1", index_col=0) tm.assert_frame_equal(rs1, rs2) # Since the reader returns a datetime object for dates, # we need to use df_expected to check the result. tm.assert_frame_equal(rs2, df_expected) def test_to_excel_interval_no_labels(self, path): # see gh-19242 # # Test writing Interval without labels. df = DataFrame(np.random.randint(-10, 10, size=(20, 1)), dtype=np.int64) expected = df.copy() df["new"] = pd.cut(df[0], 10) expected["new"] = pd.cut(expected[0], 10).astype(str) df.to_excel(path, "test1") with ExcelFile(path) as reader: recons = pd.read_excel(reader, sheet_name="test1", index_col=0) tm.assert_frame_equal(expected, recons) def test_to_excel_interval_labels(self, path): # see gh-19242 # # Test writing Interval with labels. df = DataFrame(np.random.randint(-10, 10, size=(20, 1)), dtype=np.int64) expected = df.copy() intervals = pd.cut( df[0], 10, labels=["A", "B", "C", "D", "E", "F", "G", "H", "I", "J"] ) df["new"] = intervals expected["new"] = pd.Series(list(intervals)) df.to_excel(path, "test1") with ExcelFile(path) as reader: recons = pd.read_excel(reader, sheet_name="test1", index_col=0) tm.assert_frame_equal(expected, recons) def test_to_excel_timedelta(self, path): # see gh-19242, gh-9155 # # Test writing timedelta to xls. df = DataFrame( np.random.randint(-10, 10, size=(20, 1)), columns=["A"], dtype=np.int64 ) expected = df.copy() df["new"] = df["A"].apply(lambda x: timedelta(seconds=x)) expected["new"] = expected["A"].apply( lambda x: timedelta(seconds=x).total_seconds() / 86400 ) df.to_excel(path, "test1") with ExcelFile(path) as reader: recons = pd.read_excel(reader, sheet_name="test1", index_col=0) tm.assert_frame_equal(expected, recons) def test_to_excel_periodindex(self, tsframe, path): xp = tsframe.resample("M", kind="period").mean() xp.to_excel(path, "sht1") with ExcelFile(path) as reader: rs = pd.read_excel(reader, sheet_name="sht1", index_col=0) tm.assert_frame_equal(xp, rs.to_period("M")) def test_to_excel_multiindex(self, merge_cells, frame, path): arrays = np.arange(len(frame.index) * 2).reshape(2, -1) new_index = MultiIndex.from_arrays(arrays, names=["first", "second"]) frame.index = new_index frame.to_excel(path, "test1", header=False) frame.to_excel(path, "test1", columns=["A", "B"]) # round trip frame.to_excel(path, "test1", merge_cells=merge_cells) with ExcelFile(path) as reader: df = pd.read_excel(reader, sheet_name="test1", index_col=[0, 1]) tm.assert_frame_equal(frame, df) # GH13511 def test_to_excel_multiindex_nan_label(self, merge_cells, path): df = DataFrame({"A": [None, 2, 3], "B": [10, 20, 30], "C": np.random.sample(3)}) df = df.set_index(["A", "B"]) df.to_excel(path, merge_cells=merge_cells) df1 = pd.read_excel(path, index_col=[0, 1]) tm.assert_frame_equal(df, df1) # Test for Issue 11328. If column indices are integers, make # sure they are handled correctly for either setting of # merge_cells def test_to_excel_multiindex_cols(self, merge_cells, frame, path): arrays = np.arange(len(frame.index) * 2).reshape(2, -1) new_index = MultiIndex.from_arrays(arrays, names=["first", "second"]) frame.index = new_index new_cols_index = MultiIndex.from_tuples([(40, 1), (40, 2), (50, 1), (50, 2)]) frame.columns = new_cols_index header = [0, 1] if not merge_cells: header = 0 # round trip frame.to_excel(path, "test1", merge_cells=merge_cells) with ExcelFile(path) as reader: df = pd.read_excel( reader, sheet_name="test1", header=header, index_col=[0, 1] ) if not merge_cells: fm = frame.columns.format(sparsify=False, adjoin=False, names=False) frame.columns = [".".join(map(str, q)) for q in zip(*fm)] tm.assert_frame_equal(frame, df) def test_to_excel_multiindex_dates(self, merge_cells, tsframe, path): # try multiindex with dates new_index = [tsframe.index, np.arange(len(tsframe.index))] tsframe.index = MultiIndex.from_arrays(new_index) tsframe.index.names = ["time", "foo"] tsframe.to_excel(path, "test1", merge_cells=merge_cells) with ExcelFile(path) as reader: recons = pd.read_excel(reader, sheet_name="test1", index_col=[0, 1]) tm.assert_frame_equal(tsframe, recons) assert recons.index.names == ("time", "foo") def test_to_excel_multiindex_no_write_index(self, path): # Test writing and re-reading a MI without the index. GH 5616. # Initial non-MI frame. frame1 = DataFrame({"a": [10, 20], "b": [30, 40], "c": [50, 60]}) # Add a MI. frame2 = frame1.copy() multi_index = MultiIndex.from_tuples([(70, 80), (90, 100)]) frame2.index = multi_index # Write out to Excel without the index. frame2.to_excel(path, "test1", index=False) # Read it back in. with ExcelFile(path) as reader: frame3 = pd.read_excel(reader, sheet_name="test1") # Test that it is the same as the initial frame. tm.assert_frame_equal(frame1, frame3) def test_to_excel_float_format(self, path): df = DataFrame( [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]], index=["A", "B"], columns=["X", "Y", "Z"], ) df.to_excel(path, "test1", float_format="%.2f") with ExcelFile(path) as reader: result = pd.read_excel(reader, sheet_name="test1", index_col=0) expected = DataFrame( [[0.12, 0.23, 0.57], [12.32, 123123.20, 321321.20]], index=["A", "B"], columns=["X", "Y", "Z"], ) tm.assert_frame_equal(result, expected) def test_to_excel_output_encoding(self, ext): # Avoid mixed inferred_type. df = DataFrame( [["\u0192", "\u0193", "\u0194"], ["\u0195", "\u0196", "\u0197"]], index=["A\u0192", "B"], columns=["X\u0193", "Y", "Z"], ) with tm.ensure_clean("__tmp_to_excel_float_format__." + ext) as filename: df.to_excel(filename, sheet_name="TestSheet", encoding="utf8") result = pd.read_excel(filename, sheet_name="TestSheet", index_col=0) tm.assert_frame_equal(result, df) def test_to_excel_unicode_filename(self, ext, path): with tm.ensure_clean("\u0192u." + ext) as filename: try: f = open(filename, "wb") except UnicodeEncodeError: pytest.skip("No unicode file names on this system") finally: f.close() df = DataFrame( [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]], index=["A", "B"], columns=["X", "Y", "Z"], ) df.to_excel(filename, "test1", float_format="%.2f") with ExcelFile(filename) as reader: result = pd.read_excel(reader, sheet_name="test1", index_col=0) expected = DataFrame( [[0.12, 0.23, 0.57], [12.32, 123123.20, 321321.20]], index=["A", "B"], columns=["X", "Y", "Z"], ) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("use_headers", [True, False]) @pytest.mark.parametrize("r_idx_nlevels", [1, 2, 3]) @pytest.mark.parametrize("c_idx_nlevels", [1, 2, 3]) def test_excel_010_hemstring( self, merge_cells, c_idx_nlevels, r_idx_nlevels, use_headers, path ): def roundtrip(data, header=True, parser_hdr=0, index=True): data.to_excel(path, header=header, merge_cells=merge_cells, index=index) with ExcelFile(path) as xf: return pd.read_excel( xf, sheet_name=xf.sheet_names[0], header=parser_hdr ) # Basic test. parser_header = 0 if use_headers else None res = roundtrip(DataFrame([0]), use_headers, parser_header) assert res.shape == (1, 2) assert res.iloc[0, 0] is not np.nan # More complex tests with multi-index. nrows = 5 ncols = 3 # ensure limited functionality in 0.10 # override of gh-2370 until sorted out in 0.11 df = tm.makeCustomDataframe( nrows, ncols, r_idx_nlevels=r_idx_nlevels, c_idx_nlevels=c_idx_nlevels ) # This if will be removed once multi-column Excel writing # is implemented. For now fixing gh-9794. if c_idx_nlevels > 1: msg = ( "Writing to Excel with MultiIndex columns and no index " "\\('index'=False\\) is not yet implemented." ) with pytest.raises(NotImplementedError, match=msg): roundtrip(df, use_headers, index=False) else: res = roundtrip(df, use_headers) if use_headers: assert res.shape == (nrows, ncols + r_idx_nlevels) else: # First row taken as columns. assert res.shape == (nrows - 1, ncols + r_idx_nlevels) # No NaNs. for r in range(len(res.index)): for c in range(len(res.columns)): assert res.iloc[r, c] is not np.nan def test_duplicated_columns(self, path): # see gh-5235 df = DataFrame([[1, 2, 3], [1, 2, 3], [1, 2, 3]], columns=["A", "B", "B"]) df.to_excel(path, "test1") expected = DataFrame( [[1, 2, 3], [1, 2, 3], [1, 2, 3]], columns=["A", "B", "B.1"] ) # By default, we mangle. result = pd.read_excel(path, sheet_name="test1", index_col=0) tm.assert_frame_equal(result, expected) # Explicitly, we pass in the parameter. result = pd.read_excel( path, sheet_name="test1", index_col=0, mangle_dupe_cols=True ) tm.assert_frame_equal(result, expected) # see gh-11007, gh-10970 df = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=["A", "B", "A", "B"]) df.to_excel(path, "test1") result = pd.read_excel(path, sheet_name="test1", index_col=0) expected = DataFrame( [[1, 2, 3, 4], [5, 6, 7, 8]], columns=["A", "B", "A.1", "B.1"] ) tm.assert_frame_equal(result, expected) # see gh-10982 df.to_excel(path, "test1", index=False, header=False) result = pd.read_excel(path, sheet_name="test1", header=None) expected = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]]) tm.assert_frame_equal(result, expected) msg = "Setting mangle_dupe_cols=False is not supported yet" with pytest.raises(ValueError, match=msg): pd.read_excel(path, sheet_name="test1", header=None, mangle_dupe_cols=False) def test_swapped_columns(self, path): # Test for issue #5427. write_frame = DataFrame({"A": [1, 1, 1], "B": [2, 2, 2]}) write_frame.to_excel(path, "test1", columns=["B", "A"]) read_frame = pd.read_excel(path, sheet_name="test1", header=0) tm.assert_series_equal(write_frame["A"], read_frame["A"]) tm.assert_series_equal(write_frame["B"], read_frame["B"]) def test_invalid_columns(self, path): # see gh-10982 write_frame = DataFrame({"A": [1, 1, 1], "B": [2, 2, 2]}) with pytest.raises(KeyError, match="Not all names specified"): write_frame.to_excel(path, "test1", columns=["B", "C"]) with pytest.raises( KeyError, match="'passes columns are not ALL present dataframe'" ): write_frame.to_excel(path, "test1", columns=["C", "D"]) @pytest.mark.parametrize( "to_excel_index,read_excel_index_col", [ (True, 0), # Include index in write to file (False, None), # Dont include index in write to file ], ) def test_write_subset_columns(self, path, to_excel_index, read_excel_index_col): # GH 31677 write_frame = DataFrame({"A": [1, 1, 1], "B": [2, 2, 2], "C": [3, 3, 3]}) write_frame.to_excel( path, "col_subset_bug", columns=["A", "B"], index=to_excel_index ) expected = write_frame[["A", "B"]] read_frame = pd.read_excel( path, sheet_name="col_subset_bug", index_col=read_excel_index_col ) tm.assert_frame_equal(expected, read_frame) def test_comment_arg(self, path): # see gh-18735 # # Test the comment argument functionality to pd.read_excel. # Create file to read in. df = DataFrame({"A": ["one", "#one", "one"], "B": ["two", "two", "#two"]}) df.to_excel(path, "test_c") # Read file without comment arg. result1 = pd.read_excel(path, sheet_name="test_c", index_col=0) result1.iloc[1, 0] = None result1.iloc[1, 1] = None result1.iloc[2, 1] = None result2 = pd.read_excel(path, sheet_name="test_c", comment="#", index_col=0) tm.assert_frame_equal(result1, result2) def test_comment_default(self, path): # Re issue #18735 # Test the comment argument default to pd.read_excel # Create file to read in df = DataFrame({"A": ["one", "#one", "one"], "B": ["two", "two", "#two"]}) df.to_excel(path, "test_c") # Read file with default and explicit comment=None result1 = pd.read_excel(path, sheet_name="test_c") result2 = pd.read_excel(path, sheet_name="test_c", comment=None) tm.assert_frame_equal(result1, result2) def test_comment_used(self, path): # see gh-18735 # # Test the comment argument is working as expected when used. # Create file to read in. df = DataFrame({"A": ["one", "#one", "one"], "B": ["two", "two", "#two"]}) df.to_excel(path, "test_c") # Test read_frame_comment against manually produced expected output. expected = DataFrame({"A": ["one", None, "one"], "B": ["two", None, None]}) result = pd.read_excel(path, sheet_name="test_c", comment="#", index_col=0) tm.assert_frame_equal(result, expected) def test_comment_empty_line(self, path): # Re issue #18735 # Test that pd.read_excel ignores commented lines at the end of file df = DataFrame({"a": ["1", "#2"], "b": ["2", "3"]}) df.to_excel(path, index=False) # Test that all-comment lines at EoF are ignored expected = DataFrame({"a": [1], "b": [2]}) result = pd.read_excel(path, comment="#") tm.assert_frame_equal(result, expected) def test_datetimes(self, path): # Test writing and reading datetimes. For issue #9139. (xref #9185) datetimes = [ datetime(2013, 1, 13, 1, 2, 3), datetime(2013, 1, 13, 2, 45, 56), datetime(2013, 1, 13, 4, 29, 49), datetime(2013, 1, 13, 6, 13, 42), datetime(2013, 1, 13, 7, 57, 35), datetime(2013, 1, 13, 9, 41, 28), datetime(2013, 1, 13, 11, 25, 21), datetime(2013, 1, 13, 13, 9, 14), datetime(2013, 1, 13, 14, 53, 7), datetime(2013, 1, 13, 16, 37, 0), datetime(2013, 1, 13, 18, 20, 52), ] write_frame = DataFrame({"A": datetimes}) write_frame.to_excel(path, "Sheet1") if path.endswith("xlsx") or path.endswith("xlsm"): pytest.skip( "Defaults to openpyxl and fails with floating point error on " "datetimes; may be fixed on newer versions of openpyxl - GH #38644" ) read_frame = pd.read_excel(path, sheet_name="Sheet1", header=0) tm.assert_series_equal(write_frame["A"], read_frame["A"]) def test_bytes_io(self, engine): # see gh-7074 with BytesIO() as bio: df = DataFrame(np.random.randn(10, 2)) # Pass engine explicitly, as there is no file path to infer from. with ExcelWriter(bio, engine=engine) as writer: df.to_excel(writer) bio.seek(0) reread_df = pd.read_excel(bio, index_col=0) tm.assert_frame_equal(df, reread_df) def test_write_lists_dict(self, path): # see gh-8188. df = DataFrame( { "mixed": ["a", ["b", "c"], {"d": "e", "f": 2}], "numeric": [1, 2, 3.0], "str": ["apple", "banana", "cherry"], } ) df.to_excel(path, "Sheet1") read = pd.read_excel(path, sheet_name="Sheet1", header=0, index_col=0) expected = df.copy() expected.mixed = expected.mixed.apply(str) expected.numeric = expected.numeric.astype("int64") tm.assert_frame_equal(read, expected) def test_render_as_column_name(self, path): # see gh-34331 df = DataFrame({"render": [1, 2], "data": [3, 4]}) df.to_excel(path, "Sheet1") read = pd.read_excel(path, "Sheet1", index_col=0) expected = df tm.assert_frame_equal(read, expected) def test_true_and_false_value_options(self, path): # see gh-13347 df = DataFrame([["foo", "bar"]], columns=["col1", "col2"]) expected = df.replace({"foo": True, "bar": False}) df.to_excel(path) read_frame = pd.read_excel( path, true_values=["foo"], false_values=["bar"], index_col=0 ) tm.assert_frame_equal(read_frame, expected) def test_freeze_panes(self, path): # see gh-15160 expected = DataFrame([[1, 2], [3, 4]], columns=["col1", "col2"]) expected.to_excel(path, "Sheet1", freeze_panes=(1, 1)) result = pd.read_excel(path, index_col=0) tm.assert_frame_equal(result, expected) def test_path_path_lib(self, engine, ext): df = tm.makeDataFrame() writer = partial(df.to_excel, engine=engine) reader = partial(pd.read_excel, index_col=0) result = tm.round_trip_pathlib(writer, reader, path=f"foo{ext}") tm.assert_frame_equal(result, df) def test_path_local_path(self, engine, ext): df = tm.makeDataFrame() writer = partial(df.to_excel, engine=engine) reader = partial(pd.read_excel, index_col=0) result = tm.round_trip_localpath(writer, reader, path=f"foo{ext}") tm.assert_frame_equal(result, df) def test_merged_cell_custom_objects(self, merge_cells, path): # see GH-27006 mi = MultiIndex.from_tuples( [ (pd.Period("2018"), pd.Period("2018Q1")), (pd.Period("2018"), pd.Period("2018Q2")), ] ) expected = DataFrame(np.ones((2, 2)), columns=mi) expected.to_excel(path) with tm.assert_produces_warning( FutureWarning, match="convert_float is deprecated" ): result = pd.read_excel( path, header=[0, 1], index_col=0, convert_float=False ) # need to convert PeriodIndexes to standard Indexes for assert equal expected.columns = expected.columns.set_levels( [[str(i) for i in mi.levels[0]], [str(i) for i in mi.levels[1]]], level=[0, 1], ) expected.index = expected.index.astype(np.float64) tm.assert_frame_equal(expected, result) @pytest.mark.parametrize("dtype", [None, object]) def test_raise_when_saving_timezones(self, dtype, tz_aware_fixture, path): # GH 27008, GH 7056 tz = tz_aware_fixture data = pd.Timestamp("2019", tz=tz) df = DataFrame([data], dtype=dtype) with pytest.raises(ValueError, match="Excel does not support"): df.to_excel(path) data = data.to_pydatetime() df = DataFrame([data], dtype=dtype) with pytest.raises(ValueError, match="Excel does not support"): df.to_excel(path) def test_excel_duplicate_columns_with_names(self, path): # GH#39695 df = DataFrame({"A": [0, 1], "B": [10, 11]}) df.to_excel(path, columns=["A", "B", "A"], index=False) result = pd.read_excel(path) expected = DataFrame([[0, 10, 0], [1, 11, 1]], columns=["A", "B", "A.1"]) tm.assert_frame_equal(result, expected) def test_if_sheet_exists_raises(self, ext): # GH 40230 msg = "if_sheet_exists is only valid in append mode (mode='a')" with tm.ensure_clean(ext) as f: with pytest.raises(ValueError, match=re.escape(msg)): ExcelWriter(f, if_sheet_exists="replace") class TestExcelWriterEngineTests: @pytest.mark.parametrize( "klass,ext", [ pytest.param(_XlsxWriter, ".xlsx", marks=td.skip_if_no("xlsxwriter")), pytest.param(_OpenpyxlWriter, ".xlsx", marks=td.skip_if_no("openpyxl")), pytest.param(_XlwtWriter, ".xls", marks=td.skip_if_no("xlwt")), ], ) def test_ExcelWriter_dispatch(self, klass, ext): with tm.ensure_clean(ext) as path: with ExcelWriter(path) as writer: if ext == ".xlsx" and td.safe_import("xlsxwriter"): # xlsxwriter has preference over openpyxl if both installed assert isinstance(writer, _XlsxWriter) else: assert isinstance(writer, klass) def test_ExcelWriter_dispatch_raises(self): with pytest.raises(ValueError, match="No engine"): ExcelWriter("nothing") def test_register_writer(self): # some awkward mocking to test out dispatch and such actually works called_save = [] called_write_cells = [] class DummyClass(ExcelWriter): called_save = False called_write_cells = False supported_extensions = ["xlsx", "xls"] engine = "dummy" def save(self): called_save.append(True) def write_cells(self, *args, **kwargs): called_write_cells.append(True) def check_called(func): func() assert len(called_save) >= 1 assert len(called_write_cells) >= 1 del called_save[:] del called_write_cells[:] with pd.option_context("io.excel.xlsx.writer", "dummy"): path = "something.xlsx" with tm.ensure_clean(path) as filepath: register_writer(DummyClass) with ExcelWriter(filepath) as writer: assert isinstance(writer, DummyClass) df = tm.makeCustomDataframe(1, 1) check_called(lambda: df.to_excel(filepath)) with tm.ensure_clean("something.xls") as filepath: check_called(lambda: df.to_excel(filepath, engine="dummy")) @pytest.mark.parametrize( "ext", [ pytest.param(".xlsx", marks=td.skip_if_no("xlsxwriter")), pytest.param(".xlsx", marks=td.skip_if_no("openpyxl")), pytest.param(".ods", marks=td.skip_if_no("odf")), ], ) def test_engine_kwargs_and_kwargs_raises(self, ext): # GH 40430 msg = re.escape("Cannot use both engine_kwargs and **kwargs") with pytest.raises(ValueError, match=msg): with ExcelWriter("", engine_kwargs={"a": 1}, b=2): pass @td.skip_if_no("xlrd") @td.skip_if_no("openpyxl") class TestFSPath: def test_excelfile_fspath(self): with tm.ensure_clean("foo.xlsx") as path: df = DataFrame({"A": [1, 2]}) df.to_excel(path) with ExcelFile(path) as xl: result = os.fspath(xl) assert result == path def test_excelwriter_fspath(self): with tm.ensure_clean("foo.xlsx") as path: with ExcelWriter(path) as writer: assert os.fspath(writer) == str(path)