""" Tests that duplicate columns are handled appropriately when parsed by the CSV engine. In general, the expected result is that they are either thoroughly de-duplicated (if mangling requested) or ignored otherwise. """ from io import StringIO import pytest from pandas import DataFrame import pandas._testing as tm skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") @skip_pyarrow @pytest.mark.parametrize("kwargs", [{}, {"mangle_dupe_cols": True}]) def test_basic(all_parsers, kwargs): # TODO: add test for condition "mangle_dupe_cols=False" # once it is actually supported (gh-12935) parser = all_parsers data = "a,a,b,b,b\n1,2,3,4,5" result = parser.read_csv(StringIO(data), sep=",", **kwargs) expected = DataFrame([[1, 2, 3, 4, 5]], columns=["a", "a.1", "b", "b.1", "b.2"]) tm.assert_frame_equal(result, expected) @skip_pyarrow def test_basic_names(all_parsers): # See gh-7160 parser = all_parsers data = "a,b,a\n0,1,2\n3,4,5" expected = DataFrame([[0, 1, 2], [3, 4, 5]], columns=["a", "b", "a.1"]) result = parser.read_csv(StringIO(data)) tm.assert_frame_equal(result, expected) def test_basic_names_raise(all_parsers): # See gh-7160 parser = all_parsers data = "0,1,2\n3,4,5" with pytest.raises(ValueError, match="Duplicate names"): parser.read_csv(StringIO(data), names=["a", "b", "a"]) @skip_pyarrow @pytest.mark.parametrize( "data,expected", [ ("a,a,a.1\n1,2,3", DataFrame([[1, 2, 3]], columns=["a", "a.2", "a.1"])), ( "a,a,a.1,a.1.1,a.1.1.1,a.1.1.1.1\n1,2,3,4,5,6", DataFrame( [[1, 2, 3, 4, 5, 6]], columns=["a", "a.2", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1"], ), ), ( "a,a,a.3,a.1,a.2,a,a\n1,2,3,4,5,6,7", DataFrame( [[1, 2, 3, 4, 5, 6, 7]], columns=["a", "a.4", "a.3", "a.1", "a.2", "a.5", "a.6"], ), ), ], ) def test_thorough_mangle_columns(all_parsers, data, expected): # see gh-17060 parser = all_parsers result = parser.read_csv(StringIO(data)) tm.assert_frame_equal(result, expected) @skip_pyarrow @pytest.mark.parametrize( "data,names,expected", [ ( "a,b,b\n1,2,3", ["a.1", "a.1", "a.1.1"], DataFrame( [["a", "b", "b"], ["1", "2", "3"]], columns=["a.1", "a.1.1", "a.1.1.1"] ), ), ( "a,b,c,d,e,f\n1,2,3,4,5,6", ["a", "a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1"], DataFrame( [["a", "b", "c", "d", "e", "f"], ["1", "2", "3", "4", "5", "6"]], columns=["a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1", "a.1.1.1.1.1"], ), ), ( "a,b,c,d,e,f,g\n1,2,3,4,5,6,7", ["a", "a", "a.3", "a.1", "a.2", "a", "a"], DataFrame( [ ["a", "b", "c", "d", "e", "f", "g"], ["1", "2", "3", "4", "5", "6", "7"], ], columns=["a", "a.1", "a.3", "a.1.1", "a.2", "a.2.1", "a.3.1"], ), ), ], ) def test_thorough_mangle_names(all_parsers, data, names, expected): # see gh-17095 parser = all_parsers with pytest.raises(ValueError, match="Duplicate names"): parser.read_csv(StringIO(data), names=names) @skip_pyarrow def test_mangled_unnamed_placeholders(all_parsers): # xref gh-13017 orig_key = "0" parser = all_parsers orig_value = [1, 2, 3] df = DataFrame({orig_key: orig_value}) # This test recursively updates `df`. for i in range(3): expected = DataFrame() for j in range(i + 1): col_name = "Unnamed: 0" + f".{1*j}" * min(j, 1) expected.insert(loc=0, column=col_name, value=[0, 1, 2]) expected[orig_key] = orig_value df = parser.read_csv(StringIO(df.to_csv())) tm.assert_frame_equal(df, expected) @skip_pyarrow def test_mangle_dupe_cols_already_exists(all_parsers): # GH#14704 parser = all_parsers data = "a,a,a.1,a,a.3,a.1,a.1.1\n1,2,3,4,5,6,7" result = parser.read_csv(StringIO(data)) expected = DataFrame( [[1, 2, 3, 4, 5, 6, 7]], columns=["a", "a.2", "a.1", "a.4", "a.3", "a.1.2", "a.1.1"], ) tm.assert_frame_equal(result, expected) @skip_pyarrow def test_mangle_dupe_cols_already_exists_unnamed_col(all_parsers): # GH#14704 parser = all_parsers data = ",Unnamed: 0,,Unnamed: 2\n1,2,3,4" result = parser.read_csv(StringIO(data)) expected = DataFrame( [[1, 2, 3, 4]], columns=["Unnamed: 0.1", "Unnamed: 0", "Unnamed: 2.1", "Unnamed: 2"], ) tm.assert_frame_equal(result, expected)