"""Tests for :func:`pandas.crosstab`."""
import numpy as np
import pytest

from pandas.core.dtypes.common import is_categorical_dtype

import pandas as pd
from pandas import (
    CategoricalIndex,
    DataFrame,
    Index,
    MultiIndex,
    Series,
    crosstab,
)
import pandas._testing as tm


class TestCrosstab:
    """Tests for ``crosstab``: counting, margins, dropna, normalize, naming."""

    def setup_method(self, method):
        """Build the shared fixture frame and store it on ``self.df``.

        The 11-row frame is concatenated with itself, so ``self.df`` has
        22 rows and every (A, B, C) combination occurs an even number of
        times.
        """
        df = DataFrame(
            {
                "A": [
                    "foo",
                    "foo",
                    "foo",
                    "foo",
                    "bar",
                    "bar",
                    "bar",
                    "bar",
                    "foo",
                    "foo",
                    "foo",
                ],
                "B": [
                    "one",
                    "one",
                    "one",
                    "two",
                    "one",
                    "one",
                    "one",
                    "two",
                    "two",
                    "two",
                    "one",
                ],
                "C": [
                    "dull",
                    "dull",
                    "shiny",
                    "dull",
                    "dull",
                    "shiny",
                    "shiny",
                    "dull",
                    "shiny",
                    "shiny",
                    "shiny",
                ],
                "D": np.random.randn(11),
                "E": np.random.randn(11),
                "F": np.random.randn(11),
            }
        )

        self.df = pd.concat([df, df], ignore_index=True)

    def test_crosstab_single(self):
        """One row key and one column key match groupby/size/unstack."""
        df = self.df
        result = crosstab(df["A"], df["C"])
        expected = df.groupby(["A", "C"]).size().unstack()
        tm.assert_frame_equal(result, expected.fillna(0).astype(np.int64))

    def test_crosstab_multiple(self):
        """Multiple keys on either axis match the equivalent groupby."""
        df = self.df

        result = crosstab(df["A"], [df["B"], df["C"]])
        expected = df.groupby(["A", "B", "C"]).size()
        expected = expected.unstack("B").unstack("C").fillna(0).astype(np.int64)
        tm.assert_frame_equal(result, expected)

        result = crosstab([df["B"], df["C"]], df["A"])
        expected = df.groupby(["B", "C", "A"]).size()
        expected = expected.unstack("A").fillna(0).astype(np.int64)
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("box", [np.array, list, tuple])
    def test_crosstab_ndarray(self, box):
        """Unnamed array-likes give the same table as named Series.

        ``box`` is the container (ndarray, list or tuple) wrapped around
        the random integer inputs.
        """
        # GH 44076
        a = box(np.random.randint(0, 5, size=100))
        b = box(np.random.randint(0, 3, size=100))
        c = box(np.random.randint(0, 10, size=100))

        df = DataFrame({"a": a, "b": b, "c": c})

        result = crosstab(a, [b, c], rownames=["a"], colnames=("b", "c"))
        expected = crosstab(df["a"], [df["b"], df["c"]])
        tm.assert_frame_equal(result, expected)

        result = crosstab([b, c], a, colnames=["a"], rownames=("b", "c"))
        expected = crosstab([df["b"], df["c"]], df["a"])
        tm.assert_frame_equal(result, expected)

        # assign arbitrary names
        result = crosstab(a, c)
        expected = crosstab(df["a"], df["c"])
        expected.index.names = ["row_0"]
        expected.columns.names = ["col_0"]
        tm.assert_frame_equal(result, expected)

    def test_crosstab_non_aligned(self):
        """Series with different indexes, and a Series with an ndarray."""
        # GH 17005
        a = Series([0, 1, 1], index=["a", "b", "c"])
        b = Series([3, 4, 3, 4, 3], index=["a", "b", "c", "d", "f"])
        c = np.array([3, 4, 3])

        expected = DataFrame(
            [[1, 0], [1, 1]],
            index=Index([0, 1], name="row_0"),
            columns=Index([3, 4], name="col_0"),
        )

        result = crosstab(a, b)
        tm.assert_frame_equal(result, expected)

        result = crosstab(a, c)
        tm.assert_frame_equal(result, expected)

    def test_crosstab_margins(self):
        """``margins=True`` adds an "All" row and column holding totals."""
        a = np.random.randint(0, 7, size=100)
        b = np.random.randint(0, 3, size=100)
        c = np.random.randint(0, 5, size=100)

        df = DataFrame({"a": a, "b": b, "c": c})

        result = crosstab(a, [b, c], rownames=["a"], colnames=("b", "c"), margins=True)

        assert result.index.names == ("a",)
        assert result.columns.names == ["b", "c"]

        all_cols = result["All", ""]
        exp_cols = df.groupby(["a"]).size().astype("i8")
        # to keep index.name
        exp_margin = Series([len(df)], index=Index(["All"], name="a"))
        exp_cols = pd.concat([exp_cols, exp_margin])
        exp_cols.name = ("All", "")

        tm.assert_series_equal(all_cols, exp_cols)

        all_rows = result.loc["All"]
        exp_rows = df.groupby(["b", "c"]).size().astype("i8")
        exp_rows = pd.concat([exp_rows, Series([len(df)], index=[("All", "")])])
        exp_rows.name = "All"

        # (b, c) pairs absent from the data are missing from the groupby
        # result; align to the crosstab columns and count them as zero.
        exp_rows = exp_rows.reindex(all_rows.index)
        exp_rows = exp_rows.fillna(0).astype(np.int64)
        tm.assert_series_equal(all_rows, exp_rows)

    def test_crosstab_margins_set_margin_name(self):
        """``margins_name`` relabels the totals; non-strings raise."""
        # GH 15972
        a = np.random.randint(0, 7, size=100)
        b = np.random.randint(0, 3, size=100)
        c = np.random.randint(0, 5, size=100)

        df = DataFrame({"a": a, "b": b, "c": c})

        result = crosstab(
            a,
            [b, c],
            rownames=["a"],
            colnames=("b", "c"),
            margins=True,
            margins_name="TOTAL",
        )

        assert result.index.names == ("a",)
        assert result.columns.names == ["b", "c"]

        all_cols = result["TOTAL", ""]
        exp_cols = df.groupby(["a"]).size().astype("i8")
        # to keep index.name
        exp_margin = Series([len(df)], index=Index(["TOTAL"], name="a"))
        exp_cols = pd.concat([exp_cols, exp_margin])
        exp_cols.name = ("TOTAL", "")

        tm.assert_series_equal(all_cols, exp_cols)

        all_rows = result.loc["TOTAL"]
        exp_rows = df.groupby(["b", "c"]).size().astype("i8")
        exp_rows = pd.concat([exp_rows, Series([len(df)], index=[("TOTAL", "")])])
        exp_rows.name = "TOTAL"

        exp_rows = exp_rows.reindex(all_rows.index)
        exp_rows = exp_rows.fillna(0).astype(np.int64)
        tm.assert_series_equal(all_rows, exp_rows)

        msg = "margins_name argument must be a string"
        for margins_name in [666, None, ["a", "b"]]:
            with pytest.raises(ValueError, match=msg):
                crosstab(
                    a,
                    [b, c],
                    rownames=["a"],
                    colnames=("b", "c"),
                    margins=True,
                    margins_name=margins_name,
                )

    def test_crosstab_pass_values(self):
        """``values`` plus ``aggfunc`` matches the equivalent pivot_table."""
        a = np.random.randint(0, 7, size=100)
        b = np.random.randint(0, 3, size=100)
        c = np.random.randint(0, 5, size=100)
        values = np.random.randn(100)

        table = crosstab(
            [a, b], c, values, aggfunc=np.sum, rownames=["foo", "bar"], colnames=["baz"]
        )

        df = DataFrame({"foo": a, "bar": b, "baz": c, "values": values})

        expected = df.pivot_table(
            "values", index=["foo", "bar"], columns="baz", aggfunc=np.sum
        )
        tm.assert_frame_equal(table, expected)

    def test_crosstab_dropna(self):
        """``dropna=False`` keeps the unobserved ("one", "shiny") column."""
        # GH 3820
        a = np.array(["foo", "foo", "foo", "bar", "bar", "foo", "foo"], dtype=object)
        b = np.array(["one", "one", "two", "one", "two", "two", "two"], dtype=object)
        c = np.array(
            ["dull", "dull", "dull", "dull", "dull", "shiny", "shiny"], dtype=object
        )
        res = crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"], dropna=False)
        m = MultiIndex.from_tuples(
            [("one", "dull"), ("one", "shiny"), ("two", "dull"), ("two", "shiny")],
            names=["b", "c"],
        )
        tm.assert_index_equal(res.columns, m)

    def test_crosstab_no_overlap(self):
        """Series whose indexes share no labels give an empty frame."""
        # GH 10291

        s1 = Series([1, 2, 3], index=[1, 2, 3])
        s2 = Series([4, 5, 6], index=[4, 5, 6])

        actual = crosstab(s1, s2)
        expected = DataFrame(
            index=Index([], dtype="int64", name="row_0"),
            columns=Index([], dtype="int64", name="col_0"),
        )

        tm.assert_frame_equal(actual, expected)

    def test_margin_dropna(self):
        """``dropna=True`` with margins and a NaN among the row keys."""
        # GH 12577
        # pivot_table counts null into margin ('All')
        # when margins=true and dropna=true

        df = DataFrame({"a": [1, 2, 2, 2, 2, np.nan], "b": [3, 3, 4, 4, 4, 4]})
        actual = crosstab(df.a, df.b, margins=True, dropna=True)
        expected = DataFrame([[1, 0, 1], [1, 3, 4], [2, 3, 5]])
        expected.index = Index([1.0, 2.0, "All"], name="a")
        expected.columns = Index([3, 4, "All"], name="b")
        tm.assert_frame_equal(actual, expected)

    def test_margin_dropna2(self):
        """``dropna=True`` with margins and NaN in both key columns."""
        df = DataFrame(
            {"a": [1, np.nan, np.nan, np.nan, 2, np.nan], "b": [3, np.nan, 4, 4, 4, 4]}
        )
        actual = crosstab(df.a, df.b, margins=True, dropna=True)
        expected = DataFrame([[1, 0, 1], [0, 1, 1], [1, 1, 2]])
        expected.index = Index([1.0, 2.0, "All"], name="a")
        expected.columns = Index([3.0, 4.0, "All"], name="b")
        tm.assert_frame_equal(actual, expected)

    def test_margin_dropna3(self):
        """``dropna=True`` with margins and several NaN row keys."""
        df = DataFrame(
            {"a": [1, np.nan, np.nan, np.nan, np.nan, 2], "b": [3, 3, 4, 4, 4, 4]}
        )
        actual = crosstab(df.a, df.b, margins=True, dropna=True)
        expected = DataFrame([[1, 0, 1], [0, 1, 1], [1, 1, 2]])
        expected.index = Index([1.0, 2.0, "All"], name="a")
        expected.columns = Index([3, 4, "All"], name="b")
        tm.assert_frame_equal(actual, expected)

    def test_margin_dropna4(self):
        """``dropna=False`` with margins and a NaN among the row keys."""
        # GH 12642
        # _add_margins raises KeyError: Level None not found
        # when margins=True and dropna=False
        df = DataFrame({"a": [1, 2, 2, 2, 2, np.nan], "b": [3, 3, 4, 4, 4, 4]})
        actual = crosstab(df.a, df.b, margins=True, dropna=False)
        expected = DataFrame([[1, 0, 1], [1, 3, 4], [2, 4, 6]])
        expected.index = Index([1.0, 2.0, "All"], name="a")
        expected.columns = Index([3, 4, "All"], name="b")
        tm.assert_frame_equal(actual, expected)

    def test_margin_dropna5(self):
        """``dropna=False`` with margins and NaN in both key columns."""
        df = DataFrame(
            {"a": [1, np.nan, np.nan, np.nan, 2, np.nan], "b": [3, np.nan, 4, 4, 4, 4]}
        )
        actual = crosstab(df.a, df.b, margins=True, dropna=False)
        expected = DataFrame([[1, 0, 1], [0, 1, 1], [1, 4, 6]])
        expected.index = Index([1.0, 2.0, "All"], name="a")
        expected.columns = Index([3.0, 4.0, "All"], name="b")
        tm.assert_frame_equal(actual, expected)

    def test_margin_dropna6(self):
        """Margins with a NaN key in a multi-level row or column index."""
        a = np.array(["foo", "foo", "foo", "bar", "bar", "foo", "foo"], dtype=object)
        b = np.array(["one", "one", "two", "one", "two", np.nan, "two"], dtype=object)
        c = np.array(
            ["dull", "dull", "dull", "dull", "dull", "shiny", "shiny"], dtype=object
        )

        # NaN key on the column side, dropna=False
        actual = crosstab(
            a, [b, c], rownames=["a"], colnames=["b", "c"], margins=True, dropna=False
        )
        m = MultiIndex.from_arrays(
            [
                ["one", "one", "two", "two", "All"],
                ["dull", "shiny", "dull", "shiny", ""],
            ],
            names=["b", "c"],
        )
        expected = DataFrame(
            [[1, 0, 1, 0, 2], [2, 0, 1, 1, 5], [3, 0, 2, 1, 7]], columns=m
        )
        expected.index = Index(["bar", "foo", "All"], name="a")
        tm.assert_frame_equal(actual, expected)

        # NaN key on the row side, dropna=False
        actual = crosstab(
            [a, b], c, rownames=["a", "b"], colnames=["c"], margins=True, dropna=False
        )
        m = MultiIndex.from_arrays(
            [["bar", "bar", "foo", "foo", "All"], ["one", "two", "one", "two", ""]],
            names=["a", "b"],
        )
        expected = DataFrame(
            [[1, 0, 1], [1, 0, 1], [2, 0, 2], [1, 1, 2], [5, 2, 7]], index=m
        )
        expected.columns = Index(["dull", "shiny", "All"], name="c")
        tm.assert_frame_equal(actual, expected)

        # NaN key on the row side, dropna=True
        actual = crosstab(
            [a, b], c, rownames=["a", "b"], colnames=["c"], margins=True, dropna=True
        )
        m = MultiIndex.from_arrays(
            [["bar", "bar", "foo", "foo", "All"], ["one", "two", "one", "two", ""]],
            names=["a", "b"],
        )
        expected = DataFrame(
            [[1, 0, 1], [1, 0, 1], [2, 0, 2], [1, 1, 2], [5, 1, 6]], index=m
        )
        expected.columns = Index(["dull", "shiny", "All"], name="c")
        tm.assert_frame_equal(actual, expected)

    def test_crosstab_normalize(self):
        """Every accepted ``normalize`` value, with and without margins."""
        # Issue 12578
        df = DataFrame(
            {"a": [1, 2, 2, 2, 2], "b": [3, 3, 4, 4, 4], "c": [1, 1, np.nan, 1, 1]}
        )

        rindex = Index([1, 2], name="a")
        cindex = Index([3, 4], name="b")
        full_normal = DataFrame([[0.2, 0], [0.2, 0.6]], index=rindex, columns=cindex)
        row_normal = DataFrame([[1.0, 0], [0.25, 0.75]], index=rindex, columns=cindex)
        col_normal = DataFrame([[0.5, 0], [0.5, 1.0]], index=rindex, columns=cindex)

        # Check all normalize args
        tm.assert_frame_equal(crosstab(df.a, df.b, normalize="all"), full_normal)
        tm.assert_frame_equal(crosstab(df.a, df.b, normalize=True), full_normal)
        tm.assert_frame_equal(crosstab(df.a, df.b, normalize="index"), row_normal)
        tm.assert_frame_equal(crosstab(df.a, df.b, normalize="columns"), col_normal)
        tm.assert_frame_equal(
            crosstab(df.a, df.b, normalize=1),
            crosstab(df.a, df.b, normalize="columns"),
        )
        tm.assert_frame_equal(
            crosstab(df.a, df.b, normalize=0), crosstab(df.a, df.b, normalize="index")
        )

        row_normal_margins = DataFrame(
            [[1.0, 0], [0.25, 0.75], [0.4, 0.6]],
            index=Index([1, 2, "All"], name="a", dtype="object"),
            columns=Index([3, 4], name="b", dtype="object"),
        )
        col_normal_margins = DataFrame(
            [[0.5, 0, 0.2], [0.5, 1.0, 0.8]],
            index=Index([1, 2], name="a", dtype="object"),
            columns=Index([3, 4, "All"], name="b", dtype="object"),
        )

        all_normal_margins = DataFrame(
            [[0.2, 0, 0.2], [0.2, 0.6, 0.8], [0.4, 0.6, 1]],
            index=Index([1, 2, "All"], name="a", dtype="object"),
            columns=Index([3, 4, "All"], name="b", dtype="object"),
        )
        tm.assert_frame_equal(
            crosstab(df.a, df.b, normalize="index", margins=True), row_normal_margins
        )
        tm.assert_frame_equal(
            crosstab(df.a, df.b, normalize="columns", margins=True), col_normal_margins
        )
        tm.assert_frame_equal(
            crosstab(df.a, df.b, normalize=True, margins=True), all_normal_margins
        )

    def test_crosstab_normalize_arrays(self):
        """``normalize="all"`` with margins combined with an ``aggfunc``."""
        # GH#12578
        df = DataFrame(
            {"a": [1, 2, 2, 2, 2], "b": [3, 3, 4, 4, 4], "c": [1, 1, np.nan, 1, 1]}
        )

        # Test arrays
        # NOTE(review): the result is discarded, so this call only checks
        # that crosstab accepts plain ndarrays without raising.
        crosstab(
            [np.array([1, 1, 2, 2]), np.array([1, 2, 1, 2])], np.array([1, 2, 1, 2])
        )

        # Test with aggfunc
        norm_counts = DataFrame(
            [[0.25, 0, 0.25], [0.25, 0.5, 0.75], [0.5, 0.5, 1]],
            index=Index([1, 2, "All"], name="a", dtype="object"),
            columns=Index([3, 4, "All"], name="b"),
        )
        test_case = crosstab(
            df.a, df.b, df.c, aggfunc="count", normalize="all", margins=True
        )
        tm.assert_frame_equal(test_case, norm_counts)

        df = DataFrame(
            {"a": [1, 2, 2, 2, 2], "b": [3, 3, 4, 4, 4], "c": [0, 4, np.nan, 3, 3]}
        )

        norm_sum = DataFrame(
            [[0, 0, 0.0], [0.4, 0.6, 1], [0.4, 0.6, 1]],
            index=Index([1, 2, "All"], name="a", dtype="object"),
            columns=Index([3, 4, "All"], name="b", dtype="object"),
        )
        test_case = crosstab(
            df.a, df.b, df.c, aggfunc=np.sum, normalize="all", margins=True
        )
        tm.assert_frame_equal(test_case, norm_sum)

    def test_crosstab_with_empties(self, using_array_manager):
        """An all-NaN ``values`` column counted with ``aggfunc="count"``.

        ``using_array_manager`` is a pytest fixture supplied from outside
        this module; when truthy, the NaN-free column of the expected
        frame is cast to int64.
        """
        # Check handling of empties
        df = DataFrame(
            {
                "a": [1, 2, 2, 2, 2],
                "b": [3, 3, 4, 4, 4],
                "c": [np.nan, np.nan, np.nan, np.nan, np.nan],
            }
        )

        empty = DataFrame(
            [[0.0, 0.0], [0.0, 0.0]],
            index=Index([1, 2], name="a", dtype="int64"),
            columns=Index([3, 4], name="b"),
        )

        for i in [True, "index", "columns"]:
            calculated = crosstab(df.a, df.b, values=df.c, aggfunc="count", normalize=i)
            tm.assert_frame_equal(empty, calculated)

        nans = DataFrame(
            [[0.0, np.nan], [0.0, 0.0]],
            index=Index([1, 2], name="a", dtype="int64"),
            columns=Index([3, 4], name="b"),
        )
        if using_array_manager:
            # INFO(ArrayManager) column without NaNs can preserve int dtype
            nans[3] = nans[3].astype("int64")

        calculated = crosstab(df.a, df.b, values=df.c, aggfunc="count", normalize=False)
        tm.assert_frame_equal(nans, calculated)

    def test_crosstab_errors(self):
        """Invalid argument combinations raise ValueError with a message."""
        # Issue 12578

        df = DataFrame(
            {"a": [1, 2, 2, 2, 2], "b": [3, 3, 4, 4, 4], "c": [1, 1, np.nan, 1, 1]}
        )

        error = "values cannot be used without an aggfunc."
        with pytest.raises(ValueError, match=error):
            crosstab(df.a, df.b, values=df.c)

        error = "aggfunc cannot be used without values"
        with pytest.raises(ValueError, match=error):
            crosstab(df.a, df.b, aggfunc=np.mean)

        error = "Not a valid normalize argument"
        with pytest.raises(ValueError, match=error):
            crosstab(df.a, df.b, normalize="42")

        with pytest.raises(ValueError, match=error):
            crosstab(df.a, df.b, normalize=42)

        error = "Not a valid margins argument"
        with pytest.raises(ValueError, match=error):
            crosstab(df.a, df.b, normalize="all", margins=42)

    def test_crosstab_with_categorial_columns(self):
        """A categorical column key keeps its category order in the result."""
        # GH 8860
        df = DataFrame(
            {
                "MAKE": ["Honda", "Acura", "Tesla", "Honda", "Honda", "Acura"],
                "MODEL": ["Sedan", "Sedan", "Electric", "Pickup", "Sedan", "Sedan"],
            }
        )
        categories = ["Sedan", "Electric", "Pickup"]
        df["MODEL"] = df["MODEL"].astype("category").cat.set_categories(categories)
        result = crosstab(df["MAKE"], df["MODEL"])

        expected_index = Index(["Acura", "Honda", "Tesla"], name="MAKE")
        expected_columns = CategoricalIndex(
            categories, categories=categories, ordered=False, name="MODEL"
        )
        expected_data = [[2, 0, 0], [2, 0, 1], [0, 1, 0]]
        expected = DataFrame(
            expected_data, index=expected_index, columns=expected_columns
        )
        tm.assert_frame_equal(result, expected)

    def test_crosstab_with_numpy_size(self):
        """``aggfunc=np.size`` with margins on a two-level row index."""
        # GH 4003
        df = DataFrame(
            {
                "A": ["one", "one", "two", "three"] * 6,
                "B": ["A", "B", "C"] * 8,
                "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 4,
                "D": np.random.randn(24),
                "E": np.random.randn(24),
            }
        )
        result = crosstab(
            index=[df["A"], df["B"]],
            columns=[df["C"]],
            margins=True,
            aggfunc=np.size,
            values=df["D"],
        )
        expected_index = MultiIndex(
            levels=[["All", "one", "three", "two"], ["", "A", "B", "C"]],
            codes=[[1, 1, 1, 2, 2, 2, 3, 3, 3, 0], [1, 2, 3, 1, 2, 3, 1, 2, 3, 0]],
            names=["A", "B"],
        )
        expected_column = Index(["bar", "foo", "All"], dtype="object", name="C")
        expected_data = np.array(
            [
                [2.0, 2.0, 4.0],
                [2.0, 2.0, 4.0],
                [2.0, 2.0, 4.0],
                [2.0, np.nan, 2.0],
                [np.nan, 2.0, 2.0],
                [2.0, np.nan, 2.0],
                [np.nan, 2.0, 2.0],
                [2.0, np.nan, 2.0],
                [np.nan, 2.0, 2.0],
                [12.0, 12.0, 24.0],
            ]
        )
        expected = DataFrame(
            expected_data, index=expected_index, columns=expected_column
        )
        # aggfunc is np.size, resulting in integers
        expected["All"] = expected["All"].astype("int64")
        tm.assert_frame_equal(result, expected)

    def test_crosstab_duplicate_names(self):
        """Inputs that share a Series name are still handled correctly."""
        # GH 13279 / 22529

        s1 = Series(range(3), name="foo")
        s2_foo = Series(range(1, 4), name="foo")
        s2_bar = Series(range(1, 4), name="bar")
        s3 = Series(range(3), name="waldo")

        # check result computed with duplicate labels against
        # result computed with unique labels, then relabelled
        mapper = {"bar": "foo"}

        # duplicate row, column labels
        result = crosstab(s1, s2_foo)
        expected = crosstab(s1, s2_bar).rename_axis(columns=mapper, axis=1)
        tm.assert_frame_equal(result, expected)

        # duplicate row, unique column labels
        result = crosstab([s1, s2_foo], s3)
        expected = crosstab([s1, s2_bar], s3).rename_axis(index=mapper, axis=0)
        tm.assert_frame_equal(result, expected)

        # unique row, duplicate column labels
        result = crosstab(s3, [s1, s2_foo])

        expected = crosstab(s3, [s1, s2_bar]).rename_axis(columns=mapper, axis=1)
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("names", [["a", ("b", "c")], [("a", "b"), "c"]])
    def test_crosstab_tuple_name(self, names):
        """One of the two Series names is a tuple, the other a string."""
        s1 = Series(range(3), name=names[0])
        s2 = Series(range(1, 4), name=names[1])

        mi = MultiIndex.from_arrays([range(3), range(1, 4)], names=names)
        expected = Series(1, index=mi).unstack(1, fill_value=0)

        result = crosstab(s1, s2)
        tm.assert_frame_equal(result, expected)

    def test_crosstab_both_tuple_names(self):
        """Both Series names are tuples and are kept as axis names."""
        # GH 18321
        s1 = Series(range(3), name=("a", "b"))
        s2 = Series(range(3), name=("c", "d"))

        expected = DataFrame(
            np.eye(3, dtype="int64"),
            index=Index(range(3), name=("a", "b")),
            columns=Index(range(3), name=("c", "d")),
        )
        result = crosstab(s1, s2)
        tm.assert_frame_equal(result, expected)

    def test_crosstab_unsorted_order(self):
        """Unsorted input labels come back sorted on both axes."""
        df = DataFrame({"b": [3, 1, 2], "a": [5, 4, 6]}, index=["C", "A", "B"])
        result = crosstab(df.index, [df.b, df.a])
        e_idx = Index(["A", "B", "C"], name="row_0")
        e_columns = MultiIndex.from_tuples([(1, 4), (2, 6), (3, 5)], names=["b", "a"])
        expected = DataFrame(
            [[1, 0, 0], [0, 1, 0], [0, 0, 1]], index=e_idx, columns=e_columns
        )
        tm.assert_frame_equal(result, expected)

    def test_crosstab_normalize_multiple_columns(self):
        """``normalize=True`` with margins, two row keys and zero values."""
        # GH 15150
        df = DataFrame(
            {
                "A": ["one", "one", "two", "three"] * 6,
                "B": ["A", "B", "C"] * 8,
                "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 4,
                "D": [0] * 24,
                "E": [0] * 24,
            }
        )
        result = crosstab(
            [df.A, df.B],
            df.C,
            values=df.D,
            aggfunc=np.sum,
            normalize=True,
            margins=True,
        )
        # 10 x 3 frame of zeros except the bottom-right grand-total cell.
        expected = DataFrame(
            np.array([0] * 29 + [1], dtype=float).reshape(10, 3),
            columns=Index(["bar", "foo", "All"], dtype="object", name="C"),
            index=MultiIndex.from_tuples(
                [
                    ("one", "A"),
                    ("one", "B"),
                    ("one", "C"),
                    ("three", "A"),
                    ("three", "B"),
                    ("three", "C"),
                    ("two", "A"),
                    ("two", "B"),
                    ("two", "C"),
                    ("All", ""),
                ],
                names=["A", "B"],
            ),
        )
        tm.assert_frame_equal(result, expected)

    def test_margin_normalize(self):
        """Custom ``margins_name`` with each ``normalize`` mode."""
        # GH 27500
        df = DataFrame(
            {
                "A": ["foo", "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"],
                "B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"],
                "C": [
                    "small",
                    "large",
                    "large",
                    "small",
                    "small",
                    "large",
                    "small",
                    "small",
                    "large",
                ],
                "D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
                "E": [2, 4, 5, 5, 6, 6, 8, 9, 9],
            }
        )
        # normalize on index
        result = crosstab(
            [df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=0
        )
        expected = DataFrame(
            [[0.5, 0.5], [0.5, 0.5], [0.666667, 0.333333], [0, 1], [0.444444, 0.555556]]
        )
        expected.index = MultiIndex(
            levels=[["Sub-Total", "bar", "foo"], ["", "one", "two"]],
            codes=[[1, 1, 2, 2, 0], [1, 2, 1, 2, 0]],
            names=["A", "B"],
        )
        expected.columns = Index(["large", "small"], dtype="object", name="C")
        tm.assert_frame_equal(result, expected)

        # normalize on columns
        result = crosstab(
            [df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=1
        )
        expected = DataFrame(
            [
                [0.25, 0.2, 0.222222],
                [0.25, 0.2, 0.222222],
                [0.5, 0.2, 0.333333],
                [0, 0.4, 0.222222],
            ]
        )
        expected.columns = Index(
            ["large", "small", "Sub-Total"], dtype="object", name="C"
        )
        expected.index = MultiIndex(
            levels=[["bar", "foo"], ["one", "two"]],
            codes=[[0, 0, 1, 1], [0, 1, 0, 1]],
            names=["A", "B"],
        )
        tm.assert_frame_equal(result, expected)

        # normalize on both index and column
        result = crosstab(
            [df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=True
        )
        expected = DataFrame(
            [
                [0.111111, 0.111111, 0.222222],
                [0.111111, 0.111111, 0.222222],
                [0.222222, 0.111111, 0.333333],
                [0.000000, 0.222222, 0.222222],
                [0.444444, 0.555555, 1],
            ]
        )
        expected.columns = Index(
            ["large", "small", "Sub-Total"], dtype="object", name="C"
        )
        expected.index = MultiIndex(
            levels=[["Sub-Total", "bar", "foo"], ["", "one", "two"]],
            codes=[[1, 1, 2, 2, 0], [1, 2, 1, 2, 0]],
            names=["A", "B"],
        )
        tm.assert_frame_equal(result, expected)

    def test_margin_normalize_multiple_columns(self):
        """``normalize=True`` with margins and two column keys."""
        # GH 35144
        # use multiple columns with margins and normalization
        df = DataFrame(
            {
                "A": ["foo", "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"],
                "B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"],
                "C": [
                    "small",
                    "large",
                    "large",
                    "small",
                    "small",
                    "large",
                    "small",
                    "small",
                    "large",
                ],
                "D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
                "E": [2, 4, 5, 5, 6, 6, 8, 9, 9],
            }
        )
        result = crosstab(
            index=df.C,
            columns=[df.A, df.B],
            margins=True,
            margins_name="margin",
            normalize=True,
        )
        expected = DataFrame(
            [
                [0.111111, 0.111111, 0.222222, 0.000000, 0.444444],
                [0.111111, 0.111111, 0.111111, 0.222222, 0.555556],
                [0.222222, 0.222222, 0.333333, 0.222222, 1.0],
            ],
            index=["large", "small", "margin"],
        )
        expected.columns = MultiIndex(
            levels=[["bar", "foo", "margin"], ["", "one", "two"]],
            codes=[[0, 0, 1, 1, 2], [1, 2, 1, 2, 0]],
            names=["A", "B"],
        )
        expected.index.name = "C"
        tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("a_dtype", ["category", "int64"])
@pytest.mark.parametrize("b_dtype", ["category", "int64"])
def test_categoricals(a_dtype, b_dtype):
    """Margins with ``dropna=False`` for categorical and int64 inputs.

    ``a_dtype`` and ``b_dtype`` are the dtypes the row and column Series
    are cast to before being passed to ``crosstab``.
    """
    # https://github.com/pandas-dev/pandas/issues/37465
    # Fixed seed: the expected counts below are tied to this exact stream.
    g = np.random.RandomState(25982704)
    a = Series(g.randint(0, 3, size=100)).astype(a_dtype)
    b = Series(g.randint(0, 2, size=100)).astype(b_dtype)
    result = crosstab(a, b, margins=True, dropna=False)
    columns = Index([0, 1, "All"], dtype="object", name="col_0")
    index = Index([0, 1, 2, "All"], dtype="object", name="row_0")
    values = [[18, 16, 34], [18, 16, 34], [16, 16, 32], [52, 48, 100]]
    expected = DataFrame(values, index, columns)
    tm.assert_frame_equal(result, expected)

    # Verify when categorical does not have all values present
    a.loc[a == 1] = 2
    a_is_cat = is_categorical_dtype(a.dtype)
    assert not a_is_cat or a.value_counts().loc[1] == 0
    result = crosstab(a, b, margins=True, dropna=False)
    values = [[18, 16, 34], [0, 0, 0], [34, 32, 66], [52, 48, 100]]
    expected = DataFrame(values, index, columns)
    if not a_is_cat:
        # A non-categorical input has no row for the now-absent value 1.
        expected = expected.loc[[0, 2, "All"]]
        expected["All"] = expected["All"].astype("int64")
    # NOTE(review): the repr() results are discarded; presumably these only
    # check that repr does not raise - confirm before removing.
    repr(result)
    repr(expected)
    repr(expected.loc[[0, 2, "All"]])
    tm.assert_frame_equal(result, expected)