from textwrap import dedent

import numpy as np
import pandas as pd
import pytest

import dask.array as da
import dask.dataframe as dd

# NOTE(review): the expected-output fixtures in this module look
# whitespace-collapsed (no column alignment) and the HTML fixtures contain no
# markup -- presumably lost before this file reached review. Confirm them
# against the real ``repr`` / ``to_html`` output before relying on these tests.

# Text inserted ahead of the table in the ``_repr_html_`` expectations below.
style = """ """


def test_repr():
    """The repr of a frame, its index and a column names the type and npartitions."""
    df = pd.DataFrame({"x": list(range(100))})
    ddf = dd.from_pandas(df, 3)
    for x in [ddf, ddf.index, ddf.x]:
        assert type(x).__name__ in repr(x)
        assert str(x.npartitions) in repr(x)


def test_repr_meta_mutation():
    """The repr is stable across calls and changes once a column dtype changes."""
    # Check that the repr changes when meta changes
    df = pd.DataFrame({"a": range(5), "b": ["a", "b", "c", "d", "e"]})
    ddf = dd.from_pandas(df, npartitions=2)
    s1 = repr(ddf)
    assert repr(ddf) == s1
    ddf.b = ddf.b.astype("category")
    assert repr(ddf) != s1


def test_dataframe_format():
    """Check repr/str, to_string, to_html and _repr_html_ of a 3-partition frame."""
    pytest.importorskip("jinja2")
    df = pd.DataFrame(
        {
            "A": [1, 2, 3, 4, 5, 6, 7, 8],
            "B": list("ABCDEFGH"),
            "C": pd.Categorical(list("AAABBBCC")),
        }
    )
    ddf = dd.from_pandas(df, 3)
    exp = (
        "Dask DataFrame Structure:\n"
        " A B C\n"
        "npartitions=3 \n"
        "0 int64 object category[known]\n"
        "3 ... ... ...\n"
        "6 ... ... ...\n"
        "7 ... ... ...\n"
        "Dask Name: from_pandas, 3 tasks"
    )
    assert repr(ddf) == exp
    assert str(ddf) == exp

    # to_string() is compared without the header and "Dask Name" footer lines.
    exp = (
        " A B C\n"
        "npartitions=3 \n"
        "0 int64 object category[known]\n"
        "3 ... ... ...\n"
        "6 ... ... ...\n"
        "7 ... ... ..."
    )
    assert ddf.to_string() == exp

    exp_table = """
A B C
npartitions=3
0 int64 object category[known]
3 ... ... ...
6 ... ... ...
7 ... ... ...
"""

    exp = """
Dask DataFrame Structure:
{exp_table}
Dask Name: from_pandas, 3 tasks
""".format(
        exp_table=exp_table
    )
    assert ddf.to_html() == exp

    # table is boxed with div and has style
    exp = """
Dask DataFrame Structure:
{style}{exp_table}
Dask Name: from_pandas, 3 tasks
""".format(
        style=style, exp_table=exp_table
    )
    assert ddf._repr_html_() == exp


def test_dataframe_format_with_index():
    """Check repr/str and both HTML outputs when the frame has a string index."""
    pytest.importorskip("jinja2")
    df = pd.DataFrame(
        {
            "A": [1, 2, 3, 4, 5, 6, 7, 8],
            "B": list("ABCDEFGH"),
            "C": pd.Categorical(list("AAABBBCC")),
        },
        index=list("ABCDEFGH"),
    )
    ddf = dd.from_pandas(df, 3)
    exp = (
        "Dask DataFrame Structure:\n"
        " A B C\n"
        "npartitions=3 \n"
        "A int64 object category[known]\n"
        "D ... ... ...\n"
        "G ... ... ...\n"
        "H ... ... ...\n"
        "Dask Name: from_pandas, 3 tasks"
    )
    assert repr(ddf) == exp
    assert str(ddf) == exp

    exp_table = """
A B C
npartitions=3
A int64 object category[known]
D ... ... ...
G ... ... ...
H ... ... ...
"""

    exp = """
Dask DataFrame Structure:
{exp_table}
Dask Name: from_pandas, 3 tasks
""".format(
        exp_table=exp_table
    )
    assert ddf.to_html() == exp

    # table is boxed with div and has style
    exp = """
Dask DataFrame Structure:
{style}{exp_table}
Dask Name: from_pandas, 3 tasks
""".format(
        style=style, exp_table=exp_table
    )
    assert ddf._repr_html_() == exp


def test_dataframe_format_unknown_divisions():
    """Check all text/HTML outputs after the frame's divisions have been cleared."""
    pytest.importorskip("jinja2")
    df = pd.DataFrame(
        {
            "A": [1, 2, 3, 4, 5, 6, 7, 8],
            "B": list("ABCDEFGH"),
            "C": pd.Categorical(list("AAABBBCC")),
        }
    )
    ddf = dd.from_pandas(df, 3)
    ddf = ddf.clear_divisions()
    assert not ddf.known_divisions

    # With unknown divisions the expected rows carry no division label.
    exp = (
        "Dask DataFrame Structure:\n"
        " A B C\n"
        "npartitions=3 \n"
        " int64 object category[known]\n"
        " ... ... ...\n"
        " ... ... ...\n"
        " ... ... ...\n"
        "Dask Name: from_pandas, 3 tasks"
    )
    assert repr(ddf) == exp
    assert str(ddf) == exp

    exp = (
        " A B C\n"
        "npartitions=3 \n"
        " int64 object category[known]\n"
        " ... ... ...\n"
        " ... ... ...\n"
        " ... ... ..."
    )
    assert ddf.to_string() == exp

    exp_table = """
A B C
npartitions=3
int64 object category[known]
... ... ...
... ... ...
... ... ...
"""

    exp = """
Dask DataFrame Structure:
{exp_table}
Dask Name: from_pandas, 3 tasks
""".format(
        exp_table=exp_table
    )
    assert ddf.to_html() == exp

    # table is boxed with div and has style
    exp = """
Dask DataFrame Structure:
{style}{exp_table}
Dask Name: from_pandas, 3 tasks
""".format(
        style=style, exp_table=exp_table
    )
    assert ddf._repr_html_() == exp


def test_dataframe_format_long():
    """Check all text/HTML outputs of a 10-partition frame (middle rows elided)."""
    pytest.importorskip("jinja2")
    df = pd.DataFrame(
        {
            "A": [1, 2, 3, 4, 5, 6, 7, 8] * 10,
            "B": list("ABCDEFGH") * 10,
            "C": pd.Categorical(list("AAABBBCC") * 10),
        }
    )
    ddf = dd.from_pandas(df, 10)
    exp = (
        "Dask DataFrame Structure:\n"
        " A B C\n"
        "npartitions=10 \n"
        "0 int64 object category[known]\n"
        "8 ... ... ...\n"
        "... ... ... ...\n"
        "72 ... ... ...\n"
        "79 ... ... ...\n"
        "Dask Name: from_pandas, 10 tasks"
    )
    assert repr(ddf) == exp
    assert str(ddf) == exp

    exp = (
        " A B C\n"
        "npartitions=10 \n"
        "0 int64 object category[known]\n"
        "8 ... ... ...\n"
        "... ... ... ...\n"
        "72 ... ... ...\n"
        "79 ... ... ..."
    )
    assert ddf.to_string() == exp

    exp_table = """
A B C
npartitions=10
0 int64 object category[known]
8 ... ... ...
... ... ... ...
72 ... ... ...
79 ... ... ...
"""

    exp = """
Dask DataFrame Structure:
{exp_table}
Dask Name: from_pandas, 10 tasks
""".format(
        exp_table=exp_table
    )
    assert ddf.to_html() == exp

    # table is boxed with div
    exp = """
Dask DataFrame Structure:
{style}{exp_table}
Dask Name: from_pandas, 10 tasks
""".format(
        style=style, exp_table=exp_table
    )
    assert ddf._repr_html_() == exp


def test_series_format():
    """Check repr/str and to_string of an unnamed and a named 3-partition series."""
    s = pd.Series([1, 2, 3, 4, 5, 6, 7, 8], index=list("ABCDEFGH"))
    ds = dd.from_pandas(s, 3)
    exp = """Dask Series Structure: npartitions=3 A int64 D ... G ... H ... dtype: int64 Dask Name: from_pandas, 3 tasks"""
    assert repr(ds) == exp
    assert str(ds) == exp

    exp = """npartitions=3 A int64 D ... G ... H ..."""
    assert ds.to_string() == exp

    # Same data, but a named series: the expected footer gains "Name: XXX".
    s = pd.Series([1, 2, 3, 4, 5, 6, 7, 8], index=list("ABCDEFGH"), name="XXX")
    ds = dd.from_pandas(s, 3)
    exp = """Dask Series Structure: npartitions=3 A int64 D ... G ... H ... Name: XXX, dtype: int64 Dask Name: from_pandas, 3 tasks"""
    assert repr(ds) == exp
    assert str(ds) == exp


def test_series_format_long():
    """Check repr/str and to_string of a 10-partition series (middle rows elided)."""
    s = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10] * 10, index=list("ABCDEFGHIJ") * 10)
    ds = dd.from_pandas(s, 10)
    exp = (
        "Dask Series Structure:\nnpartitions=10\nA int64\nB ...\n"
        " ... \nJ ...\nJ ...\ndtype: int64\n"
        "Dask Name: from_pandas, 10 tasks"
    )
    assert repr(ds) == exp
    assert str(ds) == exp

    exp = "npartitions=10\nA int64\nB ...\n ... \nJ ...\nJ ..."
    assert ds.to_string() == exp


def test_index_format():
    """Check repr/str of a string index and of a named categorical index."""
    s = pd.Series([1, 2, 3, 4, 5, 6, 7, 8], index=list("ABCDEFGH"))
    ds = dd.from_pandas(s, 3)
    exp = """Dask Index Structure: npartitions=3 A object D ... G ... H ... dtype: object Dask Name: from_pandas, 6 tasks"""
    assert repr(ds.index) == exp
    assert str(ds.index) == exp

    s = pd.Series(
        [1, 2, 3, 4, 5, 6, 7, 8],
        index=pd.CategoricalIndex([1, 2, 3, 4, 5, 6, 7, 8], name="YYY"),
    )
    ds = dd.from_pandas(s, 3)
    # The backslash directly after the opening quotes is a line continuation,
    # so the fixture starts at "Dask Index Structure" with no leading newline.
    exp = dedent(
        """\
Dask Index Structure: npartitions=3 1 category[known] 4 ... 7 ... 8 ... 
Name: YYY, dtype: category Dask Name: from_pandas, 6 tasks"""
    )
    assert repr(ds.index) == exp
    assert str(ds.index) == exp


def test_categorical_format():
    """The repr labels a categorical series as known or unknown accordingly."""
    s = pd.Series(["a", "b", "c"]).astype("category")
    known = dd.from_pandas(s, npartitions=1)
    unknown = known.cat.as_unknown()
    exp = (
        "Dask Series Structure:\n"
        "npartitions=1\n"
        "0 category[known]\n"
        "2 ...\n"
        "dtype: category\n"
        "Dask Name: from_pandas, 1 tasks"
    )
    assert repr(known) == exp
    exp = (
        "Dask Series Structure:\n"
        "npartitions=1\n"
        "0 category[unknown]\n"
        "2 ...\n"
        "dtype: category\n"
        "Dask Name: from_pandas, 1 tasks"
    )
    assert repr(unknown) == exp


def test_duplicate_columns_repr():
    """Calling repr on a frame with two columns named "a" must not raise."""
    arr = da.from_array(np.arange(10).reshape(5, 2), chunks=(5, 2))
    frame = dd.from_dask_array(arr, columns=["a", "a"])
    # No assertion: the test passes as long as repr() completes.
    repr(frame)


def test_empty_repr():
    """Check repr and _repr_html_ of a frame built from an empty pandas frame."""
    pytest.importorskip("jinja2")
    df = pd.DataFrame()
    ddf = dd.from_pandas(df, npartitions=1)
    exp = (
        "Empty Dask DataFrame Structure:\n"
        "Columns: []\n"
        "Divisions: [, ]\n"
        "Dask Name: from_pandas, 1 tasks"
    )
    assert repr(ddf) == exp
    exp_table = """
npartitions=1
"""
    exp = """
Dask DataFrame Structure:
{style}{exp_table}
Dask Name: from_pandas, 1 tasks
""".format(
        style=style, exp_table=exp_table
    )
    assert ddf._repr_html_() == exp